def strip_pii(
    df: pd.DataFrame, pii_cols: list[str], *, key: bytes = b""
) -> pd.DataFrame:
    """Replace the values of PII columns with SHA-256 hex digests.

    Each present value is converted to text with ``astype(str)``, encoded as
    UTF-8 and hashed. Missing values (NaN/None/NaT/NA) are left missing
    rather than being hashed as the literal text ``"nan"``/``"None"``, so
    null counts survive and nulls cannot collide with real values.

    NOTE: a plain, unkeyed hash is pseudonymisation, not anonymisation --
    low-entropy values such as e-mail addresses or phone numbers can be
    recovered by hashing candidate values. Pass a secret ``key`` to make
    the digests unguessable without that key.

    Args:
        df: Frame to scrub. It is modified in place and also returned.
        pii_cols: Names of the columns to hash. Names that are not present
            in ``df`` are skipped. A single bare string is treated as one
            column name.
        key: Optional secret. When non-empty, digests are HMAC-SHA256 keyed
            with it; when empty (the default), digests are plain SHA-256.

    Returns:
        The same ``df`` object, with the listed columns replaced by
        64-character hex digests (missing values preserved).
    """
    import hashlib
    import hmac

    # A bare string would otherwise be iterated character by character,
    # match no column, and silently leave the PII untouched.
    if isinstance(pii_cols, str):
        pii_cols = [pii_cols]

    def _digest(value: str) -> str:
        data = value.encode()
        if key:
            return hmac.new(key, data, hashlib.sha256).hexdigest()
        return hashlib.sha256(data).hexdigest()

    for col in pii_cols:
        if col not in df.columns:
            continue
        column = df[col]
        # na_action="ignore" keeps the digest function away from missing
        # values on backends where astype(str) preserves them.
        hashed = column.astype(str).map(_digest, na_action="ignore")
        # Restore the original missing markers wherever the input was null.
        df[col] = hashed.where(column.notna(), column)
    return df