Add PII anonymisation step — legal requirement from data protection review
This commit is contained in:
parent
15a5606f45
commit
773e577dad
1 changed file with 11 additions and 0 deletions
|
|
@@ -28,3 +28,14 @@ def cast_types(df: pd.DataFrame, schema: dict) -> pd.DataFrame:
|
||||||
if col in df.columns:
|
if col in df.columns:
|
||||||
df[col] = df[col].astype(dtype)
|
df[col] = df[col].astype(dtype)
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def strip_pii(
    df: pd.DataFrame, pii_cols: list[str], *, salt: bytes = b""
) -> pd.DataFrame:
    """Replace the values of PII columns with SHA-256 hex digests.

    Each non-missing cell in the listed columns is converted to its string
    form, UTF-8 encoded and replaced by a 64-character hex digest. Missing
    cells (NaN/None/NaT) are left missing rather than being hashed as the
    literal text "nan"/"None", so null checks keep working afterwards and
    nulls do not collide with real values.

    NOTE(review): a plain, unkeyed SHA-256 of low-entropy data (e-mail
    addresses, phone numbers, ...) can be reversed by hashing candidate
    values, so the default mode is pseudonymisation rather than true
    anonymisation. Pass a secret ``salt`` to make digests unguessable
    without the key -- confirm with the data protection review which mode
    is required.

    Args:
        df: Frame to process. It is modified in place.
        pii_cols: Names of columns to hash. Names that are not present in
            ``df`` are skipped silently.
        salt: Optional secret key. When empty (the default) values are
            hashed with plain SHA-256; otherwise HMAC-SHA256 keyed with
            ``salt`` is used.

    Returns:
        The same ``df`` object, with the listed columns replaced.
    """
    import hashlib
    import hmac

    def _digest(text: str) -> str:
        payload = text.encode()
        if salt:
            return hmac.new(salt, payload, hashlib.sha256).hexdigest()
        return hashlib.sha256(payload).hexdigest()

    for col in pii_cols:
        if col not in df.columns:
            continue
        column = df[col]
        present = column.notna()
        # Work on an object-dtype copy so hex strings can sit next to the
        # untouched missing values regardless of the column's original dtype.
        hashed = column.astype(object)
        if present.any():
            hashed.loc[present] = column[present].astype(str).map(_digest)
        df[col] = hashed
    return df
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue