From 773e577dad1148d42f3f6733b642cbcdea6c718d Mon Sep 17 00:00:00 2001
From: Nina Seidel
Date: Tue, 24 Mar 2026 09:30:00 +0000
Subject: [PATCH] =?UTF-8?q?Add=20PII=20anonymisation=20step=20=E2=80=94=20?=
 =?UTF-8?q?legal=20requirement=20from=20data=20protection=20review?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/transform/clean.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/src/transform/clean.py b/src/transform/clean.py
index ea4424a..ec4c989 100644
--- a/src/transform/clean.py
+++ b/src/transform/clean.py
@@ -28,3 +28,47 @@ def cast_types(df: pd.DataFrame, schema: dict) -> pd.DataFrame:
         if col in df.columns:
             df[col] = df[col].astype(dtype)
     return df
+
+
+def strip_pii(
+    df: pd.DataFrame, pii_cols: list[str], *, key: "bytes | None" = None
+) -> pd.DataFrame:
+    """Pseudonymise PII columns by replacing values with SHA-256 hex digests.
+
+    Each non-missing value is converted to ``str``, UTF-8 encoded and hashed.
+    Missing values (``None``/``NaN``) stay missing instead of being hashed as
+    the literal text ``"nan"``/``"None"``. Columns listed in ``pii_cols``
+    that are not present in ``df`` are skipped.
+
+    An unkeyed digest of low-entropy data (e-mail addresses, phone numbers)
+    can be reversed by hashing candidate values, so it is pseudonymisation
+    rather than anonymisation. Pass ``key`` to use HMAC-SHA256, which cannot
+    be recomputed without the secret.
+
+    Args:
+        df: Frame to modify; it is updated in place.
+        pii_cols: Names of the columns to hash.
+        key: Optional secret for HMAC-SHA256. When ``None``, plain SHA-256
+            is used.
+
+    Returns:
+        The same ``df`` object, with the listed columns replaced.
+    """
+    import hashlib
+    import hmac
+
+    def _digest(value: str) -> str:
+        data = value.encode()
+        if key is None:
+            return hashlib.sha256(data).hexdigest()
+        return hmac.new(key, data, hashlib.sha256).hexdigest()
+
+    for col in pii_cols:
+        if col not in df.columns:
+            continue
+        # Hash only real values: a missing cell holds no PII and must stay
+        # missing rather than turn into the digest of "nan".
+        present = df[col].notna()
+        digests = df[col].astype(str).map(_digest, na_action="ignore")
+        df[col] = digests.where(present)
+    return df