"""DataFrame cleaning helpers (src/transform/clean.py).

Every function here returns a new DataFrame and leaves its input untouched,
so the steps can be chained without side effects on the caller's data.
"""

import pandas as pd


def drop_nulls(df: pd.DataFrame, required_cols: list[str]) -> pd.DataFrame:
    """Return *df* without the rows that have a null in any of *required_cols*.

    Args:
        df: Frame to filter; it is not modified.
        required_cols: Column labels that must be non-null for a row to be kept.

    Returns:
        A new frame containing only the rows that passed the check.

    Side effects:
        Prints a one-line summary to stdout, but only when at least one row
        was dropped.
    """
    before = len(df)
    df = df.dropna(subset=required_cols)
    dropped = before - len(df)
    if dropped:
        print(f"[clean] Dropped {dropped} rows with nulls in {required_cols}")
    return df


def normalise_timestamps(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return a copy of *df* with column *col* parsed by ``pd.to_datetime(..., utc=True)``.

    Args:
        df: Frame holding the timestamp column; it is not modified.
        col: Label of the column to convert.

    Returns:
        A new frame in which *col* holds the converted values.

    Raises:
        KeyError: If *col* is not a column of *df*.
    """
    # Work on a copy: assigning into the argument would silently change the
    # caller's frame (and can trigger SettingWithCopyWarning on slices).
    result = df.copy()
    result[col] = pd.to_datetime(result[col], utc=True)
    return result


def deduplicate(df: pd.DataFrame, key_cols: list[str]) -> pd.DataFrame:
    """Return *df* with duplicate rows (by *key_cols*) removed, keeping the last one.

    Args:
        df: Frame to deduplicate; it is not modified.
        key_cols: Column labels that together identify a duplicate.

    Returns:
        A new frame with one row per distinct key combination.

    Side effects:
        Always prints the number of removed rows to stdout (including 0).
    """
    before = len(df)
    df = df.drop_duplicates(subset=key_cols, keep="last")
    print(f"[clean] Dedup removed {before - len(df)} rows")
    return df


def cast_types(df: pd.DataFrame, schema: dict) -> pd.DataFrame:
    """Cast columns to specified types. schema = {col: dtype}

    Schema entries whose column is absent from *df* are ignored.

    Args:
        df: Frame to convert; it is not modified.
        schema: Mapping of column label to the dtype it should be cast to.

    Returns:
        A new frame with the listed (and present) columns cast.
    """
    # Filter first: DataFrame.astype raises KeyError for unknown columns,
    # whereas this helper skips them.
    present = {col: dtype for col, dtype in schema.items() if col in df.columns}
    # astype returns a new frame, so the caller's data is never mutated.
    return df.astype(present)