Add transform helpers: null drop, dedup, type casting
This commit is contained in:
parent
37c3266d9c
commit
21fb333812
1 changed file with 30 additions and 0 deletions
30
src/transform/clean.py
Normal file
30
src/transform/clean.py
Normal file
|
|
@@ -0,0 +1,30 @@
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def drop_nulls(df: pd.DataFrame, required_cols: list[str]) -> pd.DataFrame:
    """Return ``df`` without rows that have a null in any required column.

    Args:
        df: Frame to filter. It is not modified.
        required_cols: Column labels handed to ``DataFrame.dropna`` as
            ``subset``.

    Returns:
        The filtered frame produced by ``dropna``.

    A summary line is printed only when at least one row was removed.
    """
    cleaned = df.dropna(subset=required_cols)
    dropped = len(df) - len(cleaned)
    if dropped > 0:
        print(f"[clean] Dropped {dropped} rows with nulls in {required_cols}")
    return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def normalise_timestamps(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return a copy of ``df`` with ``col`` parsed as UTC datetimes.

    Args:
        df: Input frame. It is left untouched, matching the other helpers in
            this module, which return new frames.
        col: Label of the column to convert.

    Returns:
        A new frame in which ``col`` holds the result of
        ``pd.to_datetime(..., utc=True)``.

    Raises:
        KeyError: If ``col`` is not a column of ``df``.
    """
    # Work on a copy so the caller's frame is not mutated and so that passing
    # a slice of another frame does not trigger SettingWithCopyWarning.
    result = df.copy()
    result[col] = pd.to_datetime(result[col], utc=True)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def deduplicate(df: pd.DataFrame, key_cols: list[str]) -> pd.DataFrame:
    """Return ``df`` with one row per ``key_cols`` combination.

    When several rows share the same key, the last one is kept.

    Args:
        df: Frame to deduplicate. It is not modified.
        key_cols: Column labels that identify a duplicate.

    Returns:
        The frame produced by ``DataFrame.drop_duplicates``.

    The number of removed rows is always printed, even when it is zero.
    """
    # The right-hand side is evaluated before rebinding, so ``before`` is the
    # row count of the incoming frame.
    before, df = df.shape[0], df.drop_duplicates(subset=key_cols, keep="last")
    print(f"[clean] Dedup removed {before - len(df)} rows")
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def cast_types(df: pd.DataFrame, schema: dict) -> pd.DataFrame:
    """Return a copy of ``df`` with columns cast to the dtypes in ``schema``.

    Args:
        df: Input frame. It is left untouched, matching the other helpers in
            this module, which return new frames.
        schema: Mapping of ``{column label: dtype}``. Entries whose column is
            not present in ``df`` are skipped.

    Returns:
        A new frame with the requested casts applied. Columns not named in
        ``schema`` keep their dtype.

    Raises:
        ValueError, TypeError: Propagated from ``DataFrame.astype`` when a
            value cannot be converted to the requested dtype.
    """
    # Keep only the entries that apply to this frame; ``astype`` raises
    # KeyError for unknown columns when given a mapping.
    applicable = {col: dtype for col, dtype in schema.items() if col in df.columns}
    # ``astype`` returns a new frame, so the caller's frame is not mutated.
    return df.astype(applicable)
|
||||||
Loading…
Reference in a new issue