"""Unit tests for the cleaning helpers in ``src.transform.clean``."""

import hashlib

import pandas as pd
import pytest

from src.transform.clean import cast_types, deduplicate, drop_nulls


def test_drop_nulls_removes_rows():
    """drop_nulls should drop the row whose ``id`` is null and keep the rest."""
    df = pd.DataFrame({"id": [1, 2, None], "val": ["a", "b", "c"]})

    result = drop_nulls(df, ["id"])

    assert len(result) == 2
    # Checking for the literal 3 would be vacuous (3 is never in the input),
    # so verify directly that no null id survives ...
    assert result["id"].notna().all()
    # ... and that the row that carried the null id ("c") is the one removed.
    assert "c" not in result["val"].values


def test_deduplicate_keeps_last():
    """deduplicate should keep the last row among rows sharing an ``id``."""
    df = pd.DataFrame({"id": [1, 1, 2], "val": ["old", "new", "only"]})

    result = deduplicate(df, ["id"])

    assert len(result) == 2
    kept_for_id_1 = result.loc[result["id"] == 1, "val"]
    assert kept_for_id_1.values[0] == "new"


def test_cast_types():
    """cast_types should convert the string ``count`` column to ``int``."""
    df = pd.DataFrame({"count": ["1", "2", "3"]})

    result = cast_types(df, {"count": int})

    assert result["count"].dtype == int


def test_strip_pii_hashes_values():
    """strip_pii should replace an email with its SHA-256 hex digest."""
    # NOTE(review): strip_pii is imported locally, as in the original;
    # presumably so that a missing strip_pii fails only this test rather than
    # collection of the whole module - confirm before hoisting it.
    from src.transform.clean import strip_pii

    df = pd.DataFrame({"email": ["alice@example.com"]})

    result = strip_pii(df, ["email"])

    expected = hashlib.sha256(b"alice@example.com").hexdigest()
    assert result["email"].values[0] == expected