Skip to content

Commit 64b8107

Browse files
committed
Adding address missingness checker #115
1 parent 9930335 commit 64b8107

File tree

2 files changed

+62
-0
lines changed

2 files changed

+62
-0
lines changed

pipeline/orchestrator.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,9 @@ def run_step_2_preprocess(
227227
df_filtered = preprocess.filter_columns(mapped_df)
228228
df = preprocess.ensure_required_columns(df_filtered)
229229

230+
# Check that addresses are complete, return only complete rows
231+
df = preprocess.check_addresses_complete(df)
232+
230233
# Load configuration
231234
vaccine_reference_path = preprocess.VACCINE_REFERENCE_PATH
232235
vaccine_reference = json.loads(vaccine_reference_path.read_text(encoding="utf-8"))

pipeline/preprocess.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,65 @@ def format_iso_date_for_language(iso_date: str, language: str) -> str:
172172

173173
return format_date(date_obj, format="long", locale=locale)
174174

175+
def check_addresses_complete(
    df: pd.DataFrame,
    incomplete_path: Path = Path("incomplete_addresses.csv"),
) -> pd.DataFrame:
    """Return only the rows of ``df`` that have a complete mailing address.

    A row is complete when it has at least one non-blank street address line
    plus a non-blank city, province, and postal code. The input frame is not
    modified; the work is done on a copy.

    In the returned frame the five address columns are normalized (converted
    to text, whitespace stripped, blank/missing values set to NA) and a new
    ``ADDRESS`` column holds the two street lines joined by a single space.

    Side effects: when any row is incomplete, a warning is logged and the
    incomplete rows (including an ``address_complete`` flag column) are
    written as CSV to ``incomplete_path``.

    Args:
        df: Frame containing the columns STREET_ADDRESS_LINE_1,
            STREET_ADDRESS_LINE_2, CITY, PROVINCE and POSTAL_CODE.
        incomplete_path: Destination of the CSV report of incomplete rows.
            Defaults to ``incomplete_addresses.csv`` in the working directory.

    Returns:
        The rows with complete addresses, without the temporary
        ``address_complete`` column.

    Raises:
        KeyError: If any of the address columns is absent from ``df``.
    """
    df = df.copy()
    incomplete_path = Path(incomplete_path)

    address_cols = [
        "STREET_ADDRESS_LINE_1",
        "STREET_ADDRESS_LINE_2",
        "CITY",
        "PROVINCE",
        "POSTAL_CODE",
    ]

    for col in address_cols:
        original = df[col]
        cleaned = original.astype(str).str.strip()
        # Converting to str can turn missing values into ordinary text
        # ("None", "<NA>", "nan"), so missingness is taken from the original
        # values rather than guessed from their string form. The literal
        # text "nan" and blank strings are still treated as missing.
        is_missing = original.isna() | cleaned.isin(["", "nan"])
        df[col] = cleaned.mask(is_missing, pd.NA)

    # Build the combined address line; either street line may be absent.
    street = (
        df["STREET_ADDRESS_LINE_1"].fillna("")
        + " "
        + df["STREET_ADDRESS_LINE_2"].fillna("")
    ).str.strip()
    df["ADDRESS"] = street.replace({"": pd.NA})

    df["address_complete"] = (
        df["ADDRESS"].notna()
        & df["CITY"].notna()
        & df["PROVINCE"].notna()
        & df["POSTAL_CODE"].notna()
    )

    if not df["address_complete"].all():
        incomplete_count = (~df["address_complete"]).sum()
        LOG.warning(
            "There are %d records with incomplete address information.",
            incomplete_count,
        )

        # Persist the rejected rows so they can be reviewed and corrected.
        incomplete_records = df.loc[~df["address_complete"]]
        incomplete_records.to_csv(incomplete_path, index=False)
        LOG.info("Incomplete address records written to %s", incomplete_path)

    # Return only rows with complete addresses
    return df.loc[df["address_complete"]].drop(columns=["address_complete"])
233+
175234

176235
def convert_date_iso(date_str: str) -> str:
177236
"""Convert a date from English display format to ISO format.

0 commit comments

Comments
 (0)