Skip to content

Commit 44cd0c9

Browse files
authored
Merge pull request #116 from WDGPH/feat/quality-address-checker
Feat/quality address checker
2 parents 1e0d3e8 + 4582837 commit 44cd0c9

File tree

4 files changed

+65
-5
lines changed

4 files changed

+65
-5
lines changed

input/rodent_dataset.xlsx

128 Bytes
Binary file not shown.

pipeline/bundle_pdfs.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,9 @@
44
accompanying manifest records. It can be invoked as a CLI tool or imported for
55
unit testing. Bundling supports three modes:
66
7-
* Size-based (default): chunk the ordered list of PDFs into groups of
8-
``bundle_size``.
9-
* School-based: group by ``school_code`` and then chunk each group while
10-
preserving client order.
11-
* Board-based: group by ``board_code`` and chunk each group.
7+
* Size-based (default): bundle the clients into fixed-size groups, e.g., 100 per bundle.
8+
* School-based: group the clients by school code, then bundle each group into fixed-size groups.
9+
* Board-based: group the clients by board code, then bundle each group into fixed-size groups.
1210
1311
Each bundle produces a merged PDF inside ``output/pdf_combined`` and a manifest JSON
1412
record inside ``output/metadata`` that captures critical metadata for audits.

pipeline/orchestrator.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,9 @@ def run_step_2_preprocess(
227227
df_filtered = preprocess.filter_columns(mapped_df)
228228
df = preprocess.ensure_required_columns(df_filtered)
229229

230+
# Check that addresses are complete, return only complete rows
231+
df = preprocess.check_addresses_complete(df)
232+
230233
# Load configuration
231234
vaccine_reference_path = preprocess.VACCINE_REFERENCE_PATH
232235
vaccine_reference = json.loads(vaccine_reference_path.read_text(encoding="utf-8"))

pipeline/preprocess.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,65 @@ def format_iso_date_for_language(iso_date: str, language: str) -> str:
172172

173173
return format_date(date_obj, format="long", locale=locale)
174174

175+
def check_addresses_complete(df: pd.DataFrame) -> pd.DataFrame:
    """Return only the rows of ``df`` that have a complete mailing address.

    The address columns (``STREET_ADDRESS_LINE_1``, ``STREET_ADDRESS_LINE_2``,
    ``CITY``, ``PROVINCE``, ``POSTAL_CODE``) are normalized to stripped text,
    with blank or missing values stored as ``pd.NA``. A combined ``ADDRESS``
    column is built from the two street lines. A row is complete when
    ``ADDRESS``, ``CITY``, ``PROVINCE`` and ``POSTAL_CODE`` are all present;
    ``STREET_ADDRESS_LINE_2`` on its own is optional.

    Side effects: when any row is incomplete, a warning is logged and the
    incomplete rows are written to ``output/incomplete_addresses.csv``.

    Args:
        df: Input records. Must contain the five address columns listed above.
            The input frame is not modified.

    Returns:
        A copy of ``df`` restricted to rows with complete addresses, with the
        normalized address columns and the added ``ADDRESS`` column.
    """
    df = df.copy()

    address_cols = [
        "STREET_ADDRESS_LINE_1",
        "STREET_ADDRESS_LINE_2",
        "CITY",
        "PROVINCE",
        "POSTAL_CODE",
    ]

    for col in address_cols:
        # Capture genuinely missing cells before casting: ``astype(str)``
        # renders None / pd.NA as the literal text "None" / "<NA>", which
        # would otherwise be treated as a real address component.
        missing = df[col].isna()
        text = df[col].astype(str).str.strip()
        df[col] = text.mask(missing | text.isin(["", "nan"]), pd.NA)

    # Build the combined street address; a missing line contributes nothing.
    street = (
        df["STREET_ADDRESS_LINE_1"].fillna("")
        + " "
        + df["STREET_ADDRESS_LINE_2"].fillna("")
    ).str.strip()
    df["ADDRESS"] = street.mask(street == "", pd.NA)

    # A row is complete only when every required component is present.
    df["address_complete"] = (
        df["ADDRESS"].notna()
        & df["CITY"].notna()
        & df["PROVINCE"].notna()
        & df["POSTAL_CODE"].notna()
    )

    if not df["address_complete"].all():
        incomplete_records = df.loc[~df["address_complete"]]
        LOG.warning(
            "There are %d records with incomplete address information.",
            len(incomplete_records),
        )

        incomplete_path = Path("output/incomplete_addresses.csv")
        # ``to_csv`` does not create missing directories, so make sure the
        # output folder exists before writing the report.
        incomplete_path.parent.mkdir(parents=True, exist_ok=True)
        incomplete_records.to_csv(incomplete_path, index=False)
        LOG.info("Incomplete address records written to %s", incomplete_path)

    # Return only rows with complete addresses; the helper flag is internal.
    return df.loc[df["address_complete"]].drop(columns=["address_complete"])
233+
175234

176235
def convert_date_iso(date_str: str) -> str:
177236
"""Convert a date from English display format to ISO format.

0 commit comments

Comments
 (0)