CoronaWhy · cgomez9 · Apr 15, 2020 · Apr 15, 2020 · Apr 15, 2020 · Apr 15, 2020
diff --git a/notebooks/data_sources/US_Census.ipynb b/notebooks/data_sources/US_Census.ipynb
diff --git a/task_geo/common/fips_codes.py b/task_geo/common/fips_codes.py
diff --git a/task_geo/data_sources/demographics/us_census/__main__.py b/task_geo/data_sources/demographics/us_census/__main__.py
@@ -6,11 +6,27 @@
     - Converts and exports to CSV
 """
 
+import argparse
+
 from task_geo.data_sources.demographics.us_census import us_census
 
 
+def get_argparser():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        '-o', '--output', required=True,
+        help='Destination file to store the processed dataset.')
+
+    return parser
+
+
 def main():
-    us_census().to_csv("us-census-data.csv", header=True)
+    parser = get_argparser()
+    args = parser.parse_args()
+
+    dataset = us_census()
+    dataset.to_csv(args.output, index=False, header=True)
 
 
 if __name__ == '__main__':

diff --git a/task_geo/data_sources/demographics/us_census/audit.md b/task_geo/data_sources/demographics/us_census/audit.md
@@ -0,0 +1,44 @@
+# US Census
+
+## General information
+
+- **Description**: Annual Population Estimates for the United States, States and Counties: April 1, 2010 to July 1, 2019.
+- **Credits**: The United States Census Bureau https://data.census.gov/.
+- **Source**: [API Source](https://api.census.gov/data/2019/pep/population?get=LASTUPDATE,POP,DENSITY,UNIVERSE&for=county:*&in=state:*)
+
+## Description
+
+**last_update**
+- Description: Last update of estimation
+- Type: datetime
+
+**population_estimate**
+- Description: Population estimate of county
+- Type: integer
+
+**density**
+- Description: Population density of county
+- Type: float
+
+**state**
+- Description: FIPS Code of county state
+- Type: string
+
+**county**
+- Description: FIPS Code of county
+- Type: string
+
+**state_name**
+- Description: Name of county state
+- Type: string
+
+**county_name**
+- Description: Name of county
+- Type: string
+
+## Transformations applied
+
+- Use the first row of the response as the dataframe column names, then drop that row
+- Rename the "lastupdate" and "pop" columns to the more explicit, well-formatted names "last_update" and "population_estimate"
+- Create a column "state_name" with the state name for the FIPS code in the "state" column
+- Create a column "county_name" with the county name for the FIPS code in the "county" column
diff --git a/task_geo/data_sources/demographics/us_census/datapackage.json b/task_geo/data_sources/demographics/us_census/datapackage.json
@@ -0,0 +1,48 @@
+{
+    "title": "United States Census Bureau Population Estimation",
+    "description": "Annual Population Estimates for the United States, States and Counties: April 1, 2010 to July 1, 2019",
+    "licenses": [{"name": "copyright-authors"}],
+    "fields": [
+        {
+            "name": "last_update",
+            "description": "Last update of estimation",
+            "type": "datetime"
+        },
+        {
+            "name": "population_estimate",
+            "description": "Population estimate of county",
+            "type": "integer"
+        },
+        {
+            "name": "density",
+            "description": "Population density of county",
+            "type": "float"
+        },
+        {
+            "name": "state",
+            "description": "FIPS Code of county state",
+            "type": "string"
+        },
+        {
+            "name": "county",
+            "description": "FIPS Code of county",
+            "type": "string"
+        },
+        {
+            "name": "state_name",
+            "description": "Name of county state",
+            "type": "string"
+        },
+        {
+            "name": "county_name",
+            "description": "Name of county",
+            "type": "string"
+        }
+    ],
+    "keywords": [
+        "US",
+        "population",
+        "estimation",
+        "census"
+    ]
+}
diff --git a/task_geo/data_sources/demographics/us_census/us_census.py b/task_geo/data_sources/demographics/us_census/us_census.py
@@ -2,21 +2,21 @@
 us_census.py
 
 Functions:
-    - us_census_connector: Extracts data from CSV URL
-    - us_census_formatter: Cleans CSV data
+    - us_census_connector: Extracts data from JSON URL
+    - us_census_formatter: Cleans and formats data
     - us_census: Combines the two previous functions
 
 Data Credits:
     The United States Census Bureau
     https://data.census.gov/
 """
-
-import urllib.request
-import zipfile
-
 import pandas as pd
 
-url = 'https://data.census.gov/api/access/table/download?download_id=iuGrLXEBm-bIwvlxENnx'
+from task_geo.common.fips_codes import county_fips_to_name, state_fips_to_name
+
+URL = 'https://api.census.gov/data/2019/pep/population?get=LASTUPDATE,POP,' \
+      'DENSITY&for=county:*&in=state:*&key=5436a8b95e523baaa40c22ec906af88a93f405eb '
+API_KEY = '5436a8b95e523baaa40c22ec906af88a93f405eb'
 
 
 def us_census():
@@ -37,22 +37,13 @@ def us_census_connector():
 
 
     Description:
-        - Opens the zip file URL and extracts the correct CSV
-        - Correct CSV: ACS 5Y Statistics
+        - Read the dataset in JSON format
 
     Returns:
         data (DataFrame with CSV Data)
     """
 
-    urllib.request.urlretrieve(url, "uscensus.zip")
-
-    with zipfile.ZipFile("uscensus.zip") as myzip:
-
-        listFiles = myzip.namelist()
-
-        myzip.extract(listFiles[5])
-        data = pd.read_csv(listFiles[5], low_memory=False)
-
+    data = pd.read_json(URL)
     return data
 
 
@@ -63,26 +54,33 @@ def us_census_formatter(data):
         data(pandas.DataFrame): Data as returned by us_census_connector.
 
     Description:
-        - Drop unnecessary columns and set index to county
-        - Make column values more readable
+        - Set columns
+        - Rename and lower column names
+        - Format dates
+        - Enrich state and county data
 
     Returns:
         pandas.DataFrame
     """
-
-    data.columns = data.iloc[0]
+    columns = list(data.iloc[0].map(lambda column: column.lower()))
+    columns[columns.index('lastupdate')] = 'last_update'
+    columns[columns.index('pop')] = 'population_estimate'
+    data.columns = columns
     data.drop(0, inplace=True)
-    data.drop("id", axis=1, inplace=True)
-    data = data.set_index('Geographic Area Name')
-
-    cols = [c for c in data.columns if '2018' in c]
-    data = data[cols]
-    data.columns = [x.split("!!")[-1] for x in data.columns]
-
-    data = data.replace("N", 0.0)
-    data.columns = [x.lower() for x in data.columns]
-
-    data.drop(data.columns[-1], axis=1, inplace=True)
-    data.drop(data.columns[-1], axis=1, inplace=True)
+    data["county"] = data["state"] + data["county"]
+    data["sub_region"] = data["county"].apply(
+        lambda county: county_fips_to_name(county))
+    data["region"] = data["state"].apply(lambda state: state_fips_to_name(state))
+    data["country"] = 'USA'
+    data['last_update'] = pd.to_datetime(data.last_update)
+    cols_ordered = [
+        'country', 'region', 'sub_region',
+        'last_update', 'population_estimate', 'density',
+    ]
+    data = data.reindex(columns=cols_ordered)
+    data = data.astype({
+        'population_estimate': 'float32',
+        'density': 'float32'
+    })
 
     return data
diff --git a/tests/data_sources/us_census/test_formatter.py b/tests/data_sources/us_census/test_formatter.py
@@ -0,0 +1,20 @@
+from unittest import TestCase
+
+import pandas as pd
+
+from task_geo.data_sources.demographics.us_census.us_census import us_census_formatter
+from task_geo.testing import check_dataset_format
+
+
+class TestUsCensusFormatter(TestCase):
+
+    def test_formatter(self):
+        """Validate formatter output for datasource US census."""
+        # Setup
+        fixture = pd.read_json('tests/fixtures/us_census.json')
+
+        # Run
+        data = us_census_formatter(fixture)
+
+        # Check
+        check_dataset_format(data)