Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3,607 changes: 3,607 additions & 0 deletions notebooks/data_sources/US_Census.ipynb

Large diffs are not rendered by default.

3,296 changes: 3,296 additions & 0 deletions task_geo/common/fips_codes.py

Large diffs are not rendered by default.

18 changes: 17 additions & 1 deletion task_geo/data_sources/demographics/us_census/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,27 @@
- Converts and exports to CSV
"""

import argparse

from task_geo.data_sources.demographics.us_census import us_census


def get_argparser():
    """Build the command-line parser for the US Census data source.

    Returns:
        argparse.ArgumentParser: parser exposing a single required option,
        ``-o/--output``, the path where the processed dataset is stored.
    """
    parser = argparse.ArgumentParser(
        description='Download the US Census dataset and export it as CSV.')

    parser.add_argument(
        '-o', '--output', required=True,
        help='Destination file to store the processed dataset.')

    return parser


def main():
    """Export the processed US Census dataset to the file given on the command line.

    Parses the command-line arguments with ``get_argparser``, builds the
    dataset with ``us_census`` and writes it as CSV (header included, index
    omitted) to the path passed via ``-o/--output``.
    """
    parser = get_argparser()
    args = parser.parse_args()

    # Build the dataset once and write it only to the requested destination;
    # no hard-coded output file is produced.
    dataset = us_census()
    dataset.to_csv(args.output, index=False, header=True)


if __name__ == '__main__':
Expand Down
44 changes: 44 additions & 0 deletions task_geo/data_sources/demographics/us_census/audit.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# US Census

## General information

- **Description**: Annual Population Estimates for the United States, States and Counties: April 1, 2010 to July 1, 2019.
- **Credits**: The United States Census Bureau https://data.census.gov/.
- **Source**: [API Source](https://api.census.gov/data/2019/pep/population?get=LASTUPDATE,POP,DENSITY,UNIVERSE&for=county:*&in=state:*)

## Description

**last_update**
- Description: Last update of estimation
- Type: datetime

**population_estimate**
- Description: Population estimate of county
- Type: integer

**density**
- Description: Population density of county
- Type: float

**state**
- Description: FIPS Code of county state
- Type: string

**county**
- Description: FIPS Code of county
- Type: string

**state_name**
- Description: Name of county state
- Type: string

**county_name**
- Description: Name of county
- Type: string

## Transformations applied

- Use the first row as the column names of the dataframe, then drop that row
- Rename the "lastupdate" and "pop" columns to the more explicit, well-formatted names "last_update" and "population_estimate"
- Create a column "state_name" with the state name of the FIPS code in "state" column
- Create a column "county_name" with the county name of the FIPS code in the "county" column
48 changes: 48 additions & 0 deletions task_geo/data_sources/demographics/us_census/datapackage.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
{
"title": "United States Census Bureau Population Estimation",
"description": "Annual Population Estimates for the United States, States and Counties: April 1, 2010 to July 1, 2019",
"licenses": [{"name": "copyright-authors"}],
"fields": [
{
"name": "last_update",
"description": "Last update of estimation",
"type": "datetime"
},
{
"name": "population_estimate",
"description": "Population estimate of county",
"type": "integer"
},
{
"name": "density",
"description": "Population density of county",
"type": "float"
},
{
"name": "state",
"description": "FIPS Code of county state",
"type": "string"
},
{
"name": "county",
"description": "FIPS Code of county",
"type": "string"
},
{
"name": "state_name",
"description": "Name of county state",
"type": "string"
},
{
"name": "county_name",
"description": "Name of county",
"type": "string"
}
],
"keywords": [
"US",
"population",
"estimation",
"census"
]
}
66 changes: 32 additions & 34 deletions task_geo/data_sources/demographics/us_census/us_census.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,21 @@
us_census.py

Functions:
- us_census_connector: Extracts data from CSV URL
- us_census_formatter: Cleans CSV data
- us_census_connector: Extracts data from JSON URL
- us_census_formatter: Cleans and formats data
- us_census: Combines the two previous functions

Data Credits:
The United States Census Bureau
https://data.census.gov/
"""

import urllib.request
import zipfile

import pandas as pd

url = 'https://data.census.gov/api/access/table/download?download_id=iuGrLXEBm-bIwvlxENnx'
from task_geo.common.fips_codes import county_fips_to_name, state_fips_to_name

# Key sent as the ``key`` query parameter of the Census API request.
# NOTE(review): this credential is committed in source control; consider
# loading it from configuration or the environment instead.
API_KEY = '5436a8b95e523baaa40c22ec906af88a93f405eb'

# Census API request (2019 ``pep/population`` dataset) asking for LASTUPDATE,
# POP and DENSITY for every county in every state. The key is appended from
# API_KEY so it is defined in a single place and carries no stray whitespace.
URL = (
    'https://api.census.gov/data/2019/pep/population?get=LASTUPDATE,POP,'
    'DENSITY&for=county:*&in=state:*&key=' + API_KEY
)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What are the usage limits and policies of this API?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to Terms of Service: "Right to Limit
Your use of the API may be subject to certain limitations on access, calls, or use as set forth within this Agreement or otherwise provided by the Census Bureau. If the Census Bureau reasonably believes that you have attempted to exceed or circumvent these limits, your ability to use the API may be temporarily or permanently blocked. The Census Bureau may monitor your use of the API to improve the service or to insure compliance with this Agreement."



def us_census():
Expand All @@ -37,22 +37,13 @@ def us_census_connector():


Description:
- Opens the zip file URL and extracts the correct CSV
- Correct CSV: ACS 5Y Statistics
- Read the dataset in JSON format

Returns:
data (DataFrame with JSON data)
"""

urllib.request.urlretrieve(url, "uscensus.zip")

with zipfile.ZipFile("uscensus.zip") as myzip:

listFiles = myzip.namelist()

myzip.extract(listFiles[5])
data = pd.read_csv(listFiles[5], low_memory=False)

data = pd.read_json(URL)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You don't need to assign the dataframe to a variable and then return it, you can return it directly:

return pd.read_json(URL)

return data


Expand All @@ -63,26 +54,33 @@ def us_census_formatter(data):
data(pandas.DataFrame): Data as returned by us_census_connector.

Description:
- Drop unnecessary columns and set index to county
- Make column values more readable
- Set columns
- Rename and lower column names
- Format dates
- Enrich state and county data

Returns:
pandas.DataFrame
"""

data.columns = data.iloc[0]
columns = list(data.iloc[0].map(lambda column: column.lower()))
columns[columns.index('lastupdate')] = 'last_update'
columns[columns.index('pop')] = 'population_estimate'
data.columns = columns
data.drop(0, inplace=True)
data.drop("id", axis=1, inplace=True)
data = data.set_index('Geographic Area Name')

cols = [c for c in data.columns if '2018' in c]
data = data[cols]
data.columns = [x.split("!!")[-1] for x in data.columns]

data = data.replace("N", 0.0)
data.columns = [x.lower() for x in data.columns]

data.drop(data.columns[-1], axis=1, inplace=True)
data.drop(data.columns[-1], axis=1, inplace=True)
data["county"] = data["state"] + data["county"]
data["sub_region"] = data["county"].apply(
lambda county: county_fips_to_name(county))
data["region"] = data["state"].apply(lambda state: state_fips_to_name(state))
data["country"] = 'USA'
data['last_update'] = pd.to_datetime(data.last_update)
cols_ordered = [
'country', 'region', 'sub_region',
'last_update', 'population_estimate', 'density',
]
data = data.reindex(columns=cols_ordered)
data = data.astype({
'population_estimate': 'float32',
'density': 'float32'
})

return data
20 changes: 20 additions & 0 deletions tests/data_sources/us_census/test_formatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from unittest import TestCase

import pandas as pd

from task_geo.data_sources.demographics.us_census.us_census import us_census_formatter
from task_geo.testing import check_dataset_format


class TestUsCensusFormatter(TestCase):
    """Format checks for the output of ``us_census_formatter``."""

    def test_formatter(self):
        """Formatter output for the US census fixture passes the dataset format check."""
        # Setup: load the stored sample of the raw data.
        raw = pd.read_json('tests/fixtures/us_census.json')

        # Run: apply the formatter under test.
        formatted = us_census_formatter(raw)

        # Check: delegate validation to the shared dataset-format helper.
        check_dataset_format(formatted)
Loading