Skip to content

Commit 1e0e693

Browse files
committed
POC for volgistics importer
1 parent 072661b commit 1e0e693

File tree

2 files changed

+114
-41
lines changed

2 files changed

+114
-41
lines changed

src/server/api/file_uploader.py

Lines changed: 32 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from donations_importer import validate_import_sfd
44
from flask import current_app
55
from models import ManualMatches, SalesForceContacts, ShelterluvPeople, Volgistics
6-
from shifts_importer import validate_import_vs
6+
from shifts_importer import open_volgistics, validate_import_vs, volgistics_people_import
77
from werkzeug.utils import secure_filename
88

99
import structlog
@@ -26,39 +26,43 @@ def determine_upload_type(file, file_extension, conn):
2626
# automatically pulling from vendor APIs directly, in which case we'd know
2727
# what kind of data we had.
2828
if file_extension == "csv":
29-
logger.debug("File extension is CSV")
30-
df = pd.read_csv(file, dtype="string")
29+
logger.warn("%s: We no longer support CSV files", file.filename)
30+
# df = pd.read_csv(file, dtype="string")
3131

32-
if {"salesforcecontacts", "volgistics", "shelterluvpeople"}.issubset(df.columns):
33-
logger.debug("File appears to be salesforcecontacts, volgistics, or shelterluvpeople (manual)")
34-
ManualMatches.insert_from_df(df, conn)
35-
return
36-
elif {"Animal_ids", "Internal-ID"}.issubset(df.columns):
37-
logger.debug("File appears to be shelterluvpeople")
38-
ShelterluvPeople.insert_from_df(df, conn)
39-
return
32+
# if {"salesforcecontacts", "volgistics", "shelterluvpeople"}.issubset(df.columns):
33+
# logger.debug("File appears to be salesforcecontacts, volgistics, or shelterluvpeople (manual)")
34+
# ManualMatches.insert_from_df(df, conn)
35+
# return
36+
# elif {"Animal_ids", "Internal-ID"}.issubset(df.columns):
37+
# logger.debug("File appears to be shelterluvpeople")
38+
# ShelterluvPeople.insert_from_df(df, conn)
39+
return
4040

4141
if file_extension == "xlsx":
42-
excel_file = pd.ExcelFile(file)
43-
if {"Master", "Service"}.issubset(excel_file.sheet_names):
42+
# excel_file = pd.ExcelFile(file)
43+
# if {"Master", "Service"}.issubset(excel_file.sheet_names):
4444
logger.debug("File appears to be Volgistics")
4545
# Volgistics
46-
validate_import_vs(file, conn)
47-
Volgistics.insert_from_file(excel_file, conn)
46+
workbook = open_volgistics(file)
47+
validate_import_vs(workbook, conn)
48+
# Volgistics.insert_from_file(excel_file, conn)
49+
# Insert Volg people
50+
volgistics_people_import(workbook,conn)
51+
workbook.close()
4852
return
4953

50-
df = pd.read_excel(excel_file)
51-
if "Contact ID 18" in df.columns:
52-
# Salesforce something-or-other
53-
if "Amount" in df.columns:
54-
# Salesforce donations
55-
logger.debug("File appears to be Salesforce donations")
56-
validate_import_sfd(file, conn)
57-
return
58-
else:
59-
# Salesforce contacts
60-
logger.debug("File appears to be Salesforce contacts")
61-
SalesForceContacts.insert_from_file_df(df, conn)
62-
return
54+
# df = pd.read_excel(excel_file)
55+
# if "Contact ID 18" in df.columns:
56+
# # Salesforce something-or-other
57+
# if "Amount" in df.columns:
58+
# # Salesforce donations
59+
# logger.debug("File appears to be Salesforce donations")
60+
# validate_import_sfd(file, conn)
61+
# return
62+
# else:
63+
# # Salesforce contacts
64+
# logger.debug("File appears to be Salesforce contacts")
65+
# SalesForceContacts.insert_from_file_df(df, conn)
66+
# return
6367

6468
logger.error("Don't know how to process file: %s", file.filename)

src/server/shifts_importer.py

Lines changed: 82 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import re
22
from flask.globals import current_app
3-
3+
from datetime import datetime, timedelta
44
from openpyxl import load_workbook
55
from jellyfish import jaro_similarity
66

@@ -12,12 +12,16 @@
1212

1313
from sqlalchemy import insert, Table, Column, MetaData, exc
1414
from sqlalchemy.dialects.postgresql import Insert
15+
16+
from sqlalchemy.orm import sessionmaker
17+
18+
1519
metadata = MetaData()
1620

1721

1822
MINIMUM_SIMILARITY = 0.85 # How good does the table match need to be?
1923

20-
expected_columns = {
24+
expected_shifts_columns = {
2125
'Number' : 'volg_id',
2226
'Site' : 'site',
2327
'Place' : None,
@@ -33,29 +37,38 @@
3337
'Volunteers' : None
3438
}
3539

36-
def validate_import_vs(filename, conn):
40+
41+
def open_volgistics(filename):
42+
logger.info("Loading '%s' - this is slow", filename.filename )
43+
start = datetime.now()
44+
wb = load_workbook(filename) # ,read_only=True should be faster but gets size incorrect
45+
end = datetime.now()
46+
logger.info("Loaded '%s' complete in %d seconds", filename.filename, (end-start).seconds )
47+
return wb
48+
49+
def validate_import_vs(workbook, conn):
3750
""" Validate that the XLSX column names in the file are close enough to expectations that we can trust the data.
3851
If so, insert the data into the volgisticsshifts table.
3952
"""
4053

41-
logger.info('------ Loading %s ', filename.filename )
42-
wb = load_workbook(filename) # ,read_only=True should be faster but gets size incorrect
43-
ws = wb['Service'] # Needs to be 'Service' sheet
54+
# logger.info('------ Loading %s ', filename.filename )
55+
# wb = load_workbook(filename) # ,read_only=True should be faster but gets size incorrect
56+
ws = workbook['Service'] # Needs to be 'Service' sheet
4457
# ws.reset_dimensions() # Tells openpyxl to ignore what sheet says and check for itself
4558
ws.calculate_dimension()
4659

4760
columns = ws.max_column
4861
if columns > 26:
49-
# TODO: Handle AA, AB, usw...
50-
logger.warn("Column count > 26; columns after Z not processed")
62+
# Only 13 actually populated
63+
logger.info("Column count > 26; columns after Z not processed")
5164
columns = 26
5265

5366
header = [cell.value for cell in ws[1]]
5467

5568
min_similarity = 1.0
5669
min_column = None
5770

58-
for expected, got in zip(expected_columns.keys(), header):
71+
for expected, got in zip(expected_shifts_columns.keys(), header):
5972
jsim = jaro_similarity(expected, got)
6073
if jsim < min_similarity :
6174
min_similarity = jsim
@@ -83,9 +96,11 @@ def validate_import_vs(filename, conn):
8396
for row in ws.values:
8497
if seen_header:
8598
row_count += 1
86-
if row_count % 1000 == 0:
99+
if (row_count % 1000 == 0) and (row_count % 5000 != 0):
87100
logger.debug("Row: %s", str(row_count) )
88-
zrow = dict(zip(expected_columns.values(), row))
101+
if row_count % 5000 == 0:
102+
logger.info("Row: %s", str(row_count) )
103+
zrow = dict(zip(expected_shifts_columns.values(), row))
89104
# zrow is a dict of db_col:value pairs, with at most one key being None (as it overwrote any previous)
90105
# We need to remove the None item, if it exists
91106
try:
@@ -132,5 +147,59 @@ def validate_import_vs(filename, conn):
132147

133148
logger.info("Total rows: %s Dupes: %s Missing volgistics id: %s", str(row_count), str(dupes), str(missing_volgistics_id) )
134149
logger.info("Other integrity exceptions: %s Other exceptions: %s", str(other_exceptions), str(other_integrity) )
135-
wb.close()
136-
return { True : "File imported" }
150+
# workbook.close()
151+
return { True : "File imported" }
152+
153+
154+
def volgistics_people_import(workbook,conn):
155+
156+
ws = workbook['Master'] # Needs to be 'Master' sheet
157+
# ws.reset_dimensions() # Tells openpyxl to ignore what sheet says and check for itself
158+
ws.calculate_dimension()
159+
160+
columns = ws.max_column
161+
162+
#TODO: Validate header row to ensure source cols haven't changed
163+
164+
Session = sessionmaker(engine)
165+
session = Session()
166+
metadata = MetaData()
167+
volg_table = Table("volgistics", metadata, autoload=True, autoload_with=engine)
168+
169+
170+
# Cells are addressed as ws[row][col] with row being 1-based and col being 0-based
171+
172+
insert_list = []
173+
174+
#TODO: Create a dict from header row so we can reference r["number"] instead of r[15]
175+
176+
177+
for r in ws.iter_rows(min_row=2, max_col=42,values_only=True):
178+
insert_list.append(
179+
{
180+
"number": r[15],
181+
"last_name": r[3],
182+
"first_name": r[4],
183+
"middle_name": r[5],
184+
"complete_address": r[16],
185+
"street_1": r[17],
186+
"street_2": r[18],
187+
"street_3": r[19],
188+
"city": r[20],
189+
"state": r[21],
190+
"zip": r[22],
191+
"all_phone_numbers": r[27],
192+
"home": r[28],
193+
"work": r[30],
194+
"cell": r[32],
195+
"email": r[41]
196+
}
197+
)
198+
199+
200+
ret = session.execute(volg_table.insert(insert_list))
201+
202+
session.commit() # Commit all inserted rows
203+
session.close()
204+
205+
logger.debug('%d rows inserted', ret.rowcount)

0 commit comments

Comments
 (0)