POC for volgistics importer

c-simpson · c-simpson · commit 1e0e693c5521 · 2023-03-06T21:48:59.000-05:00
diff --git a/src/server/api/file_uploader.py b/src/server/api/file_uploader.py
@@ -3,7 +3,7 @@
 from donations_importer import validate_import_sfd
 from flask import current_app
 from models import ManualMatches, SalesForceContacts, ShelterluvPeople, Volgistics
-from shifts_importer import validate_import_vs
+from shifts_importer import open_volgistics, validate_import_vs, volgistics_people_import
 from werkzeug.utils import secure_filename
 
 import structlog
@@ -26,39 +26,43 @@ def determine_upload_type(file, file_extension, conn):
     # automatically pulling from vendor APIs directly, in which case we'd know
     # what kind of data we had.
     if file_extension == "csv":
-        logger.debug("File extension is CSV")
-        df = pd.read_csv(file, dtype="string")
+        logger.warn("%s: We no longer support CSV files", file.filename)
+        # df = pd.read_csv(file, dtype="string")
 
-        if {"salesforcecontacts", "volgistics", "shelterluvpeople"}.issubset(df.columns):
-            logger.debug("File appears to be salesforcecontacts, volgistics, or shelterluvpeople (manual)")
-            ManualMatches.insert_from_df(df, conn)
-            return
-        elif {"Animal_ids", "Internal-ID"}.issubset(df.columns):
-            logger.debug("File appears to be shelterluvpeople")
-            ShelterluvPeople.insert_from_df(df, conn)
-            return
+        # if {"salesforcecontacts", "volgistics", "shelterluvpeople"}.issubset(df.columns):
+        #     logger.debug("File appears to be salesforcecontacts, volgistics, or shelterluvpeople (manual)")
+        #     ManualMatches.insert_from_df(df, conn)
+        #     return
+        # elif {"Animal_ids", "Internal-ID"}.issubset(df.columns):
+        #     logger.debug("File appears to be shelterluvpeople")
+        #     ShelterluvPeople.insert_from_df(df, conn)
+        return
 
     if file_extension == "xlsx":
-        excel_file = pd.ExcelFile(file)
-        if {"Master", "Service"}.issubset(excel_file.sheet_names):
+        # excel_file = pd.ExcelFile(file)
+        # if {"Master", "Service"}.issubset(excel_file.sheet_names):
             logger.debug("File appears to be Volgistics")
             # Volgistics
-            validate_import_vs(file, conn)
-            Volgistics.insert_from_file(excel_file, conn)
+            workbook = open_volgistics(file)
+            validate_import_vs(workbook, conn)
+            # Volgistics.insert_from_file(excel_file, conn)
+            # Insert Volg people
+            volgistics_people_import(workbook,conn)
+            workbook.close()
             return
 
-        df = pd.read_excel(excel_file)
-        if "Contact ID 18" in df.columns:
-            # Salesforce something-or-other
-            if "Amount" in df.columns:
-                # Salesforce donations
-                logger.debug("File appears to be Salesforce donations")
-                validate_import_sfd(file, conn)
-                return
-            else:
-                # Salesforce contacts
-                logger.debug("File appears to be Salesforce contacts")
-                SalesForceContacts.insert_from_file_df(df, conn)
-                return
+        # df = pd.read_excel(excel_file)
+        # if "Contact ID 18" in df.columns:
+        #     # Salesforce something-or-other
+        #     if "Amount" in df.columns:
+        #         # Salesforce donations
+        #         logger.debug("File appears to be Salesforce donations")
+        #         validate_import_sfd(file, conn)
+        #         return
+        #     else:
+        #         # Salesforce contacts
+        #         logger.debug("File appears to be Salesforce contacts")
+        #         SalesForceContacts.insert_from_file_df(df, conn)
+        #         return
 
     logger.error("Don't know how to process file: %s",  file.filename)
diff --git a/src/server/shifts_importer.py b/src/server/shifts_importer.py
@@ -1,6 +1,6 @@
 import re
 from flask.globals import current_app
-
+from datetime import datetime, timedelta 
 from openpyxl import load_workbook
 from jellyfish import jaro_similarity
 
@@ -12,12 +12,16 @@
 
 from sqlalchemy import  insert,  Table,  Column, MetaData, exc
 from sqlalchemy.dialects.postgresql import Insert
+
+from sqlalchemy.orm import sessionmaker
+
+
 metadata = MetaData()
 
 
 MINIMUM_SIMILARITY = 0.85  # How good does the table match need to be?
 
-expected_columns =  {
+expected_shifts_columns =  {
             'Number' : 'volg_id',
             'Site' : 'site',
             'Place' : None,
@@ -33,29 +37,38 @@
             'Volunteers' : None
             }
 
-def validate_import_vs(filename, conn):
+
+def open_volgistics(filename):
+    """ Load the uploaded Volgistics XLSX (an object exposing .filename) and return the openpyxl workbook; the caller close()s it. """
+    logger.info("Loading '%s' - this is slow",  filename.filename )
+    start = datetime.now()
+    wb = load_workbook(filename)   #  ,read_only=True should be faster but gets size incorrect 
+    logger.info("Loaded '%s' complete in %d seconds",  filename.filename, (datetime.now() - start).total_seconds() )  # total_seconds(): .seconds is only a component of the delta
+    return wb
+
+def validate_import_vs(workbook, conn):
     """ Validate that the XLSX column names int the file are close enough to expectations that we can trust the data.
         If so, insert the data into the volgisticsshifts table. 
     """
 
-    logger.info('------ Loading %s ',  filename.filename )
-    wb = load_workbook(filename)   #  ,read_only=True should be faster but gets size incorrect 
-    ws = wb['Service']   # Needs to be 'Service' sheet
+    # logger.info('------ Loading %s ',  filename.filename )
+    # wb = load_workbook(filename)   #  ,read_only=True should be faster but gets size incorrect 
+    ws = workbook['Service']   # Needs to be 'Service' sheet
     # ws.reset_dimensions()   # Tells openpyxl to ignore what sheet says and check for itself
     ws.calculate_dimension()
 
     columns = ws.max_column
     if columns > 26:
-        # TODO: Handle AA, AB, usw...
-        logger.warn("Column count > 26; columns after Z not processed")
+        # Only 13 actually populated 
+        logger.info("Column count > 26; columns after Z not processed")
         columns = 26
 
     header = [cell.value for cell in ws[1]]
 
     min_similarity = 1.0
     min_column = None
 
-    for expected, got in zip(expected_columns.keys(), header):
+    for expected, got in zip(expected_shifts_columns.keys(), header):
         jsim = jaro_similarity(expected, got) 
         if jsim < min_similarity :
             min_similarity = jsim
@@ -83,9 +96,11 @@ def validate_import_vs(filename, conn):
         for row in ws.values:        
             if seen_header: 
                 row_count += 1
-                if row_count % 1000 == 0:
+                if (row_count % 1000 == 0) and (row_count % 5000 != 0):
                     logger.debug("Row: %s", str(row_count) )
-                zrow = dict(zip(expected_columns.values(), row))  
+                if row_count % 5000 == 0:
+                    logger.info("Row: %s", str(row_count) )
+                zrow = dict(zip(expected_shifts_columns.values(), row))  
                 # zrow is a dict of db_col:value pairs, with at most one key being None (as it overwrote any previous)
                 # We need to remove the None item, if it exists
                 try:
@@ -132,5 +147,59 @@ def validate_import_vs(filename, conn):
 
         logger.info("Total rows: %s  Dupes: %s Missing volgistics id: %s",  str(row_count), str(dupes), str(missing_volgistics_id)  )
         logger.info("Other integrity exceptions: %s  Other exceptions: %s",  str(other_exceptions),  str(other_integrity) )
-        wb.close()
-        return { True : "File imported" }
+        # workbook.close()
+        return { True : "File imported" }
+    
+
+def volgistics_people_import(workbook,conn):
+    """ Bulk-insert every data row of the workbook's 'Master' sheet into the volgistics table.
+
+        workbook -- open openpyxl workbook (see open_volgistics); it is not closed here.
+        conn     -- currently unused; rows are written through a Session bound to the module's engine.
+
+        Nothing is written when the sheet has no data rows.
+    """
+    ws = workbook['Master']   # People live on the 'Master' sheet ('Service' holds the shifts)
+    # ws.reset_dimensions()   # Tells openpyxl to ignore what sheet says and check for itself
+    ws.calculate_dimension()
+
+    #TODO: Validate header row to ensure source cols haven't changed
+    #TODO: Create a dict from header row so can reference r["number"] instead of r[15]
+
+    # values_only=True yields one tuple per row, so r[n] is a 0-based column index;
+    # min_row=2 skips row 1 (the header row).
+    insert_list = [
+        {
+            "number": r[15],
+            "last_name": r[3],
+            "first_name": r[4],
+            "middle_name": r[5],
+            "complete_address": r[16],
+            "street_1": r[17],
+            "street_2": r[18],
+            "street_3": r[19],
+            "city": r[20],
+            "state": r[21],
+            "zip": r[22],
+            "all_phone_numbers": r[27],
+            "home": r[28],
+            "work": r[30],
+            "cell": r[32],
+            "email": r[41],
+        }
+        for r in ws.iter_rows(min_row=2, max_col=42, values_only=True)
+    ]
+
+    if not insert_list:
+        # Don't hand insert() an empty list - that is not guaranteed to be a no-op
+        logger.info("No rows found on 'Master' sheet; nothing inserted")
+        return
+
+    volg_table = Table("volgistics", MetaData(), autoload=True, autoload_with=engine)
+    session = sessionmaker(engine)()
+    try:
+        ret = session.execute(volg_table.insert(insert_list))
+        session.commit()  # Commit all inserted rows
+        logger.debug('%d rows inserted', ret.rowcount)
+    finally:
+        session.close()   # Always release the connection, even if the insert fails