diff --git a/requirements.txt b/requirements.txt
index cf307f1..76e0396 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,9 @@
+gspread==6.1.2
 hubspot-api-client==8.2.1
 notion-client==2.2.1
 numpy==1.26.4
 pandas==1.3.4
 pyodbc==5.1.0
+pytest==8.3.2
 python-slugify==8.0.4
 requests==2.32.3
diff --git a/wherescape/connectors/gsheet/__init__.py b/wherescape/connectors/gsheet/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/wherescape/connectors/gsheet/create_metadata.py b/wherescape/connectors/gsheet/create_metadata.py
new file mode 100644
index 0000000..f7f9282
--- /dev/null
+++ b/wherescape/connectors/gsheet/create_metadata.py
@@ -0,0 +1,88 @@
+import logging
+from datetime import datetime, UTC
+
+from ...helper_functions import create_column_names, create_display_names, prepare_metadata_query
+from ...wherescape import WhereScape
+from .gsheets_wrapper import Gsheet
+from .gsheets_parsing import parse_gspread_arguments
+
+
+def python_gsheet_create_metadata():
+    """
+    Create a load table in WhereScape based on the data in a provided
+    Google sheet file.
+    """
+    start_time = datetime.now(tz=UTC)
+    # Initialize Wherescape
+    logging.info("Connecting to WhereScape")
+    wherescape_instance = WhereScape()
+    logging.info(
+        "Start time: %s for gsheet_create_metadata." % start_time.strftime("%Y-%m-%d %H:%M:%S")
+    )
+
+    load_table_name = wherescape_instance.table
+    # lt_file_path holds the workbook URL; lt_file_name holds the argument string.
+    url = wherescape_instance.query_meta(
+        "select lt_file_path from ws_load_tab where lt_table_name = ?",
+        [load_table_name],
+    )[0][0]
+    workbook_details = wherescape_instance.query_meta(
+        "select lt_file_name from ws_load_tab where lt_table_name = ?",
+        [load_table_name],
+    )[0][0]
+    logging.info(f"Metadata. URL: {url} ; Details : {workbook_details}")
+
+    args = parse_gspread_arguments(workbook_details)
+    if args.debug:
+        logging.warning("Debug mode on -> do not use for production.")
+
+    gsheet = Gsheet(args, url)
+
+    header_row = gsheet.get_header()
+    column_types = gsheet.get_column_types()
+    title = gsheet.get_worksheet().title
+    lt_obj_key = wherescape_instance.object_key
+
+    display_names = create_display_names(header_row)
+    column_names = create_column_names(header_row)
+    source_columns, comments = set_source_columns_and_comments(header_row)
+
+    sql = prepare_metadata_query(
+        lt_obj_key=lt_obj_key,
+        src_table_name=title,
+        columns=column_names,
+        display_names=display_names,
+        types=column_types,
+        comments=comments,
+        source_columns=source_columns,
+    )
+    logging.info(f"Stored details for {len(header_row)} columns")
+
+    wherescape_instance.push_to_meta(sql)
+    logging.info("--> Metadata updated. Table can be created.")
+
+    end_time = datetime.now(tz=UTC)
+    logging.info("End time: %s" % end_time.strftime("%Y-%m-%d %H:%M:%S"))
+    logging.info("Time elapsed: %s seconds" % (end_time - start_time).seconds)
+
+
+def set_source_columns_and_comments(header_row: list):
+    """
+    Function to determine source_column and comment values for the metadata.
+
+    Params:
+    - header_row (list): header values.
+
+    Returns:
+    - list: source_column values.
+    - list: comment values.
+    """
+    comments = []
+    source_columns = []
+
+    for value in header_row:
+        src_column_name = value.rstrip()
+        # Comments are capped at 1023 characters; single quotes escaped for SQL.
+        comments.append(src_column_name[0:1023].replace("'", "''"))
+        source_columns.append(src_column_name)
+
+    return source_columns, comments
diff --git a/wherescape/connectors/gsheet/gsheets_parsing.py b/wherescape/connectors/gsheet/gsheets_parsing.py
new file mode 100644
index 0000000..d875a24
--- /dev/null
+++ b/wherescape/connectors/gsheet/gsheets_parsing.py
@@ -0,0 +1,81 @@
+import argparse
+import logging
+import shlex
+
+from gspread.utils import a1_range_to_grid_range
+
+
+def parse_gspread_arguments(argument: str) -> argparse.Namespace:
+    """
+    Converts an argument string into an args object.
+
+    Parameters:
+    - argument (str): arguments for the parser collected in a string.
+
+    Returns:
+    - args (Namespace): object with all arguments provided stored within.
+
+    Raises:
+    - SystemExit: when argparse rejects the argument string.
+    - ValueError: when mutually exclusive options are combined.
+    """
+    if argument == "":
+        logging.info("No arguments provided. Using defaults.")
+
+    argument_list = shlex.split(argument)
+
+    parser = create_parser()
+
+    try:
+        args = parser.parse_args(argument_list)
+    except SystemExit:
+        # Re-raise: continuing here would leave `args` unbound and mask the real problem.
+        logging.error("There might be a mistake with the arguments. Ensure it's all correct.")
+        raise
+
+    if args.range:
+        args.range = args.range.upper()
+    if args.header_range:
+        args.header_range = args.header_range.upper()
+
+    logging.info(
+        f"workbook_name: {args.workbook_name}, sheet: {args.sheet}, range: {args.range}, hr: {args.header_range}, no_header: {str(args.no_header)}, debug: {args.debug}"
+    )
+
+    # Invalid combinations are hard errors: continuing would load the wrong data.
+    if args.header_range and args.no_header:
+        raise ValueError(
+            "You cannot specify both a header_range and --no_header in the object source File Name."
+        )
+    if args.header_range and not args.range:
+        raise ValueError(
+            "A --header_range can not be specified without specifying a --range."
+        )
+
+    if args.header_range and args.range:
+        row_index_header_range = a1_range_to_grid_range(args.header_range).get(
+            "startRowIndex"
+        )
+        row_index_range = a1_range_to_grid_range(args.range).get("startRowIndex")
+        if row_index_header_range != row_index_range:
+            logging.warning(
+                "If both a range and a header_range are specified, they should overlap."
+            )
+    return args
+
+
+def create_parser():
+    """
+    Method to create the parser with arguments for workbook_details.
+
+    Return:
+    - parser containing possible args.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "workbook_name", help="Name of the Google Sheet/ workbook", default=None
+    )  # positional argument
+    parser.add_argument("--sheet", help="Name of the sheet in the workbook")
+    parser.add_argument("--range", help="Cell range to retrieve")
+    parser.add_argument("--header_range", help="Cell range to be used as header")
+    parser.add_argument(
+        "--no_header", action="store_true", help="Specify if table has no header"
+    )
+    parser.add_argument(
+        "-d", "--debug", action="store_true", help="Print debug messages"
+    )
+    return parser
diff --git a/wherescape/connectors/gsheet/gsheets_wrapper.py b/wherescape/connectors/gsheet/gsheets_wrapper.py
new file mode 100644
index 0000000..14d8480
--- /dev/null
+++ b/wherescape/connectors/gsheet/gsheets_wrapper.py
@@ -0,0 +1,259 @@
+import gspread
+import logging
+import os
+import sys
+
+from datetime import datetime
+from gspread.client import Client
+from gspread.spreadsheet import Spreadsheet
+from gspread.worksheet import Worksheet
+from gspread.exceptions import (
+    SpreadsheetNotFound,
+    WorksheetNotFound,
+    APIError,
+)
+
+from ...helper_functions import remove_empty_rows_and_columns, get_python_type
+
+
+class Gsheet:
+    def __init__(self, args=None, url: str = "", test: bool = False):
+        """
+        Init for Gsheet. Sets all variables.
+
+        Args:
+        - args: args provided for processing correct data
+        - url (str): link to the spreadsheet being uploaded
+        - test (bool): False by default. Set True in tests to not require args.
+        """
+        self._client: Client = _authorize()
+        if test:
+            logging.warning("Marked as TEST. no params will be set")
+        elif args:
+            self.set_gsheet_variables(url, args)
+        else:
+            logging.error("No args provided")
+
+    def set_gsheet_variables(self, url: str, args):
+        """
+        Function to set all of the variables for gsheet.
+        """
+        # Set spreadsheet and worksheet.
+        self._set_spreadsheet(url=url, name=args.workbook_name)
+        self._set_worksheet(title=args.sheet)
+
+        # Set all content.
+        self._set_content(args.range)
+        self._set_header(args.no_header, args.header_range)
+        self._set_column_types()
+
+    def _set_spreadsheet(self, url: str = "", name: str = ""):
+        """
+        Attempts to retrieve the spreadsheet from google drive.
+        Requires either url or name.
+
+        Params:
+        - url (str): url to the spreadsheet document.
+        - name (str): name of the spreadsheet document.
+
+        Raises:
+        - SpreadsheetNotFound: when neither url nor name resolves to a sheet.
+        - PermissionError: when access to the sheet has not been granted.
+        """
+        try:
+            if url:
+                try:
+                    self.spreadsheet = self._client.open_by_url(url)
+                    logging.info("spreadsheet file has been obtained.")
+                    return
+                except SpreadsheetNotFound:
+                    logging.error("Invalid URL")
+
+            if name:
+                try:
+                    self.spreadsheet = self._client.open(name)
+                    logging.info("spreadsheet file has been obtained.")
+                    return
+                except SpreadsheetNotFound:
+                    logging.error("Invalid workbook name")
+
+            logging.error("Enter a valid workbook URL or workbook name")
+            # Raised when both url and name don't find a spreadsheet.
+            raise SpreadsheetNotFound
+
+        except PermissionError as pe:
+            logging.error("Invalid Permissions, make sure access is granted.")
+            raise pe
+
+    def get_spreadsheet(self) -> Spreadsheet:
+        """
+        Getter for Spreadsheet.
+
+        Returns:
+        - Spreadsheet.
+        """
+        return self.spreadsheet
+
+    def _set_worksheet(self, title: str | None = None):
+        """
+        Sets the worksheet based on the given title.
+        A spreadsheet needs to have already been set.
+
+        Args:
+        - title (str): name of the worksheet; when None the first sheet is used.
+
+        Raises:
+        - SpreadsheetNotFound: when no spreadsheet has been set yet.
+        - WorksheetNotFound: when no worksheet matches the title.
+        """
+        try:
+            if title is None:
+                logging.info("No name was provided. Using the first sheet")
+                self.worksheet = self.spreadsheet.get_worksheet(0)
+            else:
+                self.worksheet = self.spreadsheet.worksheet(title)
+        except AttributeError as ae:
+            logging.error("No spreadsheet was provided")
+            # Chain the original error so the root cause stays visible.
+            raise SpreadsheetNotFound from ae
+        except WorksheetNotFound:
+            logging.warning(f"No worksheet was found with {title}")
+            raise
+
+        logging.info(f"worksheet found with title {self.worksheet.title}")
+
+    def get_worksheet(self) -> Worksheet:
+        """
+        Getter for worksheet.
+
+        Returns:
+        - Worksheet
+        """
+        return self.worksheet
+
+    def _set_content(self, cell_range: str | None = None):
+        """
+        Retrieves content from gsheet and removes empty rows and columns.
+
+        Params:
+        - cell_range (str): (optional) A1-notation range to retrieve.
+        """
+        if cell_range:
+            try:
+                content = self.worksheet.get(cell_range)
+            except APIError as e:
+                logging.error(f"Invalid range: {cell_range}")
+                raise e
+        else:
+            content = self.worksheet.get_all_values()
+        self.content = remove_empty_rows_and_columns(content)
+
+    def get_content(self) -> list:
+        """
+        Getter for content.
+
+        Returns:
+        - List of lists containing the content of the sheet.
+        """
+        return self.content
+
+    def _set_header(
+        self,
+        no_header: bool | None = None,
+        header_range: str | None = None,
+    ):
+        """
+        Creates the header for the content. Takes it from content if no_header is False or None.
+
+        Params:
+        - no_header (bool): (optional) input of args.no_header
+        - header_range (str): (optional) range of the header, example: "A1:B5"
+        """
+        # Set content if none is set yet (getattr: attribute may not exist at all).
+        if getattr(self, "content", None) is None:
+            self._set_content()
+
+        if not no_header:
+            if header_range:
+                try:
+                    row = self.worksheet.get(header_range)[0]
+                    self.header = ["column_" + str(i + 1) if value == "" else value for i, value in enumerate(row)]
+                    # NOTE(review): assumes the header row is also the first content row — confirm ranges overlap.
+                    self.content.pop(0)
+                except APIError as e:
+                    logging.error(f"Invalid Header range: {header_range}")
+                    raise e
+            else:
+                row = self.content.pop(0)
+                self.header = ["column_" + str(i + 1) if value == "" else value for i, value in enumerate(row)]
+        else:
+            # No header present: generate column_1..column_n placeholders.
+            self.header = ["column_" + str(i + 1) for i in range(len(self.content[0]))]
+
+    def get_header(self) -> list:
+        """
+        Getter for the header.
+
+        Returns:
+        - List containing header values.
+        """
+        return self.header
+
+    def _set_column_types(self):
+        """
+        Set a list with the postgres type for each column in content.
+        """
+        # getattr: neither attribute exists before its setter has run.
+        if getattr(self, "content", None) is None or not getattr(self, "header", None):
+            self._set_header()  # will also set content
+
+        postgres_types = []
+
+        for c in range(len(self.header)):
+            # dss_* audit columns have fixed types.
+            if self.header[c] == "dss_record_source":
+                postgres_types.append("varchar(256)")
+            elif self.header[c] == "dss_load_date":
+                postgres_types.append("timestamp")
+            else:
+                column_values = [self.content[r][c] for r in range(len(self.content))]
+                if len(column_values) == 0:
+                    postgres_types.append("text")
+                else:
+                    python_type = get_python_type(column_values)
+                    if python_type == int or python_type == float:
+                        postgres_types.append("numeric")
+                    elif python_type == datetime:
+                        postgres_types.append("timestamp")
+                    elif python_type == bool:
+                        postgres_types.append("bool")
+                    else:
+                        postgres_types.append("text")
+
+        self.column_types = postgres_types
+
+    def get_column_types(self) -> list:
+        """
+        Get column types. Calls setter if not yet set.
+
+        Returns:
+        - list of column types
+        """
+        if not getattr(self, "column_types", None):
+            self._set_column_types()
+        return self.column_types
+
+
+def _authorize() -> Client:
+    """
+    Authorizes access for the service-account user.
+
+    Returns:
+    - Client for authorization when handling data
+    """
+    json_keyfile = _read_secret()
+    return gspread.auth.service_account(json_keyfile)
+
+
+def _read_secret() -> str:
+    """
+    Locates the json file with the secret depending on the OS.
+
+    Returns:
+    - path to secret file.
+    """
+    if sys.platform == "win32":
+        datapath = os.getenv("APPDATA")
+        return os.path.join(datapath, "gspread", "google-drive-client-secret.json")
+    else:
+        datapath = os.path.expanduser("~")
+        return os.path.join(datapath, ".gspread", "google-drive-client-secret.json")
diff --git a/wherescape/connectors/gsheet/load_data.py b/wherescape/connectors/gsheet/load_data.py
new file mode 100644
index 0000000..69c4c1d
--- /dev/null
+++ b/wherescape/connectors/gsheet/load_data.py
@@ -0,0 +1,178 @@
+import logging
+import re
+
+from datetime import datetime, UTC
+from itertools import zip_longest
+
+from .gsheets_wrapper import Gsheet
+from .gsheets_parsing import parse_gspread_arguments
+from ...helper_functions import set_date_to_ymd
+from ...wherescape import WhereScape
+
+
+def python_gsheet_load_data():
+    """
+    Loads content of a google sheet file into a table.
+    Relevant metadata must already be created.
+    """
+    start_time = datetime.now(tz=UTC)
+    logging.info("Start time: %s" % start_time.strftime("%Y-%m-%d %H:%M:%S"))
+
+    wherescape = WhereScape()
+    table_name = wherescape.table
+    column_names, column_types = wherescape.get_columns()
+    # Explicit raise instead of assert: asserts are stripped under `python -O`.
+    if column_names is None or column_types is None:
+        logging.error("no column_names or column_types found in wherescape")
+        raise ValueError("no column_names or column_types found in wherescape")
+
+    url = wherescape.query_meta(
+        "select lt_file_path from ws_load_tab where lt_table_name = ?",
+        [table_name],
+    )[0][0]
+    workbook_details = wherescape.query_meta(
+        "select lt_file_name from ws_load_tab where lt_table_name = ?",
+        [table_name],
+    )[0][0]
+
+    logging.info(f"Metadata. URL: {url} ; Details : {workbook_details}")
+
+    args = parse_gspread_arguments(workbook_details)
+    gsheet = Gsheet(args, url)
+    content = gsheet.get_content()
+    # For name consistency.
+    gsheet_header = gsheet.get_header()
+
+    # Missing from wherescape (added after metadata upload)
+    added_columns, added_indexes = get_missing_columns(column_names, gsheet_header)
+    content, gsheet_header = remove_extra_columns(content, gsheet_header, added_indexes)
+    if len(added_columns) > 0:
+        logging.warning(f"New columns in gsheet data: {added_columns}")
+
+    # Missing from upload (removed after metadata upload)
+    removed_columns, removed_indexes = get_missing_columns(gsheet_header, column_names)
+    content, gsheet_header = add_empty_columns(content, gsheet_header, removed_indexes, column_names)
+    if len(removed_columns) > 0:
+        logging.warning(f"Columns missing from gsheet data: {removed_columns}")
+
+    dss_title = gsheet.get_spreadsheet().title.replace(" ", "_")
+    load_stamp = start_time.strftime("%Y-%m-%d %H:%M:%S.%f")
+    for row in content:
+        # Add content for dss columns
+        row.append(dss_title)
+        row.append(load_stamp)
+
+    # Normalise timestamp columns to YYYY-mm-dd. Transpose once instead of
+    # once per timestamp column.
+    transposed = [list(col) for col in zip_longest(*content, fillvalue=None)]
+    for i in range(len(gsheet_header)):
+        if column_types[i] == "timestamp":
+            transposed[i] = [set_date_to_ymd(value) for value in transposed[i]]
+    content = [list(row) for row in zip_longest(*transposed, fillvalue=None)]
+
+    column_names_string = ",".join(column_names)
+    question_mark_string = ",".join("?" for _ in column_names)
+    sql = f"INSERT INTO {table_name} ({column_names_string}) VALUES ({question_mark_string})"
+
+    wherescape.push_many_to_target(sql, content)
+    logging.info(f"{len(content)} rows successfully inserted in {table_name} from google data.")
+
+    # Final logging
+    end_time = datetime.now(tz=UTC)
+    logging.info(
+        "Time elapsed: %s seconds for gsheet_load_data"
+        % (end_time - start_time).seconds
+    )
+
+
+def get_missing_columns(input_header: list, expected: list) -> tuple:
+    """
+    Returns columns that are in expected and not in input_header.
+    dss_record_source and dss_load_date are expected to be missing and not reported.
+
+    Args:
+    - input_header (list): list of strings to check
+    - expected (list): list of strings expected in input_header (larger if any are missing)
+
+    Returns:
+    - columns (list): list of columns unexpectedly missing.
+    - indexes (list): list of index values of missing columns.
+    """
+    columns = []
+    indexes = []
+    # enumerate instead of list.index so duplicate names map to the right position.
+    for index, column in enumerate(expected):
+        if column not in input_header:
+            if column not in ["dss_record_source", "dss_load_date"]:
+                columns.append(column)
+                indexes.append(index)
+    return columns, indexes
+
+
+def add_empty_columns(content: list, header: list, indexes: list, full_header: list) -> tuple:
+    """
+    Adds columns where columns are missing to both the header and the content.
+
+    Params:
+    - content (list): full content to add empty columns to.
+    - header (list): header to add missing headers to.
+    - indexes (list): indexes of missing columns.
+    - full_header (list): expected list containing correct header names.
+
+    Returns:
+    - content (list): content including new columns for missing fields.
+    - header (list): header including new column names for missing fields.
+    """
+    transposed = [list(i) for i in zip_longest(*content, fillvalue=None)]
+    for i in indexes:
+        # dss columns are appended later; do not insert placeholders for them.
+        if full_header[i] in ["dss_record_source", "dss_load_date"]:
+            continue
+        transposed = transposed[:i] + [[None for _ in range(len(content))]] + transposed[i:]
+        header = header[:i] + [full_header[i]] + header[i:]
+    content = [list(i) for i in zip_longest(*transposed, fillvalue=None)]
+
+    return content, header
+
+
+def remove_extra_columns(content: list, header: list, indexes: list) -> tuple:
+    """
+    Remove columns from content that aren't listed for its destination.
+
+    Params:
+    - content (list): full content to remove unlisted columns from.
+    - header (list): header to remove unlisted headers from.
+    - indexes (list): indexes of unlisted columns.
+
+    Returns:
+    - content (list): content excluding columns for unlisted fields.
+    - header (list): header excluding columns for unlisted fields.
+    """
+    transposed = [list(i) for i in zip_longest(*content, fillvalue=None)]
+    # Delete from the highest index down so earlier deletions don't shift later ones.
+    indexes.reverse()
+    for i in indexes:
+        del transposed[i]
+        del header[i]
+    content = [list(i) for i in zip_longest(*transposed, fillvalue=None)]
+    return content, header
+
+
+def remove_final_digits(headers: list[str]) -> list:
+    """
+    Removes _000 (or other digits) from the end of words in a list if they are there.
+    This makes comparing easier since the numbers might differ when columns differ.
+
+    Params:
+    - headers (list): header to remove extra digits from.
+
+    Returns:
+    - result (list): new header without the _000 suffixes.
+    """
+    result = []
+    for header in headers:
+        # Append the single header (not the whole list) with the suffix stripped.
+        if re.search(pattern=r"_\d{3}$", string=header) is None:
+            result.append(header)
+        else:
+            result.append(header[:-4])
+    return result
diff --git a/wherescape/connectors/gsheet/readme.md b/wherescape/connectors/gsheet/readme.md
new file mode 100644
index 0000000..a2f25c2
--- /dev/null
+++ b/wherescape/connectors/gsheet/readme.md
@@ -0,0 +1,50 @@
+# Gsheet connector
+
+Gsheet Connector for WhereScape. Takes care of creating metadata for a load table and uploading the data from a gsheet file.
+
+## WhereScape Parameters
+
+## Connection Details
+An authentication user is required from Wherescape. For this, a client secret has to be created in the Google API Console.
+Its secret should be stored in `%APPDATA%\gspread` for Windows or in `~/.gspread` for Unix.
+
+The default scopes for this client secret are:
+
+```
+DEFAULT_SCOPES = [
+    'https://www.googleapis.com/auth/spreadsheets',
+    'https://www.googleapis.com/auth/drive'
+]
+```
+
+## Load table
+
+Create 1 load table with a script based load.
+Under Source, ensure a link is set to the file in the `Source Directory` and any desired arguments are provided in `Source File Name`.
+
+## Host script
+Create a new python host script and add it to the load table. Example code:
+
+```
+from wherescape_os.wherescape.connectors.gsheet.create_metadata import python_gsheet_create_metadata
+
+python_gsheet_create_metadata()
+```
+
+Host scripts to create:
+* python_gsheet_create_metadata
+* python_gsheet_load_data
+
+# Usage
+
+First attach the metadata host script to the load table and ensure there's no pre-load action set.
+After creating the table, attach the load_data host script to the load table and set pre-load to truncate.
diff --git a/wherescape/connectors/gsheet/tests/test_gsheets_wrapper.py b/wherescape/connectors/gsheet/tests/test_gsheets_wrapper.py
new file mode 100644
index 0000000..20c1322
--- /dev/null
+++ b/wherescape/connectors/gsheet/tests/test_gsheets_wrapper.py
@@ -0,0 +1,162 @@
+from datetime import datetime
+import pytest
+
+from wherescape.connectors.gsheet.gsheets_wrapper import Gsheet
+from wherescape.helper_functions import get_python_type
+from gspread.exceptions import SpreadsheetNotFound, WorksheetNotFound
+
+
+NO_ACCESS_URL = "https://docs.google.com/spreadsheets/d/1lhrCYDeMpX8DUdoI_JC4hEmhhVqUUOkCIkTKgHpcN0o/edit?usp=drive_link"
+NO_ACCESS_NAME = "no access"
+BASIC_FILE_URL = "https://docs.google.com/spreadsheets/d/1O8BhaD385kPxxQGUeyU0DoGPLyBQITFCihje2av0POk/edit?usp=drive_link"
+BASIC_FILE_NAME = "basic data file"
+DIFF_START_CELL_URL = "https://docs.google.com/spreadsheets/d/15W15G9ERorhGT5QhvO8IJVTWvtbQC9Puzxi3txmU9ZM/edit?usp=drive_link"
+DIFF_START_CELL_NAME = "middle_starter_cell_50_rows"
+FAKE_URL = "https://docs.google.com/spreadsheets/d/1pdqdoPnhdTIjccXstqK7ReWzFT0rxez3gdJItO9j0Ng/edit?usp=drive_link"
+BAD_URL = "https://docs.google.com/spreadsheets/d/1pdqdoPnhdTIjccXstqK7"
+
+
+class TestGsheet:
+    def setup_method(self, method):
+        self.gsheet = Gsheet(test=True)
+
+    def test_set_spreadsheet_on_url(self):
+        """
+        Test that a spreadsheet can be set using a valid url.
+        """
+        url = BASIC_FILE_URL
+        self.gsheet._set_spreadsheet(url=url)
+
+        assert self.gsheet.spreadsheet is not None
+
+    def test_set_spreadsheet_on_name(self):
+        """
+        Test that a spreadsheet can be set using a valid name.
+        """
+        name = BASIC_FILE_NAME
+        self.gsheet._set_spreadsheet(name=name)
+
+        assert self.gsheet.spreadsheet is not None
+
+    @pytest.mark.parametrize(
+        ("url", "exception"),
+        (
+            (NO_ACCESS_URL, PermissionError),
+            (FAKE_URL, SpreadsheetNotFound),
+            (BAD_URL, SpreadsheetNotFound),
+        )
+    )
+    def test_set_spreadsheet_errors(self, url, exception):
+        """
+        Test that the correct error is thrown given the url.
+        """
+        with pytest.raises(exception):
+            self.gsheet._set_spreadsheet(url=url)
+
+    def test_set_worksheet_no_input(self):
+        """
+        Test a worksheet is set without any inputs.
+        """
+        self.gsheet._set_spreadsheet(url=BASIC_FILE_URL)
+        self.gsheet._set_worksheet()
+
+        # Assert on the worksheet: that is what _set_worksheet is supposed to set.
+        assert self.gsheet.worksheet is not None
+
+    def test_set_worksheet_with_input(self):
+        """
+        Test worksheet gets set using the title of the worksheet.
+        """
+        title = "Second"
+        url = DIFF_START_CELL_URL
+        self.gsheet._set_spreadsheet(url=url)
+        self.gsheet._set_worksheet(title=title)
+
+        assert self.gsheet.worksheet.title == title
+
+    def test_set_worksheet_no_spreadsheet(self):
+        """
+        Test SpreadsheetNotFound thrown when no spreadsheet is set when
+        calling set_worksheet.
+        """
+        with pytest.raises(SpreadsheetNotFound):
+            self.gsheet._set_worksheet()
+
+    def test_set_worksheet_incorrect_input(self):
+        """
+        Test raises WorksheetNotFound if no worksheet was found with the given title.
+        """
+        self.gsheet._set_spreadsheet(url=BASIC_FILE_URL)
+        with pytest.raises(WorksheetNotFound):
+            self.gsheet._set_worksheet(title="sdfsffuysdkjfhsdkjf")
+
+    @pytest.mark.parametrize(
+        ("url", "rows", "columns"),
+        (
+            (BASIC_FILE_URL, 51, 7),
+            (DIFF_START_CELL_URL, 50, 7),
+        )
+    )
+    def test_set_content_has_all_content_of_sheet(self, url, rows, columns):
+        """
+        Check that set_content collects all the data of the sheet.
+        """
+        self.gsheet._set_spreadsheet(url=url)
+        self.gsheet._set_worksheet()
+        self.gsheet._set_content()
+        content = self.gsheet.content
+
+        assert len(content) == rows
+        assert len(content[0]) == columns
+
+    def test_set_header_gets_first_row_if_header(self):
+        """
+        Test that the header is taken from the first content row by default.
+        """
+        self.gsheet._set_spreadsheet(url=BASIC_FILE_URL)
+        self.gsheet._set_worksheet()
+        self.gsheet._set_content()
+        content = self.gsheet.get_content()
+        first_row = content[0]
+        self.gsheet._set_header()
+
+        assert self.gsheet.header == first_row
+
+    def test_set_header_no_header_given(self):
+        """
+        Test that placeholder column names are generated with no_header=True.
+        """
+        self.gsheet._set_spreadsheet(url=BASIC_FILE_URL)
+        self.gsheet._set_worksheet()
+        self.gsheet._set_content()
+        content = self.gsheet.get_content()
+        # Plain local (the original accidentally also assigned self.header on the test class).
+        expected = ["column_" + str(i + 1) for i in range(len(content[0]))]
+        self.gsheet._set_header(no_header=True)
+
+        assert self.gsheet.header == expected
+
+    def test_set_column_types(self):
+        self.gsheet._set_spreadsheet(url=BASIC_FILE_URL)
+        self.gsheet._set_worksheet()
+        self.gsheet._set_content()
+        self.gsheet._set_header()
+        self.gsheet._set_column_types()
+
+        expected = ["text", "numeric", "numeric", "text", "text", "text", "timestamp"]
+        assert len(self.gsheet.column_types) == len(expected)
+        assert self.gsheet.column_types == expected
+
+
+@pytest.mark.parametrize(
+    ("input", "expected_type"),
+    (
+        (["4", "1", "4562", "4567", "234634", "34532"], int),
+        (["dsjkfh", "skdjhf", "jkghsdf"], str),
+        (["43.6", "4534.34", "457424.644"], float),
+        (["2024-05-30", "2023-12-05", "2023-06-28"], datetime),
+        (["5461", "534.687", "74849"], float),
+        (["sdferw", "234234", "sdfjhs", "jghfd"], str),
+        (["2024-05-30", "sdffsdfwe", "", ""], str),
+        (["", "", ""], str),
+    )
+)
+def test_get_python_type(input, expected_type):
+    result = get_python_type(input)
+
+    assert result == expected_type
diff --git a/wherescape/connectors/hubspot/hubspot_wrapper.py b/wherescape/connectors/hubspot/hubspot_wrapper.py
index d50ed56..554f230 100644
--- a/wherescape/connectors/hubspot/hubspot_wrapper.py
+++ b/wherescape/connectors/hubspot/hubspot_wrapper.py
@@ -140,7 +140,7 @@ def get_object(self, record_id: str, hs_object: str, properties: list = []):
             response = basic_api.get_by_id(record_id, properties=properties)
             return response
         except api_error.ApiException as e:
-            logging.error("An exception occured when calling %s batch_api_>update\n %s" % (hs_object, e))
+            logging.error(f"An exception occurred when calling {hs_object} basic_api.get_by_id\n {e}")
@@ -183,7 +183,7 @@ def get_all(
         """
         results = []
         basic_api = getattr(self.client.crm, hs_object).basic_api
-        error_api = getattr(hubspot.crm, HubspotObjectEnum(hs_object))
+        error_api = getattr(hubspot.crm, hs_object)
         try:
             api_response = basic_api.get_page(properties=properties, limit=100)
             results.extend(api_response.results)
diff --git a/wherescape/connectors/hubspot/utils.py b/wherescape/connectors/hubspot/utils.py
index b749445..f130297 100644
--- a/wherescape/connectors/hubspot/utils.py
+++ b/wherescape/connectors/hubspot/utils.py
@@ -1,4 +1,3 @@
-
 def get_double_nerd_ids(tickets: list) -> list:
     """
    Function to retrieve all nerd ticket id that appear multiple times.
diff --git a/wherescape/helper_functions.py b/wherescape/helper_functions.py
index 56c2e2c..6318334 100644
--- a/wherescape/helper_functions.py
+++ b/wherescape/helper_functions.py
@@ -1,3 +1,6 @@
+import ast
+import json
+
 from dateutil.parser import parse
 from slugify import slugify
 
@@ -10,18 +13,36 @@ def create_column_names(display_names=[]):
     Columns get truncated to 59 characters, because 63 characters is the max
     column lenght for Postgres columns.
     """
-    i = 0
     columns = []
     for display_name in display_names:
         column = slugify(display_name, separator="_", max_length=59)
         if column == "":
             column = "column"
-        column = f"{column}_{str(i + 1).zfill(3)}"
         columns.append(column)
-        i += 1
     return columns
 
 
+def remove_empty_rows_and_columns(input: list) -> list:
+    """
+    Returns the list with the empty rows and columns removed.
+
+    Params:
+    - input (list): List of lists containing the content.
+
+    Returns:
+    - List of lists.
+    """
+    content = [row for row in input if not all(cell == "" for cell in row)]
+    # Transpose so empty columns become empty rows, then drop them.
+    content_transposed = [list(i) for i in zip(*content)]
+    content_transposed = [
+        row for row in content_transposed if not all(cell == "" for cell in row)
+    ]
+    # Transpose back to the original orientation.
+    return [list(i) for i in zip(*content_transposed)]
+
+
 def create_display_names(columns=[]):
     """
     Change column names in to display names.
@@ -120,13 +141,11 @@ def filter_dict(dict_to_filter, keys_to_keep):
     Returns:
         dict: The dict with only the key, value pairs you want to keep.
     """
-    return dict(
-        [
-            (key, dict_to_filter[key])
+    return {
+        key: dict_to_filter[key]
             for key in dict_to_filter
             if key in set(keys_to_keep)
-        ]
-    )
+    }
 
 
 def flatten_json(json_response, name_to_skip=None):
@@ -186,12 +205,13 @@ def fill_out_empty_keys(cleaned_json, keys_to_keep, overwrite):
         out[key] = cleaned_json[key]
     return out
 
+
 def is_date(string, fuzzy=False):
     """
     Return whether the string can be interpreted as a date.
 
     string: str, string to check for date
-    fuzzy: bool, ignore unknown tokens in string if True
+    fuzzy: bool, ignore unknown tokens in string if True.
     """
     try:
         parse(string, fuzzy=fuzzy)
@@ -201,3 +221,69 @@ def is_date(string, fuzzy=False):
         return False
     except OverflowError:
         return False
+
+
+def set_date_to_ymd(value: str | None) -> str | None:
+    """
+    Set the dateformat of a datetime string to YYYY-mm-dd.
+
+    Args:
+    - value (str): value to set dateformat for; None is passed through.
+
+    Returns:
+    - string of date of format YYYY-mm-dd
+    """
+    return parse(value).strftime("%Y-%m-%d") if value is not None else value
+
+
+def get_python_type(column_values: list) -> type:
+    """
+    Returns the Python type fit for the data in the list.
+
+    Params:
+    - column_values (list): list of the string values of one column.
+
+    Returns:
+    - type: bool, int, float, datetime or str.
+    """
+    if not column_values:
+        # An empty column carries no type information; default to str
+        # (previously an empty list fell through to bool).
+        return str
+
+    # Only treat the column as boolean when every value is a recognised flag.
+    if all(item in ["TRUE", "FALSE", "1", "0"] for item in column_values):
+        return bool
+
+    values = [convert_string(item) for item in column_values]
+    types = {type(item) for item in values}
+
+    if len(types) > 1:
+        # Mixed numeric types collapse to float; anything else falls back to str.
+        for value in values:
+            if not isinstance(value, (int, float)):
+                return str
+        return float
+    else:
+        return next(iter(types))
+
+
+def convert_string(value: str):
+    """
+    Determines the literal python type of a string value.
+
+    Params:
+    - value (str): value to determine literal type of.
+
+    Returns:
+    - Any value as its literal object type.
+    """
+    try:
+        return ast.literal_eval(value)
+    except (ValueError, SyntaxError):
+        try:
+            return json.loads(value)
+        except (ValueError, TypeError):
+            try:
+                return parse(value)
+            except (ValueError, OverflowError):
+                return value
diff --git a/wherescape/wherescape.py b/wherescape/wherescape.py
index 8894255..73a5371 100644
--- a/wherescape/wherescape.py
+++ b/wherescape/wherescape.py
@@ -106,7 +106,8 @@ def get_columns(self):
             sql = "SELECT sc_col_name, sc_data_type FROM ws_stage_col WHERE sc_obj_key = ? ORDER BY sc_order"
         else:
             logging.warning("Invalid schema: %s", self.schema)
-            return None
+            # Return a (names, types) tuple on failure so callers can unpack safely.
+            return (None, None)
         results = self.query_meta(sql, [self.object_key])
         if results: