From 4110f7899fa7cec0eeb77fee7c1fe58a560e0b4f Mon Sep 17 00:00:00 2001 From: Esther Zigterman Rustenburg Date: Mon, 18 Mar 2024 13:45:33 +0100 Subject: [PATCH 1/7] feat: add gsheet to os DATA-1762 --- requirements.txt | 5 + wherescape/connectors/gsheet/__init__.py | 0 .../connectors/gsheet/create_metadata.py | 89 ++++++ .../connectors/gsheet/gsheets_parsing.py | 80 ++++++ .../connectors/gsheet/gsheets_wrapper.py | 256 ++++++++++++++++++ wherescape/connectors/gsheet/load_data.py | 174 ++++++++++++ wherescape/connectors/gsheet/readme.md | 7 + .../connectors/gsheet/tests/fixtures.py | 8 + .../gsheet/tests/test_gsheets_wrapper.py | 229 ++++++++++++++++ .../connectors/hubspot/hubspot_wrapper.py | 4 +- wherescape/connectors/hubspot/utils.py | 1 - wherescape/helper_functions.py | 100 ++++++- 12 files changed, 944 insertions(+), 9 deletions(-) create mode 100644 wherescape/connectors/gsheet/__init__.py create mode 100644 wherescape/connectors/gsheet/create_metadata.py create mode 100644 wherescape/connectors/gsheet/gsheets_parsing.py create mode 100644 wherescape/connectors/gsheet/gsheets_wrapper.py create mode 100644 wherescape/connectors/gsheet/load_data.py create mode 100644 wherescape/connectors/gsheet/readme.md create mode 100644 wherescape/connectors/gsheet/tests/fixtures.py create mode 100644 wherescape/connectors/gsheet/tests/test_gsheets_wrapper.py diff --git a/requirements.txt b/requirements.txt index cf307f1..76e0396 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,12 @@ + +gspread==6.1.2 hubspot-api-client==8.2.1 notion-client==2.2.1 numpy==1.26.4 pandas==1.3.4 pyodbc==5.1.0 +pytest==8.3.2 python-slugify==8.0.4 requests==2.32.3 + + diff --git a/wherescape/connectors/gsheet/__init__.py b/wherescape/connectors/gsheet/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wherescape/connectors/gsheet/create_metadata.py b/wherescape/connectors/gsheet/create_metadata.py new file mode 100644 index 0000000..86075fe --- /dev/null +++ b/wherescape/connectors/gsheet/create_metadata.py @@ -0,0 +1,89 @@ +import logging +from datetime import datetime, UTC + +from ...helper_functions import create_column_names, create_display_names, prepare_metadata_query +from ...wherescape import WhereScape +from .gsheets_wrapper import Gsheet, set_gsheet_variables +from .gsheets_parsing import parse_gspread_arguments + + +def gsheet_create_metadata(): + """ + Function that creates a load table in Wherescape based on the data + in a provided Google sheet file. + """ + start_time = datetime.now(tz=UTC) + # Initialize Wherescape + logging.info("Connecting to WhereScape") + wherescape_instance = WhereScape() + logging.info( + "Start time: %s for gsheet_load_data_os." % start_time.strftime("%Y-%m-%d %H:%M:%S") + ) + gsheet: Gsheet = Gsheet() + + load_table_name = wherescape_instance.table + url = wherescape_instance.query_meta( + "select lt_file_path from ws_load_tab where lt_table_name = ?", + [load_table_name], + )[0][0] + workbook_details = wherescape_instance.query_meta( + "select lt_file_name from ws_load_tab where lt_table_name = ?", + [load_table_name], + )[0][0] + logging.info(f"Metadata. URL: {url} ; Details : {workbook_details}") + + args = parse_gspread_arguments(workbook_details) + if args.debug: + logging.warning("Debug mode on -> do not use for production.") + + set_gsheet_variables(gsheet, url, args) + + header_row = gsheet.get_header() + column_types = gsheet.get_column_types() + title = gsheet.get_worksheet().title + lt_obj_key = wherescape_instance.object_key + + display_names = create_display_names(header_row) + column_names = create_column_names(header_row) + source_columns, comments = set_source_columns_and_comments(header_row) + + sql = prepare_metadata_query( + lt_obj_key = lt_obj_key, + src_table_name = title, + columns=column_names, + display_names=display_names, + types=column_types, + comments=comments, + source_columns=source_columns, + ) + logging.info(f"Stored details for {len(header_row)} columns") + + wherescape_instance.push_to_meta(sql) + logging.info("--> Metadata updated. Table can be created.") + + end_time = datetime.now(tz=UTC) + logging.info("End time: %s" % end_time.strftime("%Y-%m-%d %H:%M:%S")) + logging.info("Time elapsed: %s seconds" % (end_time - start_time).seconds) + + +def set_source_columns_and_comments(header_row: list): + """ + Fuction to determine source_column and comments for metadata. + + Params: + header_row (list): header values. + + Returns: + - list: source_column values. + - list: comment values. + """ + comments = [] + source_columns = [] + + for value in header_row: + src_column_name = value.rstrip() + + comments.append(src_column_name[0:1023].replace("'", "''")) + source_columns.append(src_column_name) + + return source_columns, comments diff --git a/wherescape/connectors/gsheet/gsheets_parsing.py b/wherescape/connectors/gsheet/gsheets_parsing.py new file mode 100644 index 0000000..9e4e987 --- /dev/null +++ b/wherescape/connectors/gsheet/gsheets_parsing.py @@ -0,0 +1,80 @@ +import argparse +import logging +import shlex + +from gspread.utils import a1_range_to_grid_range + + +def parse_gspread_arguments(argument: str) -> argparse.Namespace: + """ + Converts an argument string into args object. + + Parameters: + - argument (str): arguments for the parser collected in a string. + + Returns + - args (Namespace): object with all arguments provided stored within. + """ + if argument == "": + logging.info("No arguments provided. Using defaults.") + + argument_list = shlex.split(argument) + + parser = create_parser() + + try: + args = parser.parse_args(argument_list) + except SystemExit as ex: + logging.error(ex) + + if args.range: + args.range = args.range.upper() + if args.header_range: + args.header_range = args.header_range.upper() + + logging.info( + f"workbook_name: {args.workbook_name}, sheet: {args.sheet}, range: {args.range}, hr: {args.header_range}, no_header: {str(args.no_header)}, debug: {args.debug}" + ) + + if args.header_range and args.no_header: + logging.error( + "You cannot specify both a header_range and --no_header in the object source File Name." + ) + if args.header_range and not args.range: + logging.error( + "A --header_range can not be specified without specifying a --range." + ) + + if args.header_range and args.range: + row_index_header_range = a1_range_to_grid_range(args.header_range).get( + "startRowIndex" + ) + row_index_range = a1_range_to_grid_range(args.range).get("startRowIndex") + if row_index_header_range == row_index_range: + logging.error( + "If both a range and a header_range are specified, they can not overlap." + ) + return args + + +def create_parser(): + """ + Method to create parser with arguments for workbook_details. + + Return: + - parser containing possible args. + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--workbook_name", action="store", help="Name of the Google Sheet/ workbook" + ) # positional argument + parser.add_argument("--sheet", help="Name of the sheet in the workbook", default="Sheet1") + parser.add_argument("--range", help="Cell range to retrieve") + parser.add_argument("--header_range", action="store_true", help="Cell range to be used as header") + parser.add_argument( + "--no_header", action="store_true", help="Specify if table has no header" + ) + parser.add_argument( + "-d", "--debug", action="store_true", help="Print debug messages" + ) + return parser diff --git a/wherescape/connectors/gsheet/gsheets_wrapper.py b/wherescape/connectors/gsheet/gsheets_wrapper.py new file mode 100644 index 0000000..5bec4e5 --- /dev/null +++ b/wherescape/connectors/gsheet/gsheets_wrapper.py @@ -0,0 +1,256 @@ +import gspread +import logging +import os +import sys + +from datetime import datetime +from gspread import ( + Client, + Spreadsheet, + SpreadsheetNotFound, + Worksheet, + WorksheetNotFound, +) + +from ...helper_functions import convert_string, remove_empty_rows_and_columns, get_python_type + + +class Gsheet: + def __init__(self): + self._client: Client = _authorize() + self.spreadsheet: Spreadsheet = None + self.worksheet: Worksheet = None + self.header: list = [] + self.content = None + self.column_types = None + + def set_spreadsheet(self, url: str = "", name: str = ""): + """ + Attempts to retreive the spreadsheet from google drive. + Requires either url or name. + + Params: + - url (str): url to the spreadsheet document. + - name (str): name of the spreadsheet document. + """ + try: + if url: + try: + self.spreadsheet = self._client.open_by_url(url) + logging.info("spreadsheet file has been obtained.") + return + except SpreadsheetNotFound as notFound: + logging.error("Invalid URL") + + if name: + try: + self.spreadsheet = self._client.open(name) + logging.info("spreadsheet file has been obtained.") + return + except SpreadsheetNotFound as notFound: + logging.error("Invalid workbook name") + + logging.error("Enter a valid workbook URL or workbook name") + # Raised when both url and name don't find a spreadsheet. + raise notFound + + except PermissionError as pe: + logging.error("Invalid Permissions, make sure access is granted.") + raise pe + + def get_spreadsheet( + self, url: str = "", name: str = "" + ) -> Spreadsheet: + """ + Getter for Spreadsheet. Calls setter if not already present. + + Args: + - url (str): (optional) link to the spreadsheet. + - name (str): (optional) name of spreadsheet. + + Returns: + - Spreadsheet. + """ + if self.spreadsheet is not None: + return self.spreadsheet + else: + self.set_spreadsheet(url, name) + return self.spreadsheet + + def set_worksheet(self, title: str = "Sheet1"): + """ + Sets the worksheet based on the given title. + A spreadsheet needs to have already been set. + + Args: + - title (str) : name of spreadsheet default value is standard among most + spreadsheet applications + """ + try: + if self.spreadsheet is None: + logging.error("No spreadsheet available to take the worksheet from.") + raise SpreadsheetNotFound + else: + self.worksheet = self.spreadsheet.worksheet(title) + except WorksheetNotFound: + logging.warning("Invalid worksheet title in --sheet") + try: + # using the automated name used by Apple. + # in case it was imported to google from there. + title = "Table1" + self.worksheet = self.spreadsheet.worksheet(title) + logging.info(f"worksheet found with title {title}") + except WorksheetNotFound as notFound: + logging.error("No worksheet was found") + raise notFound + logging.info(f"worksheet found with title {title}") + + def get_worksheet(self, title: str = "") -> Worksheet: + """ + Getter for worksheet. Calls setter if not already present. + + Params: + - title (str) : title of the worksheet + + Returns: + - Worksheet + """ + if self.worksheet: + pass + elif title: + self.set_worksheet(title) + else: + self.set_worksheet() + return self.worksheet + + def set_content(self): + """ + Retreives content from gsheet. removes empty rows. + """ + content = self.worksheet.get_all_values() + self.content = remove_empty_rows_and_columns(content) + + def get_content(self) -> list: + """ + Getter for content. Calls setter if not already present. + + Returns: + - List of lists containing the content of the sheet. + """ + if self.content is None: + self.set_content() + return self.content + + def set_header(self, no_header: str = None): + """ + Creates the header for the content. Takes it from content if no_header is False. + + Params: + - no_header (str): (optional) input of args.no_header (legacy naming) + """ + # Set content if none is set yet. + if self.content is None: + self.set_content() + + if not no_header: + row = self.content.pop(0) + self.header = ["column_" + str(i + 1) if value == "" else value for i, value in enumerate(row)] + else: + self.header = ["column_" + str(i + 1) for i in range(len(self.content[0]))] + + def get_header(self, no_header: str = None) -> list: + """ + Getter for the header. Calls setter if not already present. + + Params: + - no_header (str): (optional) input of args.no_header (legacy naming) + + Returns + - List containing header values. + """ + if self.header == []: + self.set_header(no_header) + return self.header + + def set_column_types(self): + """ + Set a list with the postgrestype for each column in content. + """ + if not self.content or self.header == []: + self.set_header() # will also set content + + postgres_types = [] + + for c in range(len(self.header)): + if self.header[c] == "dss_record_source": + postgres_types.append("varchar(256)") + elif self.header[c] == "dss_load_date": + postgres_types.append("timestamp") + else: + column_values = [self.content[r][c] for r in range(len(self.content))] + if len(column_values) == 0: + postgres_types.append("text") + else: + python_type = get_python_type(column_values) + if python_type == int or python_type == float: + postgres_types.append("numeric") + elif python_type == datetime: + postgres_types.append("timestamp") + elif python_type == bool: + postgres_types.append("bool") + else: + postgres_types.append("text") + + + self.column_types = postgres_types + + def get_column_types(self) -> list: + """ + Get column types. Calls setter if not yet set. + + Returns: + - list of column types + """ + if not self.column_types: + self.set_column_types() + return self.column_types + +def set_gsheet_variables(gsheet: Gsheet, url: str, args): + """ + Function to set all a bunch of variables for gsheet. + """ + # Set spreadsheet and worksheet. + gsheet.set_spreadsheet(url=url, name=args.workbook_name) + gsheet.set_worksheet(title=args.sheet) + + # Set all content. + gsheet.set_content() + gsheet.set_header(args.no_header) + gsheet.set_column_types() + +def _authorize() -> Client: + """ + Authorizes access for user. + + Returns: + - Client for authorization when handling data + """ + json_keyfile = _read_secret() + return gspread.service_account( + json_keyfile, + ) + +def _read_secret() -> str: + """ + Locates the json file with the secret depending on the OS. + + Returns: + - path to secret file. + """ + if sys.platform == "win32": + datapath = os.getenv("APPDATA") + return os.path.join(datapath, "gspread", "google-drive-client-secret.json") + else: + datapath = os.path.expanduser("~") + return os.path.join(datapath, ".gspread", "google-drive-client-secret.json") + diff --git a/wherescape/connectors/gsheet/load_data.py b/wherescape/connectors/gsheet/load_data.py new file mode 100644 index 0000000..f84ff5d --- /dev/null +++ b/wherescape/connectors/gsheet/load_data.py @@ -0,0 +1,174 @@ +import logging +import re + +from datetime import datetime, UTC +from itertools import zip_longest + +from .gsheets_wrapper import Gsheet, set_gsheet_variables +from .gsheets_parsing import parse_gspread_arguments +from ...helper_functions import set_date_to_ymd +from ...wherescape import WhereScape + + +def google_sheet_load_data(): + """ + Loads content of a google sheet file into a table from a google sheet. + Relevant metadata must already be created. + """ + start_time = datetime.now(tz=UTC) + logging.info("Start time: %s" % start_time.strftime("%Y-%m-%d %H:%M:%S")) + + wherescape = WhereScape() + gsheet = Gsheet() + table_name = wherescape.table + column_names, column_types = wherescape.get_columns() + + url = wherescape.query_meta( + "select lt_file_path from ws_load_tab where lt_table_name = ?", + [table_name], + )[0][0] + workbook_details = wherescape.query_meta( + "select lt_file_name from ws_load_tab where lt_table_name = ?", + [table_name], + )[0][0] + + logging.info(f"Metadata. URL: {url} ; Details : {workbook_details}") + + args = parse_gspread_arguments(workbook_details) + set_gsheet_variables(gsheet, url, args) + content = gsheet.get_content() + # For name consistency. + # gsheet_header = create_column_names(gsheet.get_header()) + gsheet_header = gsheet.get_header() + + # Missing from wherescape (added after metadata upload) + added_columns, added_indexes = get_missing_columns(column_names, gsheet_header) + content, gsheet_header = remove_extra_columns(content, gsheet_header, added_indexes) + if len(added_columns) > 0: + logging.warn(f"New columns in gsheet data: {added_columns}") + + # Missing from upload (removed after metadata upload) + removed_columns, removed_indexes = get_missing_columns(gsheet_header, column_names) + content, gsheet_header = add_empty_columns(content, gsheet_header, removed_indexes, column_names) + if len(removed_columns) > 0: + logging.warn(f"Colums missing from gsheet data: {removed_columns}") + + dss_title = gsheet.get_spreadsheet().title.replace(" ", "_") + for row in content: + # Add content for dss columns + row.append(f"{dss_title}") + row.append(f"{start_time.strftime("%Y-%m-%d %H:%M:%S.%f")}") + + for i in range(len(gsheet_header)): + if column_types[i] == "timestamp": + transp_content = [list(i) for i in zip_longest(*content, fillvalue=None)] + for j in range(len(transp_content[i])): + transp_content[i][j] = set_date_to_ymd(transp_content[i][j]) + content = [list(i) for i in zip_longest(*transp_content, fillvalue=None)] + + column_names_string = ",".join(column for column in column_names) + question_mark_string = ",".join("?" for _ in column_names) + sql = f"INSERT INTO {table_name} ({column_names_string}) VALUES ({question_mark_string})" + + wherescape.push_many_to_target(sql, content) + logging.info(f"{len(content)} rows successfully inserted in {table_name} from google data.") + + # Final logging + end_time = datetime.now(tz=UTC) + logging.info( + "Time elapsed: %s seconds for gitlab_load_data" + % (end_time - start_time).seconds + ) + +def get_missing_columns(input_header: list, expected: list) -> tuple: + """ + Returns columns that are in compare and not in input. + dss_record_source and dss_load_date are considered expected as missing if not present. + + Args: + - input_header (list): list of strings to check + - expected (list): list of strings expected in input_header (larger if any are missing) + + Returns + - columns (list): list of columns unexpectedly missing. + - indexes (list): list of index values of missing columns. + """ + # WS might end with digits. Remove those to compare. + if re.search(r'_\d{3}$', input_header[0]) is not None: + input_header = remove_final_digits(input_header) + elif re.search(r'_\d{3}$', expected[0]) is not None: + expected = remove_final_digits(expected) + + columns = [] + indexes = [] + for column in expected: + if column not in input_header: + # Columns missing in expected are considered + if column not in ["dss_record_source", "dss_load_date"]: + columns.append(column) + indexes.append(expected.index(column)) + return columns, indexes + +def add_empty_columns(content: list, header: list, indexes: list, full_header: list) -> tuple: + """ + adds columns where columns are missing to both the header and the content. + + Params: + - content (list): full content to add empty columns to. + - header (list): header to add missing headers to. + - indexes (list): indexes of missing columns. + - full_header (list): expected containing correct header names. + + Returns: + - content (list): content including new columns for missing fields. + - header (list): header including new column names for missing fields. + """ + transposed = [list(i) for i in zip_longest(*content, fillvalue=None)] + for i in indexes: + if full_header[i] in ["dss_record_source", "dss_load_date"]: + continue + transposed = transposed[:i] + [[None for _ in range(len(content))]] + transposed[i:] + header = header[:i] + [full_header[i]] + header[i:] + content = [list(i) for i in zip_longest(*transposed, fillvalue=None)] + + return content, header + +def remove_extra_columns(content: list, header: list, indexes: list)-> tuple: + """ + remove columns from content that aren't listed for it's destination. + + Params: + - content (list): full content to add empty columns to. + - header (list): header to add missing headers to. + - indexes (list): indexes of missing columns. + + Returns: + - content (list): content including new columns for missing fields. + - header (list): header including new column names for missing fields. + """ + transposed = [list(i) for i in zip_longest(*content, fillvalue=None)] + indexes.reverse() + for i in indexes: + del transposed[i] + del header[i] + content = [list(i) for i in zip_longest(*transposed, fillvalue=None)] + return content, header + +def remove_final_digits(header: list) -> list: + """ + Removes _000 (or other digits) from the end of words in a list if they are there. + This method is to make comparing easier since the numbers might differ if the columns are not the same. + + Params: + - header (list): header to remove extra digits from. + + Returns: + - result (list): new header without the _000. + """ + result = [] + for header in header: + if re.search(r'_\d{3}$', header) is None: + result.append(header) + else: + result.append(header[:-4]) + return result diff --git a/wherescape/connectors/gsheet/readme.md b/wherescape/connectors/gsheet/readme.md new file mode 100644 index 0000000..a6c76f1 --- /dev/null +++ b/wherescape/connectors/gsheet/readme.md @@ -0,0 +1,7 @@ + +Default scopes for Gspread are + +DEFAULT_SCOPES =[ + 'https://www.googleapis.com/auth/spreadsheets', + 'https://www.googleapis.com/auth/drive' + ] diff --git a/wherescape/connectors/gsheet/tests/fixtures.py b/wherescape/connectors/gsheet/tests/fixtures.py new file mode 100644 index 0000000..48985f4 --- /dev/null +++ b/wherescape/connectors/gsheet/tests/fixtures.py @@ -0,0 +1,8 @@ +import pytest + +from wherescape.connectors.gsheet.gsheets_wrapper import Gsheet + + +@pytest.fixture +def gsheet(): + return Gsheet() \ No newline at end of file diff --git a/wherescape/connectors/gsheet/tests/test_gsheets_wrapper.py b/wherescape/connectors/gsheet/tests/test_gsheets_wrapper.py new file mode 100644 index 0000000..f8eb5ba --- /dev/null +++ b/wherescape/connectors/gsheet/tests/test_gsheets_wrapper.py @@ -0,0 +1,229 @@ +from datetime import datetime +import pytest + +from wherescape.connectors.gsheet.gsheets_wrapper import Gsheet, get_python_type +from gspread.exceptions import SpreadsheetNotFound + + +NO_ACCESS_URL = "https://docs.google.com/spreadsheets/d/1lhrCYDeMpX8DUdoI_JC4hEmhhVqUUOkCIkTKgHpcN0o/edit?usp=drive_link" +NO_ACCESS_NAME = "no access" +BASIC_FILE_URL = "https://docs.google.com/spreadsheets/d/1O8BhaD385kPxxQGUeyU0DoGPLyBQITFCihje2av0POk/edit?usp=drive_link" +BASIC_FILE_NAME = "basic data file" +DIFF_START_CELL_URL = "https://docs.google.com/spreadsheets/d/15W15G9ERorhGT5QhvO8IJVTWvtbQC9Puzxi3txmU9ZM/edit?usp=drive_link" +DIFF_START_CELL_NAME = "middle_starter_cell_50_rows" +NO_HEADER_URL = "https://docs.google.com/spreadsheets/d/1pdqdoPnkwTIjccXstqT6TeWzFT0rxez3KdJItO9j0Ng/edit?usp=drive_link" +NO_HEADER_NAME = "No header File" +FAKE_URL = "https://docs.google.com/spreadsheets/d/1pdqdoPnhdTIjccXstqK7ReWzFT0rxez3gdJItO9j0Ng/edit?usp=drive_link" +BAD_URL = "https://docs.google.com/spreadsheets/d/1pdqdoPnhdTIjccXstqK7" +NOT_URL = "skjdhfsjdfgfjsdhgfikrfgakerfdfggfd" + +class TestGsheet: + def setup_method(self, method): + self.gsheet = Gsheet() + + def test_set_spreadsheet_on_url(self): + """ + Test that a spreadsheet can be set using a valid url. + """ + url = BASIC_FILE_URL + self.gsheet.set_spreadsheet(url=url) + + assert self.gsheet.spreadsheet is not None + + def test_set_spreadsheet_on_name(self): + """ + Test that a spreadsheet can be set using a valid name. + """ + name = BASIC_FILE_NAME + self.gsheet.set_spreadsheet(name=name) + + assert self.gsheet.spreadsheet is not None + + @pytest.mark.parametrize( + ("url", "exception"), + ( + (NO_ACCESS_URL, PermissionError), + (FAKE_URL, SpreadsheetNotFound), + (BAD_URL, SpreadsheetNotFound), + (FAKE_URL, SpreadsheetNotFound) + ) + ) + def test_set_spreadsheet_errors(self, url, exception): + """ + Test that the correct error is thrown given the url. + """ + with pytest.raises(exception): + self.gsheet.set_spreadsheet(url=url) + + def test_get_spreadsheet_calls_setter_when_not_set(self, mocker): + """ + Test that the setter is called when no spreadsheet is set before calling + get spreadsheet. + """ + mock_method = mocker.patch.object(self.gsheet, "set_spreadsheet") + self.gsheet.set_spreadsheet() + + # Assertion + mock_method.assert_called_once() + + def test_set_worksheet_no_input(self): + """ + Test a worksheet is set without any inputs. + """ + self.gsheet.set_spreadsheet(url=BASIC_FILE_URL) + self.gsheet.set_worksheet() + + assert (self.gsheet.spreadsheet is not None) + + def test_set_worksheet_with_input(self): + """ + Test worksheet gets sst using the title of the worksheet. + """ + title = "First" + url = DIFF_START_CELL_URL + self.gsheet.set_spreadsheet(url=url) + self.gsheet.set_worksheet(title=title) + + assert (self.gsheet.spreadsheet is not None) + + def test_set_worksheet_no_spreadsheet(self): + """ + Test SpreadsheetNotFound thrown when no spreadsheet is set when + calling set_worksheet. + """ + with pytest.raises(SpreadsheetNotFound): + self.gsheet.set_worksheet() + + def test_set_worksheet_incorrect_input(self): + """ + Test to see if it uses the second title option if the first one fails. + """ + self.gsheet.set_spreadsheet(url=BASIC_FILE_URL) + self.gsheet.set_worksheet(title="sdfsffuysdkjfhsdkjf") + + assert (self.gsheet.spreadsheet is not None) + assert (self.gsheet.worksheet.title == "Table1") + + def test_get_worksheet_calls_setter_when_not_set(self, mocker): + """ + Test to see if setter is called if worksheet is not yet set while + calling its getter. + """ + self.gsheet.set_spreadsheet(url=BASIC_FILE_URL) + mock_method = mocker.patch.object(self.gsheet, "set_worksheet") + self.gsheet.get_worksheet() + + # Assertion + mock_method.assert_called_once() + + @pytest.mark.parametrize( + ("url", "rows", "columns"), + ( + (BASIC_FILE_URL, 51, 7), + (DIFF_START_CELL_URL, 51, 9), + ) + ) + def test_set_content_has_all_content_of_sheet(self, url, rows, columns): + """ + Check that set_content collects akk tge data + """ + self.gsheet.set_spreadsheet(url=url) + self.gsheet.set_worksheet() + self.gsheet.set_content() + content = self.gsheet.content + + assert len(content) == rows + assert len(content[0]) == columns + + def test_get_content_calls_setter_if_not_set(self, mocker): + """ + Test to see if setter is called if content is not yet set while + calling get_content. + """ + self.gsheet.set_spreadsheet(url=BASIC_FILE_URL) + self.gsheet.set_worksheet() + mock_method = mocker.patch.object(self.gsheet, "set_content") + self.gsheet.get_content() + + # Assertion + mock_method.assert_called_once() + + def test_set_header_gets_first_row_if_header(self): + """ + Test to see if + """ + self.gsheet.set_spreadsheet(url=BASIC_FILE_URL) + self.gsheet.set_worksheet() + content = self.gsheet.get_content() + first_row = content[0] + self.gsheet.set_header() + + assert self.gsheet.header == first_row + + def test_set_header_no_header_given(self): + self.gsheet.set_spreadsheet(url=BASIC_FILE_URL) + self.gsheet.set_worksheet() + content = self.gsheet.get_content() + first_row = self.header = ["column_" + str(i + 1) for i in range(len(content[0]))] + self.gsheet.set_header(no_header="true") + + assert self.gsheet.header == first_row + + def test_get_header_calls_setter_if_not_set(self, mocker): + """ + Test to see if setter is called if content is not yet set while + calling get_content. + """ + self.gsheet.set_spreadsheet(url=BASIC_FILE_URL) + self.gsheet.set_worksheet() + self.gsheet.set_content() + mock_method = mocker.patch.object(self.gsheet, "set_header") + self.gsheet.get_header() + + # Assertion + mock_method.assert_called_once() + + def test_set_column_types(self): + self.gsheet.set_spreadsheet(url=BASIC_FILE_URL) + self.gsheet.set_worksheet() + self.gsheet.set_content() + self.gsheet.set_header() + self.gsheet.set_column_types() + + expected = ["text", "numeric", "numeric", "text", "text", "text", "timestamp"] + assert len(self.gsheet.column_types) == len(expected) + assert self.gsheet.column_types == expected + + def test_get_column_types_calls_setter_if_not_set(self, mocker): + self.gsheet.set_spreadsheet(url=BASIC_FILE_URL) + self.gsheet.set_worksheet() + self.gsheet.set_content() + self.gsheet.set_header() + mock_method = mocker.patch.object(self.gsheet, "set_column_types") + self.gsheet.get_column_types() + + mock_method.assert_called_once() + +def test_set_gsheet_variables(): + pass + +def test_remove_empy_rows_and_columns(): + pass + +@pytest.mark.parametrize( + ("input", "expected_type"), + ( + (["4","1","4562","4567","234634","34532"], int), + (["dsjkfh", "skdjhf", "jkghsdf"], str), + (["43.6", "4534.34", "457424.644"], float), + (["2024-05-30", "2023-12-05", "2023-06-28"], datetime), + (["5461", "534.687", "74849"], float), + (["sdferw", "234234", "sdfjhs", "jghfd"], str), + (["2024-05-30", "sdffsdfwe", "", ""], str), + (["", "", ""], str), + ) + ) +def test_get_python_type(input, expected_type): + result = get_python_type(input) + + assert result == expected_type diff --git a/wherescape/connectors/hubspot/hubspot_wrapper.py b/wherescape/connectors/hubspot/hubspot_wrapper.py index d50ed56..554f230 100644 --- a/wherescape/connectors/hubspot/hubspot_wrapper.py +++ b/wherescape/connectors/hubspot/hubspot_wrapper.py @@ -140,7 +140,7 @@ def get_object(self, record_id: str, hs_object: str, properties: list = []): response = basic_api.get_by_id(record_id, properties=properties) return response except api_error.ApiException as e: - logging.error("An exception occured when calling %s batch_api_>update\n %s" % (hs_object, e)) + logging.error(f"An exception occured when calling {hs_object} batch_api_>update\n {e}") def get_property_names(self, object_name: str): """ @@ -183,7 +183,7 @@ def get_all( """ results = [] basic_api = getattr(self.client.crm, hs_object).basic_api - error_api = getattr(hubspot.crm, HubspotObjectEnum(hs_object)) + error_api = getattr(hubspot.crm, hs_object) try: api_response = basic_api.get_page(properties=properties, limit=100) results.extend(api_response.results) diff --git a/wherescape/connectors/hubspot/utils.py b/wherescape/connectors/hubspot/utils.py index b749445..f130297 100644 --- a/wherescape/connectors/hubspot/utils.py +++ b/wherescape/connectors/hubspot/utils.py @@ -1,4 +1,3 @@ - def get_double_nerd_ids(tickets: list) -> list: """ Function to retrieve all nerd ticket id that appear multiple times. diff --git a/wherescape/helper_functions.py b/wherescape/helper_functions.py index 56c2e2c..2fcdae8 100644 --- a/wherescape/helper_functions.py +++ b/wherescape/helper_functions.py @@ -1,3 +1,6 @@ +import ast +import json + from dateutil.parser import parse from slugify import slugify @@ -22,6 +25,26 @@ def create_column_names(display_names=[]): return columns +def remove_empty_rows_and_columns(input: list) -> list: + """ + Returns the list with the emtpy rows removed. + + Params: + - input (list): List of lists containing the content. + + Returns + - List of list. + """ + content = [row for row in input if not all(cell == "" for cell in row)] + # switch and empty + content_transposed = [list(i) for i in zip(*content)] + content_transposed = [ + row for row in content_transposed if not all(cell == "" for cell in row) + ] + # switch again + return [list(i) for i in zip(*content_transposed)] + + def create_display_names(columns=[]): """ Change column names in to display names. @@ -120,13 +143,11 @@ def filter_dict(dict_to_filter, keys_to_keep): Returns: dict: The dict with only the key, value pairs you want to keep. """ - return dict( - [ - (key, dict_to_filter[key]) + return { + key: dict_to_filter[key] for key in dict_to_filter if key in set(keys_to_keep) - ] - ) + } def flatten_json(json_response, name_to_skip=None): @@ -186,12 +207,13 @@ def fill_out_empty_keys(cleaned_json, keys_to_keep, overwrite): out[key] = cleaned_json[key] return out + def is_date(string, fuzzy=False): """ Return whether the string can be interpreted as a date. string: str, string to check for date - fuzzy: bool, ignore unknown tokens in string if True + fuzzy: bool, ignore unknown tokens in string if True. """ try: parse(string, fuzzy=fuzzy) @@ -201,3 +223,69 @@ def is_date(string, fuzzy=False): return False except OverflowError: return False + + +def set_date_to_ymd(value: str | None) -> str | None: + """ + Set the dateformat of a datetime string to YYYY-mm-dd. + + Args: + - value (str): value to set dateformat for. + + Returns: + - string of date of format YYYY-mm-dd + """ + return parse(value).strftime("%Y-%m-%d") if value is not None else value + + +def get_python_type(column_values: list) -> str: + """ + Returns string of the Python type fit for the data in the list. + + Params: + - column_values (list): list of the values. + """ + values = [] + is_bool = True + types = set() + + for item in column_values: + if item not in ["TRUE", "FALSE", "1", "0"]: + is_bool = False + + if is_bool: + return bool + else: + for item in column_values: + values.append(convert_string(item)) + types = {type(item) for item in values} + + if len(types) > 1: + for t in values: + if not (isinstance(t, int) or isinstance(t, float)): + return str + return float + else: + return next(iter(types)) + + +def convert_string(value: str): + """ + Determines literal python type of a string value. + + Params: + - value (str): value to determine literal type of. + + Returns + - Any value as it's literal object type + """ + try: + return (ast.literal_eval(value)) + except (ValueError, SyntaxError): + try: + return (json.loads(value)) + except (ValueError, TypeError): + try: + return (parse(value)) + except (ValueError, OverflowError): + return (value) From 0660074ca94a701e593bfc58b6e4d15c494e7c9e Mon Sep 17 00:00:00 2001 From: Esther ZR Date: Thu, 5 Sep 2024 13:26:54 +0200 Subject: [PATCH 2/7] feat: gsheet readme updated --- .../connectors/gsheet/gsheets_parsing.py | 2 +- wherescape/connectors/gsheet/readme.md | 44 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/wherescape/connectors/gsheet/gsheets_parsing.py b/wherescape/connectors/gsheet/gsheets_parsing.py index 9e4e987..07f0d6d 100644 --- a/wherescape/connectors/gsheet/gsheets_parsing.py +++ b/wherescape/connectors/gsheet/gsheets_parsing.py @@ -66,7 +66,7 @@ def create_parser(): """ parser = argparse.ArgumentParser() parser.add_argument( - "--workbook_name", action="store", help="Name of the Google Sheet/ workbook" + "workbook_name", action="store", help="Name of the Google Sheet/ workbook" ) # positional argument parser.add_argument("--sheet", help="Name of the sheet in the workbook", default="Sheet1") parser.add_argument("--range", help="Cell range to retrieve") diff --git a/wherescape/connectors/gsheet/readme.md b/wherescape/connectors/gsheet/readme.md index a6c76f1..fad32ce 100644 --- a/wherescape/connectors/gsheet/readme.md +++ b/wherescape/connectors/gsheet/readme.md @@ -5,3 +5,47 @@ DEFAULT_SCOPES =[ 'https://www.googleapis.com/auth/spreadsheets', 'https://www.googleapis.com/auth/drive' ] +# Gsheet connector + +Gsheet Connector for WhereSCape. Takes care of creating metadata for a loading data and uploading the data from a gsheet file. + +# Preparation +An authentication user is required from Wherescape. For this, a client secret has to be created in the Google API Console. +Its secret should be stored in `%%APPDATAPP\gspread` for Windows orin `~/.gspread` for Unix. + +The default scopes for this client secret are: + +``` +DEFAULT_SCOPES =[ + 'https://www.googleapis.com/auth/spreadsheets', + 'https://www.googleapis.com/auth/drive' + ] +``` + +## WhereScape Parameters + + +## Load table + +Create 1 load table. with a script based load. +Under Source. ensure a link is set to the file in the `Source Directory` and any desired arguments are provided in `Source File Name`. + + +## Host script +Create a new python host script and add it to the load table. Example code: + +``` +from wherescape_os.wherescape.connectors.gsheet.create_metadata import gsheet_create_metadata + +gsheet_create_metadata() + +``` + +Host scripts to create: +* python_gsheet_create_metadata +* python_gsheet_load_data + +# Usage + +First attach the metadata host script to the load table and ensure there's no pre-load action set. +After creating the table, attach the load_data host script to the load table and set pre-load to struncate. From e83fbe1048a32e3b4ecd4ecb691b296d293dc676 Mon Sep 17 00:00:00 2001 From: Esther ZR Date: Thu, 5 Sep 2024 14:37:40 +0200 Subject: [PATCH 3/7] feat: range allowed --- .../connectors/gsheet/gsheets_parsing.py | 10 +-- .../connectors/gsheet/gsheets_wrapper.py | 69 +++++++++++-------- wherescape/connectors/gsheet/load_data.py | 3 +- 3 files changed, 48 insertions(+), 34 deletions(-) diff --git a/wherescape/connectors/gsheet/gsheets_parsing.py b/wherescape/connectors/gsheet/gsheets_parsing.py index 07f0d6d..5780d02 100644 --- a/wherescape/connectors/gsheet/gsheets_parsing.py +++ b/wherescape/connectors/gsheet/gsheets_parsing.py @@ -51,7 +51,7 @@ def parse_gspread_arguments(argument: str) -> argparse.Namespace: ) row_index_range = a1_range_to_grid_range(args.range).get("startRowIndex") if row_index_header_range == row_index_range: - logging.error( + logging.warning( "If both a range and a header_range are specified, they can not overlap." ) return args @@ -66,13 +66,13 @@ def create_parser(): """ parser = argparse.ArgumentParser() parser.add_argument( - "workbook_name", action="store", help="Name of the Google Sheet/ workbook" + "workbook_name", help="Name of the Google Sheet/ workbook", default=None ) # positional argument - parser.add_argument("--sheet", help="Name of the sheet in the workbook", default="Sheet1") + parser.add_argument("--sheet", help="Name of the sheet in the workbook") parser.add_argument("--range", help="Cell range to retrieve") - parser.add_argument("--header_range", action="store_true", help="Cell range to be used as header") + parser.add_argument("--header_range", help="Cell range to be used as header") parser.add_argument( - "--no_header", action="store_true", help="Specify if table has no header" + "--no_header", action="store_true", help="Specify if table has no header" ) parser.add_argument( "-d", "--debug", action="store_true", help="Print debug messages" diff --git a/wherescape/connectors/gsheet/gsheets_wrapper.py b/wherescape/connectors/gsheet/gsheets_wrapper.py index 5bec4e5..4dd3689 100644 --- a/wherescape/connectors/gsheet/gsheets_wrapper.py +++ b/wherescape/connectors/gsheet/gsheets_wrapper.py @@ -7,11 +7,16 @@ from gspread import ( Client, Spreadsheet, - SpreadsheetNotFound, Worksheet, +) +from gspread.exceptions import ( + SpreadsheetNotFound, WorksheetNotFound, + APIError, ) + + from ...helper_functions import convert_string, remove_empty_rows_and_columns, get_python_type @@ -77,7 +82,7 @@ def get_spreadsheet( self.set_spreadsheet(url, name) return self.spreadsheet - def set_worksheet(self, title: str = "Sheet1"): + def set_worksheet(self, title: str = None): """ Sets the worksheet based on the given title. A spreadsheet needs to have already been set. @@ -88,22 +93,15 @@ def set_worksheet(self, title: str = "Sheet1"): """ try: if self.spreadsheet is None: - logging.error("No spreadsheet available to take the worksheet from.") - raise SpreadsheetNotFound + logging.info("No name was provided. Using the first sheet") + self.spreadsheet = self.worksheet.get_worksheet(0) else: self.worksheet = self.spreadsheet.worksheet(title) - except WorksheetNotFound: - logging.warning("Invalid worksheet title in --sheet") - try: - # using the automated name used by Apple. - # in case it was imported to google from there. - title = "Table1" - self.worksheet = self.spreadsheet.worksheet(title) - logging.info(f"worksheet found with title {title}") - except WorksheetNotFound as notFound: - logging.error("No worksheet was found") - raise notFound - logging.info(f"worksheet found with title {title}") + except WorksheetNotFound as notFound: + logging.warning("No worksheet was found") + raise notFound + + logging.info(f"worksheet found") def get_worksheet(self, title: str = "") -> Worksheet: """ @@ -123,14 +121,21 @@ def get_worksheet(self, title: str = "") -> Worksheet: self.set_worksheet() return self.worksheet - def set_content(self): + def set_content(self, range: str = None): """ Retreives content from gsheet. removes empty rows. """ - content = self.worksheet.get_all_values() + if range: + try: + content = self.worksheet.get(range) + except APIError as e: + logging.error(f"Invalid range: {range}") + raise e + else: + content = self.worksheet.get_all_values() self.content = remove_empty_rows_and_columns(content) - def get_content(self) -> list: + def get_content(self, range: str = None) -> list: """ Getter for content. Calls setter if not already present. @@ -138,38 +143,48 @@ def get_content(self) -> list: - List of lists containing the content of the sheet. """ if self.content is None: - self.set_content() + self.set_content(range) return self.content - def set_header(self, no_header: str = None): + def set_header(self, no_header: str = None, header_range: str = None): """ Creates the header for the content. Takes it from content if no_header is False. Params: - - no_header (str): (optional) input of args.no_header (legacy naming) + - no_header (str): (optional) input of args.no_header + - header_Range (str): (optional) range of the header example: "A1:B5" """ # Set content if none is set yet. if self.content is None: self.set_content() if not no_header: + if header_range: + try: + row = self.worksheet.get(header_range) + self.header = ["column_" + str(i + 1) if value == "" else value for i, value in enumerate(row)] + except APIError as e: + logging.error(f"Invalid Header range: {header_range}") + raise e + row = self.content.pop(0) self.header = ["column_" + str(i + 1) if value == "" else value for i, value in enumerate(row)] else: self.header = ["column_" + str(i + 1) for i in range(len(self.content[0]))] - def get_header(self, no_header: str = None) -> list: + def get_header(self, no_header: bool = False, range: str = None) -> list: """ Getter for the header. Calls setter if not already present. Params: - - no_header (str): (optional) input of args.no_header (legacy naming) + - no_header (str): (optional) True if no header is provided (legacy naming) + - range (str): (optional) range of the header. Returns - List containing header values. """ if self.header == []: - self.set_header(no_header) + self.set_header(no_header, range) return self.header def set_column_types(self): @@ -224,8 +239,8 @@ def set_gsheet_variables(gsheet: Gsheet, url: str, args): gsheet.set_worksheet(title=args.sheet) # Set all content. - gsheet.set_content() - gsheet.set_header(args.no_header) + gsheet.set_content(args.range) + gsheet.set_header(args.no_header, args.header_range) gsheet.set_column_types() def _authorize() -> Client: diff --git a/wherescape/connectors/gsheet/load_data.py b/wherescape/connectors/gsheet/load_data.py index f84ff5d..c9a9b66 100644 --- a/wherescape/connectors/gsheet/load_data.py +++ b/wherescape/connectors/gsheet/load_data.py @@ -36,9 +36,8 @@ def google_sheet_load_data(): args = parse_gspread_arguments(workbook_details) set_gsheet_variables(gsheet, url, args) - content = gsheet.get_content() + content = gsheet.get_content(args.range) # For name consistency. - # gsheet_header = create_column_names(gsheet.get_header()) gsheet_header = gsheet.get_header() # Missing from wherescape (added after metadata upload) From daf264458315d3cc580b54627fad129a56d04618 Mon Sep 17 00:00:00 2001 From: Esther ZR Date: Thu, 5 Sep 2024 16:04:04 +0200 Subject: [PATCH 4/7] fix: range issues --- wherescape/connectors/gsheet/gsheets_parsing.py | 4 ++-- wherescape/connectors/gsheet/gsheets_wrapper.py | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/wherescape/connectors/gsheet/gsheets_parsing.py b/wherescape/connectors/gsheet/gsheets_parsing.py index 5780d02..c3b4047 100644 --- a/wherescape/connectors/gsheet/gsheets_parsing.py +++ b/wherescape/connectors/gsheet/gsheets_parsing.py @@ -50,9 +50,9 @@ def parse_gspread_arguments(argument: str) -> argparse.Namespace: "startRowIndex" ) row_index_range = a1_range_to_grid_range(args.range).get("startRowIndex") - if row_index_header_range == row_index_range: + if row_index_header_range != row_index_range: logging.warning( - "If both a range and a header_range are specified, they can not overlap." + "If both a range and a header_range are specified, they should overlap." ) return args diff --git a/wherescape/connectors/gsheet/gsheets_wrapper.py b/wherescape/connectors/gsheet/gsheets_wrapper.py index 4dd3689..51f0280 100644 --- a/wherescape/connectors/gsheet/gsheets_wrapper.py +++ b/wherescape/connectors/gsheet/gsheets_wrapper.py @@ -127,7 +127,7 @@ def set_content(self, range: str = None): """ if range: try: - content = self.worksheet.get(range) + content = self.worksheet.get(range, ) except APIError as e: logging.error(f"Invalid range: {range}") raise e @@ -161,14 +161,15 @@ def set_header(self, no_header: str = None, header_range: str = None): if not no_header: if header_range: try: - row = self.worksheet.get(header_range) + row = self.worksheet.get(header_range)[0] self.header = ["column_" + str(i + 1) if value == "" else value for i, value in enumerate(row)] + self.content.pop(0) except APIError as e: logging.error(f"Invalid Header range: {header_range}") raise e - - row = self.content.pop(0) - self.header = ["column_" + str(i + 1) if value == "" else value for i, value in enumerate(row)] + else: + row = self.content.pop(0) + self.header = ["column_" + str(i + 1) if value == "" else value for i, value in enumerate(row)] else: self.header = ["column_" + str(i + 1) for i in range(len(self.content[0]))] From 810b7c9a4d9899e4170aa83a0bfae58a8a3f2efa Mon Sep 17 00:00:00 2001 From: Esther ZR Date: Thu, 5 Sep 2024 16:18:21 +0200 Subject: [PATCH 5/7] docs: added info --- wherescape/connectors/gsheet/gsheets_parsing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/wherescape/connectors/gsheet/gsheets_parsing.py b/wherescape/connectors/gsheet/gsheets_parsing.py index c3b4047..d875a24 100644 --- a/wherescape/connectors/gsheet/gsheets_parsing.py +++ b/wherescape/connectors/gsheet/gsheets_parsing.py @@ -25,6 +25,7 @@ def parse_gspread_arguments(argument: str) -> argparse.Namespace: try: args = parser.parse_args(argument_list) except SystemExit as ex: + logging.warning("There might be a mistake with the arguments. Ensure it's all correct.") logging.error(ex) if args.range: From fb66c3934069404449840752386858a1e6824043 Mon Sep 17 00:00:00 2001 From: Esther ZR Date: Fri, 8 Nov 2024 15:49:29 +0100 Subject: [PATCH 6/7] feat: improve code, fix some issues. NOT FULLY TESTED YET and readme also not fully updated yet --- .../connectors/gsheet/create_metadata.py | 7 +- .../connectors/gsheet/gsheets_wrapper.py | 143 ++++++++---------- wherescape/connectors/gsheet/load_data.py | 43 +++--- wherescape/connectors/gsheet/readme.md | 11 +- .../connectors/gsheet/tests/fixtures.py | 8 - .../gsheet/tests/test_gsheets_wrapper.py | 135 +++++------------ wherescape/helper_functions.py | 4 +- wherescape/wherescape.py | 3 +- 8 files changed, 135 insertions(+), 219 deletions(-) delete mode 100644 wherescape/connectors/gsheet/tests/fixtures.py diff --git a/wherescape/connectors/gsheet/create_metadata.py b/wherescape/connectors/gsheet/create_metadata.py index 86075fe..f7f9282 100644 --- a/wherescape/connectors/gsheet/create_metadata.py +++ b/wherescape/connectors/gsheet/create_metadata.py @@ -3,11 +3,11 @@ from ...helper_functions import create_column_names, create_display_names, prepare_metadata_query from ...wherescape import WhereScape -from .gsheets_wrapper import Gsheet, set_gsheet_variables +from .gsheets_wrapper import Gsheet from .gsheets_parsing import parse_gspread_arguments -def gsheet_create_metadata(): +def python_gsheet_create_metadata(): """ Function that creates a load table in Wherescape based on the data in a provided Google sheet file. @@ -19,7 +19,6 @@ def gsheet_create_metadata(): logging.info( "Start time: %s for gsheet_load_data_os." % start_time.strftime("%Y-%m-%d %H:%M:%S") ) - gsheet: Gsheet = Gsheet() load_table_name = wherescape_instance.table url = wherescape_instance.query_meta( @@ -36,7 +35,7 @@ def gsheet_create_metadata(): if args.debug: logging.warning("Debug mode on -> do not use for production.") - set_gsheet_variables(gsheet, url, args) + gsheet = Gsheet(args, url) header_row = gsheet.get_header() column_types = gsheet.get_column_types() diff --git a/wherescape/connectors/gsheet/gsheets_wrapper.py b/wherescape/connectors/gsheet/gsheets_wrapper.py index 51f0280..14d8480 100644 --- a/wherescape/connectors/gsheet/gsheets_wrapper.py +++ b/wherescape/connectors/gsheet/gsheets_wrapper.py @@ -4,32 +4,50 @@ import sys from datetime import datetime -from gspread import ( - Client, - Spreadsheet, - Worksheet, -) +from gspread.client import Client +from gspread.spreadsheet import Spreadsheet +from gspread.worksheet import Worksheet from gspread.exceptions import ( SpreadsheetNotFound, WorksheetNotFound, APIError, ) - - -from ...helper_functions import convert_string, remove_empty_rows_and_columns, get_python_type +from ...helper_functions import remove_empty_rows_and_columns, get_python_type class Gsheet: - def __init__(self): + def __init__(self, args = None, url: str = "", test: bool = False): + """ + Init for Gsheet. set all variables. + + Args: + - args: args provided for processing correct data + - url (str): link to the spreadsheet being uploaded + - test (bool): False by default. set True in tests to not require args + """ self._client: Client = _authorize() - self.spreadsheet: Spreadsheet = None - self.worksheet: Worksheet = None - self.header: list = [] - self.content = None - self.column_types = None + if test: + logging.warning("Marked as TEST. no params will be set") + elif args: + self.set_gsheet_variables(url, args) + else: + logging.error("No args provided") + + def set_gsheet_variables(self, url: str, args): + """ + Function to set all of the variables for gsheet. + """ + # Set spreadsheet and worksheet. + self._set_spreadsheet(url=url, name=args.workbook_name) + self._set_worksheet(title=args.sheet) + + # Set all content. + self._set_content(args.range) + self._set_header(args.no_header, args.header_range) + self._set_column_types() - def set_spreadsheet(self, url: str = "", name: str = ""): + def _set_spreadsheet(self, url: str = "", name: str = ""): """ Attempts to retreive the spreadsheet from google drive. Requires either url or name. @@ -44,7 +62,7 @@ def set_spreadsheet(self, url: str = "", name: str = ""): self.spreadsheet = self._client.open_by_url(url) logging.info("spreadsheet file has been obtained.") return - except SpreadsheetNotFound as notFound: + except SpreadsheetNotFound: logging.error("Invalid URL") if name: @@ -52,37 +70,27 @@ def set_spreadsheet(self, url: str = "", name: str = ""): self.spreadsheet = self._client.open(name) logging.info("spreadsheet file has been obtained.") return - except SpreadsheetNotFound as notFound: + except SpreadsheetNotFound: logging.error("Invalid workbook name") logging.error("Enter a valid workbook URL or workbook name") # Raised when both url and name don't find a spreadsheet. - raise notFound + raise SpreadsheetNotFound except PermissionError as pe: logging.error("Invalid Permissions, make sure access is granted.") raise pe - def get_spreadsheet( - self, url: str = "", name: str = "" - ) -> Spreadsheet: + def get_spreadsheet(self) -> Spreadsheet: """ - Getter for Spreadsheet. Calls setter if not already present. - - Args: - - url (str): (optional) link to the spreadsheet. - - name (str): (optional) name of spreadsheet. + Getter for Spreadsheet. Returns: - Spreadsheet. """ - if self.spreadsheet is not None: - return self.spreadsheet - else: - self.set_spreadsheet(url, name) - return self.spreadsheet + return self.spreadsheet - def set_worksheet(self, title: str = None): + def _set_worksheet(self, title: str | None = None): """ Sets the worksheet based on the given title. A spreadsheet needs to have already been set. @@ -92,36 +100,30 @@ def set_worksheet(self, title: str = None): spreadsheet applications """ try: - if self.spreadsheet is None: + if title is None: logging.info("No name was provided. Using the first sheet") - self.spreadsheet = self.worksheet.get_worksheet(0) + self.worksheet = self.spreadsheet.get_worksheet(0) else: self.worksheet = self.spreadsheet.worksheet(title) + except AttributeError as ae: + logging.error("No spreadsheet was provided") + raise SpreadsheetNotFound except WorksheetNotFound as notFound: - logging.warning("No worksheet was found") + logging.warning(f"No worksheet was found with {title}") raise notFound - logging.info(f"worksheet found") + logging.info(f"worksheet found with title {self.worksheet.title}") - def get_worksheet(self, title: str = "") -> Worksheet: + def get_worksheet(self) -> Worksheet: """ - Getter for worksheet. Calls setter if not already present. - - Params: - - title (str) : title of the worksheet + Getter for worksheet. Returns: - Worksheet """ - if self.worksheet: - pass - elif title: - self.set_worksheet(title) - else: - self.set_worksheet() return self.worksheet - def set_content(self, range: str = None): + def _set_content(self, range: str | None = None): """ Retreives content from gsheet. removes empty rows. """ @@ -135,20 +137,22 @@ def set_content(self, range: str = None): content = self.worksheet.get_all_values() self.content = remove_empty_rows_and_columns(content) - def get_content(self, range: str = None) -> list: + def get_content(self) -> list: """ - Getter for content. Calls setter if not already present. + Getter for content Returns: - List of lists containing the content of the sheet. """ - if self.content is None: - self.set_content(range) return self.content - def set_header(self, no_header: str = None, header_range: str = None): + def _set_header( + self, + no_header: bool | None = None, + header_range: str | None = None, + ): """ - Creates the header for the content. Takes it from content if no_header is False. + Creates the header for the content. Takes it from content if no_header is False or None. Params: - no_header (str): (optional) input of args.no_header @@ -156,7 +160,7 @@ def set_header(self, no_header: str = None, header_range: str = None): """ # Set content if none is set yet. if self.content is None: - self.set_content() + self._set_content() if not no_header: if header_range: @@ -173,27 +177,21 @@ def set_header(self, no_header: str = None, header_range: str = None): else: self.header = ["column_" + str(i + 1) for i in range(len(self.content[0]))] - def get_header(self, no_header: bool = False, range: str = None) -> list: + def get_header(self) -> list: """ - Getter for the header. Calls setter if not already present. - - Params: - - no_header (str): (optional) True if no header is provided (legacy naming) - - range (str): (optional) range of the header. + Getter for the header. Returns - List containing header values. """ - if self.header == []: - self.set_header(no_header, range) return self.header - def set_column_types(self): + def _set_column_types(self): """ Set a list with the postgrestype for each column in content. """ if not self.content or self.header == []: - self.set_header() # will also set content + self._set_header() # will also set content postgres_types = [] @@ -228,21 +226,10 @@ def get_column_types(self) -> list: - list of column types """ if not self.column_types: - self.set_column_types() + self._set_column_types() return self.column_types -def set_gsheet_variables(gsheet: Gsheet, url: str, args): - """ - Function to set all a bunch of variables for gsheet. - """ - # Set spreadsheet and worksheet. - gsheet.set_spreadsheet(url=url, name=args.workbook_name) - gsheet.set_worksheet(title=args.sheet) - # Set all content. - gsheet.set_content(args.range) - gsheet.set_header(args.no_header, args.header_range) - gsheet.set_column_types() def _authorize() -> Client: """ @@ -252,7 +239,7 @@ def _authorize() -> Client: - Client for authorization when handling data """ json_keyfile = _read_secret() - return gspread.service_account( + return gspread.auth.service_account( json_keyfile, ) diff --git a/wherescape/connectors/gsheet/load_data.py b/wherescape/connectors/gsheet/load_data.py index c9a9b66..69c4c1d 100644 --- a/wherescape/connectors/gsheet/load_data.py +++ b/wherescape/connectors/gsheet/load_data.py @@ -4,13 +4,13 @@ from datetime import datetime, UTC from itertools import zip_longest -from .gsheets_wrapper import Gsheet, set_gsheet_variables +from .gsheets_wrapper import Gsheet from .gsheets_parsing import parse_gspread_arguments from ...helper_functions import set_date_to_ymd from ...wherescape import WhereScape -def google_sheet_load_data(): +def python_gsheet_load_data(): """ Loads content of a google sheet file into a table from a google sheet. Relevant metadata must already be created. @@ -19,9 +19,14 @@ def google_sheet_load_data(): logging.info("Start time: %s" % start_time.strftime("%Y-%m-%d %H:%M:%S")) wherescape = WhereScape() - gsheet = Gsheet() table_name = wherescape.table column_names, column_types = wherescape.get_columns() + try: + assert column_names is not None + assert column_types is not None + except AssertionError as e: + logging.error("no column_names or column_types found in wherescape") + raise e url = wherescape.query_meta( "select lt_file_path from ws_load_tab where lt_table_name = ?", @@ -35,8 +40,8 @@ def google_sheet_load_data(): logging.info(f"Metadata. URL: {url} ; Details : {workbook_details}") args = parse_gspread_arguments(workbook_details) - set_gsheet_variables(gsheet, url, args) - content = gsheet.get_content(args.range) + gsheet = Gsheet(args, url) + content = gsheet.get_content() # For name consistency. gsheet_header = gsheet.get_header() @@ -93,10 +98,10 @@ def get_missing_columns(input_header: list, expected: list) -> tuple: - indexes (list): list of index values of missing columns. """ # WS might end with digits. Remove those to compare. - if re.search(r'_\d{3}$', input_header[0]) is not None: - input_header = remove_final_digits(input_header) - elif re.search(r'_\d{3}$', expected[0]) is not None: - expected = remove_final_digits(expected) + if input_header[0] is not None: + input_header = input_header + elif expected[0] is not None: + expected = expected columns = [] indexes = [] @@ -137,13 +142,13 @@ def remove_extra_columns(content: list, header: list, indexes: list)-> tuple: remove columns from content that aren't listed for it's destination. Params: - - content (list): full content to add empty columns to. - - header (list): header to add missing headers to. - - indexes (list): indexes of missing columns. + - content (list): full content to remove unlisted columns to. + - header (list): header to remove unlisted headers to. + - indexes (list): indexes of unlisted columns. Returns: - - content (list): content including new columns for missing fields. - - header (list): header including new column names for missing fields. + - content (list): content excluding columns for unlisted fields. + - header (list): header excluding columns for unlisted fields. """ transposed = [list(i) for i in zip_longest(*content, fillvalue=None)] indexes.reverse() @@ -153,7 +158,7 @@ def remove_extra_columns(content: list, header: list, indexes: list)-> tuple: content = [list(i) for i in zip_longest(*transposed, fillvalue=None)] return content, header -def remove_final_digits(header: list) -> list: +def remove_final_digits(headers: list[str]) -> list: """ Removes _000 (or other digits) from the end of words in a list if they are there. This method is to make comparing easier since the numbers might differ if the columns are not the same. @@ -165,9 +170,9 @@ def remove_final_digits(header: list) -> list: - result (list): new header without the _000. """ result = [] - for header in header: - if re.search(r'_\d{3}$', header) is None: - result.append(header) + for header in headers: + if re.search(pattern=r"_\d{3}$", string=header) is None: + result.append(headers) else: - result.append(header[:-4]) + result.append(headers[:-4]) return result diff --git a/wherescape/connectors/gsheet/readme.md b/wherescape/connectors/gsheet/readme.md index fad32ce..a2f25c2 100644 --- a/wherescape/connectors/gsheet/readme.md +++ b/wherescape/connectors/gsheet/readme.md @@ -7,11 +7,13 @@ DEFAULT_SCOPES =[ ] # Gsheet connector -Gsheet Connector for WhereSCape. Takes care of creating metadata for a loading data and uploading the data from a gsheet file. +Gsheet Connector for WhereScape. Takes care of creating metadata for a loading data and uploading the data from a gsheet file. -# Preparation +## WhereScape Parameters + +## Connection Details An authentication user is required from Wherescape. For this, a client secret has to be created in the Google API Console. -Its secret should be stored in `%%APPDATAPP\gspread` for Windows orin `~/.gspread` for Unix. +Its secret should be stored in `%%APPDATAPP\gspread` for Windows or in `~/.gspread` for Unix. The default scopes for this client secret are: @@ -22,9 +24,6 @@ DEFAULT_SCOPES =[ ] ``` -## WhereScape Parameters - - ## Load table Create 1 load table. with a script based load. diff --git a/wherescape/connectors/gsheet/tests/fixtures.py b/wherescape/connectors/gsheet/tests/fixtures.py deleted file mode 100644 index 48985f4..0000000 --- a/wherescape/connectors/gsheet/tests/fixtures.py +++ /dev/null @@ -1,8 +0,0 @@ -import pytest - -from wherescape.connectors.gsheet.gsheets_wrapper import Gsheet - - -@pytest.fixture -def gsheet(): - return Gsheet() \ No newline at end of file diff --git a/wherescape/connectors/gsheet/tests/test_gsheets_wrapper.py b/wherescape/connectors/gsheet/tests/test_gsheets_wrapper.py index f8eb5ba..20c1322 100644 --- a/wherescape/connectors/gsheet/tests/test_gsheets_wrapper.py +++ b/wherescape/connectors/gsheet/tests/test_gsheets_wrapper.py @@ -1,8 +1,9 @@ from datetime import datetime +import logging import pytest from wherescape.connectors.gsheet.gsheets_wrapper import Gsheet, get_python_type -from gspread.exceptions import SpreadsheetNotFound +from gspread.exceptions import SpreadsheetNotFound, WorksheetNotFound NO_ACCESS_URL = "https://docs.google.com/spreadsheets/d/1lhrCYDeMpX8DUdoI_JC4hEmhhVqUUOkCIkTKgHpcN0o/edit?usp=drive_link" @@ -11,22 +12,19 @@ BASIC_FILE_NAME = "basic data file" DIFF_START_CELL_URL = "https://docs.google.com/spreadsheets/d/15W15G9ERorhGT5QhvO8IJVTWvtbQC9Puzxi3txmU9ZM/edit?usp=drive_link" DIFF_START_CELL_NAME = "middle_starter_cell_50_rows" -NO_HEADER_URL = "https://docs.google.com/spreadsheets/d/1pdqdoPnkwTIjccXstqT6TeWzFT0rxez3KdJItO9j0Ng/edit?usp=drive_link" -NO_HEADER_NAME = "No header File" FAKE_URL = "https://docs.google.com/spreadsheets/d/1pdqdoPnhdTIjccXstqK7ReWzFT0rxez3gdJItO9j0Ng/edit?usp=drive_link" BAD_URL = "https://docs.google.com/spreadsheets/d/1pdqdoPnhdTIjccXstqK7" -NOT_URL = "skjdhfsjdfgfjsdhgfikrfgakerfdfggfd" class TestGsheet: def setup_method(self, method): - self.gsheet = Gsheet() + self.gsheet = Gsheet(test=True) def test_set_spreadsheet_on_url(self): """ Test that a spreadsheet can be set using a valid url. """ url = BASIC_FILE_URL - self.gsheet.set_spreadsheet(url=url) + self.gsheet._set_spreadsheet(url=url) assert self.gsheet.spreadsheet is not None @@ -35,7 +33,7 @@ def test_set_spreadsheet_on_name(self): Test that a spreadsheet can be set using a valid name. """ name = BASIC_FILE_NAME - self.gsheet.set_spreadsheet(name=name) + self.gsheet._set_spreadsheet(name=name) assert self.gsheet.spreadsheet is not None @@ -53,25 +51,15 @@ def test_set_spreadsheet_errors(self, url, exception): Test that the correct error is thrown given the url. """ with pytest.raises(exception): - self.gsheet.set_spreadsheet(url=url) - - def test_get_spreadsheet_calls_setter_when_not_set(self, mocker): - """ - Test that the setter is called when no spreadsheet is set before calling - get spreadsheet. - """ - mock_method = mocker.patch.object(self.gsheet, "set_spreadsheet") - self.gsheet.set_spreadsheet() - - # Assertion - mock_method.assert_called_once() + self.gsheet._set_spreadsheet(url=url) def test_set_worksheet_no_input(self): """ Test a worksheet is set without any inputs. """ - self.gsheet.set_spreadsheet(url=BASIC_FILE_URL) - self.gsheet.set_worksheet() + self.gsheet._set_spreadsheet(url=BASIC_FILE_URL) + logging.warning(self.gsheet.spreadsheet) + self.gsheet._set_worksheet() assert (self.gsheet.spreadsheet is not None) @@ -79,10 +67,10 @@ def test_set_worksheet_with_input(self): """ Test worksheet gets sst using the title of the worksheet. """ - title = "First" + title = "Second" url = DIFF_START_CELL_URL - self.gsheet.set_spreadsheet(url=url) - self.gsheet.set_worksheet(title=title) + self.gsheet._set_spreadsheet(url=url) + self.gsheet._set_worksheet(title=title) assert (self.gsheet.spreadsheet is not None) @@ -92,124 +80,69 @@ def test_set_worksheet_no_spreadsheet(self): calling set_worksheet. """ with pytest.raises(SpreadsheetNotFound): - self.gsheet.set_worksheet() + self.gsheet._set_worksheet() def test_set_worksheet_incorrect_input(self): """ - Test to see if it uses the second title option if the first one fails. + Test raises WorksheetNotFound if no worksheet was found with the given title. """ - self.gsheet.set_spreadsheet(url=BASIC_FILE_URL) - self.gsheet.set_worksheet(title="sdfsffuysdkjfhsdkjf") - - assert (self.gsheet.spreadsheet is not None) - assert (self.gsheet.worksheet.title == "Table1") - - def test_get_worksheet_calls_setter_when_not_set(self, mocker): - """ - Test to see if setter is called if worksheet is not yet set while - calling its getter. - """ - self.gsheet.set_spreadsheet(url=BASIC_FILE_URL) - mock_method = mocker.patch.object(self.gsheet, "set_worksheet") - self.gsheet.get_worksheet() - - # Assertion - mock_method.assert_called_once() + self.gsheet._set_spreadsheet(url=BASIC_FILE_URL) + with pytest.raises(WorksheetNotFound): + self.gsheet._set_worksheet(title="sdfsffuysdkjfhsdkjf") @pytest.mark.parametrize( ("url", "rows", "columns"), ( (BASIC_FILE_URL, 51, 7), - (DIFF_START_CELL_URL, 51, 9), + (DIFF_START_CELL_URL, 50, 7), ) ) def test_set_content_has_all_content_of_sheet(self, url, rows, columns): """ Check that set_content collects akk tge data """ - self.gsheet.set_spreadsheet(url=url) - self.gsheet.set_worksheet() - self.gsheet.set_content() + self.gsheet._set_spreadsheet(url=url) + self.gsheet._set_worksheet() + self.gsheet._set_content() content = self.gsheet.content assert len(content) == rows assert len(content[0]) == columns - def test_get_content_calls_setter_if_not_set(self, mocker): - """ - Test to see if setter is called if content is not yet set while - calling get_content. - """ - self.gsheet.set_spreadsheet(url=BASIC_FILE_URL) - self.gsheet.set_worksheet() - mock_method = mocker.patch.object(self.gsheet, "set_content") - self.gsheet.get_content() - - # Assertion - mock_method.assert_called_once() - def test_set_header_gets_first_row_if_header(self): """ Test to see if """ - self.gsheet.set_spreadsheet(url=BASIC_FILE_URL) - self.gsheet.set_worksheet() + self.gsheet._set_spreadsheet(url=BASIC_FILE_URL) + self.gsheet._set_worksheet() + self.gsheet._set_content() content = self.gsheet.get_content() first_row = content[0] - self.gsheet.set_header() + self.gsheet._set_header() assert self.gsheet.header == first_row def test_set_header_no_header_given(self): - self.gsheet.set_spreadsheet(url=BASIC_FILE_URL) - self.gsheet.set_worksheet() + self.gsheet._set_spreadsheet(url=BASIC_FILE_URL) + self.gsheet._set_worksheet() + self.gsheet._set_content() content = self.gsheet.get_content() first_row = self.header = ["column_" + str(i + 1) for i in range(len(content[0]))] - self.gsheet.set_header(no_header="true") + self.gsheet._set_header(no_header=True) assert self.gsheet.header == first_row - def test_get_header_calls_setter_if_not_set(self, mocker): - """ - Test to see if setter is called if content is not yet set while - calling get_content. - """ - self.gsheet.set_spreadsheet(url=BASIC_FILE_URL) - self.gsheet.set_worksheet() - self.gsheet.set_content() - mock_method = mocker.patch.object(self.gsheet, "set_header") - self.gsheet.get_header() - - # Assertion - mock_method.assert_called_once() - def test_set_column_types(self): - self.gsheet.set_spreadsheet(url=BASIC_FILE_URL) - self.gsheet.set_worksheet() - self.gsheet.set_content() - self.gsheet.set_header() - self.gsheet.set_column_types() + self.gsheet._set_spreadsheet(url=BASIC_FILE_URL) + self.gsheet._set_worksheet() + self.gsheet._set_content() + self.gsheet._set_header() + self.gsheet._set_column_types() expected = ["text", "numeric", "numeric", "text", "text", "text", "timestamp"] assert len(self.gsheet.column_types) == len(expected) assert self.gsheet.column_types == expected - def test_get_column_types_calls_setter_if_not_set(self, mocker): - self.gsheet.set_spreadsheet(url=BASIC_FILE_URL) - self.gsheet.set_worksheet() - self.gsheet.set_content() - self.gsheet.set_header() - mock_method = mocker.patch.object(self.gsheet, "set_column_types") - self.gsheet.get_column_types() - - mock_method.assert_called_once() - -def test_set_gsheet_variables(): - pass - -def test_remove_empy_rows_and_columns(): - pass - @pytest.mark.parametrize( ("input", "expected_type"), ( diff --git a/wherescape/helper_functions.py b/wherescape/helper_functions.py index 2fcdae8..e98477e 100644 --- a/wherescape/helper_functions.py +++ b/wherescape/helper_functions.py @@ -19,7 +19,7 @@ def create_column_names(display_names=[]): column = slugify(display_name, separator="_", max_length=59) if column == "": column = "column" - column = f"{column}_{str(i + 1).zfill(3)}" + column = f"{column}" columns.append(column) i += 1 return columns @@ -238,7 +238,7 @@ def set_date_to_ymd(value: str | None) -> str | None: return parse(value).strftime("%Y-%m-%d") if value is not None else value -def get_python_type(column_values: list) -> str: +def get_python_type(column_values: list) -> type: """ Returns string of the Python type fit for the data in the list. diff --git a/wherescape/wherescape.py b/wherescape/wherescape.py index 8894255..73a5371 100644 --- a/wherescape/wherescape.py +++ b/wherescape/wherescape.py @@ -106,7 +106,8 @@ def get_columns(self): sql = "SELECT sc_col_name, sc_data_type FROM ws_stage_col WHERE sc_obj_key = ? ORDER BY sc_order" else: logging.warning("Invalid schema: %s", self.schema) - return None + # returning same amount removes problems showing in IDE + return (None, None) results = self.query_meta(sql, [self.object_key]) if results: From 7c312806254bafc166223aac00c641b006bba403 Mon Sep 17 00:00:00 2001 From: Esther ZR Date: Fri, 8 Nov 2024 16:04:02 +0100 Subject: [PATCH 7/7] fix: remove extra value --- wherescape/helper_functions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/wherescape/helper_functions.py b/wherescape/helper_functions.py index e98477e..6318334 100644 --- a/wherescape/helper_functions.py +++ b/wherescape/helper_functions.py @@ -13,7 +13,6 @@ def create_column_names(display_names=[]): Columns get truncated to 59 characters, because 63 characters is the max column lenght for Postgres columns. """ - i = 0 columns = [] for display_name in display_names: column = slugify(display_name, separator="_", max_length=59) @@ -21,7 +20,6 @@ def create_column_names(display_names=[]): column = "column" column = f"{column}" columns.append(column) - i += 1 return columns