From 9c2f5e22c6f57fe2896f904f17be683a935e7281 Mon Sep 17 00:00:00 2001 From: Brendan O'Connell Date: Fri, 8 Aug 2025 16:09:59 +0200 Subject: [PATCH 1/6] Created get_work, create contributors --- README.md | 2 +- bookloader.py | 2 + csvloader.py | 384 ++++++++++++++++++++++++++++++++++++++++++++++++++ loader.py | 2 + 4 files changed, 389 insertions(+), 1 deletion(-) create mode 100644 csvloader.py diff --git a/README.md b/README.md index 328deba..c1c2c03 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ pip install -r requirements.txt ## CLI Usage -Available modes, depending on publisher input: `OBP` (Open Book Publishers), `punctum` (punctum books), `AM` (African Minds), `UWP` (University of Westminster Press), `WHP` (The White Horse Press), `EDITUS` (Editus), `EDUEPB` (EDUEPB), `EDUFBA` (EDUFBA), `Rosario` (Editorial Universidad del Rosario), `Leuven` (Leuven University Press), `LHarmattan` (L'Harmattan) +Available modes, depending on publisher input: `OBP` (Open Book Publishers), `punctum` (punctum books), `AM` (African Minds), `UWP` (University of Westminster Press), `WHP` (The White Horse Press), `EDITUS` (Editus), `EDUEPB` (EDUEPB), `EDUFBA` (EDUFBA), `Rosario` (Editorial Universidad del Rosario), `Leuven` (Leuven University Press), `LHarmattan` (L'Harmattan), `CSV` (Thoth CSV template) ### Live Thoth API ``` diff --git a/bookloader.py b/bookloader.py index bdbae55..7f4511f 100644 --- a/bookloader.py +++ b/bookloader.py @@ -103,9 +103,11 @@ class BookLoader: "Foreword": "FOREWORD_BY", "A24": "INTRODUCTION_BY", "Introduction": "INTRODUCTION_BY", + "Introduction By": "INTRODUCTION_BY", "writer of introduction": "INTRODUCTION_BY", "A15": "PREFACE_BY", "Preface": "PREFACE_BY", + "Preface By": "PREFACE_BY", "Music editor": "MUSIC_EDITOR", "Research By": "RESEARCH_BY", "Contributions By": "CONTRIBUTIONS_BY", diff --git a/csvloader.py b/csvloader.py new file mode 100644 index 0000000..1606809 --- /dev/null +++ b/csvloader.py @@ -0,0 +1,384 @@ +#!/usr/bin/env python +"""Load book metadata from official Thoth CSV template into Thoth""" + +import logging +import sys +import re +import pandas as pd +from bookloader import BookLoader +from thothlibrary import ThothError + + +class CSVLoader(BookLoader): + """Thoth CSV template specific logic to ingest metadata from CSV into Thoth""" + single_imprint = True + cache_contributors = True + cache_institutions = False + publisher_name = "Insert publisher name here" + publisher_shortname = "Insert publisher shortname here" + publisher_url = "https://insertpublisherwebsite.com" + + def run(self): + """Process CSV and call Thoth to insert its data""" + + # find number of contributor columns. The template has columns for 5 contributors + # by default, but publishers may add additional sets of columns for additional contributors. 
+ contributions_index = self.get_highest_contributor_index(self.data.columns) + # logging.info(self.data) + + for index, row in self.data.iterrows(): + logging.info("\n\n\n\n**********") + print(row['title'], type(row['title'])) + print(self.data.columns) + # Data start in row 2, so start counting in logging from there + logging.info(f"processing book from row {index + 2}: {row['title']}") + work = self.get_work(row) + # logging.info(work) + # try to find the work in Thoth + try: + work_id = self.thoth.work_by_doi(work['doi']).workId + existing_work = self.thoth.work_by_id(work_id) + # if work is found, try to update it with the new data + if existing_work: + try: + existing_work.update((k, v) for k, v in work.items() if v is not None) + self.thoth.update_work(existing_work) + logging.info(f"workId for updated work: {work_id}") + # if update fails, log the error and exit the import + except ThothError as t: + logging.error(f"Failed to update work with id {work_id}, exception: {t}") + sys.exit(1) + # if work isn't found, create it + except (IndexError, AttributeError, ThothError): + work_id = self.thoth.create_work(work) + logging.info(f"created work with workId: {work_id}") + work = self.thoth.work_by_id(work_id) + self.create_contributors(row, work, contributions_index) + continue + self.create_publications(row, work) + self.create_languages(row, work) + self.create_series(row, work) + self.create_subjects(row, work) + + def get_work(self, row): + """Returns a dictionary with all attributes of a 'work' + + row: current row number + """ + + work_type = row["work_type"] + work_status = row["work_status"] + doi = f"https://doi.org/{row["doi"]}" + + # Exit with error if any of the required fields for Work are not present + if pd.isna(work_type) or pd.isna(work_status) or pd.isna(row["title"]): + logging.error("Work missing a required field: work_type, work_status, or title") + sys.exit(1) + + title = self.sanitise_title(row["title"], row["subtitle"]) + + work = { + "workType": self.work_types[work_type], + "workStatus": self.work_statuses[work_status], + "fullTitle": title["fullTitle"], + "title": title["title"], + "subtitle": title["subtitle"], + "reference": None, + "edition": row["edition"], + "imprintId": self.imprint_id, + "doi": doi, + "publicationDate": row["publication_date"], + "withdrawnDate": row["withdrawn_date"], + "place": row["place_of_publication"], + "width": None, + "height": None, + "pageCount": row["page_count"], + "pageBreakdown": row["page_breakdown"], + "imageCount": row["image_count"], + "tableCount": row["table_count"], + "audioCount": row["audio_count"], + "videoCount": row["video_count"], + "license": row["license"], + "copyrightHolder": row["copyright_holder"], + "landingPage": row["landing_page"], + "lccn": None, + "oclc": None, + "shortAbstract": row["short_abstract"], + "longAbstract": row["long_abstract"], + "generalNote": None, + "toc": None, + "coverUrl": row["cover_url"], + "coverCaption": None, + "firstPage": None, + "lastPage": None, + "pageInterval": None, + } + # Convert NaN to None for all fields + work = {k: (None if pd.isna(v) else v) for k, v in work.items()} + return work + + def create_contributors(self, row, work, contributions_index): + """Creates/updates all contributors associated with the current work and their contributions + + row: current CSV row + + work: Work from Thoth + + contributions_index: Number of sets of contribution columns in the CSV, determined by publisher + """ + + highest_contribution_ordinal = max((c.contributionOrdinal for 
c in work.contributions), default=0) + + for index in range(1, contributions_index + 1): + first_name = row[f"contribution_{index}_first_name"] + surname = row[f"contribution_{index}_surname"] + if not surname: + continue + first_name = first_name.strip() + surname = surname.strip() + full_name = f"{first_name} {surname}" + orcid = None + orcid_column = f"contribution_{index}_orcid" + if orcid_column in row and pd.notna(row[orcid_column]) and row[orcid_column]: + orcid = f"https://orcid.org/{row[orcid_column]}" + website = row[f"contribution_{index}_website"] + contributor = { + "firstName": first_name, + "lastName": surname, + "fullName": full_name, + "orcid": orcid, + "website": website + + } + # Convert NaN to None for all fields + contributor = {k: (None if pd.isna(v) else v) for k, v in contributor.items()} + logging.info(contributor) + + if full_name not in self.all_contributors: + contributor_id = self.thoth.create_contributor(contributor) + logging.info(f"created contributor: {full_name}, {contributor_id}") + # cache new contributor + self.all_contributors[full_name] = contributor_id + else: + contributor_id = self.all_contributors[full_name] + logging.info(f"contributor {full_name} already in Thoth, skipping") + existing_contribution = next( + (c for c in work.contributions if c.contributor.contributorId == contributor_id), + None) + if not existing_contribution: + contribution_type = self.contribution_types[row[f"contribution_{index}_role"]] + main = self.is_main_contribution(row[f"contribution_{index}_main_contribution"]) + contribution = { + "workId": work.workId, + "contributorId": contributor_id, + "contributionType": contribution_type, + "mainContribution": main, + "contributionOrdinal": highest_contribution_ordinal + 1, + "biography": row[f"contribution_{index}_biography"], + "firstName": first_name, + "lastName": surname, + "fullName": full_name + } + # Convert NaN to None for all fields + contribution = {k: (None if pd.isna(v) else v) for k, v in contribution.items()} + logging.info(contribution) + self.thoth.create_contribution(contribution) + logging.info(f"created contribution for {full_name}, type: {contribution_type}") + highest_contribution_ordinal += 1 + else: + logging.info(f"existing contribution for {full_name}, type: {contribution_type}") + + def create_publications(self, row, work): + """Creates PDF and paperback publications associated with the current work + + row: current CSV record + + work: Work from Thoth + """ + isbn = self.sanitise_isbn(row["scs023_isbn"].strip()) + print_landing_page = row["scs023_printed_version"] + pdf_full_text = row["fulltext_repository"] + + publications = [["PDF", None, work.landingPage]] + # some rows don't have landing page for print + # only create a print Publication in Thoth if print_landing_page exists + if print_landing_page: + publications.append(["PAPERBACK", isbn, print_landing_page]) + + for publication_type, isbn, landing_page in publications: + publication = { + "workId": work.workId, + "publicationType": publication_type, + "isbn": isbn, + "widthMm": None, + "widthIn": None, + "heightMm": None, + "heightIn": None, + "depthMm": None, + "depthIn": None, + "weightG": None, + "weightOz": None, + } + + existing_pub = next((p for p in work.publications if p.publicationType == publication_type), None) + if existing_pub: + publication_id = existing_pub.publicationId + logging.info(f"existing {publication_type} publication: {publication_id}") + else: + publication_id = self.thoth.create_publication(publication) + 
logging.info(f"created {publication_type} publication: {publication_id}") + if (existing_pub and + any(location.locationPlatform == "PUBLISHER_WEBSITE" for location in existing_pub.locations)): + logging.info("existing location") + continue + location = { + "publicationId": publication_id, + "landingPage": landing_page, + "fullTextUrl": pdf_full_text if publication_type == "PDF" else None, + "locationPlatform": "PUBLISHER_WEBSITE", + "canonical": "true", + } + self.thoth.create_location(location) + logging.info(f"created location: with publicationId {publication_id}") + + def create_languages(self, row, work): + """Creates language associated with the current work + + row: current CSV record + + work: Work from Thoth + """ + csv_language_codes = row["language_ISO"].split("|") + for csv_language in csv_language_codes: + language_code = csv_language.upper() + # CSV contains "fra" for French instead of "fre" + if language_code == "FRA": + language_code = "FRE" + # check to see if work already has this language + if any(language.languageCode == language_code for language in work.languages): + logging.info("existing language") + return + language = { + "workId": work.workId, + "languageCode": language_code, + "languageRelation": "ORIGINAL", + "mainLanguage": "true" + } + self.thoth.create_language(language) + logging.info(f"created language {language_code} for workId: {work.workId}") + + def create_series(self, row, work): + """Creates series associated with the current work + + row: current CSV row + + work: current work + """ + series_name = row["scs023_series"] + if not series_name: + logging.info(f"{work.fullTitle} missing series name; skipping create_series") + return + if series_name not in self.all_series: + try: + issn = self.sanitise_issn(row["scs023_issn"]) + except ValueError as e: + logging.error(f"{e} ({work.workId})") + issn = None + series = { + "seriesType": "BOOK_SERIES", + "seriesName": series_name, + "issnDigital": issn, + "issnPrint": issn, + "seriesUrl": None, + "seriesDescription": None, + "seriesCfpUrl": None, + "imprintId": self.imprint_id + } + series_id = self.thoth.create_series(series) + logging.info(f"new series created: {series['seriesName']}") + self.all_series[series_name] = series_id + else: + logging.info(f"existing series {series_name}") + series_id = self.all_series[series_name] + + # find all existing issues in Series + current_series = self.thoth.series(series_id) + issue_work_ids = [] + + for issue in current_series.issues: + issue_work_ids.append(issue.work.workId) + + # find out if current work already has an issue. If not, create a new one. 
+ if work.workId not in issue_work_ids: + # count them + number_of_issues = len(current_series.issues) + + # assign next highest issueOrdinal + issue = { + "seriesId": series_id, + "workId": work.workId, + "issueOrdinal": number_of_issues + 1, + } + self.thoth.create_issue(issue) + logging.info(f"Created new issue of {current_series} for work") + else: + logging.info(f"Series Issue already exists for work; skipping creating issue of {current_series}") + + def create_subjects(self, row, work): + """Creates all subjects associated with the current work + + row: current row in CSV + + work: Work from Thoth + """ + keyword_subjects = row["scs023_keywords"].split("|") + + # correctly parse "scs023_field_science" into keywords and add them to keyword_subjects + # example field value: + # "Társadalom és gazdaságtörténet / Social and economic history (12979)| + # Újkori és jelenkori történelem / Modern and contemporary history (12977)" + fields_science = row["scs023_field_science"].split("|") + for field in fields_science: + hungarian_field, second_part = field.split(" / ") + english_field = second_part.rsplit(" ", 1)[0] + keyword_subjects.append(hungarian_field) + keyword_subjects.append(english_field) + + bisac_subjects = row["BISAC_code"].split("|") + thema_subjects = row["Thema_code"].split("|") + + def create_subject(subject_type, subject_code, subject_ordinal): + subject = { + "workId": work.workId, + "subjectType": subject_type, + "subjectCode": subject_code, + "subjectOrdinal": subject_ordinal + } + self.thoth.create_subject(subject) + + def prepare_subject(subjects_array, subject_type): + for subject_ordinal, subject_code in enumerate(subjects_array, start=1): + # check if the work already has a subject with an existing subject type/subject code combination + if not any( + subject.subjectCode == subject_code and subject.subjectType == subject_type + for subject in work.subjects + ): + create_subject(subject_type, subject_code, subject_ordinal) + logging.info(f"New {subject_type} {subject_code} added as Subject") + else: + logging.info(f"Existing {subject_type} {subject_code} already associated with Work") + + prepare_subject(keyword_subjects, "KEYWORD") + prepare_subject(bisac_subjects, "BISAC") + prepare_subject(thema_subjects, "THEMA") + + def get_highest_contributor_index(self, columns): + max_index = 0 + pattern = re.compile(r"contribution_(\d+)_first_name") + for column in columns: + match = pattern.match(column) + if match: + index = int(match.group(1)) + if index > max_index: + max_index = index + return max_index diff --git a/loader.py b/loader.py index d414cbe..44b9d17 100755 --- a/loader.py +++ b/loader.py @@ -29,6 +29,7 @@ from leuvenloader import LeuvenLoader from lharmattanloader import LHarmattanLoader from ubiquityapiloader import UbiquityAPILoader +from csvloader import CSVLoader LOADERS = { "OBP": OBPBookLoader, @@ -54,6 +55,7 @@ "UOL": UOLLoader, "Leuven": LeuvenLoader, "LHarmattan": LHarmattanLoader, + "CSV": CSVLoader, } ARGS = [ From 32e5571da8f0065ebffe95ba0103254772f59560 Mon Sep 17 00:00:00 2001 From: Brendan O'Connell Date: Thu, 14 Aug 2025 16:00:34 +0200 Subject: [PATCH 2/6] complete logic for create_contributors, add logic for create_languages, create subjects --- csvloader.py | 234 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 144 insertions(+), 90 deletions(-) diff --git a/csvloader.py b/csvloader.py index 1606809..265da65 100644 --- a/csvloader.py +++ b/csvloader.py @@ -12,7 +12,7 @@ class CSVLoader(BookLoader): """Thoth CSV template 
specific logic to ingest metadata from CSV into Thoth""" single_imprint = True - cache_contributors = True + cache_contributors = False cache_institutions = False publisher_name = "Insert publisher name here" publisher_shortname = "Insert publisher shortname here" @@ -28,8 +28,8 @@ def run(self): for index, row in self.data.iterrows(): logging.info("\n\n\n\n**********") - print(row['title'], type(row['title'])) - print(self.data.columns) + # print(row['title'], type(row['title'])) + # print(self.data.columns) # Data start in row 2, so start counting in logging from there logging.info(f"processing book from row {index + 2}: {row['title']}") work = self.get_work(row) @@ -53,19 +53,19 @@ def run(self): work_id = self.thoth.create_work(work) logging.info(f"created work with workId: {work_id}") work = self.thoth.work_by_id(work_id) - self.create_contributors(row, work, contributions_index) + # self.create_contributors(row, work, contributions_index) + self.create_languages(row, work) + self.create_subjects(row, work) continue self.create_publications(row, work) - self.create_languages(row, work) self.create_series(row, work) - self.create_subjects(row, work) def get_work(self, row): """Returns a dictionary with all attributes of a 'work' row: current row number """ - + work_type = row["work_type"] work_status = row["work_status"] doi = f"https://doi.org/{row["doi"]}" @@ -138,9 +138,10 @@ def create_contributors(self, row, work, contributions_index): surname = surname.strip() full_name = f"{first_name} {surname}" orcid = None - orcid_column = f"contribution_{index}_orcid" - if orcid_column in row and pd.notna(row[orcid_column]) and row[orcid_column]: - orcid = f"https://orcid.org/{row[orcid_column]}" + orcid_value = row[f"contribution_{index}_orcid"] + # only assign value to orcid variable if a value is present in the row + if pd.notna(orcid_value): + orcid = f"https://orcid.org/{orcid_value}" website = row[f"contribution_{index}_website"] contributor = { "firstName": first_name, @@ -148,12 +149,10 @@ def create_contributors(self, row, work, contributions_index): "fullName": full_name, "orcid": orcid, "website": website - } # Convert NaN to None for all fields contributor = {k: (None if pd.isna(v) else v) for k, v in contributor.items()} - logging.info(contributor) - + if full_name not in self.all_contributors: contributor_id = self.thoth.create_contributor(contributor) logging.info(f"created contributor: {full_name}, {contributor_id}") @@ -181,12 +180,140 @@ def create_contributors(self, row, work, contributions_index): } # Convert NaN to None for all fields contribution = {k: (None if pd.isna(v) else v) for k, v in contribution.items()} - logging.info(contribution) - self.thoth.create_contribution(contribution) + contribution_id = self.thoth.create_contribution(contribution) logging.info(f"created contribution for {full_name}, type: {contribution_type}") highest_contribution_ordinal += 1 else: - logging.info(f"existing contribution for {full_name}, type: {contribution_type}") + logging.info(f"existing contribution for {full_name}, type: {existing_contribution.contributionType}, skipping") + contribution_id = existing_contribution.contributionId + + # find if institution name is present, if not, no institution can be found or created + institution_name = row[f"contribution_{index}_affiliation_institution_name"] + if pd.isna(institution_name): + logging.info("no institution name, skipping creating Affiliation") + continue + + # retrieve institution or create if it doesn't exist + if 
institution_name in self.all_institutions: + institution_id = self.all_institutions[institution_name] + logging.info(f"existing institution {institution_name} found in cached institutions") + else: + ror = None + if row[f"contribution_{index}_affiliation_institution_ror"]: + ror = f"https://ror.org/{row[f"contribution_{index}_affiliation_institution_ror"]}" + institution = { + "institutionName": institution_name, + "institutionDoi": None, + "ror": ror, + "countryCode": None, + } + institution_id = self.thoth.create_institution(institution) + # cache new institution + self.all_institutions[institution_name] = institution_id + logging.info(f"created and cached new institution {institution_name}") + + existing_affiliations = next( + (c.affiliations for c in work.contributions if c.contributionId == contribution_id), []) + if any(a.institution.institutionId == institution_id for a in existing_affiliations): + logging.info(f"contribution for {full_name} already has affiliation, skipping") + continue + else: + # create affiliation + position = None + if row[f"contribution_{index}_affiliation_position"]: + position = row[f"contribution_{index}_affiliation_position"] + # each contributor can only have 1 affiliation in CSV, so affiliationOrdinal is + # harcoded as 1 + affiliation = { + "contributionId": contribution_id, + "institutionId": institution_id, + "position": position, + "affiliationOrdinal": 1 + } + self.thoth.create_affiliation(affiliation) + + def create_languages(self, row, work): + """Creates languages associated with the current work + + row: current CSV record + + work: Work from Thoth + """ + original_language_codes = row["original_language"] + translated_from_language_codes = row["translated_from_language"] + translated_into_language_codes = row["translated_into_language"] + + all_languages = [ + [original_language_codes, "ORIGINAL"], [translated_from_language_codes, "TRANSLATED_FROM"], [translated_into_language_codes, "TRANSLATED_INTO"] + ] + + for languages, language_relation in all_languages: + if pd.notna(languages): + # language codes are separated by ; + languages_array = languages.split(";") + for language_code in languages_array: + if any(work_language.languageCode == language_code for work_language in work.languages): + logging.info(f"existing language {language_code}") + continue + language = { + "workId": work.workId, + "languageCode": language_code, + "languageRelation": language_relation, + "mainLanguage": "true" + } + self.thoth.create_language(language) + logging.info(f"created language {language_code}") + else: + logging.info(f"no languages for {language_relation}") + + def create_subjects(self, row, work): + """Creates all subjects associated with the current work + + row: current row in CSV + + work: Work from Thoth + """ + keyword_subjects = row["scs023_keywords"].split("|") + + # correctly parse "scs023_field_science" into keywords and add them to keyword_subjects + # example field value: + # "Társadalom és gazdaságtörténet / Social and economic history (12979)| + # Újkori és jelenkori történelem / Modern and contemporary history (12977)" + fields_science = row["scs023_field_science"].split("|") + for field in fields_science: + hungarian_field, second_part = field.split(" / ") + english_field = second_part.rsplit(" ", 1)[0] + keyword_subjects.append(hungarian_field) + keyword_subjects.append(english_field) + + bisac_subjects = row["BISAC_code"].split("|") + thema_subjects = row["Thema_code"].split("|") + + def create_subject(subject_type, subject_code, 
subject_ordinal): + subject = { + "workId": work.workId, + "subjectType": subject_type, + "subjectCode": subject_code, + "subjectOrdinal": subject_ordinal + } + self.thoth.create_subject(subject) + + def prepare_subject(subjects_array, subject_type): + for subject_ordinal, subject_code in enumerate(subjects_array, start=1): + # check if the work already has a subject with an existing subject type/subject code combination + if not any( + subject.subjectCode == subject_code and subject.subjectType == subject_type + for subject in work.subjects + ): + create_subject(subject_type, subject_code, subject_ordinal) + logging.info(f"New {subject_type} {subject_code} added as Subject") + else: + logging.info(f"Existing {subject_type} {subject_code} already associated with Work") + + prepare_subject(keyword_subjects, "KEYWORD") + prepare_subject(bisac_subjects, "BISAC") + prepare_subject(thema_subjects, "THEMA") + def create_publications(self, row, work): """Creates PDF and paperback publications associated with the current work @@ -241,32 +368,6 @@ def create_publications(self, row, work): self.thoth.create_location(location) logging.info(f"created location: with publicationId {publication_id}") - def create_languages(self, row, work): - """Creates language associated with the current work - - row: current CSV record - - work: Work from Thoth - """ - csv_language_codes = row["language_ISO"].split("|") - for csv_language in csv_language_codes: - language_code = csv_language.upper() - # CSV contains "fra" for French instead of "fre" - if language_code == "FRA": - language_code = "FRE" - # check to see if work already has this language - if any(language.languageCode == language_code for language in work.languages): - logging.info("existing language") - return - language = { - "workId": work.workId, - "languageCode": language_code, - "languageRelation": "ORIGINAL", - "mainLanguage": "true" - } - self.thoth.create_language(language) - logging.info(f"created language {language_code} for workId: {work.workId}") - def create_series(self, row, work): """Creates series associated with the current work @@ -324,54 +425,7 @@ def create_series(self, row, work): else: logging.info(f"Series Issue already exists for work; skipping creating issue of {current_series}") - def create_subjects(self, row, work): - """Creates all subjects associated with the current work - - row: current row in CSV - - work: Work from Thoth - """ - keyword_subjects = row["scs023_keywords"].split("|") - - # correctly parse "scs023_field_science" into keywords and add them to keyword_subjects - # example field value: - # "Társadalom és gazdaságtörténet / Social and economic history (12979)| - # Újkori és jelenkori történelem / Modern and contemporary history (12977)" - fields_science = row["scs023_field_science"].split("|") - for field in fields_science: - hungarian_field, second_part = field.split(" / ") - english_field = second_part.rsplit(" ", 1)[0] - keyword_subjects.append(hungarian_field) - keyword_subjects.append(english_field) - - bisac_subjects = row["BISAC_code"].split("|") - thema_subjects = row["Thema_code"].split("|") - - def create_subject(subject_type, subject_code, subject_ordinal): - subject = { - "workId": work.workId, - "subjectType": subject_type, - "subjectCode": subject_code, - "subjectOrdinal": subject_ordinal - } - self.thoth.create_subject(subject) - - def prepare_subject(subjects_array, subject_type): - for subject_ordinal, subject_code in enumerate(subjects_array, start=1): - # check if the work already has a 
subject with an existing subject type/subject code combination - if not any( - subject.subjectCode == subject_code and subject.subjectType == subject_type - for subject in work.subjects - ): - create_subject(subject_type, subject_code, subject_ordinal) - logging.info(f"New {subject_type} {subject_code} added as Subject") - else: - logging.info(f"Existing {subject_type} {subject_code} already associated with Work") - - prepare_subject(keyword_subjects, "KEYWORD") - prepare_subject(bisac_subjects, "BISAC") - prepare_subject(thema_subjects, "THEMA") - + def get_highest_contributor_index(self, columns): max_index = 0 pattern = re.compile(r"contribution_(\d+)_first_name") From 92d6452e6b595250baf67da8a119b175fb03d97b Mon Sep 17 00:00:00 2001 From: Brendan O'Connell Date: Mon, 18 Aug 2025 16:32:38 +0200 Subject: [PATCH 3/6] Add logic for create_languages, create_subject, create_publication, create_price, create_location, begin refactoring --- csvloader.py | 259 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 161 insertions(+), 98 deletions(-) diff --git a/csvloader.py b/csvloader.py index 265da65..97ae39b 100644 --- a/csvloader.py +++ b/csvloader.py @@ -12,8 +12,10 @@ class CSVLoader(BookLoader): """Thoth CSV template specific logic to ingest metadata from CSV into Thoth""" single_imprint = True - cache_contributors = False - cache_institutions = False + cache_contributors = True + cache_institutions = True + cache_issues = True + cache_series = True publisher_name = "Insert publisher name here" publisher_shortname = "Insert publisher shortname here" publisher_url = "https://insertpublisherwebsite.com" @@ -24,16 +26,12 @@ def run(self): # find number of contributor columns. The template has columns for 5 contributors # by default, but publishers may add additional sets of columns for additional contributors. 
contributions_index = self.get_highest_contributor_index(self.data.columns) - # logging.info(self.data) for index, row in self.data.iterrows(): logging.info("\n\n\n\n**********") - # print(row['title'], type(row['title'])) - # print(self.data.columns) # Data start in row 2, so start counting in logging from there logging.info(f"processing book from row {index + 2}: {row['title']}") work = self.get_work(row) - # logging.info(work) # try to find the work in Thoth try: work_id = self.thoth.work_by_doi(work['doi']).workId @@ -41,7 +39,7 @@ def run(self): # if work is found, try to update it with the new data if existing_work: try: - existing_work.update((k, v) for k, v in work.items() if v is not None) + self.convert_nan_to_none(existing_work) self.thoth.update_work(existing_work) logging.info(f"workId for updated work: {work_id}") # if update fails, log the error and exit the import @@ -53,10 +51,9 @@ def run(self): work_id = self.thoth.create_work(work) logging.info(f"created work with workId: {work_id}") work = self.thoth.work_by_id(work_id) - # self.create_contributors(row, work, contributions_index) + self.create_contributors(row, work, contributions_index) self.create_languages(row, work) self.create_subjects(row, work) - continue self.create_publications(row, work) self.create_series(row, work) @@ -68,14 +65,20 @@ def get_work(self, row): work_type = row["work_type"] work_status = row["work_status"] - doi = f"https://doi.org/{row["doi"]}" # Exit with error if any of the required fields for Work are not present if pd.isna(work_type) or pd.isna(work_status) or pd.isna(row["title"]): - logging.error("Work missing a required field: work_type, work_status, or title") + logging.error("Work missing a required field: title, workType, or workStatus") sys.exit(1) - title = self.sanitise_title(row["title"], row["subtitle"]) + subtitle = None + if pd.notna(row["subtitle"]): + subtitle = row["subtitle"] + title = self.sanitise_title(row["title"], subtitle) + + doi = None + if pd.notna(row["doi"]): + doi = f"https://doi.org/{row["doi"]}" work = { "workType": self.work_types[work_type], @@ -132,7 +135,9 @@ def create_contributors(self, row, work, contributions_index): for index in range(1, contributions_index + 1): first_name = row[f"contribution_{index}_first_name"] surname = row[f"contribution_{index}_surname"] - if not surname: + + # surname is required to create Contributor. 
If it is NaN, continue + if pd.isna(surname): continue first_name = first_name.strip() surname = surname.strip() @@ -160,18 +165,21 @@ def create_contributors(self, row, work, contributions_index): self.all_contributors[full_name] = contributor_id else: contributor_id = self.all_contributors[full_name] - logging.info(f"contributor {full_name} already in Thoth, skipping") + logging.info(f"contributor {full_name} already in Thoth, skipping creation") existing_contribution = next( (c for c in work.contributions if c.contributor.contributorId == contributor_id), None) if not existing_contribution: - contribution_type = self.contribution_types[row[f"contribution_{index}_role"]] - main = self.is_main_contribution(row[f"contribution_{index}_main_contribution"]) + if pd.notna(row[f"contribution_{index}_role"]): + contribution_type = self.contribution_types[row[f"contribution_{index}_role"]] + else: + logging.error("no contributionType for contribution, cannot create") + continue contribution = { "workId": work.workId, "contributorId": contributor_id, "contributionType": contribution_type, - "mainContribution": main, + "mainContribution": "true", "contributionOrdinal": highest_contribution_ordinal + 1, "biography": row[f"contribution_{index}_biography"], "firstName": first_name, @@ -190,7 +198,7 @@ def create_contributors(self, row, work, contributions_index): # find if institution name is present, if not, no institution can be found or created institution_name = row[f"contribution_{index}_affiliation_institution_name"] if pd.isna(institution_name): - logging.info("no institution name, skipping creating Affiliation") + logging.info("no institution name, skipping creating Institution and Affiliation") continue # retrieve institution or create if it doesn't exist @@ -199,8 +207,8 @@ def create_contributors(self, row, work, contributions_index): logging.info(f"existing institution {institution_name} found in cached institutions") else: ror = None - if row[f"contribution_{index}_affiliation_institution_ror"]: - ror = f"https://ror.org/{row[f"contribution_{index}_affiliation_institution_ror"]}" + if pd.notna(row[f"contribution_{index}_affiliation_institution_ror"]): + ror = f"https://ror.org/{row[f"contribution_{index}_affiliation_institution_ror"].strip()}" institution = { "institutionName": institution_name, "institutionDoi": None, @@ -220,10 +228,10 @@ def create_contributors(self, row, work, contributions_index): else: # create affiliation position = None - if row[f"contribution_{index}_affiliation_position"]: + if pd.notna(row[f"contribution_{index}_affiliation_position"]): position = row[f"contribution_{index}_affiliation_position"] - # each contributor can only have 1 affiliation in CSV, so affiliationOrdinal is - # harcoded as 1 + # each contributor only has 1 affiliation in CSV, so affiliationOrdinal is + # hardcoded as 1 affiliation = { "contributionId": contribution_id, "institutionId": institution_id, @@ -232,6 +240,17 @@ def create_contributors(self, row, work, contributions_index): } self.thoth.create_affiliation(affiliation) + def get_highest_contributor_index(self, columns): + max_index = 0 + pattern = re.compile(r"contribution_(\d+)_first_name") + for column in columns: + match = pattern.match(column) + if match: + index = int(match.group(1)) + if index > max_index: + max_index = index + return max_index + def create_languages(self, row, work): """Creates languages associated with the current work @@ -273,21 +292,11 @@ def create_subjects(self, row, work): work: Work from Thoth """ - 
keyword_subjects = row["scs023_keywords"].split("|") - - # correctly parse "scs023_field_science" into keywords and add them to keyword_subjects - # example field value: - # "Társadalom és gazdaságtörténet / Social and economic history (12979)| - # Újkori és jelenkori történelem / Modern and contemporary history (12977)" - fields_science = row["scs023_field_science"].split("|") - for field in fields_science: - hungarian_field, second_part = field.split(" / ") - english_field = second_part.rsplit(" ", 1)[0] - keyword_subjects.append(hungarian_field) - keyword_subjects.append(english_field) - - bisac_subjects = row["BISAC_code"].split("|") - thema_subjects = row["Thema_code"].split("|") + + thema_subjects = row["thema_subjects"].split(";") if pd.notna(row["thema_subjects"]) else None + bic_subjects = row["bic_subjects"].split(";") if pd.notna(row["bic_subjects"]) else None + bisac_subjects = row["bisac_subjects"].split(";") if pd.notna(row["bisac_subjects"]) else None + keyword_subjects = row["keywords"].split(";") if pd.notna(row["keywords"]) else None def create_subject(subject_type, subject_code, subject_ordinal): subject = { @@ -299,43 +308,43 @@ def create_subject(subject_type, subject_code, subject_ordinal): self.thoth.create_subject(subject) def prepare_subject(subjects_array, subject_type): - for subject_ordinal, subject_code in enumerate(subjects_array, start=1): - # check if the work already has a subject with an existing subject type/subject code combination - if not any( - subject.subjectCode == subject_code and subject.subjectType == subject_type - for subject in work.subjects - ): - create_subject(subject_type, subject_code, subject_ordinal) - logging.info(f"New {subject_type} {subject_code} added as Subject") - else: - logging.info(f"Existing {subject_type} {subject_code} already associated with Work") + if subjects_array: + for subject_ordinal, subject_code in enumerate(subjects_array, start=1): + # check if the work already has a subject with an existing subject type/subject code combination + if not any( + subject.subjectCode == subject_code and subject.subjectType == subject_type + for subject in work.subjects + ): + create_subject(subject_type, subject_code, subject_ordinal) + logging.info(f"New {subject_type} {subject_code} added as Subject") + else: + logging.info(f"Existing {subject_type} {subject_code} already associated with Work") + else: + logging.info(f"no subjects for {subject_type}") - prepare_subject(keyword_subjects, "KEYWORD") - prepare_subject(bisac_subjects, "BISAC") prepare_subject(thema_subjects, "THEMA") - + prepare_subject(bic_subjects, "BIC") + prepare_subject(bisac_subjects, "BISAC") + prepare_subject(keyword_subjects, "KEYWORD") def create_publications(self, row, work): - """Creates PDF and paperback publications associated with the current work + """Creates print and digital publications associated with the current work row: current CSV record work: Work from Thoth """ - isbn = self.sanitise_isbn(row["scs023_isbn"].strip()) - print_landing_page = row["scs023_printed_version"] - pdf_full_text = row["fulltext_repository"] - publications = [["PDF", None, work.landingPage]] - # some rows don't have landing page for print - # only create a print Publication in Thoth if print_landing_page exists - if print_landing_page: - publications.append(["PAPERBACK", isbn, print_landing_page]) + publication_types = ["paperback", "hardback", "pdf", "epub"] + + for publication_type in publication_types: + isbn = None + if 
pd.notna(row[f"publication_{publication_type}_isbn"]): + isbn = self.sanitise_isbn(str(row[f"publication_{publication_type}_isbn"]).strip()) - for publication_type, isbn, landing_page in publications: publication = { "workId": work.workId, - "publicationType": publication_type, + "publicationType": publication_type.upper(), "isbn": isbn, "widthMm": None, "widthIn": None, @@ -347,26 +356,80 @@ def create_publications(self, row, work): "weightOz": None, } - existing_pub = next((p for p in work.publications if p.publicationType == publication_type), None) + # Check if publication already exists + existing_pub = next((p for p in work.publications if p.publicationType == publication_type.upper()), None) if existing_pub: publication_id = existing_pub.publicationId logging.info(f"existing {publication_type} publication: {publication_id}") - else: - publication_id = self.thoth.create_publication(publication) - logging.info(f"created {publication_type} publication: {publication_id}") - if (existing_pub and - any(location.locationPlatform == "PUBLISHER_WEBSITE" for location in existing_pub.locations)): - logging.info("existing location") continue - location = { - "publicationId": publication_id, - "landingPage": landing_page, - "fullTextUrl": pdf_full_text if publication_type == "PDF" else None, - "locationPlatform": "PUBLISHER_WEBSITE", - "canonical": "true", - } - self.thoth.create_location(location) - logging.info(f"created location: with publicationId {publication_id}") + + # Create new publication if it doesn't already exist + publication_id = self.thoth.create_publication(publication) + logging.info(f"created {publication_type} publication: {publication_id}") + + # Handle separate logic for print and digital + if publication_type in ["paperback", "hardback"]: + self.create_price(row, publication_type, publication_id) + elif publication_type in ["pdf", "epub"]: + self.create_location(row, publication_type, publication_id) + + def create_price(self, row, publication_type, publication_id): + """Create price for paperback and hardback publications""" + for index in range(1, 3): + unit_price = None + if pd.notna(row[f"publication_{publication_type}_price_{index}_unit_price"]): + unit_price = row[f"publication_{publication_type}_price_{index}_unit_price"] + + currency = None + if pd.notna(row[f"publication_{publication_type}_price_{index}_currency_code"]): + currency = row[f"publication_{publication_type}_price_{index}_currency_code"] + + if unit_price and currency: + price = { + "publicationId": publication_id, + "currencyCode": currency, + "unitPrice": unit_price + } + self.thoth.create_price(price) + logging.info(f"created price in {currency} for publication {publication_id}") + else: + logging.info(f"missing unit price and/or currency for {publication_type} price {index}, skipping") + + def create_location(self, row, publication_type, publication_id): + """Create location for PDF and EPUB publications""" + + landing_page = row[f"publication_{publication_type}_location_landing_page"] + full_text = row[f"publication_{publication_type}_location_full_text_url"] + location_platform = row[f"publication_{publication_type}_location_platform"] + + if not pd.notna(location_platform): + logging.info(f"no location platform for {publication_type} publication, skip creating location") + return + + # Transform e.g. 
"Publisher Website" to "PUBLISHER_WEBSITE" + location_platform = location_platform.upper().replace(" ", "_") + + location_platforms = ["PROJECT_MUSE", "OAPEN", "DOAB", "JSTOR", "EBSCO_HOST", + "OCLC_KB", "PROQUEST_KB", "PROQUEST_EXLIBRIS", "EBSCO_KB", + "JISC_KB", "GOOGLE_BOOKS", "INTERNET_ARCHIVE", "SCIENCE_OPEN", + "ZENODO", "PUBLISHER_WEBSITE", "THOTH", "OTHER"] + + if location_platform not in location_platforms: + logging.error("Location Platform is not supported by Thoth") + sys.exit(1) + + location = { + "publicationId": publication_id, + "landingPage": landing_page, + "fullTextUrl": full_text, + "locationPlatform": location_platform, + "canonical": "true", + } + + # Convert NaN to None for all fields + location = {k: (None if pd.isna(v) else v) for k, v in location.items()} + self.thoth.create_location(location) + logging.info(f"created location for {location_platform} with publicationId {publication_id}") def create_series(self, row, work): """Creates series associated with the current work @@ -375,16 +438,18 @@ def create_series(self, row, work): work: current work """ - series_name = row["scs023_series"] - if not series_name: + + series_name = row["series_name"] + if pd.isna(series_name): logging.info(f"{work.fullTitle} missing series name; skipping create_series") return if series_name not in self.all_series: - try: - issn = self.sanitise_issn(row["scs023_issn"]) - except ValueError as e: - logging.error(f"{e} ({work.workId})") - issn = None + issn = None + if pd.notna(row["series_issn"]): + try: + issn = self.sanitise_issn(row["series_issn"]) + except ValueError as e: + logging.error(f"{e} ({work.workId})") series = { "seriesType": "BOOK_SERIES", "seriesName": series_name, @@ -414,25 +479,23 @@ def create_series(self, row, work): # count them number_of_issues = len(current_series.issues) - # assign next highest issueOrdinal + # if series_issue_number is present in CSV, use it + if pd.notna(row["series_issue_number"]): + issue_ordinal = row["series_issue_number"] + # otherwise assign next highest issueOrdinal + else: + issue_ordinal = number_of_issues + 1 + issue = { "seriesId": series_id, "workId": work.workId, - "issueOrdinal": number_of_issues + 1, + "issueOrdinal": issue_ordinal, } self.thoth.create_issue(issue) logging.info(f"Created new issue of {current_series} for work") else: logging.info(f"Series Issue already exists for work; skipping creating issue of {current_series}") - - def get_highest_contributor_index(self, columns): - max_index = 0 - pattern = re.compile(r"contribution_(\d+)_first_name") - for column in columns: - match = pattern.match(column) - if match: - index = int(match.group(1)) - if index > max_index: - max_index = index - return max_index + def convert_nan_to_none(self, data_dict): + """Convert NaN values to None for all fields in a dictionary""" + return {k: (None if pd.isna(v) else v) for k, v in data_dict.items()} From 5d93e79d5dee20eda1ee86b99f16f13536c79fe9 Mon Sep 17 00:00:00 2001 From: Brendan O'Connell Date: Wed, 20 Aug 2025 15:16:10 +0200 Subject: [PATCH 4/6] use convert_nan_to_none method --- csvloader.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/csvloader.py b/csvloader.py index 97ae39b..5412729 100644 --- a/csvloader.py +++ b/csvloader.py @@ -39,7 +39,7 @@ def run(self): # if work is found, try to update it with the new data if existing_work: try: - self.convert_nan_to_none(existing_work) + existing_work.update((k, v) for k, v in work.items() if v is not None) self.thoth.update_work(existing_work) 
logging.info(f"workId for updated work: {work_id}") # if update fails, log the error and exit the import @@ -117,7 +117,7 @@ def get_work(self, row): "pageInterval": None, } # Convert NaN to None for all fields - work = {k: (None if pd.isna(v) else v) for k, v in work.items()} + work = self.convert_nan_to_none(work) return work def create_contributors(self, row, work, contributions_index): @@ -156,7 +156,7 @@ def create_contributors(self, row, work, contributions_index): "website": website } # Convert NaN to None for all fields - contributor = {k: (None if pd.isna(v) else v) for k, v in contributor.items()} + contributor = self.convert_nan_to_none(contributor) if full_name not in self.all_contributors: contributor_id = self.thoth.create_contributor(contributor) @@ -187,7 +187,7 @@ def create_contributors(self, row, work, contributions_index): "fullName": full_name } # Convert NaN to None for all fields - contribution = {k: (None if pd.isna(v) else v) for k, v in contribution.items()} + contribution = self.convert_nan_to_none(contribution) contribution_id = self.thoth.create_contribution(contribution) logging.info(f"created contribution for {full_name}, type: {contribution_type}") highest_contribution_ordinal += 1 @@ -427,7 +427,7 @@ def create_location(self, row, publication_type, publication_id): } # Convert NaN to None for all fields - location = {k: (None if pd.isna(v) else v) for k, v in location.items()} + location = self.convert_nan_to_none(location) self.thoth.create_location(location) logging.info(f"created location for {location_platform} with publicationId {publication_id}") From 6c90b13823f4271e1d445dcaf106f412892c1c4f Mon Sep 17 00:00:00 2001 From: Brendan O'Connell Date: Wed, 19 Nov 2025 14:54:51 +0100 Subject: [PATCH 5/6] Partially addressed review comments --- csvloader.py | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/csvloader.py b/csvloader.py index 5412729..e9a2f66 100644 --- a/csvloader.py +++ b/csvloader.py @@ -133,15 +133,15 @@ def create_contributors(self, row, work, contributions_index): highest_contribution_ordinal = max((c.contributionOrdinal for c in work.contributions), default=0) for index in range(1, contributions_index + 1): - first_name = row[f"contribution_{index}_first_name"] - surname = row[f"contribution_{index}_surname"] + given_name = row[f"contribution_{index}_given_name"] + family_name = row[f"contribution_{index}_family_name"] - # surname is required to create Contributor. If it is NaN, continue - if pd.isna(surname): + # family_name is required to create Contributor. 
If it is NaN, continue + if pd.isna(family_name): continue - first_name = first_name.strip() - surname = surname.strip() - full_name = f"{first_name} {surname}" + given_name = given_name.strip() + family_name = family_name.strip() + full_name = f"{given_name} {family_name}" orcid = None orcid_value = row[f"contribution_{index}_orcid"] # only assign value to orcid variable if a value is present in the row @@ -149,8 +149,8 @@ def create_contributors(self, row, work, contributions_index): orcid = f"https://orcid.org/{orcid_value}" website = row[f"contribution_{index}_website"] contributor = { - "firstName": first_name, - "lastName": surname, + "firstName": given_name, + "lastName": family_name, "fullName": full_name, "orcid": orcid, "website": website @@ -158,17 +158,25 @@ def create_contributors(self, row, work, contributions_index): # Convert NaN to None for all fields contributor = self.convert_nan_to_none(contributor) - if full_name not in self.all_contributors: + if orcid and orcid in self.all_contributors: + contributor_id = self.all_contributors[orcid] + logging.info(f"existing contributor with ORCID: {full_name}, {orcid}") + elif full_name in self.all_contributors: + contributor_id = self.all_contributors[full_name] + logging.info(f"existing contributor (name): {full_name}") + else: + # Create new contributor contributor_id = self.thoth.create_contributor(contributor) logging.info(f"created contributor: {full_name}, {contributor_id}") - # cache new contributor + + # Cache by both name and ORCID if available self.all_contributors[full_name] = contributor_id - else: - contributor_id = self.all_contributors[full_name] - logging.info(f"contributor {full_name} already in Thoth, skipping creation") + if orcid: + self.all_contributors[orcid] = contributor_id + existing_contribution = next( - (c for c in work.contributions if c.contributor.contributorId == contributor_id), - None) + (c for c in work.contributions if c.contributor.contributorId == contributor_id), + None) if not existing_contribution: if pd.notna(row[f"contribution_{index}_role"]): contribution_type = self.contribution_types[row[f"contribution_{index}_role"]] @@ -182,8 +190,8 @@ def create_contributors(self, row, work, contributions_index): "mainContribution": "true", "contributionOrdinal": highest_contribution_ordinal + 1, "biography": row[f"contribution_{index}_biography"], - "firstName": first_name, - "lastName": surname, + "firstName": given_name, + "lastName": family_name, "fullName": full_name } # Convert NaN to None for all fields @@ -242,7 +250,7 @@ def create_contributors(self, row, work, contributions_index): def get_highest_contributor_index(self, columns): max_index = 0 - pattern = re.compile(r"contribution_(\d+)_first_name") + pattern = re.compile(r"contribution_(\d+)_given_name") for column in columns: match = pattern.match(column) if match: From bfd64b22a2393127f8ef0decdbf7e6ed960802d3 Mon Sep 17 00:00:00 2001 From: Brendan O'Connell Date: Fri, 28 Nov 2025 16:05:29 +0100 Subject: [PATCH 6/6] Minor improvements to loader used in ingesting Venice University Press CSV --- bookloader.py | 3 ++- csvloader.py | 22 +++++++++++++--------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/bookloader.py b/bookloader.py index e802586..76186b6 100644 --- a/bookloader.py +++ b/bookloader.py @@ -58,7 +58,8 @@ class BookLoader: "edited_volume": "EDITED_BOOK", "Journal Issue": "JOURNAL_ISSUE", "Journal": "JOURNAL_ISSUE", - "textbook": "TEXTBOOK" + "textbook": "TEXTBOOK", + "Textbook": "TEXTBOOK", } work_statuses = 
{ "Active": "ACTIVE", diff --git a/csvloader.py b/csvloader.py index e9a2f66..25048f6 100644 --- a/csvloader.py +++ b/csvloader.py @@ -16,9 +16,9 @@ class CSVLoader(BookLoader): cache_institutions = True cache_issues = True cache_series = True - publisher_name = "Insert publisher name here" - publisher_shortname = "Insert publisher shortname here" - publisher_url = "https://insertpublisherwebsite.com" + publisher_name = "Edizioni Ca’ Foscari – Venice University Press" + publisher_shortname = "ECF" + publisher_url = "https://edizionicafoscari.unive.it/" def run(self): """Process CSV and call Thoth to insert its data""" @@ -95,7 +95,7 @@ def get_work(self, row): "place": row["place_of_publication"], "width": None, "height": None, - "pageCount": row["page_count"], + "pageCount": int(row["page_count"]) if pd.notna(row["page_count"]) else None, "pageBreakdown": row["page_breakdown"], "imageCount": row["image_count"], "tableCount": row["table_count"], @@ -301,10 +301,14 @@ def create_subjects(self, row, work): work: Work from Thoth """ - thema_subjects = row["thema_subjects"].split(";") if pd.notna(row["thema_subjects"]) else None - bic_subjects = row["bic_subjects"].split(";") if pd.notna(row["bic_subjects"]) else None - bisac_subjects = row["bisac_subjects"].split(";") if pd.notna(row["bisac_subjects"]) else None - keyword_subjects = row["keywords"].split(";") if pd.notna(row["keywords"]) else None + thema_subjects = ([s.strip() for s in row["thema_subjects"].split(";")] + if pd.notna(row["thema_subjects"]) else None) + bic_subjects = ([s.strip() for s in row["bic_subjects"].split(";")] + if pd.notna(row["bic_subjects"]) else None) + bisac_subjects = ([s.strip() for s in row["bisac_subjects"].split(";")] + if pd.notna(row["bisac_subjects"]) else None) + keyword_subjects = ([s.strip() for s in row["keywords"].split(";")] + if pd.notna(row["keywords"]) else None) def create_subject(subject_type, subject_code, subject_ordinal): subject = { @@ -489,7 +493,7 @@ def create_series(self, row, work): # if series_issue_number is present in CSV, use it if pd.notna(row["series_issue_number"]): - issue_ordinal = row["series_issue_number"] + issue_ordinal = int(row["series_issue_number"]) # otherwise assign next highest issueOrdinal else: issue_ordinal = number_of_issues + 1