From 9c2f5e22c6f57fe2896f904f17be683a935e7281 Mon Sep 17 00:00:00 2001 From: Brendan O'Connell Date: Fri, 8 Aug 2025 16:09:59 +0200 Subject: [PATCH 1/6] Created get_work, create contributors --- README.md | 2 +- bookloader.py | 2 + csvloader.py | 384 ++++++++++++++++++++++++++++++++++++++++++++++++++ loader.py | 2 + 4 files changed, 389 insertions(+), 1 deletion(-) create mode 100644 csvloader.py diff --git a/README.md b/README.md index 328deba..c1c2c03 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ pip install -r requirements.txt ## CLI Usage -Available modes, depending on publisher input: `OBP` (Open Book Publishers), `punctum` (punctum books), `AM` (African Minds), `UWP` (University of Westminster Press), `WHP` (The White Horse Press), `EDITUS` (Editus), `EDUEPB` (EDUEPB), `EDUFBA` (EDUFBA), `Rosario` (Editorial Universidad del Rosario), `Leuven` (Leuven University Press), `LHarmattan` (L'Harmattan) +Available modes, depending on publisher input: `OBP` (Open Book Publishers), `punctum` (punctum books), `AM` (African Minds), `UWP` (University of Westminster Press), `WHP` (The White Horse Press), `EDITUS` (Editus), `EDUEPB` (EDUEPB), `EDUFBA` (EDUFBA), `Rosario` (Editorial Universidad del Rosario), `Leuven` (Leuven University Press), `LHarmattan` (L'Harmattan), `CSV` (Thoth CSV template) ### Live Thoth API ``` diff --git a/bookloader.py b/bookloader.py index bdbae55..7f4511f 100644 --- a/bookloader.py +++ b/bookloader.py @@ -103,9 +103,11 @@ class BookLoader: "Foreword": "FOREWORD_BY", "A24": "INTRODUCTION_BY", "Introduction": "INTRODUCTION_BY", + "Introduction By": "INTRODUCTION_BY", "writer of introduction": "INTRODUCTION_BY", "A15": "PREFACE_BY", "Preface": "PREFACE_BY", + "Preface By": "PREFACE_BY", "Music editor": "MUSIC_EDITOR", "Research By": "RESEARCH_BY", "Contributions By": "CONTRIBUTIONS_BY", diff --git a/csvloader.py b/csvloader.py new file mode 100644 index 0000000..1606809 --- /dev/null +++ b/csvloader.py @@ -0,0 +1,384 @@ +#!/usr/bin/env python +"""Load book metadata from official Thoth CSV template into Thoth""" + +import logging +import sys +import re +import pandas as pd +from bookloader import BookLoader +from thothlibrary import ThothError + + +class CSVLoader(BookLoader): + """Thoth CSV template specific logic to ingest metadata from CSV into Thoth""" + single_imprint = True + cache_contributors = True + cache_institutions = False + publisher_name = "Insert publisher name here" + publisher_shortname = "Insert publisher shortname here" + publisher_url = "https://insertpublisherwebsite.com" + + def run(self): + """Process CSV and call Thoth to insert its data""" + + # find number of contributor columns. The template has columns for 5 contributors + # by default, but publishers may add additional sets of columns for additional contributors. 
+ contributions_index = self.get_highest_contributor_index(self.data.columns) + # logging.info(self.data) + + for index, row in self.data.iterrows(): + logging.info("\n\n\n\n**********") + print(row['title'], type(row['title'])) + print(self.data.columns) + # Data start in row 2, so start counting in logging from there + logging.info(f"processing book from row {index + 2}: {row['title']}") + work = self.get_work(row) + # logging.info(work) + # try to find the work in Thoth + try: + work_id = self.thoth.work_by_doi(work['doi']).workId + existing_work = self.thoth.work_by_id(work_id) + # if work is found, try to update it with the new data + if existing_work: + try: + existing_work.update((k, v) for k, v in work.items() if v is not None) + self.thoth.update_work(existing_work) + logging.info(f"workId for updated work: {work_id}") + # if update fails, log the error and exit the import + except ThothError as t: + logging.error(f"Failed to update work with id {work_id}, exception: {t}") + sys.exit(1) + # if work isn't found, create it + except (IndexError, AttributeError, ThothError): + work_id = self.thoth.create_work(work) + logging.info(f"created work with workId: {work_id}") + work = self.thoth.work_by_id(work_id) + self.create_contributors(row, work, contributions_index) + continue + self.create_publications(row, work) + self.create_languages(row, work) + self.create_series(row, work) + self.create_subjects(row, work) + + def get_work(self, row): + """Returns a dictionary with all attributes of a 'work' + + row: current row number + """ + + work_type = row["work_type"] + work_status = row["work_status"] + doi = f"https://doi.org/{row["doi"]}" + + # Exit with error if any of the required fields for Work are not present + if pd.isna(work_type) or pd.isna(work_status) or pd.isna(row["title"]): + logging.error("Work missing a required field: work_type, work_status, or title") + sys.exit(1) + + title = self.sanitise_title(row["title"], row["subtitle"]) + + work = { + "workType": self.work_types[work_type], + "workStatus": self.work_statuses[work_status], + "fullTitle": title["fullTitle"], + "title": title["title"], + "subtitle": title["subtitle"], + "reference": None, + "edition": row["edition"], + "imprintId": self.imprint_id, + "doi": doi, + "publicationDate": row["publication_date"], + "withdrawnDate": row["withdrawn_date"], + "place": row["place_of_publication"], + "width": None, + "height": None, + "pageCount": row["page_count"], + "pageBreakdown": row["page_breakdown"], + "imageCount": row["image_count"], + "tableCount": row["table_count"], + "audioCount": row["audio_count"], + "videoCount": row["video_count"], + "license": row["license"], + "copyrightHolder": row["copyright_holder"], + "landingPage": row["landing_page"], + "lccn": None, + "oclc": None, + "shortAbstract": row["short_abstract"], + "longAbstract": row["long_abstract"], + "generalNote": None, + "toc": None, + "coverUrl": row["cover_url"], + "coverCaption": None, + "firstPage": None, + "lastPage": None, + "pageInterval": None, + } + # Convert NaN to None for all fields + work = {k: (None if pd.isna(v) else v) for k, v in work.items()} + return work + + def create_contributors(self, row, work, contributions_index): + """Creates/updates all contributors associated with the current work and their contributions + + row: current CSV row + + work: Work from Thoth + + contributions_index: Number of sets of contribution columns in the CSV, determined by publisher + """ + + highest_contribution_ordinal = max((c.contributionOrdinal for 
c in work.contributions), default=0) + + for index in range(1, contributions_index + 1): + first_name = row[f"contribution_{index}_first_name"] + surname = row[f"contribution_{index}_surname"] + if not surname: + continue + first_name = first_name.strip() + surname = surname.strip() + full_name = f"{first_name} {surname}" + orcid = None + orcid_column = f"contribution_{index}_orcid" + if orcid_column in row and pd.notna(row[orcid_column]) and row[orcid_column]: + orcid = f"https://orcid.org/{row[orcid_column]}" + website = row[f"contribution_{index}_website"] + contributor = { + "firstName": first_name, + "lastName": surname, + "fullName": full_name, + "orcid": orcid, + "website": website + + } + # Convert NaN to None for all fields + contributor = {k: (None if pd.isna(v) else v) for k, v in contributor.items()} + logging.info(contributor) + + if full_name not in self.all_contributors: + contributor_id = self.thoth.create_contributor(contributor) + logging.info(f"created contributor: {full_name}, {contributor_id}") + # cache new contributor + self.all_contributors[full_name] = contributor_id + else: + contributor_id = self.all_contributors[full_name] + logging.info(f"contributor {full_name} already in Thoth, skipping") + existing_contribution = next( + (c for c in work.contributions if c.contributor.contributorId == contributor_id), + None) + if not existing_contribution: + contribution_type = self.contribution_types[row[f"contribution_{index}_role"]] + main = self.is_main_contribution(row[f"contribution_{index}_main_contribution"]) + contribution = { + "workId": work.workId, + "contributorId": contributor_id, + "contributionType": contribution_type, + "mainContribution": main, + "contributionOrdinal": highest_contribution_ordinal + 1, + "biography": row[f"contribution_{index}_biography"], + "firstName": first_name, + "lastName": surname, + "fullName": full_name + } + # Convert NaN to None for all fields + contribution = {k: (None if pd.isna(v) else v) for k, v in contribution.items()} + logging.info(contribution) + self.thoth.create_contribution(contribution) + logging.info(f"created contribution for {full_name}, type: {contribution_type}") + highest_contribution_ordinal += 1 + else: + logging.info(f"existing contribution for {full_name}, type: {contribution_type}") + + def create_publications(self, row, work): + """Creates PDF and paperback publications associated with the current work + + row: current CSV record + + work: Work from Thoth + """ + isbn = self.sanitise_isbn(row["scs023_isbn"].strip()) + print_landing_page = row["scs023_printed_version"] + pdf_full_text = row["fulltext_repository"] + + publications = [["PDF", None, work.landingPage]] + # some rows don't have landing page for print + # only create a print Publication in Thoth if print_landing_page exists + if print_landing_page: + publications.append(["PAPERBACK", isbn, print_landing_page]) + + for publication_type, isbn, landing_page in publications: + publication = { + "workId": work.workId, + "publicationType": publication_type, + "isbn": isbn, + "widthMm": None, + "widthIn": None, + "heightMm": None, + "heightIn": None, + "depthMm": None, + "depthIn": None, + "weightG": None, + "weightOz": None, + } + + existing_pub = next((p for p in work.publications if p.publicationType == publication_type), None) + if existing_pub: + publication_id = existing_pub.publicationId + logging.info(f"existing {publication_type} publication: {publication_id}") + else: + publication_id = self.thoth.create_publication(publication) + 
logging.info(f"created {publication_type} publication: {publication_id}") + if (existing_pub and + any(location.locationPlatform == "PUBLISHER_WEBSITE" for location in existing_pub.locations)): + logging.info("existing location") + continue + location = { + "publicationId": publication_id, + "landingPage": landing_page, + "fullTextUrl": pdf_full_text if publication_type == "PDF" else None, + "locationPlatform": "PUBLISHER_WEBSITE", + "canonical": "true", + } + self.thoth.create_location(location) + logging.info(f"created location: with publicationId {publication_id}") + + def create_languages(self, row, work): + """Creates language associated with the current work + + row: current CSV record + + work: Work from Thoth + """ + csv_language_codes = row["language_ISO"].split("|") + for csv_language in csv_language_codes: + language_code = csv_language.upper() + # CSV contains "fra" for French instead of "fre" + if language_code == "FRA": + language_code = "FRE" + # check to see if work already has this language + if any(language.languageCode == language_code for language in work.languages): + logging.info("existing language") + return + language = { + "workId": work.workId, + "languageCode": language_code, + "languageRelation": "ORIGINAL", + "mainLanguage": "true" + } + self.thoth.create_language(language) + logging.info(f"created language {language_code} for workId: {work.workId}") + + def create_series(self, row, work): + """Creates series associated with the current work + + row: current CSV row + + work: current work + """ + series_name = row["scs023_series"] + if not series_name: + logging.info(f"{work.fullTitle} missing series name; skipping create_series") + return + if series_name not in self.all_series: + try: + issn = self.sanitise_issn(row["scs023_issn"]) + except ValueError as e: + logging.error(f"{e} ({work.workId})") + issn = None + series = { + "seriesType": "BOOK_SERIES", + "seriesName": series_name, + "issnDigital": issn, + "issnPrint": issn, + "seriesUrl": None, + "seriesDescription": None, + "seriesCfpUrl": None, + "imprintId": self.imprint_id + } + series_id = self.thoth.create_series(series) + logging.info(f"new series created: {series['seriesName']}") + self.all_series[series_name] = series_id + else: + logging.info(f"existing series {series_name}") + series_id = self.all_series[series_name] + + # find all existing issues in Series + current_series = self.thoth.series(series_id) + issue_work_ids = [] + + for issue in current_series.issues: + issue_work_ids.append(issue.work.workId) + + # find out if current work already has an issue. If not, create a new one. 
+ if work.workId not in issue_work_ids: + # count them + number_of_issues = len(current_series.issues) + + # assign next highest issueOrdinal + issue = { + "seriesId": series_id, + "workId": work.workId, + "issueOrdinal": number_of_issues + 1, + } + self.thoth.create_issue(issue) + logging.info(f"Created new issue of {current_series} for work") + else: + logging.info(f"Series Issue already exists for work; skipping creating issue of {current_series}") + + def create_subjects(self, row, work): + """Creates all subjects associated with the current work + + row: current row in CSV + + work: Work from Thoth + """ + keyword_subjects = row["scs023_keywords"].split("|") + + # correctly parse "scs023_field_science" into keywords and add them to keyword_subjects + # example field value: + # "Társadalom és gazdaságtörténet / Social and economic history (12979)| + # Újkori és jelenkori történelem / Modern and contemporary history (12977)" + fields_science = row["scs023_field_science"].split("|") + for field in fields_science: + hungarian_field, second_part = field.split(" / ") + english_field = second_part.rsplit(" ", 1)[0] + keyword_subjects.append(hungarian_field) + keyword_subjects.append(english_field) + + bisac_subjects = row["BISAC_code"].split("|") + thema_subjects = row["Thema_code"].split("|") + + def create_subject(subject_type, subject_code, subject_ordinal): + subject = { + "workId": work.workId, + "subjectType": subject_type, + "subjectCode": subject_code, + "subjectOrdinal": subject_ordinal + } + self.thoth.create_subject(subject) + + def prepare_subject(subjects_array, subject_type): + for subject_ordinal, subject_code in enumerate(subjects_array, start=1): + # check if the work already has a subject with an existing subject type/subject code combination + if not any( + subject.subjectCode == subject_code and subject.subjectType == subject_type + for subject in work.subjects + ): + create_subject(subject_type, subject_code, subject_ordinal) + logging.info(f"New {subject_type} {subject_code} added as Subject") + else: + logging.info(f"Existing {subject_type} {subject_code} already associated with Work") + + prepare_subject(keyword_subjects, "KEYWORD") + prepare_subject(bisac_subjects, "BISAC") + prepare_subject(thema_subjects, "THEMA") + + def get_highest_contributor_index(self, columns): + max_index = 0 + pattern = re.compile(r"contribution_(\d+)_first_name") + for column in columns: + match = pattern.match(column) + if match: + index = int(match.group(1)) + if index > max_index: + max_index = index + return max_index diff --git a/loader.py b/loader.py index d414cbe..44b9d17 100755 --- a/loader.py +++ b/loader.py @@ -29,6 +29,7 @@ from leuvenloader import LeuvenLoader from lharmattanloader import LHarmattanLoader from ubiquityapiloader import UbiquityAPILoader +from csvloader import CSVLoader LOADERS = { "OBP": OBPBookLoader, @@ -54,6 +55,7 @@ "UOL": UOLLoader, "Leuven": LeuvenLoader, "LHarmattan": LHarmattanLoader, + "CSV": CSVLoader, } ARGS = [ From 32e5571da8f0065ebffe95ba0103254772f59560 Mon Sep 17 00:00:00 2001 From: Brendan O'Connell Date: Thu, 14 Aug 2025 16:00:34 +0200 Subject: [PATCH 2/6] complete logic for create_contributors, add logic for create_languages, create subjects --- csvloader.py | 234 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 144 insertions(+), 90 deletions(-) diff --git a/csvloader.py b/csvloader.py index 1606809..265da65 100644 --- a/csvloader.py +++ b/csvloader.py @@ -12,7 +12,7 @@ class CSVLoader(BookLoader): """Thoth CSV template 
specific logic to ingest metadata from CSV into Thoth""" single_imprint = True - cache_contributors = True + cache_contributors = False cache_institutions = False publisher_name = "Insert publisher name here" publisher_shortname = "Insert publisher shortname here" @@ -28,8 +28,8 @@ def run(self): for index, row in self.data.iterrows(): logging.info("\n\n\n\n**********") - print(row['title'], type(row['title'])) - print(self.data.columns) + # print(row['title'], type(row['title'])) + # print(self.data.columns) # Data start in row 2, so start counting in logging from there logging.info(f"processing book from row {index + 2}: {row['title']}") work = self.get_work(row) @@ -53,19 +53,19 @@ def run(self): work_id = self.thoth.create_work(work) logging.info(f"created work with workId: {work_id}") work = self.thoth.work_by_id(work_id) - self.create_contributors(row, work, contributions_index) + # self.create_contributors(row, work, contributions_index) + self.create_languages(row, work) + self.create_subjects(row, work) continue self.create_publications(row, work) - self.create_languages(row, work) self.create_series(row, work) - self.create_subjects(row, work) def get_work(self, row): """Returns a dictionary with all attributes of a 'work' row: current row number """ - + work_type = row["work_type"] work_status = row["work_status"] doi = f"https://doi.org/{row["doi"]}" @@ -138,9 +138,10 @@ def create_contributors(self, row, work, contributions_index): surname = surname.strip() full_name = f"{first_name} {surname}" orcid = None - orcid_column = f"contribution_{index}_orcid" - if orcid_column in row and pd.notna(row[orcid_column]) and row[orcid_column]: - orcid = f"https://orcid.org/{row[orcid_column]}" + orcid_value = row[f"contribution_{index}_orcid"] + # only assign value to orcid variable if a value is present in the row + if pd.notna(orcid_value): + orcid = f"https://orcid.org/{orcid_value}" website = row[f"contribution_{index}_website"] contributor = { "firstName": first_name, @@ -148,12 +149,10 @@ def create_contributors(self, row, work, contributions_index): "fullName": full_name, "orcid": orcid, "website": website - } # Convert NaN to None for all fields contributor = {k: (None if pd.isna(v) else v) for k, v in contributor.items()} - logging.info(contributor) - + if full_name not in self.all_contributors: contributor_id = self.thoth.create_contributor(contributor) logging.info(f"created contributor: {full_name}, {contributor_id}") @@ -181,12 +180,140 @@ def create_contributors(self, row, work, contributions_index): } # Convert NaN to None for all fields contribution = {k: (None if pd.isna(v) else v) for k, v in contribution.items()} - logging.info(contribution) - self.thoth.create_contribution(contribution) + contribution_id = self.thoth.create_contribution(contribution) logging.info(f"created contribution for {full_name}, type: {contribution_type}") highest_contribution_ordinal += 1 else: - logging.info(f"existing contribution for {full_name}, type: {contribution_type}") + logging.info(f"existing contribution for {full_name}, type: {existing_contribution.contributionType}, skipping") + contribution_id = existing_contribution.contributionId + + # find if institution name is present, if not, no institution can be found or created + institution_name = row[f"contribution_{index}_affiliation_institution_name"] + if pd.isna(institution_name): + logging.info("no institution name, skipping creating Affiliation") + continue + + # retrieve institution or create if it doesn't exist + if 
institution_name in self.all_institutions: + institution_id = self.all_institutions[institution_name] + logging.info(f"existing institution {institution_name} found in cached institutions") + else: + ror = None + if row[f"contribution_{index}_affiliation_institution_ror"]: + ror = f"https://ror.org/{row[f"contribution_{index}_affiliation_institution_ror"]}" + institution = { + "institutionName": institution_name, + "institutionDoi": None, + "ror": ror, + "countryCode": None, + } + institution_id = self.thoth.create_institution(institution) + # cache new institution + self.all_institutions[institution_name] = institution_id + logging.info(f"created and cached new institution {institution_name}") + + existing_affiliations = next( + (c.affiliations for c in work.contributions if c.contributionId == contribution_id), []) + if any(a.institution.institutionId == institution_id for a in existing_affiliations): + logging.info(f"contribution for {full_name} already has affiliation, skipping") + continue + else: + # create affiliation + position = None + if row[f"contribution_{index}_affiliation_position"]: + position = row[f"contribution_{index}_affiliation_position"] + # each contributor can only have 1 affiliation in CSV, so affiliationOrdinal is + # harcoded as 1 + affiliation = { + "contributionId": contribution_id, + "institutionId": institution_id, + "position": position, + "affiliationOrdinal": 1 + } + self.thoth.create_affiliation(affiliation) + + def create_languages(self, row, work): + """Creates languages associated with the current work + + row: current CSV record + + work: Work from Thoth + """ + original_language_codes = row["original_language"] + translated_from_language_codes = row["translated_from_language"] + translated_into_language_codes = row["translated_into_language"] + + all_languages = [ + [original_language_codes, "ORIGINAL"], [translated_from_language_codes, "TRANSLATED_FROM"], [translated_into_language_codes, "TRANSLATED_INTO"] + ] + + for languages, language_relation in all_languages: + if pd.notna(languages): + # language codes are separated by ; + languages_array = languages.split(";") + for language_code in languages_array: + if any(work_language.languageCode == language_code for work_language in work.languages): + logging.info(f"existing language {language_code}") + continue + language = { + "workId": work.workId, + "languageCode": language_code, + "languageRelation": language_relation, + "mainLanguage": "true" + } + self.thoth.create_language(language) + logging.info(f"created language {language_code}") + else: + logging.info(f"no languages for {language_relation}") + + def create_subjects(self, row, work): + """Creates all subjects associated with the current work + + row: current row in CSV + + work: Work from Thoth + """ + keyword_subjects = row["scs023_keywords"].split("|") + + # correctly parse "scs023_field_science" into keywords and add them to keyword_subjects + # example field value: + # "Társadalom és gazdaságtörténet / Social and economic history (12979)| + # Újkori és jelenkori történelem / Modern and contemporary history (12977)" + fields_science = row["scs023_field_science"].split("|") + for field in fields_science: + hungarian_field, second_part = field.split(" / ") + english_field = second_part.rsplit(" ", 1)[0] + keyword_subjects.append(hungarian_field) + keyword_subjects.append(english_field) + + bisac_subjects = row["BISAC_code"].split("|") + thema_subjects = row["Thema_code"].split("|") + + def create_subject(subject_type, subject_code, 
subject_ordinal): + subject = { + "workId": work.workId, + "subjectType": subject_type, + "subjectCode": subject_code, + "subjectOrdinal": subject_ordinal + } + self.thoth.create_subject(subject) + + def prepare_subject(subjects_array, subject_type): + for subject_ordinal, subject_code in enumerate(subjects_array, start=1): + # check if the work already has a subject with an existing subject type/subject code combination + if not any( + subject.subjectCode == subject_code and subject.subjectType == subject_type + for subject in work.subjects + ): + create_subject(subject_type, subject_code, subject_ordinal) + logging.info(f"New {subject_type} {subject_code} added as Subject") + else: + logging.info(f"Existing {subject_type} {subject_code} already associated with Work") + + prepare_subject(keyword_subjects, "KEYWORD") + prepare_subject(bisac_subjects, "BISAC") + prepare_subject(thema_subjects, "THEMA") + def create_publications(self, row, work): """Creates PDF and paperback publications associated with the current work @@ -241,32 +368,6 @@ def create_publications(self, row, work): self.thoth.create_location(location) logging.info(f"created location: with publicationId {publication_id}") - def create_languages(self, row, work): - """Creates language associated with the current work - - row: current CSV record - - work: Work from Thoth - """ - csv_language_codes = row["language_ISO"].split("|") - for csv_language in csv_language_codes: - language_code = csv_language.upper() - # CSV contains "fra" for French instead of "fre" - if language_code == "FRA": - language_code = "FRE" - # check to see if work already has this language - if any(language.languageCode == language_code for language in work.languages): - logging.info("existing language") - return - language = { - "workId": work.workId, - "languageCode": language_code, - "languageRelation": "ORIGINAL", - "mainLanguage": "true" - } - self.thoth.create_language(language) - logging.info(f"created language {language_code} for workId: {work.workId}") - def create_series(self, row, work): """Creates series associated with the current work @@ -324,54 +425,7 @@ def create_series(self, row, work): else: logging.info(f"Series Issue already exists for work; skipping creating issue of {current_series}") - def create_subjects(self, row, work): - """Creates all subjects associated with the current work - - row: current row in CSV - - work: Work from Thoth - """ - keyword_subjects = row["scs023_keywords"].split("|") - - # correctly parse "scs023_field_science" into keywords and add them to keyword_subjects - # example field value: - # "Társadalom és gazdaságtörténet / Social and economic history (12979)| - # Újkori és jelenkori történelem / Modern and contemporary history (12977)" - fields_science = row["scs023_field_science"].split("|") - for field in fields_science: - hungarian_field, second_part = field.split(" / ") - english_field = second_part.rsplit(" ", 1)[0] - keyword_subjects.append(hungarian_field) - keyword_subjects.append(english_field) - - bisac_subjects = row["BISAC_code"].split("|") - thema_subjects = row["Thema_code"].split("|") - - def create_subject(subject_type, subject_code, subject_ordinal): - subject = { - "workId": work.workId, - "subjectType": subject_type, - "subjectCode": subject_code, - "subjectOrdinal": subject_ordinal - } - self.thoth.create_subject(subject) - - def prepare_subject(subjects_array, subject_type): - for subject_ordinal, subject_code in enumerate(subjects_array, start=1): - # check if the work already has a 
subject with an existing subject type/subject code combination - if not any( - subject.subjectCode == subject_code and subject.subjectType == subject_type - for subject in work.subjects - ): - create_subject(subject_type, subject_code, subject_ordinal) - logging.info(f"New {subject_type} {subject_code} added as Subject") - else: - logging.info(f"Existing {subject_type} {subject_code} already associated with Work") - - prepare_subject(keyword_subjects, "KEYWORD") - prepare_subject(bisac_subjects, "BISAC") - prepare_subject(thema_subjects, "THEMA") - + def get_highest_contributor_index(self, columns): max_index = 0 pattern = re.compile(r"contribution_(\d+)_first_name") From 92d6452e6b595250baf67da8a119b175fb03d97b Mon Sep 17 00:00:00 2001 From: Brendan O'Connell Date: Mon, 18 Aug 2025 16:32:38 +0200 Subject: [PATCH 3/6] Add logic for create_languages, create_subject, create_publication, create_price, create_location, begin refactoring --- csvloader.py | 259 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 161 insertions(+), 98 deletions(-) diff --git a/csvloader.py b/csvloader.py index 265da65..97ae39b 100644 --- a/csvloader.py +++ b/csvloader.py @@ -12,8 +12,10 @@ class CSVLoader(BookLoader): """Thoth CSV template specific logic to ingest metadata from CSV into Thoth""" single_imprint = True - cache_contributors = False - cache_institutions = False + cache_contributors = True + cache_institutions = True + cache_issues = True + cache_series = True publisher_name = "Insert publisher name here" publisher_shortname = "Insert publisher shortname here" publisher_url = "https://insertpublisherwebsite.com" @@ -24,16 +26,12 @@ def run(self): # find number of contributor columns. The template has columns for 5 contributors # by default, but publishers may add additional sets of columns for additional contributors. 
contributions_index = self.get_highest_contributor_index(self.data.columns) - # logging.info(self.data) for index, row in self.data.iterrows(): logging.info("\n\n\n\n**********") - # print(row['title'], type(row['title'])) - # print(self.data.columns) # Data start in row 2, so start counting in logging from there logging.info(f"processing book from row {index + 2}: {row['title']}") work = self.get_work(row) - # logging.info(work) # try to find the work in Thoth try: work_id = self.thoth.work_by_doi(work['doi']).workId @@ -41,7 +39,7 @@ def run(self): # if work is found, try to update it with the new data if existing_work: try: - existing_work.update((k, v) for k, v in work.items() if v is not None) + self.convert_nan_to_none(existing_work) self.thoth.update_work(existing_work) logging.info(f"workId for updated work: {work_id}") # if update fails, log the error and exit the import @@ -53,10 +51,9 @@ def run(self): work_id = self.thoth.create_work(work) logging.info(f"created work with workId: {work_id}") work = self.thoth.work_by_id(work_id) - # self.create_contributors(row, work, contributions_index) + self.create_contributors(row, work, contributions_index) self.create_languages(row, work) self.create_subjects(row, work) - continue self.create_publications(row, work) self.create_series(row, work) @@ -68,14 +65,20 @@ def get_work(self, row): work_type = row["work_type"] work_status = row["work_status"] - doi = f"https://doi.org/{row["doi"]}" # Exit with error if any of the required fields for Work are not present if pd.isna(work_type) or pd.isna(work_status) or pd.isna(row["title"]): - logging.error("Work missing a required field: work_type, work_status, or title") + logging.error("Work missing a required field: title, workType, or workStatus") sys.exit(1) - title = self.sanitise_title(row["title"], row["subtitle"]) + subtitle = None + if pd.notna(row["subtitle"]): + subtitle = row["subtitle"] + title = self.sanitise_title(row["title"], subtitle) + + doi = None + if pd.notna(row["doi"]): + doi = f"https://doi.org/{row["doi"]}" work = { "workType": self.work_types[work_type], @@ -132,7 +135,9 @@ def create_contributors(self, row, work, contributions_index): for index in range(1, contributions_index + 1): first_name = row[f"contribution_{index}_first_name"] surname = row[f"contribution_{index}_surname"] - if not surname: + + # surname is required to create Contributor. 
If it is NaN, continue + if pd.isna(surname): continue first_name = first_name.strip() surname = surname.strip() @@ -160,18 +165,21 @@ def create_contributors(self, row, work, contributions_index): self.all_contributors[full_name] = contributor_id else: contributor_id = self.all_contributors[full_name] - logging.info(f"contributor {full_name} already in Thoth, skipping") + logging.info(f"contributor {full_name} already in Thoth, skipping creation") existing_contribution = next( (c for c in work.contributions if c.contributor.contributorId == contributor_id), None) if not existing_contribution: - contribution_type = self.contribution_types[row[f"contribution_{index}_role"]] - main = self.is_main_contribution(row[f"contribution_{index}_main_contribution"]) + if pd.notna(row[f"contribution_{index}_role"]): + contribution_type = self.contribution_types[row[f"contribution_{index}_role"]] + else: + logging.error("no contributionType for contribution, cannot create") + continue contribution = { "workId": work.workId, "contributorId": contributor_id, "contributionType": contribution_type, - "mainContribution": main, + "mainContribution": "true", "contributionOrdinal": highest_contribution_ordinal + 1, "biography": row[f"contribution_{index}_biography"], "firstName": first_name, @@ -190,7 +198,7 @@ def create_contributors(self, row, work, contributions_index): # find if institution name is present, if not, no institution can be found or created institution_name = row[f"contribution_{index}_affiliation_institution_name"] if pd.isna(institution_name): - logging.info("no institution name, skipping creating Affiliation") + logging.info("no institution name, skipping creating Institution and Affiliation") continue # retrieve institution or create if it doesn't exist @@ -199,8 +207,8 @@ def create_contributors(self, row, work, contributions_index): logging.info(f"existing institution {institution_name} found in cached institutions") else: ror = None - if row[f"contribution_{index}_affiliation_institution_ror"]: - ror = f"https://ror.org/{row[f"contribution_{index}_affiliation_institution_ror"]}" + if pd.notna(row[f"contribution_{index}_affiliation_institution_ror"]): + ror = f"https://ror.org/{row[f"contribution_{index}_affiliation_institution_ror"].strip()}" institution = { "institutionName": institution_name, "institutionDoi": None, @@ -220,10 +228,10 @@ def create_contributors(self, row, work, contributions_index): else: # create affiliation position = None - if row[f"contribution_{index}_affiliation_position"]: + if pd.notna(row[f"contribution_{index}_affiliation_position"]): position = row[f"contribution_{index}_affiliation_position"] - # each contributor can only have 1 affiliation in CSV, so affiliationOrdinal is - # harcoded as 1 + # each contributor only has 1 affiliation in CSV, so affiliationOrdinal is + # hardcoded as 1 affiliation = { "contributionId": contribution_id, "institutionId": institution_id, @@ -232,6 +240,17 @@ def create_contributors(self, row, work, contributions_index): } self.thoth.create_affiliation(affiliation) + def get_highest_contributor_index(self, columns): + max_index = 0 + pattern = re.compile(r"contribution_(\d+)_first_name") + for column in columns: + match = pattern.match(column) + if match: + index = int(match.group(1)) + if index > max_index: + max_index = index + return max_index + def create_languages(self, row, work): """Creates languages associated with the current work @@ -273,21 +292,11 @@ def create_subjects(self, row, work): work: Work from Thoth """ - 
keyword_subjects = row["scs023_keywords"].split("|") - - # correctly parse "scs023_field_science" into keywords and add them to keyword_subjects - # example field value: - # "Társadalom és gazdaságtörténet / Social and economic history (12979)| - # Újkori és jelenkori történelem / Modern and contemporary history (12977)" - fields_science = row["scs023_field_science"].split("|") - for field in fields_science: - hungarian_field, second_part = field.split(" / ") - english_field = second_part.rsplit(" ", 1)[0] - keyword_subjects.append(hungarian_field) - keyword_subjects.append(english_field) - - bisac_subjects = row["BISAC_code"].split("|") - thema_subjects = row["Thema_code"].split("|") + + thema_subjects = row["thema_subjects"].split(";") if pd.notna(row["thema_subjects"]) else None + bic_subjects = row["bic_subjects"].split(";") if pd.notna(row["bic_subjects"]) else None + bisac_subjects = row["bisac_subjects"].split(";") if pd.notna(row["bisac_subjects"]) else None + keyword_subjects = row["keywords"].split(";") if pd.notna(row["keywords"]) else None def create_subject(subject_type, subject_code, subject_ordinal): subject = { @@ -299,43 +308,43 @@ def create_subject(subject_type, subject_code, subject_ordinal): self.thoth.create_subject(subject) def prepare_subject(subjects_array, subject_type): - for subject_ordinal, subject_code in enumerate(subjects_array, start=1): - # check if the work already has a subject with an existing subject type/subject code combination - if not any( - subject.subjectCode == subject_code and subject.subjectType == subject_type - for subject in work.subjects - ): - create_subject(subject_type, subject_code, subject_ordinal) - logging.info(f"New {subject_type} {subject_code} added as Subject") - else: - logging.info(f"Existing {subject_type} {subject_code} already associated with Work") + if subjects_array: + for subject_ordinal, subject_code in enumerate(subjects_array, start=1): + # check if the work already has a subject with an existing subject type/subject code combination + if not any( + subject.subjectCode == subject_code and subject.subjectType == subject_type + for subject in work.subjects + ): + create_subject(subject_type, subject_code, subject_ordinal) + logging.info(f"New {subject_type} {subject_code} added as Subject") + else: + logging.info(f"Existing {subject_type} {subject_code} already associated with Work") + else: + logging.info(f"no subjects for {subject_type}") - prepare_subject(keyword_subjects, "KEYWORD") - prepare_subject(bisac_subjects, "BISAC") prepare_subject(thema_subjects, "THEMA") - + prepare_subject(bic_subjects, "BIC") + prepare_subject(bisac_subjects, "BISAC") + prepare_subject(keyword_subjects, "KEYWORD") def create_publications(self, row, work): - """Creates PDF and paperback publications associated with the current work + """Creates print and digital publications associated with the current work row: current CSV record work: Work from Thoth """ - isbn = self.sanitise_isbn(row["scs023_isbn"].strip()) - print_landing_page = row["scs023_printed_version"] - pdf_full_text = row["fulltext_repository"] - publications = [["PDF", None, work.landingPage]] - # some rows don't have landing page for print - # only create a print Publication in Thoth if print_landing_page exists - if print_landing_page: - publications.append(["PAPERBACK", isbn, print_landing_page]) + publication_types = ["paperback", "hardback", "pdf", "epub"] + + for publication_type in publication_types: + isbn = None + if 
pd.notna(row[f"publication_{publication_type}_isbn"]): + isbn = self.sanitise_isbn(str(row[f"publication_{publication_type}_isbn"]).strip()) - for publication_type, isbn, landing_page in publications: publication = { "workId": work.workId, - "publicationType": publication_type, + "publicationType": publication_type.upper(), "isbn": isbn, "widthMm": None, "widthIn": None, @@ -347,26 +356,80 @@ def create_publications(self, row, work): "weightOz": None, } - existing_pub = next((p for p in work.publications if p.publicationType == publication_type), None) + # Check if publication already exists + existing_pub = next((p for p in work.publications if p.publicationType == publication_type.upper()), None) if existing_pub: publication_id = existing_pub.publicationId logging.info(f"existing {publication_type} publication: {publication_id}") - else: - publication_id = self.thoth.create_publication(publication) - logging.info(f"created {publication_type} publication: {publication_id}") - if (existing_pub and - any(location.locationPlatform == "PUBLISHER_WEBSITE" for location in existing_pub.locations)): - logging.info("existing location") continue - location = { - "publicationId": publication_id, - "landingPage": landing_page, - "fullTextUrl": pdf_full_text if publication_type == "PDF" else None, - "locationPlatform": "PUBLISHER_WEBSITE", - "canonical": "true", - } - self.thoth.create_location(location) - logging.info(f"created location: with publicationId {publication_id}") + + # Create new publication if it doesn't already exist + publication_id = self.thoth.create_publication(publication) + logging.info(f"created {publication_type} publication: {publication_id}") + + # Handle separate logic for print and digital + if publication_type in ["paperback", "hardback"]: + self.create_price(row, publication_type, publication_id) + elif publication_type in ["pdf", "epub"]: + self.create_location(row, publication_type, publication_id) + + def create_price(self, row, publication_type, publication_id): + """Create price for paperback and hardback publications""" + for index in range(1, 3): + unit_price = None + if pd.notna(row[f"publication_{publication_type}_price_{index}_unit_price"]): + unit_price = row[f"publication_{publication_type}_price_{index}_unit_price"] + + currency = None + if pd.notna(row[f"publication_{publication_type}_price_{index}_currency_code"]): + currency = row[f"publication_{publication_type}_price_{index}_currency_code"] + + if unit_price and currency: + price = { + "publicationId": publication_id, + "currencyCode": currency, + "unitPrice": unit_price + } + self.thoth.create_price(price) + logging.info(f"created price in {currency} for publication {publication_id}") + else: + logging.info(f"missing unit price and/or currency for {publication_type} price {index}, skipping") + + def create_location(self, row, publication_type, publication_id): + """Create location for PDF and EPUB publications""" + + landing_page = row[f"publication_{publication_type}_location_landing_page"] + full_text = row[f"publication_{publication_type}_location_full_text_url"] + location_platform = row[f"publication_{publication_type}_location_platform"] + + if not pd.notna(location_platform): + logging.info(f"no location platform for {publication_type} publication, skip creating location") + return + + # Transform e.g. 
"Publisher Website" to "PUBLISHER_WEBSITE" + location_platform = location_platform.upper().replace(" ", "_") + + location_platforms = ["PROJECT_MUSE", "OAPEN", "DOAB", "JSTOR", "EBSCO_HOST", + "OCLC_KB", "PROQUEST_KB", "PROQUEST_EXLIBRIS", "EBSCO_KB", + "JISC_KB", "GOOGLE_BOOKS", "INTERNET_ARCHIVE", "SCIENCE_OPEN", + "ZENODO", "PUBLISHER_WEBSITE", "THOTH", "OTHER"] + + if location_platform not in location_platforms: + logging.error("Location Platform is not supported by Thoth") + sys.exit(1) + + location = { + "publicationId": publication_id, + "landingPage": landing_page, + "fullTextUrl": full_text, + "locationPlatform": location_platform, + "canonical": "true", + } + + # Convert NaN to None for all fields + location = {k: (None if pd.isna(v) else v) for k, v in location.items()} + self.thoth.create_location(location) + logging.info(f"created location for {location_platform} with publicationId {publication_id}") def create_series(self, row, work): """Creates series associated with the current work @@ -375,16 +438,18 @@ def create_series(self, row, work): work: current work """ - series_name = row["scs023_series"] - if not series_name: + + series_name = row["series_name"] + if pd.isna(series_name): logging.info(f"{work.fullTitle} missing series name; skipping create_series") return if series_name not in self.all_series: - try: - issn = self.sanitise_issn(row["scs023_issn"]) - except ValueError as e: - logging.error(f"{e} ({work.workId})") - issn = None + issn = None + if pd.notna(row["series_issn"]): + try: + issn = self.sanitise_issn(row["series_issn"]) + except ValueError as e: + logging.error(f"{e} ({work.workId})") series = { "seriesType": "BOOK_SERIES", "seriesName": series_name, @@ -414,25 +479,23 @@ def create_series(self, row, work): # count them number_of_issues = len(current_series.issues) - # assign next highest issueOrdinal + # if series_issue_number is present in CSV, use it + if pd.notna(row["series_issue_number"]): + issue_ordinal = row["series_issue_number"] + # otherwise assign next highest issueOrdinal + else: + issue_ordinal = number_of_issues + 1 + issue = { "seriesId": series_id, "workId": work.workId, - "issueOrdinal": number_of_issues + 1, + "issueOrdinal": issue_ordinal, } self.thoth.create_issue(issue) logging.info(f"Created new issue of {current_series} for work") else: logging.info(f"Series Issue already exists for work; skipping creating issue of {current_series}") - - def get_highest_contributor_index(self, columns): - max_index = 0 - pattern = re.compile(r"contribution_(\d+)_first_name") - for column in columns: - match = pattern.match(column) - if match: - index = int(match.group(1)) - if index > max_index: - max_index = index - return max_index + def convert_nan_to_none(self, data_dict): + """Convert NaN values to None for all fields in a dictionary""" + return {k: (None if pd.isna(v) else v) for k, v in data_dict.items()} From 5d93e79d5dee20eda1ee86b99f16f13536c79fe9 Mon Sep 17 00:00:00 2001 From: Brendan O'Connell Date: Wed, 20 Aug 2025 15:16:10 +0200 Subject: [PATCH 4/6] use convert_nan_to_none method --- csvloader.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/csvloader.py b/csvloader.py index 97ae39b..5412729 100644 --- a/csvloader.py +++ b/csvloader.py @@ -39,7 +39,7 @@ def run(self): # if work is found, try to update it with the new data if existing_work: try: - self.convert_nan_to_none(existing_work) + existing_work.update((k, v) for k, v in work.items() if v is not None) self.thoth.update_work(existing_work) 
logging.info(f"workId for updated work: {work_id}") # if update fails, log the error and exit the import @@ -117,7 +117,7 @@ def get_work(self, row): "pageInterval": None, } # Convert NaN to None for all fields - work = {k: (None if pd.isna(v) else v) for k, v in work.items()} + work = self.convert_nan_to_none(work) return work def create_contributors(self, row, work, contributions_index): @@ -156,7 +156,7 @@ def create_contributors(self, row, work, contributions_index): "website": website } # Convert NaN to None for all fields - contributor = {k: (None if pd.isna(v) else v) for k, v in contributor.items()} + contributor = self.convert_nan_to_none(contributor) if full_name not in self.all_contributors: contributor_id = self.thoth.create_contributor(contributor) @@ -187,7 +187,7 @@ def create_contributors(self, row, work, contributions_index): "fullName": full_name } # Convert NaN to None for all fields - contribution = {k: (None if pd.isna(v) else v) for k, v in contribution.items()} + contribution = self.convert_nan_to_none(contribution) contribution_id = self.thoth.create_contribution(contribution) logging.info(f"created contribution for {full_name}, type: {contribution_type}") highest_contribution_ordinal += 1 @@ -427,7 +427,7 @@ def create_location(self, row, publication_type, publication_id): } # Convert NaN to None for all fields - location = {k: (None if pd.isna(v) else v) for k, v in location.items()} + location = self.convert_nan_to_none(location) self.thoth.create_location(location) logging.info(f"created location for {location_platform} with publicationId {publication_id}") From 6c90b13823f4271e1d445dcaf106f412892c1c4f Mon Sep 17 00:00:00 2001 From: Brendan O'Connell Date: Wed, 19 Nov 2025 14:54:51 +0100 Subject: [PATCH 5/6] Partially addressed review comments --- csvloader.py | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/csvloader.py b/csvloader.py index 5412729..e9a2f66 100644 --- a/csvloader.py +++ b/csvloader.py @@ -133,15 +133,15 @@ def create_contributors(self, row, work, contributions_index): highest_contribution_ordinal = max((c.contributionOrdinal for c in work.contributions), default=0) for index in range(1, contributions_index + 1): - first_name = row[f"contribution_{index}_first_name"] - surname = row[f"contribution_{index}_surname"] + given_name = row[f"contribution_{index}_given_name"] + family_name = row[f"contribution_{index}_family_name"] - # surname is required to create Contributor. If it is NaN, continue - if pd.isna(surname): + # family_name is required to create Contributor. 
If it is NaN, continue + if pd.isna(family_name): continue - first_name = first_name.strip() - surname = surname.strip() - full_name = f"{first_name} {surname}" + given_name = given_name.strip() + family_name = family_name.strip() + full_name = f"{given_name} {family_name}" orcid = None orcid_value = row[f"contribution_{index}_orcid"] # only assign value to orcid variable if a value is present in the row @@ -149,8 +149,8 @@ def create_contributors(self, row, work, contributions_index): orcid = f"https://orcid.org/{orcid_value}" website = row[f"contribution_{index}_website"] contributor = { - "firstName": first_name, - "lastName": surname, + "firstName": given_name, + "lastName": family_name, "fullName": full_name, "orcid": orcid, "website": website @@ -158,17 +158,25 @@ def create_contributors(self, row, work, contributions_index): # Convert NaN to None for all fields contributor = self.convert_nan_to_none(contributor) - if full_name not in self.all_contributors: + if orcid and orcid in self.all_contributors: + contributor_id = self.all_contributors[orcid] + logging.info(f"existing contributor with ORCID: {full_name}, {orcid}") + elif full_name in self.all_contributors: + contributor_id = self.all_contributors[full_name] + logging.info(f"existing contributor (name): {full_name}") + else: + # Create new contributor contributor_id = self.thoth.create_contributor(contributor) logging.info(f"created contributor: {full_name}, {contributor_id}") - # cache new contributor + + # Cache by both name and ORCID if available self.all_contributors[full_name] = contributor_id - else: - contributor_id = self.all_contributors[full_name] - logging.info(f"contributor {full_name} already in Thoth, skipping creation") + if orcid: + self.all_contributors[orcid] = contributor_id + existing_contribution = next( - (c for c in work.contributions if c.contributor.contributorId == contributor_id), - None) + (c for c in work.contributions if c.contributor.contributorId == contributor_id), + None) if not existing_contribution: if pd.notna(row[f"contribution_{index}_role"]): contribution_type = self.contribution_types[row[f"contribution_{index}_role"]] @@ -182,8 +190,8 @@ def create_contributors(self, row, work, contributions_index): "mainContribution": "true", "contributionOrdinal": highest_contribution_ordinal + 1, "biography": row[f"contribution_{index}_biography"], - "firstName": first_name, - "lastName": surname, + "firstName": given_name, + "lastName": family_name, "fullName": full_name } # Convert NaN to None for all fields @@ -242,7 +250,7 @@ def create_contributors(self, row, work, contributions_index): def get_highest_contributor_index(self, columns): max_index = 0 - pattern = re.compile(r"contribution_(\d+)_first_name") + pattern = re.compile(r"contribution_(\d+)_given_name") for column in columns: match = pattern.match(column) if match: From bfd64b22a2393127f8ef0decdbf7e6ed960802d3 Mon Sep 17 00:00:00 2001 From: Brendan O'Connell Date: Fri, 28 Nov 2025 16:05:29 +0100 Subject: [PATCH 6/6] Minor improvements to loader used in ingesting Venice University Press CSV --- bookloader.py | 3 ++- csvloader.py | 22 +++++++++++++--------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/bookloader.py b/bookloader.py index e802586..76186b6 100644 --- a/bookloader.py +++ b/bookloader.py @@ -58,7 +58,8 @@ class BookLoader: "edited_volume": "EDITED_BOOK", "Journal Issue": "JOURNAL_ISSUE", "Journal": "JOURNAL_ISSUE", - "textbook": "TEXTBOOK" + "textbook": "TEXTBOOK", + "Textbook": "TEXTBOOK", } work_statuses = 
{ "Active": "ACTIVE", diff --git a/csvloader.py b/csvloader.py index e9a2f66..25048f6 100644 --- a/csvloader.py +++ b/csvloader.py @@ -16,9 +16,9 @@ class CSVLoader(BookLoader): cache_institutions = True cache_issues = True cache_series = True - publisher_name = "Insert publisher name here" - publisher_shortname = "Insert publisher shortname here" - publisher_url = "https://insertpublisherwebsite.com" + publisher_name = "Edizioni Ca’ Foscari – Venice University Press" + publisher_shortname = "ECF" + publisher_url = "https://edizionicafoscari.unive.it/" def run(self): """Process CSV and call Thoth to insert its data""" @@ -95,7 +95,7 @@ def get_work(self, row): "place": row["place_of_publication"], "width": None, "height": None, - "pageCount": row["page_count"], + "pageCount": int(row["page_count"]) if pd.notna(row["page_count"]) else None, "pageBreakdown": row["page_breakdown"], "imageCount": row["image_count"], "tableCount": row["table_count"], @@ -301,10 +301,14 @@ def create_subjects(self, row, work): work: Work from Thoth """ - thema_subjects = row["thema_subjects"].split(";") if pd.notna(row["thema_subjects"]) else None - bic_subjects = row["bic_subjects"].split(";") if pd.notna(row["bic_subjects"]) else None - bisac_subjects = row["bisac_subjects"].split(";") if pd.notna(row["bisac_subjects"]) else None - keyword_subjects = row["keywords"].split(";") if pd.notna(row["keywords"]) else None + thema_subjects = ([s.strip() for s in row["thema_subjects"].split(";")] + if pd.notna(row["thema_subjects"]) else None) + bic_subjects = ([s.strip() for s in row["bic_subjects"].split(";")] + if pd.notna(row["bic_subjects"]) else None) + bisac_subjects = ([s.strip() for s in row["bisac_subjects"].split(";")] + if pd.notna(row["bisac_subjects"]) else None) + keyword_subjects = ([s.strip() for s in row["keywords"].split(";")] + if pd.notna(row["keywords"]) else None) def create_subject(subject_type, subject_code, subject_ordinal): subject = { @@ -489,7 +493,7 @@ def create_series(self, row, work): # if series_issue_number is present in CSV, use it if pd.notna(row["series_issue_number"]): - issue_ordinal = row["series_issue_number"] + issue_ordinal = int(row["series_issue_number"]) # otherwise assign next highest issueOrdinal else: issue_ordinal = number_of_issues + 1