diff --git a/.gitignore b/.gitignore
index cec82e6..663ce1e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,7 +34,9 @@ capybara-*.html
 /log/*
 /tmp/*
 /data/*
 /db/*.sqlite3
+# The enrichments proof-of-concept database is committed on purpose.
+!/db/enrichments.sqlite3
 /public/*
 !/public/.keep
 /coverage/
diff --git a/Gemfile b/Gemfile
index bb9d4db..f0afe27 100644
--- a/Gemfile
+++ b/Gemfile
@@ -26,6 +26,7 @@ gem "faraday_middleware-aws-sigv4", "~> 0.3.0"
 gem "faraday-excon"
 gem "uuid", "~> 2.3", ">= 2.3.9"
 gem "oj", "~> 3.16", ">= 3.16.11"
+gem "sqlite3", "~> 1.4"
 
 # This gem will allow us to write tests without the need for a database
 gem "activerecord-nulldb-adapter", "~> 1.1", ">= 1.1.1"
diff --git a/Gemfile.lock b/Gemfile.lock
index 3effb52..2916919 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -354,6 +354,9 @@ GEM
       simplecov_json_formatter (~> 0.1)
     simplecov-html (0.13.1)
     simplecov_json_formatter (0.1.4)
+    sqlite3 (1.7.3-aarch64-linux)
+    sqlite3 (1.7.3-arm64-darwin)
+    sqlite3 (1.7.3-x86_64-linux)
     stringio (3.1.5)
     sync (0.5.0)
     systemu (2.6.5)
@@ -422,6 +425,7 @@ DEPENDENCIES
   shoryuken (~> 4.0)
   shoulda-matchers
   simplecov
+  sqlite3 (~> 1.4)
   uuid (~> 2.3, >= 2.3.9)
 
 RUBY VERSION
diff --git a/app/controllers/enrichments_controller.rb b/app/controllers/enrichments_controller.rb
new file mode 100644
index 0000000..9deacfb
--- /dev/null
+++ b/app/controllers/enrichments_controller.rb
@@ -0,0 +1,129 @@
+# frozen_string_literal: true
+
+# sample ingestion file = 20250426_arxiv_sample_3_matches
+
+# Serves DataCite DOI metadata with the locally stored enrichments applied.
+class EnrichmentsController < ApplicationController
+  # Number of distinct DOIs returned per page by #dois.
+  PAGE_SIZE = 5
+
+  # GET /enriched-doi/*doi
+  #
+  # Renders the DataCite attributes for params[:doi] with every stored
+  # enrichment applied. Responds 404 with a JSON error when the DataCite lookup
+  # yields nothing or no enrichments exist for the DOI.
+  def doi
+    doi = fetch_doi(params[:doi])
+
+    if doi.blank?
+      render(json: { error: "doi not found" }, status: :not_found)
+      return
+    end
+
+    enrichments = Enrichment.where(doi: params[:doi]).order(:id)
+
+    if enrichments.empty?
+      render(json: { error: "no enrichments found for #{params[:doi]}" }, status: :not_found)
+      return
+    end
+
+    apply_enrichments(enrichments, doi)
+
+    render(json: doi)
+  end
+
+  # GET /enriched-dois?page=N
+  #
+  # Renders one page of enriched records: PAGE_SIZE distinct DOIs ordered by
+  # DOI. Pages are 1-based; a missing, zero or negative page falls back to the
+  # first page so the SQL OFFSET can never be negative.
+  def dois
+    page = [params[:page].to_i, 1].max
+    dois = Enrichment.order(:doi).distinct.offset(PAGE_SIZE * (page - 1)).limit(PAGE_SIZE).pluck(:doi)
+    Rails.logger.debug { "enriched-dois page #{page}: #{dois.inspect}" }
+
+    # One query for the whole page instead of one query per DOI.
+    enrichments_by_doi = Enrichment.where(doi: dois).order(:id).group_by(&:doi)
+
+    enriched_dois = dois.filter_map do |doi|
+      enriched_doi = fetch_doi(doi)
+
+      next if enriched_doi.blank?
+
+      # Each record is emitted once, however many enrichments it carries.
+      apply_enrichments(enrichments_by_doi.fetch(doi, []), enriched_doi)
+    end
+
+    render(json: enriched_dois)
+  end
+
+  private
+
+  # Fetches the DataCite record for +doi+.
+  #
+  # Returns the "data.attributes" hash of the response, or nil when +doi+ is
+  # blank, the response is unsuccessful, the request fails at the transport
+  # level or the body is not valid JSON.
+  def fetch_doi(doi)
+    return if doi.blank?
+
+    # DOI suffixes may contain URL-reserved characters ("?", "#", "%", ...), so
+    # escape every path segment; the "/" separators are kept as they are.
+    path = doi.to_s.split("/").map { |segment| ERB::Util.url_encode(segment) }.join("/")
+    response = Faraday.get("https://api.datacite.org/dois/#{path}?detail=true&publisher=true&affiliation=true")
+    JSON.parse(response.body).dig("data", "attributes") if response.success?
+  rescue Faraday::Error, JSON::ParserError => e
+    Rails.logger.warn("DataCite lookup failed for #{doi}: #{e.class}: #{e.message}")
+    nil
+  end
+
+  # Applies every enrichment to the +doi+ attributes hash and records it under
+  # relationships.enrichments. Returns +doi+.
+  def apply_enrichments(enrichments, doi)
+    enrichments.each do |enrichment|
+      action_strategy_pattern(enrichment, doi)
+      build_enrichments_field(enrichment, doi)
+    end
+
+    doi
+  end
+
+  # Applies one enrichment to the +doi+ attributes hash in place.
+  #
+  # Supported actions:
+  # * "insert"       - append enriched_value to the field, creating the list
+  # * "update"       - replace the whole field with enriched_value
+  # * "update_child" - replace every list item equal to original_value
+  # * "delete_child" - remove the first list item equal to original_value
+  #
+  # Any other action leaves +doi+ untouched.
+  def action_strategy_pattern(enrichment, doi)
+    field = enrichment["field"]
+
+    case enrichment["action"]
+    when "insert"
+      doi[field] ||= []
+      doi[field] << enrichment["enriched_value"]
+    when "update"
+      doi[field] = enrichment["enriched_value"]
+    when "update_child"
+      # A missing field has no children to replace; without the safe navigation
+      # this branch raised NoMethodError on nil.
+      doi[field]&.map! do |item|
+        item == enrichment["original_value"] ? enrichment["enriched_value"] : item
+      end
+    when "delete_child"
+      doi[field] ||= []
+      index = doi[field].index(enrichment["original_value"])
+      doi[field].delete_at(index) if index
+    end
+  end
+
+  # Appends +enrichment+ to doi["relationships"]["enrichments"], creating the
+  # intermediate containers when they are missing.
+  def build_enrichments_field(enrichment, doi)
+    doi["relationships"] ||= {}
+    doi["relationships"]["enrichments"] ||= []
+    doi["relationships"]["enrichments"] << enrichment
+  end
+end
diff --git a/app/models/enrichment.rb b/app/models/enrichment.rb
new file mode 100644
index 0000000..306b5ce
--- /dev/null
+++ b/app/models/enrichment.rb
@@ -0,0 +1,20 @@
+# frozen_string_literal: true
+
+# Abstract base class for proof-of-concept models that live in the separate
+# "enrichments" database declared in config/database.yml.
+#
+# NOTE(review): this change adds an "enrichments" entry only to the development,
+# test and stage environments. Confirm this file is never loaded in production
+# or uat, or add the entry there too.
+class PocRecord < ActiveRecord::Base # rubocop:disable Rails/ApplicationRecord
+  self.abstract_class = true
+
+  connects_to database: { writing: :enrichments }
+end
+
+# One stored enrichment of a DOI. The enriched_value and original_value columns
+# are serialized with the JSON coder.
+class Enrichment < PocRecord
+  serialize :enriched_value, coder: JSON
+  serialize :original_value, coder: JSON
+end
diff --git a/config/database.yml b/config/database.yml
index 2318259..d380a14 100644
--- a/config/database.yml
+++ b/config/database.yml
@@ -15,18 +15,32 @@ defaults: &defaults
   <<: *mysql
 
 development:
-  <<: *defaults
+  primary:
+    <<: *defaults
+
+  enrichments:
+    adapter: sqlite3
+    database: db/enrichments.sqlite3
 
 test:
   # <<: *defaults
-  adapter: nulldb
+  primary:
+    adapter: nulldb
   # database: <%= ENV['MYSQL_DATABASE'] %>_test<%= ENV['TEST_ENV_NUMBER'] %>
+  enrichments:
+    adapter: sqlite3
+    database: db/enrichments.sqlite3
 
 production:
   <<: *defaults
 
 stage:
-  <<: *defaults
+  primary:
+    <<: *defaults
+
+  enrichments:
+    adapter: sqlite3
+    database: db/enrichments.sqlite3
 
 uat:
   <<: *defaults
diff --git a/config/routes.rb b/config/routes.rb
index 86f2ddd..b6557ec 100644
--- a/config/routes.rb
+++ b/config/routes.rb
@@ -4,4 +4,7 @@
   get "heartbeat", to: "heartbeat#index"
 
   root to: "heartbeat#index"
+
+  get "enriched-doi/*doi", to: "enrichments#doi", constraints: { doi: /.+/ }, format: false
+  get "enriched-dois", to: "enrichments#dois", format: false
 end
diff --git a/db/enrichments.sqlite3 b/db/enrichments.sqlite3
new file mode 100644
index 0000000..133c13d
Binary files /dev/null and b/db/enrichments.sqlite3 differ
diff --git a/lib/tasks/enrichment.rake b/lib/tasks/enrichment.rake
new file mode 100644
index 0000000..d0eb788
--- /dev/null
+++ b/lib/tasks/enrichment.rake
@@ -0,0 +1,135 @@
+# frozen_string_literal: true
+
+require "csv"
+require "json"
+
+namespace :enrichment do
+  # Upper bound on the number of records a single ingest run loads.
+  max_records = 2000
+
+  # Saves +enrichment+ and prints the outcome for +doi+ to stdout.
+  save_and_report = lambda do |enrichment, doi|
+    if enrichment.save
+      puts("Created enrichment for #{doi}")
+    else
+      puts("Failed to create enrichment for #{doi}")
+      puts(enrichment.errors.full_messages.join(","))
+    end
+  end
+
+  desc "Create the enrichments sqlite database table"
+  task create_sqlite_table: :environment do
+    # Run the DDL on the model's own connection. Calling
+    # ActiveRecord::Base.establish_connection(:enrichments) here would repoint
+    # the connection shared by every other model in the process.
+    Enrichment.connection.execute(<<~SQL)
+      CREATE TABLE IF NOT EXISTS enrichments (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        doi TEXT,
+        source TEXT,
+        process TEXT,
+        field TEXT,
+        action TEXT,
+        original_value TEXT,
+        enriched_value TEXT,
+        created DATETIME,
+        updated DATETIME,
+        produced DATETIME
+      );
+    SQL
+  end
+
+  desc "Ingest ARXIV data"
+  task ingest_arxiv: :environment do
+    csv_path = Rails.root.join("lib/data/arxiv_preprint_matching.csv")
+
+    CSV.foreach(csv_path, headers: true).first(max_records).each do |row|
+      item = row.to_hash
+
+      begin
+        enriched_value = JSON.parse(item["enrichedValue"])
+      rescue JSON::ParserError, TypeError => e
+        # One malformed row must not abort the rest of the ingest.
+        puts("Skipped #{item["doi"]}: invalid enrichedValue (#{e.message})")
+        next
+      end
+
+      enrichment = Enrichment.new(
+        doi: item["doi"],
+        source: item["source"],
+        process: item["process"],
+        field: item["field"],
+        action: item["action"],
+        original_value: item["originalValue"],
+        enriched_value: enriched_value,
+        created: Time.current.utc,
+        updated: Time.current.utc,
+        produced: item["produced"],
+      )
+
+      save_and_report.call(enrichment, item["doi"])
+    end
+  end
+
+  # desc "Ingest ARXIV data"
+  # task ingest_arxiv: :environment do
+  #   file = File.read("lib/data/20250615_arxiv_preprint_matching_results.json")
+  #   data = JSON.parse(file)
+  #   count = 0
+
+  #   data.each do |item|
+  #     count += 1
+
+  #     break if count == 2001
+
+  #     enrichment = Enrichment.new(
+  #       doi: item["input_doi"],
+  #       source: "COMET",
+  #       process: "10.0000/FAKE.PROCESS",
+  #       field: "relatedIdentifiers",
+  #       action: "insert",
+  #       original_value: nil,
+  #       enriched_value: {
+  #         relationType: "Preprint",
+  #         relatedIdentifier: item["matched_doi"],
+  #         relatedIdentifierType: "DOI",
+  #       },
+  #       created: Time.current.utc,
+  #       updated: Time.current.utc,
+  #       produced: Time.current.utc - 5.days,
+  #     )
+
+  #     if enrichment.save
+  #       puts("Created enrichment for #{item["input_doi"]}")
+  #     else
+  #       puts("Failed to create enrichment for #{item["input_doi"]}")
+  #       puts(enrichment.errors.full_messages.join(","))
+  #     end
+  #   end
+  # end
+
+  desc "Ingest procedural resource type"
+  task ingest_procedural_resource_type: :environment do
+    json_path = Rails.root.join(
+      "lib/data/datacite_procedural_resource_type_general_reclassifications_datacite_lookup_format.json",
+    )
+    data = JSON.parse(File.read(json_path))
+
+    data.first(max_records).each do |item|
+      enrichment = Enrichment.new(
+        doi: item["doi"],
+        source: "COMET",
+        process: "10.0000/FAKE.PROCESS",
+        field: "types",
+        action: "update",
+        original_value: item["currentTypes"],
+        enriched_value: item["reclassifiedTypes"],
+        created: Time.current.utc,
+        updated: Time.current.utc,
+        produced: Time.current.utc - 5.days,
+      )
+
+      save_and_report.call(enrichment, item["doi"])
+    end
+  end
+end