Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
/.bundle

# Ignore the default SQLite database.
/db/*.sqlite3
/db/*.sqlite3-journal
# /db/*.sqlite3
# /db/*.sqlite3-journal

# Ignore all logfiles and tempfiles.
/log/*
Expand All @@ -34,7 +34,7 @@ capybara-*.html
/log/*
/tmp/*
/data/*
/db/*.sqlite3
# /db/*.sqlite3
/public/*
!/public/.keep
/coverage/
Expand Down
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ gem "faraday_middleware-aws-sigv4", "~> 0.3.0"
gem "faraday-excon"
gem "uuid", "~> 2.3", ">= 2.3.9"
gem "oj", "~> 3.16", ">= 3.16.11"
gem "sqlite3", "~> 1.4"

# This gem will allow us to write tests without the need for a database
gem "activerecord-nulldb-adapter", "~> 1.1", ">= 1.1.1"
Expand Down
4 changes: 4 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,9 @@ GEM
simplecov_json_formatter (~> 0.1)
simplecov-html (0.13.1)
simplecov_json_formatter (0.1.4)
sqlite3 (1.7.3-aarch64-linux)
sqlite3 (1.7.3-arm64-darwin)
sqlite3 (1.7.3-x86_64-linux)
stringio (3.1.5)
sync (0.5.0)
systemu (2.6.5)
Expand Down Expand Up @@ -422,6 +425,7 @@ DEPENDENCIES
shoryuken (~> 4.0)
shoulda-matchers
simplecov
sqlite3 (~> 1.4)
uuid (~> 2.3, >= 2.3.9)

RUBY VERSION
Expand Down
92 changes: 92 additions & 0 deletions app/controllers/enrichments_controller.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# frozen_string_literal: true

# sample ingestion file = 20250426_arxiv_sample_3_matches

# Serves DOI attributes fetched from the DataCite REST API with the locally
# stored Enrichment records applied on top of them.
class EnrichmentsController < ApplicationController
  # Number of distinct DOIs returned per page by #dois.
  PAGE_SIZE = 5

  # Renders a single enriched DOI as JSON.
  #
  # Reads the DOI from params[:doi]. Responds with a 404 JSON error when the
  # DOI cannot be fetched from DataCite or when no enrichments are stored for it.
  def doi
    doi = fetch_doi(params[:doi])

    if doi.blank?
      render(json: { error: "doi not found" }, status: :not_found)
      return
    end

    enrichments = Enrichment.where(doi: params[:doi])

    if enrichments.empty?
      render(json: { error: "no enrichments found for #{params[:doi]}" }, status: :not_found)
      return
    end

    render(json: apply_enrichments(enrichments, doi))
  end

  # Renders one page of enriched DOIs as a JSON array.
  #
  # params[:page] is 1-based. A missing, zero or negative page is treated as
  # the first page so the computed OFFSET can never be negative.
  # DOIs that cannot be fetched from DataCite are left out of the response.
  def dois
    page = [params[:page].to_i, 1].max
    dois = Enrichment.order(:doi).distinct.offset(PAGE_SIZE * (page - 1)).limit(PAGE_SIZE).pluck(:doi)
    Rails.logger.debug { "enriched dois page #{page}: #{dois.inspect}" }

    enriched_dois = dois.filter_map do |doi|
      enriched_doi = fetch_doi(doi)
      next if enriched_doi.blank?

      # Each DOI is added to the response exactly once, after all of its
      # enrichments have been applied.
      apply_enrichments(Enrichment.where(doi: doi), enriched_doi)
    end

    render(json: enriched_dois)
  end

  private

  # Fetches the "data.attributes" hash for +doi+ from the DataCite API.
  # Returns nil when the response is not successful.
  # NOTE(review): the DOI is interpolated into the URL unescaped and no
  # timeout/error handling is configured here — confirm this is acceptable.
  def fetch_doi(doi)
    response = Faraday.get("https://api.datacite.org/dois/#{doi}?detail=true&publisher=true&affiliation=true")
    JSON.parse(response.body).dig("data", "attributes") if response.success?
  end

  # Applies every enrichment to the +doi+ attributes hash (mutating it),
  # records each enrichment under doi["relationships"]["enrichments"], and
  # returns the same hash.
  def apply_enrichments(enrichments, doi)
    enrichments.each do |enrichment|
      action_strategy_pattern(enrichment, doi)
      build_enrichments_field(enrichment, doi)
    end

    doi
  end

  # Mutates +doi+ according to the enrichment's "action":
  #   insert       - appends enriched_value to the field (created as [] if absent)
  #   update       - replaces the field with enriched_value
  #   update_child - replaces every element equal to original_value with enriched_value
  #   delete_child - removes the first element equal to original_value
  # Any other action leaves +doi+ untouched.
  def action_strategy_pattern(enrichment, doi)
    field = enrichment["field"]

    case enrichment["action"]
    when "insert"
      (doi[field] ||= []) << enrichment["enriched_value"]
    when "update"
      doi[field] = enrichment["enriched_value"]
    when "update_child"
      # Same nil guard as delete_child: an absent field has no children to update.
      children = (doi[field] ||= [])
      children.each_with_index do |item, index|
        children[index] = enrichment["enriched_value"] if item == enrichment["original_value"]
      end
    when "delete_child"
      children = (doi[field] ||= [])
      # Locate the match first, then delete, so the collection is never
      # modified while it is being iterated.
      index = children.find_index { |item| item == enrichment["original_value"] }
      children.delete_at(index) if index
    end
  end

  # Appends +enrichment+ to doi["relationships"]["enrichments"], creating the
  # intermediate hash/array when they are missing.
  def build_enrichments_field(enrichment, doi)
    doi["relationships"] ||= {}
    doi["relationships"]["enrichments"] ||= []
    doi["relationships"]["enrichments"] << enrichment
  end
end
12 changes: 12 additions & 0 deletions app/models/enrichment.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# frozen_string_literal: true

# Abstract parent for models whose tables live in the `enrichments` database
# entry rather than in the application's primary database.
class PocRecord < ActiveRecord::Base # rubocop:disable Rails/ApplicationRecord
  # Must be abstract before connects_to is invoked on a non-Base class.
  self.abstract_class = true

  connects_to(database: { writing: :enrichments })
end

# A stored enrichment for a DOI. The original and enriched values are
# persisted through the JSON coder.
class Enrichment < PocRecord
  %i[enriched_value original_value].each do |json_attribute|
    serialize json_attribute, coder: JSON
  end
end
20 changes: 17 additions & 3 deletions config/database.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,32 @@ defaults: &defaults
<<: *mysql

development:
<<: *defaults
primary:
<<: *defaults

enrichments:
adapter: sqlite3
database: db/enrichments.sqlite3

test:
# <<: *defaults
adapter: nulldb
primary:
adapter: nulldb
# database: <%= ENV['MYSQL_DATABASE'] %>_test<%= ENV['TEST_ENV_NUMBER'] %>
enrichments:
adapter: sqlite3
database: db/enrichments.sqlite3

production:
<<: *defaults

stage:
<<: *defaults
primary:
<<: *defaults

enrichments:
adapter: sqlite3
database: db/enrichments.sqlite3

uat:
<<: *defaults
3 changes: 3 additions & 0 deletions config/routes.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,7 @@
get "heartbeat", to: "heartbeat#index"

root to: "heartbeat#index"

get "enriched-doi/*doi", to: "enrichments#doi", constraints: { id: /.+/ }, format: false
get "enriched-dois", to: "enrichments#dois", constraints: { id: /.+/ }, format: false
end
Binary file added db/enrichments.sqlite3
Binary file not shown.
130 changes: 130 additions & 0 deletions lib/tasks/enrichment.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# frozen_string_literal: true

require "csv"
require "json"

namespace :enrichment do
desc "Create the enrichments sqlite database table"
task create_sqlite_table: :environment do
  # Run the DDL on the Enrichment model's own connection (its abstract parent
  # connects to the `enrichments` database) instead of re-pointing
  # ActiveRecord::Base, which would switch the connection used by every other
  # model for the remainder of the rake process.
  Enrichment.connection.execute(<<~SQL)
    CREATE TABLE IF NOT EXISTS enrichments (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      doi TEXT,
      source TEXT,
      process TEXT,
      field TEXT,
      action TEXT,
      original_value TEXT,
      enriched_value TEXT,
      created DATETIME,
      updated DATETIME,
      produced DATETIME
    );
  SQL
end

desc "Ingest ARXIV data"
task ingest_arxiv: :environment do
  csv_path = Rails.root.join("lib/data/arxiv_preprint_matching.csv")
  # Only the first 2000 data rows of the CSV are ingested.
  row_limit = 2000

  CSV.foreach(csv_path, headers: true).first(row_limit).each do |row|
    item = row.to_hash

    # A blank or malformed enrichedValue cell is reported and skipped, the same
    # way a failed save is, instead of aborting the whole ingest run.
    begin
      enriched_value = JSON.parse(item["enrichedValue"].to_s)
    rescue JSON::ParserError => e
      puts("Failed to create enrichment for #{item["doi"]}")
      puts("invalid enrichedValue JSON: #{e.message}")
      next
    end

    # One timestamp per row so created and updated are identical.
    now = Time.current.utc

    # NOTE(review): originalValue is stored as the raw CSV string while
    # enrichedValue is parsed as JSON — confirm this asymmetry is intended.
    enrichment = Enrichment.new(
      doi: item["doi"],
      source: item["source"],
      process: item["process"],
      field: item["field"],
      action: item["action"],
      original_value: item["originalValue"],
      enriched_value: enriched_value,
      created: now,
      updated: now,
      produced: item["produced"],
    )

    if enrichment.save
      puts("Created enrichment for #{item["doi"]}")
    else
      puts("Failed to create enrichment for #{item["doi"]}")
      puts(enrichment.errors.full_messages.join(","))
    end
  end
end

# desc "Ingest ARXIV data"
# task ingest_arxiv: :environment do
# file = File.read("lib/data/20250615_arxiv_preprint_matching_results.json")
# data = JSON.parse(file)
# count = 0

# data.each do |item|
# count += 1

# break if count == 2001

# enrichment = Enrichment.new(
# doi: item["input_doi"],
# source: "COMET",
# process: "10.0000/FAKE.PROCESS",
# field: "relatedIdentifiers",
# action: "insert",
# original_value: nil,
# enriched_value: {
# relationType: "Preprint",
# relatedIdentifier: item["matched_doi"],
# relatedIdentifierType: "DOI",
# },
# created: Time.current.utc,
# updated: Time.current.utc,
# produced: Time.current.utc - 5.days,
# )

# if enrichment.save
# puts("Created enrichment for #{item["input_doi"]}")
# else
# puts("Failed to create enrichment for #{item["input_doi"]}")
# puts(enrichment.errors.full_messages.join(","))
# end
# end
# end

desc "Ingest procedural resource type"
task ingest_procedural_resource_type: :environment do
  # Resolve the data file against the application root (as the ARXIV task
  # does) so the task does not depend on the current working directory.
  json_path = Rails.root.join(
    "lib/data/datacite_procedural_resource_type_general_reclassifications_datacite_lookup_format.json",
  )
  data = JSON.parse(File.read(json_path))
  # Only the first 2000 entries of the file are ingested.
  row_limit = 2000

  data.first(row_limit).each do |item|
    # One timestamp per entry so created and updated are identical.
    now = Time.current.utc

    enrichment = Enrichment.new(
      doi: item["doi"],
      source: "COMET",
      process: "10.0000/FAKE.PROCESS",
      field: "types",
      action: "update",
      original_value: item["currentTypes"],
      enriched_value: item["reclassifiedTypes"],
      created: now,
      updated: now,
      produced: now - 5.days,
    )

    if enrichment.save
      puts("Created enrichment for #{item["doi"]}")
    else
      puts("Failed to create enrichment for #{item["doi"]}")
      puts(enrichment.errors.full_messages.join(","))
    end
  end
end
end