Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
66 commits
Select commit Hold shift + click to select a range
e76a164
Merge pull request #449 from ror-community/staging
jrhoads Jun 17, 2025
79af204
Merge pull request #452 from ror-community/staging
jrhoads Jul 30, 2025
23b2169
Adding single search option to v2 affiliation search
jpbarrett13 Sep 10, 2025
f856b5c
Merge pull request #454 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 10, 2025
36da26c
Adding testing
jpbarrett13 Sep 10, 2025
c75787f
Adding single search
jpbarrett13 Sep 10, 2025
1e95d15
Merge pull request #455 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 10, 2025
e9c0325
Adding single search
jpbarrett13 Sep 10, 2025
52ddaeb
Merge pull request #456 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 10, 2025
6538b2d
Adding single search
jpbarrett13 Sep 10, 2025
c747a2c
Merge pull request #457 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 10, 2025
134f107
Adding single search
jpbarrett13 Sep 10, 2025
8865686
Adding single search
jpbarrett13 Sep 10, 2025
cade367
Merge pull request #458 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 10, 2025
4c1b8cb
Adding single search
jpbarrett13 Sep 10, 2025
15b4370
Merge pull request #459 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 10, 2025
384146a
Adding single search
jpbarrett13 Sep 10, 2025
6746470
Merge pull request #460 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 10, 2025
034350c
Adding single search
jpbarrett13 Sep 11, 2025
40984bc
Merge pull request #461 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 11, 2025
fbcf8e5
Adding single search
jpbarrett13 Sep 11, 2025
dcd9066
Merge pull request #462 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 11, 2025
490a1d8
Adding single search
jpbarrett13 Sep 11, 2025
3e623b9
Merge pull request #463 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 11, 2025
c18c19d
Adding single search
jpbarrett13 Sep 11, 2025
9021788
Merge pull request #464 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 11, 2025
ee3b463
Adding single search
jpbarrett13 Sep 11, 2025
102e074
Merge pull request #465 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 11, 2025
3516e7e
Adding single search
jpbarrett13 Sep 11, 2025
8f935da
Merge pull request #466 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 11, 2025
7057d95
Adding single search
jpbarrett13 Sep 11, 2025
90a9391
Merge pull request #467 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 11, 2025
57201c7
Adding single search
jpbarrett13 Sep 11, 2025
5e723e4
Merge pull request #468 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 11, 2025
a0dd3e7
Adding single search
jpbarrett13 Sep 15, 2025
ef63078
Merge pull request #469 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 15, 2025
b8e8ea0
Adding single search
jpbarrett13 Sep 15, 2025
9cfea1a
Merge pull request #470 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 15, 2025
5162244
Adding single search
jpbarrett13 Sep 15, 2025
3534ede
Merge pull request #471 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 15, 2025
380acaf
Adding single search
jpbarrett13 Sep 15, 2025
d3b1988
Merge pull request #472 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 15, 2025
4b88d88
Adding single search
jpbarrett13 Sep 19, 2025
e8ec3bf
Merge pull request #473 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 19, 2025
ea185da
Adding single search
jpbarrett13 Sep 19, 2025
c62f0be
Merge pull request #474 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 19, 2025
88faf1f
Adding single search
jpbarrett13 Sep 19, 2025
6c10d5c
Merge pull request #475 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 19, 2025
e8cf6f5
Adding single search
jpbarrett13 Sep 19, 2025
6c7285f
Merge pull request #476 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 19, 2025
d44bbab
Adding single search
jpbarrett13 Sep 19, 2025
c3e643f
Merge pull request #477 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 19, 2025
5e48232
Adding single search
jpbarrett13 Sep 19, 2025
055d47b
Merge pull request #478 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 19, 2025
91607c1
Adding single search
jpbarrett13 Sep 19, 2025
de364d7
Merge pull request #479 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 19, 2025
6d3b2cf
Adding single search
jpbarrett13 Sep 19, 2025
ab4295d
Merge pull request #480 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 19, 2025
4fd646a
Adding single search
jpbarrett13 Sep 19, 2025
f10b6c3
Merge pull request #481 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 19, 2025
fccb83a
Adding single search
jpbarrett13 Sep 19, 2025
65a6dc3
Merge pull request #482 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 19, 2025
fdaac4a
Adding single search
jpbarrett13 Sep 19, 2025
ba8e376
Merge pull request #483 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 19, 2025
ea05660
Adding single search
jpbarrett13 Sep 19, 2025
0b3340a
Merge pull request #484 from ror-community/single-search-marple-imple…
jpbarrett13 Sep 19, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ jobs:
working-directory: ./ror-api
run: |
python manage.py test rorapi.tests.tests_unit
# python manage.py test rorapi.tests.tests_affiliations
# TODO fix these tests running in GitHub Action
# python manage.py test rorapi.tests_integration
# python manage.py test rorapi.tests_functional
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ launchdarkly-server-sdk==7.6.1
jsonschema==3.2.0
python-magic
iso639-lang
rapidfuzz==3.6.1
mysqlclient==2.2.7
bleach==6.0.0
pycountry==22.3.5
Expand Down
19 changes: 19 additions & 0 deletions rorapi/common/es_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,25 @@ def add_string_query(self, terms):
query=Q("query_string", query=terms, fuzzy_max_expansions=1),
)

def add_affiliation_query(self, terms, max_candidates):
    """Add a nested match query over affiliation names and cap the hit count.

    Args:
        terms: Affiliation text matched against the
            ``affiliation_match.names.name`` field.
        max_candidates: Value passed as the ``size`` of the search, i.e.
            the maximum number of hits requested.
    """
    name_match = Q("match", **{"affiliation_match.names.name": terms})
    # score_mode="max" is passed so the nested query is scored by its
    # best-matching name entry.
    self.search = self.search.query(
        "nested",
        path="affiliation_match.names",
        score_mode="max",
        query=name_match,
    ).extra(size=max_candidates)

def add_string_query_advanced(self, terms):
self.search = self.search.query(
"bool",
Expand Down
318 changes: 318 additions & 0 deletions rorapi/common/matching_single_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,318 @@
import os
import re
import unicodedata
import unidecode
import json

from rorapi.common.models import Errors
from rorapi.settings import ES7
from rorapi.common.es_utils import ESQueryBuilder
from rorapi.v1.models import MatchingResult as MatchingResultV1
from rorapi.v2.models import MatchingResult as MatchingResultV2

from collections import namedtuple
from functools import lru_cache
from rapidfuzz import fuzz
from itertools import groupby

# Minimum alignment score a candidate needs to be eligible as the chosen match
# (see match_by_query).
MIN_SCORE = 96
# Minimum alignment score a candidate needs to be included in the returned
# candidate list (see match_by_query).
MIN_SCORE_FOR_RETURN = 50

# Value stored in MatchedOrganization.matching_type by this strategy.
MATCHING_TYPE_SINGLE = "SINGLE SEARCH"

# Matching strategy from Marple:
# https://gitlab.com/crossref/labs/marple/-/blob/main/strategies_available/affiliation_single_search/strategy.py?ref_type=heads

@lru_cache(maxsize=None)
def load_countries():
    """Load the custom country code map from countries.txt.

    The file is read from the directory containing this module. Each
    non-blank line is split on whitespace: the first token is the code and
    the remaining tokens, re-joined with single spaces, form the name.

    Geonames was tried as a source but gave worse results since it only
    includes country names. Might be worth trying again in the future.

    Returns:
        A list of ``(code, name)`` tuples in file order. The result is
        cached, so the file is read at most once per process.
    """
    path = os.path.join(os.path.split(__file__)[0], "countries.txt")
    countries = []
    # Explicit encoding so parsing does not depend on the host locale.
    with open(path, encoding="utf-8") as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Tolerate blank lines instead of failing on fields[0].
                continue
            countries.append((fields[0], " ".join(fields[1:])))
    return countries


# (code, name) pairs from countries.txt, loaded once at import time and
# iterated by get_country_codes.
GEONAMES_COUNTRIES = load_countries()


def to_region(c):
    """Translate a country code into its "region" label.

    Countries that are frequently confused in the data share one region
    label, so the scoring functions do not reject potential matching
    candidates. Codes outside any group are returned unchanged.
    """
    region_members = {
        "GB-UK": ("GB", "UK"),
        "CN-HK-TW": ("CN", "HK", "TW"),
        "US-PR": ("PR", "US"),
    }
    region_by_code = {
        code: region
        for region, codes in region_members.items()
        for code in codes
    }
    return region_by_code.get(c, c)


def get_country_codes(string):
    """Return the distinct upper-cased country codes detected in the string.

    The input is transliterated with unidecode, then each entry of
    GEONAMES_COUNTRIES is compared against it:

    * names containing anything other than a-z are fuzzy-matched against the
      whole lower-cased text (partial ratio);
    * two-letter names are upper-cased and compared to each alphabetic token
      of the case-preserved text;
    * all other names are compared to each lower-cased alphabetic token.

    A code is kept when its score reaches 90. The order of the returned list
    is not defined.
    """
    text = unidecode.unidecode(string).strip()
    lowered_text = re.sub(r"\s+", " ", text.lower())
    lowered_tokens = re.sub("[^a-z]", " ", text.lower()).split()
    cased_tokens = re.sub("[^a-zA-Z]", " ", text).split()

    found = []
    for code, name in GEONAMES_COUNTRIES:
        if re.search("[^a-z]", name):
            best = fuzz.partial_ratio(name, lowered_text)
        elif len(name) == 2:
            wanted = name.upper()
            best = max((fuzz.ratio(wanted, tok) for tok in cased_tokens), default=0)
        else:
            best = max((fuzz.ratio(name, tok) for tok in lowered_tokens), default=0)
        if best >= 90:
            found.append(code.upper())
    return list(set(found))


def get_countries(string):
    """Extract country codes from the string and map each one to its region."""
    return list(map(to_region, get_country_codes(string)))


def check_latin_chars(s):
    """Return True when every alphabetic character of ``s`` is Latin.

    A character counts as Latin when its Unicode name contains "LATIN".
    Non-alphabetic characters are ignored, so an empty or purely
    numeric/punctuation string yields True.
    """
    return all(
        "LATIN" in unicodedata.name(ch)
        for ch in s
        if ch.isalpha()
    )


def normalize(s):
    """Normalize string for matching.

    Text whose letters are all Latin is transliterated with unidecode; other
    text is left in its original script. The text is then stripped,
    lower-cased and whitespace-collapsed, after which a fixed sequence of
    rewrites expands common abbreviations (u./univ, lab, inst, tech, u.s.),
    turns ampersands into "and" and drops a leading "the ".
    """
    base = unidecode.unidecode(s) if check_latin_chars(s) else s
    text = re.sub(r"\s+", " ", base.strip().lower())

    # Applied strictly in this order; later patterns rely on earlier rewrites.
    rewrites = (
        (r"(?<![a-z])u\.(?! ?[a-z]\.)", "university "),
        (r"(?<![a-z])univ[\. ]", "university "),
        ("(?<![a-z])univ$", "university"),
        ("(?<![a-z])lab[^a-z]", "laboratory "),
        ("(?<![a-z])lab$", "laboratory"),
        ("(?<![a-z])inst[^a-z]", "institute "),
        ("(?<![a-z])inst$", "institute"),
        ("(?<![a-z])tech[^a-z]", "technology "),
        ("(?<![a-z])tech$", "technology"),
        (r"(?<![a-z])u\. ?s\.", "united states"),
        ("&amp;", " and "),
        ("&", " and "),
        ("^the ", ""),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)

    return re.sub(r"\s+", " ", text.strip().lower())


def last_non_overlapping(candidates):
    """Return the last candidate whose span touches no other organization's span.

    Two candidates overlap when either endpoint of one lies within the
    inclusive [start, end] range of the other. Candidates sharing the same
    organization ``_id`` are never compared with each other. Returns None
    when every candidate overlaps another one (or the input is empty).
    """

    def spans_touch(a, b):
        return (
            a.start <= b.start <= a.end
            or a.start <= b.end <= a.end
            or b.start <= a.start <= b.end
            or b.start <= a.end <= b.end
        )

    winner = None
    for current in candidates:
        current_id = current.organization["_id"]
        rivals = (
            rival
            for rival in candidates
            if rival.organization["_id"] != current_id
        )
        if not any(spans_touch(current, rival) for rival in rivals):
            winner = current
    return winner


def is_better(aff, candidate, other):
    """Decide whether ``candidate`` beats ``other`` for affiliation ``aff``.

    Four independent criteria each contribute +1 (favouring ``candidate``),
    -1 (favouring ``other``) or 0:

    * only one of the two names contains "univ";
    * one name's length is closer to the affiliation's length by more than 4;
    * one match starts after the other match ends;
    * one score is above 99 while the other is below 99.

    Returns True only when the total is strictly positive.
    """

    def vote(pro, con):
        # +1 when only `pro` holds, -1 when only `con` holds, otherwise 0.
        return int(bool(pro)) - int(bool(con))

    cand_has_univ = "univ" in candidate.name.lower()
    other_has_univ = "univ" in other.name.lower()
    cand_gap = abs(len(candidate.name) - len(aff))
    other_gap = abs(len(other.name) - len(aff))

    total = sum(
        (
            vote(
                cand_has_univ and not other_has_univ,
                other_has_univ and not cand_has_univ,
            ),
            vote(other_gap - cand_gap > 4, cand_gap - other_gap > 4),
            vote(candidate.start > other.end, other.start > candidate.end),
            vote(
                candidate.score > 99 and other.score < 99,
                candidate.score < 99 and other.score > 99,
            ),
        )
    )
    return total > 0


def rescore(aff, candidates):
    """Return copies of the candidates with ``rescore`` set to their win count.

    A candidate's win count is the number of candidates in the list that it
    beats according to is_better. Input order is preserved.
    """
    return [
        contender._replace(
            rescore=sum(
                1 for rival in candidates if is_better(aff, contender, rival)
            )
        )
        for contender in candidates
    ]


def score(aff, candidate):
    """Score one search hit against the affiliation string.

    Every usable name under ``_source.affiliation_match.names`` is normalized
    and fuzzy-aligned against the normalized affiliation; the name with the
    highest alignment score wins.

    Args:
        aff: Raw affiliation text.
        candidate: Search hit; must support
            ``candidate["_source"]["affiliation_match"]["names"]``.

    Returns:
        A MatchedOrganization for the best-scoring name. When no name
        qualifies, the placeholder with score 0, an empty name and
        start/end of -1 is returned.
    """
    best = MatchedOrganization(
        organization=candidate,
        name="",
        score=0,
        rescore=0,
        start=-1,
        end=-1,
        matching_type=MATCHING_TYPE_SINGLE,
        substring=aff,
        chosen=False,
    )
    # Loop-invariant: compute the affiliation forms once.
    aff_lower = aff.lower()
    aff_normalized = normalize(aff)

    for candidate_name in candidate["_source"]["affiliation_match"]["names"]:
        # Key membership works for plain dicts as well as attribute-style
        # hit wrappers; hasattr() is always False for a plain dict, which
        # would silently skip every name.
        if "name" not in candidate_name:
            continue
        name = candidate_name["name"]
        name_lower = name.lower()
        if (
            # Generic names that would match far too many affiliations.
            name_lower in ["university school", "university hospital"]
            # Name too long to plausibly be contained in the affiliation.
            or len(name) >= len(aff) + 4
            or len(name) < 5
            # Single-word strings are accepted on exact match only.
            or (" " not in name and aff_lower != name_lower)
            or (" " not in aff and aff_lower != name_lower)
        ):
            continue
        alignment = fuzz.partial_ratio_alignment(aff_normalized, normalize(name))
        if alignment.score > best.score:
            best = MatchedOrganization(
                organization=candidate,
                name=name,
                score=alignment.score,
                rescore=alignment.score,
                start=alignment.src_start,
                end=alignment.src_end,
                matching_type=MATCHING_TYPE_SINGLE,
                substring=aff,
                chosen=False,
            )
    return best

def choose_candidate(rescored):
    """Pick the winning candidate from a non-empty list of rescored candidates.

    The candidate with the highest ``rescore`` wins outright when it is
    unique. On a tie the decision is delegated to last_non_overlapping,
    which may return None.
    """
    highest = max(c.rescore for c in rescored)
    leaders = [c for c in rescored if c.rescore == highest]
    if len(leaders) != 1:
        return last_non_overlapping(leaders)
    return leaders[0]

# Record describing one scored candidate organization.
#   organization  - the search hit the record was built from
#   name          - the organization name that produced the best alignment
#   score         - alignment score of that name
#   rescore       - secondary score used to rank candidates against each other
#   start, end    - position of the aligned span
#   matching_type - label of the strategy that produced the match
#   substring     - text the name was matched against
#   chosen        - True for the single candidate selected as the match
# Defaults are declared through the stdlib `defaults` argument, which also
# exposes them via `_field_defaults`.
MatchedOrganization = namedtuple(
    "MatchedOrganization",
    [
        "organization",
        "name",
        "score",
        "rescore",
        "start",
        "end",
        "matching_type",
        "substring",
        "chosen",
    ],
    defaults=(None, None, 0, 0, 0, 0, None, None, False),
)

def match_by_query(text, query, countries):
    """Match affiliation text using specific ES query.

    Args:
        text: Affiliation string being matched.
        query: Object whose ``execute()`` result exposes ``hits.hits``.
        countries: Region labels extracted from the affiliation; when
            non-empty, the chosen organization's region must be among them.

    Returns:
        ``(chosen, candidates)`` where ``candidates`` holds every active hit
        scoring at least MIN_SCORE_FOR_RETURN (raw scores), and ``chosen`` is
        either None or a MatchedOrganization with ``chosen=True`` and its
        score divided by 100 and rounded to two decimals.
    """
    hits = query.execute().hits.hits
    if not hits:
        return None, []

    scored_active = [
        score(text, hit) for hit in hits if hit["_source"]["status"] == "active"
    ]
    returnable = [m for m in scored_active if m.score >= MIN_SCORE_FOR_RETURN]
    eligible = [m for m in returnable if m.score >= MIN_SCORE]
    if not eligible:
        return None, returnable

    # A lone eligible candidate is the provisional winner, but it still goes
    # through rescoring below like any other list.
    winner = eligible[0] if len(eligible) == 1 else None
    rescored = rescore(text, eligible)
    if rescored:
        winner = choose_candidate(rescored)
    if not winner:
        return None, returnable

    if countries:
        # Only the first location of the organization is consulted.
        country_code = winner.organization["_source"]["locations"][0][
            "geonames_details"
        ]["country_code"]
        if to_region(country_code) not in countries:
            return None, returnable

    confirmed = winner._replace(
        score=round(winner.score / 100, 2),
        matching_type=MATCHING_TYPE_SINGLE,
        chosen=True,
    )
    return confirmed, returnable


def get_output(chosen, all_matched):
    """Build the final result list from the chosen match and all candidates.

    The ten highest-scoring candidates are kept and their scores are divided
    by 100 and rounded to two decimals. When a match was chosen, it is placed
    first and any candidate with the same organization ``_id`` is dropped
    from the rest of the list.
    """
    ranked = sorted(all_matched, key=lambda m: m.score, reverse=True)
    top = [m._replace(score=round(m.score / 100, 2)) for m in ranked[:10]]
    if not chosen:
        return top
    chosen_id = chosen.organization["_id"]
    others = [m for m in top if m.organization["_id"] != chosen_id]
    return [chosen] + others


def get_candidates(aff, countries, version):
    """Query ES for the affiliation and match the hits against it.

    Builds an affiliation query requesting up to 200 hits for the given API
    version and returns whatever match_by_query returns for it.
    """
    builder = ESQueryBuilder(version)
    builder.add_affiliation_query(aff, 200)
    es_query = builder.get_query()
    return match_by_query(aff, es_query, countries)


def match_affiliation(affiliation, version):
    """Match one affiliation string and return the ordered result list.

    Regions are extracted from the affiliation text, candidates are fetched
    and scored, and the outcome is formatted by get_output.
    """
    regions = get_countries(affiliation)
    best_match, candidates = get_candidates(affiliation, regions, version)
    return get_output(best_match, candidates)


def match_organizations(params, version):
    """Entry point: match the ``affiliation`` request parameter.

    Returns:
        ``(errors, result)``. When ``affiliation`` is absent from ``params``,
        ``errors`` is an Errors instance and ``result`` is None. Otherwise
        ``errors`` is None and ``result`` is a MatchingResultV2 for version
        "v2" or a MatchingResultV1 for any other version.
    """
    if "affiliation" not in params:
        return Errors(["'affiliation' parameter missing"]), None

    matched = match_affiliation(params.get("affiliation"), version)
    result_cls = MatchingResultV2 if version == "v2" else MatchingResultV1
    return None, result_cls(matched)
10 changes: 9 additions & 1 deletion rorapi/common/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from rorapi.common.csv_utils import validate_csv
from rorapi.settings import REST_FRAMEWORK, ES7, ES_VARS
from rorapi.common.matching import match_organizations
from rorapi.common.matching_single_search import match_organizations as single_search_match_organizations
from rorapi.common.models import (
Errors
)
Expand Down Expand Up @@ -155,7 +156,14 @@ def list(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]):
if "format" in params:
del params["format"]
if "affiliation" in params:
errors, organizations = match_organizations(params, version)
if version == "v2":
if "single_search" in params:
# errors, organizations = match_organizations(params, version)
errors, organizations = single_search_match_organizations(params, version)
else:
errors, organizations = match_organizations(params, version)
else:
errors, organizations = match_organizations(params, version)
else:
errors, organizations = search_organizations(params, version)
if errors is not None:
Expand Down
Loading
Loading