diff --git a/.github/workflows/nameai-python-unit-tests.yml b/.github/workflows/nameai-python-unit-tests.yml index 55ae711e2..c1e24f9cb 100644 --- a/.github/workflows/nameai-python-unit-tests.yml +++ b/.github/workflows/nameai-python-unit-tests.yml @@ -37,7 +37,11 @@ jobs: - name: Install dependencies working-directory: ./apps/api.nameai.io - run: poetry install --extras "lambda" + run: poetry install --extras "lambda" --with dev + + - name: Download required data files + working-directory: ./apps/api.nameai.io + run: poetry run python -m nameai.download - name: Run tests working-directory: ./apps/api.nameai.io diff --git a/apps/api.nameai.io/.env.example b/apps/api.nameai.io/.env.example index 17f52c28c..2545a203f 100644 --- a/apps/api.nameai.io/.env.example +++ b/apps/api.nameai.io/.env.example @@ -18,4 +18,4 @@ ALCHEMY_URI_SEPOLIA=https://eth-sepolia.g.alchemy.com/v2/[YOUR_ALCHEMY_API_KEY] # - https://discuss.ens.domains/t/ens-subgraph-migration-to-the-decentralised-version/19183 # - https://thegraph.com/explorer/subgraphs/5XqPmWe6gjyrJtFn9cLy237i4cWw2j9HcUJEXsP5qGtH?view=Query&chain=arbitrum-one ENS_SUBGRAPH_URL_MAINNET=https://api.thegraph.com/subgraphs/name/ensdomains/ens -ENS_SUBGRAPH_URL_SEPOLIA=https://api.studio.thegraph.com/query/49574/enssepolia/version/latest \ No newline at end of file +ENS_SUBGRAPH_URL_SEPOLIA=https://api.studio.thegraph.com/query/49574/enssepolia/version/latest diff --git a/apps/api.nameai.io/Dockerfile b/apps/api.nameai.io/Dockerfile index 87affa174..2fef20918 100644 --- a/apps/api.nameai.io/Dockerfile +++ b/apps/api.nameai.io/Dockerfile @@ -4,4 +4,5 @@ RUN yum install gcc -y COPY pyproject.toml poetry.lock LICENSE README.md ./ COPY nameai ./nameai/ RUN pip install --no-cache-dir .[lambda] +RUN python -m nameai.download CMD [ "nameai.root_api.handler" ] \ No newline at end of file diff --git a/apps/api.nameai.io/nameai/all_tokenizer.py b/apps/api.nameai.io/nameai/all_tokenizer.py index 1d801d3e2..f17f3f900 100644 --- 
a/apps/api.nameai.io/nameai/all_tokenizer.py +++ b/apps/api.nameai.io/nameai/all_tokenizer.py @@ -69,19 +69,25 @@ def dfs(self, index, result, gap_before=False): class AllTokenizer: - """Return all tokenizations. It is a generator.""" + """ + General-purpose tokenizer that finds all possible word combinations in text. + + Uses an Aho-Corasick automaton with multiple dictionaries to identify + valid words. Can produce tokenizations with gaps. + Yields tokenizations as tuples of tokens. + """ def __init__(self, config): self.config = config - self.skip_non_words = config.tokenization.skip_non_words - self.with_gaps = config.tokenization.with_gaps + self.skip_non_words = config.tokenization.all_tokenizer.skip_non_words + self.with_gaps = config.tokenization.all_tokenizer.with_gaps @static_property def automaton(self): automaton = ahocorasick.Automaton() should_be_tokenized = set() - with open(get_resource_path(self.config.tokenization.should_be_tokenized), encoding='utf-8') as f: + with open(get_resource_path(self.config.tokenization.all_tokenizer.should_be_tokenized), encoding='utf-8') as f: for line in f: word = line.strip().lower() should_be_tokenized.add(word) diff --git a/apps/api.nameai.io/nameai/config/prod_config.yaml b/apps/api.nameai.io/nameai/config/prod_config.yaml index 69a5f8fc9..aa10871ff 100644 --- a/apps/api.nameai.io/nameai/config/prod_config.yaml +++ b/apps/api.nameai.io/nameai/config/prod_config.yaml @@ -4,12 +4,26 @@ tokenization: dictionary: words.txt custom_dictionary: custom_dictionary.txt domain_specific_dictionary: domain_specific_dictionary.txt - should_be_tokenized: should_be_tokenized.txt - skip_non_words: false - with_gaps: true + all_tokenizer: + should_be_tokenized: should_be_tokenized.txt + skip_non_words: false + with_gaps: true + person_names: + first_names: pn_firstnames.json + last_names: pn_lastnames.json + other: pn_other.json + country_stats: pn_country_stats.json + country_bonus: 100 ngrams: unigrams: unigram_freq.csv bigrams: 
bigram_freq.csv custom_dictionary: custom_dictionary.txt domain_specific_dictionary: domain_specific_dictionary.txt custom_token_frequency: 500000 +s3_resources: + bucket: prod-name-generator-namegeneratori-inputss3bucket-c26jqo3twfxy + person_names: + first_names_key: person_names_firstnames.json + last_names_key: person_names_lastnames.json + other_key: person_names_other.json + country_stats_key: person_names_country_stats.json diff --git a/apps/api.nameai.io/nameai/data/tests/person_names_quality.json b/apps/api.nameai.io/nameai/data/tests/person_names_quality.json new file mode 100644 index 000000000..d0479ec9e --- /dev/null +++ b/apps/api.nameai.io/nameai/data/tests/person_names_quality.json @@ -0,0 +1,55 @@ +{ + "simple_names": { + "piotrwiśniewski": ["piotr", "wiśniewski"], + "camilleclaudel": ["camille", "claudel"], + "johnnydepp": ["johnny", "depp"], + "giancarloesposito": ["giancarlo", "esposito"], + "maríagarcía": ["maría", "garcía"], + "viktororbán": ["viktor", "orbán"], + "sørenkierkegaard": ["søren", "kierkegaard"], + "oceanneguérin": ["oceanne", "guérin"] + }, + "ambiguous_names": { + "dragonfernandez": { + "person_name": ["dragon", "fernandez"], + "words": ["dragon", "fern", "a", "ndez"] + }, + "wolfsmith": { + "person_name": ["wolf", "smith"], + "words": ["wolf", "smith"] + }, + "blacksmith": { + "person_name": null, + "words": ["black", "smith"] + }, + "charleswood": { + "person_name": ["charles", "wood"], + "words": ["char", "les", "wood"] + }, + "maylin": { + "person_name": ["may", "lin"], + "words": ["may", "lin"] + }, + "natalieportman": { + "person_name": ["natalie", "portman"], + "words": ["nat", "alie", "port", "man"] + }, + "sunyoung": { + "person_name": ["sunyoung"], + "words": ["suny", "oung"] + }, + "annalísa": { + "person_name": ["anna", "lísa"], + "words": ["ann", "alísa"] + } + }, + "non_names": { + "dragonfernouch": ["dragon", "fern", "ouch"], + "webmaster": ["webmaster"], + "quantumleap": ["quantum", "leap"], + "neuralnet": 
["neural", "net"], + "deepmatrix": ["deep", "matrix"], + "cloudsync": ["cloud", "sync"], + "byteflow": ["byte", "flow"] + } +} \ No newline at end of file diff --git a/apps/api.nameai.io/nameai/download.py b/apps/api.nameai.io/nameai/download.py new file mode 100644 index 000000000..5f7a01f6d --- /dev/null +++ b/apps/api.nameai.io/nameai/download.py @@ -0,0 +1,53 @@ +import boto3 +import botocore +from omegaconf import DictConfig +import hydra +import os + +from nameai.data import get_resource_path + + +class S3Downloader: + def __init__(self, bucket: str): + self.s3_client = None + self.bucket = bucket + self.region_name = 'us-east-1' + + def get_client(self): + if self.s3_client is None: + self.s3_client = boto3.client( + 's3', region_name=self.region_name, config=botocore.config.Config(signature_version=botocore.UNSIGNED) + ) + return self.s3_client + + def download_file(self, key: str, local_path: str, overwrite: bool = False): + if os.path.exists(local_path) and not overwrite: + return + self.get_client().download_file(self.bucket, key, local_path) + + +def download_files(config: DictConfig): + """Download files using provided config""" + downloader = S3Downloader(config.s3_resources.bucket) + files_to_download = [ + (config.s3_resources.person_names.first_names_key, config.tokenization.person_names.first_names), + (config.s3_resources.person_names.last_names_key, config.tokenization.person_names.last_names), + (config.s3_resources.person_names.other_key, config.tokenization.person_names.other), + (config.s3_resources.person_names.country_stats_key, config.tokenization.person_names.country_stats), + ] + + for s3_key, local_path in files_to_download: + downloader.download_file( + key=s3_key, + local_path=get_resource_path(local_path), + overwrite=True, + ) + + +@hydra.main(config_path='./config', config_name='prod_config', version_base=None) +def download_files_main(config: DictConfig): + download_files(config) + + +if __name__ == '__main__': + 
download_files_main() diff --git a/apps/api.nameai.io/nameai/ngrams.py b/apps/api.nameai.io/nameai/ngrams.py index 42124dfbb..c1503e2b3 100644 --- a/apps/api.nameai.io/nameai/ngrams.py +++ b/apps/api.nameai.io/nameai/ngrams.py @@ -82,13 +82,13 @@ def all_unigrams_count(self) -> int: def all_bigrams_count(self) -> int: return self._bigrams_and_count[1] - def unigram_count(self, word: str) -> int: + def unigram_count(self, word: str) -> int | float: return self.unigrams.get(word, self.oov_count(word)) def bigram_count(self, word: str) -> Optional[int]: return self.bigrams.get(word, None) - def oov_count(self, word: str) -> int: + def oov_count(self, word: str) -> float: return (1 / 100) ** (len(word)) def word_probability(self, word: str) -> float: diff --git a/apps/api.nameai.io/nameai/nlp_inspector.py b/apps/api.nameai.io/nameai/nlp_inspector.py index 7167c9dad..56094beec 100644 --- a/apps/api.nameai.io/nameai/nlp_inspector.py +++ b/apps/api.nameai.io/nameai/nlp_inspector.py @@ -10,6 +10,7 @@ ) from nameai.all_tokenizer import AllTokenizer from nameai.ngrams import Ngrams +from nameai.person_names import PersonNameTokenizer def init_inspector(): @@ -49,6 +50,7 @@ class NLPInspector: def __init__(self, config): self.inspector = init_inspector() self.tokenizer = AllTokenizer(config) + self.person_names_tokenizer = PersonNameTokenizer(config) self.ngrams = Ngrams(config) def nlp_analyse_label(self, label: str) -> NLPLabelAnalysis: @@ -92,28 +94,52 @@ def base_analyse_label(self, label: str): return self.inspector.analyse_label(label, simple_confusables=True) def tokenize(self, label: str, tokenizations_limit: int) -> tuple[list[dict], bool]: - tokenizeds_iterator = self.tokenizer.tokenize(label) + """ + Tokenize text using both person name and general-purpose tokenizers. + + Combines results from PersonNameTokenizer (with name-specific probabilities) + and AllTokenizer (with ngram-based probabilities). + Returns tokenizations sorted by probability. 
+ """ + all_tokenizer_iterator = self.tokenizer.tokenize(label) + person_names_iterator = self.person_names_tokenizer.tokenize_with_scores(label) + tokenizeds = [] partial_tokenization = False try: used = set() i = 0 - for tokenized in tokenizeds_iterator: + + # first add person name tokenizations with their original scores + for tokenized, log_prob in person_names_iterator: + if tokenized not in used: + if i == tokenizations_limit: + partial_tokenization = True + break + used.add(tokenized) + i += 1 + tokenizeds.append({'tokens': tokenized, 'log_probability': log_prob, 'source': 'person_names'}) + + # then add regular tokenizations + for tokenized in all_tokenizer_iterator: if tokenized not in used: if i == tokenizations_limit: partial_tokenization = True break used.add(tokenized) i += 1 - tokenizeds.append(tokenized) + # for AllTokenizer tokenizations, use ngrams probability + tokenizeds.append( + { + 'tokens': tokenized, + 'log_probability': self.ngrams.sequence_log_probability(tokenized), + 'source': 'ngrams', + } + ) + except RecursionError: partial_tokenization = True - tokenizeds = [ - {'tokens': tokenized, 'log_probability': self.ngrams.sequence_log_probability(tokenized)} - for tokenized in tokenizeds - ] - for tokenized in tokenizeds: tokenized['tokens'] = tuple(uniq_gaps(tokenized['tokens'])) tokenized['probability'] = math.exp(tokenized['log_probability']) diff --git a/apps/api.nameai.io/nameai/person_names.py b/apps/api.nameai.io/nameai/person_names.py new file mode 100644 index 000000000..e51548811 --- /dev/null +++ b/apps/api.nameai.io/nameai/person_names.py @@ -0,0 +1,272 @@ +import collections +import copy +import json +import math +from typing import Iterator, Optional +from omegaconf import DictConfig + +from nameai.data import get_resource_path + + +class PersonNames: + """ + Analyzes and scores potential person name interpretations in text. 
+ + Uses statistical data about first names, last names, and their frequency per country + to evaluate different possible interpretations of a text string as a person's name. + Scoring is weighted by country-specific internet user statistics to reflect + real-world name likelihood. + """ + + def __init__(self, config: DictConfig): + pn_config = config.tokenization.person_names + self.firstnames = json.load(open(get_resource_path(pn_config.first_names))) + self.lastnames = json.load(open(get_resource_path(pn_config.last_names))) + other = json.load(open(get_resource_path(pn_config.other))) + self.countries: dict[str, int] = other['all'] + self.firstname_initials: dict[str, dict[str, int]] = other['firstname_initials'] + self.lastname_initials: dict[str, dict[str, int]] = other['lastname_initials'] + self.country_stats = json.load(open(get_resource_path(pn_config.country_stats))) + self.all_internet_users: int = sum(x[0] for x in self.country_stats.values()) + self.all_population: int = sum(x[1] for x in self.country_stats.values()) + self.country_bonus = pn_config.country_bonus + self.allow_cross_country = False + + def print_missing_countries(self): + for country, stats in sorted(self.country_stats.items(), key=lambda x: x[1][0], reverse=True): + if country not in self.countries: + print('X', country, stats) + else: + print(country, stats) + + def get_population(self, country: str) -> Optional[int]: + try: + return self.country_stats[country][1] + except Exception: + return None + + def get_internet_users(self, country: str) -> Optional[int]: + try: + return self.country_stats[country][0] + except Exception: + return None + + def get_internet_users_weight(self, country: str) -> Optional[float]: + try: + return self.country_stats[country][0] / self.all_internet_users + except Exception: + return None + + def single_name(self, name: str, name_stats: dict[str, dict[str, int]]) -> dict: + name_prob = { + country: sum(gender_counts.values()) / self.countries[country] * 
self.get_internet_users_weight(country) + for country, gender_counts in name_stats.items() + } + + genders = {} + for country, gender_counts in name_stats.items(): + m = gender_counts.get('M', 1) + f = gender_counts.get('F', 1) + genders[country] = {'M': m / (m + f), 'F': f / (m + f)} + + interpretation = {} + interpretation['names'] = [name_stats] + interpretation['prob'] = name_prob + interpretation['tokenization'] = (name,) + interpretation['genders'] = genders + return interpretation + + def name_with_initial( + self, + name: str, + initial: str, + name_stats: dict[str, dict[str, int]], + initial_firstname: bool, + initial_first: bool, + ) -> dict: + name_prob = { + country: sum(gender_counts.values()) + / self.countries[country] + * ( + self.firstname_initials[country].get(initial, 1) + if initial_firstname + else self.lastname_initials[country].get(initial, 1) + ) + / self.countries[country] + * self.get_internet_users_weight(country) + for country, gender_counts in name_stats.items() + } + + genders = {} + for country, gender_counts in name_stats.items(): + m = gender_counts.get('M', 1) + f = gender_counts.get('F', 1) + genders[country] = {'M': m / (m + f), 'F': f / (m + f)} + + interpretation = {} + if initial_first: + interpretation['tokenization'] = (initial, name) + else: + interpretation['tokenization'] = (name, initial) + + interpretation['names'] = [name_stats] + interpretation['prob'] = name_prob + interpretation['genders'] = genders + return interpretation + + def two_names( + self, name1: str, name2: str, name1_stats: dict[str, dict[str, int]], name2_stats: dict[str, dict[str, int]] + ) -> dict: + name1_prob = { + country: sum(gender_counts.values()) / self.countries[country] + for country, gender_counts in name1_stats.items() + } + name2_prob = { + country: sum(gender_counts.values()) / self.countries[country] + for country, gender_counts in name2_stats.items() + } + interpretation = {} + interpretation['names'] = [name1_stats, name2_stats] + 
interpretation['tokenization'] = (name1, name2) + + probs = collections.defaultdict(list) + probs2 = {} + genders = {} + for name_prob in [name1_prob, name2_prob]: + for country, prob in name_prob.items(): + probs[country].append(prob) + for country, probs in probs.items(): + if len(probs) == 1: + if not self.allow_cross_country: + continue + probs.append(1 / self.countries[country]) + probs2[country] = math.prod(probs) + probs2[country] *= self.get_internet_users_weight(country) + + m = name1_stats.get(country, {}).get('M', 1) * name2_stats.get(country, {}).get('M', 1) + f = name1_stats.get(country, {}).get('F', 1) * name2_stats.get(country, {}).get('F', 1) + genders[country] = {'M': m / (m + f), 'F': f / (m + f)} + interpretation['prob'] = probs2 + interpretation['genders'] = genders + + return interpretation + + def anal(self, input_name: str) -> list[dict]: + interpretations = [] + # only one name + name_stats = copy.copy(self.firstnames.get(input_name, None)) + if name_stats: + interpretation = self.single_name(input_name, name_stats) + interpretation['type'] = 'first' + interpretations.append(interpretation) + + name_stats = copy.copy(self.lastnames.get(input_name, None)) + if name_stats: + interpretation = self.single_name(input_name, name_stats) + interpretation['type'] = 'last' + interpretations.append(interpretation) + + # one name with initial + for name, initial, initial_first in [ + (input_name[1:], input_name[:1], True), + (input_name[:-1], input_name[-1:], False), + ]: + if not initial or not name: + continue + name_stats = copy.copy(self.firstnames.get(name, None)) + if name_stats: + interpretation = self.name_with_initial( + name, initial, name_stats, initial_firstname=False, initial_first=initial_first + ) + interpretation['type'] = 'first with initial' + interpretations.append(interpretation) + + name_stats = copy.copy(self.lastnames.get(name, None)) + if name_stats: + interpretation = self.name_with_initial( + name, initial, name_stats, 
initial_firstname=True, initial_first=initial_first + ) + interpretation['type'] = 'last with initial' + interpretations.append(interpretation) + + # two names + for i in range(1, len(input_name)): + name1 = input_name[:i] + name2 = input_name[i:] + name1_result = copy.copy(self.firstnames.get(name1, None)) + name2_result = copy.copy(self.lastnames.get(name2, None)) + if name1_result and name2_result: + interpretation = self.two_names(name1, name2, name1_result, name2_result) + interpretation['type'] = 'first last' + interpretations.append(interpretation) + + name1_result = copy.copy(self.lastnames.get(name1, None)) + name2_result = copy.copy(self.firstnames.get(name2, None)) + if name1_result and name2_result: + interpretation = self.two_names(name1, name2, name1_result, name2_result) + interpretation['type'] = 'last first' + interpretations.append(interpretation) + + return interpretations + + def tokenize( + self, input_name: str, user_country: str = None, topn: int = 1 + ) -> list[tuple[float, str, tuple[str, ...], list[str], dict[str, float]]]: + """Return best country interpretation.""" + all_interpretations = self.score(input_name, user_country) + return all_interpretations[:topn] + + def score( + self, input_name: str, user_country: str | None = None + ) -> list[tuple[float, str, tuple[str, ...], list[str], dict[str, float]]]: + """Return best interpretation.""" + interpretations = self.anal(input_name) + + all_interpretations = [] + for r in interpretations: + if user_country in r['prob']: + r['prob'][user_country] = r['prob'][user_country] * self.country_bonus + + for country, prob in r['prob'].items(): + all_interpretations.append( + (prob, country, r['tokenization'], r['type'], r['genders'].get(country, None)) + ) + + return sorted(all_interpretations, reverse=True) + + +class PersonNameTokenizer: + """ + Specialized tokenizer for identifying person names in text. + + Uses statistical name data and filtering to identify valid name tokens. 
+ Yields tokenizations as tuples of tokens paired with their log probability. + """ + + def __init__(self, config: DictConfig): + super().__init__() + self.pn = PersonNames(config) + self.should_be_tokenized = set() + with open(get_resource_path(config.tokenization.all_tokenizer.should_be_tokenized), encoding='utf-8') as f: + for line in f: + word = line.strip().lower() + self.should_be_tokenized.add(word) + + def _get_scores(self, label: str) -> list[tuple[float, str, tuple[str, ...], str, dict[str, float]]]: + """Get or compute scores for a label""" + return self.pn.score(label) + + def tokenize_with_scores(self, label: str) -> Iterator[tuple[tuple[str, ...], float]]: + """ + Tokenize a label into possible person name interpretations with their scores + returns an iterator of (tokenization, log_probability) pairs + """ + seen = set() + for prob, country, tokenization, type_, genders in self._get_scores(label): + if ( # skip if any token is in should_be_tokenized list or is a single letter + tokenization not in seen + and all(len(t) > 1 for t in tokenization) + and not any(t.lower() in self.should_be_tokenized for t in tokenization) + ): + seen.add(tokenization) + yield tokenization, math.log(prob) if prob > 0 else -float('inf') diff --git a/apps/api.nameai.io/poetry.lock b/apps/api.nameai.io/poetry.lock index 1baf8d433..d4ca322e9 100644 --- a/apps/api.nameai.io/poetry.lock +++ b/apps/api.nameai.io/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -327,6 +327,44 @@ files = [ {file = "bitarray-2.9.2.tar.gz", hash = "sha256:a8f286a51a32323715d77755ed959f94bef13972e9a2fe71b609e40e6d27957e"}, ] +[[package]] +name = "boto3" +version = "1.36.14" +description = "The AWS SDK for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "boto3-1.36.14-py3-none-any.whl", hash = "sha256:e2dab15944c3f517c88850d60b07f2f6fd3bc69aa51c47670e4f45d62a8c41fd"}, + {file = "boto3-1.36.14.tar.gz", hash = "sha256:4b0b8dd593b95f32a5a761dee65094423fbd06a4ad09f26b2e6c80493139569f"}, +] + +[package.dependencies] +botocore = ">=1.36.14,<1.37.0" +jmespath = ">=0.7.1,<2.0.0" +s3transfer = ">=0.11.0,<0.12.0" + +[package.extras] +crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] + +[[package]] +name = "botocore" +version = "1.36.14" +description = "Low-level, data-driven core of boto 3." +optional = false +python-versions = ">=3.8" +files = [ + {file = "botocore-1.36.14-py3-none-any.whl", hash = "sha256:546d0c071e9c8aeaca399d71bec414abe6434460f7d6640cbd92d4b1c3eb443e"}, + {file = "botocore-1.36.14.tar.gz", hash = "sha256:53feff270078c23ba852fb2638fde6c5f74084cfc019dd5433e865cd04065c60"}, +] + +[package.dependencies] +jmespath = ">=0.7.1,<2.0.0" +python-dateutil = ">=2.1,<3.0.0" +urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""} + +[package.extras] +crt = ["awscrt (==0.23.8)"] + [[package]] name = "certifi" version = "2024.8.30" @@ -1324,6 +1362,17 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "jmespath" +version = "1.0.1" +description = "JSON Matching Expressions" +optional = false +python-versions = ">=3.7" +files = [ + {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, + {file = "jmespath-1.0.1.tar.gz", hash = 
"sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, +] + [[package]] name = "jsonschema" version = "4.23.0" @@ -2087,6 +2136,20 @@ files = [ [package.dependencies] pytest = ">=3.1" +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + [[package]] name = "python-dotenv" version = "1.0.1" @@ -2495,6 +2558,23 @@ files = [ {file = "ruff-0.6.7.tar.gz", hash = "sha256:44e52129d82266fa59b587e2cd74def5637b730a69c4542525dfdecfaae38bd5"}, ] +[[package]] +name = "s3transfer" +version = "0.11.2" +description = "An Amazon S3 Transfer Manager" +optional = false +python-versions = ">=3.8" +files = [ + {file = "s3transfer-0.11.2-py3-none-any.whl", hash = "sha256:be6ecb39fadd986ef1701097771f87e4d2f821f27f6071c872143884d2950fbc"}, + {file = "s3transfer-0.11.2.tar.gz", hash = "sha256:3b39185cb72f5acc77db1a58b6e25b977f28d20496b6e58d6813d75f464d632f"}, +] + +[package.dependencies] +botocore = ">=1.36.0,<2.0a.0" + +[package.extras] +crt = ["botocore[crt] (>=1.36.0,<2.0a.0)"] + [[package]] name = "setuptools" version = "75.1.0" @@ -2515,6 +2595,17 @@ enabler = ["pytest-enabler (>=2.2)"] test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", 
"wheel (>=0.44.0)"] type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.11.*)", "pytest-mypy"] +[[package]] +name = "six" +version = "1.17.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"}, + {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, +] + [[package]] name = "sniffio" version = "1.3.1" @@ -3010,4 +3101,4 @@ lambda = ["mangum"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "453b73989a1dc02fa3b1a79b727b2f3c0e2a7049dc2435391d0ed6000cc7717c" +content-hash = "0ae1b34f136e9fad1241d06fa24f68aa3238121a865ca39427ffeed18d037e59" diff --git a/apps/api.nameai.io/pyproject.toml b/apps/api.nameai.io/pyproject.toml index 059df002c..73dc3da1e 100644 --- a/apps/api.nameai.io/pyproject.toml +++ b/apps/api.nameai.io/pyproject.toml @@ -21,6 +21,7 @@ httpx = "^0.25.0" python-dotenv = "^1.0.0" pyahocorasick = "^2.0.0" setuptools = "^75.1.0" +boto3 = "^1.36.14" [tool.poetry.extras] diff --git a/apps/api.nameai.io/start-local.sh b/apps/api.nameai.io/start-local.sh index d93497840..61f70255c 100644 --- a/apps/api.nameai.io/start-local.sh +++ b/apps/api.nameai.io/start-local.sh @@ -3,4 +3,5 @@ pip install --upgrade pip pip install poetry pip install uvicorn pip install .[lambda] +python -m nameai.download uvicorn nameai.root_api:app \ No newline at end of file diff --git a/apps/api.nameai.io/tests/load_tests/README.md b/apps/api.nameai.io/tests/load_tests/README.md new file mode 100644 index 000000000..5901884f0 --- /dev/null +++ b/apps/api.nameai.io/tests/load_tests/README.md @@ -0,0 +1,57 @@ +# Load Tests for NameAI API + +This directory contains load testing scripts for the NameAI API using [Locust](https://locust.io/). 
+ +## Start NameAI API + +In one terminal, start the NameAI API: + +```bash +poetry run uvicorn nameai.nameai_api:app +``` + +## Install locust + +In another terminal, activate the poetry environment and install locust: + +```bash +poetry run pip install locust +``` + +## Run tests + +Navigate to the `load_tests` directory and use one of the following options: + +### Tests in Web UI + +Start the load test with: +```bash +poetry run locust -f performance.py +``` +Then open http://localhost:8089 in your browser to: +- Configure number of users +- Set spawn rate +- Start/stop tests +- View real-time metrics and charts + +### Headless tests + +You can run headless tests with these parameters: +```bash +poetry run locust -f performance.py --headless -u 100 -r 10 --run-time 1m -H "http://localhost:8000" --only-summary +``` + +This will: +- Run with 100 users +- Spawn 10 users per second +- Run for 1 minute +- Print only the final summary statistics (periodic stats output is suppressed) + + +### Test latency for different numbers of users + +```bash +poetry run bash run_load_tests.sh +``` + +This will run the test with different numbers of users and save the results in `latency_results.csv`. 
diff --git a/apps/api.nameai.io/tests/load_tests/latency_results.csv b/apps/api.nameai.io/tests/load_tests/latency_results.csv new file mode 100644 index 000000000..5c09f8dd5 --- /dev/null +++ b/apps/api.nameai.io/tests/load_tests/latency_results.csv @@ -0,0 +1,6 @@ +users,requests,failures,mean_latency,median_latency,p95_latency +16,939,0,11.60211740468521,7,13 +32,1919,0,12.957146057850554,6,16 +64,3778,0,25.72315333006236,7,64 +128,7360,0,59.24790962773564,18,150 +256,10123,0,493.8000638233782,440.0,880 diff --git a/apps/api.nameai.io/tests/load_tests/performance.py b/apps/api.nameai.io/tests/load_tests/performance.py new file mode 100644 index 000000000..9026e28a6 --- /dev/null +++ b/apps/api.nameai.io/tests/load_tests/performance.py @@ -0,0 +1,71 @@ +import random + +from locust import HttpUser, task, between + + +input_labels = [ + 'giancarloesposito', + 'piotrwiśniewski', + 'dragonfernandez', + 'wolfsmith', + 'mrscopcake', + 'likemrscopcake', + 'cryptocurrency', + 'blockchain', + 'yerbamate', + 'javascript', + 'superduper', + 'ucberkeley', + 'moshpit', + 'coffeebean', + 'laptoplaptop', + 'americanairlines', + 'usarmy', + 'greenriver', + 'counterstrike', + 'rocknroll', + 'sanfrancisco', + 'ilikeyourcat', + 'catlikeiyour', + 'xchange', + 'bball', + 'nft', + 'sdfbgfdbgjkdfjgdfhjfgdjfgdsjh', + '[003fda97309fd6aa9d7753dcffa37da8bb964d0fb99eba99d0770e76fc5bac91]', + 'lapśtop', + 'łcatł', + 'laptop', + 'toplap', + 'repeatable', + 'bothering', + 'rakuten', + 'livecam', + 'miinibaashkiminasiganibiitoosijiganibadagwiingweshiganibakwezhigan', + 'yorknewŁyork123', + 'counterstrike', + 'avadakedavra', + 'lumosreparo', + 'americanairlines', + 'greenriver', + 'uc', + 'us', + 'nft', +] + + +class NameAIUser(HttpUser): + wait_time = between(0.2, 1.6) + + @task(1) + def inspect_label_get(self): + self.client.get(f'/inspect-label/{random.choice(input_labels)}') + + @task(1) + def inspect_label_post(self): + self.client.post('/inspect-label', json={'label': 
random.choice(input_labels)}) + + @task(1) + def inspect_name(self): + self.client.post( + '/inspect-name', json={'name': f'{random.choice(input_labels)}.eth', 'network_name': 'mainnet'} + ) diff --git a/apps/api.nameai.io/tests/load_tests/run_load_tests.sh b/apps/api.nameai.io/tests/load_tests/run_load_tests.sh new file mode 100755 index 000000000..c392d88ad --- /dev/null +++ b/apps/api.nameai.io/tests/load_tests/run_load_tests.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +user_counts=(16 32 64 128 256) +output_file="latency_results.csv" +echo "users,requests,failures,mean_latency,median_latency,p95_latency" > $output_file + +for n_users in "${user_counts[@]}" +do + echo "Running test with $n_users users..." + + # run locust with specified number of users (--spawn-rate is set to n_users/10 for gradual ramp-up) + locust -f performance.py \ + --headless \ + --users $n_users \ + --spawn-rate $(($n_users/10)) \ + --run-time 1m \ + --host "http://localhost:8000" \ + --only-summary \ + --csv="stats_$n_users" + + # extract metrics from the csv file ("Aggregated" row) + stats=$(tail -n 1 "stats_${n_users}_stats.csv") + + # extract relevant columns + echo "$stats" | awk -F',' '{print "'$n_users'," $3 "," $4 "," $6 "," $5 "," $16}' >> $output_file + + # clean up all temporary files + rm -f "stats_${n_users}_stats.csv" \ + "stats_${n_users}_stats_history.csv" \ + "stats_${n_users}_failures.csv" \ + "stats_${n_users}_exceptions.csv" + + # wait between tests to let system stabilize + sleep 5 +done + +echo "Testing complete. 
Results saved to $output_file" diff --git a/apps/api.nameai.io/tests/test_nameai.py b/apps/api.nameai.io/tests/test_nameai.py index d195477c5..8492cfefe 100644 --- a/apps/api.nameai.io/tests/test_nameai.py +++ b/apps/api.nameai.io/tests/test_nameai.py @@ -18,11 +18,11 @@ def nameai(): def test_normalized(nameai: 'NameAI'): result = nameai.inspect_label('nick') assert abs(result.nameai.purity_score - 0.9976234705882353) < 0.0001, result.nameai.purity_score - assert abs(result.nameai.sort_score - 0.9354685918689098) < 0.0001, result.nameai.sort_score + assert abs(result.nameai.sort_score - 0.93694557738369) < 0.0001, result.nameai.sort_score assert result.nameai.analysis.status == 'normalized' - assert abs(result.nameai.analysis.probability - 0.0000317942695746393) < 0.0001, result.nameai.analysis.probability + assert abs(result.nameai.analysis.probability - 0.00019752378433969656) < 0.0001, result.nameai.analysis.probability assert ( - abs(result.nameai.analysis.log_probability - -10.356224486471852) < 0.0001 + abs(result.nameai.analysis.log_probability - -8.529651553837413) < 0.0001 ), result.nameai.analysis.log_probability assert result.nameai.analysis.word_count == 1 assert result.nameguard.rating.name == 'WARN' @@ -38,13 +38,13 @@ def test_name(nameai: 'NameAI'): result = nameai.inspect_name('nick') assert result.nameai.analysis.inspection.label == 'nick' assert abs(result.nameai.purity_score - 0.9976234705882353) < 0.0001, result.nameai.purity_score - assert abs(result.nameai.sort_score - 0.9354685918689098) < 0.0001, result.nameai.sort_score + assert abs(result.nameai.sort_score - 0.93694557738369) < 0.0001, result.nameai.sort_score assert result.nameai.analysis.status == 'normalized' result = nameai.inspect_name('nick.eth') assert result.nameai.analysis.inspection.label == 'nick' assert abs(result.nameai.purity_score - 0.9976234705882353) < 0.0001, result.nameai.purity_score - assert abs(result.nameai.sort_score - 0.9354685918689098) < 0.0001, 
result.nameai.sort_score + assert abs(result.nameai.sort_score - 0.93694557738369) < 0.0001, result.nameai.sort_score assert result.nameai.analysis.status == 'normalized' result = nameai.inspect_name('nick.eth.eth') diff --git a/apps/api.nameai.io/tests/test_nlp_inspector.py b/apps/api.nameai.io/tests/test_nlp_inspector.py index 6bc0eddce..360a56353 100644 --- a/apps/api.nameai.io/tests/test_nlp_inspector.py +++ b/apps/api.nameai.io/tests/test_nlp_inspector.py @@ -97,3 +97,130 @@ def test_inspector_word_count(nlp_inspector: 'NLPInspector'): result = nlp_inspector.nlp_analyse_label('toplap') assert result.word_count == 2 + + +def test_inspector_simple_names(nlp_inspector: 'NLPInspector'): + """Test that simple person names are correctly identified""" + from nameai.data import get_resource_path + import json + + with open(get_resource_path('tests/person_names_quality.json')) as f: + quality_tests = json.load(f) + + failures = [] + for input_text, expected_tokens in quality_tests['simple_names'].items(): + tokenizations, _ = nlp_inspector.tokenize(input_text, 1000) + expected_tokens = tuple(expected_tokens) + if tokenizations[0]['tokens'] != expected_tokens or tokenizations[0]['source'] != 'person_names': + failures.append( + f"\nInput: '{input_text}'\nExpected: {expected_tokens} (person_names)\n" + f"Got: {tokenizations[0]['tokens']} ({tokenizations[0]['source']})" + ) + + if failures: + print('\n=== Simple Names Test Failures ===') + for failure in failures: + print(failure) + print(f'\nTotal failures: {len(failures)} out of {len(quality_tests["simple_names"])} test cases') + assert False, 'Some simple name tests failed. See above for details.' 
def test_inspector_ambiguous_names(nlp_inspector: 'NLPInspector'):
    """Check tokenization of labels that can be read as a person name or as words.

    Iterates over the ``ambiguous_names`` section of
    ``tests/person_names_quality.json``. For each label:

    * if a ``person_name`` interpretation is given, the first tokenization
      returned by the inspector must equal it and have source ``person_names``;
    * otherwise the first tokenization must have source ``ngrams`` and the
      ``words`` interpretation must occur among the returned tokenizations.

    Mismatches are collected for all labels and printed before the test fails.
    """
    from nameai.data import get_resource_path
    import json

    # The fixture is JSON (UTF-8); do not depend on the platform's default encoding.
    with open(get_resource_path('tests/person_names_quality.json'), encoding='utf-8') as f:
        quality_tests = json.load(f)

    ambiguous_cases = quality_tests['ambiguous_names']
    failures = []
    for input_text, interpretation2expected_tokens in ambiguous_cases.items():
        # NOTE(review): the meaning of the second argument (1000) is defined by
        # NLPInspector.tokenize, which is not visible here.
        tokenizations, _ = nlp_inspector.tokenize(input_text, 1000)
        top = tokenizations[0]

        if interpretation2expected_tokens['person_name'] is not None:
            expected_tokens = tuple(interpretation2expected_tokens['person_name'])
            if top['tokens'] != expected_tokens or top['source'] != 'person_names':
                failures.append(
                    f"\nInput: '{input_text}'\nExpected: {expected_tokens} (person_names)\n"
                    f"Got: {top['tokens']} ({top['source']})"
                )
            continue

        # No person-name reading expected: the top result must come from ngrams...
        if top['source'] != 'ngrams':
            failures.append(f"\nInput: '{input_text}'\nExpected ngrams source\n" f"Got: {top['source']}")

        # ...and the expected word split must appear at any position in the results.
        expected_words = tuple(interpretation2expected_tokens['words'])
        if not any(tokenization['tokens'] == expected_words for tokenization in tokenizations):
            failures.append(
                f"\nInput: '{input_text}'\nExpected words tokenization: {expected_words}\n"
                f"Got tokenizations: {[t['tokens'] for t in tokenizations[:5]]}"
            )

    if failures:
        print('\n=== Ambiguous Names Test Failures ===')
        for failure in failures:
            print(failure)
        print(f'\nTotal failures: {len(failures)} out of {len(ambiguous_cases)} test cases')
        # Raise explicitly: a bare `assert False` is removed when Python runs with -O.
        raise AssertionError('Some ambiguous name tests failed. See above for details.')
def test_inspector_non_names(nlp_inspector: 'NLPInspector'):
    """Check that non-name labels get the expected ngrams-sourced top tokenization.

    For every entry of the ``non_names`` section of
    ``tests/person_names_quality.json`` the first tokenization returned by the
    inspector must equal the expected tokens and have source ``ngrams``.
    Mismatches are collected for all labels and printed before the test fails.
    """
    from nameai.data import get_resource_path
    import json

    # The fixture is JSON (UTF-8); do not depend on the platform's default encoding.
    with open(get_resource_path('tests/person_names_quality.json'), encoding='utf-8') as f:
        quality_tests = json.load(f)

    non_name_cases = quality_tests['non_names']
    failures = []
    for input_text, expected_tokens in non_name_cases.items():
        # NOTE(review): the meaning of the second argument (1000) is defined by
        # NLPInspector.tokenize, which is not visible here.
        tokenizations, _ = nlp_inspector.tokenize(input_text, 1000)
        top = tokenizations[0]
        # Tokenizations are compared as tuples; the fixture stores lists.
        expected_tuple = tuple(expected_tokens)
        if top['tokens'] != expected_tuple or top['source'] != 'ngrams':
            failures.append(
                f"\nInput: '{input_text}'\nExpected: {expected_tokens} (ngrams)\n"
                f"Got: {top['tokens']} ({top['source']})"
            )

    if failures:
        print('\n=== Non-Names Test Failures ===')
        for failure in failures:
            print(failure)
        print(f'\nTotal failures: {len(failures)} out of {len(non_name_cases)} test cases')
        # Raise explicitly: a bare `assert False` is removed when Python runs with -O.
        raise AssertionError('Some non-name tests failed. See above for details.')
def test_inspector_tokenization_quality(nlp_inspector: 'NLPInspector'):
    """Test combined tokenizer quality using the same test cases as AllTokenizer.

    Each entry of ``tests/tokenization_quality.json`` maps an input label to its
    expected tokens. The expected tokens must appear among the tokenizations
    returned by the inspector, at any position. Labels for which they do not
    are collected and printed before the test fails.
    """
    from nameai.data import get_resource_path
    import json

    # Load tokenization quality test cases. The fixture is JSON (UTF-8), so the
    # encoding is given explicitly instead of relying on the platform default.
    with open(get_resource_path('tests/tokenization_quality.json'), encoding='utf-8') as f:
        quality_tests = json.load(f)

    failures = []
    for input_text, expected_tokens in quality_tests.items():
        # NOTE(review): the meaning of the second argument (1000) is defined by
        # NLPInspector.tokenize, which is not visible here.
        tokenizations, _ = nlp_inspector.tokenize(input_text, 1000)
        # Tokenizations are compared as tuples; the fixture stores lists.
        expected_tuple = tuple(expected_tokens)
        if not any(tokenization['tokens'] == expected_tuple for tokenization in tokenizations):
            failures.append(
                f"\nInput: '{input_text}'\nExpected: {expected_tokens}\n"
                f"Got: {[t['tokens'] for t in tokenizations[:5]]}"
            )

    if failures:
        print('\n=== Combined Tokenization Quality Test Failures ===')
        for failure in failures:
            print(failure)
        print(f'\nTotal failures: {len(failures)} out of {len(quality_tests)} test cases')
        # Raise explicitly: a bare `assert False` is removed when Python runs with -O.
        raise AssertionError('Some combined tokenization quality tests failed. See above for details.')
diff --git a/apps/api.nameai.io/tests/test_tokenizer.py b/apps/api.nameai.io/tests/test_tokenizer.py index d7ca1de41..8de8957ef 100644 --- a/apps/api.nameai.io/tests/test_tokenizer.py +++ b/apps/api.nameai.io/tests/test_tokenizer.py @@ -3,12 +3,13 @@ import pytest from pytest import mark from hydra import initialize_config_module, compose +import math from mocked_static_property import mock_static_property @contextmanager -def init_tokenizer(overrides): +def init_all_tokenizer(overrides): with mock_static_property(): from nameai.all_tokenizer import AllTokenizer @@ -18,14 +19,24 @@ def init_tokenizer(overrides): yield tokenizer +@contextmanager +def init_person_name_tokenizer(overrides): + from nameai.person_names import PersonNameTokenizer + + with initialize_config_module(version_base=None, config_module='nameai.config'): + config = compose(config_name='prod_config', overrides=overrides) + tokenizer = PersonNameTokenizer(config) + yield tokenizer + + @mark.parametrize( 'overrides', [ - (['tokenization.skip_non_words=false', 'tokenization.with_gaps=false']), + (['tokenization.all_tokenizer.skip_non_words=false', 'tokenization.all_tokenizer.with_gaps=false']), ], ) def test_all_tokenizer_skip_one_letter_words(overrides: List[str]): - with init_tokenizer(overrides) as tokenizer: + with init_all_tokenizer(overrides) as tokenizer: tokenized_labels = list(tokenizer.tokenize('yorknewŁyork123')) assert ( @@ -51,11 +62,11 @@ def test_all_tokenizer_skip_one_letter_words(overrides: List[str]): @mark.parametrize( 'overrides', [ - (['tokenization.skip_non_words=true']), + (['tokenization.all_tokenizer.skip_non_words=true']), ], ) def test_all_tokenizer_skip_non_words(overrides: List[str]): - with init_tokenizer(overrides) as tokenizer: + with init_all_tokenizer(overrides) as tokenizer: tokenized_labels = list(tokenizer.tokenize('yorknewŁyork123')) # 0 tokenizations assert list(tokenized_labels) == [] @@ -71,11 +82,11 @@ def test_all_tokenizer_skip_non_words(overrides: 
List[str]): @mark.parametrize( 'overrides', [ - (['tokenization.skip_non_words=true']), + (['tokenization.all_tokenizer.skip_non_words=true']), ], ) def test_all_tokenizer_skip_one_letter_words_and_non_words_no_ias(overrides: List[str]): - with init_tokenizer(overrides) as tokenizer: + with init_all_tokenizer(overrides) as tokenizer: tokenized_labels = list(tokenizer.tokenize('laptop')) assert ('laptop',) in tokenized_labels assert ( @@ -100,11 +111,11 @@ def test_all_tokenizer_skip_one_letter_words_and_non_words_no_ias(overrides: Lis @mark.parametrize( 'overrides', [ - (['tokenization.skip_non_words=false', 'tokenization.with_gaps=true']), + (['tokenization.all_tokenizer.skip_non_words=false', 'tokenization.all_tokenizer.with_gaps=true']), ], ) def test_all_tokenizer_skip_one_letter_words_and_non_words_no_ias_with_gaps(overrides: List[str]): - with init_tokenizer(overrides) as tokenizer: + with init_all_tokenizer(overrides) as tokenizer: tokenized_labels = list(tokenizer.tokenize('lapŁtop')) assert ( @@ -122,22 +133,22 @@ def test_all_tokenizer_skip_one_letter_words_and_non_words_no_ias_with_gaps(over @mark.parametrize( 'overrides', [ - (['tokenization.skip_non_words=false', 'tokenization.with_gaps=true']), + (['tokenization.all_tokenizer.skip_non_words=false', 'tokenization.all_tokenizer.with_gaps=true']), ], ) def test_all_tokenizer_time(overrides): - with init_tokenizer(overrides) as tokenizer: + with init_all_tokenizer(overrides) as tokenizer: next(tokenizer.tokenize('miinibaashkiminasiganibiitoosijiganibadagwiingweshiganibakwezhigan')) @mark.parametrize( 'overrides', [ - (['tokenization.skip_non_words=false', 'tokenization.with_gaps=true']), + (['tokenization.all_tokenizer.skip_non_words=false', 'tokenization.all_tokenizer.with_gaps=true']), ], ) def test_all_tokenizer_skip_one_letter_words_and_non_words_no_ias_with_gaps23(overrides: List[str]): - with init_tokenizer(overrides) as tokenizer: + with init_all_tokenizer(overrides) as tokenizer: tokenized_labels 
= list(tokenizer.tokenize('laptop😀ą')) print(tokenized_labels) assert ('laptop', '') in tokenized_labels @@ -150,7 +161,7 @@ def test_all_tokenizer_skip_one_letter_words_and_non_words_no_ias_with_gaps23(ov @pytest.mark.execution_timeout(10) def test_all_tokenizer_reccurence(): - with init_tokenizer([]) as tokenizer: + with init_all_tokenizer([]) as tokenizer: next(tokenizer.tokenize('test' * 900)) with pytest.raises(RecursionError): @@ -161,11 +172,11 @@ def test_all_tokenizer_reccurence(): @mark.parametrize( 'overrides', [ - (['tokenization.skip_non_words=false', 'tokenization.with_gaps=true']), + (['tokenization.all_tokenizer.skip_non_words=false', 'tokenization.all_tokenizer.with_gaps=true']), ], ) def test_all_tokenizer_reccurence2(overrides): - with init_tokenizer(overrides) as tokenizer: + with init_all_tokenizer(overrides) as tokenizer: tokenized = tokenizer.tokenize('i' * 4 * 950) next(tokenized) with pytest.raises(RecursionError): @@ -174,7 +185,7 @@ def test_all_tokenizer_reccurence2(overrides): def test_all_tokenizer_custom_dict(): - with init_tokenizer([]) as tokenizer: + with init_all_tokenizer([]) as tokenizer: tokenized_labels = list(tokenizer.tokenize('nfttop')) assert ( 'nft', @@ -187,7 +198,7 @@ def test_all_tokenizer_custom_dict(): tokenized_labels = list(tokenizer.tokenize('york')) assert ('york',) in tokenized_labels - with init_tokenizer(['tokenization.custom_dictionary=tests/empty.txt']) as tokenizer: + with init_all_tokenizer(['tokenization.custom_dictionary=tests/empty.txt']) as tokenizer: tokenized_labels = list(tokenizer.tokenize('nfttop')) assert ( 'nft', @@ -202,7 +213,7 @@ def test_all_tokenizer_custom_dict(): def test_all_tokenizer_quality(): - with init_tokenizer([]) as tokenizer: + with init_all_tokenizer([]) as tokenizer: from nameai.data import get_resource_path for multiword in open(get_resource_path('should_be_tokenized.txt')): @@ -212,7 +223,7 @@ def test_all_tokenizer_quality(): def test_all_tokenizer_quality2(): - with 
init_tokenizer([]) as tokenizer: + with init_all_tokenizer([]) as tokenizer: from nameai.data import get_resource_path import json @@ -228,8 +239,136 @@ def test_all_tokenizer_quality2(): failures.append(f"\nInput: '{input_text}'\nExpected: {expected_tokens}\nGot: {tokenized_labels}") if failures: - print('\n=== Tokenization Quality Test Failures ===') + print('\n=== AllTokenizer Quality Test Failures ===') for failure in failures: print(failure) print(f'\nTotal failures: {len(failures)} out of {len(quality_tests)} test cases') assert False, 'Some tokenization quality tests failed. See above for details.' + + +def test_person_name_tokenizer_simple_names(): + """Verify tokenization of clear person names.""" + with init_person_name_tokenizer([]) as tokenizer: + from nameai.data import get_resource_path + import json + + with open(get_resource_path('tests/person_names_quality.json')) as f: + quality_tests = json.load(f) + + failures = [] + for input_label, expected_tokens in quality_tests['simple_names'].items(): + tokenized_labels = list(tokenizer.tokenize_with_scores(input_label)) + expected_tuple = tuple(expected_tokens) + found = False + for tokens, score in tokenized_labels: + if tokens == expected_tuple: + found = True + assert score > -float('inf'), f'Expected valid score for {input_label}' + break + if not found: + failures.append(f'Failed to find expected tokenization for {input_label}') + + if failures: + print('\n=== PersonNameTokenizer Quality Test Failures [simple_names] ===') + for failure in failures: + print(failure) + print(f'\nTotal failures: {len(failures)} out of {len(quality_tests)} test cases') + assert False, 'Some tokenization quality tests failed. See above for details.' 
def test_person_name_tokenizer_ambiguous_names():
    """Verify handling of ambiguous inputs that could be names.

    For every entry of the ``ambiguous_names`` section of
    ``tests/person_names_quality.json`` that provides a ``person_name``
    interpretation, that interpretation must be among the tokenizations
    yielded by ``tokenize_with_scores`` and its score must be greater than
    ``-inf``. Entries without a ``person_name`` interpretation are still
    tokenized but not checked.
    """
    with init_person_name_tokenizer([]) as tokenizer:
        from nameai.data import get_resource_path
        import json

        # The fixture is JSON (UTF-8); do not depend on the platform's default encoding.
        with open(get_resource_path('tests/person_names_quality.json'), encoding='utf-8') as f:
            quality_tests = json.load(f)

        ambiguous_cases = quality_tests['ambiguous_names']
        failures = []
        for input_label, interpretation2expected_tokens in ambiguous_cases.items():
            tokenized_labels = list(tokenizer.tokenize_with_scores(input_label))
            if interpretation2expected_tokens['person_name'] is None:
                continue

            person_name_tokens = tuple(interpretation2expected_tokens['person_name'])
            found = False
            for tokens, score in tokenized_labels:
                if tokens == person_name_tokens:
                    found = True
                    assert score > -float('inf'), f'Expected valid score for {input_label}'
                    break
            if not found:
                failures.append(f'Failed to find person name tokenization for {input_label}')

        if failures:
            print('\n=== PersonNameTokenizer Quality Test Failures [ambiguous_names] ===')
            for failure in failures:
                print(failure)
            # Report against the number of cases in this section; len(quality_tests)
            # would only count the top-level sections of the fixture.
            print(f'\nTotal failures: {len(failures)} out of {len(ambiguous_cases)} test cases')
            # Raise explicitly: a bare `assert False` is removed when Python runs with -O.
            raise AssertionError('Some tokenization quality tests failed. See above for details.')
def test_person_name_tokenizer_non_names_low_scores():
    """Verify that non-name inputs get low (< 1e-10) probability scores.

    Every tokenization yielded by ``tokenize_with_scores`` for a label of the
    ``non_names`` section of ``tests/person_names_quality.json`` must have a
    log-probability below ``log(1e-10)``. One failure is recorded per offending
    tokenization, so a single label can contribute several failures.
    """
    with init_person_name_tokenizer([]) as tokenizer:
        from nameai.data import get_resource_path
        import json

        # The fixture is JSON (UTF-8); do not depend on the platform's default encoding.
        with open(get_resource_path('tests/person_names_quality.json'), encoding='utf-8') as f:
            quality_tests = json.load(f)

        non_name_cases = quality_tests['non_names']
        max_log_prob = math.log(1e-10)  # scores are compared in log space
        failures = []
        for input_label in non_name_cases:
            for _tokens, log_prob in tokenizer.tokenize_with_scores(input_label):
                if log_prob >= max_log_prob:
                    failures.append(f'Expected very low score for non-name {input_label}, got {log_prob}')

        if failures:
            print('\n=== PersonNameTokenizer Quality Test Failures [non_names] ===')
            for failure in failures:
                print(failure)
            # Report against the number of cases in this section; len(quality_tests)
            # would only count the top-level sections of the fixture.
            print(f'\nTotal failures: {len(failures)} out of {len(non_name_cases)} test cases')
            # Raise explicitly: a bare `assert False` is removed when Python runs with -O.
            raise AssertionError('Some tokenization quality tests failed. See above for details.')


def test_person_name_tokenizer_probability_ranges():
    """
    Verify probability scoring across input categories.

    Tests probability ranges for:
    1. Clear names: high scores (> log(1e-8))
    2. Ambiguous cases: medium scores (log(1e-12) to log(1e-8))
    3. Non-names: very low scores (< log(1e-15))

    For clear and ambiguous inputs at least one tokenization must fall in the
    range; for non-names every tokenization must be below the bound (which
    also holds when no tokenization is produced at all).
    """
    high = math.log(1e-8)
    medium_floor = math.log(1e-12)
    very_low = math.log(1e-15)

    with init_person_name_tokenizer([]) as tokenizer:
        # test clear person names
        for label in ('giancarloesposito', 'piotrwiśniewski'):
            tokenizations = list(tokenizer.tokenize_with_scores(label))
            assert any(
                score > high for _, score in tokenizations
            ), 'Clear person name should have high probability'

        # test ambiguous cases
        for label in ('dragonfernandez', 'wolfsmith'):
            tokenizations = list(tokenizer.tokenize_with_scores(label))
            assert any(
                medium_floor < score < high for _, score in tokenizations
            ), 'Ambiguous case should have medium probability'

        # test non-names
        for label in ('cryptocurrency', 'blockchain', 'yerbamate'):
            tokenizations = list(tokenizer.tokenize_with_scores(label))
            assert all(
                score < very_low for _, score in tokenizations
            ), 'Non-name should have very low probability'