diff --git a/research/elasticsearch/populate.py b/research/elasticsearch/populate.py index e2d1cccc..e40c0420 100644 --- a/research/elasticsearch/populate.py +++ b/research/elasticsearch/populate.py @@ -116,9 +116,17 @@ def initialize_index(es: Elasticsearch): "data.names.tokenized_name": {"type": "text", "similarity": "BM25_b0"}, "data.collection_description": {"type": "text", "similarity": "BM25"}, "data.collection_keywords": {"type": "text", "similarity": "BM25"}, - # "template.collection_articles": {"type": "text", "similarity": "BM25"}, # TODO remove? "template.collection_rank": {"type": "rank_feature"}, "metadata.members_count": {"type": "rank_feature"}, + "template.members_rank_mean": {"type": "rank_feature"}, + "template.members_rank_median": {"type": "rank_feature"}, + "template.members_system_interesting_score_mean": {"type": "rank_feature"}, + "template.members_system_interesting_score_median": {"type": "rank_feature"}, + "template.valid_members_count": {"type": "rank_feature"}, + "template.invalid_members_count": {"type": "rank_feature", "positive_score_impact": False}, + "template.valid_members_ratio": {"type": "rank_feature"}, + "template.nonavailable_members_count": {"type": "rank_feature"}, + "template.nonavailable_members_ratio": {"type": "rank_feature"}, } }, } @@ -136,7 +144,7 @@ def insert_collection(es: Elasticsearch, collection: dict): def insert_collections(es: Elasticsearch, collections: Iterable[dict]): - number_of_docs = 500000 + number_of_docs = 561000 progress = tqdm(unit="docs", total=number_of_docs) successes = 0 @@ -154,14 +162,20 @@ def insert_collections(es: Elasticsearch, collections: Iterable[dict]): def gen(path, limit): with jsonlines.open(path, 'r') as reader: + too_long = 0 for doc in islice(reader.iter(skip_empty=True, skip_invalid=True), limit): # rank_feature must be positive - if doc['data']['collection_name'].startswith('Lists of'): - continue - doc['template']['collection_rank'] = max(1, doc['template']['collection_rank']) # remove? - doc['metadata']['members_count'] = len(doc['data']['names']) + # if doc['data']['collection_name'].startswith('Lists of'): + # continue + # doc['template']['collection_rank'] = max(1, doc['template']['collection_rank']) # remove? + # doc['metadata']['members_count'] = len(doc['data']['names']) + + doc['template']['nonavailable_members_count'] += 1 #TODO? + doc['template']['invalid_members_count'] += 1 #TODO? + if doc['metadata']['members_count'] > 10000: + too_long += 1 continue yield { @@ -169,6 +183,7 @@ def gen(path, limit): # "_type": '_doc', "_source": doc } + print(f'{too_long} collections too long') if __name__ == '__main__': @@ -188,9 +203,13 @@ def gen(path, limit): else: es = connect_to_elasticsearch(args.scheme, args.host, args.port, args.username, args.password) + es.indices.delete(index=INDEX_NAME) + initialize_index(es) insert_collections(es, gen(args.input, args.limit)) search = es.search(index=INDEX_NAME, body={'query': {'bool': {}}}) print(f'Documents overall in {INDEX_NAME} - {len(search["hits"]["hits"])}') + # 103 collections too long + # Indexed 424307 documents \ No newline at end of file diff --git a/research/elasticsearch/search.py b/research/elasticsearch/search.py index f46dbd37..91994245 100644 --- a/research/elasticsearch/search.py +++ b/research/elasticsearch/search.py @@ -1,33 +1,22 @@ import json from argparse import ArgumentParser +from math import log10 -from elasticsearch import Elasticsearch -from tqdm import tqdm +from copy import deepcopy +from elasticsearch import Elasticsearch from populate import INDEX_NAME, connect_to_elasticsearch_using_cloud_id, connect_to_elasticsearch -# INDEX_NAME = 'collections14all' - - - - - -def search_by_name(query, limit, with_rank=True): - body = { +COMMON_QUERY = { "query": { "bool": { "must": [ { "multi_match": { - "query": query, "fields": [ - "data.collection_name^3", + "data.collection_name^2", "data.collection_name.exact^3", - "data.collection_description^2", - "data.collection_keywords^2", - "data.names.normalized_name", - "data.names.tokenized_name", ], "type": "cross_fields", } @@ -37,29 +26,47 @@ def search_by_name(query, limit, with_rank=True): { "rank_feature": { "field": "template.collection_rank", - "boost": 100, - # "log": { - # "scaling_factor": 4 - # } + "boost": 10, + "log": { + "scaling_factor": 1 + } } }, { "rank_feature": { "field": "metadata.members_count", + "boost": 10, + "log": { + "scaling_factor": 1 + } } } ] } }, - "size": limit, - } +} + + +def search_by_name(query, limit, with_rank=True, with_keyword_description=False): + body = deepcopy(COMMON_QUERY) + body['query']['bool']['must'][0]['multi_match']['query'] = query + if with_keyword_description: + body['query']['bool']['must'][0]['multi_match']['fields'] += [ + "data.collection_description^2", + "data.collection_keywords^2", + ] + + + print(f'{body["query"]["bool"]}') + if not with_rank: del body['query']['bool']['should'] response = es.search( index=INDEX_NAME, - body=body, + query = body['query'], + size=limit, explain=args.explain ) @@ -68,50 +75,23 @@ def search_by_name(query, limit, with_rank=True): def search_by_all(query, limit): + body = deepcopy(COMMON_QUERY) + body['query']['bool']['must'][0]['multi_match']['query'] = query + body['size'] = limit + body['query']['bool']['must'][0]['multi_match']['fields'] += [ + "data.collection_description^2", + "data.collection_keywords^2", + "data.names.normalized_name", + "data.names.tokenized_name", + ] + + print(f'{body["query"]["bool"]}') + response = es.search( index=INDEX_NAME, + query=body['query'], + size=limit, explain=args.explain, - body={ - "query": { - "bool": { - "must": [ - { - "multi_match": { - "query": query, - "fields": [ - "data.collection_name^3", - "data.collection_name.exact^3", - "data.names.normalized_name", - "data.names.tokenized_name", - "data.collection_description^2", - "data.collection_keywords^2", - # "template.collection_articles" - ], - "type": "cross_fields", - } - } - ], - "should": [ - { - "rank_feature": { - "field": "template.collection_rank", - "boost": 100, - # "log": { - # "scaling_factor": 4 - # } - } - }, - { - "rank_feature": { - "field": "metadata.members_count", - } - } - ] - } - - }, - "size": limit, - }, ) hits = response["hits"]["hits"] @@ -119,16 +99,269 @@ def search_by_all(query, limit): def print_exlanation(hits): - print('
Explanation') - print(f'') + print('
Explanation
nameexplanation
') for hit in hits: name = hit['_source']['data']['collection_name'] explanation = hit['_explanation'] print( - f'') + f'') print('
{name}
{json.dumps(explanation, indent=2, ensure_ascii=False)}
{name}
{json.dumps(explanation, indent=2, ensure_ascii=False)}
') +class Search: + def description(self): + return '

%s

' % self.header() + + def values(self, hit, args): + score = "%.1f" % hit['_score'] + name = hit['_source']['data']['collection_name'] + keywords = '; '.join(hit['_source']['data']['collection_keywords'][:10]) + if len(hit['_source']['data']['collection_keywords']) > 10: + keywords += "..." + description = hit['_source']['data']['collection_description'] + rank = "%.3f" % log10(hit['_source']['template']['collection_rank']) + link = hit['_source']['template']['collection_wikipedia_link'] + type_wikidata_ids = ', '.join(['' + name + '' for id, name in hit['_source']['template']['collection_types']]) + #print(hit['_source']['data']) + names = ', '.join([x['normalized_name'] for x in hit['_source']['data']['names'][:args.limit_names]]) + + if len(hit['_source']['data']['names']) > args.limit_names: + names += "..." + + wikidata_id = hit['_source']['template']['collection_wikidata_id'] + wikidata_id = '' + wikidata_id + '' + + members_count = len(hit['_source']['data']['names']) + members_value = f"{members_count} : {'%.3f' % log10(members_count)}" + + rank_mean = "%.1f" % log10(hit['_source']['template']['members_rank_mean']) + rank_median = "%.1f" % log10(hit['_source']['template']['members_rank_median']) + score_mean = "%.3f" % hit['_source']['template']['members_system_interesting_score_mean'] + score_median = "%.3f" % hit['_source']['template']['members_system_interesting_score_median'] + valid_count = hit['_source']['template']['valid_members_count'] + valid_ratio = "%.3f" % hit['_source']['template']['valid_members_ratio'] + noavb_count = hit['_source']['template']['nonavailable_members_count'] + noavb_ratio = "%.3f" % hit['_source']['template']['nonavailable_members_ratio'] + is_merged = hit['_source']['template']['is_merged'] + + return [score, name, rank, members_value, rank_mean, rank_median, score_mean, score_median, valid_count, valid_ratio, noavb_count, noavb_ratio, is_merged, wikidata_id, type_wikidata_ids, description, keywords, names, ] + + def columns(self): + return ['score', 'name', 'rank', 'members', 'm. rank mean', 'm. rank median', 'm. int. score mean', 'm. int. score median', 'valid m. count', 'valid m. ratio', 'nonav. count', 'nonav. ratio', 'is merged', 'wikidata', 'types', 'desciption', 'keywords', 'names'] + + def __call__(self, query, args): + + body = self.body(query) + + print(f'{body["query"]["bool"]}') + + response = es.search( + index=INDEX_NAME, + query=body['query'], + size=args.limit, + explain=args.explain + ) + + hits = response["hits"]["hits"] + return hits + + +class NameRankSearch(Search): + def __call__(self, query, args): + return search_by_name(query, args.limit) + + def header(self): + return 'name with rank' + +class NameSearch(Search): + def __call__(self, query, args): + return search_by_name(query, args.limit, with_rank=False) + + def header(self): + return 'only name without rank' + +class NameKeywordsDescriptionSearch(Search): + def __call__(self, query, args): + return search_by_name(query, args.limit, with_rank=False, with_keyword_description=True) + + def header(self): + return 'name, description, keywords without rank' + +class NameMembersSearch(Search): + def __call__(self, query, args): + return search_by_all(query, args.limit) + + def header(self): + return 'name, description, keywords, members' + + +class NameTypeSearch(Search): + def body(self, query): + body = deepcopy(COMMON_QUERY) + body['query']['bool']['must'][0]['multi_match']['query'] = query + body['query']['bool']['must'][0]['multi_match']['fields'] += [ + "data.collection_types^3", + ] + + return body + + + def header(self): + return 'name, type, rank' + +class MemberMeanSearch(Search): + def body(self, query): + body = deepcopy(COMMON_QUERY) + body['query']['bool']['must'][0]['multi_match']['query'] = query + body['query']['bool']['must'][0]['multi_match']['fields'] += [ + "data.collection_types^3", + ] + + body['query']['bool']['should'] = [ + { + "rank_feature": { + "field": "template.members_rank_mean", + "boost": 10, + "log": { + "scaling_factor": 1 + } + } + } + ] + + return body + + def header(self): + return 'name, type, mean member rank' + + +class MemberMedianSearch(Search): + def body(self, query): + body = deepcopy(COMMON_QUERY) + body['query']['bool']['must'][0]['multi_match']['query'] = query + body['query']['bool']['must'][0]['multi_match']['fields'] += [ + "data.collection_types^3", + ] + + body['query']['bool']['should'] = [ + { + "rank_feature": { + "field": "template.members_rank_median", + "boost": 10, + "log": { + "scaling_factor": 1 + } + } + } + ] + + return body + + def header(self): + return 'name, type, median member rank' + +class ScoreMeanSearch(Search): + def body(self, query): + body = deepcopy(COMMON_QUERY) + body['query']['bool']['must'][0]['multi_match']['query'] = query + body['query']['bool']['must'][0]['multi_match']['fields'] += [ + "data.collection_types^3", + ] + + body['query']['bool']['should'] = [ + { + "rank_feature": { + "field": "template.members_system_interesting_score_mean", + "boost": 40, + "log": { + "scaling_factor": 1 + } + } + } + ] + + return body + + def header(self): + return 'name, type, score mean rank' + +class ScoreMedianSearch(Search): + def body(self, query): + body = deepcopy(COMMON_QUERY) + body['query']['bool']['must'][0]['multi_match']['query'] = query + body['query']['bool']['must'][0]['multi_match']['fields'] += [ + "data.collection_types^3", + ] + + body['query']['bool']['should'] = [ + { + "rank_feature": { + "field": "template.members_system_interesting_score_median", + "boost": 40, + "log": { + "scaling_factor": 1 + } + } + } + ] + + return body + + def header(self): + return 'name, type, score median rank' + +class ValidRatioSearch(Search): + def body(self, query): + body = deepcopy(COMMON_QUERY) + body['query']['bool']['must'][0]['multi_match']['query'] = query + body['query']['bool']['must'][0]['multi_match']['fields'] += [ + "data.collection_types^3", + ] + + body['query']['bool']['should'] = [ + { + "rank_feature": { + "field": "template.valid_members_ratio", + "boost": 40, + "log": { + "scaling_factor": 1 + } + } + } + ] + + return body + + def header(self): + return 'name, type, valid ratio' + +class NonavbRatioSearch(Search): + def body(self, query): + body = deepcopy(COMMON_QUERY) + body['query']['bool']['must'][0]['multi_match']['query'] = query + body['query']['bool']['must'][0]['multi_match']['fields'] += [ + "data.collection_types^3", + ] + + body['query']['bool']['should'] = [ + { + "rank_feature": { + "field": "template.nonavailable_members_ratio", + "boost": 40, + "log": { + "scaling_factor": 1 + } + } + } + ] + + return body + + def header(self): + return 'name, type, nonavailable members ratio' + +MemberMedianSearch(),ScoreMeanSearch(),ScoreMedianSearch(),ValidRatioSearch(),NonavbRatioSearch() + if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('queries', nargs='+', help='queries') @@ -148,64 +381,26 @@ def print_exlanation(hits): else: es = connect_to_elasticsearch(args.scheme, args.host, args.port, args.username, args.password) - for query in tqdm(args.queries): - print(f'

{query}

') - - print(f'

only collection

') - hits = search_by_name(query, args.limit) - print('') - print(f'') - for hit in hits: - score = hit['_score'] - name = hit['_source']['data']['collection_name'] - rank = hit['_source']['template']['collection_rank'] - link = hit['_source']['template']['collection_wikipedia_link'] - type_wikidata_id = hit['_source']['template']['collection_type_wikidata_id'] - wikidata_id = hit['_source']['template']['collection_wikidata_id'] - print( - f'') - print('
scorenamerankwikidatatype
{score}{name}{rank}{wikidata_id}{type_wikidata_id}
') - - if args.explain: print_exlanation(hits) - - print(f'

only collection without rank

') - hits = search_by_name(query, args.limit, with_rank=False) - print('') - print(f'') - for hit in hits: - score = hit['_score'] - name = hit['_source']['data']['collection_name'] - rank = hit['_source']['template']['collection_rank'] - link = hit['_source']['template']['collection_wikipedia_link'] - type_wikidata_id = hit['_source']['template']['collection_type_wikidata_id'] - wikidata_id = hit['_source']['template']['collection_wikidata_id'] - print( - f'') - print('
scorenamerankwikidatatype
{score}{name}{rank}{wikidata_id}{type_wikidata_id}
') - - if args.explain: print_exlanation(hits) - - print(f'

collection + names

') - hits = search_by_all(query, args.limit) - print('') - print(f'') - for hit in hits: - score = hit['_score'] - name = hit['_source']['data']['collection_name'] - rank = hit['_source']['template']['collection_rank'] - link = hit['_source']['template']['collection_wikipedia_link'] - type_wikidata_id = hit['_source']['template']['collection_type_wikidata_id'] - wikidata_id = hit['_source']['template']['collection_wikidata_id'] - names = f"{len(hit['_source']['data']['names'])}: " + ', '.join( - [x['normalized_name'] for x in hit['_source']['data']['names'][:args.limit_names]]) - print( - f'') - - # print(hit['_score'], hit['_source']['data']['collection_name'], 'RANK:', - # hit['_source']['template']['collection_rank'], - # hit['_source']['template']['collection_wikipedia_link']) - # print(', '.join([x['normalized_name'] for x in hit['_source']['data']['names']])) - print() - print('
scorenamerankwikidatatypenames
{score}{name}{rank}{wikidata_id}{type_wikidata_id}{names}
') - - if args.explain: print_exlanation(hits) + print(f'') + + for search in [NameTypeSearch(),MemberMeanSearch(),MemberMedianSearch(),ScoreMeanSearch(),ScoreMedianSearch(),ValidRatioSearch(),NonavbRatioSearch()]: + print(f'

{search.description()}

') + + for query in args.queries: + #for search in [NameRankSearch(), NameSearch(), NameKeywordsDescriptionSearch(), NameMembersSearch()]: + print(f'

{query}

') + hits = search(query, args) + print('') + print(f'') + for column in search.columns(): + print(f'') + print(f'') + + for hit in hits: + print('') + for value in search.values(hit, args): + print(f'') + print('') + print('
{column}
{value}
') + + if args.explain: print_exlanation(hits) \ No newline at end of file