Skip to content

Commit fa1b7e5

Browse files
committed
refactoring for new way to page results
1 parent 0d4effd commit fa1b7e5

File tree

1 file changed

+38
-86
lines changed

1 file changed

+38
-86
lines changed

src/main.py

Lines changed: 38 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -159,34 +159,38 @@ def print_roundtrip(response, *args, **kwargs):
159159

160160

161161
def get_patent(patent_number, fields=None):
    """Fetch a single patent from the patent search endpoint.

    Args:
        patent_number: Identifier of the patent; it is sent as the value of
            the ``patent_id`` query key.
        fields: Optional JSON array string naming the fields to return.
            When ``None`` the default field list below is requested.

    Returns:
        Whatever ``patentsview_post_request`` returns for this query.
    """
    if fields is None:
        # Building the list in Python and serialising it guarantees valid JSON;
        # the hand-written literal previously contained a stray ",," which
        # made the array unparseable.
        default_fields = [
            "patent_id",
            "patent_title",
            "patent_abstract",
            "patent_date",
            "patent_year",
            "cpc_current.cpc_group_id",
            "cpc_current",
            "patent_kind",
            "patent_type",
            "assignees.assignee_country",
            # NOTE(review): "assignee_assignee_id" looks like a doubled prefix;
            # confirm the field name against the API's field list.
            "assignees.assignee_assignee_id",
            "assignees.assignee_organization",
            "assignees.assignee_name_first",
            "assignees.assignee_name_last",
        ]
        fields = json.dumps(default_fields, separators=(",", ":"))

    # json.dumps escapes any quote/backslash in the identifier, which raw
    # "%s" interpolation would not.
    patent_query = json.dumps({"patent_id": str(patent_number)}, separators=(",", ":"))
    return patentsview_post_request(patent_search_endpoint, patent_query, fields)
168168

169169

170170
def get_all_company_patents(company, beginning_year=None, end_year=None, verbose=False):
    """Collect every patent for a company by following cursor-style paging.

    The first page is requested without a cursor. While the reported
    ``total_hits`` exceeds what has been collected, the ``patent_id`` of the
    last patent received is passed as ``after`` to request the next page.

    Args:
        company: Company name forwarded to ``get_one_page_of_company_patents``.
        beginning_year: Optional start year, forwarded unchanged.
        end_year: Optional end year, forwarded unchanged.
        verbose: Forwarded unchanged to the page request.

    Returns:
        A list of patent records accumulated across all pages.
    """
    first_page = get_one_page_of_company_patents(company, beginning_year, end_year, verbose=verbose)
    patents = first_page["patents"] or []

    # API change, attribute was total_patent_count, now total_hits
    total_hits = first_page["total_hits"]

    if patents and total_hits > first_page["count"]:
        # Loop on what has actually been collected rather than on a page count
        # precomputed from the request size, so a short page cannot cause
        # results to be silently dropped.
        while len(patents) < total_hits:
            after = patents[-1]["patent_id"]
            page_results = get_one_page_of_company_patents(
                company, beginning_year, end_year, verbose=verbose, after=after
            )
            page_patents = page_results["patents"]
            if not page_patents:
                # An empty page means the cursor cannot advance; stop instead
                # of re-issuing the identical request.
                break
            patents += page_patents

    # TODO see if it is better to yield instead of to return
    return patents
185189

186190

187-
def get_one_page_of_company_patents(company, beginning_year=None, end_year=None, page=1, perpage=25, verbose=False):
188-
print("Requesting PatentsView: %s, page %d" % (company, page))
189-
company_query = '{"%s":{"assignees_at_grant.organization":"%s"}}' % (COMPANY_SEARCH_CRITERIA, company)
191+
def get_one_page_of_company_patents(company, beginning_year=None, end_year=None, perpage=REQUEST_SIZE, verbose=False, after=""):
192+
print("Requesting PatentsView: %s" % (company))
193+
company_query = '{"%s":{"assignees.assignee_organization":"%s"}}' % (COMPANY_SEARCH_CRITERIA, company)
190194
date_range = None
191195

192196
if beginning_year is not None and end_year is not None:
@@ -267,7 +271,7 @@ def patentsview_post_request(endpoint, query_param, format_param=None, options_p
267271

268272
raise Exception("Status code: %s\r\n%s\n" % (r.status_code, r.text, extra))
269273

270-
return r.text
274+
return json.loads(r.text)
271275

272276

273277
def insert_alternate_names(primary_id, alternate_names, commit_after_insert=True):
@@ -339,84 +343,32 @@ def add_cited_patents(limit=REQUEST_SIZE, verbose=False):
339343
for patents in fetch_patents_by_number(patent_search_endpoint, cited_patents_to_add, results_format, limit=limit, verbose=verbose):
340344
add_patents(patents)
341345

342-
def add_cited_patent_numbers(patents_list, limit=REQUEST_SIZE, verbose=False):
    """Fetch citations for the given patents and store them in the database.

    Args:
        patents_list: Patent identifiers whose citations are requested.
        limit: Batch size forwarded to ``fetch_patents_by_number``.
        verbose: Forwarded unchanged to ``fetch_patents_by_number``.
    """
    # Citations are now retrieved solely from the citation endpoint.
    citation_fields = '["patent_id","citation_patent_id"]'

    citation_batches = fetch_patents_by_number(
        citation_search_endpoint,
        patents_list,
        citation_fields,
        limit=limit,
        verbose=verbose,
    )
    for batch in citation_batches:
        add_cited_patent_numbers_to_db(batch)
356352

353+
def fetch_patents_by_number(search_endpoint, patents_list, results_format, limit=REQUEST_SIZE, verbose=False):
    """Yield API results for ``patents_list``, one batch of ``limit`` IDs at a time.

    Requests are posted in batches of ``limit`` patents, with the response
    size option set to the same value, so that results should not need paging.

    Args:
        search_endpoint: URL of the endpoint to query. Its last path segment
            plus "s" is used as the key of the result list in the response
            (e.g. ``.../patent/`` -> ``"patents"``).
        patents_list: Sliceable sequence of patent identifiers.
        results_format: JSON array string naming the fields to return.
        limit: Maximum number of identifiers per request; must be positive.
        verbose: Forwarded unchanged to ``patentsview_post_request``.

    Yields:
        The list stored under the endpoint's entity key for each batch.

    Raises:
        ValueError: If ``limit`` is not a positive number.
    """
    if not limit or limit < 1:
        raise ValueError("limit must be a positive integer, got %r" % (limit,))

    # Currently we're calling either the patent endpoint or patent_citation endpoint
    # the returned entity will either be patents or patent_citations.
    # Stripping the trailing slash first also tolerates endpoints written
    # without one.
    return_entity = search_endpoint.rstrip("/").rsplit("/", 1)[-1] + "s"
    options_param = json.dumps({"size": limit}, separators=(",", ":"))

    for start_index in range(0, len(patents_list), limit):
        batch = patents_list[start_index:start_index + limit]
        # Serialise every ID as a JSON string: unquoted IDs are not valid JSON
        # for non-numeric identifiers, and joining non-str IDs would raise.
        q_str = json.dumps(
            {"patent_id": [str(patent_id) for patent_id in batch]},
            separators=(",", ":"),
        )
        results = patentsview_post_request(
            search_endpoint, q_str, results_format, options_param=options_param, verbose=verbose
        )
        yield results[return_entity]
420372

421373

422374
def add_cited_patent_numbers_to_db(citations: List) -> None:

0 commit comments

Comments
 (0)