@@ -159,34 +159,38 @@ def print_roundtrip(response, *args, **kwargs):
159159
160160
def get_patent(patent_number, fields=None):
    """Request a single patent from the patent search endpoint.

    Args:
        patent_number: Patent id to look up; interpolated into the
            ``patent_id`` criterion of the query.
        fields: Optional JSON array (as a string) naming the fields to
            return. When ``None`` the default field list below is used.

    Returns:
        Whatever ``patentsview_post_request`` returns for the query.
    """
    patent_query = '{"patent_id":"%s"}' % patent_number
    if fields is None:
        # The default list must be a valid JSON array: a doubled comma between
        # two entries makes the whole field parameter unparseable.
        # NOTE(review): "assignees.assignee_assignee_id" looks like a doubled
        # prefix -- confirm the field name against the API's field list.
        fields = ('["patent_id","patent_title","patent_abstract","patent_date","patent_year",'
                  '"cpc_current.cpc_group_id","cpc_current","patent_kind","patent_type",'
                  '"assignees.assignee_country","assignees.assignee_assignee_id",'
                  '"assignees.assignee_organization","assignees.assignee_name_first",'
                  '"assignees.assignee_name_last"]')
    return patentsview_post_request(patent_search_endpoint, patent_query, fields)
168168
169169
def get_all_company_patents(company, beginning_year=None, end_year=None, verbose=False):
    """Collect every page of a company's patents into one list.

    The first page is requested without a cursor. While the reported
    ``total_hits`` exceeds what has been returned, further pages are requested
    with ``after`` set to the ``patent_id`` of the last patent received so far.

    Args:
        company: Company name passed through to
            ``get_one_page_of_company_patents``.
        beginning_year: Optional first year of the range, passed through.
        end_year: Optional last year of the range, passed through.
        verbose: Passed through to the page request.

    Returns:
        The ``"patents"`` value of the first page, extended in place with the
        patents of every following page.
    """
    first_page = get_one_page_of_company_patents(company, beginning_year, end_year, verbose=verbose)
    patents = first_page["patents"]

    # API change, attribute was total_patent_count, now total_hits
    total_hits = first_page["total_hits"]
    if total_hits > first_page["count"]:
        after = patents[-1]["patent_id"]
        # Ceiling division: a partially filled last page still counts as a page.
        number_of_pages = (total_hits + REQUEST_SIZE - 1) // REQUEST_SIZE
        # The first page is already in hand, so request the remaining ones.
        for _ in range(1, number_of_pages):
            page_results = get_one_page_of_company_patents(
                company, beginning_year, end_year, verbose=verbose, after=after)
            page_patents = page_results["patents"]
            if not page_patents:
                # The cursor cannot advance past an empty page, so every
                # further request would be an identical repeat.
                break
            patents += page_patents
            after = patents[-1]["patent_id"]
    # TODO see if it is better to yield instead of to return
    return patents
185189
186190
187- def get_one_page_of_company_patents (company , beginning_year = None , end_year = None , page = 1 , perpage = 25 , verbose = False ):
188- print ("Requesting PatentsView: %s, page %d " % (company , page ))
189- company_query = '{"%s":{"assignees_at_grant.organization ":"%s"}}' % (COMPANY_SEARCH_CRITERIA , company )
191+ def get_one_page_of_company_patents (company , beginning_year = None , end_year = None , perpage = REQUEST_SIZE , verbose = False , after = "" ):
192+ print ("Requesting PatentsView: %s" % (company ))
193+ company_query = '{"%s":{"assignees.assignee_organization ":"%s"}}' % (COMPANY_SEARCH_CRITERIA , company )
190194 date_range = None
191195
192196 if beginning_year is not None and end_year is not None :
@@ -267,7 +271,7 @@ def patentsview_post_request(endpoint, query_param, format_param=None, options_p
267271
268272 raise Exception ("Status code: %s\r \n %s\n " % (r .status_code , r .text , extra ))
269273
270- return r .text
274+ return json . loads ( r .text )
271275
272276
273277def insert_alternate_names (primary_id , alternate_names , commit_after_insert = True ):
@@ -339,84 +343,32 @@ def add_cited_patents(limit=REQUEST_SIZE, verbose=False):
339343 for patents in fetch_patents_by_number (patent_search_endpoint , cited_patents_to_add , results_format , limit = limit , verbose = verbose ):
340344 add_patents (patents )
341345
def add_cited_patent_numbers(patents_list, limit=REQUEST_SIZE, verbose=False):
    """Fetch the citations of the given patents and pass them on for storage.

    Args:
        patents_list: Patent ids whose citations are requested.
        limit: Maximum number of patent ids per request, forwarded to
            ``fetch_patents_by_number``.
        verbose: Forwarded to ``fetch_patents_by_number``.

    Returns:
        None. Each batch yielded by ``fetch_patents_by_number`` is handed to
        ``add_cited_patent_numbers_to_db``.
    """
    # Field names must match the API exactly: no padding inside the quotes.
    results_format = '["patent_id","citation_patent_id"]'

    # now we only get the citations from the citation endpoint
    citation_batches = fetch_patents_by_number(
        citation_search_endpoint, patents_list, results_format, limit=limit, verbose=verbose)
    for citations in citation_batches:
        add_cited_patent_numbers_to_db(citations)
356352
def fetch_patents_by_number(search_endpoint, patents_list, results_format, limit=REQUEST_SIZE, verbose=False):
    """Yield the results for ``patents_list``, one batch of ids per request.

    The ids are split into consecutive batches of at most ``limit``; each batch
    is sent through ``patentsview_post_request`` with ``"size"`` set to
    ``limit``.

    Args:
        search_endpoint: Endpoint URL ending in ``/<entity>/``; the last path
            segment plus ``"s"`` is the key read from each response.
        patents_list: Sliceable sequence of patent ids (strings or integers).
        results_format: JSON array string of the fields to return.
        limit: Maximum number of ids per request; must be a positive integer.
        verbose: Forwarded to ``patentsview_post_request``.

    Yields:
        The list stored under the entity key of each response, or an empty
        list when that value is empty or null.

    Raises:
        ValueError: If ``limit`` is not positive or ``search_endpoint`` does
            not end in ``/<entity>/``.
    """
    if not limit or limit < 1:
        raise ValueError("limit must be a positive integer, got %r" % (limit,))

    # Currently we're calling either the patent endpoint or patent_citation endpoint
    # the returned entity will either be patents or patent_citations
    m = re.search(r'/([^/]*)/$', search_endpoint)
    if m is None:
        raise ValueError("cannot derive the return entity from endpoint %r" % (search_endpoint,))
    return_entity = m.group(1) + "s"
    options_param = '{"size":%d}' % limit

    # We'll post requests in batches of limit number of patents at a time
    # so we won't have to page
    # NOTE(review): on the citation endpoint one batch of ids may match more
    # than `limit` rows, in which case "size" would truncate the response --
    # confirm whether paging is needed there.
    for start_index in range(0, len(patents_list), limit):
        batch = patents_list[start_index:start_index + limit]
        # Serialise ids as JSON strings so values such as "RE12345" stay valid
        # JSON and integer ids do not break str.join. Surrounding quotes are
        # stripped first in case a caller already quoted the ids.
        q_str = json.dumps({"patent_id": [str(p).strip('"') for p in batch]},
                           separators=(",", ":"))
        results = patentsview_post_request(
            search_endpoint, q_str, results_format, options_param=options_param, verbose=verbose)
        yield results[return_entity] or []
420372
421373
422374def add_cited_patent_numbers_to_db (citations : List ) -> None :
0 commit comments