@@ -242,8 +242,34 @@ def get_company_primary_id(name):
242242 return None
243243
244244
245- def add_cited_patents (patents_list , limit = 25 , verbose = False ):
def fetch_all_cited_patent_numbers_for_all_patents_in_db(verbose=False):
    """Look up cited patent numbers for every patent currently in the database.

    Reads every ``Patent.patent_number`` through the module-level ``session``
    and passes the resulting list to ``add_cited_patent_numbers``.

    Args:
        verbose: Forwarded unchanged to ``add_cited_patent_numbers``.
    """
    patent_rows = session.query(Patent.patent_number).all()
    patent_numbers = [row.patent_number for row in patent_rows]
    add_cited_patent_numbers(patent_numbers, verbose=verbose)
250+
251+
def add_cited_patents(limit: int = 25, verbose: bool = False) -> None:
    """Populate the patents table with cited patents that are missing from it.

    Collects every ``cited_patent_number`` stored for ``CitedPatent`` that has
    no matching ``Patent.patent_number``, requests the fields listed in
    ``results_format`` for those numbers through ``fetch_patents_by_number``,
    and hands each non-empty batch to ``add_patents``.

    Args:
        limit: Forwarded unchanged to ``fetch_patents_by_number``.
        verbose: Forwarded unchanged to ``fetch_patents_by_number``.
    """
    # TODO refactor this function to accept a list of patents
    results_format = ('["patent_number","patent_date","patent_year","assignee_organization","app_date",'
                      '"patent_title","uspc_mainclass_id","assignee_first_name","assignee_last_name"]'
                      )
    patents_in_db = session.query(Patent.patent_number)
    # A patent cited by several patents appears once per citation row;
    # DISTINCT keeps the same number from being requested (and handed to
    # add_patents) more than once.
    missing_rows = (
        session.query(CitedPatent.cited_patent_number)
        .filter(~CitedPatent.cited_patent_number.in_(patents_in_db))
        .distinct()
        .all()
    )
    cited_patents_to_add = [row.cited_patent_number for row in missing_rows]
    if not cited_patents_to_add:
        # Nothing is missing, so there is nothing to request.
        return

    for patents in fetch_patents_by_number(cited_patents_to_add, results_format, limit=limit, verbose=verbose):
        # The generator can yield an empty (or null) batch; skip those, as
        # fetch_patents_for_all_companies_in_db does before add_patents.
        if patents:
            add_patents(patents)
264+
265+
def add_cited_patent_numbers(patents_list, limit=25, verbose=False):
    """Fetch the cited patent numbers for the given patents and store them.

    Requests the ``patent_number`` / ``cited_patent_number`` fields for every
    entry of ``patents_list`` through ``fetch_patents_by_number`` and passes
    each non-empty batch of results to ``add_cited_patent_numbers_to_db``.

    Args:
        patents_list: Patent numbers to look up. An empty list is a no-op.
        limit: Forwarded unchanged to ``fetch_patents_by_number``.
        verbose: Forwarded unchanged to ``fetch_patents_by_number``.
    """
    if not patents_list:
        # No patent numbers means there is nothing to look up.
        return

    results_format = '["patent_number","cited_patent_number"]'
    for patents in fetch_patents_by_number(patents_list, results_format, limit=limit, verbose=verbose):
        # The generator can yield an empty (or null) batch; skip those so the
        # db helper is not asked to iterate a missing or empty result.
        if patents:
            add_cited_patent_numbers_to_db(patents)
270+
271+
272+ def fetch_patents_by_number (patents_list , results_format , limit = 25 , verbose = False ):
247273 q_list = ['"%s"' % patent_number for patent_number in patents_list ]
248274 q_str = '{"patent_number":[%s]}' % "," .join (q_list )
249275
@@ -260,6 +286,7 @@ def add_cited_patents(patents_list, limit=25, verbose=False):
260286 response = patentsview_get_request (patent_search_endpoint , q_str , results_format , verbose = verbose )
261287 results = json .loads (response )
262288 patents = results ['patents' ]
289+ yield patents
263290 else :
264291 if limit and ((endpoint_length // 2000 ) < (endpoint_length // limit )):
265292 number_of_chunks = endpoint_length // limit + 1
@@ -274,16 +301,29 @@ def add_cited_patents(patents_list, limit=25, verbose=False):
274301 q_str = '{"patent_number":[%s]}' % "," .join (q_list [start_index :end_index ])
275302 response = patentsview_get_request (patent_search_endpoint , q_str , results_format , verbose = verbose )
276303 results = json .loads (response )
277- print (results )
304+ if verbose :
305+ print (results )
278306 if results ['patents' ]:
279307 patents += results ['patents' ]
280308
309+ # This is to potentially avoid a "Segmentation Fault (core dumped)" error
310+
311+ # TODO change this to an implementation that is more programmatic
312+ if len (patents ) >= 1000 :
313+ yield patents
314+ patents = []
315+ yield patents
316+
317+
318+ def add_cited_patent_numbers_to_db (citing_patent_numbers : List ) -> None :
319+ print ("Adding cited patent numbers to db." )
281320 # Patents that are already in the db
282- cited_patents_in_db = [(x .citing_patent_number , x .cited_patent_number ) for x in session .query (CitedPatent ).all ()]
321+ cited_patents_in_db = [(x .citing_patent_number , x .cited_patent_number ) for x in
322+ session .query (CitedPatent ).all ()]
283323 # Patents fetched
284324 cited_patent_objects = []
285325 # Add ALL cited patents to cited_patent_objects list
286- for patent in patents :
326+ for patent in citing_patent_numbers :
287327 patent_number = patent ["patent_number" ]
288328 for cited_patent_number in patent ["cited_patents" ]:
289329 # Check if there are cited patents in the results and if they are already in the database
@@ -367,7 +407,7 @@ def add_patents(patents):
367407 session .commit ()
368408
369409
370- def fetch_patents_for_all_companies_in_db (resume_from_company_id = None ):
410+ def fetch_patents_for_all_companies_in_db (resume_from_company_id = None , verbose = False ):
371411 if resume_from_company_id and type (resume_from_company_id ) == int :
372412 company_query = session .query (Company .id ).filter (Company .id >= resume_from_company_id ).order_by (Company .id .asc ()).all ()
373413 else :
@@ -380,23 +420,16 @@ def fetch_patents_for_all_companies_in_db(resume_from_company_id=None):
380420 alternate_names = session .query (AlternateName .name , AlternateName .id ).filter_by (company_id = company_id ).all ()
381421
382422 for org in primary_names :
383- patents = get_all_company_patents (org [0 ], verbose = True )
423+ patents = get_all_company_patents (org [0 ], verbose = verbose )
384424 if patents :
385425 add_patents (patents )
386426
387427 for org , alternate_name_id in alternate_names :
388- patents = get_all_company_patents (org , verbose = True )
428+ patents = get_all_company_patents (org , verbose = verbose )
389429 if patents :
390430 add_patents (patents )
391431
392432
393- def fetch_all_cited_patent_numbers_for_all_patents_in_db ():
394- l = []
395- for number in session .query (Patent .patent_number ).all ():
396- l .append (number .patent_number )
397- add_cited_patents (l , verbose = True )
398-
399-
400433def main ():
401434 options = get_options ()
402435
@@ -416,6 +449,7 @@ def main():
416449 end_date = options .end_date [0 ]
417450
418451 # TODO: implement functionality that uses the Start and End dates
452+ """
419453 if options.fetch_patents_for_all_companies:
420454 company_id = options.resume_from_company_id
421455 if company_id:
@@ -425,6 +459,10 @@ def main():
425459 print("Fetching patents for all companies in the database.")
426460 fetch_patents_for_all_companies_in_db()
427461
462+ fetch_all_cited_patent_numbers_for_all_patents_in_db()
463+ """
464+ add_cited_patents ()
465+
428466
429467def get_options ():
430468 parser = argparse .ArgumentParser (description = "A script that calls the PatentsView API." ,
0 commit comments