@@ -189,25 +189,31 @@ def get_one_page_of_company_patents(company, beginning_year=None, end_year=None,
189189 else :
190190 search_query = company_query
191191
192- results_format = ('["patent_number ","patent_date","patent_year","assignees_at_grant.organization ",'
193- '"cpc_current.cpc_subsection_id ","cpc_current",'
194- '"patent_title","assignees_at_grant.name_first ","assignees_at_grant.name_last "]'
192+ results_format = ('["patent_id ","patent_date","patent_year","assignees.assignee_organization ",'
193+ '"cpc_current.cpc_group_id ","cpc_current",'
194+ '"patent_title","assignees.assignee_name_first ","assignees.assignee_name_last "]'
195195 )
196+
197+ options = {}
198+ options ["size" ] = perpage # the API defaults "size" to 100 rows if not specified and it can be up to 1000
196199
197- # old way: options_param = '{"page":%d,"per_page":%d}' % (page, perpage)
198- # new version of the api:
199- options_param = '{"offset":%d, "size":%d}' % ((page - 1 ) * perpage , perpage )
200+ # paging the newest way: we have to supply the previous page's last element in the "after" parameter,
201+ # which also means that the request has to specify a sort
202+
203+ if after != "" :
204+ options ["after" ] = after
200205
201- response_in_json = patentsview_post_request (patent_search_endpoint , search_query ,
206+ options_param = json .dumps (options )
207+
208+ response = patentsview_post_request (patent_search_endpoint , search_query ,
202209 results_format , options_param = options_param , verbose = verbose )
203- response = json .loads (response_in_json )
204210 if verbose :
205211 print (response )
206212 return response
207213
208214
209215# https://stackoverflow.com/a/41837318/6288413
210- # sort_param could be something like '[{"patent_date":"desc"},{"patent_number ":"desc"}]'
216+ # sort_param could be something like '[{"patent_date":"desc"},{"patent_id ":"desc"}]'
211217def patentsview_post_request (endpoint , query_param , format_param = None , options_param = None , sort_param = None ,
212218 verbose = False ):
213219 if not endpoint :
@@ -305,17 +311,17 @@ def get_company_primary_id(name):
305311def fetch_all_cited_patent_numbers_for_all_patents_in_db (verbose = False ):
306312 l = []
307313 for number in session .query (Patent .patent_number ).distinct ().all ():
308- l .append (number .patent_number )
314+ l .append ('"' + number .patent_number + '"' )
309315 add_cited_patent_numbers (l , verbose = verbose )
310316
311317
312- def add_cited_patents (limit = 25 , verbose = False ):
318+ def add_cited_patents (limit = REQUEST_SIZE , verbose = False ):
313319 # This function populates the patents table with the missing information for the
314320 # patent numbers found in the cited_patents table
315321 # TODO refactor this function to accept a list of patents
316- results_format = ('["patent_number ","patent_date","patent_year",'
317- '"assignees_at_grant.organization ","cpc_current",'
318- '"patent_title","assignees_at_grant.name_first ","assignees_at_grant.name_last "]'
322+ results_format = ('["patent_id ","patent_date","patent_year",'
323+ '"assignees.assignee_organization ","cpc_current",'
324+ '"patent_title","assignees.assignee_name_first ","assignees.assignee_name_last "]'
319325 )
320326 patents_in_db = session .query (Patent .patent_number )
321327 cited_patents_to_add = [x .cited_patent_number for x in session .query (CitedPatent .cited_patent_number )\
@@ -411,11 +417,11 @@ def add_cited_patent_numbers_to_db(citations: List) -> None:
411417 # Patents fetched
412418 cited_patent_objects = []
413419 # Add ALL cited patents to cited_patent_objects list
414- # API change: now we get a list like these {"patent_number ":"7767404","cited_patent_number ":"7494776"}
420+ # API change: now we get a list like these {"patent_id ":"7767404","citation_patent_id ":"7494776"}
415421 for citation in citations :
416- patent_number = citation ["patent_number " ]
422+ patent_number = citation ["patent_id " ]
417423 # Check if there are cited patents in the results and if they are already in the database
418- cited_patent_number = citation ["cited_patent_number " ]
424+ cited_patent_number = citation ["citation_patent_id " ]
419425 if cited_patent_number :
420426 cited_patent_objects .append ((patent_number , cited_patent_number ))
421427
@@ -429,35 +435,43 @@ def add_cited_patent_numbers_to_db(citations: List) -> None:
429435
430436 session .bulk_save_objects (cited_patent_objects )
431437 session .commit ()
432- # TODO: add patents not in Patents table
433-
434438
435439def add_patents (patents ):
436440 patent_objects = []
437441 for p in patents :
438- cpc_subsection_id = "" # "cpc_subsection": "https://search.patentsview.org/api/v1/cpc_subsection/B01/"
442+ cpc_group_id = "" # now there is a cpc_group_id attribute, ex "cpc_group_id":"A47J43/0727"
439443
440- # Concatenate the CPC cpc_subsection_id codes into the 'cpc_subsection_id ' field
444+ # Concatenate the CPC cpc_group_id codes into the 'cpc_group_id ' field
441445 # Example entry: 'A47J43/0727; ' (each cpc_group_id is followed by '; ')
442- for mainclass in p ["cpc_current" ]:
443- if mainclass ["cpc_subsection" ]:
444- m = re .search (r'/([^/]*)/$' , mainclass ["cpc_subsection" ])
445- cpc_subsection_id += m .group (1 ) + "; "
446- if not cpc_subsection_id :
447- cpc_subsection_id = None
446+
447+ # a design patent wouldn't have cpcs and about 50% of plant patents don't have them
448+ if "cpc_current" in p :
449+ for mainclass in p ["cpc_current" ]:
450+ if mainclass ["cpc_group_id" ]:
451+ cpc_group_id += mainclass ["cpc_group_id" ] + "; "
452+ if not cpc_group_id :
453+ cpc_group_id = None
454+ else :
455+ cpc_group_id = None
448456
449457 # A patent can have multiple assignees. If the assignee organization is in one of our tables (e.g. Companies,
450458 # AlternateNames), add an entry in the patents table for each name
451- # It's also possible that a cited patent had no assignees_at_grant
452- if "assignees_at_grant " in p :
453- for assignee in p ["assignees_at_grant " ]:
459+ # It's also possible that a cited patent had no assignees
460+ if "assignees " in p :
461+ for assignee in p ["assignees " ]:
454462 # The new version of the API doesn't seem to have an "assignee_key_id"
455463 # There is also an "assignee_key_id" field, which is currently unused
456- assignee_organization = assignee ["organization " ]
464+ assignee_organization = assignee ["assignee_organization " ]
457465 if assignee_organization :
458466 assignee_organization = assignee_organization .lower ()
459- assignee_first_name = assignee ["name_first" ]
460- assignee_last_name = assignee ["name_last" ]
467+ if "assignee_name_first" in assignee :
468+ assignee_first_name = assignee ["assignee_name_first" ]
469+ else :
470+ assignee_first_name = None
471+ if "assignee_name_last" in assignee :
472+ assignee_last_name = assignee ["assignee_name_last" ]
473+ else :
474+ assignee_last_name = None
461475
462476 # Check if the assignee is in one of the tables: companies, alternate_names
463477 assignee_id = session .query (Company .id ).filter (
@@ -474,7 +488,7 @@ def add_patents(patents):
474488 assignee_id = result .company_id
475489 assignee_alternate_id = result .id
476490
477- p_obj = Patent (patent_number = p ["patent_number " ],
491+ p_obj = Patent (patent_number = p ["patent_id " ],
478492 patent_title = p ["patent_title" ],
479493 company_id = assignee_id ,
480494 year = p ["patent_year" ],
@@ -487,7 +501,7 @@ def add_patents(patents):
487501
488502 # Check if the patent is already in the database; add it if it is not
489503 # TODO: change this so that the database is not read so frequently from disk
490- if session .query (Patent ).filter_by (patent_number = p ["patent_number " ],
504+ if session .query (Patent ).filter_by (patent_number = p ["patent_id " ],
491505 company_id = assignee_id ,
492506 company_alternate_name_id = assignee_alternate_id ,
493507 assignee_first_name = assignee_first_name ,
0 commit comments