Skip to content

Commit eeac105

Browse files
committed
attribute name changes
1 parent e4f4ceb commit eeac105

File tree

1 file changed

+49
-35
lines changed

1 file changed

+49
-35
lines changed

src/main.py

Lines changed: 49 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -189,25 +189,31 @@ def get_one_page_of_company_patents(company, beginning_year=None, end_year=None,
189189
else:
190190
search_query = company_query
191191

192-
results_format = ('["patent_number","patent_date","patent_year","assignees_at_grant.organization",'
193-
'"cpc_current.cpc_subsection_id","cpc_current",'
194-
'"patent_title","assignees_at_grant.name_first","assignees_at_grant.name_last"]'
192+
results_format = ('["patent_id","patent_date","patent_year","assignees.assignee_organization",'
193+
'"cpc_current.cpc_group_id","cpc_current",'
194+
'"patent_title","assignees.assignee_name_first","assignees.assignee_name_last"]'
195195
)
196+
197+
options = {}
198+
options["size"] = perpage # the API defaults "size" to 100 rows if not specified and it can be up to 1000
196199

197-
# old way: options_param = '{"page":%d,"per_page":%d}' % (page, perpage)
198-
# new version of the api:
199-
options_param = '{"offset":%d, "size":%d}' % ((page-1) * perpage, perpage)
200+
# paging the newest way: we have to supply the previous page's last element in the "after" parameter
201+
# it also means that there has to be a sort
202+
203+
if after != "":
204+
options["after"] = after
200205

201-
response_in_json = patentsview_post_request(patent_search_endpoint, search_query,
206+
options_param = json.dumps(options)
207+
208+
response = patentsview_post_request(patent_search_endpoint, search_query,
202209
results_format, options_param=options_param, verbose=verbose)
203-
response = json.loads(response_in_json)
204210
if verbose:
205211
print(response)
206212
return response
207213

208214

209215
# https://stackoverflow.com/a/41837318/6288413
210-
# sort_param could be something like '[{"patent_date":"desc"},{"patent_number":"desc"}]'
216+
# sort_param could be something like '[{"patent_date":"desc"},{"patent_id":"desc"}]'
211217
def patentsview_post_request(endpoint, query_param, format_param=None, options_param=None, sort_param=None,
212218
verbose=False):
213219
if not endpoint:
@@ -305,17 +311,17 @@ def get_company_primary_id(name):
305311
def fetch_all_cited_patent_numbers_for_all_patents_in_db(verbose=False):
306312
l = []
307313
for number in session.query(Patent.patent_number).distinct().all():
308-
l.append(number.patent_number)
314+
l.append('"' + number.patent_number + '"')
309315
add_cited_patent_numbers(l, verbose=verbose)
310316

311317

312-
def add_cited_patents(limit=25, verbose=False):
318+
def add_cited_patents(limit=REQUEST_SIZE, verbose=False):
313319
# This function populates the patents table with the missing information for the
314320
# patent numbers found in the cited_patents table
315321
# TODO refactor this function to accept a list of patents
316-
results_format = ('["patent_number","patent_date","patent_year",'
317-
'"assignees_at_grant.organization","cpc_current",'
318-
'"patent_title","assignees_at_grant.name_first","assignees_at_grant.name_last"]'
322+
results_format = ('["patent_id","patent_date","patent_year",'
323+
'"assignees.assignee_organization","cpc_current",'
324+
'"patent_title","assignees.assignee_name_first","assignees.assignee_name_last"]'
319325
)
320326
patents_in_db = session.query(Patent.patent_number)
321327
cited_patents_to_add = [x.cited_patent_number for x in session.query(CitedPatent.cited_patent_number)\
@@ -411,11 +417,11 @@ def add_cited_patent_numbers_to_db(citations: List) -> None:
411417
# Patents fetched
412418
cited_patent_objects = []
413419
# Add ALL cited patents to cited_patent_objects list
414-
# API change: now we get a list like these {"patent_number":"7767404","cited_patent_number":"7494776"}
420+
# API change: now we get a list of entries like this: {"patent_id":"7767404","citation_patent_id":"7494776"}
415421
for citation in citations:
416-
patent_number = citation["patent_number"]
422+
patent_number = citation["patent_id"]
417423
# Check if there are cited patents in the results and if they are already in the database
418-
cited_patent_number = citation["cited_patent_number"]
424+
cited_patent_number = citation["citation_patent_id"]
419425
if cited_patent_number:
420426
cited_patent_objects.append((patent_number, cited_patent_number))
421427

@@ -429,35 +435,43 @@ def add_cited_patent_numbers_to_db(citations: List) -> None:
429435

430436
session.bulk_save_objects(cited_patent_objects)
431437
session.commit()
432-
# TODO: add patents not in Patents table
433-
434438

435439
def add_patents(patents):
436440
patent_objects = []
437441
for p in patents:
438-
cpc_subsection_id = "" # "cpc_subsection": "https://search.patentsview.org/api/v1/cpc_subsection/B01/"
442+
cpc_group_id = "" # now there is a cpc_group_id attribute, ex "cpc_group_id":"A47J43/0727"
439443

440-
# Concatenate the CPC cpc_subsection_id codes into the 'cpc_subsection_id' field
444+
# Concatenate the CPC cpc_group_id codes into the 'cpc_group_id' field
441445
# Example entry: 'A47J43/0727; '
442-
for mainclass in p["cpc_current"]:
443-
if mainclass["cpc_subsection"]:
444-
m = re.search(r'/([^/]*)/$', mainclass["cpc_subsection"])
445-
cpc_subsection_id += m.group(1) + "; "
446-
if not cpc_subsection_id:
447-
cpc_subsection_id = None
446+
447+
# a design patent wouldn't have cpcs and about 50% of plant patents don't have them
448+
if "cpc_current" in p:
449+
for mainclass in p["cpc_current"]:
450+
if mainclass["cpc_group_id"]:
451+
cpc_group_id += mainclass["cpc_group_id"] + "; "
452+
if not cpc_group_id:
453+
cpc_group_id = None
454+
else:
455+
cpc_group_id = None
448456

449457
# A patent can have multiple assignees. If the assignee organization is in one of our tables (e.g. Companies,
450458
# AlternateNames), add an entry in the patents table for each name
451-
# It's also possible that a cited patent had no assignees_at_grant
452-
if "assignees_at_grant" in p:
453-
for assignee in p["assignees_at_grant"]:
459+
# It's also possible that a cited patent had no assignees
460+
if "assignees" in p:
461+
for assignee in p["assignees"]:
454462
# The new version of the API doesn't seem to have an "assignee_key_id"
455463
# There is also an "assignee_key_id" field, which is currently unused
456-
assignee_organization = assignee["organization"]
464+
assignee_organization = assignee["assignee_organization"]
457465
if assignee_organization:
458466
assignee_organization = assignee_organization.lower()
459-
assignee_first_name = assignee["name_first"]
460-
assignee_last_name = assignee["name_last"]
467+
if "assignee_name_first" in assignee:
468+
assignee_first_name = assignee["assignee_name_first"]
469+
else:
470+
assignee_first_name = None
471+
if "assignee_name_last" in assignee:
472+
assignee_last_name = assignee["assignee_name_last"]
473+
else:
474+
assignee_last_name = None
461475

462476
# Check if the assignee is in one of the tables: companies, alternate_names
463477
assignee_id = session.query(Company.id).filter(
@@ -474,7 +488,7 @@ def add_patents(patents):
474488
assignee_id = result.company_id
475489
assignee_alternate_id = result.id
476490

477-
p_obj = Patent(patent_number=p["patent_number"],
491+
p_obj = Patent(patent_number=p["patent_id"],
478492
patent_title=p["patent_title"],
479493
company_id=assignee_id,
480494
year=p["patent_year"],
@@ -487,7 +501,7 @@ def add_patents(patents):
487501

488502
# Check if the patent is already in the database; add it if it is not
489503
# TODO: change this so that the database is not read so frequently from disk
490-
if session.query(Patent).filter_by(patent_number=p["patent_number"],
504+
if session.query(Patent).filter_by(patent_number=p["patent_id"],
491505
company_id=assignee_id,
492506
company_alternate_name_id=assignee_alternate_id,
493507
assignee_first_name=assignee_first_name,

0 commit comments

Comments
 (0)