Skip to content

Commit 1528557

Browse files
committed
Refactor add_patents() and related functions to allow cited patents to be added without a company ID
1 parent 2ad787e commit 1528557

File tree

1 file changed

+30
-34
lines changed

1 file changed

+30
-34
lines changed

src/main.py

Lines changed: 30 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,6 @@
1212
import re
1313
from typing import List
1414

15-
1615
Base = declarative_base()
1716
engine = create_engine('sqlite:///patentsview.db')
1817

@@ -94,8 +93,8 @@ class CitedPatent(Base):
9493
)
9594

9695
id = Column(Integer, primary_key=True)
97-
citing_patent_number = Column(String, ForeignKey('patents.patent_number'))
98-
cited_patent_number = Column(String, ForeignKey('patents.patent_number'))
96+
citing_patent_number = Column(String) #, ForeignKey('patents.patent_number'))
97+
cited_patent_number = Column(String) #, ForeignKey('patents.patent_number'))
9998

10099
def __init__(self, patent_number, cited_patent_number):
101100
self.citing_patent_number = patent_number
@@ -107,12 +106,10 @@ def __init__(self, patent_number, cited_patent_number):
107106
dbSession = sessionmaker(bind=engine)
108107
session = dbSession()
109108

110-
111109
# setting for searching for company name
112110
# e.g.: "_eq", "_begins", etc.
113111
COMPANY_SEARCH_CRITERIA = '_eq'
114112

115-
116113
# Application Variables
117114
search_base_url = "https://dev.patentsview.org/"
118115
patent_search_endpoint = search_base_url + "api/patents/query"
@@ -227,7 +224,7 @@ def insert_names(file_path):
227224
index = df.columns.get_loc("Name 1")
228225
primary_name = row[index]
229226
primary_id = session.query(Company.id).filter_by(name=primary_name).scalar()
230-
alternate_names = [name for name in row[index+1:] if type(name) == str]
227+
alternate_names = [name for name in row[index + 1:] if type(name) == str]
231228
insert_alternate_names(primary_id, alternate_names, False)
232229
session.commit()
233230

@@ -244,7 +241,7 @@ def get_company_primary_id(name):
244241

245242
def fetch_all_cited_patent_numbers_for_all_patents_in_db(verbose=False):
246243
l = []
247-
for number in session.query(Patent.patent_number).all():
244+
for number in session.query(Patent.patent_number).distinct().all():
248245
l.append(number.patent_number)
249246
add_cited_patent_numbers(l, verbose=verbose)
250247

@@ -259,7 +256,7 @@ def add_cited_patents(limit=25, verbose=False):
259256
patents_in_db = session.query(Patent.patent_number)
260257
cited_patents_to_add = [x.cited_patent_number for x in session.query(CitedPatent.cited_patent_number)\
261258
.filter(~CitedPatent.cited_patent_number.in_(patents_in_db)).all()]
262-
for patents in fetch_patents_by_number(cited_patents_to_add, results_format , limit=limit, verbose=verbose):
259+
for patents in fetch_patents_by_number(cited_patents_to_add, results_format, limit=limit, verbose=verbose):
263260
add_patents(patents)
264261

265262

@@ -376,31 +373,31 @@ def add_patents(patents):
376373
else:
377374
# TODO find a company/patent that satisfies this path so that this can be tested
378375
# TODO handle case where there is no assignee organization, just an individual's first & last name
379-
result = session.query(AlternateName.id, AlternateName.company_id)\
376+
result = session.query(AlternateName.id, AlternateName.company_id) \
380377
.filter(func.lower(AlternateName.name) == assignee_organization).first()
381378
if result:
382379
assignee_id = result.company_id
383380
assignee_alternate_id = result.id
384381

385-
# If it is, add the record
386-
if assignee_id:
387-
p_obj = Patent(patent_number=p["patent_number"],
388-
patent_title=p["patent_title"],
389-
company_id=assignee_id,
390-
year=p["patent_year"],
391-
grant_date=p["patent_date"],
392-
uspc_class=uspc_main_classes,
393-
assignee_first_name=assignee_first_name,
394-
assignee_last_name=assignee_last_name,
395-
company_alternate_name_id=assignee_alternate_id
396-
)
397-
398-
# Check if the patent is already in the database; add it if it is not
399-
# TODO: change this so that the database is not read so frequently from disk
400-
if session.query(Patent)\
401-
.filter_by(patent_number=p["patent_number"], company_id=assignee_id,
402-
company_alternate_name_id=assignee_alternate_id).first() is None:
403-
patent_objects.append(p_obj)
382+
p_obj = Patent(patent_number=p["patent_number"],
383+
patent_title=p["patent_title"],
384+
company_id=assignee_id,
385+
year=p["patent_year"],
386+
grant_date=p["patent_date"],
387+
uspc_class=uspc_main_classes,
388+
assignee_first_name=assignee_first_name,
389+
assignee_last_name=assignee_last_name,
390+
company_alternate_name_id=assignee_alternate_id
391+
)
392+
393+
# Check if the patent is already in the database; add it if it is not
394+
# TODO: change this so that the database is not read so frequently from disk
395+
if session.query(Patent).filter_by(patent_number=p["patent_number"],
396+
company_id=assignee_id,
397+
company_alternate_name_id=assignee_alternate_id,
398+
assignee_first_name=assignee_first_name,
399+
assignee_last_name=assignee_last_name,).first() is None:
400+
patent_objects.append(p_obj)
404401

405402
# Save the patents
406403
session.bulk_save_objects(patent_objects)
@@ -409,7 +406,8 @@ def add_patents(patents):
409406

410407
def fetch_patents_for_all_companies_in_db(resume_from_company_id=None, verbose=False):
411408
if resume_from_company_id and type(resume_from_company_id) == int:
412-
company_query = session.query(Company.id).filter(Company.id >= resume_from_company_id).order_by(Company.id.asc()).all()
409+
company_query = session.query(Company.id).filter(Company.id >= resume_from_company_id).order_by(
410+
Company.id.asc()).all()
413411
else:
414412
company_query = session.query(Company.id).order_by(Company.id.asc()).all()
415413

@@ -449,7 +447,6 @@ def main():
449447
end_date = options.end_date[0]
450448

451449
# TODO: implement functionality that uses the Start and End dates
452-
"""
453450
if options.fetch_patents_for_all_companies:
454451
company_id = options.resume_from_company_id
455452
if company_id:
@@ -460,14 +457,13 @@ def main():
460457
fetch_patents_for_all_companies_in_db()
461458

462459
fetch_all_cited_patent_numbers_for_all_patents_in_db()
463-
"""
464-
add_cited_patents()
460+
add_cited_patents(verbose=True)
465461

466462

467463
def get_options():
468464
parser = argparse.ArgumentParser(description="A script that calls the PatentsView API.",
469-
# formatter_class=argparse.RawDescriptionHelpFormatter
470-
)
465+
# formatter_class=argparse.RawDescriptionHelpFormatter
466+
)
471467

472468
parser.add_argument(
473469
'-p', '--path', type=str, metavar="path",

0 commit comments

Comments
 (0)