1414
1515
1616class HDFTableCopy (BaseCopy ):
17+ """
18+ Class for handling a standard case of reading a table from an HDF file into a pandas
19+ DataFrame, iterating over it in chunks, and COPYing to PostgreSQL via StringIO CSV
20+ """
21+
1722 def __init__ (
1823 self ,
1924 hdf_tables : List [str ],
@@ -24,6 +29,19 @@ def __init__(
2429 sql_table : str = None ,
2530 csv_chunksize : int = 10 ** 6 ,
2631 ):
32+ """
33+ Parameters
34+ ----------
35+ hdf_tables: list of HDF keys with data corresponding to destination SQL table
36+ (assumption being that HDF tables:SQL tables is many:one)
37+ hdf_meta: HDFMetadata object with information from the store
38+ defer_sql_objs: multiprocessing has issues with passing SQLAlchemy objects, so if
39+ True, defer attributing these to the object until after it is pickled by Pool
40+ conn: SQLAlchemy connection managed outside of the object
41+ table_obj: SQLAlchemy object for the destination SQL Table
42+ sql_table: string of SQL table name
43+ csv_chunksize: max rows to keep in memory when generating CSV for COPY
44+ """
2745 super ().__init__ (defer_sql_objs , conn , table_obj , sql_table , csv_chunksize )
2846
2947 self .hdf_tables = hdf_tables
@@ -34,6 +52,17 @@ def __init__(
3452 self .hdf_chunksize = hdf_meta .chunksize
3553
3654 def copy (self , data_formatters = [cast_pandas ], data_formatter_kwargs = {}):
55+ """
56+ Go through sequence to COPY data to PostgreSQL table, including dropping Primary
57+ and Foreign Keys to optimize speed, TRUNCATE table, COPY data, recreate keys,
58+ and run ANALYZE.
59+
60+ Parameters
61+ ----------
62+ data_formatters: list of functions to apply to df during sequence. Note that
63+ each of these functions should be able to handle kwargs for one another
64+ data_formatter_kwargs: dict of kwargs to pass to data_formatters functions
65+ """
3766 self .drop_fks ()
3867 self .drop_pk ()
3968
@@ -50,6 +79,15 @@ def copy(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
5079 self .analyze ()
5180
5281 def hdf_to_pg (self , data_formatters = [cast_pandas ], data_formatter_kwargs = {}):
82+ """
83+ Copy each HDF table that relates to SQL table to database
84+
85+ Parameters
86+ ----------
87+ data_formatters: list of functions to apply to df during sequence. Note that
88+ each of these functions should be able to handle kwargs for one another
89+ data_formatter_kwargs: dict of kwargs to pass to data_formatters functions
90+ """
5391 if self .hdf_tables is None :
5492 logger .warn (f"No HDF table found for SQL table { self .sql_table } " )
5593 return
@@ -81,6 +119,11 @@ def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
81119
82120
83121class SmallHDFTableCopy (HDFTableCopy ):
122+ """
123+ Class for handling the case where the table is small enough to be stored completely
124+ in-memory for both reading from the HDF as well as COPYing using StringIO.
125+ """
126+
84127 def __init__ (
85128 self ,
86129 hdf_tables : List [str ],
@@ -91,6 +134,19 @@ def __init__(
91134 sql_table : str = None ,
92135 csv_chunksize : int = 10 ** 6 ,
93136 ):
137+ """
138+ Parameters
139+ ----------
140+ hdf_tables: list of HDF keys with data corresponding to destination SQL table
141+ (assumption being that HDF tables:SQL tables is many:one)
142+ hdf_meta: HDFMetadata object with information from the store
143+ defer_sql_objs: multiprocessing has issues with passing SQLAlchemy objects, so if
144+ True, defer attributing these to the object until after it is pickled by Pool
145+ conn: SQLAlchemy connection managed outside of the object
146+ table_obj: SQLAlchemy object for the destination SQL Table
147+ sql_table: string of SQL table name
148+ csv_chunksize: max rows to keep in memory when generating CSV for COPY
149+ """
94150 super ().__init__ (
95151 hdf_tables ,
96152 hdf_meta ,
@@ -102,6 +158,15 @@ def __init__(
102158 )
103159
104160 def hdf_to_pg (self , data_formatters = [cast_pandas ], data_formatter_kwargs = {}):
161+ """
162+ Copy each HDF table that relates to SQL table to database
163+
164+ Parameters
165+ ----------
166+ data_formatters: list of functions to apply to df during sequence. Note that
167+ each of these functions should be able to handle kwargs for one another
168+ data_formatter_kwargs: dict of kwargs to pass to data_formatters functions
169+ """
105170 if self .hdf_tables is None :
106171 logger .warn ("No HDF table found for SQL table {self.sql_table}" )
107172 return
@@ -129,6 +194,14 @@ def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
129194
130195
131196class BigHDFTableCopy (HDFTableCopy ):
197+ """
198+ Class for handling the special case of particularly large tables. For these, we
199+ iterate over reading the table in the HDF as well as iterating again over each of
200+ those chunks in order to keep the number of rows stored in-memory to a reasonable
201+ size. Note that these are iterated using pd.read_hdf(..., start, stop) rather than
202+ pd.read_hdf(..., iterator=True) because we found the performance was much better.
203+ """
204+
132205 def __init__ (
133206 self ,
134207 hdf_tables : List [str ],
@@ -139,6 +212,19 @@ def __init__(
139212 sql_table : str = None ,
140213 csv_chunksize : int = 10 ** 6 ,
141214 ):
215+ """
216+ Parameters
217+ ----------
218+ hdf_tables: list of HDF keys with data corresponding to destination SQL table
219+ (assumption being that HDF tables:SQL tables is many:one)
220+ hdf_meta: HDFMetadata object with information from the store
221+ defer_sql_objs: multiprocessing has issues with passing SQLAlchemy objects, so if
222+ True, defer attributing these to the object until after it is pickled by Pool
223+ conn: SQLAlchemy connection managed outside of the object
224+ table_obj: SQLAlchemy object for the destination SQL Table
225+ sql_table: string of SQL table name
226+ csv_chunksize: max rows to keep in memory when generating CSV for COPY
227+ """
142228 super ().__init__ (
143229 hdf_tables ,
144230 hdf_meta ,
@@ -150,6 +236,15 @@ def __init__(
150236 )
151237
152238 def hdf_to_pg (self , data_formatters = [cast_pandas ], data_formatter_kwargs = {}):
239+ """
240+ Copy each HDF table that relates to SQL table to database
241+
242+ Parameters
243+ ----------
244+ data_formatters: list of functions to apply to df during sequence. Note that
245+ each of these functions should be able to handle kwargs for one another
246+ data_formatter_kwargs: dict of kwargs to pass to data_formatters functions
247+ """
153248 if self .hdf_tables is None :
154249 logger .warn (f"No HDF table found for SQL table { self .sql_table } " )
155250 return
0 commit comments