@@ -11,42 +11,46 @@ class HDFTableCopy(BaseCopy):
 
     def __init__(
         self,
+        file_name,
         hdf_tables,
-        hdf_meta,
         defer_sql_objs=False,
         conn=None,
         table_obj=None,
         sql_table=None,
         csv_chunksize=10 ** 6,
+        hdf_chunksize=10 ** 7,
+        hdf_metadata=None,
     ):
         """
         Parameters
         ----------
+        file_name
         hdf_tables: list of strings
             HDF keys with data corresponding to destination SQL table
             (assumption being that HDF tables:SQL tables is many:one)
-        hdf_meta: HDFMetadata object
-            Information from the HDF file for use in building copy objects
         defer_sql_objs: bool
             multiprocessing has issue with passing SQLALchemy objects, so if
             True, defer attributing these to the object until after pickled by Pool
-        conn: SQLAlchemy connection
+        conn: SQLAlchemy connection or None
             Managed outside of the object
-        table_obj: SQLAlchemy model object
+        table_obj: SQLAlchemy model object or None
             Destination SQL Table
-        sql_table: string
+        sql_table: string or None
             SQL table name
         csv_chunksize: int
             Max rows to keep in memory when generating CSV for COPY
+        hdf_chunksize: int
+            Max rows to keep in memory when reading HDF file
+        hdf_metadata: dict or None
+            Dict of HDF table keys to dict of constant:value pairs. Not actively used by
+            any pre-defined function, but available to data_formatting method
         """
         super().__init__(defer_sql_objs, conn, table_obj, sql_table, csv_chunksize)
 
         self.hdf_tables = hdf_tables
-
-        # Info from the HDFMetadata object
-        self.hdf_metadata = hdf_meta.metadata_vars
-        self.file_name = hdf_meta.file_name
-        self.hdf_chunksize = hdf_meta.chunksize
+        self.hdf_metadata = hdf_metadata
+        self.file_name = file_name
+        self.hdf_chunksize = hdf_chunksize
 
     def copy(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
         """
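
For orientation, here is a minimal sketch of how the reworked constructor is called after this change: the HDF file path, HDF chunk size, and optional metadata dict are passed directly instead of being pulled off an HDFMetadata object. The file path, HDF keys, destination table name, and metadata values below are hypothetical, and the import assumes the class is exposed at the package's top level.

from pandas_to_postgres import HDFTableCopy

# Hypothetical file, keys, and destination table; defer_sql_objs=True postpones
# attaching SQLAlchemy objects so the copy object can be pickled by a Pool worker.
copier = HDFTableCopy(
    file_name="./data.h5",
    hdf_tables=["states_2018", "states_2019"],  # HDF keys feeding one SQL table
    defer_sql_objs=True,
    sql_table="states",
    csv_chunksize=10 ** 6,   # rows held in memory while building each COPY CSV
    hdf_chunksize=10 ** 7,   # rows read from the HDF file per chunk
    hdf_metadata={"states_2018": {"year": 2018}},  # constants available to formatters
)
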
@@ -121,46 +125,6 @@ class SmallHDFTableCopy(HDFTableCopy):
     in-memory for both reading from the HDF as well as COPYing using StringIO.
     """
 
-    def __init__(
-        self,
-        hdf_tables,
-        hdf_meta,
-        defer_sql_objs=False,
-        conn=None,
-        table_obj=None,
-        sql_table=None,
-        csv_chunksize=10 ** 6,
-    ):
-        """
-        Parameters
-        ----------
-        hdf_tables: list of strings
-            HDF keys with data corresponding to destination SQL table
-            (assumption being that HDF tables:SQL tables is many:one)
-        hdf_meta: HDFMetadata object
-            Information from the HDF file for use in building copy objects
-        defer_sql_objs: bool
-            multiprocessing has issue with passing SQLALchemy objects, so if
-            True, defer attributing these to the object until after pickled by Pool
-        conn: SQLAlchemy connection
-            Managed outside of the object
-        table_obj: SQLAlchemy model object
-            Destination SQL Table
-        sql_table: string
-            SQL table name
-        csv_chunksize: int
-            Max rows to keep in memory when generating CSV for COPY
-        """
-        super().__init__(
-            hdf_tables,
-            hdf_meta,
-            defer_sql_objs,
-            conn,
-            table_obj,
-            sql_table,
-            csv_chunksize,
-        )
-
     def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
         """
         Copy each HDF table that relates to SQL table to database
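
The class docstring above describes SmallHDFTableCopy's approach: read the whole HDF table into memory and COPY it through an in-memory StringIO buffer. The snippet below sketches that general pattern only, not the library's internals (which route the connection through SQLAlchemy); the file path, HDF key, destination table, and DSN are all hypothetical.

import io

import pandas as pd
import psycopg2  # assumed here purely to illustrate COPY FROM STDIN

# Read the entire HDF table into memory (hypothetical file and key).
df = pd.read_hdf("./data.h5", key="states_2018")

# Serialize it to CSV in an in-memory buffer and stream that to PostgreSQL.
buf = io.StringIO()
df.to_csv(buf, index=False, header=False)
buf.seek(0)

conn = psycopg2.connect("dbname=mydb")  # hypothetical DSN
with conn, conn.cursor() as cur:
    cur.copy_expert("COPY states FROM STDIN WITH CSV", buf)
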
@@ -206,46 +170,6 @@ class BigHDFTableCopy(HDFTableCopy):
     pd.read_hdf(..., iterator=True) because we found the performance was much better.
     """
 
-    def __init__(
-        self,
-        hdf_tables,
-        hdf_meta,
-        defer_sql_objs=False,
-        conn=None,
-        table_obj=None,
-        sql_table=None,
-        csv_chunksize=10 ** 6,
-    ):
-        """
-        Parameters
-        ----------
-        hdf_tables: list of strings
-            HDF keys with data corresponding to destination SQL table
-            (assumption being that HDF tables:SQL tables is many:one)
-        hdf_meta: HDFMetadata object
-            Information from the HDF file for use in building copy objects
-        defer_sql_objs: bool
-            multiprocessing has issue with passing SQLALchemy objects, so if
-            True, defer attributing these to the object until after pickled by Pool
-        conn: SQLAlchemy connection
-            Managed outside of the object
-        table_obj: SQLAlchemy model object
-            Destination SQL Table
-        sql_table: string
-            SQL table name
-        csv_chunksize: int
-            Max rows to keep in memory when generating CSV for COPY
-        """
-        super().__init__(
-            hdf_tables,
-            hdf_meta,
-            defer_sql_objs,
-            conn,
-            table_obj,
-            sql_table,
-            csv_chunksize,
-        )
-
     def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
         """
         Copy each HDF table that relates to SQL table to database
@@ -275,7 +199,7 @@ def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
         start = 0
 
         for i in range(n_chunks):
-            logger.info("*** HDF chunk {i + 1} of {} ***".format(n_chunks))
+            logger.info("*** HDF chunk {i} of {n} ***".format(i=i + 1, n=n_chunks))
             logger.info("Reading HDF table")
             stop = min(start + self.hdf_chunksize, nrows)
             df = pd.read_hdf(self.file_name, key=hdf_table, start=start, stop=stop)
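
Two things happen in this last hunk. The removed logger line mixed an f-string-style placeholder ("{i + 1}") into str.format, which fails at runtime with a KeyError because no argument named "i + 1" is supplied; the replacement switches to named fields. The surrounding loop is BigHDFTableCopy's chunked read: explicit start/stop offsets into the HDF table rather than pd.read_hdf(..., iterator=True), which the class docstring notes performed worse. Below is a self-contained sketch of that pattern, assuming the table was written in PyTables "table" format (start/stop slicing requires it) and using a hypothetical file path and key.

import pandas as pd

file_name, hdf_table = "./data.h5", "states_2018"  # hypothetical path and key
hdf_chunksize = 10 ** 7

# Look up the row count once so the number of chunks is known up front.
with pd.HDFStore(file_name, mode="r") as store:
    nrows = store.get_storer(hdf_table).nrows

n_chunks = -(-nrows // hdf_chunksize)  # ceiling division
start = 0
for i in range(n_chunks):
    stop = min(start + hdf_chunksize, nrows)
    df = pd.read_hdf(file_name, key=hdf_table, start=start, stop=stop)
    # ... format df and COPY it to the destination table here ...
    start += hdf_chunksize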