From 34225673ea76920c02969e1c8079f158a4f331fc Mon Sep 17 00:00:00 2001 From: Dan Mahoney Date: Wed, 22 Feb 2023 13:02:51 -0500 Subject: [PATCH 1/2] add support for different data types --- .idea/.gitignore | 8 +++++ .idea/clickhouse-sql-examples.iml | 12 +++++++ .idea/inspectionProfiles/Project_Default.xml | 23 ++++++++++++ .../inspectionProfiles/profiles_settings.xml | 6 ++++ .idea/misc.xml | 4 +++ .idea/modules.xml | 8 +++++ .idea/vcs.xml | 6 ++++ parquet/generate-ch-schema.py | 36 ++++++++++++++----- 8 files changed, 94 insertions(+), 9 deletions(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/clickhouse-sql-examples.iml create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/clickhouse-sql-examples.iml b/.idea/clickhouse-sql-examples.iml new file mode 100644 index 0000000..8b8c395 --- /dev/null +++ b/.idea/clickhouse-sql-examples.iml @@ -0,0 +1,12 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..592c8f0 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,23 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..d56657a --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..c8ad0a4 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/parquet/generate-ch-schema.py b/parquet/generate-ch-schema.py index 7629668..e8f1744 100644 --- a/parquet/generate-ch-schema.py +++ b/parquet/generate-ch-schema.py @@ -23,12 +23,30 @@ def pq_to_ch_type(pq_type): else: ch_type = "Int256" + elif pq_type.logical_type.type == "INT" and pq_type.physical_type == "INT32": + ch_type = "Int32" + elif pq_type.logical_type.type == "STRING": ch_type = "String" elif pq_type.logical_type.type == "DATE": ch_type = "Date" + elif pq_type.logical_type.type == "NONE" and pq_type.physical_type == "BOOLEAN": + ch_type = "Bool" + + elif pq_type.logical_type.type == "NONE" and pq_type.physical_type == "DOUBLE": + ch_type = "Float64" + + elif pq_type.logical_type.type == "NONE" and pq_type.physical_type == "INT32": + ch_type = "Int32" + + elif pq_type.logical_type.type == "NONE" and pq_type.physical_type == "INT64": + ch_type = "Int64" + + elif pq_type.logical_type.type == "NONE" and pq_type.physical_type == "INT96": + ch_type = "DateTime64" + else: print(pq_type, type(pq_type)) raise Exception(f"Unknown type: {pq_type}") @@ -55,7 +73,7 @@ def fail(msg): exit(1) def process(): - # Print a header. + # Print a header. print("-- Automatically generated DDL and INSERT for Parquet data") # We need region to read S3 and generate URLs. @@ -65,7 +83,7 @@ def process(): print("-- AWS_REGION: " + AWS_REGION) # The AWS access key and secret key are optional for loading into S3. - # You can also use a bucket with open permissions. + # You can also use a bucket with open permissions. AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID') AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY') @@ -74,16 +92,16 @@ def process(): if not S3_DATASET_PATH: fail("S3_DATASET_PATH environment variable not set") print("-- S3_DATASET_PATH: " + S3_DATASET_PATH) - - # We need to get the table from the path. It is the last whole word - # in the path before the trailing /. + + # We need to get the table from the path. It is the last whole word + # in the path before the trailing /. regex_match = re.search('([A-Za-z0-9_]+)/$', S3_DATASET_PATH) if not regex_match: fail("S3_DATASET_PATH must have form bucket/dir/.../dir/table_name/") table_name = regex_match.group(1) print("-- Table name: " + table_name) - - # Open up the parquet dataset in S3. + + # Open up the parquet dataset in S3. s3 = fs.S3FileSystem(region=AWS_REGION) pq_dataset = pq.ParquetDataset(S3_DATASET_PATH, filesystem=s3) @@ -93,7 +111,7 @@ def process(): fail("S3_DATASET_PATH does not have any Parquet files!") pq_fragment = pq_dataset.fragments[0] - # Generate CREATE TABLE command. + # Generate CREATE TABLE command. ch_columns = pq_columns_to_ch_columns(pq_fragment.metadata) print("CREATE TABLE IF NOT EXISTS {0} (".format(table_name)) @@ -106,7 +124,7 @@ def process(): print("ORDER BY tuple()") print("") - # Generate INSERT with SELECT from S3 URL. + # Generate INSERT with SELECT from S3 URL. if AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY: aws_credentials = "'{0}', '{1}',".format(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) else: From ee1575d84a548b2108dd8bed65fc00726b387aae Mon Sep 17 00:00:00 2001 From: Dan Mahoney Date: Wed, 22 Feb 2023 13:03:24 -0500 Subject: [PATCH 2/2] add support for different data types --- .idea/.gitignore | 8 ------- .idea/clickhouse-sql-examples.iml | 12 ---------- .idea/inspectionProfiles/Project_Default.xml | 23 ------------------- .../inspectionProfiles/profiles_settings.xml | 6 ----- .idea/misc.xml | 4 ---- .idea/modules.xml | 8 ------- .idea/vcs.xml | 6 ----- 7 files changed, 67 deletions(-) delete mode 100644 .idea/.gitignore delete mode 100644 .idea/clickhouse-sql-examples.iml delete mode 100644 .idea/inspectionProfiles/Project_Default.xml delete mode 100644 .idea/inspectionProfiles/profiles_settings.xml delete mode 100644 .idea/misc.xml delete mode 100644 .idea/modules.xml delete mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 13566b8..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml -# Editor-based HTTP Client requests -/httpRequests/ -# Datasource local storage ignored files -/dataSources/ -/dataSources.local.xml diff --git a/.idea/clickhouse-sql-examples.iml b/.idea/clickhouse-sql-examples.iml deleted file mode 100644 index 8b8c395..0000000 --- a/.idea/clickhouse-sql-examples.iml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - - - \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml deleted file mode 100644 index 592c8f0..0000000 --- a/.idea/inspectionProfiles/Project_Default.xml +++ /dev/null @@ -1,23 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index d56657a..0000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index c8ad0a4..0000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 94a25f7..0000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file