@@ -6,24 +6,35 @@ using OMOPCommonDataModel
66using Serialization
77using InlineStrings
88using Dates
9- import FeatureTransforms:
10- OneHotEncoding, apply_append
11- using DuckDB
9+ import FeatureTransforms: OneHotEncoding, apply_append
10+ using DuckDB
1211using DBInterface: execute
1312
1413# NOTE: In the future, replace this with OMOP CDM version info directly from OMOPCommonDataModel.jl dependencies.
1514const OMOPCDM_VERSIONS = deserialize (joinpath (@__DIR__ , " .." , " assets" , " version_info" ))
1615
# Mapping from OMOP CDM datatype strings (the schema's `:cdmDatatype` values)
# to the Julia types that a column's element type is validated against.
# Datatype strings that are absent from this map are reported as
# "Unrecognized OMOP datatype" during `HealthTable` validation.
const DATATYPE_MAP = Dict{String,DataType}(
    # Integer-like datatypes (the schema uses both capitalizations of "integer").
    "integer" => Int64,
    "Integer" => Int64,
    "bigint" => Int64,
    # Floating point.
    "float" => Float64,
    # Temporal datatypes.
    "date" => Date,
    "datetime" => DateTime,
    # Every varchar width maps to `String`; the declared width is not enforced here.
    "varchar(1)" => String,
    "varchar(2)" => String,
    "varchar(3)" => String,
    "varchar(9)" => String,
    "varchar(10)" => String,
    "varchar(20)" => String,
    "varchar(25)" => String,
    "varchar(50)" => String,
    "varchar(80)" => String,
    "varchar(250)" => String,
    "varchar(255)" => String,
    "varchar(1000)" => String,
    "varchar(2000)" => String,
    "varchar(MAX)" => String,
)
2839
2940function __init__ ()
@@ -104,23 +115,28 @@ ht = HealthTable(df; disable_type_enforcement = true)
104115Use `disable_type_enforcement = true` while exploring or cleaning data; for modeling or analysis, validated types are strongly recommended.
105116"""
106117function HealthBase. HealthTable (
107- df:: DataFrame ;
108- omop_cdm_version:: String = " v5.4.0" ,
109- disable_type_enforcement= false ,
110- collect_errors= true
118+ df:: DataFrame ;
119+ omop_cdm_version:: String = " v5.4.0" ,
120+ disable_type_enforcement = false ,
121+ collect_errors = true ,
111122)
112123 if ! haskey (OMOPCDM_VERSIONS, omop_cdm_version)
113- throw (ArgumentError (" OMOP CDM version '$(omop_cdm_version) ' is not supported. Available versions: $(keys (OMOPCDM_VERSIONS)) " ))
124+ throw (
125+ ArgumentError (
126+ " OMOP CDM version '$(omop_cdm_version) ' is not supported. Available versions: $(keys (OMOPCDM_VERSIONS)) " ,
127+ ),
128+ )
114129 end
115130
116131 omop_fields = OMOPCDM_VERSIONS[omop_cdm_version][:fields ]
117132 @assert ! isempty (omop_fields) " OMOP CDM version $(omop_cdm_version) has no registered fields."
118- failed_columns = Vector {NamedTuple{(:colname, :type, :expected), Tuple{String, Any, Any}}} ()
133+ failed_columns =
134+ Vector {NamedTuple{(:colname, :type, :expected),Tuple{String,Any,Any}}} ()
119135 extra_columns = String[]
120136
121137 for col in names (df)
122138 col_symbol = Symbol (col)
123-
139+
124140 if ! haskey (omop_fields, col_symbol)
125141 push! (extra_columns, col)
126142 continue
@@ -131,22 +147,43 @@ function HealthBase.HealthTable(
131147
132148 if ! haskey (fieldinfo, :cdmDatatype )
133149 if ! collect_errors
134- throw (ArgumentError (" Column '$(col) ' is missing :cdmDatatype information in the schema." ))
150+ throw (
151+ ArgumentError (
152+ " Column '$(col) ' is missing :cdmDatatype information in the schema." ,
153+ ),
154+ )
135155 end
136- push! (failed_columns, (colname= col, type= actual_type, expected= " <missing from schema>" ))
156+ push! (
157+ failed_columns,
158+ (colname = col, type = actual_type, expected = " <missing from schema>" ),
159+ )
137160 else
138161 expected_string = fieldinfo[:cdmDatatype ]
139162
140163 if ! haskey (DATATYPE_MAP, expected_string)
141- push! (failed_columns, (colname= col, type= actual_type, expected= " Unrecognized OMOP datatype: $(expected_string) " ))
164+ push! (
165+ failed_columns,
166+ (
167+ colname = col,
168+ type = actual_type,
169+ expected = " Unrecognized OMOP datatype: $(expected_string) " ,
170+ ),
171+ )
142172 else
143173 expected_type = DATATYPE_MAP[expected_string]
144174
145- if ! (actual_type <: Union{expected_type, Missing} )
175+ if ! (actual_type <: Union{expected_type,Missing} )
146176 if ! collect_errors
147- throw (ArgumentError (" Column '$(col) ' has type $(actual_type) , but expected a subtype of $(expected_type) ." ))
177+ throw (
178+ ArgumentError (
179+ " Column '$(col) ' has type $(actual_type) , but expected a subtype of $(expected_type) ." ,
180+ ),
181+ )
148182 end
149- push! (failed_columns, (colname= col, type= actual_type, expected= expected_type))
183+ push! (
184+ failed_columns,
185+ (colname = col, type = actual_type, expected = expected_type),
186+ )
150187 end
151188 end
152189
@@ -157,18 +194,28 @@ function HealthBase.HealthTable(
157194 end
158195 end
159196 end
160-
197+
161198 validation_msgs = String[]
162199
163200 if ! isempty (failed_columns)
164- error_details = join ([" Column '$(err. colname) ': has type $(err. type) , expected $(err. expected) " for err in failed_columns], " \n " )
165- push! (validation_msgs, " OMOP CDM type validation failed for the following columns:\n " * error_details)
201+ error_details = join (
202+ [
203+ " Column '$(err. colname) ': has type $(err. type) , expected $(err. expected) "
204+ for err in failed_columns
205+ ],
206+ " \n " ,
207+ )
208+ push! (
209+ validation_msgs,
210+ " OMOP CDM type validation failed for the following columns:\n " * error_details,
211+ )
166212 end
167213
168214 if ! isempty (validation_msgs)
169215 full_message = join (validation_msgs, " \n\n " ) * " \n "
170216 if disable_type_enforcement
171- @warn full_message * " \n Type enforcement is disabled. Unexpected behavior may occur."
217+ @warn full_message *
218+ " \n Type enforcement is disabled. Unexpected behavior may occur."
172219 else
173220 throw (ArgumentError (full_message))
174221 end
@@ -212,7 +259,7 @@ function HealthBase.one_hot_encode(
212259 ht:: HealthTable ;
213260 cols:: Vector{Symbol} ,
214261 drop_original:: Bool = true ,
215- return_features_only:: Bool = false
262+ return_features_only:: Bool = false ,
216263)
217264 df = copy (ht. source)
218265 missing = setdiff (cols, Symbol .(names (df)))
@@ -227,7 +274,7 @@ function HealthBase.one_hot_encode(
227274 cats = unique (skipmissing (df[! , col]))
228275 enc = OneHotEncoding (cats)
229276 header = Symbol .(string (col, " _" , c) for c in cats)
230- df = apply_append (df, enc; cols= [col], header= header)
277+ df = apply_append (df, enc; cols = [col], header = header)
231278 end
232279
233280 drop_original && select! (df, Not (cols))
@@ -266,13 +313,13 @@ ht_mapped = map_concepts(ht, :gender_concept_id, "gender_name", conn; schema = "
266313"""
267314function HealthBase. map_concepts (
268315 ht:: HealthTable ,
269- cols:: Union{Symbol, Vector{Symbol}} ,
316+ cols:: Union{Symbol,Vector{Symbol}} ,
270317 conn:: DuckDB.DB ;
271- new_cols:: Union{Nothing, String, Vector{String}} = nothing ,
318+ new_cols:: Union{Nothing,String,Vector{String}} = nothing ,
272319 drop_original:: Bool = false ,
273320 suffix:: String = " _mapped" ,
274321 concept_table:: String = " concept" ,
275- schema:: String = " main"
322+ schema:: String = " main" ,
276323)
277324 df = copy (ht. source)
278325 _map_concepts! (df, cols, conn; new_cols, drop_original, suffix, concept_table, schema)
@@ -309,13 +356,13 @@ map_concepts!(ht, :gender_concept_id, conn; new_cols="gender_name", schema="dbt_
309356"""
310357function HealthBase. map_concepts! (
311358 ht:: HealthTable ,
312- cols:: Union{Symbol, Vector{Symbol}} ,
359+ cols:: Union{Symbol,Vector{Symbol}} ,
313360 conn:: DuckDB.DB ;
314- new_cols:: Union{Nothing, String, Vector{String}} = nothing ,
361+ new_cols:: Union{Nothing,String,Vector{String}} = nothing ,
315362 drop_original:: Bool = false ,
316363 suffix:: String = " _mapped" ,
317364 concept_table:: String = " concept" ,
318- schema:: String = " main"
365+ schema:: String = " main" ,
319366)
320367 _map_concepts! (
321368 ht. source,
@@ -325,7 +372,7 @@ function HealthBase.map_concepts!(
325372 drop_original = drop_original,
326373 suffix = suffix,
327374 concept_table = concept_table,
328- schema = schema
375+ schema = schema,
329376 )
330377 return ht
331378end
@@ -351,13 +398,13 @@ Low-level internal helper to map concept IDs to names directly on a `DataFrame`.
351398"""
352399function _map_concepts! (
353400 df:: DataFrame ,
354- cols:: Union{Symbol, Vector{Symbol}} ,
401+ cols:: Union{Symbol,Vector{Symbol}} ,
355402 conn:: DuckDB.DB ;
356- new_cols:: Union{Nothing, String, Vector{String}} = nothing ,
403+ new_cols:: Union{Nothing,String,Vector{String}} = nothing ,
357404 drop_original:: Bool = false ,
358405 suffix:: String = " _mapped" ,
359406 concept_table:: String = " concept" ,
360- schema:: String = " main"
407+ schema:: String = " main" ,
361408)
362409 cols = isa (cols, Symbol) ? [cols] : cols
363410
@@ -391,7 +438,10 @@ function _map_concepts!(
391438 continue
392439 end
393440
394- mapping = Dict ((cid => cname) for (cid, cname) in zip (result_df. concept_id, result_df. concept_name))
441+ mapping = Dict (
442+ (cid => cname) for
443+ (cid, cname) in zip (result_df. concept_id, result_df. concept_name)
444+ )
395445 df[! , new_col] = map (x -> get (mapping, x, missing ), df[! , col])
396446
397447 if drop_original
@@ -437,7 +487,8 @@ function HealthBase.apply_vocabulary_compression(
437487 counts = combine (groupby (df, col), nrow => :freq )
438488 to_compress = counts[counts. freq .< min_freq, col]
439489 if ! isempty (to_compress)
440- df[! , dest_col] = map (x -> in (x, to_compress) ? other_label : string (x), df[! , col])
490+ df[! , dest_col] =
491+ map (x -> in (x, to_compress) ? other_label : string (x), df[! , col])
441492 end
442493 end
443494
@@ -449,4 +500,3 @@ function HealthBase.apply_vocabulary_compression(
449500end
450501
451502end
452-
0 commit comments