From a36c7e28e83375c551c9bed4e2c41ccc48edf8c4 Mon Sep 17 00:00:00 2001 From: Will Hulme <25637345+wjchulme@users.noreply.github.com> Date: Wed, 20 Nov 2024 22:32:18 +0000 Subject: [PATCH 1/4] add example actions --- analysis/compare_definition_2.R | 10 +++++ analysis/dataset_definition_1.py | 18 +++++++++ analysis/dataset_definition_2.py | 36 ++++++++++++++++++ analysis/dummy_definition_1.R | 60 +++++++++++++++++++++++++++++ analysis/dummy_definition_2a.R | 35 +++++++++++++++++ analysis/dummy_definition_2b.R | 65 ++++++++++++++++++++++++++++++++ project.yaml | 46 ++++++++++++++++++++-- 7 files changed, 267 insertions(+), 3 deletions(-) create mode 100644 analysis/compare_definition_2.R create mode 100644 analysis/dataset_definition_1.py create mode 100644 analysis/dataset_definition_2.py create mode 100644 analysis/dummy_definition_1.R create mode 100644 analysis/dummy_definition_2a.R create mode 100644 analysis/dummy_definition_2b.R diff --git a/analysis/compare_definition_2.R b/analysis/compare_definition_2.R new file mode 100644 index 0000000..7d24023 --- /dev/null +++ b/analysis/compare_definition_2.R @@ -0,0 +1,10 @@ + +custom_data_a <- feather::read_feather(here::here("output", "dummy_dataset_2a.arrow")) +custom_data_b <- feather::read_feather(here::here("output", "dummy_dataset_2b.arrow")) + +ehrql_data <- readr::read_csv(here::here("output", "dataset_2.csv.gz")) +ehrql_data <- feather::read_feather(here::here("output", "dataset_2.arrow")) + + + + diff --git a/analysis/dataset_definition_1.py b/analysis/dataset_definition_1.py new file mode 100644 index 0000000..fc80272 --- /dev/null +++ b/analysis/dataset_definition_1.py @@ -0,0 +1,18 @@ +from ehrql import create_dataset +from ehrql.tables.tpp import patients, practice_registrations + +dataset = create_dataset() + +index_date = "2020-03-31" + +has_registration = practice_registrations.for_patient_on( + index_date +).exists_for_patient() + +dataset.define_population(has_registration) +dataset.configure_dummy_data(population_size=1000) + +dataset.registered = has_registration +dataset.age = patients.age_on(index_date) +dataset.sex = patients.sex + diff --git a/analysis/dataset_definition_2.py b/analysis/dataset_definition_2.py new file mode 100644 index 0000000..938c47b --- /dev/null +++ b/analysis/dataset_definition_2.py @@ -0,0 +1,36 @@ +from ehrql import create_dataset +from ehrql.tables.tpp import patients, practice_registrations, vaccinations + +dataset = create_dataset() + +index_date = "2020-12-08" + +has_registration = practice_registrations.for_patient_on( + index_date +).exists_for_patient() + +dataset.define_population(has_registration) +dataset.configure_dummy_data(population_size=1000) + +dataset.registered = has_registration +dataset.age = patients.age_on(index_date) +dataset.sex = patients.sex + + +covid_vaccinations = ( + vaccinations + .where(vaccinations.target_disease.is_in(["SARS-2 CORONAVIRUS"])) + .sort_by(vaccinations.date) +) + +covid_vaccinations1 = covid_vaccinations.first_for_patient() +dataset.vaccine_date1 = covid_vaccinations1.date +dataset.vaccine_product1 = covid_vaccinations1.product_name + +covid_vaccinations2 = ( + covid_vaccinations. + where(covid_vaccinations.date>dataset.vaccine_date1). + first_for_patient() + ) +dataset.vaccine_date2 = covid_vaccinations2.date +dataset.vaccine_product2 = covid_vaccinations2.product_name diff --git a/analysis/dummy_definition_1.R b/analysis/dummy_definition_1.R new file mode 100644 index 0000000..684012e --- /dev/null +++ b/analysis/dummy_definition_1.R @@ -0,0 +1,60 @@ + +population_size <- 1000 + +set.seed(420) + +## attempt 1 +# dummy_data <- data.frame( +# patient_id = 1:population_size, +# age = rnorm(population_size, mean= 60, sd=20), +# sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)), +# ) + +## attempt 2 - add registered variable +# dummy_data <- data.frame( +# patient_id = 1:population_size, +# registered = rbinom(population_size, 1, 0.99)==1, +# age = rnorm(population_size, mean= 60, sd=20), +# sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)), +# ) + + +## attempt 3 - ensure registered is output as "T" or "F" so is read by python correctly +# dummy_data <- data.frame( +# patient_id = 1:population_size, +# registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"), +# age = rnorm(population_size, mean= 60, sd=20), +# sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)), +# ) + +# ## attempt 4 - fix age so it's an integer +# dummy_data <- data.frame( +# patient_id = 1:population_size, +# registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"), +# age = as.integer(rnorm(population_size, mean= 60, sd=20)), +# sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)), +# ) + + +## attempt 5 - fix sex so it uses the correct levels +# dummy_data <- data.frame( +# patient_id = 1:population_size, +# registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"), +# age = as.integer(rnorm(population_size, mean= 60, sd=20)), +# sex = sample(c("female", "male", "intersex", "unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)) +# ) + +## attempt 6 - add a diabetes variable that is not in the ehrql definition +dummy_data <- data.frame( + patient_id = 1:population_size, + registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"), + age = as.integer(rnorm(population_size, mean= 55, sd=20)), + sex = sample(c("female", "male", "intersex", "unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)), + diabetes = rbinom(population_size, 1, 0.1) +) + +# write to compressed csv file +readr::write_csv(dummy_data, here::here("output", "dummy_dataset_1.csv.gz")) + +# write to arrow file +# feather::write_feather(dummy_data, here::here("output", "dummy_dataset_1.arrow")) diff --git a/analysis/dummy_definition_2a.R b/analysis/dummy_definition_2a.R new file mode 100644 index 0000000..a727396 --- /dev/null +++ b/analysis/dummy_definition_2a.R @@ -0,0 +1,35 @@ +library(dplyr) + +set.seed(314159) + +pop_n <- 1000 +index_date <- as.Date("2020-12-08") + +pfizer_name <- "COVID-19 mRNA Vaccine Comirnaty 30micrograms/0.3ml dose conc for susp for inj MDV (Pfizer)" +az_name <- "COVID-19 Vaccine Vaxzevria 0.5ml inj multidose vials (AstraZeneca)" + +dummy_data_nomissing <- tibble( + patient_id = 1:pop_n, + registered = rbinom(pop_n, 1, 0.99)==1, + age = as.integer(rnorm(pop_n, mean= 55, sd=20)), + sex = sample(c("female", "male", "intersex", "unknown"), size = pop_n, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)), + diabetes = rbinom(pop_n, 1, 0.1)==1, + vaccine_date1 = index_date + runif(pop_n, 0, 150), + vaccine_product1 = sample(c(pfizer_name, az_name), size = pop_n, replace=TRUE, prob = c(0.50, 0.50)), + vaccine_date2 = vaccine_date1 + runif(pop_n, 3*7, 16*7), + vaccine_product2 = if_else(runif(pop_n)<0.95, vaccine_product1, "az"), +) + +dummy_data <- + mutate( + dummy_data_nomissing, + vaccine_date1 = if_else(runif(pop_n)<0.2, as.Date(NA), vaccine_date1), + vaccine_product1 = if_else(is.na(vaccine_date1), NA_character_, vaccine_product1), + vaccine_date2 = if_else(runif(pop_n)<0.1 | is.na(vaccine_date1), as.Date(NA), vaccine_date2), + vaccine_product2 = if_else(is.na(vaccine_date2), NA_character_, vaccine_product2) + ) + + +# write to arrow file +feather::write_feather(dummy_data, path = here::here("output", "dummy_dataset_2a.arrow")) + diff --git a/analysis/dummy_definition_2b.R b/analysis/dummy_definition_2b.R new file mode 100644 index 0000000..59c5599 --- /dev/null +++ b/analysis/dummy_definition_2b.R @@ -0,0 +1,65 @@ +library(dplyr) +library(dd4d) + +set.seed(314159) + +pop_n <- 1000 +index_date <- as.Date("2020-12-08") + +pfizer_name <- "COVID-19 mRNA Vaccine Comirnaty 30micrograms/0.3ml dose conc for susp for inj MDV (Pfizer)" +az_name <- "COVID-19 Vaccine Vaxzevria 0.5ml inj multidose vials (AstraZeneca)" + + +# define each variable as a node in a bayesian network + +sim_list_vax_info <- lst( + + registered = bn_node( + ~ rbernoulli(n = ..n, p = 0.99) + ), + + age = bn_node( + ~ as.integer(rnorm(pop_n, mean= 55, sd=20)), + ), + + sex = bn_node( + ~ rfactor(n = ..n, levels = c("female", "male", "intersex", "unknown"), p = c(0.50, 0.49, 0, 0.01)), + missing_rate = ~0.001 # this is shorthand for ~(rbernoulli(n=..n, p = 0.2)) + ), + + diabetes = bn_node( + ~ rbernoulli(n = ..n, p = 0.10) + ), + + vaccine_date1 = bn_node( + ~ index_date + runif(n = ..n, 0, 150), + missing_rate = ~0.2, + ), + vaccine_date2 = bn_node( + ~ covid_vax_1_day + runif(n = ..n, 3*7, 16*7), + missing_rate = ~0.001, + needs = "vaccine_date2" + ), + + vaccine_product1 = bn_node(~ rcat(n = ..n, c("pfizer", "az"), c(0.5, 0.5)), needs = "vaccine_date1"), + vaccine_product2 = bn_node(~ if_else(runif(..n) < 0.95, vaccine_product1, "az"), needs = "vaccine_date2"), + +) + +bn <- bn_create(sim_list, known_variables = known_variables) + +# plot the network +bn_plot(bn) + +# plot the network (connected nodes only) +bn_plot(bn, connected_only = TRUE) + + +# simulate the dataset +dummydata <- bn_simulate(bn, pop_size = population_size, keep_all = FALSE, .id = "patient_id") + + + +# write to arrow file +feather::write_feather(dummy_data, path = here::here("output", "dummy_dataset_2b.arrow")) + diff --git a/project.yaml b/project.yaml index b941ab8..2989462 100644 --- a/project.yaml +++ b/project.yaml @@ -1,8 +1,48 @@ version: '4.0' actions: - generate_dataset: - run: ehrql:v1 generate-dataset analysis/dataset_definition.py --output output/dataset.csv.gz + #generate_dataset: + # run: ehrql:v1 generate-dataset analysis/dataset_definition.py --output output/dataset.csv.gz + # outputs: + # highly_sensitive: + # dataset: output/dataset.csv.gz + + + + dummy_dataset_1: + run: r:latest analysis/dummy_definition_1.R + outputs: + highly_sensitive: + dataset: output/dummy_dataset_1.csv.gz + + generate_dataset_1: + run: ehrql:v1 generate-dataset analysis/dataset_definition_1.py + --output output/dataset_1.csv.gz + --dummy-data-file output/dummy_dataset_1.csv.gz + needs: [dummy_dataset_1] outputs: highly_sensitive: - dataset: output/dataset.csv.gz + dataset: output/dataset_1.csv.gz + + compare_dataset_1: + run: r:latest analysis/compare_definition_1.R + needs: [dummy_dataset_1, generate_dataset_1] + outputs: + highly_sensitive: + csv: output/comparison_1.csv + + dummy_dataset_2: + run: r:latest analysis/dummy_definition_2.R + outputs: + highly_sensitive: + dataset: output/dummy_dataset_2.arrow + + generate_dataset_2: + run: ehrql:v1 generate-dataset analysis/dataset_definition_2.py + --output output/dataset_2.arrow + # --dummy-data-file output/dummy_dataset_2.arrow + # needs: [dummy_dataset_2] + outputs: + highly_sensitive: + dataset: output/dataset_2.arrow + From 46ee2c15ee77ebd98a3fb63890a2899664a6dd0b Mon Sep 17 00:00:00 2001 From: Will Hulme <25637345+wjchulme@users.noreply.github.com> Date: Thu, 21 Nov 2024 09:25:27 +0000 Subject: [PATCH 2/4] fix dummy spec --- analysis/compare_definition_2.R | 3 --- analysis/dummy_definition_1.R | 2 +- analysis/dummy_definition_2b.R | 11 +++++------ project.yaml | 21 ++++++++++----------- 4 files changed, 16 insertions(+), 21 deletions(-) diff --git a/analysis/compare_definition_2.R b/analysis/compare_definition_2.R index 7d24023..af2afba 100644 --- a/analysis/compare_definition_2.R +++ b/analysis/compare_definition_2.R @@ -5,6 +5,3 @@ custom_data_b <- feather::read_feather(here::here("output", "dummy_dataset_2b.ar ehrql_data <- readr::read_csv(here::here("output", "dataset_2.csv.gz")) ehrql_data <- feather::read_feather(here::here("output", "dataset_2.arrow")) - - - diff --git a/analysis/dummy_definition_1.R b/analysis/dummy_definition_1.R index 684012e..e7c5c70 100644 --- a/analysis/dummy_definition_1.R +++ b/analysis/dummy_definition_1.R @@ -57,4 +57,4 @@ dummy_data <- data.frame( readr::write_csv(dummy_data, here::here("output", "dummy_dataset_1.csv.gz")) # write to arrow file -# feather::write_feather(dummy_data, here::here("output", "dummy_dataset_1.arrow")) +feather::write_feather(dummy_data, here::here("output", "dummy_dataset_1.arrow")) diff --git a/analysis/dummy_definition_2b.R b/analysis/dummy_definition_2b.R index 59c5599..2607e30 100644 --- a/analysis/dummy_definition_2b.R +++ b/analysis/dummy_definition_2b.R @@ -12,14 +12,14 @@ az_name <- "COVID-19 Vaccine Vaxzevria 0.5ml inj multidose vials (AstraZeneca)" # define each variable as a node in a bayesian network -sim_list_vax_info <- lst( +sim_list <- lst( registered = bn_node( ~ rbernoulli(n = ..n, p = 0.99) ), age = bn_node( - ~ as.integer(rnorm(pop_n, mean= 55, sd=20)), + ~ as.integer(rnorm(..n, mean= 55, sd=20)), ), sex = bn_node( @@ -36,7 +36,7 @@ sim_list_vax_info <- lst( missing_rate = ~0.2, ), vaccine_date2 = bn_node( - ~ covid_vax_1_day + runif(n = ..n, 3*7, 16*7), + ~ vaccine_date1 + runif(n = ..n, 3*7, 16*7), missing_rate = ~0.001, needs = "vaccine_date2" ), @@ -46,7 +46,7 @@ sim_list_vax_info <- lst( ) -bn <- bn_create(sim_list, known_variables = known_variables) +bn <- bn_create(sim_list, known_variables = c("index_date")) # plot the network bn_plot(bn) @@ -54,9 +54,8 @@ bn_plot(bn) # plot the network (connected nodes only) bn_plot(bn, connected_only = TRUE) - # simulate the dataset -dummydata <- bn_simulate(bn, pop_size = population_size, keep_all = FALSE, .id = "patient_id") +dummydata <- bn_simulate(bn, pop_size = pop_n, keep_all = FALSE, .id = "patient_id") diff --git a/project.yaml b/project.yaml index 2989462..9973bc6 100644 --- a/project.yaml +++ b/project.yaml @@ -24,24 +24,23 @@ actions: highly_sensitive: dataset: output/dataset_1.csv.gz - compare_dataset_1: - run: r:latest analysis/compare_definition_1.R - needs: [dummy_dataset_1, generate_dataset_1] - outputs: - highly_sensitive: - csv: output/comparison_1.csv + dummy_dataset_2a: + run: r:latest analysis/dummy_definition_2a.R + outputs: + highly_sensitive: + dataset: output/dummy_dataset_2a.arrow - dummy_dataset_2: - run: r:latest analysis/dummy_definition_2.R + dummy_dataset_2b: + run: r:latest analysis/dummy_definition_2b.R outputs: highly_sensitive: - dataset: output/dummy_dataset_2.arrow + dataset: output/dummy_dataset_2b.arrow generate_dataset_2: run: ehrql:v1 generate-dataset analysis/dataset_definition_2.py --output output/dataset_2.arrow - # --dummy-data-file output/dummy_dataset_2.arrow - # needs: [dummy_dataset_2] + # --dummy-data-file output/dummy_dataset_2b.arrow + # needs: [dummy_dataset_2b] outputs: highly_sensitive: dataset: output/dataset_2.arrow From 3ad530da236105d0b737a3aa86fe12f01e41911f Mon Sep 17 00:00:00 2001 From: Will Hulme <25637345+wjchulme@users.noreply.github.com> Date: Thu, 21 Nov 2024 11:10:40 +0000 Subject: [PATCH 3/4] fix dates, demo feather issue --- analysis/compare_definition_2.R | 11 ++++++++++- analysis/dummy_definition_2b.R | 24 +++++++++++++----------- project.yaml | 30 ++++++++++++++++++------------ 3 files changed, 41 insertions(+), 24 deletions(-) diff --git a/analysis/compare_definition_2.R b/analysis/compare_definition_2.R index af2afba..817d363 100644 --- a/analysis/compare_definition_2.R +++ b/analysis/compare_definition_2.R @@ -1,7 +1,16 @@ +library(dplyr) custom_data_a <- feather::read_feather(here::here("output", "dummy_dataset_2a.arrow")) custom_data_b <- feather::read_feather(here::here("output", "dummy_dataset_2b.arrow")) -ehrql_data <- readr::read_csv(here::here("output", "dataset_2.csv.gz")) +#ehrql_data <- readr::read_csv(here::here("output", "dataset_2.csv.gz")) ehrql_data <- feather::read_feather(here::here("output", "dataset_2.arrow")) + +# just testing yaml for now... +custom_data_b %>% + summarise( + n=n(), + vaccine_date1_missing = sum(is.na(vaccine_date1)) + ) %>% + readr::write_csv(here::here("output", "compare_dataset_2.csv")) diff --git a/analysis/dummy_definition_2b.R b/analysis/dummy_definition_2b.R index 2607e30..c1b5b33 100644 --- a/analysis/dummy_definition_2b.R +++ b/analysis/dummy_definition_2b.R @@ -31,22 +31,22 @@ sim_list <- lst( ~ rbernoulli(n = ..n, p = 0.10) ), - vaccine_date1 = bn_node( - ~ index_date + runif(n = ..n, 0, 150), + vaccine_day1 = bn_node( + ~ runif(n = ..n, 0, 150), missing_rate = ~0.2, ), - vaccine_date2 = bn_node( - ~ vaccine_date1 + runif(n = ..n, 3*7, 16*7), + vaccine_day2 = bn_node( + ~ vaccine_day1 + runif(n = ..n, 3*7, 16*7), missing_rate = ~0.001, - needs = "vaccine_date2" + needs = "vaccine_day2" ), - vaccine_product1 = bn_node(~ rcat(n = ..n, c("pfizer", "az"), c(0.5, 0.5)), needs = "vaccine_date1"), - vaccine_product2 = bn_node(~ if_else(runif(..n) < 0.95, vaccine_product1, "az"), needs = "vaccine_date2"), + vaccine_product1 = bn_node(~ rcat(n = ..n, c("pfizer", "az"), c(0.5, 0.5)), needs = "vaccine_day1"), + vaccine_product2 = bn_node(~ if_else(runif(..n) < 0.95, vaccine_product1, "az"), needs = "vaccine_day2"), ) -bn <- bn_create(sim_list, known_variables = c("index_date")) +bn <- bn_create(sim_list) # plot the network bn_plot(bn) @@ -55,10 +55,12 @@ bn_plot(bn) bn_plot(bn, connected_only = TRUE) # simulate the dataset -dummydata <- bn_simulate(bn, pop_size = pop_n, keep_all = FALSE, .id = "patient_id") - +dummy_data <- bn_simulate(bn, pop_size = pop_n, keep_all = FALSE, .id = "patient_id") +dummy_data_days_to_dates <- dummy_data %>% + mutate(across(num_range("vaccine_day", 1:2), ~ index_date + .)) %>% + rename_with(~ stringr::str_replace(., "_day", "_date"), starts_with("vaccine_day")) # write to arrow file -feather::write_feather(dummy_data, path = here::here("output", "dummy_dataset_2b.arrow")) +feather::write_feather(dummy_data_days_to_dates, path = here::here("output", "dummy_dataset_2b.arrow")) diff --git a/project.yaml b/project.yaml index 9973bc6..21a975d 100644 --- a/project.yaml +++ b/project.yaml @@ -25,23 +25,29 @@ actions: dataset: output/dataset_1.csv.gz dummy_dataset_2a: - run: r:latest analysis/dummy_definition_2a.R - outputs: - highly_sensitive: - dataset: output/dummy_dataset_2a.arrow + run: r:latest analysis/dummy_definition_2a.R + outputs: + highly_sensitive: + dataset: output/dummy_dataset_2a.arrow dummy_dataset_2b: - run: r:latest analysis/dummy_definition_2b.R - outputs: - highly_sensitive: - dataset: output/dummy_dataset_2b.arrow + run: r:latest analysis/dummy_definition_2b.R + outputs: + highly_sensitive: + dataset: output/dummy_dataset_2b.arrow generate_dataset_2: - run: ehrql:v1 generate-dataset analysis/dataset_definition_2.py + run: ehrql:v1 generate-dataset analysis/dataset_definition_2.py --output output/dataset_2.arrow # --dummy-data-file output/dummy_dataset_2b.arrow # needs: [dummy_dataset_2b] - outputs: - highly_sensitive: - dataset: output/dataset_2.arrow + outputs: + highly_sensitive: + dataset: output/dataset_2.arrow + compare_dataset_2: + run: r:latest analysis/compare_definition_2.R + needs: [dummy_dataset_2a, dummy_dataset_2b, generate_dataset_2] + outputs: + highly_sensitive: + csv: output/compare_dataset_2.csv \ No newline at end of file From 723465f1c144cf6c4e79451205b15dfab2d35ebe Mon Sep 17 00:00:00 2001 From: Will Hulme <25637345+wjchulme@users.noreply.github.com> Date: Sat, 23 Nov 2024 12:05:36 +0000 Subject: [PATCH 4/4] fix arrow; add death --- analysis/compare_definition_2.R | 96 ++++++++++++++++++++++++++++---- analysis/dataset_definition_2.py | 16 +++++- analysis/dummy_definition_2a.R | 8 ++- analysis/dummy_definition_2b.R | 39 +++++++------ docs_src/dummy_data_in_r.md | 25 +++++++++ project.yaml | 25 ++++++--- 6 files changed, 166 insertions(+), 43 deletions(-) diff --git a/analysis/compare_definition_2.R b/analysis/compare_definition_2.R index 817d363..11af7c2 100644 --- a/analysis/compare_definition_2.R +++ b/analysis/compare_definition_2.R @@ -1,16 +1,92 @@ library(dplyr) +library(ggplot2) -custom_data_a <- feather::read_feather(here::here("output", "dummy_dataset_2a.arrow")) -custom_data_b <- feather::read_feather(here::here("output", "dummy_dataset_2b.arrow")) +# developed using base R +custom_data_a <- arrow::read_feather(here::here("output", "dummy_dataset_2a.arrow")) -#ehrql_data <- readr::read_csv(here::here("output", "dataset_2.csv.gz")) -ehrql_data <- feather::read_feather(here::here("output", "dataset_2.arrow")) +# developed using dd4d +custom_data_b <- arrow::read_feather(here::here("output", "dummy_dataset_2b.arrow")) +# developed using dd4d then passed through ehrQL for checks +custom_data_c <- arrow::read_feather(here::here("output", "dummy_dataset_2c.arrow")) + +# developed in ehrQL directly +ehrql_data <- arrow::read_feather(here::here("output", "dataset_2.arrow")) + + +simplify_product_names <- function(data){ + + pfizer_name <- "COVID-19 mRNA Vaccine Comirnaty 30micrograms/0.3ml dose conc for susp for inj MDV (Pfizer)" + az_name <- "COVID-19 Vaccine Vaxzevria 0.5ml inj multidose vials (AstraZeneca)" + + mutate( + data, + across(starts_with("vaccine_product"), + ~{ + case_when( + .x == pfizer_name ~ "pfizer", + .x == az_name ~ "az", + .default = .x + ) + } + ) + ) +} + +product_xtab <- function(data){ + with( + data, + table(vaccine_product1, vaccine_product2, useNA='ifany') + ) +} + +custom_data_a %>% + simplify_product_names() %>% + product_xtab() -# just testing yaml for now... custom_data_b %>% - summarise( - n=n(), - vaccine_date1_missing = sum(is.na(vaccine_date1)) - ) %>% - readr::write_csv(here::here("output", "compare_dataset_2.csv")) + simplify_product_names() %>% + product_xtab() + +custom_data_c %>% + simplify_product_names() %>% + product_xtab() + +ehrql_data %>% + simplify_product_names() %>% + with( + table(vax1 = vaccine_product1 %in% c("", NA), vax2 = vaccine_product2 %in% c("", NA)) + ) + + +plot_dates <- function(data){ + + data %>% + simplify_product_names() %>% + ggplot() + + geom_point(aes(x=vaccine_date1, y = vaccine_date2))+ + scale_x_date(labels = ~ format(.x, "%Y-%b"))+ + scale_y_date(labels = ~ format(.x, "%Y-%b"))+ + theme_bw() +} + +plot_dates(custom_data_c) +plot_dates(ehrql_data) + + + +plot_death <- function(data){ + + data %>% + simplify_product_names() %>% + ggplot() + + geom_histogram(aes(x=death_date))+ + scale_x_date(labels = ~ format(.x, "%Y-%b"))+ + theme_bw() +} + +sum(custom_data_c$death_date% - mutate(across(num_range("vaccine_day", 1:2), ~ index_date + .)) %>% - rename_with(~ stringr::str_replace(., "_day", "_date"), starts_with("vaccine_day")) + mutate(across(contains("_date"), ~ index_date + .)) # write to arrow file -feather::write_feather(dummy_data_days_to_dates, path = here::here("output", "dummy_dataset_2b.arrow")) +arrow::write_feather(dummy_data_days_to_dates, sink = here::here("output", "dummy_dataset_2b.arrow")) diff --git a/docs_src/dummy_data_in_r.md b/docs_src/dummy_data_in_r.md index e69de29..68cced4 100644 --- a/docs_src/dummy_data_in_r.md +++ b/docs_src/dummy_data_in_r.md @@ -0,0 +1,25 @@ +### Generate a dummy dataset using a custom R script + + +## Why use custom dummy data? +* Complete control over distribution of variables and dependencies between variables +* Complete control over dataset size +* No ad-hoc adjustments here and there - do it all in one place, once +* ehrQL will check that the custom dataset is "correct" (to an extent) + + +## Challenges +* Maintaining consistency between custom dummy data specification and ehrQL specification as the dataset definition evolves +* Need some understanding of data simulation + +## Why use dd4d instead of base R? +* An API that supports common simulation patterns, such as: + * Specifying dependent missingness + * Flagging latent variables which are needed for simulation but do not appear in the final dataset +* Supports common simulation functions that are awkward in base R (eg `rcat` and `rbernoulli`) +* Supports inspection and visualisation of the specified DAG +* Tests that catch errors early, e.g. testing that the specification isn't cyclic and if so which path is cyclic. +* Variables do not have to be specified in topological order (unlike in a `mutate` step). +* You get all this whilst retaining: + * Support for arbitrary distributions and dependencies + * IDE syntax highlighting / error-checking (unlike if dependencies were defined with strings). diff --git a/project.yaml b/project.yaml index 21a975d..3bb2760 100644 --- a/project.yaml +++ b/project.yaml @@ -36,18 +36,25 @@ actions: highly_sensitive: dataset: output/dummy_dataset_2b.arrow + dummy_dataset_2c: + run: ehrql:v1 generate-dataset analysis/dataset_definition_2.py + --output output/dummy_dataset_2c.arrow + --dummy-data-file output/dummy_dataset_2b.arrow + needs: [dummy_dataset_2b] + outputs: + highly_sensitive: + dataset: output/dummy_dataset_2c.arrow + generate_dataset_2: run: ehrql:v1 generate-dataset analysis/dataset_definition_2.py - --output output/dataset_2.arrow - # --dummy-data-file output/dummy_dataset_2b.arrow - # needs: [dummy_dataset_2b] + --output output/dataset_2.arrow outputs: highly_sensitive: dataset: output/dataset_2.arrow - compare_dataset_2: - run: r:latest analysis/compare_definition_2.R - needs: [dummy_dataset_2a, dummy_dataset_2b, generate_dataset_2] - outputs: - highly_sensitive: - csv: output/compare_dataset_2.csv \ No newline at end of file +# compare_dataset_2: +# run: r:latest analysis/compare_definition_2.R +# needs: [dummy_dataset_2a, dummy_dataset_2b, dummy_dataset_2c, generate_dataset_2] +# outputs: +# highly_sensitive: +# csv: output/compare_dataset_2.csv \ No newline at end of file