diff --git a/analysis/compare_definition_2.R b/analysis/compare_definition_2.R new file mode 100644 index 0000000..11af7c2 --- /dev/null +++ b/analysis/compare_definition_2.R @@ -0,0 +1,92 @@ +library(dplyr) +library(ggplot2) + +# developed using base R +custom_data_a <- arrow::read_feather(here::here("output", "dummy_dataset_2a.arrow")) + +# developed using dd4d +custom_data_b <- arrow::read_feather(here::here("output", "dummy_dataset_2b.arrow")) + +# developed using dd4d then passed through ehrQL for checks +custom_data_c <- arrow::read_feather(here::here("output", "dummy_dataset_2c.arrow")) + +# developed in ehrQL directly +ehrql_data <- arrow::read_feather(here::here("output", "dataset_2.arrow")) + + +simplify_product_names <- function(data){ + + pfizer_name <- "COVID-19 mRNA Vaccine Comirnaty 30micrograms/0.3ml dose conc for susp for inj MDV (Pfizer)" + az_name <- "COVID-19 Vaccine Vaxzevria 0.5ml inj multidose vials (AstraZeneca)" + + mutate( + data, + across(starts_with("vaccine_product"), + ~{ + case_when( + .x == pfizer_name ~ "pfizer", + .x == az_name ~ "az", + .default = .x + ) + } + ) + ) +} + +product_xtab <- function(data){ + with( + data, + table(vaccine_product1, vaccine_product2, useNA='ifany') + ) +} + +custom_data_a %>% + simplify_product_names() %>% + product_xtab() + +custom_data_b %>% + simplify_product_names() %>% + product_xtab() + +custom_data_c %>% + simplify_product_names() %>% + product_xtab() + +ehrql_data %>% + simplify_product_names() %>% + with( + table(vax1 = vaccine_product1 %in% c("", NA), vax2 = vaccine_product2 %in% c("", NA)) + ) + + +plot_dates <- function(data){ + + data %>% + simplify_product_names() %>% + ggplot() + + geom_point(aes(x=vaccine_date1, y = vaccine_date2))+ + scale_x_date(labels = ~ format(.x, "%Y-%b"))+ + scale_y_date(labels = ~ format(.x, "%Y-%b"))+ + theme_bw() +} + +plot_dates(custom_data_c) +plot_dates(ehrql_data) + + + +plot_death <- function(data){ + + data %>% + simplify_product_names() %>% + ggplot() + + geom_histogram(aes(x=death_date))+ + scale_x_date(labels = ~ format(.x, "%Y-%b"))+ + theme_bw() +} + +sum(custom_data_c$death_datedataset.vaccine_date1). + first_for_patient() + ) +dataset.vaccine_date2 = covid_vaccinations2.date +dataset.vaccine_product2 = covid_vaccinations2.product_name + +dataset.death_date = ons_deaths.date + diff --git a/analysis/dummy_definition_1.R b/analysis/dummy_definition_1.R new file mode 100644 index 0000000..e7c5c70 --- /dev/null +++ b/analysis/dummy_definition_1.R @@ -0,0 +1,60 @@ + +population_size <- 1000 + +set.seed(420) + +## attempt 1 +# dummy_data <- data.frame( +# patient_id = 1:population_size, +# age = rnorm(population_size, mean= 60, sd=20), +# sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)), +# ) + +## attempt 2 - add registered variable +# dummy_data <- data.frame( +# patient_id = 1:population_size, +# registered = rbinom(population_size, 1, 0.99)==1, +# age = rnorm(population_size, mean= 60, sd=20), +# sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)), +# ) + + +## attempt 3 - ensure registered is output as "T" or "F" so is read by python correctly +# dummy_data <- data.frame( +# patient_id = 1:population_size, +# registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"), +# age = rnorm(population_size, mean= 60, sd=20), +# sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)), +# ) + +# ## attempt 4 - fix age so it's an integer +# dummy_data <- data.frame( +# patient_id = 1:population_size, +# registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"), +# age = as.integer(rnorm(population_size, mean= 60, sd=20)), +# sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)), +# ) + + +## attempt 5 - fix sex so it uses the correct levels +# dummy_data <- data.frame( +# patient_id = 1:population_size, +# registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"), +# age = as.integer(rnorm(population_size, mean= 60, sd=20)), +# sex = sample(c("female", "male", "intersex", "unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)) +# ) + +## attempt 6 - add a diabetes variable that is not in the ehrql definition +dummy_data <- data.frame( + patient_id = 1:population_size, + registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"), + age = as.integer(rnorm(population_size, mean= 55, sd=20)), + sex = sample(c("female", "male", "intersex", "unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)), + diabetes = rbinom(population_size, 1, 0.1) +) + +# write to compressed csv file +readr::write_csv(dummy_data, here::here("output", "dummy_dataset_1.csv.gz")) + +# write to arrow file +feather::write_feather(dummy_data, here::here("output", "dummy_dataset_1.arrow")) diff --git a/analysis/dummy_definition_2a.R b/analysis/dummy_definition_2a.R new file mode 100644 index 0000000..798bb59 --- /dev/null +++ b/analysis/dummy_definition_2a.R @@ -0,0 +1,37 @@ +library(dplyr) + +set.seed(314159) + +pop_n <- 1000 +index_date <- as.Date("2020-12-08") + +pfizer_name <- "COVID-19 mRNA Vaccine Comirnaty 30micrograms/0.3ml dose conc for susp for inj MDV (Pfizer)" +az_name <- "COVID-19 Vaccine Vaxzevria 0.5ml inj multidose vials (AstraZeneca)" + +dummy_data_nomissing <- tibble( + patient_id = 1:pop_n, + registered = rbinom(pop_n, 1, 0.99)==1, + age = as.integer(rnorm(pop_n, mean= 55, sd=20)), + sex = sample(c("female", "male", "intersex", "unknown"), size = pop_n, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)), + diabetes = rbinom(pop_n, 1, 0.1)==1, + vaccine_date1 = index_date + runif(pop_n, 0, 150), + vaccine_product1 = sample(c(pfizer_name, az_name), size = pop_n, replace=TRUE, prob = c(0.50, 0.50)), + vaccine_date2 = vaccine_date1 + runif(pop_n, 3*7, 16*7), + vaccine_product2 = if_else(runif(pop_n)<0.95, vaccine_product1, "az"), + death_date = index_date + runif(pop_n, 0, 1000) +) + +dummy_data <- + mutate( + dummy_data_nomissing, + sex = if_else(runif(pop_n)<0.01, NA_character_, sex), + vaccine_date1 = if_else(runif(pop_n)<0.2, as.Date(NA), vaccine_date1), + vaccine_product1 = if_else(is.na(vaccine_date1), NA_character_, vaccine_product1), + vaccine_date2 = if_else(runif(pop_n)<0.1 | is.na(vaccine_date1), as.Date(NA), vaccine_date2), + vaccine_product2 = if_else(is.na(vaccine_date2), NA_character_, vaccine_product2), + death_date = if_else(runif(pop_n)<0.95, as.Date(NA), death_date) + ) + +# write to arrow file +arrow::write_feather(dummy_data, sink = here::here("output", "dummy_dataset_2a.arrow")) + diff --git a/analysis/dummy_definition_2b.R b/analysis/dummy_definition_2b.R new file mode 100644 index 0000000..252c7bb --- /dev/null +++ b/analysis/dummy_definition_2b.R @@ -0,0 +1,69 @@ +library(dplyr) +library(dd4d) + +set.seed(314159) + +pop_n <- 1000 +index_date <- as.Date("2020-12-08") + +pfizer_name <- "COVID-19 mRNA Vaccine Comirnaty 30micrograms/0.3ml dose conc for susp for inj MDV (Pfizer)" +az_name <- "COVID-19 Vaccine Vaxzevria 0.5ml inj multidose vials (AstraZeneca)" + + +# define each variable as a node in a bayesian network + +sim_list <- lst( + + registered = bn_node( + ~ rbernoulli(n = ..n, p = 0.99) + ), + age = bn_node( + ~ as.integer(rnorm(..n, mean= 55, sd=20)), + ), + sex = bn_node( + ~ rfactor(n = ..n, levels = c("female", "male", "intersex", "unknown"), p = c(0.50, 0.49, 0, 0.01)), + missing_rate = ~0.01 # this is shorthand for ~(rbernoulli(n=..n, p = 0.2)) + ), + diabetes = bn_node( + ~ rbernoulli(n = ..n, p = 0.10) + ), + vaccine_date1 = bn_node( + ~ runif(n = ..n, 0, 150), + missing_rate = ~0.2, + ), + vaccine_date2 = bn_node( + ~ vaccine_date1 + runif(n = ..n, 3*7, 16*7), + missing_rate = ~0.1, + needs = "vaccine_date1" + ), + vaccine_product1 = bn_node( + ~ rcat(n = ..n, c(pfizer_name, az_name), c(0.5, 0.5)), + needs = "vaccine_date1" + ), + vaccine_product2 = bn_node( + ~ if_else(runif(..n) < 0.95, vaccine_product1, c(az_name)), + needs = "vaccine_date2" + ), + death_date = bn_node( + ~ runif(n = ..n, 0, 1000), + missing_rate = ~0.95 + ) +) + +bn <- bn_create(sim_list,known_variables = c("pfizer_name","az_name")) + +# plot the network +bn_plot(bn) + +# plot the network (connected nodes only) +bn_plot(bn, connected_only = TRUE) + +# simulate the dataset +dummy_data <- bn_simulate(bn, pop_size = pop_n, keep_all = FALSE, .id = "patient_id") + +dummy_data_days_to_dates <- dummy_data %>% + mutate(across(contains("_date"), ~ index_date + .)) + +# write to arrow file +arrow::write_feather(dummy_data_days_to_dates, sink = here::here("output", "dummy_dataset_2b.arrow")) + diff --git a/docs_src/dummy_data_in_r.md b/docs_src/dummy_data_in_r.md index e69de29..68cced4 100644 --- a/docs_src/dummy_data_in_r.md +++ b/docs_src/dummy_data_in_r.md @@ -0,0 +1,25 @@ +### Generate a dummy dataset using a custom R script + + +## Why use custom dummy data? +* Complete control over distribution of variables and dependencies between variables +* Complete control over dataset size +* No ad-hoc adjustments here and there - do it all in one place, once +* ehrQL will check that the custom dataset is "correct" (to an extent) + + +## Challenges +* Maintaining consistency between custom dummy data specification and ehrQL specification as the dataset definition evolves +* Need some understanding of data simulation + +## Why use dd4d instead of base R? +* An API that supports common simulation patterns, such as: + * Specifying dependent missingness + * Flagging latent variables which are needed for simulation but do not appear in the final dataset +* Supports common simulation functions that are awkward in base R (eg `rcat` and `rbernoulli`) +* Supports inspection and visualisation of the specified DAG +* Tests that catch errors early, e.g. testing that the specification isn't cyclic and if so which path is cyclic. +* Variables do not have to be specified in topological order (unlike in a `mutate` step). +* You get all this whilst retaining: + * Support for arbitrary distributions and dependencies + * IDE syntax highlighting / error-checking (unlike if dependencies were defined with strings). diff --git a/project.yaml b/project.yaml index dfbccc7..17ba4c9 100644 --- a/project.yaml +++ b/project.yaml @@ -1,8 +1,52 @@ version: '4.0' actions: - generate_dataset: - run: ehrql:v1 generate-dataset analysis/ehrql_dataset_definition.py --output output/dataset.csv.gz + dummy_dataset_1: + run: r:latest analysis/dummy_definition_1.R outputs: highly_sensitive: - dataset: output/dataset.csv.gz + dataset: output/dummy_dataset_1.csv.gz + + generate_dataset_1: + run: ehrql:v1 generate-dataset analysis/dataset_definition_1.py + --output output/dataset_1.csv.gz + --dummy-data-file output/dummy_dataset_1.csv.gz + needs: [dummy_dataset_1] + outputs: + highly_sensitive: + dataset: output/dataset_1.csv.gz + + dummy_dataset_2a: + run: r:latest analysis/dummy_definition_2a.R + outputs: + highly_sensitive: + dataset: output/dummy_dataset_2a.arrow + + dummy_dataset_2b: + run: r:latest analysis/dummy_definition_2b.R + outputs: + highly_sensitive: + dataset: output/dummy_dataset_2b.arrow + + dummy_dataset_2c: + run: ehrql:v1 generate-dataset analysis/dataset_definition_2.py + --output output/dummy_dataset_2c.arrow + --dummy-data-file output/dummy_dataset_2b.arrow + needs: [dummy_dataset_2b] + outputs: + highly_sensitive: + dataset: output/dummy_dataset_2c.arrow + + generate_dataset_2: + run: ehrql:v1 generate-dataset analysis/dataset_definition_2.py + --output output/dataset_2.arrow + outputs: + highly_sensitive: + dataset: output/dataset_2.arrow + +# compare_dataset_2: +# run: r:latest analysis/compare_definition_2.R +# needs: [dummy_dataset_2a, dummy_dataset_2b, dummy_dataset_2c, generate_dataset_2] +# outputs: +# highly_sensitive: +# csv: output/compare_dataset_2.csv \ No newline at end of file