opensafely · wjchulme · Nov 23, 2024 · Nov 20, 2024 · Nov 21, 2024 · Nov 21, 2024
diff --git a/analysis/compare_definition_2.R b/analysis/compare_definition_2.R
@@ -0,0 +1,92 @@
+library(dplyr)
+library(ggplot2)
+
+# developed using base R
+custom_data_a <- arrow::read_feather(here::here("output", "dummy_dataset_2a.arrow"))
+
+# developed using dd4d
+custom_data_b <- arrow::read_feather(here::here("output", "dummy_dataset_2b.arrow"))
+
+# developed using dd4d then passed through ehrQL for checks
+custom_data_c <- arrow::read_feather(here::here("output", "dummy_dataset_2c.arrow"))
+
+# developed in ehrQL directly
+ehrql_data <- arrow::read_feather(here::here("output", "dataset_2.arrow"))
+
+
+simplify_product_names <- function(data){
+
+  pfizer_name <- "COVID-19 mRNA Vaccine Comirnaty 30micrograms/0.3ml dose conc for susp for inj MDV (Pfizer)"
+  az_name <- "COVID-19 Vaccine Vaxzevria 0.5ml inj multidose vials (AstraZeneca)"
+
+  mutate(
+    data, 
+    across(starts_with("vaccine_product"), 
+      ~{
+        case_when(
+          .x == pfizer_name ~ "pfizer",
+          .x == az_name ~ "az",
+          .default = .x
+        )
+      }
+    )
+  )
+}
+
+product_xtab <- function(data){
+  with(
+    data,
+    table(vaccine_product1, vaccine_product2, useNA='ifany')
+  )
+}
+
+custom_data_a %>%
+  simplify_product_names() %>%
+  product_xtab()
+
+custom_data_b %>%
+  simplify_product_names() %>%
+  product_xtab()
+
+custom_data_c %>%
+  simplify_product_names() %>%
+  product_xtab()
+
+ehrql_data %>%
+  simplify_product_names() %>%
+  with(
+    table(vax1 = vaccine_product1 %in% c("", NA), vax2 = vaccine_product2 %in% c("", NA))
+  )
+
+
+plot_dates <- function(data){
+
+  data %>%
+  simplify_product_names() %>%
+  ggplot() +
+  geom_point(aes(x=vaccine_date1, y = vaccine_date2))+
+  scale_x_date(labels = ~ format(.x, "%Y-%b"))+
+  scale_y_date(labels = ~ format(.x, "%Y-%b"))+
+  theme_bw()
+}
+
+plot_dates(custom_data_c)
+plot_dates(ehrql_data)
+
+
+
+plot_death <- function(data){
+
+  data %>%
+    simplify_product_names() %>%
+    ggplot() +
+    geom_histogram(aes(x=death_date))+
+    scale_x_date(labels = ~ format(.x, "%Y-%b"))+
+    theme_bw()
+}
+
+sum(custom_data_c$death_date<as.Date("2021-06-01"), na.rm=TRUE)
+sum(ehrql_data$death_date<as.Date("2021-06-01"), na.rm=TRUE)
+
+plot_death(custom_data_c)
+plot_death(ehrql_data)
diff --git a/analysis/dataset_definition_1.py b/analysis/dataset_definition_1.py
@@ -0,0 +1,18 @@
+from ehrql import create_dataset
+from ehrql.tables.tpp import patients, practice_registrations
+
+dataset = create_dataset()
+
+index_date = "2020-03-31"
+
+has_registration = practice_registrations.for_patient_on(
+    index_date
+).exists_for_patient()
+
+dataset.define_population(has_registration)
+dataset.configure_dummy_data(population_size=1000)
+
+dataset.registered = has_registration
+dataset.age = patients.age_on(index_date)
+dataset.sex = patients.sex
+
diff --git a/analysis/dataset_definition_2.py b/analysis/dataset_definition_2.py
@@ -0,0 +1,46 @@
+from ehrql import create_dataset
+from ehrql.tables.tpp import patients, practice_registrations, vaccinations, ons_deaths
+
+dataset = create_dataset()
+
+index_date = "2020-12-08"
+
+has_registration = practice_registrations.for_patient_on(
+    index_date
+).exists_for_patient()
+
+alive = (
+  ons_deaths.date.is_on_or_after(index_date) | 
+  ons_deaths.date.is_null()
+)
+
+dataset.define_population(
+  has_registration & 
+  alive
+)
+dataset.configure_dummy_data(population_size=1000)
+
+dataset.registered = has_registration
+dataset.age = patients.age_on(index_date)
+dataset.sex = patients.sex
+
+covid_vaccinations = (
+  vaccinations
+  .where(vaccinations.target_disease.is_in(["SARS-2 CORONAVIRUS"]))
+  .sort_by(vaccinations.date)
+)
+
+covid_vaccinations1 = covid_vaccinations.first_for_patient()
+dataset.vaccine_date1 = covid_vaccinations1.date
+dataset.vaccine_product1 = covid_vaccinations1.product_name
+
+covid_vaccinations2 = (
+    covid_vaccinations.
+    where(covid_vaccinations.date>dataset.vaccine_date1).
+    first_for_patient()
+  )
+dataset.vaccine_date2 = covid_vaccinations2.date
+dataset.vaccine_product2 = covid_vaccinations2.product_name
+
+dataset.death_date = ons_deaths.date
+
diff --git a/analysis/dummy_definition_1.R b/analysis/dummy_definition_1.R
@@ -0,0 +1,60 @@
+
+population_size <- 1000
+
+set.seed(420)
+
+## attempt 1
+# dummy_data <- data.frame(
+#   patient_id = 1:population_size,
+#   age = rnorm(population_size, mean= 60, sd=20),
+#   sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)),
+# )
+
+## attempt 2 - add registered variable
+# dummy_data <- data.frame(
+#   patient_id = 1:population_size,
+#   registered = rbinom(population_size, 1, 0.99)==1,
+#   age = rnorm(population_size, mean= 60, sd=20),
+#   sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)),
+# )
+
+
+## attempt 3 - ensure registered is output as "T" or "F" so is  read by python correctly
+# dummy_data <- data.frame(
+#   patient_id = 1:population_size,
+#   registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"),
+#   age = rnorm(population_size, mean= 60, sd=20),
+#   sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)),
+# )
+
+# ## attempt 4 - fix age so it's an integer
+# dummy_data <- data.frame(
+#   patient_id = 1:population_size,
+#   registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"),
+#   age = as.integer(rnorm(population_size, mean= 60, sd=20)),
+#   sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)),
+# )
+
+
+## attempt 5 - fix sex so it uses the correct levels
+# dummy_data <- data.frame(
+#   patient_id = 1:population_size,
+#   registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"),
+#   age = as.integer(rnorm(population_size,  mean= 60, sd=20)),
+#   sex = sample(c("female", "male", "intersex", "unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01))
+# )
+
+## attempt 6 - add a diabetes variable that is not in the ehrql definition
+dummy_data <- data.frame(
+  patient_id = 1:population_size,
+  registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"),
+  age = as.integer(rnorm(population_size, mean= 55, sd=20)),
+  sex = sample(c("female", "male", "intersex", "unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)),
+  diabetes = rbinom(population_size, 1, 0.1)
+)
+
+# write to compressed csv file
+readr::write_csv(dummy_data, here::here("output", "dummy_dataset_1.csv.gz"))
+
+# write to arrow file
+feather::write_feather(dummy_data, here::here("output", "dummy_dataset_1.arrow"))
diff --git a/analysis/dummy_definition_2a.R b/analysis/dummy_definition_2a.R
@@ -0,0 +1,37 @@
+library(dplyr)
+
+set.seed(314159)
+
+pop_n <- 1000
+index_date <- as.Date("2020-12-08")
+
+pfizer_name <- "COVID-19 mRNA Vaccine Comirnaty 30micrograms/0.3ml dose conc for susp for inj MDV (Pfizer)"
+az_name <- "COVID-19 Vaccine Vaxzevria 0.5ml inj multidose vials (AstraZeneca)"
+
+dummy_data_nomissing <- tibble(
+  patient_id = 1:pop_n,
+  registered = rbinom(pop_n, 1, 0.99)==1,
+  age = as.integer(rnorm(pop_n, mean= 55, sd=20)),
+  sex = sample(c("female", "male", "intersex", "unknown"), size = pop_n, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)),
+  diabetes = rbinom(pop_n, 1, 0.1)==1,
+  vaccine_date1 = index_date + runif(pop_n, 0, 150),
+  vaccine_product1 = sample(c(pfizer_name, az_name), size = pop_n, replace=TRUE, prob = c(0.50, 0.50)),
+  vaccine_date2 = vaccine_date1 + runif(pop_n, 3*7, 16*7),
+  vaccine_product2 = if_else(runif(pop_n)<0.95, vaccine_product1, "az"),
+  death_date = index_date + runif(pop_n, 0, 1000)
+)
+
+dummy_data <- 
+  mutate(
+    dummy_data_nomissing,
+    sex = if_else(runif(pop_n)<0.01, NA_character_, sex),
+    vaccine_date1 = if_else(runif(pop_n)<0.2, as.Date(NA), vaccine_date1),
+    vaccine_product1 = if_else(is.na(vaccine_date1), NA_character_, vaccine_product1),
+    vaccine_date2 = if_else(runif(pop_n)<0.1 | is.na(vaccine_date1), as.Date(NA), vaccine_date2),
+    vaccine_product2 = if_else(is.na(vaccine_date2), NA_character_, vaccine_product2),
+    death_date = if_else(runif(pop_n)<0.95, as.Date(NA), death_date)
+  )
+
+# write to arrow file
+arrow::write_feather(dummy_data, sink = here::here("output", "dummy_dataset_2a.arrow"))
+
diff --git a/analysis/dummy_definition_2b.R b/analysis/dummy_definition_2b.R
@@ -0,0 +1,69 @@
+library(dplyr)
+library(dd4d)
+
+set.seed(314159)
+
+pop_n <- 1000
+index_date <- as.Date("2020-12-08")
+
+pfizer_name <- "COVID-19 mRNA Vaccine Comirnaty 30micrograms/0.3ml dose conc for susp for inj MDV (Pfizer)"
+az_name <- "COVID-19 Vaccine Vaxzevria 0.5ml inj multidose vials (AstraZeneca)"
+
+
+# define each variable as a node in a bayesian network
+
+sim_list <- lst(
+
+  registered = bn_node(
+    ~ rbernoulli(n = ..n, p = 0.99)
+  ),
+  age = bn_node(
+    ~ as.integer(rnorm(..n, mean= 55, sd=20)),
+  ),
+  sex = bn_node(
+    ~ rfactor(n = ..n, levels = c("female", "male", "intersex", "unknown"), p = c(0.50, 0.49, 0, 0.01)),
+    missing_rate = ~0.01 # this is shorthand for ~(rbernoulli(n=..n, p = 0.2))
+  ),
+  diabetes = bn_node(
+    ~ rbernoulli(n = ..n, p = 0.10)
+  ),
+  vaccine_date1 = bn_node(
+    ~ runif(n = ..n, 0, 150),
+    missing_rate = ~0.2,
+  ),
+  vaccine_date2 = bn_node(
+    ~ vaccine_date1 + runif(n = ..n, 3*7, 16*7),
+    missing_rate = ~0.1,
+    needs = "vaccine_date1"
+  ),
+  vaccine_product1 = bn_node(
+    ~ rcat(n = ..n, c(pfizer_name, az_name), c(0.5, 0.5)), 
+    needs = "vaccine_date1"
+  ),
+  vaccine_product2 = bn_node(
+    ~ if_else(runif(..n) < 0.95, vaccine_product1, c(az_name)), 
+    needs = "vaccine_date2"
+  ),
+  death_date = bn_node(
+    ~ runif(n = ..n,  0, 1000),
+    missing_rate = ~0.95
+  )
+)
+
+bn <- bn_create(sim_list,known_variables = c("pfizer_name","az_name"))
+
+# plot the network
+bn_plot(bn)
+
+# plot the network (connected nodes only)
+bn_plot(bn, connected_only = TRUE)
+
+# simulate the dataset
+dummy_data <- bn_simulate(bn, pop_size = pop_n, keep_all = FALSE, .id = "patient_id")
+
+dummy_data_days_to_dates <- dummy_data %>%
+  mutate(across(contains("_date"), ~ index_date + .)) 
+
+# write to arrow file
+arrow::write_feather(dummy_data_days_to_dates, sink = here::here("output", "dummy_dataset_2b.arrow"))
+
diff --git a/docs_src/dummy_data_in_r.md b/docs_src/dummy_data_in_r.md
@@ -0,0 +1,25 @@
+### Generate a dummy dataset using a custom R script
+
+
+## Why use custom dummy data?
+* Complete control over distribution of variables and dependencies between variables
+* Complete control over dataset size
+* No ad-hoc adjustments here and there - do it all in one place, once
+* ehrQL will check that the custom dataset is "correct" (to an extent)
+
+
+## Challenges
+* Maintaining consistency between custom dummy data specification and ehrQL specification as the dataset definition evolves
+* Need some understanding of data simulation
+
+## Why use dd4d instead of base R?
+* An API that supports common simulation patterns, such as:
+    * Specifying dependent missingness
+    * Flagging latent variables which are needed for simulation but do not appear in the final dataset
+* Supports common simulation functions that are awkward in base R (eg `rcat` and `rbernoulli`)
+* Supports inspection and visualisation of the specified DAG
+* Tests that catch errors early, e.g. testing that the specification isn't cyclic and if so which path is cyclic.
+* Variables do not have to be specified in topological order (unlike in a `mutate` step).
+* You get all this whilst retaining:
+    * Support for arbitrary distributions and dependencies
+    * IDE syntax highlighting / error-checking (unlike if dependencies were defined with strings).