From a36c7e28e83375c551c9bed4e2c41ccc48edf8c4 Mon Sep 17 00:00:00 2001
From: Will Hulme <25637345+wjchulme@users.noreply.github.com>
Date: Wed, 20 Nov 2024 22:32:18 +0000
Subject: [PATCH 1/4] add example actions

---
 analysis/compare_definition_2.R  | 10 +++++
 analysis/dataset_definition_1.py | 18 +++++++++
 analysis/dataset_definition_2.py | 36 ++++++++++++++++++
 analysis/dummy_definition_1.R    | 60 +++++++++++++++++++++++++++++
 analysis/dummy_definition_2a.R   | 35 +++++++++++++++++
 analysis/dummy_definition_2b.R   | 65 ++++++++++++++++++++++++++++++++
 project.yaml                     | 46 ++++++++++++++++++++--
 7 files changed, 267 insertions(+), 3 deletions(-)
 create mode 100644 analysis/compare_definition_2.R
 create mode 100644 analysis/dataset_definition_1.py
 create mode 100644 analysis/dataset_definition_2.py
 create mode 100644 analysis/dummy_definition_1.R
 create mode 100644 analysis/dummy_definition_2a.R
 create mode 100644 analysis/dummy_definition_2b.R

diff --git a/analysis/compare_definition_2.R b/analysis/compare_definition_2.R
new file mode 100644
index 0000000..7d24023
--- /dev/null
+++ b/analysis/compare_definition_2.R
@@ -0,0 +1,10 @@
+
+custom_data_a <- feather::read_feather(here::here("output", "dummy_dataset_2a.arrow"))
+custom_data_b <- feather::read_feather(here::here("output", "dummy_dataset_2b.arrow"))
+
+ehrql_data <- readr::read_csv(here::here("output", "dataset_2.csv.gz"))
+ehrql_data <- feather::read_feather(here::here("output", "dataset_2.arrow"))
+
+
+
+
diff --git a/analysis/dataset_definition_1.py b/analysis/dataset_definition_1.py
new file mode 100644
index 0000000..fc80272
--- /dev/null
+++ b/analysis/dataset_definition_1.py
@@ -0,0 +1,18 @@
+from ehrql import create_dataset
+from ehrql.tables.tpp import patients, practice_registrations
+
+dataset = create_dataset()
+
+index_date = "2020-03-31"
+
+has_registration = practice_registrations.for_patient_on(
+    index_date
+).exists_for_patient()
+
+dataset.define_population(has_registration)
+dataset.configure_dummy_data(population_size=1000)
+
+dataset.registered = has_registration
+dataset.age = patients.age_on(index_date)
+dataset.sex = patients.sex
+
diff --git a/analysis/dataset_definition_2.py b/analysis/dataset_definition_2.py
new file mode 100644
index 0000000..938c47b
--- /dev/null
+++ b/analysis/dataset_definition_2.py
@@ -0,0 +1,36 @@
+from ehrql import create_dataset
+from ehrql.tables.tpp import patients, practice_registrations, vaccinations
+
+dataset = create_dataset()
+
+index_date = "2020-12-08"
+
+has_registration = practice_registrations.for_patient_on(
+    index_date
+).exists_for_patient()
+
+dataset.define_population(has_registration)
+dataset.configure_dummy_data(population_size=1000)
+
+dataset.registered = has_registration
+dataset.age = patients.age_on(index_date)
+dataset.sex = patients.sex
+
+
+covid_vaccinations = (
+  vaccinations
+  .where(vaccinations.target_disease.is_in(["SARS-2 CORONAVIRUS"]))
+  .sort_by(vaccinations.date)
+)
+
+covid_vaccinations1 = covid_vaccinations.first_for_patient()
+dataset.vaccine_date1 = covid_vaccinations1.date
+dataset.vaccine_product1 = covid_vaccinations1.product_name
+
+covid_vaccinations2 = (
+    covid_vaccinations.
+    where(covid_vaccinations.date>dataset.vaccine_date1).
+    first_for_patient()
+  )
+dataset.vaccine_date2 = covid_vaccinations2.date
+dataset.vaccine_product2 = covid_vaccinations2.product_name
diff --git a/analysis/dummy_definition_1.R b/analysis/dummy_definition_1.R
new file mode 100644
index 0000000..684012e
--- /dev/null
+++ b/analysis/dummy_definition_1.R
@@ -0,0 +1,60 @@
+
+population_size <- 1000
+
+set.seed(420)
+
+## attempt 1
+# dummy_data <- data.frame(
+#   patient_id = 1:population_size,
+#   age = rnorm(population_size, mean= 60, sd=20),
+#   sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)),
+# )
+
+## attempt 2 - add registered variable
+# dummy_data <- data.frame(
+#   patient_id = 1:population_size,
+#   registered = rbinom(population_size, 1, 0.99)==1,
+#   age = rnorm(population_size, mean= 60, sd=20),
+#   sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)),
+# )
+
+
+## attempt 3 - ensure registered is output as "T" or "F" so is  read by python correctly
+# dummy_data <- data.frame(
+#   patient_id = 1:population_size,
+#   registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"),
+#   age = rnorm(population_size, mean= 60, sd=20),
+#   sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)),
+# )
+
+# ## attempt 4 - fix age so it's an integer
+# dummy_data <- data.frame(
+#   patient_id = 1:population_size,
+#   registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"),
+#   age = as.integer(rnorm(population_size, mean= 60, sd=20)),
+#   sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)),
+# )
+
+
+## attempt 5 - fix sex so it uses the correct levels
+# dummy_data <- data.frame(
+#   patient_id = 1:population_size,
+#   registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"),
+#   age = as.integer(rnorm(population_size,  mean= 60, sd=20)),
+#   sex = sample(c("female", "male", "intersex", "unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01))
+# )
+
+## attempt 6 - add a diabetes variable that is not in the ehrql definition
+dummy_data <- data.frame(
+  patient_id = 1:population_size,
+  registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"),
+  age = as.integer(rnorm(population_size, mean= 55, sd=20)),
+  sex = sample(c("female", "male", "intersex", "unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)),
+  diabetes = rbinom(population_size, 1, 0.1)
+)
+
+# write to compressed csv file
+readr::write_csv(dummy_data, here::here("output", "dummy_dataset_1.csv.gz"))
+
+# write to arrow file
+# feather::write_feather(dummy_data, here::here("output", "dummy_dataset_1.arrow"))
diff --git a/analysis/dummy_definition_2a.R b/analysis/dummy_definition_2a.R
new file mode 100644
index 0000000..a727396
--- /dev/null
+++ b/analysis/dummy_definition_2a.R
@@ -0,0 +1,35 @@
+library(dplyr)
+
+set.seed(314159)
+
+pop_n <- 1000
+index_date <- as.Date("2020-12-08")
+
+pfizer_name <- "COVID-19 mRNA Vaccine Comirnaty 30micrograms/0.3ml dose conc for susp for inj MDV (Pfizer)"
+az_name <- "COVID-19 Vaccine Vaxzevria 0.5ml inj multidose vials (AstraZeneca)"
+
+dummy_data_nomissing <- tibble(
+  patient_id = 1:pop_n,
+  registered = rbinom(pop_n, 1, 0.99)==1,
+  age = as.integer(rnorm(pop_n, mean= 55, sd=20)),
+  sex = sample(c("female", "male", "intersex", "unknown"), size = pop_n, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)),
+  diabetes = rbinom(pop_n, 1, 0.1)==1,
+  vaccine_date1 = index_date + runif(pop_n, 0, 150),
+  vaccine_product1 = sample(c(pfizer_name, az_name), size = pop_n, replace=TRUE, prob = c(0.50, 0.50)),
+  vaccine_date2 = vaccine_date1 + runif(pop_n, 3*7, 16*7),
+  vaccine_product2 = if_else(runif(pop_n)<0.95, vaccine_product1, "az"),
+)
+
+dummy_data <- 
+  mutate(
+    dummy_data_nomissing,
+    vaccine_date1 = if_else(runif(pop_n)<0.2, as.Date(NA), vaccine_date1),
+    vaccine_product1 = if_else(is.na(vaccine_date1), NA_character_, vaccine_product1),
+    vaccine_date2 = if_else(runif(pop_n)<0.1 | is.na(vaccine_date1), as.Date(NA), vaccine_date2),
+    vaccine_product2 = if_else(is.na(vaccine_date2), NA_character_, vaccine_product2)
+  )
+
+
+# write to arrow file
+feather::write_feather(dummy_data, path = here::here("output", "dummy_dataset_2a.arrow"))
+
diff --git a/analysis/dummy_definition_2b.R b/analysis/dummy_definition_2b.R
new file mode 100644
index 0000000..59c5599
--- /dev/null
+++ b/analysis/dummy_definition_2b.R
@@ -0,0 +1,65 @@
+library(dplyr)
+library(dd4d)
+
+set.seed(314159)
+
+pop_n <- 1000
+index_date <- as.Date("2020-12-08")
+
+pfizer_name <- "COVID-19 mRNA Vaccine Comirnaty 30micrograms/0.3ml dose conc for susp for inj MDV (Pfizer)"
+az_name <- "COVID-19 Vaccine Vaxzevria 0.5ml inj multidose vials (AstraZeneca)"
+
+
+# define each variable as a node in a bayesian network
+
+sim_list_vax_info <- lst(
+  
+  registered = bn_node(
+    ~ rbernoulli(n = ..n, p = 0.99)
+  ),
+  
+  age = bn_node(
+    ~ as.integer(rnorm(pop_n, mean= 55, sd=20)),
+  ),
+  
+  sex = bn_node(
+    ~ rfactor(n = ..n, levels = c("female", "male", "intersex", "unknown"), p = c(0.50, 0.49, 0, 0.01)),
+    missing_rate = ~0.001 # this is shorthand for ~(rbernoulli(n=..n, p = 0.2))
+  ),
+  
+  diabetes = bn_node(
+    ~ rbernoulli(n = ..n, p = 0.10)
+  ),
+
+  vaccine_date1 = bn_node(
+    ~ index_date + runif(n = ..n, 0, 150),
+    missing_rate = ~0.2,
+  ),
+  vaccine_date2 = bn_node(
+    ~ covid_vax_1_day + runif(n = ..n, 3*7, 16*7),
+    missing_rate = ~0.001,
+    needs = "vaccine_date2"
+  ),
+  
+  vaccine_product1 = bn_node(~ rcat(n = ..n, c("pfizer", "az"), c(0.5, 0.5)), needs = "vaccine_date1"),
+  vaccine_product2 = bn_node(~ if_else(runif(..n) < 0.95, vaccine_product1, "az"), needs = "vaccine_date2"),
+
+)
+
+bn <- bn_create(sim_list, known_variables = known_variables)
+
+# plot the network
+bn_plot(bn)
+
+# plot the network (connected nodes only)
+bn_plot(bn, connected_only = TRUE)
+
+
+# simulate the dataset
+dummydata <- bn_simulate(bn, pop_size = population_size, keep_all = FALSE, .id = "patient_id")
+
+
+
+# write to arrow file
+feather::write_feather(dummy_data, path = here::here("output", "dummy_dataset_2b.arrow"))
+
diff --git a/project.yaml b/project.yaml
index b941ab8..2989462 100644
--- a/project.yaml
+++ b/project.yaml
@@ -1,8 +1,48 @@
 version: '4.0'
 
 actions:
-  generate_dataset:
-    run: ehrql:v1 generate-dataset analysis/dataset_definition.py --output output/dataset.csv.gz
+  #generate_dataset:
+  #  run: ehrql:v1 generate-dataset analysis/dataset_definition.py --output output/dataset.csv.gz
+  #  outputs:
+  #    highly_sensitive:
+  #      dataset: output/dataset.csv.gz
+
+
+
+  dummy_dataset_1:
+    run: r:latest analysis/dummy_definition_1.R
+    outputs:
+      highly_sensitive:
+        dataset: output/dummy_dataset_1.csv.gz
+
+  generate_dataset_1:
+    run: ehrql:v1 generate-dataset analysis/dataset_definition_1.py 
+      --output output/dataset_1.csv.gz
+      --dummy-data-file output/dummy_dataset_1.csv.gz
+    needs: [dummy_dataset_1]
     outputs:
       highly_sensitive:
-        dataset: output/dataset.csv.gz
+        dataset: output/dataset_1.csv.gz
+  
+  compare_dataset_1:
+    run: r:latest analysis/compare_definition_1.R
+    needs: [dummy_dataset_1, generate_dataset_1]
+    outputs:
+      highly_sensitive:
+        csv: output/comparison_1.csv
+
+  dummy_dataset_2:
+   run: r:latest analysis/dummy_definition_2.R
+   outputs:
+     highly_sensitive:
+       dataset: output/dummy_dataset_2.arrow
+
+  generate_dataset_2:
+   run: ehrql:v1 generate-dataset analysis/dataset_definition_2.py
+     --output output/dataset_2.arrow
+  #   --dummy-data-file output/dummy_dataset_2.arrow
+  # needs: [dummy_dataset_2]
+   outputs:
+     highly_sensitive:
+       dataset: output/dataset_2.arrow
+

From 46ee2c15ee77ebd98a3fb63890a2899664a6dd0b Mon Sep 17 00:00:00 2001
From: Will Hulme <25637345+wjchulme@users.noreply.github.com>
Date: Thu, 21 Nov 2024 09:25:27 +0000
Subject: [PATCH 2/4] fix dummy spec

---
 analysis/compare_definition_2.R |  3 ---
 analysis/dummy_definition_1.R   |  2 +-
 analysis/dummy_definition_2b.R  | 11 +++++------
 project.yaml                    | 21 ++++++++++-----------
 4 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/analysis/compare_definition_2.R b/analysis/compare_definition_2.R
index 7d24023..af2afba 100644
--- a/analysis/compare_definition_2.R
+++ b/analysis/compare_definition_2.R
@@ -5,6 +5,3 @@ custom_data_b <- feather::read_feather(here::here("output", "dummy_dataset_2b.ar
 ehrql_data <- readr::read_csv(here::here("output", "dataset_2.csv.gz"))
 ehrql_data <- feather::read_feather(here::here("output", "dataset_2.arrow"))
 
-
-
-
diff --git a/analysis/dummy_definition_1.R b/analysis/dummy_definition_1.R
index 684012e..e7c5c70 100644
--- a/analysis/dummy_definition_1.R
+++ b/analysis/dummy_definition_1.R
@@ -57,4 +57,4 @@ dummy_data <- data.frame(
 readr::write_csv(dummy_data, here::here("output", "dummy_dataset_1.csv.gz"))
 
 # write to arrow file
-# feather::write_feather(dummy_data, here::here("output", "dummy_dataset_1.arrow"))
+feather::write_feather(dummy_data, here::here("output", "dummy_dataset_1.arrow"))
diff --git a/analysis/dummy_definition_2b.R b/analysis/dummy_definition_2b.R
index 59c5599..2607e30 100644
--- a/analysis/dummy_definition_2b.R
+++ b/analysis/dummy_definition_2b.R
@@ -12,14 +12,14 @@ az_name <- "COVID-19 Vaccine Vaxzevria 0.5ml inj multidose vials (AstraZeneca)"
 
 # define each variable as a node in a bayesian network
 
-sim_list_vax_info <- lst(
+sim_list <- lst(
   
   registered = bn_node(
     ~ rbernoulli(n = ..n, p = 0.99)
   ),
   
   age = bn_node(
-    ~ as.integer(rnorm(pop_n, mean= 55, sd=20)),
+    ~ as.integer(rnorm(..n, mean= 55, sd=20)),
   ),
   
   sex = bn_node(
@@ -36,7 +36,7 @@ sim_list_vax_info <- lst(
     missing_rate = ~0.2,
   ),
   vaccine_date2 = bn_node(
-    ~ covid_vax_1_day + runif(n = ..n, 3*7, 16*7),
+    ~ vaccine_date1 + runif(n = ..n, 3*7, 16*7),
     missing_rate = ~0.001,
     needs = "vaccine_date2"
   ),
@@ -46,7 +46,7 @@ sim_list_vax_info <- lst(
 
 )
 
-bn <- bn_create(sim_list, known_variables = known_variables)
+bn <- bn_create(sim_list, known_variables = c("index_date"))
 
 # plot the network
 bn_plot(bn)
@@ -54,9 +54,8 @@ bn_plot(bn)
 # plot the network (connected nodes only)
 bn_plot(bn, connected_only = TRUE)
 
-
 # simulate the dataset
-dummydata <- bn_simulate(bn, pop_size = population_size, keep_all = FALSE, .id = "patient_id")
+dummydata <- bn_simulate(bn, pop_size = pop_n, keep_all = FALSE, .id = "patient_id")
 
 
 
diff --git a/project.yaml b/project.yaml
index 2989462..9973bc6 100644
--- a/project.yaml
+++ b/project.yaml
@@ -24,24 +24,23 @@ actions:
       highly_sensitive:
         dataset: output/dataset_1.csv.gz
   
-  compare_dataset_1:
-    run: r:latest analysis/compare_definition_1.R
-    needs: [dummy_dataset_1, generate_dataset_1]
-    outputs:
-      highly_sensitive:
-        csv: output/comparison_1.csv
+  dummy_dataset_2a:
+   run: r:latest analysis/dummy_definition_2a.R
+   outputs:
+     highly_sensitive:
+       dataset: output/dummy_dataset_2a.arrow
 
-  dummy_dataset_2:
-   run: r:latest analysis/dummy_definition_2.R
+  dummy_dataset_2b:
+   run: r:latest analysis/dummy_definition_2b.R
    outputs:
      highly_sensitive:
-       dataset: output/dummy_dataset_2.arrow
+       dataset: output/dummy_dataset_2b.arrow
 
   generate_dataset_2:
    run: ehrql:v1 generate-dataset analysis/dataset_definition_2.py
      --output output/dataset_2.arrow
-  #   --dummy-data-file output/dummy_dataset_2.arrow
-  # needs: [dummy_dataset_2]
+  #   --dummy-data-file output/dummy_dataset_2b.arrow
+  # needs: [dummy_dataset_2b]
    outputs:
      highly_sensitive:
        dataset: output/dataset_2.arrow

From 3ad530da236105d0b737a3aa86fe12f01e41911f Mon Sep 17 00:00:00 2001
From: Will Hulme <25637345+wjchulme@users.noreply.github.com>
Date: Thu, 21 Nov 2024 11:10:40 +0000
Subject: [PATCH 3/4] fix dates, demo feather issue

---
 analysis/compare_definition_2.R | 11 ++++++++++-
 analysis/dummy_definition_2b.R  | 24 +++++++++++++-----------
 project.yaml                    | 30 ++++++++++++++++++------------
 3 files changed, 41 insertions(+), 24 deletions(-)

diff --git a/analysis/compare_definition_2.R b/analysis/compare_definition_2.R
index af2afba..817d363 100644
--- a/analysis/compare_definition_2.R
+++ b/analysis/compare_definition_2.R
@@ -1,7 +1,16 @@
+library(dplyr)
 
 custom_data_a <- feather::read_feather(here::here("output", "dummy_dataset_2a.arrow"))
 custom_data_b <- feather::read_feather(here::here("output", "dummy_dataset_2b.arrow"))
 
-ehrql_data <- readr::read_csv(here::here("output", "dataset_2.csv.gz"))
+#ehrql_data <- readr::read_csv(here::here("output", "dataset_2.csv.gz"))
 ehrql_data <- feather::read_feather(here::here("output", "dataset_2.arrow"))
 
+
+# just testing yaml for now...
+custom_data_b %>%
+  summarise(
+    n=n(),
+    vaccine_date1_missing = sum(is.na(vaccine_date1))
+  ) %>%
+  readr::write_csv(here::here("output", "compare_dataset_2.csv"))
diff --git a/analysis/dummy_definition_2b.R b/analysis/dummy_definition_2b.R
index 2607e30..c1b5b33 100644
--- a/analysis/dummy_definition_2b.R
+++ b/analysis/dummy_definition_2b.R
@@ -31,22 +31,22 @@ sim_list <- lst(
     ~ rbernoulli(n = ..n, p = 0.10)
   ),
 
-  vaccine_date1 = bn_node(
-    ~ index_date + runif(n = ..n, 0, 150),
+  vaccine_day1 = bn_node(
+    ~ runif(n = ..n, 0, 150),
     missing_rate = ~0.2,
   ),
-  vaccine_date2 = bn_node(
-    ~ vaccine_date1 + runif(n = ..n, 3*7, 16*7),
+  vaccine_day2 = bn_node(
+    ~ vaccine_day1 + runif(n = ..n, 3*7, 16*7),
     missing_rate = ~0.001,
-    needs = "vaccine_date2"
+    needs = "vaccine_day2"
   ),
   
-  vaccine_product1 = bn_node(~ rcat(n = ..n, c("pfizer", "az"), c(0.5, 0.5)), needs = "vaccine_date1"),
-  vaccine_product2 = bn_node(~ if_else(runif(..n) < 0.95, vaccine_product1, "az"), needs = "vaccine_date2"),
+  vaccine_product1 = bn_node(~ rcat(n = ..n, c("pfizer", "az"), c(0.5, 0.5)), needs = "vaccine_day1"),
+  vaccine_product2 = bn_node(~ if_else(runif(..n) < 0.95, vaccine_product1, "az"), needs = "vaccine_day2"),
 
 )
 
-bn <- bn_create(sim_list, known_variables = c("index_date"))
+bn <- bn_create(sim_list)
 
 # plot the network
 bn_plot(bn)
@@ -55,10 +55,12 @@ bn_plot(bn)
 bn_plot(bn, connected_only = TRUE)
 
 # simulate the dataset
-dummydata <- bn_simulate(bn, pop_size = pop_n, keep_all = FALSE, .id = "patient_id")
-
+dummy_data <- bn_simulate(bn, pop_size = pop_n, keep_all = FALSE, .id = "patient_id")
 
+dummy_data_days_to_dates <- dummy_data %>%
+  mutate(across(num_range("vaccine_day", 1:2), ~ index_date + .)) %>%
+  rename_with(~ stringr::str_replace(., "_day", "_date"), starts_with("vaccine_day"))
 
 # write to arrow file
-feather::write_feather(dummy_data, path = here::here("output", "dummy_dataset_2b.arrow"))
+feather::write_feather(dummy_data_days_to_dates, path = here::here("output", "dummy_dataset_2b.arrow"))
 
diff --git a/project.yaml b/project.yaml
index 9973bc6..21a975d 100644
--- a/project.yaml
+++ b/project.yaml
@@ -25,23 +25,29 @@ actions:
         dataset: output/dataset_1.csv.gz
   
   dummy_dataset_2a:
-   run: r:latest analysis/dummy_definition_2a.R
-   outputs:
-     highly_sensitive:
-       dataset: output/dummy_dataset_2a.arrow
+    run: r:latest analysis/dummy_definition_2a.R
+    outputs:
+      highly_sensitive:
+        dataset: output/dummy_dataset_2a.arrow
 
   dummy_dataset_2b:
-   run: r:latest analysis/dummy_definition_2b.R
-   outputs:
-     highly_sensitive:
-       dataset: output/dummy_dataset_2b.arrow
+    run: r:latest analysis/dummy_definition_2b.R
+    outputs:
+      highly_sensitive:
+        dataset: output/dummy_dataset_2b.arrow
 
   generate_dataset_2:
-   run: ehrql:v1 generate-dataset analysis/dataset_definition_2.py
+    run: ehrql:v1 generate-dataset analysis/dataset_definition_2.py
      --output output/dataset_2.arrow
   #   --dummy-data-file output/dummy_dataset_2b.arrow
   # needs: [dummy_dataset_2b]
-   outputs:
-     highly_sensitive:
-       dataset: output/dataset_2.arrow
+    outputs:
+      highly_sensitive:
+        dataset: output/dataset_2.arrow
 
+  compare_dataset_2:
+    run: r:latest analysis/compare_definition_2.R
+    needs: [dummy_dataset_2a, dummy_dataset_2b, generate_dataset_2]
+    outputs:
+      highly_sensitive:
+        csv: output/compare_dataset_2.csv
\ No newline at end of file

From 723465f1c144cf6c4e79451205b15dfab2d35ebe Mon Sep 17 00:00:00 2001
From: Will Hulme <25637345+wjchulme@users.noreply.github.com>
Date: Sat, 23 Nov 2024 12:05:36 +0000
Subject: [PATCH 4/4] fix arrow; add death

---
 analysis/compare_definition_2.R  | 96 ++++++++++++++++++++++++++++----
 analysis/dataset_definition_2.py | 16 +++++-
 analysis/dummy_definition_2a.R   |  8 ++-
 analysis/dummy_definition_2b.R   | 39 +++++++------
 docs_src/dummy_data_in_r.md      | 25 +++++++++
 project.yaml                     | 25 ++++++---
 6 files changed, 166 insertions(+), 43 deletions(-)

diff --git a/analysis/compare_definition_2.R b/analysis/compare_definition_2.R
index 817d363..11af7c2 100644
--- a/analysis/compare_definition_2.R
+++ b/analysis/compare_definition_2.R
@@ -1,16 +1,92 @@
 library(dplyr)
+library(ggplot2)
 
-custom_data_a <- feather::read_feather(here::here("output", "dummy_dataset_2a.arrow"))
-custom_data_b <- feather::read_feather(here::here("output", "dummy_dataset_2b.arrow"))
+# developed using base R
+custom_data_a <- arrow::read_feather(here::here("output", "dummy_dataset_2a.arrow"))
 
-#ehrql_data <- readr::read_csv(here::here("output", "dataset_2.csv.gz"))
-ehrql_data <- feather::read_feather(here::here("output", "dataset_2.arrow"))
+# developed using dd4d
+custom_data_b <- arrow::read_feather(here::here("output", "dummy_dataset_2b.arrow"))
 
+# developed using dd4d then passed through ehrQL for checks
+custom_data_c <- arrow::read_feather(here::here("output", "dummy_dataset_2c.arrow"))
+
+# developed in ehrQL directly
+ehrql_data <- arrow::read_feather(here::here("output", "dataset_2.arrow"))
+
+
+simplify_product_names <- function(data){
+  
+  pfizer_name <- "COVID-19 mRNA Vaccine Comirnaty 30micrograms/0.3ml dose conc for susp for inj MDV (Pfizer)"
+  az_name <- "COVID-19 Vaccine Vaxzevria 0.5ml inj multidose vials (AstraZeneca)"
+  
+  mutate(
+    data, 
+    across(starts_with("vaccine_product"), 
+      ~{
+        case_when(
+          .x == pfizer_name ~ "pfizer",
+          .x == az_name ~ "az",
+          .default = .x
+        )
+      }
+    )
+  )
+}
+
+product_xtab <- function(data){
+  with(
+    data,
+    table(vaccine_product1, vaccine_product2, useNA='ifany')
+  )
+}
+
+custom_data_a %>%
+  simplify_product_names() %>%
+  product_xtab()
 
-# just testing yaml for now...
 custom_data_b %>%
-  summarise(
-    n=n(),
-    vaccine_date1_missing = sum(is.na(vaccine_date1))
-  ) %>%
-  readr::write_csv(here::here("output", "compare_dataset_2.csv"))
+  simplify_product_names() %>%
+  product_xtab()
+
+custom_data_c %>%
+  simplify_product_names() %>%
+  product_xtab()
+
+ehrql_data %>%
+  simplify_product_names() %>%
+  with(
+    table(vax1 = vaccine_product1 %in% c("", NA), vax2 = vaccine_product2 %in% c("", NA))
+  )
+
+
+plot_dates <- function(data){
+  
+  data %>%
+  simplify_product_names() %>%
+  ggplot() +
+  geom_point(aes(x=vaccine_date1, y = vaccine_date2))+
+  scale_x_date(labels = ~ format(.x, "%Y-%b"))+
+  scale_y_date(labels = ~ format(.x, "%Y-%b"))+
+  theme_bw()
+}
+
+plot_dates(custom_data_c)
+plot_dates(ehrql_data)
+
+
+
+plot_death <- function(data){
+  
+  data %>%
+    simplify_product_names() %>%
+    ggplot() +
+    geom_histogram(aes(x=death_date))+
+    scale_x_date(labels = ~ format(.x, "%Y-%b"))+
+    theme_bw()
+}
+
+sum(custom_data_c$death_date<as.Date("2021-06-01"), na.rm=TRUE)
+sum(ehrql_data$death_date<as.Date("2021-06-01"), na.rm=TRUE)
+
+plot_death(custom_data_c)
+plot_death(ehrql_data)
diff --git a/analysis/dataset_definition_2.py b/analysis/dataset_definition_2.py
index 938c47b..1a5bf54 100644
--- a/analysis/dataset_definition_2.py
+++ b/analysis/dataset_definition_2.py
@@ -1,5 +1,5 @@
 from ehrql import create_dataset
-from ehrql.tables.tpp import patients, practice_registrations, vaccinations
+from ehrql.tables.tpp import patients, practice_registrations, vaccinations, ons_deaths
 
 dataset = create_dataset()
 
@@ -9,14 +9,21 @@
     index_date
 ).exists_for_patient()
 
-dataset.define_population(has_registration)
+alive = (
+  ons_deaths.date.is_on_or_after(index_date) | 
+  ons_deaths.date.is_null()
+)
+
+dataset.define_population(
+  has_registration & 
+  alive
+)
 dataset.configure_dummy_data(population_size=1000)
 
 dataset.registered = has_registration
 dataset.age = patients.age_on(index_date)
 dataset.sex = patients.sex
 
-
 covid_vaccinations = (
   vaccinations
   .where(vaccinations.target_disease.is_in(["SARS-2 CORONAVIRUS"]))
@@ -34,3 +41,6 @@
   )
 dataset.vaccine_date2 = covid_vaccinations2.date
 dataset.vaccine_product2 = covid_vaccinations2.product_name
+
+dataset.death_date = ons_deaths.date
+
diff --git a/analysis/dummy_definition_2a.R b/analysis/dummy_definition_2a.R
index a727396..798bb59 100644
--- a/analysis/dummy_definition_2a.R
+++ b/analysis/dummy_definition_2a.R
@@ -18,18 +18,20 @@ dummy_data_nomissing <- tibble(
   vaccine_product1 = sample(c(pfizer_name, az_name), size = pop_n, replace=TRUE, prob = c(0.50, 0.50)),
   vaccine_date2 = vaccine_date1 + runif(pop_n, 3*7, 16*7),
   vaccine_product2 = if_else(runif(pop_n)<0.95, vaccine_product1, "az"),
+  death_date = index_date + runif(pop_n, 0, 1000)
 )
 
 dummy_data <- 
   mutate(
     dummy_data_nomissing,
+    sex = if_else(runif(pop_n)<0.01, NA_character_, sex),
     vaccine_date1 = if_else(runif(pop_n)<0.2, as.Date(NA), vaccine_date1),
     vaccine_product1 = if_else(is.na(vaccine_date1), NA_character_, vaccine_product1),
     vaccine_date2 = if_else(runif(pop_n)<0.1 | is.na(vaccine_date1), as.Date(NA), vaccine_date2),
-    vaccine_product2 = if_else(is.na(vaccine_date2), NA_character_, vaccine_product2)
+    vaccine_product2 = if_else(is.na(vaccine_date2), NA_character_, vaccine_product2),
+    death_date = if_else(runif(pop_n)<0.95, as.Date(NA), death_date)
   )
 
-
 # write to arrow file
-feather::write_feather(dummy_data, path = here::here("output", "dummy_dataset_2a.arrow"))
+arrow::write_feather(dummy_data, sink = here::here("output", "dummy_dataset_2a.arrow"))
 
diff --git a/analysis/dummy_definition_2b.R b/analysis/dummy_definition_2b.R
index c1b5b33..252c7bb 100644
--- a/analysis/dummy_definition_2b.R
+++ b/analysis/dummy_definition_2b.R
@@ -17,36 +17,40 @@ sim_list <- lst(
   registered = bn_node(
     ~ rbernoulli(n = ..n, p = 0.99)
   ),
-  
   age = bn_node(
     ~ as.integer(rnorm(..n, mean= 55, sd=20)),
   ),
-  
   sex = bn_node(
     ~ rfactor(n = ..n, levels = c("female", "male", "intersex", "unknown"), p = c(0.50, 0.49, 0, 0.01)),
-    missing_rate = ~0.001 # this is shorthand for ~(rbernoulli(n=..n, p = 0.2))
+    missing_rate = ~0.01 # this is shorthand for ~(rbernoulli(n=..n, p = 0.2))
   ),
-  
   diabetes = bn_node(
     ~ rbernoulli(n = ..n, p = 0.10)
   ),
-
-  vaccine_day1 = bn_node(
+  vaccine_date1 = bn_node(
     ~ runif(n = ..n, 0, 150),
     missing_rate = ~0.2,
   ),
-  vaccine_day2 = bn_node(
-    ~ vaccine_day1 + runif(n = ..n, 3*7, 16*7),
-    missing_rate = ~0.001,
-    needs = "vaccine_day2"
+  vaccine_date2 = bn_node(
+    ~ vaccine_date1 + runif(n = ..n, 3*7, 16*7),
+    missing_rate = ~0.1,
+    needs = "vaccine_date1"
   ),
-  
-  vaccine_product1 = bn_node(~ rcat(n = ..n, c("pfizer", "az"), c(0.5, 0.5)), needs = "vaccine_day1"),
-  vaccine_product2 = bn_node(~ if_else(runif(..n) < 0.95, vaccine_product1, "az"), needs = "vaccine_day2"),
-
+  vaccine_product1 = bn_node(
+    ~ rcat(n = ..n, c(pfizer_name, az_name), c(0.5, 0.5)), 
+    needs = "vaccine_date1"
+  ),
+  vaccine_product2 = bn_node(
+    ~ if_else(runif(..n) < 0.95, vaccine_product1, c(az_name)), 
+    needs = "vaccine_date2"
+  ),
+  death_date = bn_node(
+    ~ runif(n = ..n,  0, 1000),
+    missing_rate = ~0.95
+  )
 )
 
-bn <- bn_create(sim_list)
+bn <- bn_create(sim_list,known_variables = c("pfizer_name","az_name"))
 
 # plot the network
 bn_plot(bn)
@@ -58,9 +62,8 @@ bn_plot(bn, connected_only = TRUE)
 dummy_data <- bn_simulate(bn, pop_size = pop_n, keep_all = FALSE, .id = "patient_id")
 
 dummy_data_days_to_dates <- dummy_data %>%
-  mutate(across(num_range("vaccine_day", 1:2), ~ index_date + .)) %>%
-  rename_with(~ stringr::str_replace(., "_day", "_date"), starts_with("vaccine_day"))
+  mutate(across(contains("_date"), ~ index_date + .)) 
 
 # write to arrow file
-feather::write_feather(dummy_data_days_to_dates, path = here::here("output", "dummy_dataset_2b.arrow"))
+arrow::write_feather(dummy_data_days_to_dates, sink = here::here("output", "dummy_dataset_2b.arrow"))
 
diff --git a/docs_src/dummy_data_in_r.md b/docs_src/dummy_data_in_r.md
index e69de29..68cced4 100644
--- a/docs_src/dummy_data_in_r.md
+++ b/docs_src/dummy_data_in_r.md
@@ -0,0 +1,25 @@
+### Generate a dummy dataset using a custom R script
+
+
+## Why use custom dummy data?
+* Complete control over distribution of variables and dependencies between variables
+* Complete control over dataset size
+* No ad-hoc adjustments here and there - do it all in one place, once
+* ehrQL will check that the custom dataset is "correct" (to an extent)
+
+
+## Challenges
+* Maintaining consistency between custom dummy data specification and ehrQL specification as the dataset definition evolves
+* Need some understanding of data simulation
+
+## Why use dd4d instead of base R?
+* An API that supports common simulation patterns, such as:
+    * Specifying dependent missingness
+    * Flagging latent variables which are needed for simulation but do not appear in the final dataset
+* Supports common simulation functions that are awkward in base R (eg `rcat` and `rbernoulli`)
+* Supports inspection and visualisation of the specified DAG
+* Tests that catch errors early, e.g. testing that the specification isn't cyclic and if so which path is cyclic.
+* Variables do not have to be specified in topological order (unlike in a `mutate` step).
+* You get all this whilst retaining:
+    * Support for arbitrary distributions and dependencies
+    * IDE syntax highlighting / error-checking (unlike if dependencies were defined with strings). 
diff --git a/project.yaml b/project.yaml
index 21a975d..3bb2760 100644
--- a/project.yaml
+++ b/project.yaml
@@ -36,18 +36,25 @@ actions:
       highly_sensitive:
         dataset: output/dummy_dataset_2b.arrow
 
+  dummy_dataset_2c:
+      run: ehrql:v1 generate-dataset analysis/dataset_definition_2.py
+        --output output/dummy_dataset_2c.arrow
+        --dummy-data-file output/dummy_dataset_2b.arrow
+      needs: [dummy_dataset_2b]
+      outputs:
+        highly_sensitive:
+          dataset: output/dummy_dataset_2c.arrow
+
   generate_dataset_2:
     run: ehrql:v1 generate-dataset analysis/dataset_definition_2.py
-     --output output/dataset_2.arrow
-  #   --dummy-data-file output/dummy_dataset_2b.arrow
-  # needs: [dummy_dataset_2b]
+      --output output/dataset_2.arrow
     outputs:
       highly_sensitive:
         dataset: output/dataset_2.arrow
 
-  compare_dataset_2:
-    run: r:latest analysis/compare_definition_2.R
-    needs: [dummy_dataset_2a, dummy_dataset_2b, generate_dataset_2]
-    outputs:
-      highly_sensitive:
-        csv: output/compare_dataset_2.csv
\ No newline at end of file
+#  compare_dataset_2:
+#    run: r:latest analysis/compare_definition_2.R
+#    needs: [dummy_dataset_2a, dummy_dataset_2b, dummy_dataset_2c, generate_dataset_2]
+#    outputs:
+#      highly_sensitive:
+#        csv: output/compare_dataset_2.csv
\ No newline at end of file