Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions analysis/compare_definition_2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
library(dplyr)
library(ggplot2)

# developed using base R
custom_data_a <- arrow::read_feather(here::here("output", "dummy_dataset_2a.arrow"))

# developed using dd4d
custom_data_b <- arrow::read_feather(here::here("output", "dummy_dataset_2b.arrow"))

# developed using dd4d then passed through ehrQL for checks
custom_data_c <- arrow::read_feather(here::here("output", "dummy_dataset_2c.arrow"))

# developed in ehrQL directly
ehrql_data <- arrow::read_feather(here::here("output", "dataset_2.arrow"))


simplify_product_names <- function(data){

pfizer_name <- "COVID-19 mRNA Vaccine Comirnaty 30micrograms/0.3ml dose conc for susp for inj MDV (Pfizer)"
az_name <- "COVID-19 Vaccine Vaxzevria 0.5ml inj multidose vials (AstraZeneca)"

mutate(
data,
across(starts_with("vaccine_product"),
~{
case_when(
.x == pfizer_name ~ "pfizer",
.x == az_name ~ "az",
.default = .x
)
}
)
)
}

product_xtab <- function(data){
with(
data,
table(vaccine_product1, vaccine_product2, useNA='ifany')
)
}

custom_data_a %>%
simplify_product_names() %>%
product_xtab()

custom_data_b %>%
simplify_product_names() %>%
product_xtab()

custom_data_c %>%
simplify_product_names() %>%
product_xtab()

ehrql_data %>%
simplify_product_names() %>%
with(
table(vax1 = vaccine_product1 %in% c("", NA), vax2 = vaccine_product2 %in% c("", NA))
)


plot_dates <- function(data){

data %>%
simplify_product_names() %>%
ggplot() +
geom_point(aes(x=vaccine_date1, y = vaccine_date2))+
scale_x_date(labels = ~ format(.x, "%Y-%b"))+
scale_y_date(labels = ~ format(.x, "%Y-%b"))+
theme_bw()
}

plot_dates(custom_data_c)
plot_dates(ehrql_data)



plot_death <- function(data){

data %>%
simplify_product_names() %>%
ggplot() +
geom_histogram(aes(x=death_date))+
scale_x_date(labels = ~ format(.x, "%Y-%b"))+
theme_bw()
}

sum(custom_data_c$death_date<as.Date("2021-06-01"), na.rm=TRUE)
sum(ehrql_data$death_date<as.Date("2021-06-01"), na.rm=TRUE)

plot_death(custom_data_c)
plot_death(ehrql_data)
18 changes: 18 additions & 0 deletions analysis/dataset_definition_1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from ehrql import create_dataset
from ehrql.tables.tpp import patients, practice_registrations

dataset = create_dataset()

index_date = "2020-03-31"

has_registration = practice_registrations.for_patient_on(
index_date
).exists_for_patient()

dataset.define_population(has_registration)
dataset.configure_dummy_data(population_size=1000)

dataset.registered = has_registration
dataset.age = patients.age_on(index_date)
dataset.sex = patients.sex

46 changes: 46 additions & 0 deletions analysis/dataset_definition_2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from ehrql import create_dataset
from ehrql.tables.tpp import patients, practice_registrations, vaccinations, ons_deaths

dataset = create_dataset()

index_date = "2020-12-08"

has_registration = practice_registrations.for_patient_on(
index_date
).exists_for_patient()

alive = (
ons_deaths.date.is_on_or_after(index_date) |
ons_deaths.date.is_null()
)

dataset.define_population(
has_registration &
alive
)
dataset.configure_dummy_data(population_size=1000)

dataset.registered = has_registration
dataset.age = patients.age_on(index_date)
dataset.sex = patients.sex

covid_vaccinations = (
vaccinations
.where(vaccinations.target_disease.is_in(["SARS-2 CORONAVIRUS"]))
.sort_by(vaccinations.date)
)

covid_vaccinations1 = covid_vaccinations.first_for_patient()
dataset.vaccine_date1 = covid_vaccinations1.date
dataset.vaccine_product1 = covid_vaccinations1.product_name

covid_vaccinations2 = (
covid_vaccinations.
where(covid_vaccinations.date>dataset.vaccine_date1).
first_for_patient()
)
dataset.vaccine_date2 = covid_vaccinations2.date
dataset.vaccine_product2 = covid_vaccinations2.product_name

dataset.death_date = ons_deaths.date

60 changes: 60 additions & 0 deletions analysis/dummy_definition_1.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@

population_size <- 1000

set.seed(420)

## attempt 1
# dummy_data <- data.frame(
# patient_id = 1:population_size,
# age = rnorm(population_size, mean= 60, sd=20),
# sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)),
# )

## attempt 2 - add registered variable
# dummy_data <- data.frame(
# patient_id = 1:population_size,
# registered = rbinom(population_size, 1, 0.99)==1,
# age = rnorm(population_size, mean= 60, sd=20),
# sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)),
# )


## attempt 3 - ensure registered is output as "T" or "F" so is read by python correctly
# dummy_data <- data.frame(
# patient_id = 1:population_size,
# registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"),
# age = rnorm(population_size, mean= 60, sd=20),
# sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)),
# )

# ## attempt 4 - fix age so it's an integer
# dummy_data <- data.frame(
# patient_id = 1:population_size,
# registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"),
# age = as.integer(rnorm(population_size, mean= 60, sd=20)),
# sex = sample(c("Female", "Male", "Intersex", "Unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)),
# )


## attempt 5 - fix sex so it uses the correct levels
# dummy_data <- data.frame(
# patient_id = 1:population_size,
# registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"),
# age = as.integer(rnorm(population_size, mean= 60, sd=20)),
# sex = sample(c("female", "male", "intersex", "unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01))
# )

## attempt 6 - add a diabetes variable that is not in the ehrql definition
dummy_data <- data.frame(
patient_id = 1:population_size,
registered = ifelse(rbinom(population_size, 1, 0.99), "T", "F"),
age = as.integer(rnorm(population_size, mean= 55, sd=20)),
sex = sample(c("female", "male", "intersex", "unknown"), size = population_size, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)),
diabetes = rbinom(population_size, 1, 0.1)
)

# write to compressed csv file
readr::write_csv(dummy_data, here::here("output", "dummy_dataset_1.csv.gz"))

# write to arrow file
feather::write_feather(dummy_data, here::here("output", "dummy_dataset_1.arrow"))
37 changes: 37 additions & 0 deletions analysis/dummy_definition_2a.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
library(dplyr)

set.seed(314159)

pop_n <- 1000
index_date <- as.Date("2020-12-08")

pfizer_name <- "COVID-19 mRNA Vaccine Comirnaty 30micrograms/0.3ml dose conc for susp for inj MDV (Pfizer)"
az_name <- "COVID-19 Vaccine Vaxzevria 0.5ml inj multidose vials (AstraZeneca)"

dummy_data_nomissing <- tibble(
patient_id = 1:pop_n,
registered = rbinom(pop_n, 1, 0.99)==1,
age = as.integer(rnorm(pop_n, mean= 55, sd=20)),
sex = sample(c("female", "male", "intersex", "unknown"), size = pop_n, replace=TRUE, prob = c(0.50, 0.49, 0, 0.01)),
diabetes = rbinom(pop_n, 1, 0.1)==1,
vaccine_date1 = index_date + runif(pop_n, 0, 150),
vaccine_product1 = sample(c(pfizer_name, az_name), size = pop_n, replace=TRUE, prob = c(0.50, 0.50)),
vaccine_date2 = vaccine_date1 + runif(pop_n, 3*7, 16*7),
vaccine_product2 = if_else(runif(pop_n)<0.95, vaccine_product1, "az"),
death_date = index_date + runif(pop_n, 0, 1000)
)

dummy_data <-
mutate(
dummy_data_nomissing,
sex = if_else(runif(pop_n)<0.01, NA_character_, sex),
vaccine_date1 = if_else(runif(pop_n)<0.2, as.Date(NA), vaccine_date1),
vaccine_product1 = if_else(is.na(vaccine_date1), NA_character_, vaccine_product1),
vaccine_date2 = if_else(runif(pop_n)<0.1 | is.na(vaccine_date1), as.Date(NA), vaccine_date2),
vaccine_product2 = if_else(is.na(vaccine_date2), NA_character_, vaccine_product2),
death_date = if_else(runif(pop_n)<0.95, as.Date(NA), death_date)
)

# write to arrow file
arrow::write_feather(dummy_data, sink = here::here("output", "dummy_dataset_2a.arrow"))

69 changes: 69 additions & 0 deletions analysis/dummy_definition_2b.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
library(dplyr)
library(dd4d)

set.seed(314159)

pop_n <- 1000
index_date <- as.Date("2020-12-08")

pfizer_name <- "COVID-19 mRNA Vaccine Comirnaty 30micrograms/0.3ml dose conc for susp for inj MDV (Pfizer)"
az_name <- "COVID-19 Vaccine Vaxzevria 0.5ml inj multidose vials (AstraZeneca)"


# define each variable as a node in a bayesian network

sim_list <- lst(

registered = bn_node(
~ rbernoulli(n = ..n, p = 0.99)
),
age = bn_node(
~ as.integer(rnorm(..n, mean= 55, sd=20)),
),
sex = bn_node(
~ rfactor(n = ..n, levels = c("female", "male", "intersex", "unknown"), p = c(0.50, 0.49, 0, 0.01)),
missing_rate = ~0.01 # this is shorthand for ~(rbernoulli(n=..n, p = 0.2))
),
diabetes = bn_node(
~ rbernoulli(n = ..n, p = 0.10)
),
vaccine_date1 = bn_node(
~ runif(n = ..n, 0, 150),
missing_rate = ~0.2,
),
vaccine_date2 = bn_node(
~ vaccine_date1 + runif(n = ..n, 3*7, 16*7),
missing_rate = ~0.1,
needs = "vaccine_date1"
),
vaccine_product1 = bn_node(
~ rcat(n = ..n, c(pfizer_name, az_name), c(0.5, 0.5)),
needs = "vaccine_date1"
),
vaccine_product2 = bn_node(
~ if_else(runif(..n) < 0.95, vaccine_product1, c(az_name)),
needs = "vaccine_date2"
),
death_date = bn_node(
~ runif(n = ..n, 0, 1000),
missing_rate = ~0.95
)
)

bn <- bn_create(sim_list,known_variables = c("pfizer_name","az_name"))

# plot the network
bn_plot(bn)

# plot the network (connected nodes only)
bn_plot(bn, connected_only = TRUE)

# simulate the dataset
dummy_data <- bn_simulate(bn, pop_size = pop_n, keep_all = FALSE, .id = "patient_id")

dummy_data_days_to_dates <- dummy_data %>%
mutate(across(contains("_date"), ~ index_date + .))

# write to arrow file
arrow::write_feather(dummy_data_days_to_dates, sink = here::here("output", "dummy_dataset_2b.arrow"))

25 changes: 25 additions & 0 deletions docs_src/dummy_data_in_r.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
### Generate a dummy dataset using a custom R script


## Why use custom dummy data?
* Complete control over distribution of variables and dependencies between variables
* Complete control over dataset size
* No ad-hoc adjustments here and there - do it all in one place, once
* ehrQL will check that the custom dataset is "correct" (to an extent)


## Challenges
* Maintaining consistency between custom dummy data specification and ehrQL specification as the dataset definition evolves
* Need some understanding of data simulation

## Why use dd4d instead of base R?
* An API that supports common simulation patterns, such as:
* Specifying dependent missingness
* Flagging latent variables which are needed for simulation but do not appear in the final dataset
* Supports common simulation functions that are awkward in base R (eg `rcat` and `rbernoulli`)
* Supports inspection and visualisation of the specified DAG
* Tests that catch errors early, e.g. testing that the specification isn't cyclic and if so which path is cyclic.
* Variables do not have to be specified in topological order (unlike in a `mutate` step).
* You get all this whilst retaining:
* Support for arbitrary distributions and dependencies
* IDE syntax highlighting / error-checking (unlike if dependencies were defined with strings).
Loading