Skip to content
91 changes: 62 additions & 29 deletions scripts/seed_nft_tables.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import csv
from datetime import datetime, timedelta, timezone
from itertools import cycle
from math import gcd
from random import shuffle
from typing import Any
from typing import Any, Iterator

import boto3
import fire

# import json
import numpy as np

from nrlf.consumer.fhir.r4.model import DocumentReference
from nrlf.core.constants import (
CATEGORY_ATTRIBUTES,
Expand Down Expand Up @@ -145,7 +149,7 @@ def _populate_seed_table(
px_with_pointers: int,
pointers_per_px: float = 1.0,
type_dists: dict[str, int] = DEFAULT_TYPE_DISTRIBUTIONS,
custodian_dists: dict[str, int] = DEFAULT_CUSTODIAN_DISTRIBUTIONS,
custodian_dists: dict[str, dict[str, int]] = DEFAULT_CUSTODIAN_DISTRIBUTIONS,
):
"""
Seeds a table with example data for non-functional testing.
Expand All @@ -155,25 +159,41 @@ def _populate_seed_table(
# set up iterations
type_iter = _set_up_cyclical_iterator(type_dists)
custodian_iters = _set_up_custodian_iterators(custodian_dists)
count_iter = _set_up_cyclical_iterator(DEFAULT_COUNT_DISTRIBUTIONS)
# count_iter = _set_up_cyclical_iterator(DEFAULT_COUNT_DISTRIBUTIONS)
count_iter = _get_pointer_count_poisson_distributions(
px_with_pointers, pointers_per_px
)
# count_iter = _get_pointer_count_negbinom_distributions(px_with_pointers, pointers_per_px)
testnum_cls = TestNhsNumbersIterator()
testnum_iter = iter(testnum_cls)

px_counter = 0
doc_ref_target = int(pointers_per_px * px_with_pointers)
print(
f"Will upsert {doc_ref_target} test pointers for {px_with_pointers} patients."
f"Will upsert ~{doc_ref_target} test pointers for {px_with_pointers} patients."
)
doc_ref_counter = 0
batch_counter = 0
unprocessed_count = 0

pointer_data: list[list[str]] = []

start_time = datetime.now(tz=timezone.utc)

batch_upsert_items = []
while px_counter <= px_with_pointers:
batch_upsert_items: list[dict[str, Any]] = []
while px_counter < px_with_pointers:
pointers_for_px = int(next(count_iter))

if batch_counter + pointers_for_px > 25 or px_counter == px_with_pointers:
resource.batch_write_item(RequestItems={table_name: batch_upsert_items})
response = resource.batch_write_item(
RequestItems={table_name: batch_upsert_items}
)

if response.get("UnprocessedItems"):
unprocessed_count += len(
response.get("UnprocessedItems").get(table_name, [])
)

batch_upsert_items = []
batch_counter = 0

Expand All @@ -189,55 +209,68 @@ def _populate_seed_table(
)
put_req = {"PutRequest": {"Item": pointer.model_dump()}}
batch_upsert_items.append(put_req)
pointer_data.append(
[
pointer.id,
pointer.type,
pointer.custodian,
pointer.nhs_number,
]
)
px_counter += 1

if px_counter % 1000 == 0:
print(".", end="", flush=True)
if px_counter % 100000 == 0:
print(f" {px_counter} patients processed ({doc_ref_counter} pointers).")

print(" Done.")

end_time = datetime.now(tz=timezone.utc)
print(
f"Created {doc_ref_counter} pointers in {timedelta.total_seconds(end_time - start_time)} seconds."
f"Created {doc_ref_counter} pointers in {timedelta.total_seconds(end_time - start_time)} seconds (unprocessed: {unprocessed_count})."
)

with open("./dist/seed-nft-pointers.csv", "w") as f:
writer = csv.writer(f)
writer.writerow(["pointer_id", "pointer_type", "custodian", "nhs_number"])
writer.writerows(pointer_data)
print(f"Pointer data saved to ./dist/seed-nft-pointers.csv") # noqa


def _set_up_cyclical_iterator(dists: dict[str, int]) -> iter:
def _set_up_cyclical_iterator(dists: dict[str, int]) -> Iterator[str]:
"""
Given a dict of values and their relative frequencies,
returns an iterator that will cycle through a the reduced and shuffled set of values.
This should result in more live-like data than e.g. creating a bulk amount of each pointer type/custodian in series.
It also means each batch will contain a representative sample of the distribution.
"""
d = gcd(*dists.values())
value_list = []
value_list: list[str] = []
for entry in dists:
value_list.extend([entry] * (dists[entry] // d))
shuffle(value_list)
return cycle(value_list)


def _get_pointer_count_poisson_distributions(
num_of_patients: int, pointers_per_px: float
) -> Iterator[int]:
p_count_distr = np.random.poisson(lam=pointers_per_px - 1, size=num_of_patients) + 1
p_count_distr = np.clip(p_count_distr, a_min=1, a_max=4)
return cycle(p_count_distr)


def _set_up_custodian_iterators(
    custodian_dists: dict[str, dict[str, int]]
) -> dict[str, Iterator[str]]:
    """
    Build one cyclical custodian iterator per pointer type.

    :param custodian_dists: mapping of pointer type -> (custodian -> relative
        frequency) used to weight each per-type iterator
    :return: mapping of pointer type -> shuffled, cycling custodian iterator
    """
    custodian_iters: dict[str, Iterator[str]] = {}
    for pointer_type in custodian_dists:
        custodian_iters[pointer_type] = _set_up_cyclical_iterator(
            custodian_dists[pointer_type]
        )
    return custodian_iters


def _set_up_count_iterator(pointers_per_px: float) -> Iterator[str]:
    """
    Given a target average number of pointers per patient,
    generates a distribution of counts per individual patient.

    Returns a cycling iterator over count strings ("1", "2" or "3") whose
    mean over one full cycle of 100 patients is `pointers_per_px`.
    No patient can have zero pointers.

    :param pointers_per_px: target mean pointers per patient (expected >= 1.0)
    :return: endless iterator over per-patient count strings
    """
    # round() rather than int(): int() truncates, and e.g. (1.2 - 1.0) * 100
    # evaluates to 19.999... in floating point, which would silently drop a
    # pointer from every hundred patients.
    extra_per_hundred = round(
        (pointers_per_px - 1.0) * 100
    )  # no patients can have zero pointers
    counts: dict[str, int] = {}
    counts["3"] = extra_per_hundred // 10
    counts["2"] = extra_per_hundred - 2 * counts["3"]
    # BUG FIX: the original used counts[2] / counts[3] (int keys), which
    # raises KeyError because the dict keys are the strings "2" and "3".
    counts["1"] = 100 - counts["2"] - counts["3"]
    return _set_up_cyclical_iterator(counts)


# CLI entry point: python-fire exposes _populate_seed_table's parameters
# as command-line flags.
if __name__ == "__main__":
    fire.Fire(_populate_seed_table)
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Expose the bucket name so consuming configurations can reference the
# metadata bucket created by this module.
output "bucket_name" {
description = "Name of the metadata S3 bucket"
value = aws_s3_bucket.metadata_bucket.bucket
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Metadata S3 bucket, named from the caller-supplied prefix.
# force_destroy = false: Terraform refuses to destroy the bucket while it
# still contains objects.
resource "aws_s3_bucket" "metadata_bucket" {
bucket = "${var.name_prefix}-metadata"
force_destroy = false
}

# Deny all S3 actions on the bucket and its objects for any request made
# over plain HTTP (aws:SecureTransport = false), enforcing TLS-only access.
resource "aws_s3_bucket_policy" "metadata_bucket_policy" {
bucket = aws_s3_bucket.metadata_bucket.id

policy = jsonencode({
Version = "2012-10-17"
Id = "metadata_bucket_policy"
Statement = [
{
Sid = "HTTPSOnly"
Effect = "Deny"
Principal = "*"
Action = "s3:*"
Resource = [
aws_s3_bucket.metadata_bucket.arn,
"${aws_s3_bucket.metadata_bucket.arn}/*",
]
Condition = {
Bool = {
"aws:SecureTransport" = "false"
}
}
},
]
})
}

# Block every form of public access to the metadata bucket (ACLs and
# bucket policies alike).
resource "aws_s3_bucket_public_access_block" "metadata_bucket_public_access_block" {
bucket = aws_s3_bucket.metadata_bucket.id

block_public_acls = true
block_public_policy = true
ignore_public_acls = true
restrict_public_buckets = true
}

# Encrypt all objects at rest with S3-managed keys (SSE-S3 / AES256).
resource "aws_s3_bucket_server_side_encryption_configuration" "metadata_bucket" {
  # Consistency fix: every sibling resource in this module references the
  # bucket via `.id`; the original used `.bucket` here. For aws_s3_bucket
  # both attributes resolve to the bucket name, so behavior is unchanged.
  bucket = aws_s3_bucket.metadata_bucket.id

  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "AES256"
    }
  }
}

# Keep object versions so overwritten or deleted metadata can be recovered.
resource "aws_s3_bucket_versioning" "metadata_bucket" {
bucket = aws_s3_bucket.metadata_bucket.id
versioning_configuration {
status = "Enabled"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Required input: callers pass e.g. "nhsd-nrlf--perftest"; the module
# derives all resource names from it.
variable "name_prefix" {
type = string
description = "The prefix to apply to all resources in the module."
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,6 @@ module "ref-pointers-table" {
}

# Pointers table for the perftest environment; module defaults now cover
# deletion protection, PITR and KMS settings.
module "perftest-pointers-table" {
  source      = "../modules/pointers-table"
  name_prefix = "nhsd-nrlf--perftest"
}
5 changes: 5 additions & 0 deletions terraform/account-wide-infrastructure/test/s3.tf
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,8 @@ module "perftest-truststore-bucket" {
name_prefix = "nhsd-nrlf--perftest"
server_certificate_file = "../../../truststore/server/perftest.pem"
}

# Metadata bucket for the perftest environment (see modules/metadata-bucket).
module "perftest-metadata-bucket" {
source = "../modules/metadata-bucket"
name_prefix = "nhsd-nrlf--perftest"
}
8 changes: 4 additions & 4 deletions terraform/infrastructure/data.tf
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,22 @@ data "aws_iam_policy" "auth-store-read-policy" {

# Shared pointers table and its IAM policies, looked up by name. The prefix
# (local.pointers_table_prefix) switches between the normal and sandbox
# table families. All lookups are skipped unless shared resources are in use.
data "aws_dynamodb_table" "pointers-table" {
  count = var.use_shared_resources ? 1 : 0
  name  = "${local.pointers_table_prefix}-pointers-table"
}

data "aws_iam_policy" "pointers-table-read" {
  count = var.use_shared_resources ? 1 : 0
  name  = "${local.pointers_table_prefix}-pointers-table-read"
}

data "aws_iam_policy" "pointers-table-write" {
  count = var.use_shared_resources ? 1 : 0
  name  = "${local.pointers_table_prefix}-pointers-table-write"
}

data "aws_iam_policy" "pointers-kms-read-write" {
  count = var.use_shared_resources ? 1 : 0
  name  = "${local.pointers_table_prefix}-pointers-kms-read-write"
}

data "external" "current-info" {
Expand Down
8 changes: 6 additions & 2 deletions terraform/infrastructure/etc/dev.tfvars
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
account_name = "dev"
aws_account_name = "dev"

dynamodb_pointers_table_prefix = "nhsd-nrlf--dev"
dynamodb_sandbox_pointers_table_prefix = "nhsd-nrlf--dev-sandbox"

domain = "api.record-locator.dev.national.nhs.uk"
public_domain = "internal-dev.api.service.nhs.uk"
public_sandbox_domain = "internal-dev-sandbox.api.service.nhs.uk"
log_retention_period = 90
enable_reporting = false

log_retention_period = 90
enable_reporting = false
11 changes: 7 additions & 4 deletions terraform/infrastructure/etc/int.tfvars
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
account_name = "int"
aws_account_name = "test"

domain = "api.record-locator.int.national.nhs.uk"
deletion_protection = true
dynamodb_pointers_table_prefix = "nhsd-nrlf--int"
dynamodb_sandbox_pointers_table_prefix = "nhsd-nrlf--int-sandbox"
deletion_protection = true

domain = "api.record-locator.int.national.nhs.uk"
public_domain = "int.api.service.nhs.uk"
public_sandbox_domain = "sandbox.api.service.nhs.uk"
log_retention_period = 90
enable_reporting = true

log_retention_period = 90
enable_reporting = true
8 changes: 5 additions & 3 deletions terraform/infrastructure/etc/perftest.tfvars
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
account_name = "perftest"
aws_account_name = "test"

domain = "perftest.record-locator.national.nhs.uk"
public_domain = "perftest.api.service.nhs.uk"
deletion_protection = true
dynamodb_pointers_table_prefix = "nhsd-nrlf--perftest"

domain = "perftest.record-locator.national.nhs.uk"
public_domain = "perftest.api.service.nhs.uk"

log_retention_period = 30
enable_reporting = false
disable_firehose_lambda_subscriptions = true
9 changes: 6 additions & 3 deletions terraform/infrastructure/etc/prod.tfvars
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
account_name = "prod"
aws_account_name = "prod"

domain = "api.record-locator.national.nhs.uk"
public_domain = "api.service.nhs.uk"
deletion_protection = true
dynamodb_pointers_table_prefix = "nhsd-nrlf--prod"
deletion_protection = true

domain = "api.record-locator.national.nhs.uk"
public_domain = "api.service.nhs.uk"

log_retention_period = 2192
enable_reporting = true
8 changes: 6 additions & 2 deletions terraform/infrastructure/etc/qa.tfvars
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
account_name = "qa"
aws_account_name = "test"

dynamodb_pointers_table_prefix = "nhsd-nrlf--qa"
dynamodb_sandbox_pointers_table_prefix = "nhsd-nrlf--qa-sandbox"

domain = "qa.record-locator.national.nhs.uk"
public_domain = "internal-qa.api.service.nhs.uk"
public_sandbox_domain = "internal-qa-sandbox.api.service.nhs.uk"
log_retention_period = 90
enable_reporting = false

log_retention_period = 90
enable_reporting = false
7 changes: 5 additions & 2 deletions terraform/infrastructure/etc/ref.tfvars
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
account_name = "ref"
aws_account_name = "test"

domain = "api.record-locator.ref.national.nhs.uk"
public_domain = "ref.api.service.nhs.uk"
dynamodb_pointers_table_prefix = "nhsd-nrlf--ref"

domain = "api.record-locator.ref.national.nhs.uk"
public_domain = "ref.api.service.nhs.uk"

log_retention_period = 30
enable_reporting = false
2 changes: 2 additions & 0 deletions terraform/infrastructure/locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ locals {
auth_store_id = var.use_shared_resources ? data.aws_s3_bucket.authorization-store[0].id : module.ephemeral-s3-permission-store[0].bucket_id
auth_store_read_policy_arn = var.use_shared_resources ? data.aws_iam_policy.auth-store-read-policy[0].arn : module.ephemeral-s3-permission-store[0].bucket_read_policy_arn

pointers_table_prefix = local.is_sandbox_env ? "${var.dynamodb_sandbox_pointers_table_prefix}" : "${var.dynamodb_pointers_table_prefix}"

pointers_table_name = var.use_shared_resources ? data.aws_dynamodb_table.pointers-table[0].name : module.ephemeral-pointers-table[0].table_name
pointers_table_read_policy_arn = var.use_shared_resources ? data.aws_iam_policy.pointers-table-read[0].arn : module.ephemeral-pointers-table[0].read_policy_arn
pointers_table_write_policy_arn = var.use_shared_resources ? data.aws_iam_policy.pointers-table-write[0].arn : module.ephemeral-pointers-table[0].write_policy_arn
Expand Down
11 changes: 11 additions & 0 deletions terraform/infrastructure/vars.tf
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,14 @@ variable "disable_firehose_lambda_subscriptions" {
type = bool
default = false
}

# Table-name prefix used by the data lookups in data.tf when
# use_shared_resources is enabled.
variable "dynamodb_pointers_table_prefix" {
type = string
description = "The prefix of the DynamoDB pointers table to use when using shared resources"
}

# Optional: defaults to null because non-sandbox environments have no
# sandbox pointers table.
variable "dynamodb_sandbox_pointers_table_prefix" {
type = string
description = "The prefix of the DynamoDB pointers table to use when using shared resources in a sandbox environment"
default = null
}
Loading