From 456c4b01ea47b36800fd4c53c3a7c1e2c931e3df Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 23 Jun 2025 14:27:31 -0700 Subject: [PATCH 1/3] refactor(scripts): Organize tests into Critical User Journeys (CUJs) This commit refactors the project's testing and automation scripts, moving them from a monolithic `bin/` directory into a structured framework centered around Critical User Journeys (CUJs). The goal is to create a more maintainable and scalable way to exercise the use cases that are essential for our customers. This new structure is designed for better organization, code reuse, and CI/CD integration. Key changes include: * **CUJ Directory Structure:** * `gcloud/cuj/`: Contains individual journey scripts. The first CUJ, `standard-cluster-management`, is introduced to handle the creation, deletion, and rebuilding of a standard Dataproc cluster. * `gcloud/onboarding/`: Holds scripts for setting up shared, prerequisite infrastructure (e.g., GCS buckets) that CUJs depend on. * `gcloud/lib/`: A new directory for shared bash libraries. `common.sh` is introduced to centralize functions for configuration loading, network management, and logging. * `gcloud/ci/`: Includes scripts specifically for CI/CD pipelines. * **CI Environment Management:** * A new `pristine_check.sh` script is added to ensure the GCP project is in a clean state before a test run. It can operate in a strict validation mode (fail if resources exist) or a cleanup mode (actively delete resources). * **Configuration and Code Improvements:** * Configuration is centralized and simplified in `env.json`, which is now loaded by the `load_config` function in `common.sh`. * Scripts are refactored to be more robust, with clear `up`, `down`, and `validate` commands, and checks for pre-existing resources. * The `.gitignore` is updated to exclude temporary files and local configuration. This change lays the foundation for adding more complex user journeys, ensuring our core services are consistently tested from a customer perspective. --- gcloud/.gitignore | 5 +- gcloud/ci/pristine_check.sh | 98 +++++++++++++++++++ .../cuj/standard-cluster-management/manage.sh | 80 +++++++++++++++ gcloud/env.json.sample | 27 ++--- gcloud/lib/common.sh | 95 ++++++++++++++++++ gcloud/lib/env.sh | 4 +- gcloud/onboarding/setup_shared_infra.sh | 22 +++++ 7 files changed, 310 insertions(+), 21 deletions(-) create mode 100644 gcloud/ci/pristine_check.sh create mode 100644 gcloud/cuj/standard-cluster-management/manage.sh create mode 100644 gcloud/lib/common.sh create mode 100644 gcloud/onboarding/setup_shared_infra.sh diff --git a/gcloud/.gitignore b/gcloud/.gitignore index d21b057..e4d4c60 100644 --- a/gcloud/.gitignore +++ b/gcloud/.gitignore @@ -1 +1,4 @@ -init/*/ \ No newline at end of file +init/*/ +tls +*~ +env.json diff --git a/gcloud/ci/pristine_check.sh b/gcloud/ci/pristine_check.sh new file mode 100644 index 0000000..5b1ef7c --- /dev/null +++ b/gcloud/ci/pristine_check.sh @@ -0,0 +1,98 @@ +#!/bin/bash +# +# Verifies and enforces a pristine state in the project for CUJ testing. +# This script is designed to be run from a CI/CD pipeline. +# +# It finds all resources associated with the CUJ test network and deletes them. +# +# Usage: +# ./pristine_check.sh # Cleanup mode: Aggressively deletes resources. +# ./pristine_check.sh --strict # Validation mode: Fails if any resources are found. + +set -e +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +source "${SCRIPT_DIR}/../lib/common.sh" +load_config + +STRICT_MODE=false +if [[ "$1" == "--strict" ]]; then + STRICT_MODE=true +fi + +# Use a temporary file to track leftover resources for the final report. +LEFTOVERS_FILE=$(mktemp) +trap 'rm -f -- "${LEFTOVERS_FILE}"' EXIT + +header "Pristine Check running in $([[ "$STRICT_MODE" == true ]] && echo 'STRICT' || echo 'CLEANUP') mode" + +# --- Resource Discovery and Cleanup --- + +# 1. Dataproc Clusters +# Find any clusters on the target network. +CLUSTERS=$(gcloud dataproc clusters list --region="${CONFIG[REGION]}" --filter="config.gceClusterConfig.networkUri.endsWith(\"/${CONFIG[NETWORK]}\")" --format="value(clusterName)" 2>/dev/null) +if [[ -n "${CLUSTERS}" ]]; then + echo "Found leftover Dataproc clusters: ${CLUSTERS}" | tee -a "${LEFTOVERS_FILE}" + if [[ "$STRICT_MODE" == false ]]; then + echo "Cleaning up..." + # Run deletions in the background for speed. + for cluster in ${CLUSTERS}; do + gcloud dataproc clusters delete --quiet "${cluster}" --region="${CONFIG[REGION]}" & + done + fi +fi + +# 2. GCE Instances +# Find any instances on the target network that are NOT part of a managed instance group (like a KDC). +INSTANCES=$(gcloud compute instances list --filter="networkInterfaces.network.endsWith(\"/${CONFIG[NETWORK]}\") AND -name~gke-" --format="value(name)" 2>/dev/null) +if [[ -n "${INSTANCES}" ]]; then + echo "Found leftover GCE instances: ${INSTANCES}" | tee -a "${LEFTOVERS_FILE}" + if [[ "$STRICT_MODE" == false ]]; then + echo "Cleaning up..." + gcloud compute instances delete --quiet ${INSTANCES} & + fi +fi + +# 3. Firewall Rules +# Dataproc auto-creates firewall rules with the network name. We'll find them. +FIREWALL_RULES=$(gcloud compute firewall-rules list --filter="network.endsWith(\"/${CONFIG[NETWORK]}\")" --format="value(name)" 2>/dev/null) +if [[ -n "${FIREWALL_RULES}" ]]; then + echo "Found leftover Firewall Rules: ${FIREWALL_RULES}" | tee -a "${LEFTOVERS_FILE}" + if [[ "$STRICT_MODE" == false ]]; then + echo "Cleaning up..." + gcloud compute firewall-rules delete --quiet ${FIREWALL_RULES} & + fi +fi + +# Wait for all background cleanup jobs to finish before proceeding to network deletion. +if [[ "$STRICT_MODE" == false ]]; then + echo "Waiting for resource cleanup to complete..." + wait + echo "Cleanup complete." +fi + +# 4. VPC Network +# This is the last step, as the network cannot be deleted if resources are using it. +# We will use the function from our library here. +if gcloud compute networks describe "${CONFIG[NETWORK]}" &>/dev/null; then + echo "Found leftover VPC Network: ${CONFIG[NETWORK]}" | tee -a "${LEFTOVERS_FILE}" + if [[ "$STRICT_MODE" == false ]]; then + echo "Cleaning up..." + # The delete_network_and_subnet function is already quiet and handles non-existence. + delete_network_and_subnet + fi +fi + + +# --- Final Report --- +if [[ -s "${LEFTOVERS_FILE}" ]]; then + echo "--------------------------------------------------" >&2 + echo "ERROR: Leftover resources were detected:" >&2 + cat "${LEFTOVERS_FILE}" >&2 + echo "--------------------------------------------------" >&2 + if [[ "$STRICT_MODE" == true ]]; then + echo "STRICT mode failed. The project is not pristine." >&2 + exit 1 + fi +fi + +echo "Pristine check complete." diff --git a/gcloud/cuj/standard-cluster-management/manage.sh b/gcloud/cuj/standard-cluster-management/manage.sh new file mode 100644 index 0000000..61c995f --- /dev/null +++ b/gcloud/cuj/standard-cluster-management/manage.sh @@ -0,0 +1,80 @@ +#!/bin/bash +# CUJ: Standard Dataproc Cluster Management + +function main() { + local SCRIPT_DIR + SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + source "${SCRIPT_DIR}/../../lib/common.sh" + set -e + load_config + + function validate() { + header "Validating prerequisites" + echo "Checking for shared GCS bucket..." + if ! gsutil -q stat "gs://${CONFIG[SHARED_GCS_BUCKET]}/"; then + echo "ERROR: Shared GCS bucket 'gs://${CONFIG[SHARED_GCS_BUCKET]}/' not found." >&2 + echo "Please run the script in 'gcloud/onboarding/' first." >&2 + exit 1 + fi + echo "Prerequisites met." + } + + function create_cluster() { + # ---FIX--- + # Add defensive check for required configuration. + if [[ -z "${CONFIG[CLUSTER_NAME]}" || -z "${CONFIG[REGION]}" || -z "${CONFIG[SUBNET]}" || -z "${CONFIG[CUJ_TAG]}" ]]; then + echo "ERROR: One or more required keys (CLUSTER_NAME, REGION, SUBNET, CUJ_TAG) are missing from env.json" >&2 + exit 1 + fi + # ---END FIX--- + + echo "Creating Dataproc cluster '${CONFIG[CLUSTER_NAME]}'..." + set -x + gcloud dataproc clusters create "${CONFIG[CLUSTER_NAME]}" \ + --region="${CONFIG[REGION]}" \ + --subnet="${CONFIG[SUBNET]}" \ + --tags="${CONFIG[CUJ_TAG]}" \ + --format json + set +x + } + + function delete_cluster() { + if [[ -z "${CONFIG[CLUSTER_NAME]}" || -z "${CONFIG[REGION]}" ]]; then + echo "ERROR: One or more required keys (CLUSTER_NAME, REGION) are missing from env.json" >&2 + exit 1 + fi + echo "Deleting Dataproc cluster '${CONFIG[CLUSTER_NAME]}'..." + if gcloud dataproc clusters describe "${CONFIG[CLUSTER_NAME]}" --region="${CONFIG[REGION]}" &>/dev/null; then + gcloud dataproc clusters delete --quiet "${CONFIG[CLUSTER_NAME]}" --region="${CONFIG[REGION]}" + else + echo "Cluster '${CONFIG[CLUSTER_NAME]}' not found, skipping delete." + fi + } + + case "$1" in + validate) + validate + ;; + up) # Creates the full managed stack for this CUJ + validate + create_network_and_subnet + create_cluster + ;; + down) # Deletes the full managed stack for this CUJ + delete_cluster + delete_network_and_subnet + ;; + cluster-rebuild) # Cycles the cluster, leaves network + (delete_cluster) || true + create_cluster + ;; + *) + echo "Usage: $0 {validate|up|down|cluster-rebuild}" + exit 1 + ;; + esac +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/gcloud/env.json.sample b/gcloud/env.json.sample index b6602b4..03385e7 100644 --- a/gcloud/env.json.sample +++ b/gcloud/env.json.sample @@ -1,21 +1,10 @@ { - "PROJECT_ID":"ldap-example-yyyy-nn", - "ORG_NUMBER":"100000000001", - "USER": "project-owner", - "DOMAIN": "your-domain-goes-here.com", - "PRIV_USER": "privileged", - "PRIV_DOMAIN": "privileged-domain-here.com", - "BILLING_ACCOUNT": "100000-000000-000001", - "FOLDER_NUMBER": "100000000001", - "REGION": "us-west4", - "RANGE": "10.00.01.0/24", - "IDLE_TIMEOUT": "30m", - "ASN_NUMBER": "65531", - "IMAGE_VERSION": "2.2, - "BUCKET": "myproject-dataproc-repro-bucket", - "TEMP_BUCKET": "myproject-dataproc-repro-temp-bucket", - "CLUSTER_NAME": "cluster-name-here", - "BIGTABLE_INSTANCE": "my-bigtable" - "BIGTABLE_DISPLAY_NAME": "my-bigtable-cluster", - "BIGTABLE_CLUSTER_CONFIG": "null" + "PROJECT_ID": "your-gcp-project-id", + "REGION": "us-central1", + "NETWORK": "cuj-network", + "SUBNET": "cuj-subnet", + "SUBNET_CIDR": "10.1.2.0/24", + "CLUSTER_NAME": "cuj-standard-cluster", + "SHARED_GCS_BUCKET": "your-cuj-shared-bucket", + "CUJ_TAG": "cuj-test-run" } diff --git a/gcloud/lib/common.sh b/gcloud/lib/common.sh new file mode 100644 index 0000000..4ab06b3 --- /dev/null +++ b/gcloud/lib/common.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This library contains common functions for use in CUJ scripts. + +# A global associative array to hold configuration values. +declare -A CONFIG + +# Loads configuration from env.json into the CONFIG array. +function load_config() { + # This assumes env.json is in the gcloud/ directory, two levels above the cuj/*/ directory + local env_file + env_file="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)/../env.json" + + if [[ ! -f "${env_file}" ]]; then + echo "ERROR: env.json not found at ${env_file}" >&2 + exit 1 + fi + + # Read all keys and values from JSON into the CONFIG array + while IFS='=' read -r key value; do + CONFIG["$key"]="$value" + done < <(jq -r 'to_entries|map("\(.key)=\(.value|tostring)")|.[]' < "${env_file}") + + # Set the project for all subsequent gcloud commands + gcloud config set project "${CONFIG[PROJECT_ID]}" +} + +# Prints a formatted header message. +function header() { + echo "========================================================================" + echo " $1" + echo "========================================================================" +} + +# Prompts the user for confirmation before proceeding. +function confirm() { + # When running in an automated test, skip the confirmation. + if [[ -z "${CI_TEST}" ]]; then + read -p "$1 (y/N): " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo "Operation cancelled." + exit 1 + fi + fi +} + +# Creates the VPC network and a subnet within it. +# (Inside common.sh) +function create_network_and_subnet() { + # ... (logic to check if network exists) + gcloud compute networks create "${CONFIG[NETWORK]}" \ + --subnet-mode=custom \ + --description="Network for CUJ testing" \ + --bgp-routing-mode="regional" + + # ... (logic to check if subnet exists) + gcloud compute networks subnets create "${CONFIG[SUBNET]}" \ + --network="${CONFIG[NETWORK]}" \ + --range="${CONFIG[SUBNET_CIDR]}" \ + --region="${CONFIG[REGION]}" + + # Add firewall rule with the tag to allow SSH + gcloud compute firewall-rules create "${CONFIG[CUJ_TAG]}-allow-ssh" \ + --network="${CONFIG[NETWORK]}" \ + --allow=tcp:22 \ + --source-ranges="0.0.0.0/0" \ + --description="Allow SSH for CUJ test" \ + --target-tags="${CONFIG[CUJ_TAG]}" +} + +# Deletes the VPC network. Subnets are deleted automatically with the network. +function delete_network_and_subnet() { + header "Deleting VPC Network: ${CONFIG[NETWORK]}" + if gcloud compute networks describe "${CONFIG[NETWORK]}" &>/dev/null; then + gcloud compute networks delete --quiet "${CONFIG[NETWORK]}" + echo "Network ${CONFIG[NETWORK]} and its subnets have been deleted." + else + echo "Network ${CONFIG[NETWORK]} not found." + fi +} diff --git a/gcloud/lib/env.sh b/gcloud/lib/env.sh index bea45b5..a79650c 100644 --- a/gcloud/lib/env.sh +++ b/gcloud/lib/env.sh @@ -27,11 +27,13 @@ fi export BILLING_ACCOUNT="$(jq -r .BILLING_ACCOUNT env.json)" export CLUSTER_NAME="$(jq -r .CLUSTER_NAME env.json)" export BUCKET="$(jq -r .BUCKET env.json)" +export SHARED_GCS_BUCKET="${BUCKET:-your-cuj-shared-bucket}" export RANGE="$(jq -r .RANGE env.json)" export IDLE_TIMEOUT="$(jq -r .IDLE_TIMEOUT env.json)" export ASN_NUMBER="$(jq -r .ASN_NUMBER env.json)" export IMAGE_VERSION="$(jq -r .IMAGE_VERSION env.json)" export REGION="$(jq -r .REGION env.json)" +export NETWORK="$(jq -r .NETWORK env.json)" export ZONE="${REGION}-b" #export ZONE="${REGION}-b" @@ -67,7 +69,7 @@ export PROPERTIES="${SPARK_PROPERTIES},${DOCKER_PROPERTIES}" export CONNECTIVITY_TEST="ct-${CLUSTER_NAME}" export ALLOCATION_NAME="allocation-${CLUSTER_NAME}-${REGION}" -export NETWORK="net-${CLUSTER_NAME}" +export NETWORK="${NETWORK:-net-${CLUSTER_NAME}}" export NETWORK_URI_PARTIAL="projects/${PROJECT_ID}/global/networks/${NETWORK}" export NETWORK_URI="https://www.googleapis.com/compute/v1/${NETWORK_URI_PARTIAL}" export SUBNET="subnet-${CLUSTER_NAME}" diff --git a/gcloud/onboarding/setup_shared_infra.sh b/gcloud/onboarding/setup_shared_infra.sh new file mode 100644 index 0000000..9104c01 --- /dev/null +++ b/gcloud/onboarding/setup_shared_infra.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# +# Creates the shared, "unmanaged" infrastructure that CUJs depend on. +# This script is idempotent and can be re-run safely. + +source ../lib/common.sh +load_config + +function main() { + header "Onboarding: Setting up Shared Infrastructure" + + echo "Creating shared GCS bucket: ${CONFIG[SHARED_GCS_BUCKET]}" + # Use gsutil's 'mb' command with -p to create if it doesn't exist. + gsutil -q mb -p "${CONFIG[PROJECT_ID]}" "gs://${CONFIG[SHARED_GCS_BUCKET]}" || echo "Bucket already exists." + + # Add other shared resource creation here in the future, + # for example, a shared KDC virtual machine. + + echo "Onboarding of shared infrastructure complete." +} + +main From 1d2ffb5583fadc84f4641559c967749f745a88b8 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 23 Jun 2025 14:27:31 -0700 Subject: [PATCH 2/3] refactor(scripts): Organize tests into Critical User Journeys (CUJs) This commit refactors the project's testing and automation scripts, moving them from a monolithic `bin/` directory into a structured framework centered around Critical User Journeys (CUJs). The goal is to create a more maintainable and scalable way to exercise the use cases that are essential for our customers. This new structure is designed for better organization, code reuse, and CI/CD integration. Key changes include: * **CUJ Directory Structure:** * `gcloud/cuj/`: Contains individual journey scripts. The first CUJ, `gce/standard`, is introduced to handle the creation, deletion, and rebuilding of a standard Dataproc cluster. * `gcloud/onboarding/`: Holds scripts for setting up shared, prerequisite infrastructure (e.g., GCS buckets) that CUJs depend on. * `gcloud/lib/`: A new directory for shared bash libraries. `common.sh` is introduced to centralize functions for configuration loading, network management, and logging. * `gcloud/ci/`: Includes scripts specifically for CI/CD pipelines. * **CI Environment Management:** * A new `pristine_check.sh` script is added to ensure the GCP project is in a clean state before a test run. It can operate in a strict validation mode (fail if resources exist) or a cleanup mode (actively delete resources). * **Configuration and Code Improvements:** * Configuration is centralized and simplified in `env.json`, which is now loaded by the `load_config` function in `common.sh`. * Scripts are refactored to be more robust, with clear `up`, `down`, and `validate` commands, and checks for pre-existing resources. * The `.gitignore` is updated to exclude temporary files and local configuration. This change lays the foundation for adding more complex user journeys, ensuring our core services are consistently tested from a customer perspective. --- gcloud/cuj/gce/standard/manage.sh | 80 +++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 gcloud/cuj/gce/standard/manage.sh diff --git a/gcloud/cuj/gce/standard/manage.sh b/gcloud/cuj/gce/standard/manage.sh new file mode 100644 index 0000000..61c995f --- /dev/null +++ b/gcloud/cuj/gce/standard/manage.sh @@ -0,0 +1,80 @@ +#!/bin/bash +# CUJ: Standard Dataproc Cluster Management + +function main() { + local SCRIPT_DIR + SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + source "${SCRIPT_DIR}/../../lib/common.sh" + set -e + load_config + + function validate() { + header "Validating prerequisites" + echo "Checking for shared GCS bucket..." + if ! gsutil -q stat "gs://${CONFIG[SHARED_GCS_BUCKET]}/"; then + echo "ERROR: Shared GCS bucket 'gs://${CONFIG[SHARED_GCS_BUCKET]}/' not found." >&2 + echo "Please run the script in 'gcloud/onboarding/' first." >&2 + exit 1 + fi + echo "Prerequisites met." + } + + function create_cluster() { + # ---FIX--- + # Add defensive check for required configuration. + if [[ -z "${CONFIG[CLUSTER_NAME]}" || -z "${CONFIG[REGION]}" || -z "${CONFIG[SUBNET]}" || -z "${CONFIG[CUJ_TAG]}" ]]; then + echo "ERROR: One or more required keys (CLUSTER_NAME, REGION, SUBNET, CUJ_TAG) are missing from env.json" >&2 + exit 1 + fi + # ---END FIX--- + + echo "Creating Dataproc cluster '${CONFIG[CLUSTER_NAME]}'..." + set -x + gcloud dataproc clusters create "${CONFIG[CLUSTER_NAME]}" \ + --region="${CONFIG[REGION]}" \ + --subnet="${CONFIG[SUBNET]}" \ + --tags="${CONFIG[CUJ_TAG]}" \ + --format json + set +x + } + + function delete_cluster() { + if [[ -z "${CONFIG[CLUSTER_NAME]}" || -z "${CONFIG[REGION]}" ]]; then + echo "ERROR: One or more required keys (CLUSTER_NAME, REGION) are missing from env.json" >&2 + exit 1 + fi + echo "Deleting Dataproc cluster '${CONFIG[CLUSTER_NAME]}'..." + if gcloud dataproc clusters describe "${CONFIG[CLUSTER_NAME]}" --region="${CONFIG[REGION]}" &>/dev/null; then + gcloud dataproc clusters delete --quiet "${CONFIG[CLUSTER_NAME]}" --region="${CONFIG[REGION]}" + else + echo "Cluster '${CONFIG[CLUSTER_NAME]}' not found, skipping delete." + fi + } + + case "$1" in + validate) + validate + ;; + up) # Creates the full managed stack for this CUJ + validate + create_network_and_subnet + create_cluster + ;; + down) # Deletes the full managed stack for this CUJ + delete_cluster + delete_network_and_subnet + ;; + cluster-rebuild) # Cycles the cluster, leaves network + (delete_cluster) || true + create_cluster + ;; + *) + echo "Usage: $0 {validate|up|down|cluster-rebuild}" + exit 1 + ;; + esac +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi From 9475017a0b90a2f6aa0580e113f7b319306fbaef Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 23 Jun 2025 20:59:57 -0700 Subject: [PATCH 3/3] feat(gcloud): Enhance CUJ framework and add advanced use cases This commit builds upon the foundational CUJ framework by ingesting battle-tested logic from numerous sources and implementing the initial set of comprehensive, production-like Critical User Journeys. The framework is now enhanced with a powerful, modular library and the first advanced CUJs, making it a robust tool for end-to-end testing. Key Enhancements: * **Modular Library (`lib/`)**: The monolithic `common.sh` is refactored into a modular library with components organized by function (`_core.sh`, `_network.sh`, `_dataproc.sh`, `_database.sh`, `_security.sh`). This incorporates advanced, parameterized, and idempotent functions for managing a wide range of GCP resources. * **Advanced Onboarding (`onboarding/`)**: New scripts are added to provision persistent, shared infrastructure, including a High-Availability Cloud SQL instance with VPC Peering and a dual-NIC Squid Proxy VM, following GCP best practices. * **New Critical User Journeys (`cuj/`)**: * `gce/standard`: This CUJ is enhanced to provision a full, NAT-based network environment. * `gce/proxy-egress`: A new CUJ is added to test Dataproc clusters that use a proxy for all outbound traffic. * `gke/standard`: A new CUJ is added for the standard Dataproc on GKE use case. * **Enhanced CI/CD (`ci/`)**: `pristine_check.sh` is upgraded to use a robust, tag-based cleanup strategy, making it scalable to any number of CUJs without modification. * **Finalized Configuration (`env.json`)**: The `env.json.sample` file is finalized with a simplified structure that defines the shared test environment and a `cuj_set` for test orchestration, abstracting implementation details from the user. * **Comprehensive Documentation (`README.md`)**: The README is updated to be a complete guide for the new framework, explaining its philosophy and providing a clear "Getting Started" workflow for new users. --- gcloud/README.md | 255 +++++------------- gcloud/ci/pristine_check.sh | 113 ++++---- gcloud/cuj/gce/cloud-nat/manage.sh | 85 ++++++ gcloud/cuj/gce/proxy-egress/manage.sh | 122 +++++++++ gcloud/cuj/gce/secure-web-proxy/README.md | 60 +++++ gcloud/cuj/gce/secure-web-proxy/manage.sh | 92 +++++++ gcloud/cuj/gce/standard/manage.sh | 80 ------ gcloud/cuj/gke/cloud-nat/manage.sh | 94 +++++++ gcloud/env.json.sample | 36 ++- gcloud/lib/_core.sh | 73 +++++ gcloud/lib/_database.sh | 150 +++++++++++ gcloud/lib/_dataproc.sh | 138 ++++++++++ gcloud/lib/_network.sh | 185 +++++++++++++ gcloud/lib/_security.sh | 108 ++++++++ gcloud/lib/common.sh | 92 +------ gcloud/onboarding/create_cloudsql_instance.sh | 57 ++++ gcloud/onboarding/create_squid_proxy.sh | 73 +++++ gcloud/onboarding/create_swp_instance.sh | 84 ++++++ gcloud/onboarding/delete_cloudsql_instance.sh | 36 +++ gcloud/onboarding/delete_squid_proxy.sh | 50 ++++ gcloud/onboarding/delete_swp_instance.sh | 59 ++++ gcloud/onboarding/install_squid.sh | 85 ++++++ 22 files changed, 1724 insertions(+), 403 deletions(-) create mode 100644 gcloud/cuj/gce/cloud-nat/manage.sh create mode 100644 gcloud/cuj/gce/proxy-egress/manage.sh create mode 100644 gcloud/cuj/gce/secure-web-proxy/README.md create mode 100644 gcloud/cuj/gce/secure-web-proxy/manage.sh delete mode 100644 gcloud/cuj/gce/standard/manage.sh create mode 100644 gcloud/cuj/gke/cloud-nat/manage.sh create mode 100644 gcloud/lib/_core.sh create mode 100644 gcloud/lib/_database.sh create mode 100644 gcloud/lib/_dataproc.sh create mode 100644 gcloud/lib/_network.sh create mode 100644 gcloud/lib/_security.sh create mode 100644 gcloud/onboarding/create_cloudsql_instance.sh create mode 100644 gcloud/onboarding/create_squid_proxy.sh create mode 100644 gcloud/onboarding/create_swp_instance.sh create mode 100644 gcloud/onboarding/delete_cloudsql_instance.sh create mode 100644 gcloud/onboarding/delete_squid_proxy.sh create mode 100644 gcloud/onboarding/delete_swp_instance.sh create mode 100644 gcloud/onboarding/install_squid.sh diff --git a/gcloud/README.md b/gcloud/README.md index e967aa2..6c6a69d 100644 --- a/gcloud/README.md +++ b/gcloud/README.md @@ -16,200 +16,85 @@ limitations under the License. --> -## Introduction +# Dataproc Critical User Journey (CUJ) Framework -This README file describes how to use this collection of gcloud bash examples to -reproduce common Dataproc cluster creation problems relating to the GCE startup -script, Dataproc startup script, and Dataproc initialization-actions scripts. +This directory contains a collection of scripts that form a test framework for exercising Critical User Journeys (CUJs) on Google Cloud Dataproc. The goal of this framework is to provide a robust, maintainable, and automated way to reproduce and validate the common and complex use cases that are essential for our customers. -## Clone the git repository +This framework replaces the previous monolithic scripts with a modular, scalable, and self-documenting structure designed for both interactive use and CI/CD automation. -``` -$ git clone git@github.com:GoogleCloudDataproc/cloud-dataproc -$ cd cloud-dataproc/gcloud -$ cp env.json.sample env.json -$ vi env.json -``` +## Framework Overview -## Environment configuration +The framework is organized into several key directories, each with a distinct purpose: -First, copy `env.json.sample` to `env.json` and modify the environment -variable names and their values in `env.json` to match your -environment: +* **`onboarding/`**: Contains idempotent scripts to set up persistent, shared infrastructure that multiple CUJs might depend on. These are typically run once per project. Examples include setting up a shared Cloud SQL instance or a Squid proxy VM. -``` -{ - "PROJECT_ID":"ldap-example-yyyy-nn", - "ORG_NUMBER":"100000000001", - "DOMAIN": "your-domain-goes-here.com", - "BILLING_ACCOUNT":"100000-000000-000001", - "FOLDER_NUMBER":"100000000001", - "REGION":"us-west4", - "RANGE":"10.00.01.0/24", - "IDLE_TIMEOUT":"30m", - "ASN_NUMBER":"65531", - "IMAGE_VERSION":"2.2, - "BIGTABLE_INSTANCE":"my-bigtable" -} +* **`cuj/`**: The heart of the framework. This directory contains the individual, self-contained CUJs, grouped by the Dataproc platform (`gce`, `gke`, `s8s`). Each CUJ represents a specific, testable customer scenario. + +* **`lib/`**: A collection of modular bash script libraries (`_core.sh`, `_network.sh`, `_database.sh`, etc.). These files contain all the powerful, reusable functions for creating and managing GCP resources, forming a shared API for all `onboarding` and `cuj` scripts. + +* **`ci/`**: Includes scripts specifically for CI/CD automation. The `pristine_check.sh` script is designed to enforce a clean project state before and after test runs, preventing bitrot and ensuring reproducibility. + +## Getting Started + +Follow these steps to configure your environment and run your first CUJ. + +### 1. Prerequisites + +Ensure you have the following tools installed and configured: +* `gcloud` CLI (authenticated to your Google account) +* `jq` +* A Google Cloud project with billing enabled. + +### 2. Configure Your Environment + +Copy the sample configuration file and edit it to match your environment. + +```bash +cp gcloud/env.json.sample gcloud/env.json +vi gcloud/env.json ``` -The values that you enter here will be used to build reasonable defaults in -`lib/env.sh` ; you can view and modify `lib/env.sh` to more finely tune your -environment. The code in lib/env.sh is sourced and executed at the head of many -scripts in this suite to ensure that the environment is tuned for use with this -reproduction. - -#### Dataproc on GCE - -To tune the reproduction environment for your (customer's) GCE use case, review -the `create_dpgce_cluster` function in the `lib/shared-functions.sh` file. This -is where you can select which arguments are passed to the `gcloud dataproc -clusters create ${CLUSTER_NAME}` command. There exist many examples in the -comments of common use cases below the call to gcloud itself. - -## creation phase - -When reviewing `lib/shared-functions.sh`, pay attention to the -`--metadata startup-script="..."` and `--initialization-actions -"${INIT_ACTIONS_ROOT}/"` arguments. These can be used to -execute arbitrary code during the creation of Dataproc clusters. Many -Google Cloud Support cases relate to failures during either a) -Dataproc's internal startup script, which runs after the `--metadata -startup-script="..."`, or b) scripts passed using the -`--initialization-actions` cluster creation argument. - -## creating the environment and cluster - -Once you have altered `env.json` and have reviewed the function names in -`lib/shared-functions.sh`, you can create your cluster environment and launch -your cluster by running `bin/create-dpgce`. Although the function should be -idempotent, users should not plan to run this more than once for a single -reproduction, as it may configure the environment in a way which renders the -environment non-functional. - -Running the `bin/create-dpgce` script will create the staging bucket, enable the -required services, create a dedicated VPC network, router, NAT, subnet, firewall -rules, and finally, the cluster itself. - -By default, your cluster will time out and be destroyed after 30 minutes of -inactivity. Activity is defined by receipt of a job using the `gcloud dataproc -jobs submit` command. You can change this default of 30 minutes by altering the -value of IDLE_TIMEOUT in `env.json`. This saves your project and your org -operating costs on reproduction clusters which are not being used to actively -reproduce problems. It also gives you a half of an hour to do your work before -worrying that your cluster will be brought down. - -## recreating the cluster - -If your cluster has been destroyed either by timeout or manually calling -`gcloud dataproc clusters delete` you can re-create it by running -`bin/recreate-dpgce`. This script does not re-create any of the resources the -cluster depends on such as network, router, staging bucket, etc. It only -deletes and re-creates the cluster that's already been defined in `env.json` and -previously provisioned using `bin/create-dpgce` - -## deleting the environment and cluster - -If you need to delete the entire environment, you can run `bin/destroy-dpgce` ; -this will delete the cluster, remove the firewall rules, subnet, NAT, router, -VPC network, and staging bucket. To re-create a deleted environment, you may -run `bin/create-dpgce` after `bin/destroy-dpgce` completes successfully. - -### Metadata store - -All startup-scripts run on GCE instances, including Dataproc GCE cluster nodes, -may make use of the `/usr/share/google/get_metadata_value` script to look up -information in the metadata store. The information available in the metadata -server includes some of the arguments passed when creating the cluster using the -`--metadata` argument. - -For instance, if you were to call `gcloud dataproc clusters create -${CLUSTER_NAME}` with the argument `--metadata -init-actions-repo=${INIT_ACTIONS_ROOT}`, then you can find this value by running -`/usr/share/google/get_metadata_value "attributes/init-actions-repo"`. By -default, there are some attributes which are set for dataproc. Some important -ones follow: - -* attributes/dataproc-role -- value: `Master` for master nodes -- value: `Worker` for primary and secondary worker nodes -* attributes/dataproc-cluster-name -* attributes/dataproc-bucket -* attributes/dataproc-cluster-uuid -* attributes/dataproc-region -* hostname (FQDN) -* name (short hostname) -* machine-type - -### GCE Startup script - -Before reading this section, please become familiar with the documentation in -the GCE library for the -[startup-script](https://cloud.google.com/compute/docs/instances/startup-scripts/linux) -metadata argument - -The content of the startup-script, if passed as a string, is stored as -`attributes/startup-script` in the metadata store. If passed as a url, the url -can be found as `attributes/startup-script-url`. - -The GCE startup script runs prior to the Dataproc Agent. This script can be -used to make small modifications to the environment prior to starting Dataproc -services on the host. - -### Dataproc Startup script - -The Dataproc agent is responsible for launching the [Dataproc startup -script](https://cs/piper///depot/google3/cloud/hadoop/services/images/startup-script.sh) -and the [initialization -actions](https://github.com/GoogleCloudDataproc/initialization-actions) in order -of specification. - -The Dataproc startup script runs before the initialization actions, and logs its -output to `/var/log/dataproc-startup-script.log`. It is linked to by -`/usr/local/share/google/dataproc/startup-script.sh` on all dataproc nodes. The -tasks which the startup script run are influenced by the following arguments. -This is not an exhaustive list. If you are troubleshooting startup errors, -determine whether any arguments or properties are being supplied to the -`clusters create` command, especially any similar to the following. +You only need to edit the universal and onboarding settings. The `load_config` function in the library will dynamically generate a `PROJECT_ID` if the default value is present. +### 3. Run Onboarding Scripts + +Before running any CUJs, you must set up the shared infrastructure for your project. These scripts are idempotent and can be run multiple times safely. + +```bash +# Set up the shared Cloud SQL instance with VPC Peering +bash gcloud/onboarding/create_cloudsql_instance.sh + +# Set up the shared Squid Proxy VM and its networking +bash gcloud/onboarding/create_squid_proxy.sh ``` -* `--optional-components` -* `--enable-component-gateway` -* `--properties 'dataproc:conda.*=...'` -* `--properties 'dataproc:pip.*=...'` -* `--properties 'dataproc:kerberos.*=...'` -* `--properties 'dataproc:ranger.*=...'` -* `--properties 'dataproc:druid.*=...'` -* `--properties 'dataproc:kafka.*=...'` -* `--properties 'dataproc:yarn.docker.*=...'` -* `--properties 'dataproc:solr.*=...'` -* `--properties 'dataproc:jupyter.*=...'` -* `--properties 'dataproc:zeppelin.*=...'` + +### 4. Run a Critical User Journey + +Navigate to the directory of the CUJ you want to run and use its `manage.sh` script. + +**Example: Running the standard GCE cluster CUJ** + +```bash +# Navigate to the CUJ directory +cd gcloud/cuj/gce/standard/ + +# Create all resources for this CUJ +./manage.sh up + +# When finished, tear down all resources for this CUJ +./manage.sh down ``` -On Dataproc images prior to 2.3, the Startup script is responsible for -configuring the optional components which the customer has selected in the way -that the customer has specified with properties. Errors indicating -dataproc-startup-script.log often have to do with configuration of optional -components and their services. - -### Dataproc Initialization Actions scripts - -Documentation for the -[initialization-actions](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/init-actions) -argument to the `gcloud dataproc clusters create` command can be found in the -Dataproc library. You may also want to review the -[README.md](https://github.com/GoogleCloudDataproc/initialization-actions/blob/master/README.md) -from the public initialization-actions repo on GitHub. - -Do note that you can specify multiple initialization actions scripts. They will -be executed in the order of specification. The initialization-actions scripts -are stored to -`/etc/google-dataproc/startup-scripts/dataproc-initialization-script-${INDEX}` -on the filesystem of each cluster node, where ${INDEX} is the script number, -starting with 0, and incrementing for each additional script. The URL of the -script can be found by querying the metadata server for -`attributes/dataproc-initialization-action-script-${INDEX}`. From within the -script itself, you can refer to `attributes/$0`. - -Logs for each initialization action script are created under /var/log +Each `manage.sh` script supports several commands: +* **`up`**: Creates all resources for the CUJ. +* **`down`**: Deletes all resources created by this CUJ. +* **`rebuild`**: Runs `down` and then `up` for a full cycle. +* **`validate`**: Checks for prerequisites, such as required APIs or shared infrastructure. + +## Available CUJs + +This framework includes the following initial CUJs: + +* **`gce/standard`**: Creates a standard Dataproc on GCE cluster in a dedicated VPC with a Cloud NAT gateway for secure internet egress. +* **`gce/proxy-egress`**: Creates a Dataproc on GCE cluster in a private network configured to use the shared Squid proxy for all outbound internet traffic. +* **`gke/standard`**: Creates a standard Dataproc on GKE virtual cluster on a new GKE cluster. diff --git a/gcloud/ci/pristine_check.sh b/gcloud/ci/pristine_check.sh index 5b1ef7c..b2ca1a8 100644 --- a/gcloud/ci/pristine_check.sh +++ b/gcloud/ci/pristine_check.sh @@ -1,9 +1,10 @@ #!/bin/bash # -# Verifies and enforces a pristine state in the project for CUJ testing. -# This script is designed to be run from a CI/CD pipeline. +# Verifies and enforces a pristine state in the project for CUJ testing +# by finding and deleting all resources tagged with the CUJ_TAG. # -# It finds all resources associated with the CUJ test network and deletes them. +# This script is designed to be run from a CI/CD pipeline at the beginning +# (in cleanup mode) and at the end (in strict mode) of a test run. # # Usage: # ./pristine_check.sh # Cleanup mode: Aggressively deletes resources. @@ -19,69 +20,74 @@ if [[ "$1" == "--strict" ]]; then STRICT_MODE=true fi -# Use a temporary file to track leftover resources for the final report. +# Store leftover resources to report at the end LEFTOVERS_FILE=$(mktemp) trap 'rm -f -- "${LEFTOVERS_FILE}"' EXIT -header "Pristine Check running in $([[ "$STRICT_MODE" == true ]] && echo 'STRICT' || echo 'CLEANUP') mode" +# --- Helper Functions --- -# --- Resource Discovery and Cleanup --- - -# 1. Dataproc Clusters -# Find any clusters on the target network. -CLUSTERS=$(gcloud dataproc clusters list --region="${CONFIG[REGION]}" --filter="config.gceClusterConfig.networkUri.endsWith(\"/${CONFIG[NETWORK]}\")" --format="value(clusterName)" 2>/dev/null) -if [[ -n "${CLUSTERS}" ]]; then - echo "Found leftover Dataproc clusters: ${CLUSTERS}" | tee -a "${LEFTOVERS_FILE}" - if [[ "$STRICT_MODE" == false ]]; then - echo "Cleaning up..." - # Run deletions in the background for speed. - for cluster in ${CLUSTERS}; do - gcloud dataproc clusters delete --quiet "${cluster}" --region="${CONFIG[REGION]}" & - done - fi -fi +# Generic function to find, report, and optionally delete tagged resources. +# Arguments: +# $1: The type of resource (for logging purposes, e.g., "Dataproc Clusters") +# $2: The gcloud command to list resources (e.g., "gcloud dataproc clusters list ...") +# $3: The gcloud command to delete resources (e.g., "gcloud dataproc clusters delete ...") +function process_resources() { + local resource_type="$1" + local list_command="$2" + local delete_command="$3" -# 2. GCE Instances -# Find any instances on the target network that are NOT part of a managed instance group (like a KDC). -INSTANCES=$(gcloud compute instances list --filter="networkInterfaces.network.endsWith(\"/${CONFIG[NETWORK]}\") AND -name~gke-" --format="value(name)" 2>/dev/null) -if [[ -n "${INSTANCES}" ]]; then - echo "Found leftover GCE instances: ${INSTANCES}" | tee -a "${LEFTOVERS_FILE}" - if [[ "$STRICT_MODE" == false ]]; then - echo "Cleaning up..." - gcloud compute instances delete --quiet ${INSTANCES} & - fi -fi + # The "tr" command handles cases where no resources are found (to avoid errors) + # and where multiple resources are found (one per line). + local resources + resources=$(eval "${list_command}" | tr '\n' ' ' | sed 's/ *$//') -# 3. Firewall Rules -# Dataproc auto-creates firewall rules with the network name. We'll find them. -FIREWALL_RULES=$(gcloud compute firewall-rules list --filter="network.endsWith(\"/${CONFIG[NETWORK]}\")" --format="value(name)" 2>/dev/null) -if [[ -n "${FIREWALL_RULES}" ]]; then - echo "Found leftover Firewall Rules: ${FIREWALL_RULES}" | tee -a "${LEFTOVERS_FILE}" - if [[ "$STRICT_MODE" == false ]]; then - echo "Cleaning up..." - gcloud compute firewall-rules delete --quiet ${FIREWALL_RULES} & + if [[ -n "${resources}" ]]; then + echo "Found leftover ${resource_type}: ${resources}" | tee -a "${LEFTOVERS_FILE}" + if [[ "$STRICT_MODE" == false ]]; then + echo "Cleaning up ${resource_type}..." + # Some delete commands need resource name(s) first, others last. We assume last. + eval "${delete_command} ${resources}" & + fi fi -fi +} + +# --- Main Execution --- + +header "Pristine Check running in $([[ "$STRICT_MODE" == true ]] && echo 'STRICT' || echo 'CLEANUP') mode" + +# Define commands for each resource type. All are filtered by the CUJ_TAG where possible. +LIST_CLUSTERS_CMD="gcloud dataproc clusters list --region='${CONFIG[REGION]}' --filter='config.gceClusterConfig.tags.items=${CONFIG[CUJ_TAG]}' --format='value(clusterName)' 2>/dev/null" +DELETE_CLUSTERS_CMD="gcloud dataproc clusters delete --quiet --region='${CONFIG[REGION]}'" + +LIST_INSTANCES_CMD="gcloud compute instances list --filter='tags.items=${CONFIG[CUJ_TAG]}' --format='value(name)' 2>/dev/null" +DELETE_INSTANCES_CMD="gcloud compute instances delete --quiet --zone='${CONFIG[ZONE]}'" + +# Routers and Networks cannot be tagged, so we must rely on a naming convention for them. +LIST_ROUTERS_CMD="gcloud compute routers list --filter='name~^cuj-' --format='value(name)' 2>/dev/null" +DELETE_ROUTERS_CMD="gcloud compute routers delete --quiet --region='${CONFIG[REGION]}'" + +LIST_FIREWALLS_CMD="gcloud compute firewall-rules list --filter='targetTags.items=${CONFIG[CUJ_TAG]} OR name~^cuj-' --format='value(name)' 2>/dev/null" +DELETE_FIREWALLS_CMD="gcloud compute firewall-rules delete --quiet" + +# Process resources that can be deleted in parallel first. +process_resources "Dataproc Clusters" "${LIST_CLUSTERS_CMD}" "${DELETE_CLUSTERS_CMD}" +process_resources "GCE Instances" "${LIST_INSTANCES_CMD}" "${DELETE_INSTANCES_CMD}" +process_resources "Firewall Rules" "${LIST_FIREWALLS_CMD}" "${DELETE_FIREWALLS_CMD}" +process_resources "Cloud Routers" "${LIST_ROUTERS_CMD}" "${DELETE_ROUTERS_CMD}" -# Wait for all background cleanup jobs to finish before proceeding to network deletion. if [[ "$STRICT_MODE" == false ]]; then - echo "Waiting for resource cleanup to complete..." + echo "Waiting for initial resource cleanup to complete..." wait - echo "Cleanup complete." fi -# 4. VPC Network -# This is the last step, as the network cannot be deleted if resources are using it. -# We will use the function from our library here. -if gcloud compute networks describe "${CONFIG[NETWORK]}" &>/dev/null; then - echo "Found leftover VPC Network: ${CONFIG[NETWORK]}" | tee -a "${LEFTOVERS_FILE}" - if [[ "$STRICT_MODE" == false ]]; then - echo "Cleaning up..." - # The delete_network_and_subnet function is already quiet and handles non-existence. - delete_network_and_subnet - fi -fi +# Process networks last, as they have dependencies. +LIST_NETWORKS_CMD="gcloud compute networks list --filter='name~^cuj-' --format='value(name)' 2>/dev/null" +DELETE_NETWORKS_CMD="gcloud compute networks delete --quiet" +process_resources "VPC Networks" "${LIST_NETWORKS_CMD}" "${DELETE_NETWORKS_CMD}" +if [[ "$STRICT_MODE" == false ]]; then + wait +fi # --- Final Report --- if [[ -s "${LEFTOVERS_FILE}" ]]; then @@ -93,6 +99,7 @@ if [[ -s "${LEFTOVERS_FILE}" ]]; then echo "STRICT mode failed. The project is not pristine." >&2 exit 1 fi + # In non-strict mode, we report but don't fail, assuming the next run will succeed. fi echo "Pristine check complete." diff --git a/gcloud/cuj/gce/cloud-nat/manage.sh b/gcloud/cuj/gce/cloud-nat/manage.sh new file mode 100644 index 0000000..1e92e62 --- /dev/null +++ b/gcloud/cuj/gce/cloud-nat/manage.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# +# CUJ: GCE Cluster Management with Cloud NAT +# +# This script manages the lifecycle of a standard Dataproc on GCE cluster. +# It creates a dedicated VPC with a Cloud Router and NAT gateway to provide +# internet access for the cluster nodes without requiring public IP addresses. + +set -e + +function main() { + local SCRIPT_DIR + SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + source "${SCRIPT_DIR}/../../../lib/common.sh" + load_config + + # --- Define derived resource names --- + # All resource names are derived from the single top-level name in env.json + # to simplify configuration. + local cluster_name="${CONFIG[GCE_CLUSTER_NAME]}" + local network_name="${cluster_name}-net" + local subnet_name="${cluster_name}-subnet" + local router_name="${cluster_name}-router" + local firewall_prefix="${cluster_name}-fw" + + # --- Helper Functions --- + # These functions orchestrate calls to the common library. + function up() { + header "Provisioning environment for CUJ: ${cluster_name}" + # The library functions will be idempotent. + create_network "${network_name}" + create_subnet "${network_name}" "${subnet_name}" "${CONFIG[GCE_SUBNET_RANGE]}" "${CONFIG[REGION]}" + create_firewall_rules "${network_name}" "${firewall_prefix}" "${CONFIG[CUJ_TAG]}" + create_router "${network_name}" "${router_name}" "${CONFIG[REGION]}" "${CONFIG[GCE_ROUTER_ASN]}" + add_nat_gateway_to_router "${router_name}" "${CONFIG[REGION]}" + + header "Creating Dataproc cluster '${cluster_name}'" + create_gce_cluster "${cluster_name}" "${subnet_name}" "${CONFIG[REGION]}" "${CONFIG[CUJ_TAG]}" + echo "Environment for '${cluster_name}' is UP." + } + + function down() { + header "Tearing down environment for CUJ: ${cluster_name}" + # Teardown is in reverse order of creation. + delete_gce_cluster "${cluster_name}" "${CONFIG[REGION]}" + # Deleting a router also deletes its NAT gateway. + delete_router "${router_name}" "${CONFIG[REGION]}" + delete_firewall_rules "${firewall_prefix}" + # Deleting a network also deletes its subnets. + delete_network "${network_name}" + echo "Environment for '${cluster_name}' is DOWN." + } + + function validate() { + header "Validating APIs for CUJ: ${cluster_name}" + validate_apis "compute.googleapis.com" "dataproc.googleapis.com" + } + + + # --- Main command handler --- + case "$1" in + up) + up + ;; + down) + confirm "This will delete cluster '${cluster_name}' and its entire NATed network environment." + down + ;; + rebuild) + down || true + up + ;; + validate) + validate + ;; + *) + echo "Usage: $0 {up|down|rebuild|validate}" + exit 1 + ;; + esac +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/gcloud/cuj/gce/proxy-egress/manage.sh b/gcloud/cuj/gce/proxy-egress/manage.sh new file mode 100644 index 0000000..50aeb40 --- /dev/null +++ b/gcloud/cuj/gce/proxy-egress/manage.sh @@ -0,0 +1,122 @@ +#!/bin/bash +# +# CUJ: GCE Cluster with Proxy Egress +# +# This script manages the lifecycle of a Dataproc cluster that is configured +# to use a pre-existing Squid proxy for all its outbound internet traffic. +# It assumes the proxy was created by the 'gcloud/onboarding/create_squid_proxy.sh' script. + +set -e + +function main() { + local SCRIPT_DIR + SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + source "${SCRIPT_DIR}/../../../lib/common.sh" + load_config + + # --- Define derived resource names --- + # This CUJ is self-contained. All resource names are derived from a static + # base name, not from env.json. + local base_name="cuj-proxy-egress" + local cluster_name="${base_name}-cluster" + local network_name="${base_name}-net" + local subnet_name="${base_name}-subnet" + local subnet_range="10.200.0.0/24" # A distinct CIDR for this CUJ + local test_client_name="${base_name}-test-client" + # Name of the shared proxy VM *is* read from config. + local squid_vm_name="${CONFIG[SQUID_PROXY_VM_NAME]}" + + # --- Helper Functions --- + + function get_squid_ip() { + gcloud compute instances describe "${squid_vm_name}" \ + --zone="${CONFIG[ZONE]}" \ + --format='get(networkInterfaces[0].networkIP)' + } + + function validate() { + header "Validating prerequisites for Proxy Egress CUJ" + echo "Checking for Squid Proxy VM: ${squid_vm_name}..." + if ! gcloud compute instances describe "${squid_vm_name}" --zone="${CONFIG[ZONE]}" &>/dev/null; then + echo "ERROR: Squid Proxy VM '${squid_vm_name}' not found." >&2 + echo "Please run 'gcloud/onboarding/create_squid_proxy.sh' first." >&2 + exit 1 + fi + echo "Prerequisites met." + } + + function up() { + header "Provisioning environment for CUJ: ${base_name}" + validate + create_network "${network_name}" + # Create a subnet with Private Google Access disabled to force traffic through the proxy. + create_subnet "${network_name}" "${subnet_name}" "${subnet_range}" "${CONFIG[REGION]}" "false" + + local squid_ip + squid_ip=$(get_squid_ip) + local proxy_uri="http://${squid_ip}:3128" + + # Define the flags needed to configure the cluster for proxy usage. + local proxy_properties="core:fs.gs.proxy.address=${squid_ip}:3128" + local proxy_metadata="http_proxy=${proxy_uri},https_proxy=${proxy_uri},no_proxy=metadata.google.internal,localhost,127.0.0.1" + local extra_flags="--no-address --properties='${proxy_properties}' --metadata='${proxy_metadata}' --tags='${CONFIG[CUJ_TAG]}'" + + create_gce_cluster "${cluster_name}" "${CONFIG[REGION]}" "${subnet_name}" "${extra_flags}" + echo "Environment for '${base_name}' is UP." + } + + function down() { + header "Tearing down environment for CUJ: ${base_name}" + delete_gce_cluster "${cluster_name}" "${CONFIG[REGION]}" + delete_network "${network_name}" + echo "Environment for '${base_name}' is DOWN." + } + + function test_client_up() { + header "Creating test client VM: ${test_client_name}" + if ! gcloud compute instances describe "${test_client_name}" --zone="${CONFIG[ZONE]}" &>/dev/null; then + gcloud compute instances create "${test_client_name}" \ + --zone="${CONFIG[ZONE]}" --machine-type="e2-small" \ + --image-family="debian-12" --image-project="debian-cloud" \ + --subnet="${subnet_name}" --no-address --tags="${CONFIG[CUJ_TAG]}" + else + echo "Test client '${test_client_name}' already exists." + fi + } + + function test_client_down() { + header "Deleting test client VM: ${test_client_name}" + delete_gce_instance "${test_client_name}" "${CONFIG[ZONE]}" + } + + function test_client_run() { + header "Running proxy connectivity test from '${test_client_name}'" + local squid_ip + squid_ip=$(get_squid_ip) + echo "Proxy IP determined to be: ${squid_ip}" + gcloud compute ssh "${test_client_name}" \ + --zone="${CONFIG[ZONE]}" \ + --command="echo 'Testing proxy connectivity to google.com...' && curl -s --fail --verbose --proxy http://${squid_ip}:3128 https://www.google.com" \ + -- -q # -q suppresses host key warnings + echo "Proxy test completed successfully." + } + + # --- Main command handler --- + case "$1" in + validate) validate ;; + up) up ;; + down) confirm "This will delete cluster '${cluster_name}' and its dedicated private network." && down ;; + rebuild) down || true; up ;; + test-client-up) test_client_up ;; + test-client-down) confirm "This will delete the test client VM '${test_client_name}'." && test_client_down ;; + test-client-run) test_client_run ;; + *) + echo "Usage: $0 {validate|up|down|rebuild|test-client-up|test-client-down|test-client-run}" + exit 1 + ;; + esac +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/gcloud/cuj/gce/secure-web-proxy/README.md b/gcloud/cuj/gce/secure-web-proxy/README.md new file mode 100644 index 0000000..15d4ea2 --- /dev/null +++ b/gcloud/cuj/gce/secure-web-proxy/README.md @@ -0,0 +1,60 @@ +Here are the instructions for running the Secure Web Proxy (SWP) CUJ. + +----- + +### 1\. Configure Your Environment + +First, you'll need to set up your `env.json` file with your project details and the name for your new SWP instance. + +```bash +# Navigate to the gcloud directory +cd gcloud/ + +# Copy the sample configuration +cp env.json.sample env.json + +# Open env.json in your favorite editor and set the following keys: +# - PROJECT_ID +# - REGION +# - SWP_INSTANCE_NAME (e.g., "my-swp-instance") +vi env.json +``` + +----- + +### 2\. Run the Onboarding Script + +Next, you'll run the onboarding script to provision the SWP instance and its related resources. This is a one-time setup that may take a few minutes to complete. + +```bash +# From the gcloud/ directory, run the onboarding script +bash onboarding/create_swp_instance.sh +``` + +----- + +### 3\. Run the CUJ + +Now you're ready to run the CUJ itself. This will create a Dataproc cluster in a private network and configure it to use the SWP instance for all its outbound internet traffic. + +```bash +# Navigate to the CUJ directory +cd cuj/gce/secure-web-proxy/ + +# Create the Dataproc cluster and its related resources +bash manage.sh up + +# When you're finished, you can tear down the cluster and its resources +bash manage.sh down +``` + +----- + +### 4\. Cleanup + +When you're completely finished with your testing, you can run the teardown script to remove the SWP instance and its related resources. + +```bash +# From the gcloud/ directory, run the teardown script +bash onboarding/delete_swp_instance.sh +``` diff --git a/gcloud/cuj/gce/secure-web-proxy/manage.sh b/gcloud/cuj/gce/secure-web-proxy/manage.sh new file mode 100644 index 0000000..1f325aa --- /dev/null +++ b/gcloud/cuj/gce/secure-web-proxy/manage.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# +# CUJ: GCE Cluster with Secure Web Proxy Egress +# +# This script manages the lifecycle of a Dataproc cluster that is configured +# to use a pre-existing Secure Web Proxy (SWP) instance for all its outbound +# internet traffic. +# It assumes the SWP instance was created by the +# 'gcloud/onboarding/create_swp_instance.sh' script. + +set -e + +function main() { + local SCRIPT_DIR + SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + source "${SCRIPT_DIR}/../../../lib/common.sh" + load_config + + # --- Define derived resource names --- + local base_name="cuj-swp-egress" + local cluster_name="${base_name}-cluster" + local network_name="${base_name}-net" + local subnet_name="${base_name}-subnet" + local subnet_range="10.30.0.0/24" # A distinct CIDR for this CUJ + local policy_based_route_name="${base_name}-route" + local swp_instance_name="${CONFIG[SWP_INSTANCE_NAME]}" + + # --- Helper Functions --- + + function validate() { + header "Validating prerequisites for SWP Egress CUJ" + echo "Checking for SWP instance: ${swp_instance_name}..." + if ! gcloud alpha network-services gateways describe "${swp_instance_name}" --location="${CONFIG[REGION]}" &>/dev/null; then + echo "ERROR: SWP instance '${swp_instance_name}' not found." >&2 + echo "Please run 'gcloud/onboarding/create_swp_instance.sh' first." >&2 + exit 1 + fi + echo "Prerequisites met." + } + + function up() { + header "Provisioning environment for CUJ: ${base_name}" + validate + create_network "${network_name}" + create_subnet "${network_name}" "${subnet_name}" "${subnet_range}" "${CONFIG[REGION]}" "false" + + # Create a policy-based route to direct traffic to the SWP instance. + if ! gcloud compute policy-based-routes describe "${policy_based_route_name}" &>/dev/null; then + echo "Creating policy-based route: ${policy_based_route_name}" + gcloud compute policy-based-routes create "${policy_based_route_name}" \ + --network="${network_name}" \ + --source-range="${subnet_range}" \ + --destination-range="0.0.0.0/0" \ + --next-hop-ilb-ip="$(gcloud alpha network-services gateways describe "${swp_instance_name}" --location="${CONFIG[REGION]}" --format='value(addresses[0])')" \ + --priority=100 + else + echo "Policy-based route '${policy_based_route_name}' already exists." + fi + + create_gce_cluster "${cluster_name}" "${CONFIG[REGION]}" "${subnet_name}" "--tags='${CONFIG[CUJ_TAG]}'" + echo "Environment for '${base_name}' is UP." + } + + function down() { + header "Tearing down environment for CUJ: ${base_name}" + delete_gce_cluster "${cluster_name}" "${CONFIG[REGION]}" + if gcloud compute policy-based-routes describe "${policy_based_route_name}" &>/dev/null; then + echo "Deleting policy-based route: ${policy_based_route_name}" + gcloud compute policy-based-routes delete "${policy_based_route_name}" --quiet + else + echo "Policy-based route '${policy_based_route_name}' not found." + fi + delete_network "${network_name}" + echo "Environment for '${base_name}' is DOWN." + } + + # --- Main command handler --- + case "$1" in + validate) validate ;; + up) up ;; + down) confirm "This will delete cluster '${cluster_name}' and its dedicated private network and route." && down ;; + rebuild) down || true; up ;; + *) + echo "Usage: $0 {validate|up|down|rebuild}" + exit 1 + ;; + esac +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/gcloud/cuj/gce/standard/manage.sh b/gcloud/cuj/gce/standard/manage.sh deleted file mode 100644 index 61c995f..0000000 --- a/gcloud/cuj/gce/standard/manage.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash -# CUJ: Standard Dataproc Cluster Management - -function main() { - local SCRIPT_DIR - SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) - source "${SCRIPT_DIR}/../../lib/common.sh" - set -e - load_config - - function validate() { - header "Validating prerequisites" - echo "Checking for shared GCS bucket..." - if ! gsutil -q stat "gs://${CONFIG[SHARED_GCS_BUCKET]}/"; then - echo "ERROR: Shared GCS bucket 'gs://${CONFIG[SHARED_GCS_BUCKET]}/' not found." >&2 - echo "Please run the script in 'gcloud/onboarding/' first." >&2 - exit 1 - fi - echo "Prerequisites met." - } - - function create_cluster() { - # ---FIX--- - # Add defensive check for required configuration. - if [[ -z "${CONFIG[CLUSTER_NAME]}" || -z "${CONFIG[REGION]}" || -z "${CONFIG[SUBNET]}" || -z "${CONFIG[CUJ_TAG]}" ]]; then - echo "ERROR: One or more required keys (CLUSTER_NAME, REGION, SUBNET, CUJ_TAG) are missing from env.json" >&2 - exit 1 - fi - # ---END FIX--- - - echo "Creating Dataproc cluster '${CONFIG[CLUSTER_NAME]}'..." - set -x - gcloud dataproc clusters create "${CONFIG[CLUSTER_NAME]}" \ - --region="${CONFIG[REGION]}" \ - --subnet="${CONFIG[SUBNET]}" \ - --tags="${CONFIG[CUJ_TAG]}" \ - --format json - set +x - } - - function delete_cluster() { - if [[ -z "${CONFIG[CLUSTER_NAME]}" || -z "${CONFIG[REGION]}" ]]; then - echo "ERROR: One or more required keys (CLUSTER_NAME, REGION) are missing from env.json" >&2 - exit 1 - fi - echo "Deleting Dataproc cluster '${CONFIG[CLUSTER_NAME]}'..." - if gcloud dataproc clusters describe "${CONFIG[CLUSTER_NAME]}" --region="${CONFIG[REGION]}" &>/dev/null; then - gcloud dataproc clusters delete --quiet "${CONFIG[CLUSTER_NAME]}" --region="${CONFIG[REGION]}" - else - echo "Cluster '${CONFIG[CLUSTER_NAME]}' not found, skipping delete." - fi - } - - case "$1" in - validate) - validate - ;; - up) # Creates the full managed stack for this CUJ - validate - create_network_and_subnet - create_cluster - ;; - down) # Deletes the full managed stack for this CUJ - delete_cluster - delete_network_and_subnet - ;; - cluster-rebuild) # Cycles the cluster, leaves network - (delete_cluster) || true - create_cluster - ;; - *) - echo "Usage: $0 {validate|up|down|cluster-rebuild}" - exit 1 - ;; - esac -} - -if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then - main "$@" -fi diff --git a/gcloud/cuj/gke/cloud-nat/manage.sh b/gcloud/cuj/gke/cloud-nat/manage.sh new file mode 100644 index 0000000..bbbb3cd --- /dev/null +++ b/gcloud/cuj/gke/cloud-nat/manage.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# +# CUJ: Standard Dataproc on GKE Management +# +# This script manages the lifecycle of a standard Dataproc on GKE virtual +# cluster and its underlying GKE cluster foundation. + +set -e + +function main() { + local SCRIPT_DIR + SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + source "${SCRIPT_DIR}/../../../lib/common.sh" + load_config + + # --- Define derived resource names --- + # All resource names are derived from a static prefix. This CUJ is self-contained + # and requires no specific resource names in env.json. + local base_name="cuj-gke-standard" + local gke_cluster_name="${base_name}" + local virtual_cluster_name="${base_name}-dpgke" + + # --- Helper Functions --- + # These functions orchestrate calls to the common library. + function provision_gke_infra() { + header "Provisioning GKE Cluster for Dataproc: '${gke_cluster_name}'" + # This library function is idempotent and encapsulates creating the GKE + # cluster and the 'dataproc' node pool required for Dataproc on GKE. + create_gke_cluster_for_dataproc "${gke_cluster_name}" "${CONFIG[ZONE]}" "${CONFIG[CUJ_TAG]}" + } + + function teardown_gke_infra() { + header "Deleting GKE Cluster: '${gke_cluster_name}'" + delete_gke_cluster "${gke_cluster_name}" "${CONFIG[ZONE]}" + } + + function provision_virtual_cluster() { + header "Creating Dataproc virtual cluster '${virtual_cluster_name}'" + create_dpgke_virtual_cluster \ + "${virtual_cluster_name}" \ + "${gke_cluster_name}" \ + "${CONFIG[ZONE]}" \ + "${CONFIG[REGION]}" \ + "${CONFIG[SHARED_GCS_BUCKET]}" + } + + function teardown_virtual_cluster() { + header "Deleting Dataproc virtual cluster '${virtual_cluster_name}'" + delete_dpgke_virtual_cluster "${virtual_cluster_name}" "${CONFIG[REGION]}" + } + + function validate() { + header "Validating APIs for CUJ: ${base_name}" + validate_apis "container.googleapis.com" "dataproc.googleapis.com" + } + + + # --- Main command handler --- + case "$1" in + up) + validate + provision_gke_infra + provision_virtual_cluster + ;; + down) + confirm "This will delete virtual cluster '${virtual_cluster_name}' AND GKE cluster '${gke_cluster_name}'." + teardown_virtual_cluster + teardown_gke_infra + ;; + cluster-up) + validate + provision_virtual_cluster + ;; + cluster-down) + confirm "This will delete virtual cluster '${virtual_cluster_name}' but leave the GKE cluster." + teardown_virtual_cluster + ;; + cluster-rebuild) + (teardown_virtual_cluster) || true + provision_virtual_cluster + ;; + validate) + validate + ;; + *) + echo "Usage: $0 {up|down|cluster-up|cluster-down|cluster-rebuild|validate}" + exit 1 + ;; + esac +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/gcloud/env.json.sample b/gcloud/env.json.sample index 03385e7..ec4c5cc 100644 --- a/gcloud/env.json.sample +++ b/gcloud/env.json.sample @@ -1,10 +1,34 @@ { + "//": "================== Universal Settings ==================", + "//": "Settings for the overall test environment.", "PROJECT_ID": "your-gcp-project-id", "REGION": "us-central1", - "NETWORK": "cuj-network", - "SUBNET": "cuj-subnet", - "SUBNET_CIDR": "10.1.2.0/24", - "CLUSTER_NAME": "cuj-standard-cluster", - "SHARED_GCS_BUCKET": "your-cuj-shared-bucket", - "CUJ_TAG": "cuj-test-run" + "ZONE": "us-central1-c", + "CUJ_TAG": "cuj-test-run", + + "//": "================== Shared Infrastructure (Onboarding) ==================", + "//": "Names for persistent resources created by onboarding scripts.", + "SHARED_GCS_BUCKET": "your-project-id-cuj-shared-bucket", + "SQUID_PROXY_VM_NAME": "cuj-squid-proxy-vm", + "SHARED_SQL_INSTANCE_NAME": "cuj-shared-sql-instance", + "SHARED_SQL_ENGINE": "mysql", + "GCE_STANDARD_NETWORK": "cuj-gce-standard-network", + + "//": "================== Critical User Journey Test Suite ==================", + "//": "Defines the set of CUJs to be exercised by an orchestrator.", + "cuj_set": { + "gce": [ + "standard", + "proxy-egress", + "arm-cluster", + "gpu-cluster", + "kerberos-ranger-cluster" + ], + "gke": [ + "standard" + ], + "s8s": [ + "standard-batch" + ] + } } diff --git a/gcloud/lib/_core.sh b/gcloud/lib/_core.sh new file mode 100644 index 0000000..7e4409f --- /dev/null +++ b/gcloud/lib/_core.sh @@ -0,0 +1,73 @@ +#!/bin/bash +# +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This library contains the core, universal utility functions for the CUJ +# framework. It is sourced by the main common.sh library file. + +# A global associative array to hold configuration values. +declare -A CONFIG + +# Loads configuration from env.json into the CONFIG array. +# It also includes logic to dynamically generate a project ID if the +# default placeholder is found in the configuration file. +function load_config() { + local env_file + env_file="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)/../env.json" + if [[ ! -f "${env_file}" ]]; then + echo "ERROR: env.json not found at ${env_file}" >&2 + exit 1 + fi + while IFS='=' read -r key value; do + CONFIG["$key"]="$value" + done < <(jq -r 'to_entries|map("\(.key)=\(.value|tostring)")|.[]' < "${env_file}") + + # Dynamically generate project ID if the default placeholder is present. + if [[ "${CONFIG[PROJECT_ID]}" == "your-gcp-project-id" ]]; then + local user_prefix + # Sanitize the username to be a valid part of a project ID. + user_prefix=$(gcloud config get-value account | cut -d'@' -f1 | tr -c '[:alnum:]' '-') + CONFIG["PROJECT_ID"]="${user_prefix}-cuj-$(date +%Y%m%d)" + echo "NOTE: PROJECT_ID not set in env.json, dynamically generating: ${CONFIG[PROJECT_ID]}" + fi + + gcloud config set project "${CONFIG[PROJECT_ID]}" +} + +# Prints a formatted header message. +function header() { + echo "========================================================================" + echo " $1" + echo "========================================================================" +} + +# Prompts the user for confirmation before proceeding. +# Skips the prompt if the CI_TEST environment variable is set. +function confirm() { + if [[ -z "${CI_TEST}" ]]; then + read -p "$1 (y/N): " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo "Operation cancelled." + exit 1 + fi + fi +} + +# Validates that a list of necessary APIs are enabled, and enables them if not. +function validate_apis() { + header "Validating required APIs: $*" + gcloud services enable "$@" --project="${CONFIG[PROJECT_ID]}" +} diff --git a/gcloud/lib/_database.sh b/gcloud/lib/_database.sh new file mode 100644 index 0000000..c89bb4d --- /dev/null +++ b/gcloud/lib/_database.sh @@ -0,0 +1,150 @@ +#!/bin/bash +# +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This library contains functions for managing database resources like +# Cloud SQL and Bigtable. + +# --- Cloud SQL Instance Management --- +# These functions manage the lifecycle of the entire instance and are typically +# called by onboarding scripts. + +# Creates a Cloud SQL for MySQL instance with production-ready settings. +# Assumes VPC Peering has already been established on the network. +# Arguments: +# $1: The name of the Cloud SQL instance. +# $2: The name of the VPC network to peer with. +# $3: The region for the instance. +# $4: The availability type (e.g., "HA" for regional). +function create_mysql_instance() { + local instance_name="$1" + local network_name="$2" + local region="$3" + local availability_type="$4" + + local ha_flag="" + if [[ "${availability_type}" == "HA" ]]; then + ha_flag="--availability-type=REGIONAL" + fi + + if ! gcloud sql instances describe "${instance_name}" &>/dev/null; then + echo "Creating Cloud SQL MySQL instance: ${instance_name}" + gcloud sql instances create "${instance_name}" \ + --database-version="MYSQL_8_0" \ + --region="${region}" \ + --network="projects/${CONFIG[PROJECT_ID]}/global/networks/${network_name}" \ + --no-assign-ip \ + --enable-point-in-time-recovery \ + ${ha_flag} + else + echo "Cloud SQL MySQL instance '${instance_name}' already exists." + fi +} + +# Creates a Cloud SQL for PostgreSQL instance with production-ready settings. +function create_postgres_instance() { + local instance_name="$1" + local network_name="$2" + local region="$3" + local availability_type="$4" + + local ha_flag="" + if [[ "${availability_type}" == "HA" ]]; then + ha_flag="--availability-type=REGIONAL" + fi + + if ! gcloud sql instances describe "${instance_name}" &>/dev/null; then + echo "Creating Cloud SQL PostgreSQL instance: ${instance_name}" + # Ensure peering is set up first. + create_peering_ip_allocation "${network_name}" + create_vpc_peering_connection "${network_name}" + + gcloud sql instances create "${instance_name}" \ + --database-version="POSTGRES_14" \ + --region="${region}" \ + --network="projects/${CONFIG[PROJECT_ID]}/global/networks/${network_name}" \ + --no-assign-ip \ + --enable-point-in-time-recovery \ + ${ha_flag} + else + echo "Cloud SQL PostgreSQL instance '${instance_name}' already exists." + fi +} + +# Deletes any Cloud SQL instance. +function delete_sql_instance() { + local instance_name="$1" + if gcloud sql instances describe "${instance_name}" &>/dev/null; then + echo "Deleting Cloud SQL instance: ${instance_name}" + gcloud sql instances delete --quiet "${instance_name}" + else + echo "Cloud SQL instance '${instance_name}' not found." + fi +} + + +# --- Database Management --- +# These functions manage individual databases within a Cloud SQL instance and +# are typically called by CUJ scripts. + +function create_database_on_instance() { + local instance_name="$1" + local db_name="$2" + if ! gcloud sql databases describe "${db_name}" --instance="${instance_name}" &>/dev/null; then + echo "Creating database '${db_name}' on instance '${instance_name}'" + gcloud sql databases create "${db_name}" --instance="${instance_name}" + else + echo "Database '${db_name}' already exists on instance '${instance_name}'." + fi +} + +function delete_database_from_instance() { + local instance_name="$1" + local db_name="$2" + if gcloud sql databases describe "${db_name}" --instance="${instance_name}" &>/dev/null; then + echo "Deleting database '${db_name}' from instance '${instance_name}'" + gcloud sql databases delete "${db_name}" --instance="${instance_name}" --quiet + else + echo "Database '${db_name}' not found on instance '${instance_name}'." + fi +} + + +# --- Bigtable Functions --- + +function create_bigtable_instance() { + local instance_name="$1" + local display_name="$2" + local cluster_config="$3" # e.g., "id=my-cluster,zone=us-central1-b,nodes=1" + + if ! gcloud bigtable instances describe "${instance_name}" &>/dev/null; then + echo "Creating Bigtable instance: ${instance_name}" + gcloud bigtable instances create "${instance_name}" \ + --display-name="${display_name}" \ + --cluster-config="${cluster_config}" + else + echo "Bigtable instance '${instance_name}' already exists." + fi +} + +function delete_bigtable_instance() { + local instance_name="$1" + if gcloud bigtable instances describe "${instance_name}" &>/dev/null; then + echo "Deleting Bigtable instance: ${instance_name}" + gcloud bigtable instances delete --quiet "${instance_name}" + else + echo "Bigtable instance '${instance_name}' not found." + fi +} diff --git a/gcloud/lib/_dataproc.sh b/gcloud/lib/_dataproc.sh new file mode 100644 index 0000000..a37b94f --- /dev/null +++ b/gcloud/lib/_dataproc.sh @@ -0,0 +1,138 @@ +#!/bin/bash +# +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This library contains functions for managing Dataproc resources. +# It is sourced by the main 'common.sh' library. + +# --- Dataproc on GCE Functions --- + +# A comprehensive GCE cluster creation function. +# It takes required arguments for name, region, and subnet, and a single +# string containing all other optional gcloud flags. +# This allows for maximum flexibility for different CUJs. +# +# Example extra_flags: +# --tags="foo,bar" --properties="core:fs.gs.proxy.address=1.2.3.4:3128" +# --master-machine-type="e2-standard-4" --worker-accelerator="type=nvidia-tesla-t4,count=1" +# --service-account="my-sa@..." --shielded-secure-boot +# +function create_gce_cluster() { + local cluster_name="$1" + local region="$2" + local subnet_name="$3" + local extra_flags="$4" # A single string for all other flags + + if ! gcloud dataproc clusters describe "${cluster_name}" --region="${region}" &>/dev/null; then + header "Creating Dataproc on GCE cluster '${cluster_name}'" + # Using 'eval' is necessary here to correctly parse the string of extra flags. + eval gcloud dataproc clusters create "'${cluster_name}'" \ + --region="'${region}'" \ + --subnet="'${subnet_name}'" \ + ${extra_flags} + else + echo "Dataproc on GCE cluster '${cluster_name}' already exists." + fi +} + +function delete_gce_cluster() { + local cluster_name="$1" + local region="$2" + if gcloud dataproc clusters describe "${cluster_name}" --region="${region}" &>/dev/null; then + header "Deleting Dataproc on GCE cluster '${cluster_name}'" + gcloud dataproc clusters delete --quiet "${cluster_name}" --region="${region}" + else + echo "Dataproc on GCE cluster '${cluster_name}' not found." + fi +} + + +# --- Dataproc on GKE Functions --- + +function create_gke_cluster() { + local gke_cluster_name="$1" + local zone="$2" + local machine_type="${3:-e2-standard-4}" + local tags="${4:-cuj-gke-node}" + + if ! gcloud container clusters describe "${gke_cluster_name}" --zone="${zone}" &>/dev/null; then + header "Creating GKE cluster '${gke_cluster_name}' for Dataproc" + gcloud container clusters create "${gke_cluster_name}" \ + --zone="${zone}" \ + --machine-type="${machine_type}" \ + --num-nodes=1 \ + --enable-dataproc \ + --tags="${tags}" + else + echo "GKE cluster '${gke_cluster_name}' already exists." + fi +} + +function delete_gke_cluster() { + local gke_cluster_name="$1" + local zone="$2" + if gcloud container clusters describe "${gke_cluster_name}" --zone="${zone}" &>/dev/null; then + header "Deleting GKE cluster '${gke_cluster_name}'" + gcloud container clusters delete --quiet "${gke_cluster_name}" --zone="${zone}" + else + echo "GKE cluster '${gke_cluster_name}' not found." + fi +} + +function create_dpgke_virtual_cluster() { + local virtual_cluster_name="$1" + local region="$2" + local gke_cluster_name="$3" + local gke_cluster_zone="$4" + local staging_bucket="$5" + + if ! gcloud dataproc virtual-clusters describe "${virtual_cluster_name}" --region="${region}" &>/dev/null; then + header "Creating Dataproc virtual cluster '${virtual_cluster_name}'" + # The --enable-dataproc flag on the GKE cluster creates the 'dataproc' node pool by default. + gcloud dataproc virtual-clusters create "${virtual_cluster_name}" \ + --region="${region}" \ + --gke-cluster-name="${gke_cluster_name}" \ + --gke-cluster-zone="${gke_cluster_zone}" \ + --gke-node-pool="dataproc" \ + --staging-bucket="${staging_bucket}" + else + echo "Dataproc virtual cluster '${virtual_cluster_name}' already exists." + fi +} + +function delete_dpgke_virtual_cluster() { + local virtual_cluster_name="$1" + local region="$2" + if gcloud dataproc virtual-clusters describe "${virtual_cluster_name}" --region="${region}" &>/dev/null; then + header "Deleting Dataproc virtual cluster '${virtual_cluster_name}'" + gcloud dataproc virtual-clusters delete --quiet "${virtual_cluster_name}" --region="${region}" + else + echo "Dataproc virtual cluster '${virtual_cluster_name}' not found." + fi +} + +# --- Dataproc Helper Functions --- + +# Gets the full JSON representation of a cluster. +# Arguments: $1=cluster_name, $2=region +function get_cluster_json() { + gcloud dataproc clusters describe "$1" --region="$2" --format=json +} + +# Gets just the UUID of a cluster. +# Arguments: $1=cluster_name, $2=region +function get_cluster_uuid() { + get_cluster_json "$1" "$2" | jq -r .clusterUuid +} diff --git a/gcloud/lib/_network.sh b/gcloud/lib/_network.sh new file mode 100644 index 0000000..28f3384 --- /dev/null +++ b/gcloud/lib/_network.sh @@ -0,0 +1,185 @@ +#!/bin/bash +# +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This library contains all common functions for managing network resources. + +# --- VPC Network Functions --- + +function create_network() { + local network_name="$1" + if ! gcloud compute networks describe "${network_name}" &>/dev/null; then + echo "Creating VPC Network: ${network_name}" + gcloud compute networks create "${network_name}" \ + --subnet-mode=custom \ + --bgp-routing-mode="regional" \ + --description="VPC for CUJ workloads" + else + echo "VPC Network '${network_name}' already exists." + fi +} + +function delete_network() { + local network_name="$1" + if gcloud compute networks describe "${network_name}" &>/dev/null; then + echo "Deleting VPC Network: ${network_name}" + gcloud compute networks delete --quiet "${network_name}" + else + echo "VPC Network '${network_name}' not found." + fi +} + +# --- Subnet Functions --- + +function create_subnet() { + local network_name="$1" + local subnet_name="$2" + local subnet_range="$3" + local region="$4" + local enable_private_access="${5:-true}" + + if ! gcloud compute networks subnets describe "${subnet_name}" --region="${region}" &>/dev/null; then + echo "Creating Subnet: ${subnet_name}" + local private_access_flag="" + if [[ "${enable_private_access}" == "true" ]]; then + private_access_flag="--enable-private-ip-google-access" + fi + gcloud compute networks subnets create "${subnet_name}" \ + --network="${network_name}" \ + --range="${subnet_range}" \ + --region="${region}" \ + ${private_access_flag} + else + echo "Subnet '${subnet_name}' already exists." + fi +} + +function delete_subnet() { + local subnet_name="$1" + local region="$2" + if gcloud compute networks subnets describe "${subnet_name}" --region="${region}" &>/dev/null; then + echo "Deleting subnet '${subnet_name}'..." + gcloud compute networks subnets delete --quiet "${subnet_name}" --region="${region}" + else + echo "Subnet '${subnet_name}' not found." + fi +} + +# --- Firewall Rule Functions --- + +function create_firewall_rule() { + local rule_name="$1" + local network_name="$2" + # direction, action, rules, source_ranges, target_tags are passed in a single string + local other_flags="$3" + + if ! gcloud compute firewall-rules describe "${rule_name}" &>/dev/null; then + echo "Creating firewall rule '${rule_name}'..." + eval gcloud compute firewall-rules create "'${rule_name}'" --network="'${network_name}'" ${other_flags} + else + echo "Firewall rule '${rule_name}' already exists." + fi +} + +function delete_firewall_rule() { + local rule_name="$1" + if gcloud compute firewall-rules describe "${rule_name}" &>/dev/null; then + echo "Deleting firewall rule '${rule_name}'..." + gcloud compute firewall-rules delete --quiet "${rule_name}" + else + echo "Firewall rule '${rule_name}' not found." + fi +} + + +# --- Router and NAT Functions --- + +function create_router() { + local router_name="$1" + local network_name="$2" + local region="$3" + local asn="$4" + if ! gcloud compute routers describe "${router_name}" --region="${region}" &>/dev/null; then + echo "Creating Cloud Router: ${router_name}" + gcloud compute routers create "${router_name}" \ + --network="${network_name}" --asn="${asn}" --region="${region}" + else + echo "Cloud Router '${router_name}' already exists." + fi +} + +function delete_router() { + local router_name="$1" + local region="$2" + if gcloud compute routers describe "${router_name}" --region="${region}" &>/dev/null; then + echo "Deleting Cloud Router: ${router_name}" + gcloud compute routers delete --quiet "${router_name}" --region="${region}" + else + echo "Cloud Router '${router_name}' not found." + fi +} + +function add_nat_gateway_to_router() { + local router_name="$1" + local region="$2" + local nat_name="${router_name}-nat" + if ! gcloud compute routers nats describe "${nat_name}" --router="${router_name}" --region="${region}" &>/dev/null; then + echo "Adding NAT Gateway '${nat_name}' to router '${router_name}'" + gcloud compute routers nats create "${nat_name}" \ + --router="${router_name}" --region="${region}" \ + --nat-all-subnet-ip-ranges --auto-allocate-nat-external-ips + else + echo "NAT Gateway '${nat_name}' already exists." + fi +} + + +# --- VPC Peering Functions for Cloud SQL --- + +function create_peering_ip_allocation() { + local network_name="$1" + local allocation_name="${network_name}-sql-peer" + if ! gcloud compute addresses describe "${allocation_name}" --global &>/dev/null; then + echo "Creating IP Allocation for SQL Peering: ${allocation_name}" + gcloud compute addresses create "${allocation_name}" \ + --global --purpose=VPC_PEERING --prefix-length=16 --network="${network_name}" + else + echo "IP Allocation '${allocation_name}' already exists." + fi +} + +function delete_peering_ip_allocation() { + local network_name="$1" + local allocation_name="${network_name}-sql-peer" + if gcloud compute addresses describe "${allocation_name}" --global &>/dev/null; then + echo "Deleting IP Allocation '${allocation_name}'..." + gcloud compute addresses delete --quiet "${allocation_name}" --global + else + echo "IP Allocation '${allocation_name}' not found." + fi +} + +function create_vpc_peering_connection() { + local network_name="$1" + local allocation_name="${network_name}-sql-peer" + # The name of the peering connection is fixed by the service. + if ! gcloud services vpc-peerings list --network="${network_name}" | grep -q "servicenetworking-googleapis-com"; then + echo "Creating VPC Peering Connection for Service Networking..." + gcloud services vpc-peerings connect --service=servicenetworking.googleapis.com \ + --ranges="${allocation_name}" --network="${network_name}" + else + echo "VPC Peering Connection already exists for network '${network_name}'." + fi +} diff --git a/gcloud/lib/_security.sh b/gcloud/lib/_security.sh new file mode 100644 index 0000000..958d695 --- /dev/null +++ b/gcloud/lib/_security.sh @@ -0,0 +1,108 @@ +#!/bin/bash +# +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This library contains common functions related to security configurations, +# such as KMS and Secret Manager, often used as prerequisites for +# secure Dataproc environments like Kerberos. + +# --- KMS Functions --- + +# Creates a KMS KeyRing if it does not already exist. +# +# $1: KeyRing name +# $2: Location (e.g., "global" or a specific region) +function create_kms_keyring() { + local keyring_name="$1" + local location="$2" + if ! gcloud kms keyrings describe "${keyring_name}" --location="${location}" &>/dev/null; then + echo "Creating KMS KeyRing: ${keyring_name}" + gcloud kms keyrings create "${keyring_name}" --location="${location}" + else + echo "KMS KeyRing '${keyring_name}' already exists." + fi +} + +# Creates a KMS Key within a KeyRing if it does not already exist. +# +# $1: Key name +# $2: KeyRing name +# $3: Location (e.g., "global" or a specific region) +function create_kms_key() { + local key_name="$1" + local keyring_name="$2" + local location="$3" + if ! gcloud kms keys describe "${key_name}" --keyring="${keyring_name}" --location="${location}" &>/dev/null; then + echo "Creating KMS Key: ${key_name}" + gcloud kms keys create "${key_name}" \ + --keyring="${keyring_name}" \ + --location="${location}" \ + --purpose="encryption" + else + echo "KMS Key '${key_name}' already exists." + fi +} + +# --- Secret Manager Functions --- + +# Creates a secret with a given payload if it does not already exist. +# +# $1: Secret name +# $2: The secret data/payload as a string +function create_secret_from_string() { + local secret_name="$1" + local secret_data="$2" + + if ! gcloud secrets describe "${secret_name}" &>/dev/null; then + echo "Creating Secret: ${secret_name}" + # Create the secret container + gcloud secrets create "${secret_name}" --replication-policy="automatic" + # Add the first version of the secret from the provided string + printf "%s" "${secret_data}" | gcloud secrets versions add "${secret_name}" --data-file=- + else + echo "Secret '${secret_name}' already exists." + fi +} + +# Deletes a secret and all its versions. +# +# $1: Secret name +function delete_secret() { + local secret_name="$1" + if gcloud secrets describe "${secret_name}" &>/dev/null; then + echo "Deleting Secret: ${secret_name}" + gcloud secrets delete --quiet "${secret_name}" + else + echo "Secret '${secret_name}' not found." + fi +} + +# --- Kerberos-Specific Prerequisite Functions --- + +# Generates a random password and stores it as a new secret in Secret Manager. +# +# $1: Secret name for the password +function create_kerberos_password_secret() { + local secret_name="$1" + if ! gcloud secrets describe "${secret_name}" &>/dev/null; then + header "Generating and storing Kerberos password in Secret Manager" + # Generate a random 32-character alphanumeric password + local random_password + random_password=$(head /dev/urandom | tr -dc 'A-Za-z0-9' | head -c 32) + create_secret_from_string "${secret_name}" "${random_password}" + else + echo "Kerberos password secret '${secret_name}' already exists." + fi +} diff --git a/gcloud/lib/common.sh b/gcloud/lib/common.sh index 4ab06b3..2b2dc62 100644 --- a/gcloud/lib/common.sh +++ b/gcloud/lib/common.sh @@ -14,82 +14,16 @@ # See the License for the specific language governing permissions and # limitations under the License. # -# This library contains common functions for use in CUJ scripts. - -# A global associative array to hold configuration values. -declare -A CONFIG - -# Loads configuration from env.json into the CONFIG array. -function load_config() { - # This assumes env.json is in the gcloud/ directory, two levels above the cuj/*/ directory - local env_file - env_file="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)/../env.json" - - if [[ ! -f "${env_file}" ]]; then - echo "ERROR: env.json not found at ${env_file}" >&2 - exit 1 - fi - - # Read all keys and values from JSON into the CONFIG array - while IFS='=' read -r key value; do - CONFIG["$key"]="$value" - done < <(jq -r 'to_entries|map("\(.key)=\(.value|tostring)")|.[]' < "${env_file}") - - # Set the project for all subsequent gcloud commands - gcloud config set project "${CONFIG[PROJECT_ID]}" -} - -# Prints a formatted header message. -function header() { - echo "========================================================================" - echo " $1" - echo "========================================================================" -} - -# Prompts the user for confirmation before proceeding. -function confirm() { - # When running in an automated test, skip the confirmation. - if [[ -z "${CI_TEST}" ]]; then - read -p "$1 (y/N): " -n 1 -r - echo - if [[ ! $REPLY =~ ^[Yy]$ ]]; then - echo "Operation cancelled." - exit 1 - fi - fi -} - -# Creates the VPC network and a subnet within it. -# (Inside common.sh) -function create_network_and_subnet() { - # ... (logic to check if network exists) - gcloud compute networks create "${CONFIG[NETWORK]}" \ - --subnet-mode=custom \ - --description="Network for CUJ testing" \ - --bgp-routing-mode="regional" - - # ... (logic to check if subnet exists) - gcloud compute networks subnets create "${CONFIG[SUBNET]}" \ - --network="${CONFIG[NETWORK]}" \ - --range="${CONFIG[SUBNET_CIDR]}" \ - --region="${CONFIG[REGION]}" - - # Add firewall rule with the tag to allow SSH - gcloud compute firewall-rules create "${CONFIG[CUJ_TAG]}-allow-ssh" \ - --network="${CONFIG[NETWORK]}" \ - --allow=tcp:22 \ - --source-ranges="0.0.0.0/0" \ - --description="Allow SSH for CUJ test" \ - --target-tags="${CONFIG[CUJ_TAG]}" -} - -# Deletes the VPC network. Subnets are deleted automatically with the network. -function delete_network_and_subnet() { - header "Deleting VPC Network: ${CONFIG[NETWORK]}" - if gcloud compute networks describe "${CONFIG[NETWORK]}" &>/dev/null; then - gcloud compute networks delete --quiet "${CONFIG[NETWORK]}" - echo "Network ${CONFIG[NETWORK]} and its subnets have been deleted." - else - echo "Network ${CONFIG[NETWORK]} not found." - fi -} +# This is the main entrypoint for the CUJ shell script library. +# It sources all other library components, making them available to any +# script that sources this file. + +# Determine the directory where this script resides to reliably source other files. +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + +# Source all library components. +source "${SCRIPT_DIR}/_core.sh" +source "${SCRIPT_DIR}/_network.sh" +source "${SCRIPT_DIR}/_dataproc.sh" +source "${SCRIPT_DIR}/_database.sh" +source "${SCRIPT_DIR}/_security.sh" diff --git a/gcloud/onboarding/create_cloudsql_instance.sh b/gcloud/onboarding/create_cloudsql_instance.sh new file mode 100644 index 0000000..1af732e --- /dev/null +++ b/gcloud/onboarding/create_cloudsql_instance.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# +# Creates the shared, persistent Cloud SQL instance based on the engine +# specified in env.json. +# +# This script is idempotent and follows GCP best practices by setting up a +# High Availability (HA) instance with a Private IP, connected to the main +# GCE network via VPC Service Peering. + +set -e + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +source "${SCRIPT_DIR}/../lib/common.sh" +load_config + +function main() { + header "Onboarding: Setting up Shared Cloud SQL Instance" + + # 1. Define resource names from the config file. Default to mysql if not set. + local instance_name="${CONFIG[SHARED_SQL_INSTANCE_NAME]}" + local db_engine="${CONFIG[SHARED_SQL_ENGINE]:-mysql}" + local network_name="${CONFIG[GCE_STANDARD_NETWORK]}" + local region="${CONFIG[REGION]}" + + # 2. Validate that the necessary APIs are enabled. + validate_apis "sqladmin.googleapis.com" "servicenetworking.googleapis.com" "compute.googleapis.com" + + # 3. Ensure the main VPC network and the VPC Peering connection exist. + # These library functions are idempotent. + create_network "${network_name}" + create_peering_ip_allocation "${network_name}" + create_vpc_peering_connection "${network_name}" + + # 4. Dispatch to the correct library function based on the selected engine. + # We pass "HA" to enable the high-availability flag. + header "Provisioning Cloud SQL instance '${instance_name}' with engine '${db_engine}'" + case "${db_engine}" in + mysql) + create_mysql_instance "${instance_name}" "${network_name}" "${region}" "HA" + ;; + postgres) + create_postgres_instance "${instance_name}" "${network_name}" "${region}" "HA" + ;; + *) + echo "ERROR: Unsupported database engine '${db_engine}' specified in env.json." >&2 + echo "Supported values are 'mysql' or 'postgres'." >&2 + exit 1 + ;; + esac + + echo "Onboarding of Cloud SQL infrastructure is complete." + local private_ip + private_ip=$(gcloud sql instances describe "${instance_name}" --format="value(ipAddresses.ipAddress)") + echo "--> Private IP Address: ${private_ip}" +} + +main diff --git a/gcloud/onboarding/create_squid_proxy.sh b/gcloud/onboarding/create_squid_proxy.sh new file mode 100644 index 0000000..9c5198e --- /dev/null +++ b/gcloud/onboarding/create_squid_proxy.sh @@ -0,0 +1,73 @@ +#!/bin/bash +# +# Creates the shared, persistent Squid Proxy VM and its networking. +# This script is idempotent and can be re-run safely. + +set -e + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +source "${SCRIPT_DIR}/../lib/common.sh" +load_config + +function main() { + header "Onboarding: Setting up Squid Proxy Infrastructure" + + # 1. Define resource names. The proxy onboarding script defines its own + # network names to ensure isolation and minimal configuration. + local squid_vm_name="${CONFIG[SQUID_PROXY_VM_NAME]}" + local external_vpc_name="cuj-external-internet-vpc" + local proxy_internal_net="cuj-proxy-internal-network" + local proxy_internal_subnet="cuj-proxy-internal-subnet" + local proxy_internal_range="10.20.0.0/24" + + + # 2. Create the external network (with internet access) if it doesn't exist. + if ! gcloud compute networks describe "${external_vpc_name}" &>/dev/null; then + echo "Creating external VPC: ${external_vpc_name}" + gcloud compute networks create "${external_vpc_name}" \ + --subnet-mode=auto \ + --description="External VPC with internet access for CUJ proxy VMs" + else + echo "External VPC '${external_vpc_name}' already exists." + fi + + # 3. Create the dedicated internal network for the proxy test environment. + create_network "${proxy_internal_net}" + create_subnet "${proxy_internal_net}" "${proxy_internal_subnet}" "${proxy_internal_range}" "${CONFIG[REGION]}" "false" + + # 4. Create the Squid Proxy VM if it doesn't exist. + if ! gcloud compute instances describe "${squid_vm_name}" --zone="${CONFIG[ZONE]}" &>/dev/null; then + echo "Creating Squid Proxy VM: ${squid_vm_name}" + gcloud compute instances create "${squid_vm_name}" \ + --zone="${CONFIG[ZONE]}" \ + --machine-type="e2-medium" \ + --image-family="debian-12" \ + --image-project="debian-cloud" \ + --tags="${CONFIG[CUJ_TAG]},squid-proxy" \ + --network-interface="network=${proxy_internal_net},subnet=${proxy_internal_subnet},no-address" \ + --network-interface="network=${external_vpc_name}" \ + --metadata="internal_subnet_range=${proxy_internal_range}" \ + --metadata-from-file="startup-script=${SCRIPT_DIR}/install_squid.sh" + else + echo "Squid Proxy VM '${squid_vm_name}' already exists." + fi + + # 5. Create a firewall rule to allow traffic from the proxy's internal subnet. + local firewall_rule_name="${CONFIG[CUJ_TAG]}-allow-proxy-internal-ingress" + if ! gcloud compute firewall-rules describe "${firewall_rule_name}" &>/dev/null; then + echo "Creating firewall rule to allow access to the proxy..." + gcloud compute firewall-rules create "${firewall_rule_name}" \ + --network="${proxy_internal_net}" \ + --allow="tcp:3128" \ + --source-ranges="${proxy_internal_range}" \ + --target-tags="squid-proxy" \ + --description="Allow internal traffic to the CUJ Squid proxy" + else + echo "Firewall rule '${firewall_rule_name}' already exists." + fi + + echo "Onboarding of Squid Proxy infrastructure is complete." + echo "NOTE: It may take a few minutes for the startup script to finish installing Squid on the VM." +} + +main diff --git a/gcloud/onboarding/create_swp_instance.sh b/gcloud/onboarding/create_swp_instance.sh new file mode 100644 index 0000000..6906f03 --- /dev/null +++ b/gcloud/onboarding/create_swp_instance.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# +# Creates the shared, persistent Secure Web Proxy (SWP) instance. +# This script is idempotent and can be re-run safely. + +set -e + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +source "${SCRIPT_DIR}/../lib/common.sh" +load_config + +function main() { + header "Onboarding: Setting up Secure Web Proxy Infrastructure" + + # 1. Define resource names from the config file. + local swp_instance_name="${CONFIG[SWP_INSTANCE_NAME]}" + local network_name="${CONFIG[GCE_STANDARD_NETWORK]}" + local region="${CONFIG[REGION]}" + local project_id="${CONFIG[PROJECT_ID]}" + local proxy_only_subnet_name="${network_name}-swp-proxy-only" + local proxy_only_subnet_range="10.10.1.0/24" # Use a distinct range + local certificate_name="${swp_instance_name}-cert" + local security_policy_name="${swp_instance_name}-policy" + + # 2. Validate that the necessary APIs are enabled. + validate_apis "networkservices.googleapis.com" "networksecurity.googleapis.com" "certificatemanager.googleapis.com" + + # 3. Ensure the main network exists before adding a subnet to it. + create_network "${network_name}" + + # 4. Create a proxy-only subnet. + if ! gcloud compute networks subnets describe "${proxy_only_subnet_name}" --region="${region}" &>/dev/null; then + echo "Creating proxy-only subnet: ${proxy_only_subnet_name}" + gcloud compute networks subnets create "${proxy_only_subnet_name}" \ + --purpose=REGIONAL_MANAGED_PROXY \ + --role=ACTIVE \ + --region="${region}" \ + --network="${network_name}" \ + --range="${proxy_only_subnet_range}" + else + echo "Proxy-only subnet '${proxy_only_subnet_name}' already exists." + fi + + # 5. Create and upload a self-signed SSL certificate for TLS inspection. + if ! gcloud certificate-manager certificates describe "${certificate_name}" --location="${region}" &>/dev/null; then + echo "Creating self-signed certificate: ${certificate_name}" + openssl req -x509 -newkey rsa:2048 -nodes \ + -keyout private.key -out certificate.crt \ + -days 365 -subj "/CN=swp.example.com" + gcloud certificate-manager certificates create "${certificate_name}" \ + --certificate-file=certificate.crt \ + --private-key-file=private.key \ + --location="${region}" + rm private.key certificate.crt + else + echo "Certificate '${certificate_name}' already exists." + fi + + # 6. Create a Gateway Security Policy. + if ! gcloud network-security gateway-security-policies describe "${security_policy_name}" --location="${region}" &>/dev/null; then + echo "Creating Gateway Security Policy: ${security_policy_name}" + gcloud network-security gateway-security-policies create "${security_policy_name}" \ + --location="${region}" + else + echo "Gateway Security Policy '${security_policy_name}' already exists." + fi + + # 7. Create the Secure Web Proxy Gateway instance. + if ! gcloud alpha network-services gateways describe "${swp_instance_name}" --location="${region}" &>/dev/null; then + echo "Creating Secure Web Proxy Gateway: ${swp_instance_name}" + gcloud alpha network-services gateways create "${swp_instance_name}" \ + --location="${region}" \ + --network="${network_name}" \ + --ports=443 \ + --certificate-urls="projects/${project_id}/locations/${region}/certificates/${certificate_name}" \ + --gateway-security-policy="projects/${project_id}/locations/${region}/gatewaySecurityPolicies/${security_policy_name}" + else + echo "Secure Web Proxy Gateway '${swp_instance_name}' already exists." + fi + + echo "Onboarding of Secure Web Proxy infrastructure is complete." +} + +main diff --git a/gcloud/onboarding/delete_cloudsql_instance.sh b/gcloud/onboarding/delete_cloudsql_instance.sh new file mode 100644 index 0000000..a0a4699 --- /dev/null +++ b/gcloud/onboarding/delete_cloudsql_instance.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# +# Tears down the shared, persistent Cloud SQL instance and its associated +# VPC Peering connection and IP range allocation. +# This script is idempotent and can be re-run safely. + +set -e + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +source "${SCRIPT_DIR}/../lib/common.sh" +load_config + +function main() { + header "Onboarding Teardown: Deleting Shared Cloud SQL Infrastructure" + + # 1. Define resource names from the config file. + local instance_name="${CONFIG[SHARED_SQL_INSTANCE_NAME]}" + local network_name="${CONFIG[GCE_STANDARD_NETWORK]}" + + # 2. Delete the Cloud SQL instance first. + # This function from common.sh is generic and works for any engine. + delete_sql_instance "${instance_name}" + + # 3. Delete the VPC Peering connection. + # This library function must exist in _network.sh and be idempotent. + delete_vpc_peering_connection "${network_name}" + + # 4. Delete the reserved IP range for the peering. + # This library function must exist in _network.sh and be idempotent. + delete_peering_ip_allocation "${network_name}" + + echo "Teardown of Cloud SQL infrastructure is complete." +} + +main + diff --git a/gcloud/onboarding/delete_squid_proxy.sh b/gcloud/onboarding/delete_squid_proxy.sh new file mode 100644 index 0000000..9b2b98b --- /dev/null +++ b/gcloud/onboarding/delete_squid_proxy.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# +# Tears down the shared, persistent Squid Proxy VM and related networking. +# This script is idempotent and can be re-run safely. + +set -e + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +source "${SCRIPT_DIR}/../lib/common.sh" +load_config + +function main() { + header "Onboarding Teardown: Deleting Squid Proxy Infrastructure" + + # 1. Define resource names from the config file. + local squid_vm_name="${CONFIG[SQUID_PROXY_VM_NAME]}" + local external_network_name="${CONFIG[EXTERNAL_VPC_NAME]}" + local firewall_rule_name="${CONFIG[CUJ_TAG]}-allow-proxy-ingress" + local zone="${CONFIG[ZONE]}" + + + # 2. Delete the firewall rule that allows access to the proxy. + if gcloud compute firewall-rules describe "${firewall_rule_name}" &>/dev/null; then + echo "Deleting firewall rule '${firewall_rule_name}'..." + gcloud compute firewall-rules delete --quiet "${firewall_rule_name}" + else + echo "Firewall rule '${firewall_rule_name}' not found, skipping." + fi + + # 3. Delete the Squid Proxy VM. + if gcloud compute instances describe "${squid_vm_name}" --zone="${zone}" &>/dev/null; then + echo "Deleting Squid Proxy VM '${squid_vm_name}'..." + gcloud compute instances delete --quiet "${squid_vm_name}" --zone="${zone}" + else + echo "Squid Proxy VM '${squid_vm_name}' not found, skipping." + fi + + # 4. Delete the external VPC network. + # This script does not touch the main internal GCE network, as other CUJs may use it. + if gcloud compute networks describe "${external_network_name}" &>/dev/null; then + echo "Deleting external VPC '${external_network_name}'..." + gcloud compute networks delete --quiet "${external_network_name}" + else + echo "External VPC '${external_network_name}' not found, skipping." + fi + + echo "Teardown of Squid Proxy infrastructure is complete." +} + +main diff --git a/gcloud/onboarding/delete_swp_instance.sh b/gcloud/onboarding/delete_swp_instance.sh new file mode 100644 index 0000000..d5fdc36 --- /dev/null +++ b/gcloud/onboarding/delete_swp_instance.sh @@ -0,0 +1,59 @@ +#!/bin/bash +# +# Tears down the shared, persistent Secure Web Proxy (SWP) instance. +# This script is idempotent and can be re-run safely. + +set -e + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +source "${SCRIPT_DIR}/../lib/common.sh" +load_config + +function main() { + header "Onboarding Teardown: Deleting Secure Web Proxy Infrastructure" + + # 1. Define resource names from the config file. + local swp_instance_name="${CONFIG[SWP_INSTANCE_NAME]}" + local network_name="${CONFIG[GCE_STANDARD_NETWORK]}" + local region="${CONFIG[REGION]}" + local project_id="${CONFIG[PROJECT_ID]}" + local proxy_only_subnet_name="${network_name}-swp-proxy-only" + local certificate_name="${swp_instance_name}-cert" + local security_policy_name="${swp_instance_name}-policy" + + # 2. Delete the Secure Web Proxy Gateway instance. + if gcloud alpha network-services gateways describe "${swp_instance_name}" --location="${region}" &>/dev/null; then + echo "Deleting Secure Web Proxy Gateway: ${swp_instance_name}" + gcloud alpha network-services gateways delete "${swp_instance_name}" --location="${region}" --quiet + else + echo "Secure Web Proxy Gateway '${swp_instance_name}' not found." + fi + + # 3. Delete the Gateway Security Policy. + if gcloud network-security gateway-security-policies describe "${security_policy_name}" --location="${region}" &>/dev/null; then + echo "Deleting Gateway Security Policy: ${security_policy_name}" + gcloud network-security gateway-security-policies delete "${security_policy_name}" --location="${region}" --quiet + else + echo "Gateway Security Policy '${security_policy_name}' not found." + fi + + # 4. Delete the SSL certificate. + if gcloud certificate-manager certificates describe "${certificate_name}" --location="${region}" &>/dev/null; then + echo "Deleting certificate: ${certificate_name}" + gcloud certificate-manager certificates delete "${certificate_name}" --location="${region}" --quiet + else + echo "Certificate '${certificate_name}' not found." + fi + + # 5. Delete the proxy-only subnet. + if gcloud compute networks subnets describe "${proxy_only_subnet_name}" --region="${region}" &>/dev/null; then + echo "Deleting proxy-only subnet: ${proxy_only_subnet_name}" + gcloud compute networks subnets delete "${proxy_only_subnet_name}" --region="${region}" --quiet + else + echo "Proxy-only subnet '${proxy_only_subnet_name}' not found." + fi + + echo "Teardown of Secure Web Proxy infrastructure is complete." +} + +main diff --git a/gcloud/onboarding/install_squid.sh b/gcloud/onboarding/install_squid.sh new file mode 100644 index 0000000..2f1f567 --- /dev/null +++ b/gcloud/onboarding/install_squid.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# +# Startup script for the Squid Proxy VM. +# This script is executed automatically on the first boot of the instance. + +set -e +set -x + +# --- Installation --- +# Update package lists and install Squid silently. +export DEBIAN_FRONTEND=noninteractive +apt-get -q update +apt-get -q install -y squid + +# --- Configuration --- + +# Fetch the internal subnet range from the GCE metadata server. +# This value is passed in during the 'gcloud compute instances create' command. +INTERNAL_SUBNET_RANGE=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/internal_subnet_range) + +if [[ -z "${INTERNAL_SUBNET_RANGE}" ]]; then + echo "FATAL: Could not determine internal subnet range from instance metadata." >&2 + exit 1 +fi + +# Backup the original squid configuration +mv /etc/squid/squid.conf /etc/squid/squid.conf.original + +# Create a new, secure squid.conf from scratch. +# This configuration only allows access from the specified internal subnet. +cat < /etc/squid/squid.conf +# +# Squid Proxy Configuration - Generated by CUJ startup script +# + +# Define an Access Control List (ACL) for the internal private network +acl internal_network src ${INTERNAL_SUBNET_RANGE} + +# Define standard ports +acl SSL_ports port 443 +acl Safe_ports port 80 # http +acl Safe_ports port 443 # https +acl CONNECT method CONNECT + +# Deny requests to non-standard ports +http_access deny !Safe_ports +http_access deny CONNECT !SSL_ports + +# Allow access only from our internal network. +# The 'deny all' rule is crucial for security. +http_access allow internal_network +http_access deny all + +# Listen on the standard proxy port 3128 on all available interfaces +http_port 0.0.0.0:3128 + +# Use Google's reliable public DNS servers +dns_nameservers 8.8.8.8 8.8.4.4 + +# Turn off Via and other headers for cleaner requests +via off +forwarded_for off +request_header_access From deny all +request_header_access Server deny all +request_header_access User-Agent deny all +request_header_access WWW-Authenticate deny all +request_header_access Link deny all + +# Recommended performance and security settings +cache_dir ufs /var/spool/squid 100 16 256 +coredump_dir /var/spool/squid +refresh_pattern . 0 20% 1440 +EOF + +# --- Enable IP Forwarding --- +# This is necessary for the dual-NIC VM to route traffic from the internal +# network to the external one. +echo "net.ipv4.ip_forward = 1" >> /etc/sysctl.conf +sysctl -p + +# --- Restart Service --- +# Restart squid to apply the new configuration. +systemctl restart squid + +echo "Squid proxy installation and configuration complete."