From f9f131ab383d439525728005e01180f21e46fc4a Mon Sep 17 00:00:00 2001 From: schongloo Date: Thu, 25 Sep 2025 17:12:53 -0700 Subject: [PATCH 01/11] Debugging the Blue/Green e2e tests --- e2e-tests/test_bluegreen_laststate.sh | 16 ++++++++++++++++ e2e-tests/test_bluegreen_stateless.sh | 15 +++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/e2e-tests/test_bluegreen_laststate.sh b/e2e-tests/test_bluegreen_laststate.sh index 95a91896d2..27dfc05eeb 100755 --- a/e2e-tests/test_bluegreen_laststate.sh +++ b/e2e-tests/test_bluegreen_laststate.sh @@ -61,6 +61,22 @@ jm_pod_name=$(get_jm_pod_name $BLUE_CLUSTER_ID) echo "Inspecting savepoint directory..." kubectl exec -it $jm_pod_name -- bash -c "ls -lt /opt/flink/volume/flink-sp/" + +sleep 5 +jm_pod_name=$(kubectl get pods --selector="app=${GREEN_CLUSTER_ID},component=jobmanager" -o jsonpath='{..metadata.name}') +sleep 10 +tm_pod_name=$(kubectl get pods --selector="app=${GREEN_CLUSTER_ID},component=taskmanager" -o jsonpath='{..metadata.name}') +echo "JM:" $jm_pod_name +echo "TM:" $tm_pod_name +for i in $(seq 1 10); do + echo "===" + kubectl logs $jm_pod_name -c flink-main-container + echo "---" + kubectl logs $tm_pod_name -c flink-main-container + sleep 2 +done + + wait_for_status $GREEN_APPLICATION_IDENTIFIER '.status.lifecycleState' STABLE ${TIMEOUT} || exit 1 kubectl wait --for=delete deployment --timeout=${TIMEOUT}s --selector="app=${BLUE_CLUSTER_ID}" wait_for_status $APPLICATION_IDENTIFIER '.status.jobStatus.state' RUNNING ${TIMEOUT} || exit 1 diff --git a/e2e-tests/test_bluegreen_stateless.sh b/e2e-tests/test_bluegreen_stateless.sh index 5914a11088..5e131530ef 100755 --- a/e2e-tests/test_bluegreen_stateless.sh +++ b/e2e-tests/test_bluegreen_stateless.sh @@ -49,6 +49,21 @@ echo "PATCHING B/G deployment..." #kubectl patch flinkbgdep ${BG_CLUSTER_ID} --type merge --patch '{"spec":{"template":{"spec":{"flinkConfiguration":{"rest.port":"8082","taskmanager.numberOfTaskSlots":"2"}}}}}' kubectl patch flinkbgdep ${BG_CLUSTER_ID} --type merge --patch '{"spec":{"template":{"spec":{"flinkConfiguration":{"taskmanager.numberOfTaskSlots":"2"}}}}}' +sleep 5 +jm_pod_name=$(kubectl get pods --selector="app=${GREEN_CLUSTER_ID},component=jobmanager" -o jsonpath='{..metadata.name}') +sleep 10 +tm_pod_name=$(kubectl get pods --selector="app=${GREEN_CLUSTER_ID},component=taskmanager" -o jsonpath='{..metadata.name}') +echo "JM:" $jm_pod_name +echo "TM:" $tm_pod_name +for i in $(seq 1 10); do + echo "===" + kubectl logs $jm_pod_name -c flink-main-container + echo "---" + kubectl logs $tm_pod_name -c flink-main-container + sleep 2 +done + + wait_for_status $GREEN_APPLICATION_IDENTIFIER '.status.lifecycleState' STABLE ${TIMEOUT} || exit 1 kubectl wait --for=delete deployment --timeout=${TIMEOUT}s --selector="app=${BLUE_CLUSTER_ID}" wait_for_status $APPLICATION_IDENTIFIER '.status.jobStatus.state' RUNNING ${TIMEOUT} || exit 1 From ad79f7d7794c5771f46ad331d79a7d44efd9117f Mon Sep 17 00:00:00 2001 From: schongloo Date: Thu, 25 Sep 2025 17:28:28 -0700 Subject: [PATCH 02/11] Debugging the Blue/Green e2e tests --- .github/workflows/ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f0e04bc639..232075b900 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -219,11 +219,17 @@ jobs: - test_flink_operator_ha.sh - test_snapshot.sh - test_batch_job.sh + - test_bluegreen_laststate.sh + - test_bluegreen_stateless.sh exclude: - mode: standalone test: test_autoscaler.sh - flink-version: v1_19 test: test_snapshot.sh + - flink-version: v1_19 + test: test_bluegreen_laststate.sh + - flink-version: v1_19 + test: test_bluegreen_stateless.sh uses: ./.github/workflows/e2e.yaml with: java-version: 17 From c150315e461d6a6ec86ca6fa68a3b81d2e105648 Mon Sep 17 00:00:00 2001 From: schongloo Date: Thu, 25 Sep 2025 18:12:01 -0700 Subject: [PATCH 03/11] Debugging the Blue/Green e2e tests --- e2e-tests/test_bluegreen_laststate.sh | 22 +++++++++++++++------- e2e-tests/test_bluegreen_stateless.sh | 22 +++++++++++++++------- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/e2e-tests/test_bluegreen_laststate.sh b/e2e-tests/test_bluegreen_laststate.sh index 27dfc05eeb..2a6bbc236f 100755 --- a/e2e-tests/test_bluegreen_laststate.sh +++ b/e2e-tests/test_bluegreen_laststate.sh @@ -62,18 +62,26 @@ echo "Inspecting savepoint directory..." kubectl exec -it $jm_pod_name -- bash -c "ls -lt /opt/flink/volume/flink-sp/" -sleep 5 -jm_pod_name=$(kubectl get pods --selector="app=${GREEN_CLUSTER_ID},component=jobmanager" -o jsonpath='{..metadata.name}') -sleep 10 -tm_pod_name=$(kubectl get pods --selector="app=${GREEN_CLUSTER_ID},component=taskmanager" -o jsonpath='{..metadata.name}') -echo "JM:" $jm_pod_name -echo "TM:" $tm_pod_name +jm_pod_name="" +tm_pod_name="" for i in $(seq 1 10); do echo "===" + echo "LISTING PODS:" + kubectl get pods + + if [ "$jm_pod_name" = "" ]; then + jm_pod_name=$(kubectl get pods --selector="app=${GREEN_CLUSTER_ID},component=jobmanager" -o jsonpath='{..metadata.name}') + echo "Set JM pod name:" $jm_pod_name + fi kubectl logs $jm_pod_name -c flink-main-container + echo "---" + if [ "$tm_pod_name" = "" ]; then + tm_pod_name=$(kubectl get pods --selector="app=${GREEN_CLUSTER_ID},component=taskmanager" -o jsonpath='{..metadata.name}') + echo "Set TM pod name:" $tm_pod_name + fi kubectl logs $tm_pod_name -c flink-main-container - sleep 2 + sleep 5 done diff --git a/e2e-tests/test_bluegreen_stateless.sh b/e2e-tests/test_bluegreen_stateless.sh index 5e131530ef..827377ae6a 100755 --- a/e2e-tests/test_bluegreen_stateless.sh +++ b/e2e-tests/test_bluegreen_stateless.sh @@ -49,18 +49,26 @@ echo "PATCHING B/G deployment..." #kubectl patch flinkbgdep ${BG_CLUSTER_ID} --type merge --patch '{"spec":{"template":{"spec":{"flinkConfiguration":{"rest.port":"8082","taskmanager.numberOfTaskSlots":"2"}}}}}' kubectl patch flinkbgdep ${BG_CLUSTER_ID} --type merge --patch '{"spec":{"template":{"spec":{"flinkConfiguration":{"taskmanager.numberOfTaskSlots":"2"}}}}}' -sleep 5 -jm_pod_name=$(kubectl get pods --selector="app=${GREEN_CLUSTER_ID},component=jobmanager" -o jsonpath='{..metadata.name}') -sleep 10 -tm_pod_name=$(kubectl get pods --selector="app=${GREEN_CLUSTER_ID},component=taskmanager" -o jsonpath='{..metadata.name}') -echo "JM:" $jm_pod_name -echo "TM:" $tm_pod_name +jm_pod_name="" +tm_pod_name="" for i in $(seq 1 10); do echo "===" + echo "LISTING PODS:" + kubectl get pods + + if [ "$jm_pod_name" = "" ]; then + jm_pod_name=$(kubectl get pods --selector="app=${GREEN_CLUSTER_ID},component=jobmanager" -o jsonpath='{..metadata.name}') + echo "Set JM pod name:" $jm_pod_name + fi kubectl logs $jm_pod_name -c flink-main-container + echo "---" + if [ "$tm_pod_name" = "" ]; then + tm_pod_name=$(kubectl get pods --selector="app=${GREEN_CLUSTER_ID},component=taskmanager" -o jsonpath='{..metadata.name}') + echo "Set TM pod name:" $tm_pod_name + fi kubectl logs $tm_pod_name -c flink-main-container - sleep 2 + sleep 5 done From 7d1bfb5c7a41c9dbd6aa422cc84a43d2a6780933 Mon Sep 17 00:00:00 2001 From: schongloo Date: Thu, 25 Sep 2025 23:27:22 -0700 Subject: [PATCH 04/11] Debugging the Blue/Green e2e tests --- e2e-tests/test_bluegreen_laststate.sh | 5 ++--- e2e-tests/test_bluegreen_stateless.sh | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/e2e-tests/test_bluegreen_laststate.sh b/e2e-tests/test_bluegreen_laststate.sh index 2a6bbc236f..bd09c3f12d 100755 --- a/e2e-tests/test_bluegreen_laststate.sh +++ b/e2e-tests/test_bluegreen_laststate.sh @@ -62,9 +62,8 @@ echo "Inspecting savepoint directory..." kubectl exec -it $jm_pod_name -- bash -c "ls -lt /opt/flink/volume/flink-sp/" -jm_pod_name="" tm_pod_name="" -for i in $(seq 1 10); do +for i in $(seq 1 4); do echo "===" echo "LISTING PODS:" kubectl get pods @@ -81,7 +80,7 @@ for i in $(seq 1 10); do echo "Set TM pod name:" $tm_pod_name fi kubectl logs $tm_pod_name -c flink-main-container - sleep 5 + sleep 15 done diff --git a/e2e-tests/test_bluegreen_stateless.sh b/e2e-tests/test_bluegreen_stateless.sh index 827377ae6a..784c12fd52 100755 --- a/e2e-tests/test_bluegreen_stateless.sh +++ b/e2e-tests/test_bluegreen_stateless.sh @@ -51,7 +51,7 @@ kubectl patch flinkbgdep ${BG_CLUSTER_ID} --type merge --patch '{"spec":{"templa jm_pod_name="" tm_pod_name="" -for i in $(seq 1 10); do +for i in $(seq 1 4); do echo "===" echo "LISTING PODS:" kubectl get pods @@ -68,7 +68,7 @@ for i in $(seq 1 10); do echo "Set TM pod name:" $tm_pod_name fi kubectl logs $tm_pod_name -c flink-main-container - sleep 5 + sleep 15 done From d81ac6cc6d1e55d8159b414d14a83deb2df3ba76 Mon Sep 17 00:00:00 2001 From: schongloo Date: Fri, 26 Sep 2025 11:56:46 -0700 Subject: [PATCH 05/11] Debugging the Blue/Green e2e tests --- e2e-tests/data/bluegreen-laststate.yaml | 4 ++-- e2e-tests/data/bluegreen-stateless.yaml | 4 ++-- e2e-tests/test_bluegreen_laststate.sh | 4 ++-- e2e-tests/test_bluegreen_stateless.sh | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/e2e-tests/data/bluegreen-laststate.yaml b/e2e-tests/data/bluegreen-laststate.yaml index 2344a520e2..c8e77f415b 100644 --- a/e2e-tests/data/bluegreen-laststate.yaml +++ b/e2e-tests/data/bluegreen-laststate.yaml @@ -28,7 +28,7 @@ spec: image: flink:1.20 flinkVersion: v1_20 flinkConfiguration: - rest.port: "8081" +# rest.port: "8081" execution.checkpointing.interval: "10s" execution.checkpointing.storage: "filesystem" state.backend.incremental: "true" @@ -59,7 +59,7 @@ spec: claimName: flink-bg-laststate taskManager: resource: - memory: 2G + memory: 1G cpu: 1 job: jarURI: local:///opt/flink/examples/streaming/StateMachineExample.jar diff --git a/e2e-tests/data/bluegreen-stateless.yaml b/e2e-tests/data/bluegreen-stateless.yaml index 969dc67c39..e9522758a9 100644 --- a/e2e-tests/data/bluegreen-stateless.yaml +++ b/e2e-tests/data/bluegreen-stateless.yaml @@ -28,7 +28,7 @@ spec: image: flink:1.20 flinkVersion: v1_20 flinkConfiguration: - rest.port: "8081" +# rest.port: "8081" taskmanager.numberOfTaskSlots: "1" serviceAccount: flink jobManager: @@ -37,7 +37,7 @@ spec: cpu: 1 taskManager: resource: - memory: 2G + memory: 1G cpu: 1 job: jarURI: local:///opt/flink/examples/streaming/StateMachineExample.jar diff --git a/e2e-tests/test_bluegreen_laststate.sh b/e2e-tests/test_bluegreen_laststate.sh index bd09c3f12d..851bad9139 100755 --- a/e2e-tests/test_bluegreen_laststate.sh +++ b/e2e-tests/test_bluegreen_laststate.sh @@ -64,7 +64,7 @@ kubectl exec -it $jm_pod_name -- bash -c "ls -lt /opt/flink/volume/flink-sp/" tm_pod_name="" for i in $(seq 1 4); do - echo "===" + echo "=====" echo "LISTING PODS:" kubectl get pods @@ -74,7 +74,7 @@ for i in $(seq 1 4); do fi kubectl logs $jm_pod_name -c flink-main-container - echo "---" + echo "--==--" if [ "$tm_pod_name" = "" ]; then tm_pod_name=$(kubectl get pods --selector="app=${GREEN_CLUSTER_ID},component=taskmanager" -o jsonpath='{..metadata.name}') echo "Set TM pod name:" $tm_pod_name diff --git a/e2e-tests/test_bluegreen_stateless.sh b/e2e-tests/test_bluegreen_stateless.sh index 784c12fd52..3f31088f28 100755 --- a/e2e-tests/test_bluegreen_stateless.sh +++ b/e2e-tests/test_bluegreen_stateless.sh @@ -52,7 +52,7 @@ kubectl patch flinkbgdep ${BG_CLUSTER_ID} --type merge --patch '{"spec":{"templa jm_pod_name="" tm_pod_name="" for i in $(seq 1 4); do - echo "===" + echo "=====" echo "LISTING PODS:" kubectl get pods @@ -62,7 +62,7 @@ for i in $(seq 1 4); do fi kubectl logs $jm_pod_name -c flink-main-container - echo "---" + echo "--==--" if [ "$tm_pod_name" = "" ]; then tm_pod_name=$(kubectl get pods --selector="app=${GREEN_CLUSTER_ID},component=taskmanager" -o jsonpath='{..metadata.name}') echo "Set TM pod name:" $tm_pod_name From bfe1b817ff74cb974aaa15aa44fd9e9e471acee6 Mon Sep 17 00:00:00 2001 From: schongloo Date: Fri, 26 Sep 2025 14:33:55 -0700 Subject: [PATCH 06/11] Debugging the Blue/Green e2e tests --- e2e-tests/test_bluegreen_laststate.sh | 9 +++++++-- e2e-tests/test_bluegreen_stateless.sh | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/e2e-tests/test_bluegreen_laststate.sh b/e2e-tests/test_bluegreen_laststate.sh index 851bad9139..8ca3db9fe3 100755 --- a/e2e-tests/test_bluegreen_laststate.sh +++ b/e2e-tests/test_bluegreen_laststate.sh @@ -52,8 +52,8 @@ wait_for_status $APPLICATION_IDENTIFIER '.status.blueGreenState' ACTIVE_BLUE ${T #blue_job_id=$(kubectl get -oyaml flinkdep/basic-bluegreen-example-blue | yq '.status.jobStatus.jobId') -#kubectl patch flinkbgdep ${BG_CLUSTER_ID} --type merge --patch '{"spec":{"template":{"spec":{"flinkConfiguration":{"rest.port":"8082","state.checkpoints.num-retained":"6"}}}}}' -kubectl patch flinkbgdep ${BG_CLUSTER_ID} --type merge --patch '{"spec":{"template":{"spec":{"flinkConfiguration":{"state.checkpoints.num-retained":"6"}}}}}' +kubectl patch flinkbgdep ${BG_CLUSTER_ID} --type merge --patch '{"spec":{"template":{"spec":{"flinkConfiguration":{"rest.port":"8082","state.checkpoints.num-retained":"6"}}}}}' +#kubectl patch flinkbgdep ${BG_CLUSTER_ID} --type merge --patch '{"spec":{"template":{"spec":{"flinkConfiguration":{"state.checkpoints.num-retained":"6"}}}}}' echo "Resource patched, giving a chance for the savepoint to be taken..." sleep 10 @@ -72,6 +72,7 @@ for i in $(seq 1 4); do jm_pod_name=$(kubectl get pods --selector="app=${GREEN_CLUSTER_ID},component=jobmanager" -o jsonpath='{..metadata.name}') echo "Set JM pod name:" $jm_pod_name fi + echo "GETTING JM LOGS:" kubectl logs $jm_pod_name -c flink-main-container echo "--==--" @@ -79,7 +80,11 @@ for i in $(seq 1 4); do tm_pod_name=$(kubectl get pods --selector="app=${GREEN_CLUSTER_ID},component=taskmanager" -o jsonpath='{..metadata.name}') echo "Set TM pod name:" $tm_pod_name fi + echo "GETTING TM LOGS:" kubectl logs $tm_pod_name -c flink-main-container + + echo "--=EVs=--" + kubectl get events --field-selector involvedObject.kind=FlinkDeployment,involvedObject.name=$GREEN_CLUSTER_ID --sort-by=.metadata.creationTimestamp sleep 15 done diff --git a/e2e-tests/test_bluegreen_stateless.sh b/e2e-tests/test_bluegreen_stateless.sh index 3f31088f28..5aadcf524c 100755 --- a/e2e-tests/test_bluegreen_stateless.sh +++ b/e2e-tests/test_bluegreen_stateless.sh @@ -46,8 +46,8 @@ wait_for_status $APPLICATION_IDENTIFIER '.status.jobStatus.state' RUNNING ${TIME wait_for_status $APPLICATION_IDENTIFIER '.status.blueGreenState' ACTIVE_BLUE ${TIMEOUT} || exit 1 echo "PATCHING B/G deployment..." -#kubectl patch flinkbgdep ${BG_CLUSTER_ID} --type merge --patch '{"spec":{"template":{"spec":{"flinkConfiguration":{"rest.port":"8082","taskmanager.numberOfTaskSlots":"2"}}}}}' -kubectl patch flinkbgdep ${BG_CLUSTER_ID} --type merge --patch '{"spec":{"template":{"spec":{"flinkConfiguration":{"taskmanager.numberOfTaskSlots":"2"}}}}}' +kubectl patch flinkbgdep ${BG_CLUSTER_ID} --type merge --patch '{"spec":{"template":{"spec":{"flinkConfiguration":{"rest.port":"8082","taskmanager.numberOfTaskSlots":"2"}}}}}' +#kubectl patch flinkbgdep ${BG_CLUSTER_ID} --type merge --patch '{"spec":{"template":{"spec":{"flinkConfiguration":{"taskmanager.numberOfTaskSlots":"2"}}}}}' jm_pod_name="" tm_pod_name="" @@ -60,6 +60,7 @@ for i in $(seq 1 4); do jm_pod_name=$(kubectl get pods --selector="app=${GREEN_CLUSTER_ID},component=jobmanager" -o jsonpath='{..metadata.name}') echo "Set JM pod name:" $jm_pod_name fi + echo "GETTING JM LOGS:" kubectl logs $jm_pod_name -c flink-main-container echo "--==--" @@ -67,7 +68,11 @@ for i in $(seq 1 4); do tm_pod_name=$(kubectl get pods --selector="app=${GREEN_CLUSTER_ID},component=taskmanager" -o jsonpath='{..metadata.name}') echo "Set TM pod name:" $tm_pod_name fi + echo "GETTING TM LOGS:" kubectl logs $tm_pod_name -c flink-main-container + + echo "--=EVs=--" + kubectl get events --field-selector involvedObject.kind=FlinkDeployment,involvedObject.name=$GREEN_CLUSTER_ID --sort-by=.metadata.creationTimestamp sleep 15 done From be0ca8efb4ee5ad5f0065b88d2cb7df135a50239 Mon Sep 17 00:00:00 2001 From: schongloo Date: Fri, 26 Sep 2025 16:03:27 -0700 Subject: [PATCH 07/11] Debugging the Blue/Green e2e tests --- e2e-tests/test_bluegreen_laststate.sh | 3 ++- e2e-tests/test_bluegreen_stateless.sh | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/e2e-tests/test_bluegreen_laststate.sh b/e2e-tests/test_bluegreen_laststate.sh index 8ca3db9fe3..5e318da0c0 100755 --- a/e2e-tests/test_bluegreen_laststate.sh +++ b/e2e-tests/test_bluegreen_laststate.sh @@ -84,7 +84,8 @@ for i in $(seq 1 4); do kubectl logs $tm_pod_name -c flink-main-container echo "--=EVs=--" - kubectl get events --field-selector involvedObject.kind=FlinkDeployment,involvedObject.name=$GREEN_CLUSTER_ID --sort-by=.metadata.creationTimestamp + kubectl describe flinkdep $GREEN_CLUSTER_ID +# kubectl get events --field-selector involvedObject.kind=FlinkDeployment,involvedObject.name=$GREEN_CLUSTER_ID --sort-by=.metadata.creationTimestamp sleep 15 done diff --git a/e2e-tests/test_bluegreen_stateless.sh b/e2e-tests/test_bluegreen_stateless.sh index 5aadcf524c..9064620f11 100755 --- a/e2e-tests/test_bluegreen_stateless.sh +++ b/e2e-tests/test_bluegreen_stateless.sh @@ -69,10 +69,11 @@ for i in $(seq 1 4); do echo "Set TM pod name:" $tm_pod_name fi echo "GETTING TM LOGS:" - kubectl logs $tm_pod_name -c flink-main-container + kubectl logs $tm_pod_name echo "--=EVs=--" - kubectl get events --field-selector involvedObject.kind=FlinkDeployment,involvedObject.name=$GREEN_CLUSTER_ID --sort-by=.metadata.creationTimestamp + kubectl describe flinkdep $GREEN_CLUSTER_ID +# kubectl get events --field-selector involvedObject.kind=FlinkDeployment,involvedObject.name=$GREEN_CLUSTER_ID --sort-by=.metadata.creationTimestamp sleep 15 done From 3296c6fc5e3ab67772ddbc08e96180362863316e Mon Sep 17 00:00:00 2001 From: schongloo Date: Mon, 29 Sep 2025 20:46:11 -0700 Subject: [PATCH 08/11] Debugging the Blue/Green e2e tests --- e2e-tests/test_bluegreen_laststate.sh | 7 +++++-- e2e-tests/test_bluegreen_stateless.sh | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/e2e-tests/test_bluegreen_laststate.sh b/e2e-tests/test_bluegreen_laststate.sh index 5e318da0c0..0384c6e5dc 100755 --- a/e2e-tests/test_bluegreen_laststate.sh +++ b/e2e-tests/test_bluegreen_laststate.sh @@ -63,7 +63,7 @@ kubectl exec -it $jm_pod_name -- bash -c "ls -lt /opt/flink/volume/flink-sp/" tm_pod_name="" -for i in $(seq 1 4); do +for i in $(seq 1 6); do echo "=====" echo "LISTING PODS:" kubectl get pods @@ -86,7 +86,10 @@ for i in $(seq 1 4); do echo "--=EVs=--" kubectl describe flinkdep $GREEN_CLUSTER_ID # kubectl get events --field-selector involvedObject.kind=FlinkDeployment,involvedObject.name=$GREEN_CLUSTER_ID --sort-by=.metadata.creationTimestamp - sleep 15 + sleep 30 + + status=$(kubectl get -oyaml $GREEN_APPLICATION_IDENTIFIER | yq '.status.lifecycleState') + echo "==> lifecycleState after 30 secs: " $status done diff --git a/e2e-tests/test_bluegreen_stateless.sh b/e2e-tests/test_bluegreen_stateless.sh index 9064620f11..cb877fcc58 100755 --- a/e2e-tests/test_bluegreen_stateless.sh +++ b/e2e-tests/test_bluegreen_stateless.sh @@ -51,7 +51,7 @@ kubectl patch flinkbgdep ${BG_CLUSTER_ID} --type merge --patch '{"spec":{"templa jm_pod_name="" tm_pod_name="" -for i in $(seq 1 4); do +for i in $(seq 1 6); do echo "=====" echo "LISTING PODS:" kubectl get pods @@ -74,7 +74,10 @@ for i in $(seq 1 4); do echo "--=EVs=--" kubectl describe flinkdep $GREEN_CLUSTER_ID # kubectl get events --field-selector involvedObject.kind=FlinkDeployment,involvedObject.name=$GREEN_CLUSTER_ID --sort-by=.metadata.creationTimestamp - sleep 15 + sleep 30 + + status=$(kubectl get -oyaml $GREEN_APPLICATION_IDENTIFIER | yq '.status.lifecycleState') + echo "==> lifecycleState after 30 secs: " $status done From 24438087a28687becb51cbe8d05bf88cbbe4e7ea Mon Sep 17 00:00:00 2001 From: schongloo Date: Tue, 30 Sep 2025 11:01:19 -0700 Subject: [PATCH 09/11] Debugging the Blue/Green e2e tests (temporarily excluding some other tests) --- .github/workflows/ci.yml | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 232075b900..281aa08866 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -209,23 +209,23 @@ jobs: - "native" - "standalone" test: - - test_application_kubernetes_ha.sh - - test_application_operations.sh - - test_dynamic_config.sh - - test_dynamic_flink_conf.sh - - test_sessionjob_kubernetes_ha.sh - - test_sessionjob_operations.sh - - test_autoscaler.sh - - test_flink_operator_ha.sh - - test_snapshot.sh - - test_batch_job.sh +# - test_application_kubernetes_ha.sh +# - test_application_operations.sh +# - test_dynamic_config.sh +# - test_dynamic_flink_conf.sh +# - test_sessionjob_kubernetes_ha.sh +# - test_sessionjob_operations.sh +# - test_autoscaler.sh +# - test_flink_operator_ha.sh +# - test_snapshot.sh +# - test_batch_job.sh - test_bluegreen_laststate.sh - test_bluegreen_stateless.sh exclude: - - mode: standalone - test: test_autoscaler.sh - - flink-version: v1_19 - test: test_snapshot.sh +# - mode: standalone +# test: test_autoscaler.sh +# - flink-version: v1_19 +# test: test_snapshot.sh - flink-version: v1_19 test: test_bluegreen_laststate.sh - flink-version: v1_19 From 35678bcfbe058a77a5cf36a353fe43090b600ed5 Mon Sep 17 00:00:00 2001 From: schongloo Date: Wed, 8 Oct 2025 00:42:25 -0700 Subject: [PATCH 10/11] Debugging the Blue/Green e2e tests --- e2e-tests/data/bluegreen-laststate.yaml | 14 ++++++++++---- e2e-tests/data/bluegreen-stateless.yaml | 11 +++++++++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/e2e-tests/data/bluegreen-laststate.yaml b/e2e-tests/data/bluegreen-laststate.yaml index c8e77f415b..1abce9117e 100644 --- a/e2e-tests/data/bluegreen-laststate.yaml +++ b/e2e-tests/data/bluegreen-laststate.yaml @@ -29,6 +29,12 @@ spec: flinkVersion: v1_20 flinkConfiguration: # rest.port: "8081" + jobmanager.memory.jvm-metaspace.size: 96m + jobmanager.memory.jvm-overhead.min: 32m + taskmanager.memory.managed.size: 78m + taskmanager.memory.jvm-metaspace.size: 64m + taskmanager.memory.jvm-overhead.min: 32m + taskmanager.memory.network.min: 32m execution.checkpointing.interval: "10s" execution.checkpointing.storage: "filesystem" state.backend.incremental: "true" @@ -39,7 +45,7 @@ spec: serviceAccount: flink jobManager: resource: - memory: 1G + memory: 512m cpu: 1 podTemplate: spec: @@ -47,9 +53,9 @@ spec: - name: flink-main-container resources: requests: - ephemeral-storage: 2048Mi + ephemeral-storage: 1024Mi limits: - ephemeral-storage: 2048Mi + ephemeral-storage: 1024Mi volumeMounts: - mountPath: /opt/flink/volume name: flink-volume @@ -59,7 +65,7 @@ spec: claimName: flink-bg-laststate taskManager: resource: - memory: 1G + memory: 512m cpu: 1 job: jarURI: local:///opt/flink/examples/streaming/StateMachineExample.jar diff --git a/e2e-tests/data/bluegreen-stateless.yaml b/e2e-tests/data/bluegreen-stateless.yaml index e9522758a9..a98e9d46d9 100644 --- a/e2e-tests/data/bluegreen-stateless.yaml +++ b/e2e-tests/data/bluegreen-stateless.yaml @@ -29,15 +29,22 @@ spec: flinkVersion: v1_20 flinkConfiguration: # rest.port: "8081" +# taskmanager.memory.process.size: 768m + jobmanager.memory.jvm-metaspace.size: 96m + jobmanager.memory.jvm-overhead.min: 32m + taskmanager.memory.managed.size: 78m + taskmanager.memory.jvm-metaspace.size: 64m + taskmanager.memory.jvm-overhead.min: 32m + taskmanager.memory.network.min: 32m taskmanager.numberOfTaskSlots: "1" serviceAccount: flink jobManager: resource: - memory: 1G + memory: 512m cpu: 1 taskManager: resource: - memory: 1G + memory: 512m cpu: 1 job: jarURI: local:///opt/flink/examples/streaming/StateMachineExample.jar From 17d833a3706abd232c23cf8ff4f65598e744525e Mon Sep 17 00:00:00 2001 From: schongloo Date: Thu, 23 Oct 2025 16:03:53 -0700 Subject: [PATCH 11/11] Debugging the Blue/Green e2e tests --- e2e-tests/data/bluegreen-laststate.yaml | 4 ++-- e2e-tests/data/bluegreen-stateless.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/e2e-tests/data/bluegreen-laststate.yaml b/e2e-tests/data/bluegreen-laststate.yaml index 1abce9117e..31c13e19e8 100644 --- a/e2e-tests/data/bluegreen-laststate.yaml +++ b/e2e-tests/data/bluegreen-laststate.yaml @@ -46,7 +46,7 @@ spec: jobManager: resource: memory: 512m - cpu: 1 + cpu: 0.5 podTemplate: spec: containers: @@ -66,7 +66,7 @@ spec: taskManager: resource: memory: 512m - cpu: 1 + cpu: 0.5 job: jarURI: local:///opt/flink/examples/streaming/StateMachineExample.jar parallelism: 1 diff --git a/e2e-tests/data/bluegreen-stateless.yaml b/e2e-tests/data/bluegreen-stateless.yaml index a98e9d46d9..a61397a3de 100644 --- a/e2e-tests/data/bluegreen-stateless.yaml +++ b/e2e-tests/data/bluegreen-stateless.yaml @@ -41,11 +41,11 @@ spec: jobManager: resource: memory: 512m - cpu: 1 + cpu: 0.5 taskManager: resource: memory: 512m - cpu: 1 + cpu: 0.5 job: jarURI: local:///opt/flink/examples/streaming/StateMachineExample.jar parallelism: 1