From ed1e6720637a484b70a541eb4158a643415b9334 Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Tue, 9 Dec 2025 09:11:41 +0000 Subject: [PATCH 01/26] minor improvements all around --- DEVELOPMENT.md | 11 ++++++++++- deps.edn | 12 +++++++++++- dev/intemporal/demo_parallelism.clj | 2 +- dev/intemporal/vthread-recovery.edn | 6 +++++- src/intemporal/macros.cljc | 10 +++++----- src/intemporal/store.cljc | 1 + src/intemporal/workflow.cljc | 2 +- src/intemporal/workflow/internal.cljc | 4 +++- 8 files changed, 37 insertions(+), 11 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index ff6e154..e3d9fb6 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -70,4 +70,13 @@ $ JAVA_OPTS="-DFDB_LIBRARY_PATH_FDB_C=/usr/local/lib/libfdb_c.dylib -DFDB_LIBRAR (.invoke method com.apple.foundationdb.JNIUtil (object-array ["fdb_java"])) (.invoke method com.apple.foundationdb.JNIUtil (object-array ["fdb_c"]))) -``` \ No newline at end of file +``` + +# Telemetry + +# Get the OT javaagent + +```shell +wget --content-disposition https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/download/v2.21.0/opentelemetry-javaagent.jar +``` +Run with the `dev` profile to activate the java agent. \ No newline at end of file diff --git a/deps.edn b/deps.edn index 471372a..299a077 100644 --- a/deps.edn +++ b/deps.edn @@ -14,7 +14,17 @@ missionary/missionary {:mvn/version "b.46"}} :aliases {:dev {:extra-paths ["dev" "test"] - :jvm-opts ["-Djdk.attach.allowAttachSelf"] + :jvm-opts ["-Djdk.attach.allowAttachSelf" + "-javaagent:./opentelemetry-javaagent.jar" + "-Dotel.instrumentation.common.default-enabled=true" + "-Dotel.javaagent.debug=false" + "-Dotel.exporter.otlp.protocol=grpc" + "-Dotel.exporter.otlp.endpoint=http://localhost:4317" + "-Dotel.instrumentation.netty.enabled=false" + "-Dotel.metrics.exporter=none" + "-Dotel.javaagent.debug=false" + "-Dotel.resource.attributes=service.name=intemporal"] + :extra-deps {exoscale/automata {:mvn/version "0.1.10"} lambdaisland/kaocha {:mvn/version "1.91.1392"} lambdaisland/kaocha-cloverage {:mvn/version "1.1.89"} diff --git a/dev/intemporal/demo_parallelism.clj b/dev/intemporal/demo_parallelism.clj index 9aa30f4..048a1f0 100644 --- a/dev/intemporal/demo_parallelism.clj +++ b/dev/intemporal/demo_parallelism.clj @@ -26,7 +26,7 @@ ;; this code is deterministic up to here @(p/all proms))) -(def mstore (store/make-store)) +(def mstore (store/make-store )) (def stop-worker (w/start-worker! mstore {:protocols {`ThreadActivity (->ThreadActivityImpl)}})) ;; note that in cljs, this returns a promise diff --git a/dev/intemporal/vthread-recovery.edn b/dev/intemporal/vthread-recovery.edn index 4b41e2d..ebe304d 100644 --- a/dev/intemporal/vthread-recovery.edn +++ b/dev/intemporal/vthread-recovery.edn @@ -7,7 +7,9 @@ :result nil, :id "silly-mcclintock", :order 1, - :lease-end nil}}, + :lease-end nil + :runtime {:timeout-ms 900000, + :telemetry-context {"traceparent" "00-0f17ae74e434659106c06f591ee56a6d-85f973ca22cd0d92-01"}}}}, :history {"silly-mcclintock" [{:ref "silly-mcclintock", :root "silly-mcclintock", :type :intemporal.workflow/invoke, @@ -16,6 +18,8 @@ :error nil, :result nil, :id 17} + cccccbrfjtefettfeikinrkevrueftfcuttfningduuc + #_ {:ref "silly-mcclintock", :root "silly-mcclintock", diff --git a/src/intemporal/macros.cljc b/src/intemporal/macros.cljc index e5cd45f..93865a0 100644 --- a/src/intemporal/macros.cljc +++ b/src/intemporal/macros.cljc @@ -210,14 +210,14 @@ ;(w/enqueue-and-wait i/*env* task#))))))))) (defmacro with-failure - "Runs `fcall`, ensuring that if it fails, compensation will always run. - - if `fcall` fails, `binding` will have the value `intemporal.activity/failure`. - - if `fcall` succeeds, but compensation is invoked later (eg other activity failure), `binding` will have its return value + "Runs `body`, ensuring that if it fails, compensation will always run. + - if `body` fails, `binding` will have the value `intemporal.activity/failure`. + - if `body` succeeds, but compensation is invoked later (eg other activity failure), `binding` will have its return value (with-failure [v (book-hotel stub \"hotel\")] (cancel-hotel stub v n)) " - [[binding fcall] comp-fn] + [[binding body] comp-fn] `(let [val# (atom :intemporal.activity/failure)] (w/add-compensation (fn [] (let [~binding @val#] (do ~comp-fn)))) - (reset! val# (do ~fcall)))) \ No newline at end of file + (reset! val# (do ~body)))) \ No newline at end of file diff --git a/src/intemporal/store.cljc b/src/intemporal/store.cljc index 46bff76..4968761 100644 --- a/src/intemporal/store.cljc +++ b/src/intemporal/store.cljc @@ -288,6 +288,7 @@ (try ;; ensure we only run f once - swap! might run the fn multiple times (assoc task :state :new :owner owner) + ;; TODO log reenqueued task (finally (when-not (contains? @task->run? task) (try diff --git a/src/intemporal/workflow.cljc b/src/intemporal/workflow.cljc index 65d1f5c..a9b0b99 100644 --- a/src/intemporal/workflow.cljc +++ b/src/intemporal/workflow.cljc @@ -105,7 +105,7 @@ ;; subsequent workflow traces to have a "parent" span, otherwise ;; they won't show up correctly in jaeger ;; TODO test with eg loki - (trace-async! {:name ::worker-execute-fn :attributes {:task-id (:id task)}} + (trace-async! {:name "worker: worker-execute-fn" :attributes {:task-id (:id task)}} #?(:cljs (internal/resume-task internal-env store protocols task) :clj (otctx/bind-context! (otctx/headers->merged-context (:telemetry-context runtime)) (internal/resume-task internal-env store protocols task))))))) diff --git a/src/intemporal/workflow/internal.cljc b/src/intemporal/workflow/internal.cljc index eddbb8f..1cc7fa2 100644 --- a/src/intemporal/workflow/internal.cljc +++ b/src/intemporal/workflow/internal.cljc @@ -328,7 +328,9 @@ (assert (some? store) "Store should exist") (assert (some? task) "Task should exist") - ;; TODO trace + ;; TODO trace if we pick the task from the db? + ;; the db task should have a telemetry context already no? + ;; (trace! {:name (format "workflow: %s" orig#) :attributes {:task-id id#}} (let [db-task (or (find-task store id) (enqueue-task store task)) From f928d2bb3998ca8ff51d459efacb6c304e67bcb9 Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Tue, 9 Dec 2025 22:02:12 +0000 Subject: [PATCH 02/26] use poller, fix threading --- dev/intemporal/demo_parallelism.clj | 9 ++++----- docker/fdb.cluster | 2 +- src/intemporal/workflow.cljc | 2 +- src/intemporal/workflow/internal.cljc | 21 ++++++++++++++------- test/intemporal/failures_test.cljc | 8 ++++---- test/intemporal/internal_failures_test.cljc | 8 ++++---- test/intemporal/recovery_failure_test.clj | 6 +++--- test/intemporal/shutdown_restart_test.cljs | 8 ++++---- test/intemporal/vthread_recovery_test.clj | 6 +++--- test/intemporal/vthread_test.cljc | 10 +++++----- test/intemporal/workflow_test.cljc | 10 +++++----- 11 files changed, 48 insertions(+), 42 deletions(-) diff --git a/dev/intemporal/demo_parallelism.clj b/dev/intemporal/demo_parallelism.clj index 048a1f0..296c627 100644 --- a/dev/intemporal/demo_parallelism.clj +++ b/dev/intemporal/demo_parallelism.clj @@ -1,9 +1,8 @@ (ns intemporal.demo-parallelism (:require [intemporal.store :as store] [intemporal.workflow :as w] - [promesa.core :as p]) - (:require [intemporal.macros :refer [stub-protocol defn-workflow vthread]] - [intemporal.workflow])) + [intemporal.macros :refer [stub-protocol defn-workflow vthread]] + [promesa.core :as p])) ;;;; ;; demo @@ -26,8 +25,8 @@ ;; this code is deterministic up to here @(p/all proms))) -(def mstore (store/make-store )) -(def stop-worker (w/start-worker! mstore {:protocols {`ThreadActivity (->ThreadActivityImpl)}})) +(def mstore (store/make-store)) +(def ex (w/start-worker! mstore {:protocols {`ThreadActivity (->ThreadActivityImpl)}})) ;; note that in cljs, this returns a promise (def res (w/with-env {:store mstore} diff --git a/docker/fdb.cluster b/docker/fdb.cluster index fc3b14b..1ea4469 100644 --- a/docker/fdb.cluster +++ b/docker/fdb.cluster @@ -1 +1 @@ -docker:docker@192.168.107.2:4500 +docker:docker@172.20.0.2:4500 diff --git a/src/intemporal/workflow.cljc b/src/intemporal/workflow.cljc index a9b0b99..ab06a62 100644 --- a/src/intemporal/workflow.cljc +++ b/src/intemporal/workflow.cljc @@ -192,4 +192,4 @@ (trace! {:name "compensations" :attributes {:fn-count (count @thunks)}} (doseq [f @thunks] (swap! thunks pop) - (f))))) + (f))))) \ No newline at end of file diff --git a/src/intemporal/workflow/internal.cljc b/src/intemporal/workflow/internal.cljc index 1cc7fa2..dcb85d9 100644 --- a/src/intemporal/workflow/internal.cljc +++ b/src/intemporal/workflow/internal.cljc @@ -11,7 +11,8 @@ #?(:cljs (:require-macros [net.cgrand.macrovich :as macros] [intemporal.workflow.internal :refer [trace! trace-async!]] - [intemporal.store :refer [bfn]]))) + [intemporal.store :refer [bfn]])) + #?(:clj (:import [java.util.function BiConsumer]))) #?(:clj (set! *warn-on-reflection* true)) @@ -80,11 +81,17 @@ (macros/case ;; cljs: no telemetry :cljs `(do ~@body) - :clj `(let [attrs# (do ~attrs)] - (otspan/async-bound-cf-span attrs# - (with-env-internal (merge *env* {:telemetry-context (->telemetry-context)}) - (let [res# (do ~@body)] - res#)))))) + :clj `(let [attrs# (do ~attrs) + span# (otspan/new-span! attrs#)] + ;(otspan/async-bound-cf-span attrs#) + ;(with-env-internal (merge *env* {:telemetry-context (->telemetry-context)})) + (let [res# (do ~@body)] + (.whenComplete res# + (reify BiConsumer + (accept [_# t# e#] + (when e# (otspan/add-exception! {:context span#} e#)) + (otspan/end-span! {:context span#})))))))) + (defn add-event! ([task ename attrs] @@ -157,7 +164,6 @@ ;;;; ;; task execution/replay - (defn resume-fn-task "Resumes a generic fn call task" [{:keys [vthread? shutdown?] :as env} store protos {:keys [type proto id root sym fvar args] :as task} [invoke success failure]] @@ -292,6 +298,7 @@ (throw (error/internal-error "Transition unexpected" {:got (:type res?) :expected [success failure]})))] (t/log! {:level :debug :data {:sym sym :retval retval}} ["Finished internal execution for task" id]) + ;; if userland called a vthread, retval will be delayed retval)) ;; ensure we terminate the fn call, even if the next event wouldnt be the expected type (catch #?(:clj Exception :cljs js/Error) e diff --git a/test/intemporal/failures_test.cljc b/test/intemporal/failures_test.cljc index 954574b..b0f0e69 100644 --- a/test/intemporal/failures_test.cljc +++ b/test/intemporal/failures_test.cljc @@ -39,9 +39,9 @@ (deftest activity-failure-test (testing "failure: activity throws" - (let [mstore (store/make-store) - stop-worker (w/start-worker! mstore {:protocols {`MyActivities (->MyActivitiesImpl)}})] + (let [mstore (store/make-store) + ex (w/start-poller! mstore {:protocols {`MyActivities (->MyActivitiesImpl)}})] (with-result [res (w/with-env {:store mstore} - (my-workflow :nok))] + (my-workflow :nok))] (is (instance? #?(:clj Exception :cljs js/Error) res)) - (stop-worker))))) + (w/shutdown ex 1000))))) diff --git a/test/intemporal/internal_failures_test.cljc b/test/intemporal/internal_failures_test.cljc index b442f1f..ecf6c98 100644 --- a/test/intemporal/internal_failures_test.cljc +++ b/test/intemporal/internal_failures_test.cljc @@ -35,13 +35,13 @@ (deftest store-failure-test (testing "failure: task validation fails" - (let [mstore (store/make-store {:failures {:validation 1.0}}) - stop-worker (w/start-worker! mstore {:protocols {`MyActivities (->MyActivitiesImpl)}})] + (let [mstore (store/make-store {:failures {:validation 1.0}}) + ex (w/start-poller! mstore {:protocols {`MyActivities (->MyActivitiesImpl)}})] (with-result [res (w/with-env {:store mstore} - (my-workflow :ok))] + (my-workflow :ok))] (is (instance? #?(:clj Exception :cljs js/Error) res)) (is (= {:intemporal.workflow.internal/type :internal} (ex-data (or (ex-cause res) res)))) - (stop-worker))))) + (w/shutdown ex 1000))))) ;(cljs.test/run-tests *ns*) \ No newline at end of file diff --git a/test/intemporal/recovery_failure_test.clj b/test/intemporal/recovery_failure_test.clj index e9c653c..f882ebf 100644 --- a/test/intemporal/recovery_failure_test.clj +++ b/test/intemporal/recovery_failure_test.clj @@ -41,9 +41,9 @@ (io/copy (io/file "./test/intemporal/recovery_failure.edn") (io/file "/tmp/recovery_failure.edn")) (testing "workflow" - (let [mstore (store/make-store {:file "/tmp/recovery_failure.edn"}) + (let [mstore (store/make-store {:file "/tmp/recovery_failure.edn"}) [task] (store/list-tasks mstore) - stop-worker (w/start-worker! mstore {:protocols {`MyActivities (->MyActivitiesImpl)}})] + ex (w/start-poller! mstore {:protocols {`MyActivities (->MyActivitiesImpl)}})] (try (store/reenqueue-pending-tasks mstore println) @@ -59,7 +59,7 @@ (is (= :intemporal.workflow/failure (:type last-ev))))) (finally (tu/print-tables mstore) - (stop-worker)))))) + (w/shutdown ex 1000)))))) #_:clj-kondo/ignore (comment diff --git a/test/intemporal/shutdown_restart_test.cljs b/test/intemporal/shutdown_restart_test.cljs index 3c5462a..f671c3c 100644 --- a/test/intemporal/shutdown_restart_test.cljs +++ b/test/intemporal/shutdown_restart_test.cljs @@ -31,15 +31,15 @@ ;;;; test proper (def mstore (store/make-store {})) -(def stop-worker (w/start-worker! mstore {:protocols {`MyActivities (->MyActivitiesImpl)} - :polling-ms 10})) +(def ex (w/start-poller! mstore {:protocols {`MyActivities (->MyActivitiesImpl)} + :polling-ms 10})) (deftest executor-shutdown-test (testing "shutdown of ongoing workflow" (with-result [res (w/with-env {:store mstore} - (my-workflow :ok))] - (stop-worker) + (my-workflow :ok))] + (w/shutdown ex 1000) (is (instance? js/Error res)) (is (error/panic? res)) diff --git a/test/intemporal/vthread_recovery_test.clj b/test/intemporal/vthread_recovery_test.clj index 3c1e7ad..016d352 100644 --- a/test/intemporal/vthread_recovery_test.clj +++ b/test/intemporal/vthread_recovery_test.clj @@ -36,8 +36,8 @@ ;; make a backup of the db to allow replay (io/copy (io/file "./test/intemporal/vthread-recovery.edn") (io/file "/tmp/intemporal-vthread-recovery.edn")) - (let [mstore (store/make-store {:file "/tmp/intemporal-vthread-recovery.edn"}) - stop-worker (w/start-worker! mstore {:protocols {`ThreadActivity (->ThreadActivityImpl)}})] + (let [mstore (store/make-store {:file "/tmp/intemporal-vthread-recovery.edn"}) + ex (w/start-poller! mstore {:protocols {`ThreadActivity (->ThreadActivityImpl)}})] (store/reenqueue-pending-tasks mstore println) @@ -54,4 +54,4 @@ (is (= (into [] (range nthreads)) (-> evts last :result))))))) - (stop-worker))) + (w/shutdown ex 1000))) diff --git a/test/intemporal/vthread_test.cljc b/test/intemporal/vthread_test.cljc index d059f52..29125e3 100644 --- a/test/intemporal/vthread_test.cljc +++ b/test/intemporal/vthread_test.cljc @@ -32,10 +32,10 @@ (defn-workflow my-workflow [sleep-time] (let [pr (stub-protocol ThreadActivity {}) - proms (-> (for [i (range 10)] - (vthread - (sleep pr i sleep-time))) - (doall))] + proms (->> (for [i (range 10)] + (vthread + (sleep pr i sleep-time))) + (doall))] #?(:clj (Thread/sleep (long sleep-time))) (p/all proms))) @@ -50,7 +50,7 @@ ;; cljs runtimes return promises ;; clj runtime will run synchronously (with-result [v (w/with-env {:store mstore} - (my-workflow sleep-time))] + (my-workflow sleep-time))] (testing "result" (is (= (range 10) diff --git a/test/intemporal/workflow_test.cljc b/test/intemporal/workflow_test.cljc index 844c0fd..82e754f 100644 --- a/test/intemporal/workflow_test.cljc +++ b/test/intemporal/workflow_test.cljc @@ -27,7 +27,7 @@ :cljs (env-let [f (stub-function nested-fn)] - (f :sub)))) + (f :sub)))) (defprotocol MyActivities (foo [this a])) @@ -56,13 +56,13 @@ (deftest workflow-happy-path-test (testing "workflow" (let [mstore (store/make-store) - stop-worker (w/start-worker! mstore {:protocols {`MyActivities (->MyActivitiesImpl)}}) + ex (w/start-poller! mstore {:protocols {`MyActivities (->MyActivitiesImpl)}}) uuid-store (atom nil) workflow-id (str (random-uuid))] (with-result [v (w/with-env {:store mstore - :id workflow-id} - (my-workflow uuid-store))] + :id workflow-id} + (my-workflow uuid-store))] (testing "workflow result" (is (= [:root [:sub :nested] [:proto :pr]] @@ -112,7 +112,7 @@ (is (every? #(= @uuid-store %) (map :id tasks))) (is (= @uuid-store workflow-id))))) - (stop-worker))))) + (w/shutdown ex 1000))))) #_:clj-kondo/ignore (comment From 0d38aea7535af74ab10b4b8c6fea5b1c00ed5c12 Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Wed, 10 Dec 2025 16:00:42 +0000 Subject: [PATCH 03/26] bump fdb --- deps.edn | 2 +- docker-compose.yaml | 2 +- docker/fdb.cluster | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/deps.edn b/deps.edn index 299a077..62b9873 100644 --- a/deps.edn +++ b/deps.edn @@ -34,7 +34,7 @@ tortue/spy {:mvn/version "2.15.0"} nubank/matcher-combinators {:mvn/version "3.9.2"} com.clojure-goes-fast/clj-async-profiler {:mvn/version "1.6.2"}}} - :fdb {:extra-deps {org.foundationdb/fdb-java {:mvn/version "7.1.60"} + :fdb {:extra-deps {org.foundationdb/fdb-java {:mvn/version "7.3.57"} me.vedang/clj-fdb {:mvn/version "0.3.0"}}} :jdbc {:extra-deps {com.github.seancorfield/next.jdbc {:mvn/version "1.3.1048"} diff --git a/docker-compose.yaml b/docker-compose.yaml index e1a0611..1f385c0 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -26,7 +26,7 @@ services: - jaeger foundation: - image: "foundationdb/foundationdb:7.1.60" + image: "foundationdb/foundationdb:7.3.57" environment: FDB_NETWORKING_MODE: host entrypoint: ["/usr/bin/tini", "-g", "--", "sh", "/fdb-init.bash"] diff --git a/docker/fdb.cluster b/docker/fdb.cluster index 1ea4469..25af3c6 100644 --- a/docker/fdb.cluster +++ b/docker/fdb.cluster @@ -1 +1 @@ -docker:docker@172.20.0.2:4500 +docker:docker@172.21.0.3:4500 From 7772c2a6011e5046b6cc172bbfc94dae99e404bc Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Thu, 11 Dec 2025 20:54:21 +0000 Subject: [PATCH 04/26] fix warn --- src/intemporal/workflow/internal.cljc | 5 +++-- test/intemporal/shutdown_restart_test.clj | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/intemporal/workflow/internal.cljc b/src/intemporal/workflow/internal.cljc index dcb85d9..528a420 100644 --- a/src/intemporal/workflow/internal.cljc +++ b/src/intemporal/workflow/internal.cljc @@ -12,7 +12,8 @@ [net.cgrand.macrovich :as macros] [intemporal.workflow.internal :refer [trace! trace-async!]] [intemporal.store :refer [bfn]])) - #?(:clj (:import [java.util.function BiConsumer]))) + #?(:clj (:import [java.util.function BiConsumer] + [java.util.concurrent CompletableFuture]))) #?(:clj (set! *warn-on-reflection* true)) @@ -86,7 +87,7 @@ ;(otspan/async-bound-cf-span attrs#) ;(with-env-internal (merge *env* {:telemetry-context (->telemetry-context)})) (let [res# (do ~@body)] - (.whenComplete res# + (.whenComplete ^CompletableFuture res# (reify BiConsumer (accept [_# t# e#] (when e# (otspan/add-exception! {:context span#} e#)) diff --git a/test/intemporal/shutdown_restart_test.clj b/test/intemporal/shutdown_restart_test.clj index 3d29555..2662039 100644 --- a/test/intemporal/shutdown_restart_test.clj +++ b/test/intemporal/shutdown_restart_test.clj @@ -19,7 +19,6 @@ MyActivities (foo [this a] (.countDown activity-invoked?) - (Thread/sleep 100) (.await executor-shutdown?) :foo)) From e95d08dd3379a565b3a16463b04c1cd0040e3a5b Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Tue, 16 Dec 2025 21:23:17 +0000 Subject: [PATCH 05/26] fix shutdown sequence --- src/intemporal/workflow.cljc | 50 +++++++++---------- src/intemporal/workflow/internal.cljc | 11 ++-- .../stores_release_reenqueue_test.clj | 2 +- 3 files changed, 29 insertions(+), 34 deletions(-) diff --git a/src/intemporal/workflow.cljc b/src/intemporal/workflow.cljc index ab06a62..348f7c6 100644 --- a/src/intemporal/workflow.cljc +++ b/src/intemporal/workflow.cljc @@ -45,27 +45,6 @@ (shutdown [this grace-period-ms] "Shuts down the task executor") (running? [this] "Indicates if the executor is running")) -;; allow expressions like (with-open [executor (w/start-poller .... - -#?(:clj #_:clj-kondo/ignore (extend-protocol ITaskExecutor - AutoCloseable - (close [this] (shutdown this 0)))) - -;; make sure that any given executor service can implement ITaskExecutor -#?(:clj (extend-type ExecutorService - ITaskExecutor - (submit [executor f] - (.submit ^ExecutorService executor ^Runnable f)) - (shutdown [executor grace-period-ms] - ;; todo: release tasks - (.shutdown ^ExecutorService executor) - (t/log! {:level :debug} ["Executor shutdown"]) - (when-not (.awaitTermination ^ExecutorService executor grace-period-ms TimeUnit/MILLISECONDS) - (t/log! {:level :debug} ["Executor shutdown grace period over, shutting down NOW"]) - (.shutdownNow ^ExecutorService executor))) - (running? [executor] - (not (.isShutdown ^ExecutorService executor))))) - (defn make-task-executor "Creates an object that satisfies `ITaskExecutor`." [] @@ -78,12 +57,31 @@ (shutdown [_ grace-period-ms] (t/log! {:level :debug} ["Executor shutdown"]) (reset! run? false)) - (running? [_] @run?)) + (running? [_] + @run?)) :clj - (let [factory (-> (Thread/ofVirtual) - (.name "Task Thread") - (.factory))] - (Executors/newThreadPerTaskExecutor factory))))) ;])))) + (let [factory (-> (Thread/ofVirtual) + (.name "Task Thread") + (.factory)) + exec (Executors/newThreadPerTaskExecutor factory) + running? (atom true)] + (reify + ITaskExecutor + (submit [_ f] + (.submit exec ^Runnable f)) + (shutdown [_ grace-period-ms] + (.shutdown exec) + (t/log! {:level :debug} ["Executor shutdown"]) + (when-not (.awaitTermination exec grace-period-ms TimeUnit/MILLISECONDS) + (t/log! {:level :debug} ["Executor shutdown grace period over, shutting down NOW"]) + (.shutdownNow exec)) + (reset! running? false)) + (running? [_] + @running?) + ;; allow expressions like (with-open [executor (w/start-poller .... + AutoCloseable + (close [this] + (shutdown this 0))))))) (defn- worker-execute-fn "Executes a given protocol, activity or workflow `task`" diff --git a/src/intemporal/workflow/internal.cljc b/src/intemporal/workflow/internal.cljc index 528a420..3cbdbdb 100644 --- a/src/intemporal/workflow/internal.cljc +++ b/src/intemporal/workflow/internal.cljc @@ -88,11 +88,9 @@ ;(with-env-internal (merge *env* {:telemetry-context (->telemetry-context)})) (let [res# (do ~@body)] (.whenComplete ^CompletableFuture res# - (reify BiConsumer - (accept [_# t# e#] - (when e# (otspan/add-exception! {:context span#} e#)) - (otspan/end-span! {:context span#})))))))) - + (fn [t# e#] + (when e# (otspan/add-exception! {:context span#} e#)) + (otspan/end-span! {:context span#}))))))) (defn add-event! ([task ename attrs] @@ -208,12 +206,11 @@ handle-ok (bfn [r] ;; TODO assert r is serializable! ;; we check for shutdown because in js runtime, there is no thread interruption + ;; at this point, if we are shutting down it means we exhausted the grace period (let [panic? (shutting-down?)] (try (if panic? - ;(trace! {:id ::store/task<-panic}) (task<-panic store id (error/panic "Worker shutting down during invocation result handling")) - ;(trace! {:id ::store/task<-event}) (let [new-event (assoc next-event :result r)] #?(:clj (otspan/add-span-data! {:attributes {:replayed false :result r}})) (task<-event store id new-event) diff --git a/test/intemporal/stores_release_reenqueue_test.clj b/test/intemporal/stores_release_reenqueue_test.clj index ddc3765..c6b3cf3 100644 --- a/test/intemporal/stores_release_reenqueue_test.clj +++ b/test/intemporal/stores_release_reenqueue_test.clj @@ -35,7 +35,7 @@ ;;;; test proper -(deftest executor-shutdown-reenqueue-test +(deftest release-reenqueue-test (doseq [[label store] @stores] (store/clear-events store) From 52a7f38dbcb84fd24fb7070ed5b61e8203db5290 Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Thu, 18 Dec 2025 13:40:17 +0000 Subject: [PATCH 06/26] use separate thread --- src/intemporal/workflow.cljc | 35 +++++---- src/intemporal/workflow/internal.cljc | 2 + .../basic_test.clj} | 2 +- .../release_reenqueue_test.clj} | 2 +- test/intemporal/stores/resilience_test.clj | 71 +++++++++++++++++++ test/intemporal/{ => stores}/saga_test.clj | 2 +- test/intemporal/test_utils.cljc | 38 +++++++++- 7 files changed, 133 insertions(+), 19 deletions(-) rename test/intemporal/{stores_test.clj => stores/basic_test.clj} (99%) rename test/intemporal/{stores_release_reenqueue_test.clj => stores/release_reenqueue_test.clj} (97%) create mode 100644 test/intemporal/stores/resilience_test.clj rename test/intemporal/{ => stores}/saga_test.clj (97%) diff --git a/src/intemporal/workflow.cljc b/src/intemporal/workflow.cljc index 348f7c6..7bba675 100644 --- a/src/intemporal/workflow.cljc +++ b/src/intemporal/workflow.cljc @@ -70,12 +70,15 @@ (submit [_ f] (.submit exec ^Runnable f)) (shutdown [_ grace-period-ms] - (.shutdown exec) - (t/log! {:level :debug} ["Executor shutdown"]) - (when-not (.awaitTermination exec grace-period-ms TimeUnit/MILLISECONDS) - (t/log! {:level :debug} ["Executor shutdown grace period over, shutting down NOW"]) - (.shutdownNow exec)) - (reset! running? false)) + (try + (.shutdown exec) + (t/log! {:level :debug} ["Executor shutdown"]) + (when-not (.awaitTermination exec grace-period-ms TimeUnit/MILLISECONDS) + (t/log! {:level :debug} ["Executor shutdown grace period over, shutting down NOW"]) + (.shutdownNow exec)) + ;; in case we got interrupted exception, make sure to set the flag + (finally + (reset! running? false)))) (running? [_] @running?) ;; allow expressions like (with-open [executor (w/start-poller .... @@ -112,7 +115,6 @@ "Continously polls for task while `task-executor` is active." [store protocols task-executor polling-ms] (let [task-counter (atom 0) - uid (random-uuid) shutting-down? (fn [] (not (running? task-executor)))] #_{:clj-kondo/ignore [:loop-without-recur :invalid-arity]} @@ -124,12 +126,15 @@ (when-let [task (store/dequeue-task store)] (t/log! {:level :debug :_data {:task task}} ["Dequeued task with id" (:id task)]) (submit task-executor (fn [] - (worker-execute-fn store protocols task task-counter shutting-down?))))))) + (worker-execute-fn store protocols task task-counter shutting-down?))) + (when (running? task-executor) + (recur)))) + (when (running? task-executor) + (p/recur)))) (p/catch (fn [e] - (t/log! {:level :warn :data {:exception e}} ["Caught error during task polling, continuing"]))) - (p/finally (fn [_ _] - (when (running? task-executor) - (p/recur)))))))) + (t/log! {:level :warn :data {:exception e}} ["Caught error during task polling, continuing"]) + (when (running? task-executor) + (p/recur)))))))) (defn start-poller! "Starts a poller that will submit tasks to the `task-executor`. @@ -141,8 +146,10 @@ (start-poller! store (make-task-executor) opts)) ([store task-executor & {:keys [protocols polling-ms] :or {protocols {} polling-ms 100}}] (assert (satisfies? ITaskExecutor task-executor) "Supplied task executor does not satisfy ITaskExecutor") - (let [polling-fn (fn [] (worker-poll-fn store protocols task-executor polling-ms))] - (submit task-executor polling-fn)) + ;; start poller in a out-of-executor thread so it doesnt prevent the executor from shutting down + ;; the only way to stop the poller is via shutdown + (p/vthread + (worker-poll-fn store protocols task-executor polling-ms)) task-executor)) (defn start-worker! diff --git a/src/intemporal/workflow/internal.cljc b/src/intemporal/workflow/internal.cljc index 3cbdbdb..f4b079a 100644 --- a/src/intemporal/workflow/internal.cljc +++ b/src/intemporal/workflow/internal.cljc @@ -141,6 +141,8 @@ (store/all-events store id)) (defn- task<-event [store task-id event-descr] + ;; TODO patch this to use a compare-and-swap + ;; must send the expected state as arg (add-event! (:type event-descr) {:task-id task-id}) (store/task<-event store task-id event-descr)) diff --git a/test/intemporal/stores_test.clj b/test/intemporal/stores/basic_test.clj similarity index 99% rename from test/intemporal/stores_test.clj rename to test/intemporal/stores/basic_test.clj index 6516ec0..a9624ec 100644 --- a/test/intemporal/stores_test.clj +++ b/test/intemporal/stores/basic_test.clj @@ -1,4 +1,4 @@ -(ns ^:integration ^:fdb ^:sql intemporal.stores-test +(ns ^:integration ^:fdb ^:sql intemporal.stores.basic-test (:require [clojure.test :refer [deftest is testing use-fixtures]] [intemporal.store :as store] [intemporal.store.foundationdb :as fdb] diff --git a/test/intemporal/stores_release_reenqueue_test.clj b/test/intemporal/stores/release_reenqueue_test.clj similarity index 97% rename from test/intemporal/stores_release_reenqueue_test.clj rename to test/intemporal/stores/release_reenqueue_test.clj index c6b3cf3..a20cc26 100644 --- a/test/intemporal/stores_release_reenqueue_test.clj +++ b/test/intemporal/stores/release_reenqueue_test.clj @@ -1,4 +1,4 @@ -(ns ^:integration ^:fdb ^:sql intemporal.stores-release-reenqueue-test +(ns ^:integration ^:fdb ^:sql intemporal.stores.release-reenqueue-test (:require [clojure.test :refer [deftest is testing use-fixtures]] [intemporal.store :as store] [intemporal.store.foundationdb :as fdb] diff --git a/test/intemporal/stores/resilience_test.clj b/test/intemporal/stores/resilience_test.clj new file mode 100644 index 0000000..e705a1e --- /dev/null +++ b/test/intemporal/stores/resilience_test.clj @@ -0,0 +1,71 @@ +(ns ^:integration ^:fdb ^:sql intemporal.stores.resilience-test + (:require [clojure.test :refer [deftest is testing use-fixtures]] + [intemporal.store :as store] + [intemporal.store.foundationdb :as fdb] + [intemporal.store.jdbc :as jdbc] + [intemporal.workflow :as w] + [intemporal.macros :refer [stub-protocol defn-workflow]] + [intemporal.test-utils :as tu :refer [wait]] + [promesa.core :as p]) + (:import (java.util.concurrent CountDownLatch))) + +;(tu/setup-telemere) + +(defprotocol MyActivities + (foo [this a])) + +(defrecord MyActivitiesImpl [] + MyActivities + (foo [this a] + (Thread/sleep (long (rand-int 100))) + [:proto a])) + +(defn-workflow my-workflow [] + (let [pr (stub-protocol MyActivities {}) + prr (foo pr :pr)] + prr)) + +(def stores (delay {:memory (store/make-store) + :fdb (fdb/make-store {:cluster-file-path "docker/fdb.cluster"}) + :postgres (jdbc/make-store {:jdbcUrl "jdbc:postgresql://localhost:5432/root?user=root&password=root" + :migration-dir "migrations/postgres"})})) + + +(def iterations 10) +(def latch (CountDownLatch. iterations)) + +(deftest stores-test + (doseq [[label store] @stores] + (testing (format "store: %s" label) + + (testing "clear" + (store/clear-events store) + (store/clear-tasks store)) + + (testing "multiple iterations" + (w/with-env {:store store} + (dotimes [_ iterations] + ;; workflows are blocking, we wrap in a virtual thread + (p/vthread + (my-workflow)))) + + ;; check that all tasks are enqueued + (wait (= iterations (count (store/list-tasks store))) + (let [wflows (store/list-tasks store)] + (testing "workflows are all new" + (is (= iterations (count wflows))) + (is (= #{:new} (set (map :state wflows)))))))) + + (testing "enqueue all jobs" + (let [ex (w/start-poller! store {:protocols {`MyActivities (->MyActivitiesImpl)}})] + ;; lets wait for all pending + (try + (wait (not (contains? (into #{} (map :state (store/list-tasks store))) :new)) + (w/shutdown ex 5000)) + + (testing "workflows are all new" + (let [wflows (store/list-tasks store)] + (is (= iterations (count wflows))) + (is (= #{:success} (set (map :state wflows)))))) + (finally + (w/shutdown ex 0)))))))) diff --git a/test/intemporal/saga_test.clj b/test/intemporal/stores/saga_test.clj similarity index 97% rename from test/intemporal/saga_test.clj rename to test/intemporal/stores/saga_test.clj index 2cf37f4..ae67ed3 100644 --- a/test/intemporal/saga_test.clj +++ b/test/intemporal/stores/saga_test.clj @@ -1,4 +1,4 @@ -(ns ^:integration ^:fdb ^:sql intemporal.saga-test +(ns ^:integration ^:fdb ^:sql intemporal.stores.saga-test (:require [clojure.test :refer [deftest is testing use-fixtures]] [intemporal.store :as store] [intemporal.store.foundationdb :as fdb] diff --git a/test/intemporal/test_utils.cljc b/test/intemporal/test_utils.cljc index f59c32d..fe35136 100644 --- a/test/intemporal/test_utils.cljc +++ b/test/intemporal/test_utils.cljc @@ -15,7 +15,7 @@ [net.cgrand.macrovich :as macros] [clojure.pprint :as pprint])) #?(:cljs (:require-macros [net.cgrand.macrovich :as macros] - [intemporal.test-utils :refer [with-result]])) + [intemporal.test-utils :refer [with-result wait]])) #?(:clj (:import [java.util.concurrent TimeoutException]))) ;;;; @@ -86,7 +86,8 @@ ;;;; ;; macros -(def with-result-default-timeout 10000) +(def ^:dynamic with-result-default-timeout 10000) +(def ^:dynamic wait-default-timeout 3000) (defmacro with-result "Promise-aware macro: the result can either be a value or a thrown exception. @@ -125,6 +126,39 @@ (done#))))) 0)))))) +(defmacro wait + "Waits for 3 secs until the result is true, or throws; + In `clj` it polls every 100ms + In `cljs` it continuously loops + ``` + (wait (db/find id) + (is (= 1 1)) + ``` + " + [condition & body] + (macros/case + :clj + `(let [timeout# wait-default-timeout + start# (System/currentTimeMillis)] + (loop [] + (if ~condition + (do ~@body) + (if (> (- (System/currentTimeMillis) start#) timeout#) + (throw (ex-info "Timed out" {:timeout timeout#})) + (do (Thread/sleep 100) + (recur)))))) + + :cljs + `(let [timeout# 3000 + start# (.getTime (js/Date.))] + (loop [] + (if ~condition + (do ~@body) + (if (> (- (.getTime (js/Date.)) start#) timeout#) + (throw (ex-info "Timed out" {:timeout timeout#})) + ;; Note: In CLJS this is a "busy wait" loop + (recur))))))) + (defn setup-telemere [] #?(:clj (clojure.pprint/pprint (telemere/check-interop))) (telemere/set-min-level! :trace) From 6f1627d1261baa5773b9b1bddc7859c0faa9b3c7 Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Fri, 19 Dec 2025 18:55:16 +0000 Subject: [PATCH 07/26] implement basic update-if for task attrs, fix tests --- docker/fdb.cluster | 2 +- src/intemporal/store.cljc | 13 ++-- src/intemporal/store/foundationdb.clj | 1 + src/intemporal/store/internal.cljc | 19 +++++ src/intemporal/store/jdbc.clj | 11 ++- test/intemporal/shutdown_restart_test.clj | 2 +- test/intemporal/shutdown_restart_test.cljs | 5 +- test/intemporal/stores/basic_test.clj | 89 +++++++++++----------- 8 files changed, 87 insertions(+), 55 deletions(-) diff --git a/docker/fdb.cluster b/docker/fdb.cluster index 25af3c6..32c6f9d 100644 --- a/docker/fdb.cluster +++ b/docker/fdb.cluster @@ -1 +1 @@ -docker:docker@172.21.0.3:4500 +docker:docker@172.20.0.3:4500 diff --git a/src/intemporal/store.cljc b/src/intemporal/store.cljc index 4968761..3548f32 100644 --- a/src/intemporal/store.cljc +++ b/src/intemporal/store.cljc @@ -135,10 +135,11 @@ find-task (fn [this id] (get @tasks id)) - update-task (fn [this id & kvs] + update-task (fn [this id attrs] (when-let [w (find-task this id)] (maybe-fail!) - (->> (apply assoc w kvs) + (si/validate-transition! w attrs) + (->> (merge w attrs) (si/validate-task!) (swap! tasks assoc id))))] @@ -191,7 +192,7 @@ (vals @tasks))) (task<-panic [this task-id error] - (update-task this task-id :result error)) + (update-task this task-id {:result error})) (task<-event [this task-id {:keys [id ref root type sym args result error] :as event-descr}] ;; some redundancy between :result in task and event @@ -202,14 +203,14 @@ (let [evt {:ref ref :root root :type type :sym sym :args args :error nil :result nil}] (when-not id (save-event this task-id evt)) - (update-task this task-id :state :pending) + (update-task this task-id {:state :pending}) evt) (some? error) (let [evt {:ref ref :root root :type type :sym sym :args nil :error error :result nil}] (when-not id (save-event this task-id evt)) - (update-task this task-id :state :failure :result error) + (update-task this task-id {:state :failure :result error}) evt) ;;(some? result) ;result can be nil @@ -217,7 +218,7 @@ (let [evt {:ref ref :root root :type type :sym sym :args nil :error nil :result result}] (when-not id (save-event this task-id evt)) - (update-task this task-id :state :success :result result) + (update-task this task-id {:state :success :result result}) evt))) (find-task [this id] diff --git a/src/intemporal/store/foundationdb.clj b/src/intemporal/store/foundationdb.clj index 2e4ebf8..a12dbf0 100644 --- a/src/intemporal/store/foundationdb.clj +++ b/src/intemporal/store/foundationdb.clj @@ -110,6 +110,7 @@ ;; not every invocation will come from a persisted task (when task (si/validate-task! updated-task) + (si/validate-transition! task updated-task) (fc/set tx subspace-owned-tasks task-id (serialize updated-task))) updated-evt))) diff --git a/src/intemporal/store/internal.cljc b/src/intemporal/store/internal.cljc index 130e980..3c302b6 100644 --- a/src/intemporal/store/internal.cljc +++ b/src/intemporal/store/internal.cljc @@ -91,6 +91,24 @@ [:result {:optional true} :any] [:error {:optional true} :any]]) +;; valid task states +(def valid-state-transitions {:new #{:pending} + :pending #{:new :success :failure}}) + +(defn validate-transition! + "Ensures that the task's new `:state`, if any, is allowed. + Useful to implement compare-and-swap semantics" + [{:keys [state id]} attrs] + (let [next-states (get valid-state-transitions state)] + ;; if we are updating state + ;; and the new state is not allowed + ;; error out + (when (and (contains? attrs :state) + (not= (:state attrs) state) + (not (contains? next-states (:state attrs)))) + (throw (ex-info (str "Cannot update task with id " id " from state " state " to " (:state attrs)) {:task-id id + :state state + :next-state (:state attrs)}))))) (def validate-task! "Throws if the task is not valid" (m/coercer Task nil {:registry registry})) @@ -107,6 +125,7 @@ (when-not (serializable? obj) (throw (ex-info msg {:object obj}))))) + (defn success? [{:keys [state] :as task}] (= :success state)) diff --git a/src/intemporal/store/jdbc.clj b/src/intemporal/store/jdbc.clj index 4f661e7..4629e5f 100644 --- a/src/intemporal/store/jdbc.clj +++ b/src/intemporal/store/jdbc.clj @@ -123,6 +123,10 @@ ;; and they would expect the event to be present in the history (jdbc/with-transaction [tx db-spec] (let [evt {:ref ref :root root :type type :sym sym :args args} + expected-state (cond + (some? args) :new + (or (some? result) (some? error)) :pending + :else :unknown) updated-task (cond (some? args) {:state (kw->db :pending) :args (serialize args)} (some? error) {:state (kw->db :failure) :result (serialize error)} @@ -136,8 +140,11 @@ (store/save-event this task-id updated-evt)) ;; cant really validate because its a partial task ;(validate-task! updated-task) - (jdbc/execute-one! tx (builder/for-update "tasks" updated-task {:id task-id} default-opts)) - updated-evt))) + (let [updated (jdbc/execute-one! tx (builder/for-update "tasks" updated-task {:id task-id :state (name expected-state)} default-opts))] + (when (empty? updated) + (throw (ex-info (format "Cannot update task with id %s, expected state %s did not match" id expected-state {:task-id id + :expected-state expected-state})))) + updated-evt)))) (find-task [this id] (some-> (jdbc/with-transaction [tx db-spec] diff --git a/test/intemporal/shutdown_restart_test.clj b/test/intemporal/shutdown_restart_test.clj index 2662039..6b54200 100644 --- a/test/intemporal/shutdown_restart_test.clj +++ b/test/intemporal/shutdown_restart_test.clj @@ -64,7 +64,7 @@ (testing "workflow resumes" (with-open [_ (w/start-poller! mstore {:protocols {`MyActivities (->MyActivitiesImpl)} - :polling-ms 500})] + :polling-ms 100})] (store/reenqueue-pending-tasks mstore (constantly nil)) (tu/wait-for-task mstore (:id w1)) (tu/print-tables mstore) diff --git a/test/intemporal/shutdown_restart_test.cljs b/test/intemporal/shutdown_restart_test.cljs index f671c3c..559a400 100644 --- a/test/intemporal/shutdown_restart_test.cljs +++ b/test/intemporal/shutdown_restart_test.cljs @@ -34,6 +34,9 @@ (def ex (w/start-poller! mstore {:protocols {`MyActivities (->MyActivitiesImpl)} :polling-ms 10})) +(defn stop-worker [] + (w/shutdown ex 1000)) + (deftest executor-shutdown-test (testing "shutdown of ongoing workflow" @@ -72,4 +75,4 @@ (stop-worker))))) -;(cljs.test/run-tests *ns*) \ No newline at end of file +(cljs.test/run-tests *ns*) \ No newline at end of file diff --git a/test/intemporal/stores/basic_test.clj b/test/intemporal/stores/basic_test.clj index a9624ec..71dfb36 100644 --- a/test/intemporal/stores/basic_test.clj +++ b/test/intemporal/stores/basic_test.clj @@ -132,50 +132,51 @@ :sym 'clojure.core/+, :result nil, :id string?,} - task)))) - - (testing "ok" - (let [ev-descr {:ref "self" :root "self" :type :intemporal.workflow/success :sym 'clojure.core/+ :result ["result"]} - ev (store/task<-event store (:id db-task) ev-descr) - [task] (store/list-tasks store)] - (is (match? {:ref "self" - :root "self" - :type :intemporal.workflow/success - :sym 'clojure.core/+ - :result ["result"]} - ev)) - (is (match? {:args ["invoke" 333], - :ref "self", - :root "self", - :type :workflow, - :state :success, - :sym 'clojure.core/+, - :result ["result"], - :id string?} - task)))) - - (testing "error" - (let [ex {:some "exception" :data false} - ev-descr {:ref "self" :root "self" :type :intemporal.workflow/failure :sym 'clojure.core/+ :error ex} - ev (store/task<-event store (:id db-task) ev-descr) - [task] (store/list-tasks store)] - - (is (match? {:ref "self" - :root "self" - :type :intemporal.workflow/failure - :sym 'clojure.core/+ - :error ex} - ev)) - - (is (match? {:args ["invoke" 333], - :ref "self", - :root "self", - :type :workflow, - :state :failure, - :sym 'clojure.core/+, - :result ex - :id string?} - task)))))))) + task))) + + (testing "ok" + (let [ev-descr {:ref "self" :root "self" :type :intemporal.workflow/success :sym 'clojure.core/+ :result ["result"]} + ev (store/task<-event store (:id db-task) ev-descr) + [task] (store/list-tasks store)] + (is (match? {:ref "self" + :root "self" + :type :intemporal.workflow/success + :sym 'clojure.core/+ + :result ["result"]} + ev)) + (is (match? {:args ["invoke" 333], + :ref "self", + :root "self", + :type :workflow, + :state :success, + :sym 'clojure.core/+, + :result ["result"], + :id string?} + task))))) + + ;; TODO need to reenqueue another task + #_(testing "error" + (let [ex {:some "exception" :data false} + ev-descr {:ref "self" :root "self" :type :intemporal.workflow/failure :sym 'clojure.core/+ :error ex} + ev (store/task<-event store (:id db-task) ev-descr) + [task] (store/list-tasks store)] + + (is (match? {:ref "self" + :root "self" + :type :intemporal.workflow/failure + :sym 'clojure.core/+ + :error ex} + ev)) + + (is (match? {:args ["invoke" 333], + :ref "self", + :root "self", + :type :workflow, + :state :failure, + :sym 'clojure.core/+, + :result ex + :id string?} + task)))))))) (testing "task await+watch" (let [task (internal/create-workflow-task "self" "self" 'clojure.core/- (var-get #'-) ["invoke" 333] "4") From 5cc6dbf38c93704f25cb953b226b48200aa346c0 Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Sat, 20 Dec 2025 10:38:16 +0000 Subject: [PATCH 08/26] fix earthly --- Earthfile | 7 +++---- bin/kaocha | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Earthfile b/Earthfile index 1e9e1cb..4ed7ef0 100644 --- a/Earthfile +++ b/Earthfile @@ -25,11 +25,10 @@ deps: CACHE ~/.m2 RUN clj -Stree RUN npm install - #RUN wget https://github.com/apple/foundationdb/releases/download/7.3.63/foundationdb-clients_7.3.63-1_aarch64.deb - #RUN dpkg -i foundationdb-clients_7.3.63-1_aarch64.deb - RUN wget -q https://github.com/apple/foundationdb/releases/download/7.1.31/foundationdb-clients_7.1.31-1_amd64.deb - RUN dpkg -i foundationdb-clients_7.1.31-1_amd64.deb + RUN wget -nv https://github.com/apple/foundationdb/releases/download/7.3.57/foundationdb-clients_7.3.57-1_amd64.deb + RUN dpkg -i foundationdb-clients_7.3.57-1_amd64.deb RUN echo "docker:docker@127.0.0.1:4500" > /etc/foundationdb/fdb.cluster + RUN wget -nv --content-disposition https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/latest/download/opentelemetry-javaagent.jar build-base: FROM +deps diff --git a/bin/kaocha b/bin/kaocha index afb5ccd..9e1c63a 100755 --- a/bin/kaocha +++ b/bin/kaocha @@ -2,7 +2,7 @@ [ -d "node_modules/ws" ] || npm install ws -JAVA_OPTS="-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005 -Dio.netty.tryUnsafe=false" +JAVA_OPTS="${JAVA_OPTS:--agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005 -Dio.netty.tryUnsafe=false}" if [[ "$OSTYPE" == "darwin"* ]]; then FDB_OPTS="-J-DFDB_LIBRARY_PATH_FDB_C=/usr/local/lib/libfdb_c.dylib -J-DFDB_LIBRARY_PATH_FDB_JAVA=/usr/local/lib/libfdb_java.jnilib" From 91ec50c4325de7bf82405951b226d3c8b6bddfb6 Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Sat, 20 Dec 2025 11:11:00 +0000 Subject: [PATCH 09/26] fix test --- test/intemporal/shutdown_restart_test.cljs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/intemporal/shutdown_restart_test.cljs b/test/intemporal/shutdown_restart_test.cljs index 559a400..605e50e 100644 --- a/test/intemporal/shutdown_restart_test.cljs +++ b/test/intemporal/shutdown_restart_test.cljs @@ -75,4 +75,4 @@ (stop-worker))))) -(cljs.test/run-tests *ns*) \ No newline at end of file +;(cljs.test/run-tests *ns*) \ No newline at end of file From d556b99bc26475d0c0b2c3eb2d8d55afb4165849 Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Sun, 21 Dec 2025 11:00:06 +0000 Subject: [PATCH 10/26] experiment with docker --- build/build.clj | 15 ----- deps.edn | 22 +++---- docker/fdb.cluster | 2 +- .../postgres/20240326161343-initial.up.sql | 4 +- src/intemporal/workflow/internal.cljc | 2 + .../containers/execute_workflow.clj | 42 +++++++++++++ test/intemporal/containers/failures_test.clj | 60 +++++++++++++++++++ test/intemporal/containers/start_runner.clj | 45 ++++++++++++++ test/intemporal/shutdown_restart_test.cljs | 8 +-- ...ce_test.clj => lots_of_workflows_test.clj} | 18 +++--- 10 files changed, 176 insertions(+), 42 deletions(-) create mode 100644 test/intemporal/containers/execute_workflow.clj create mode 100644 test/intemporal/containers/failures_test.clj create mode 100644 test/intemporal/containers/start_runner.clj rename test/intemporal/stores/{resilience_test.clj => lots_of_workflows_test.clj} (84%) diff --git a/build/build.clj b/build/build.clj index 4ee7623..e7f22c8 100644 --- a/build/build.clj +++ b/build/build.clj @@ -1,21 +1,6 @@ (ns build (:require [clojure.tools.build.api :as b] [clojure.pprint :as pprint])) - -(def base-nses ['intemporal.workflow - 'intemporal.store.internal - 'intemporal.store.foundationdb - 'intemporal.store - 'intemporal.macros - 'intemporal.workflow.internal - 'intemporal.store.jdbc]) - -(def dev-nses ['intemporal.demo-parallelism - 'intemporal.demo-recovery - 'intemporal.demo-saga - 'intemporal.demo-vthread-recovery - 'intemporal.demo-workflow]) - ;; clj -T:build compile-main (defn compile-main [opts] (b/delete {:path "target/classes"}) diff --git a/deps.edn b/deps.edn index 62b9873..5107e5d 100644 --- a/deps.edn +++ b/deps.edn @@ -1,20 +1,20 @@ {:paths ["src" "target"] :test-paths ["test"] - :deps {org.clojure/clojure {:mvn/version "1.12.1"} - thheller/shadow-cljs {:mvn/version "2.28.11"} - com.taoensso/telemere {:mvn/version "1.1.0"} - com.taoensso/nippy {:mvn/version "3.6.0"} + :deps {org.clojure/clojure {:mvn/version "1.12.1"} + thheller/shadow-cljs {:mvn/version "2.28.11"} + com.taoensso/telemere {:mvn/version "1.1.0"} + com.taoensso/nippy {:mvn/version "3.6.0"} com.github.steffan-westcott/clj-otel-api {:mvn/version "0.2.10"} ;;stuff - net.cgrand/macrovich {:mvn/version "0.2.2"} - funcool/promesa {:mvn/version "11.0.678"} - metosin/malli {:mvn/version "0.19.1"} - thedavidmeister/cljc-md5 {:mvn/version "0.0.2"} - missionary/missionary {:mvn/version "b.46"}} + net.cgrand/macrovich {:mvn/version "0.2.2"} + funcool/promesa {:mvn/version "11.0.678"} + metosin/malli {:mvn/version "0.19.1"} + thedavidmeister/cljc-md5 {:mvn/version "0.0.2"} + missionary/missionary {:mvn/version "b.46"}} :aliases {:dev {:extra-paths ["dev" "test"] - :jvm-opts ["-Djdk.attach.allowAttachSelf" + :_jvm-opts ["-Djdk.attach.allowAttachSelf" "-javaagent:./opentelemetry-javaagent.jar" "-Dotel.instrumentation.common.default-enabled=true" "-Dotel.javaagent.debug=false" @@ -22,6 +22,7 @@ "-Dotel.exporter.otlp.endpoint=http://localhost:4317" "-Dotel.instrumentation.netty.enabled=false" "-Dotel.metrics.exporter=none" + "-Dotel.logs.exporter=none" "-Dotel.javaagent.debug=false" "-Dotel.resource.attributes=service.name=intemporal"] @@ -32,6 +33,7 @@ com.lambdaisland/kaocha-cljs {:mvn/version "1.5.154"} ch.qos.logback/logback-classic {:mvn/version "1.5.18"} tortue/spy {:mvn/version "2.15.0"} + org.testcontainers/testcontainers {:mvn/version "2.0.2"} nubank/matcher-combinators {:mvn/version "3.9.2"} com.clojure-goes-fast/clj-async-profiler {:mvn/version "1.6.2"}}} :fdb {:extra-deps {org.foundationdb/fdb-java {:mvn/version "7.3.57"} diff --git a/docker/fdb.cluster b/docker/fdb.cluster index 32c6f9d..74a077b 100644 --- a/docker/fdb.cluster +++ b/docker/fdb.cluster @@ -1 +1 @@ -docker:docker@172.20.0.3:4500 +docker:docker@172.18.0.2:4500 diff --git a/resources/migrations/postgres/20240326161343-initial.up.sql b/resources/migrations/postgres/20240326161343-initial.up.sql index 052fa25..72fad5b 100644 --- a/resources/migrations/postgres/20240326161343-initial.up.sql +++ b/resources/migrations/postgres/20240326161343-initial.up.sql @@ -4,7 +4,7 @@ CREATE TABLE IF NOT EXISTS tasks ( type varchar(50) NOT NULL, ref varchar(50) NULL, root varchar(50) NULL, - sym varchar(100) NOT NULL, + sym varchar(200) NOT NULL, args bytea NULL, result bytea NULL, state varchar(20) NOT NULL, @@ -23,7 +23,7 @@ CREATE TABLE IF NOT EXISTS events ( type varchar(50) NOT NULL, ref varchar(50) NULL, --NOT NULL, root varchar(50) NOT NULL, - sym varchar(50) NOT NULL, + sym varchar(200) NOT NULL, args bytea NULL, result bytea NULL, --FOREIGN KEY (ref) REFERENCES tasks(id) on delete set null, diff --git a/src/intemporal/workflow/internal.cljc b/src/intemporal/workflow/internal.cljc index f4b079a..14964af 100644 --- a/src/intemporal/workflow/internal.cljc +++ b/src/intemporal/workflow/internal.cljc @@ -302,6 +302,8 @@ retval)) ;; ensure we terminate the fn call, even if the next event wouldnt be the expected type (catch #?(:clj Exception :cljs js/Error) e + ;; TODO at this point we should just panic, "userland" exceptions should be handled in the handle-fail + ;; on theory there is no other way for a user exception to bubble out (let [wrapped (ex-info "Internal error while resuming execution" {::type :internal} e)] (task<-event store id {:ref id :root (or root id) :type ::failure :sym sym :error wrapped})) (p/rejected e)))) diff --git a/test/intemporal/containers/execute_workflow.clj b/test/intemporal/containers/execute_workflow.clj new file mode 100644 index 0000000..9564abd --- /dev/null +++ b/test/intemporal/containers/execute_workflow.clj @@ -0,0 +1,42 @@ +(ns ^:integration ^:fdb ^:sql intemporal.containers.execute-workflow + (:gen-class) + (:require [clojure.string :as str] + [clojure.tools.logging :as log] + [intemporal.workflow :as w] + [intemporal.macros :refer [stub-function defn-workflow]] + [intemporal.containers.start-runner :as runner])) + +(defn foo [ a] + (Thread/sleep (long (rand-int 100))) + [:fun a]) + +(defn-workflow test-workflow [] + (let [pr (stub-function foo) + prr (pr :pr)] + prr)) + +;; serves as an entrypoint for a docker container that can be killed +;; some other container will have to execute the poller/runner +(defn -main [& args] + (let [store-type (-> args first keyword) + workflow-sym (or (-> args second) 'my-workflow) + workflow-fn (requiring-resolve (symbol (str/trim workflow-sym))) + store (get runner/stores store-type)] + (prn args) + (when (nil? store) + (throw (ex-info (format "Unknown store type: %s" store-type) {:store-type store-type}))) + + (when (nil? workflow-fn) + (throw (ex-info (format "Unknown worfklow fn : %s" workflow-sym) {:workflow workflow-sym}))) + + + (w/with-env {:store @store} + (try + (let [res (workflow-fn)] + (println "Response: " res)) + (finally + (log/info "Ready")))))) + +(comment + ;; clj -A:dev:jdbc:fdb -m intemporal.containers.execute-workflow postgres intemporal.containers.execute-workflow/test-workflow + (-main "postgres" "intemporal.containers.execute-workflow/test-workflow")) \ No newline at end of file diff --git a/test/intemporal/containers/failures_test.clj b/test/intemporal/containers/failures_test.clj new file mode 100644 index 0000000..2ce3bbf --- /dev/null +++ b/test/intemporal/containers/failures_test.clj @@ -0,0 +1,60 @@ +(ns ^:integration ^:fdb ^:sql intemporal.containers.failures-test + (:require [clojure.string :as str] + [clojure.test :refer [deftest is testing]] + [clojure.tools.logging :as log] + [intemporal.containers.start-runner] + [intemporal.containers.execute-workflow]) + (:import (org.testcontainers.containers BindMode GenericContainer) + (org.testcontainers.containers.startupcheck IndefiniteWaitOneShotStartupCheckStrategy) + (org.testcontainers.containers.wait.strategy Wait))) + +;; assumes docker-compose is running +(def default-network "intemporal_default") + +(defn make-container ^GenericContainer + ^GenericContainer [clj-main {:keys [aliases args env] :or {env {"JDBC_URL" "jdbc:postgresql://postgresql:5432/root?user=root&password=root"} + aliases "dev:jdbc:fdb"}}] + ;; docker run --name workflow --network=intemporal_default -ti --rm + ;; -e JDBC_URL="jdbc:postgresql://postgresql:5432/root?user=root&password=root" + ;; -v "$(pwd)":/tmp clojure:temurin-25-tools-deps-noble clj + ;; -A:dev:jdbc:fdb -m intemporal.containers.execute-workflow postgres intemporal.containers.execute-workflow/test-workflow + (let [cmd (format "clj -A:%s -m %s %s" aliases clj-main (str/join " " args)) + img "clojure:temurin-25-tools-deps-noble" + container (doto (GenericContainer. img) + (.withEnv env) + (.withNetworkMode default-network) + (.withCommand cmd) + ;; will wait until the work is finished, but no exit + (.waitingFor (Wait/forLogMessage ".*Ready.*\n", 1)) + (.withFileSystemBind (System/getProperty "user.dir") "/tmp" BindMode/READ_ONLY))] + (log/infof "Creating container with image %s and command %s" img cmd) + container)) + +(defn container-exit-code [^GenericContainer container] + (-> (.inspectContainerCmd (.getDockerClient container) (.getContainerId container)) + (.exec) + (.getState) + (.getExitCodeLong))) + +(defn var->ns [v] + (:ns (meta v))) + +(defn var->str [v] + (str (symbol v))) + +(deftest postgres-test + (testing "failure via crash" + (let [runner (make-container (var->ns #'intemporal.containers.start-runner/-main) {:args ["postgres"]}) + workflow-fn #'intemporal.containers.execute-workflow/test-workflow + workflow (make-container (var->ns workflow-fn) {:args ["postgres" (var->str workflow-fn)]})] + (try + (.start runner) + (.start workflow) + + (testing "containers eventually exit" + (is (zero? (container-exit-code runner))) + (is (zero? (container-exit-code workflow)))) + + (finally + (.stop runner) + (.stop workflow)))))) diff --git a/test/intemporal/containers/start_runner.clj b/test/intemporal/containers/start_runner.clj new file mode 100644 index 0000000..a9cb097 --- /dev/null +++ b/test/intemporal/containers/start_runner.clj @@ -0,0 +1,45 @@ +(ns ^:integration ^:fdb ^:sql intemporal.containers.start-runner + (:gen-class) + (:require [clojure.tools.logging :as log] + [intemporal.store.foundationdb :as fdb] + [intemporal.store.jdbc :as jdbc] + [intemporal.workflow :as w]) + (:import (sun.misc Signal SignalHandler))) + +(def stores {:fdb (delay (fdb/make-store {:cluster-file-path (or (System/getenv "FDB_CLUSTERFILE") + "docker/fdb.cluster")})) + :postgres (delay (jdbc/make-store {:jdbcUrl (or (System/getenv "JDBC_URL") + "jdbc:postgresql://localhost:5432/root?user=root&password=root") + :migration-dir "migrations/postgres"}))}) + +;; serves as an entrypoint for a docker container that can be killed +;; some other container will have to execute the poller/runner +;; clj -A:dev:jdbc:fdb -m intemporal.containers.start-runner postgres +(defn -main [& args] + (let [store-type (-> args first keyword) + store (get stores store-type)] + (when (nil? store) + (throw (ex-info (format "Unknown store type: %s" store-type) {:store-type store-type}))) + (log/info "Starting poller for store" store-type) + + (let [ex (w/start-poller! @store {})] + (log/info "Ready: poller started for store" store-type) + (doto (Thread. (fn [] + (Signal/handle (Signal. "TERM") + (reify SignalHandler + (handle [this sig] + (log/info "Received SIGTERM, shutting down") + (w/shutdown ex 0)))) + (Signal/handle (Signal. "INT") + (reify SignalHandler + (handle [this sig] + (log/info "Received INT, quitting") + (System/exit 0)))) + (log/info "Waiting for SIGTERM/SIGINT...") + (Thread/sleep Integer/MAX_VALUE))) + + (.setDaemon false) + (.start))))) + +(comment + (-main "postgres")) \ No newline at end of file diff --git a/test/intemporal/shutdown_restart_test.cljs b/test/intemporal/shutdown_restart_test.cljs index 605e50e..0f24a45 100644 --- a/test/intemporal/shutdown_restart_test.cljs +++ b/test/intemporal/shutdown_restart_test.cljs @@ -55,10 +55,10 @@ (is (match? {:type :workflow :sym 'intemporal.shutdown-restart-test/my-workflow- :state :pending} w1)))) (testing "workflow events" - (let [[e1 e2 e3] (store/list-events mstore)] - (is (match? {:type :intemporal.workflow/invoke :sym 'intemporal.shutdown-restart-test/my-workflow-} e1)) - (is (match? {:type :intemporal.protocol/invoke :sym 'intemporal.shutdown-restart-test/foo} e2)) - (is (nil? e3)))))))) + (let [[e1 e2 e3] (store/list-events mstore)] + (is (match? {:type :intemporal.workflow/invoke :sym 'intemporal.shutdown-restart-test/my-workflow-} e1) + (is (match? {:type :intemporal.protocol/invoke :sym 'intemporal.shutdown-restart-test/foo} e2))) + (is (nil? e3)))))))) #_(deftest executor-shutdown-resume-test (testing "workflow resumes" diff --git a/test/intemporal/stores/resilience_test.clj b/test/intemporal/stores/lots_of_workflows_test.clj similarity index 84% rename from test/intemporal/stores/resilience_test.clj rename to test/intemporal/stores/lots_of_workflows_test.clj index e705a1e..36d853e 100644 --- a/test/intemporal/stores/resilience_test.clj +++ b/test/intemporal/stores/lots_of_workflows_test.clj @@ -1,5 +1,5 @@ -(ns ^:integration ^:fdb ^:sql intemporal.stores.resilience-test - (:require [clojure.test :refer [deftest is testing use-fixtures]] +(ns ^:integration ^:fdb ^:sql intemporal.stores.lots-of-workflows-test + (:require [clojure.test :refer [deftest is testing]] [intemporal.store :as store] [intemporal.store.foundationdb :as fdb] [intemporal.store.jdbc :as jdbc] @@ -9,8 +9,6 @@ [promesa.core :as p]) (:import (java.util.concurrent CountDownLatch))) -;(tu/setup-telemere) - (defprotocol MyActivities (foo [this a])) @@ -31,7 +29,7 @@ :migration-dir "migrations/postgres"})})) -(def iterations 10) +(def iterations 100) (def latch (CountDownLatch. iterations)) (deftest stores-test @@ -44,10 +42,10 @@ (testing "multiple iterations" (w/with-env {:store store} - (dotimes [_ iterations] - ;; workflows are blocking, we wrap in a virtual thread - (p/vthread - (my-workflow)))) + (dotimes [_ iterations] + ;; workflows are blocking, we wrap in a virtual thread + (p/vthread + (my-workflow)))) ;; check that all tasks are enqueued (wait (= iterations (count (store/list-tasks store))) @@ -63,7 +61,7 @@ (wait (not (contains? (into #{} (map :state (store/list-tasks store))) :new)) (w/shutdown ex 5000)) - (testing "workflows are all new" + (testing "workflows are all completed" (let [wflows (store/list-tasks store)] (is (= iterations (count wflows))) (is (= #{:success} (set (map :state wflows)))))) From 0c58cb5667dbd1ece157ad7f78a60bfc99fc19df Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Sun, 21 Dec 2025 11:15:00 +0000 Subject: [PATCH 11/26] fixup! experiment with docker --- .github/workflows/ci.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 295b2a8..3af5e7b 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -7,6 +7,7 @@ concurrency: jobs: earthly: + if: false name: Earthly ci runs-on: ubuntu-latest permissions: write-all @@ -36,7 +37,7 @@ jobs: coverage-file: './coverage/lcov.info' clojure: - if: false + if: true runs-on: ubuntu-latest steps: - name: Checkout From 1f3abab13116cb47b8ce6e3b525dcc68e3fbc01e Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Sun, 21 Dec 2025 11:16:45 +0000 Subject: [PATCH 12/26] fixup! fixup! experiment with docker --- test/intemporal/containers/failures_test.clj | 1 - 1 file changed, 1 deletion(-) diff --git a/test/intemporal/containers/failures_test.clj b/test/intemporal/containers/failures_test.clj index 2ce3bbf..923148a 100644 --- a/test/intemporal/containers/failures_test.clj +++ b/test/intemporal/containers/failures_test.clj @@ -5,7 +5,6 @@ [intemporal.containers.start-runner] [intemporal.containers.execute-workflow]) (:import (org.testcontainers.containers BindMode GenericContainer) - (org.testcontainers.containers.startupcheck IndefiniteWaitOneShotStartupCheckStrategy) (org.testcontainers.containers.wait.strategy Wait))) ;; assumes docker-compose is running From b119947682301f93db9c042a66d50d92973b767b Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Sun, 21 Dec 2025 11:17:51 +0000 Subject: [PATCH 13/26] fixup! fixup! fixup! experiment with docker --- src/intemporal/store/jdbc.clj | 4 ++-- src/intemporal/workflow.cljc | 2 +- src/intemporal/workflow/internal.cljc | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/intemporal/store/jdbc.clj b/src/intemporal/store/jdbc.clj index 4629e5f..052af29 100644 --- a/src/intemporal/store/jdbc.clj +++ b/src/intemporal/store/jdbc.clj @@ -142,8 +142,8 @@ ;(validate-task! updated-task) (let [updated (jdbc/execute-one! tx (builder/for-update "tasks" updated-task {:id task-id :state (name expected-state)} default-opts))] (when (empty? updated) - (throw (ex-info (format "Cannot update task with id %s, expected state %s did not match" id expected-state {:task-id id - :expected-state expected-state})))) + (throw (ex-info (format "Cannot update task with id %s, expected state %s did not match" id expected-state) + {:task-id id :expected-state expected-state}))) updated-evt)))) (find-task [this id] diff --git a/src/intemporal/workflow.cljc b/src/intemporal/workflow.cljc index 7bba675..20d43a3 100644 --- a/src/intemporal/workflow.cljc +++ b/src/intemporal/workflow.cljc @@ -9,7 +9,7 @@ [intemporal.workflow :refer [with-env]])) #?(:clj (:require [intemporal.workflow.internal :refer [trace! trace-async!]] [steffan-westcott.clj-otel.context :as otctx])) - #?(:clj (:import [java.util.concurrent ExecutorService Executors TimeUnit] + #?(:clj (:import [java.util.concurrent Executors TimeUnit] [java.lang AutoCloseable]))) #?(:clj (set! *warn-on-reflection* true)) diff --git a/src/intemporal/workflow/internal.cljc b/src/intemporal/workflow/internal.cljc index 14964af..62545ab 100644 --- a/src/intemporal/workflow/internal.cljc +++ b/src/intemporal/workflow/internal.cljc @@ -12,8 +12,7 @@ [net.cgrand.macrovich :as macros] [intemporal.workflow.internal :refer [trace! trace-async!]] [intemporal.store :refer [bfn]])) - #?(:clj (:import [java.util.function BiConsumer] - [java.util.concurrent CompletableFuture]))) + #?(:clj (:import [java.util.concurrent CompletableFuture]))) #?(:clj (set! *warn-on-reflection* true)) From be36d0052046f4b9baeb7ad064e4e31854d3348f Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Sun, 21 Dec 2025 11:32:08 +0000 Subject: [PATCH 14/26] fixup! fixup! fixup! fixup! experiment with docker --- .github/workflows/ci.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3af5e7b..7487a93 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -74,10 +74,10 @@ jobs: - name: Setup FDB run: - curl -L "https://github.com/apple/foundationdb/releases/download/7.1.31/foundationdb-clients_7.1.31-1_amd64.deb" --output /tmp/foundationdb-clients_7.1.31-1_amd64.deb --fail; - sudo dpkg -i /tmp/foundationdb-clients_7.1.31-1_amd64.deb; - sudo rm -f /tmp/foundationdb-clients_7.1.31-1_amd64.deb; - sudo curl --fail -L "https://github.com/apple/foundationdb/releases/download/7.1.31/libfdb_c.x86_64.so" --output "/usr/lib/libfdb_c.7.1.31.x86_64.so"; + curl -L "https://github.com/apple/foundationdb/releases/download/7.3.57/foundationdb-clients_7.3.57-1_amd64.deb" --output /tmp/foundationdb-clients_7.3.57-1_amd64.deb --fail; + sudo dpkg -i /tmp/foundationdb-clients_7.3.57-1_amd64.deb; + sudo rm -f /tmp/foundationdb-clients_7.3.57-1_amd64.deb; + sudo curl --fail -L "https://github.com/apple/foundationdb/releases/download/7.3.57/libfdb_c.x86_64.so" --output "/usr/lib/libfdb_c.7.3.57.x86_64.so"; - name: Setup docker compose id: compose From 6f3835a36aaea97ffbdf1d8b16a6ada6d70be5f2 Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Sun, 21 Dec 2025 11:38:48 +0000 Subject: [PATCH 15/26] fixup! fixup! fixup! fixup! fixup! experiment with docker --- test/intemporal/test_utils.cljc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/intemporal/test_utils.cljc b/test/intemporal/test_utils.cljc index fe35136..1dae4e1 100644 --- a/test/intemporal/test_utils.cljc +++ b/test/intemporal/test_utils.cljc @@ -87,7 +87,7 @@ ;; macros (def ^:dynamic with-result-default-timeout 10000) -(def ^:dynamic wait-default-timeout 3000) +(def ^:dynamic wait-default-timeout 10000) (defmacro with-result "Promise-aware macro: the result can either be a value or a thrown exception. @@ -127,7 +127,7 @@ 0)))))) (defmacro wait - "Waits for 3 secs until the result is true, or throws; + "Waits for 10 secs until the result is true, or throws; In `clj` it polls every 100ms In `cljs` it continuously loops ``` From c2fd0006d140e1b9c3b8caca782b72b3a7bc5c35 Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Sun, 21 Dec 2025 11:43:44 +0000 Subject: [PATCH 16/26] fixup! fixup! fixup! fixup! fixup! fixup! experiment with docker --- test/intemporal/stores/lots_of_workflows_test.clj | 11 ++++++----- test/intemporal/test_utils.cljc | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/test/intemporal/stores/lots_of_workflows_test.clj b/test/intemporal/stores/lots_of_workflows_test.clj index 36d853e..5402738 100644 --- a/test/intemporal/stores/lots_of_workflows_test.clj +++ b/test/intemporal/stores/lots_of_workflows_test.clj @@ -48,11 +48,12 @@ (my-workflow)))) ;; check that all tasks are enqueued - (wait (= iterations (count (store/list-tasks store))) - (let [wflows (store/list-tasks store)] - (testing "workflows are all new" - (is (= iterations (count wflows))) - (is (= #{:new} (set (map :state wflows)))))))) + (with-redefs [tu/wait-default-timeout 60000] + (wait (= iterations (count (store/list-tasks store))) + (let [wflows (store/list-tasks store)] + (testing "workflows are all new" + (is (= iterations (count wflows))) + (is (= #{:new} (set (map :state wflows))))))))) (testing "enqueue all jobs" (let [ex (w/start-poller! store {:protocols {`MyActivities (->MyActivitiesImpl)}})] diff --git a/test/intemporal/test_utils.cljc b/test/intemporal/test_utils.cljc index 1dae4e1..d2e04fc 100644 --- a/test/intemporal/test_utils.cljc +++ b/test/intemporal/test_utils.cljc @@ -87,7 +87,7 @@ ;; macros (def ^:dynamic with-result-default-timeout 10000) -(def ^:dynamic wait-default-timeout 10000) +(def ^:dynamic wait-default-timeout 3000) (defmacro with-result "Promise-aware macro: the result can either be a value or a thrown exception. From 7d00a23e37d6087d2fc9aff856d49f33551ba696 Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Sun, 21 Dec 2025 11:54:33 +0000 Subject: [PATCH 17/26] fixup! fixup! fixup! fixup! fixup! fixup! fixup! experiment with docker --- .../stores/lots_of_workflows_test.clj | 53 ++++++++++--------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/test/intemporal/stores/lots_of_workflows_test.clj b/test/intemporal/stores/lots_of_workflows_test.clj index 5402738..61034bb 100644 --- a/test/intemporal/stores/lots_of_workflows_test.clj +++ b/test/intemporal/stores/lots_of_workflows_test.clj @@ -33,38 +33,39 @@ (def latch (CountDownLatch. iterations)) (deftest stores-test - (doseq [[label store] @stores] - (testing (format "store: %s" label) + (with-redefs [tu/wait-default-timeout 60000] + (doseq [[label store] @stores] + (testing (format "store: %s" label) - (testing "clear" - (store/clear-events store) - (store/clear-tasks store)) + (testing "clear" + (store/clear-events store) + (store/clear-tasks store)) - (testing "multiple iterations" - (w/with-env {:store store} - (dotimes [_ iterations] - ;; workflows are blocking, we wrap in a virtual thread - (p/vthread - (my-workflow)))) + (testing "multiple iterations" + (w/with-env {:store store} + (dotimes [_ iterations] + ;; workflows are blocking, we wrap in a virtual thread + (p/vthread + (my-workflow)))) + + ;; check that all tasks are enqueued - ;; check that all tasks are enqueued - (with-redefs [tu/wait-default-timeout 60000] (wait (= iterations (count (store/list-tasks store))) (let [wflows (store/list-tasks store)] (testing "workflows are all new" (is (= iterations (count wflows))) - (is (= #{:new} (set (map :state wflows))))))))) + (is (= #{:new} (set (map :state wflows)))))))) - (testing "enqueue all jobs" - (let [ex (w/start-poller! store {:protocols {`MyActivities (->MyActivitiesImpl)}})] - ;; lets wait for all pending - (try - (wait (not (contains? (into #{} (map :state (store/list-tasks store))) :new)) - (w/shutdown ex 5000)) + (testing "enqueue all jobs" + (let [ex (w/start-poller! store {:protocols {`MyActivities (->MyActivitiesImpl)}})] + ;; lets wait for all pending + (try + (wait (not (contains? (into #{} (map :state (store/list-tasks store))) :new)) + (w/shutdown ex 5000)) - (testing "workflows are all completed" - (let [wflows (store/list-tasks store)] - (is (= iterations (count wflows))) - (is (= #{:success} (set (map :state wflows)))))) - (finally - (w/shutdown ex 0)))))))) + (testing "workflows are all completed" + (let [wflows (store/list-tasks store)] + (is (= iterations (count wflows))) + (is (= #{:success} (set (map :state wflows)))))) + (finally + (w/shutdown ex 0))))))))) From 13fe52904e6bf3281a93c3c61ec1be38c98d3137 Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Sun, 21 Dec 2025 12:12:51 +0000 Subject: [PATCH 18/26] start 10 workfows --- test/intemporal/containers/failures_test.clj | 31 +++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/test/intemporal/containers/failures_test.clj b/test/intemporal/containers/failures_test.clj index 923148a..50de5d6 100644 --- a/test/intemporal/containers/failures_test.clj +++ b/test/intemporal/containers/failures_test.clj @@ -4,7 +4,8 @@ [clojure.tools.logging :as log] [intemporal.containers.start-runner] [intemporal.containers.execute-workflow]) - (:import (org.testcontainers.containers BindMode GenericContainer) + (:import (java.util.concurrent CountDownLatch) + (org.testcontainers.containers BindMode GenericContainer) (org.testcontainers.containers.wait.strategy Wait))) ;; assumes docker-compose is running @@ -17,10 +18,11 @@ ;; -e JDBC_URL="jdbc:postgresql://postgresql:5432/root?user=root&password=root" ;; -v "$(pwd)":/tmp clojure:temurin-25-tools-deps-noble clj ;; -A:dev:jdbc:fdb -m intemporal.containers.execute-workflow postgres intemporal.containers.execute-workflow/test-workflow - (let [cmd (format "clj -A:%s -m %s %s" aliases clj-main (str/join " " args)) + (let [cmd (format "clj -A:%s -M -m %s %s" aliases clj-main (str/join " " args)) img "clojure:temurin-25-tools-deps-noble" container (doto (GenericContainer. img) (.withEnv env) + ;; requires docker-compose running foundation and postgres (.withNetworkMode default-network) (.withCommand cmd) ;; will wait until the work is finished, but no exit @@ -43,17 +45,30 @@ (deftest postgres-test (testing "failure via crash" - (let [runner (make-container (var->ns #'intemporal.containers.start-runner/-main) {:args ["postgres"]}) - workflow-fn #'intemporal.containers.execute-workflow/test-workflow - workflow (make-container (var->ns workflow-fn) {:args ["postgres" (var->str workflow-fn)]})] + (let [runner (make-container (var->ns #'intemporal.containers.start-runner/-main) {:args ["postgres"]}) + workflow-fn #'intemporal.containers.execute-workflow/test-workflow + num-workflows 10 + workflows (mapv (fn [_] (make-container (var->ns workflow-fn) {:args ["postgres" (var->str workflow-fn)]})) + (range num-workflows)) + latch (CountDownLatch. num-workflows)] (try (.start runner) - (.start workflow) + ;; start all at same time + (doseq [workflow workflows] + (future + (try (.start workflow) + (finally + (.countDown latch))))) (testing "containers eventually exit" + ;; at this point, runner is ready + (is (zero? (container-exit-code runner))) - (is (zero? (container-exit-code workflow)))) + (.await latch) + (doseq [workflow workflows] + (is (zero? (container-exit-code workflow))))) (finally (.stop runner) - (.stop workflow)))))) + (doseq [workflow workflows] + (.stop workflow))))))) From ab332c030a65ad19f021c1edce3f84d2cae57c44 Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Sun, 21 Dec 2025 16:45:48 +0000 Subject: [PATCH 19/26] restore earthly --- .github/workflows/ci.yaml | 4 +- deps.edn | 1 - test/intemporal/containers/failures_test.clj | 74 -------------------- 3 files changed, 2 insertions(+), 77 deletions(-) delete mode 100644 test/intemporal/containers/failures_test.clj diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 7487a93..84ec9de 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -7,7 +7,7 @@ concurrency: jobs: earthly: - if: false + if: true name: Earthly ci runs-on: ubuntu-latest permissions: write-all @@ -37,7 +37,7 @@ jobs: coverage-file: './coverage/lcov.info' clojure: - if: true + if: false runs-on: ubuntu-latest steps: - name: Checkout diff --git a/deps.edn b/deps.edn index 5107e5d..ee52428 100644 --- a/deps.edn +++ b/deps.edn @@ -33,7 +33,6 @@ com.lambdaisland/kaocha-cljs {:mvn/version "1.5.154"} ch.qos.logback/logback-classic {:mvn/version "1.5.18"} tortue/spy {:mvn/version "2.15.0"} - org.testcontainers/testcontainers {:mvn/version "2.0.2"} nubank/matcher-combinators {:mvn/version "3.9.2"} com.clojure-goes-fast/clj-async-profiler {:mvn/version "1.6.2"}}} :fdb {:extra-deps {org.foundationdb/fdb-java {:mvn/version "7.3.57"} diff --git a/test/intemporal/containers/failures_test.clj b/test/intemporal/containers/failures_test.clj deleted file mode 100644 index 50de5d6..0000000 --- a/test/intemporal/containers/failures_test.clj +++ /dev/null @@ -1,74 +0,0 @@ -(ns ^:integration ^:fdb ^:sql intemporal.containers.failures-test - (:require [clojure.string :as str] - [clojure.test :refer [deftest is testing]] - [clojure.tools.logging :as log] - [intemporal.containers.start-runner] - [intemporal.containers.execute-workflow]) - (:import (java.util.concurrent CountDownLatch) - (org.testcontainers.containers BindMode GenericContainer) - (org.testcontainers.containers.wait.strategy Wait))) - -;; assumes docker-compose is running -(def default-network "intemporal_default") - -(defn make-container ^GenericContainer - ^GenericContainer [clj-main {:keys [aliases args env] :or {env {"JDBC_URL" "jdbc:postgresql://postgresql:5432/root?user=root&password=root"} - aliases "dev:jdbc:fdb"}}] - ;; docker run --name workflow --network=intemporal_default -ti --rm - ;; -e JDBC_URL="jdbc:postgresql://postgresql:5432/root?user=root&password=root" - ;; -v "$(pwd)":/tmp clojure:temurin-25-tools-deps-noble clj - ;; -A:dev:jdbc:fdb -m intemporal.containers.execute-workflow postgres intemporal.containers.execute-workflow/test-workflow - (let [cmd (format "clj -A:%s -M -m %s %s" aliases clj-main (str/join " " args)) - img "clojure:temurin-25-tools-deps-noble" - container (doto (GenericContainer. img) - (.withEnv env) - ;; requires docker-compose running foundation and postgres - (.withNetworkMode default-network) - (.withCommand cmd) - ;; will wait until the work is finished, but no exit - (.waitingFor (Wait/forLogMessage ".*Ready.*\n", 1)) - (.withFileSystemBind (System/getProperty "user.dir") "/tmp" BindMode/READ_ONLY))] - (log/infof "Creating container with image %s and command %s" img cmd) - container)) - -(defn container-exit-code [^GenericContainer container] - (-> (.inspectContainerCmd (.getDockerClient container) (.getContainerId container)) - (.exec) - (.getState) - (.getExitCodeLong))) - -(defn var->ns [v] - (:ns (meta v))) - -(defn var->str [v] - (str (symbol v))) - -(deftest postgres-test - (testing "failure via crash" - (let [runner (make-container (var->ns #'intemporal.containers.start-runner/-main) {:args ["postgres"]}) - workflow-fn #'intemporal.containers.execute-workflow/test-workflow - num-workflows 10 - workflows (mapv (fn [_] (make-container (var->ns workflow-fn) {:args ["postgres" (var->str workflow-fn)]})) - (range num-workflows)) - latch (CountDownLatch. num-workflows)] - (try - (.start runner) - ;; start all at same time - (doseq [workflow workflows] - (future - (try (.start workflow) - (finally - (.countDown latch))))) - - (testing "containers eventually exit" - ;; at this point, runner is ready - - (is (zero? (container-exit-code runner))) - (.await latch) - (doseq [workflow workflows] - (is (zero? (container-exit-code workflow))))) - - (finally - (.stop runner) - (doseq [workflow workflows] - (.stop workflow))))))) From 67cd89b34219507e6e78f11089f4fd49dd5368c6 Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Mon, 22 Dec 2025 22:19:34 +0000 Subject: [PATCH 20/26] bump fdb to 7.3.62 --- deps.edn | 2 +- docker-compose.yaml | 2 +- docker/fdb.cluster | 2 +- src/intemporal/workflow.cljc | 2 +- src/intemporal/workflow/internal.cljc | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/deps.edn b/deps.edn index ee52428..6468ae7 100644 --- a/deps.edn +++ b/deps.edn @@ -35,7 +35,7 @@ tortue/spy {:mvn/version "2.15.0"} nubank/matcher-combinators {:mvn/version "3.9.2"} com.clojure-goes-fast/clj-async-profiler {:mvn/version "1.6.2"}}} - :fdb {:extra-deps {org.foundationdb/fdb-java {:mvn/version "7.3.57"} + :fdb {:extra-deps {org.foundationdb/fdb-java {:mvn/version "7.3.62"} me.vedang/clj-fdb {:mvn/version "0.3.0"}}} :jdbc {:extra-deps {com.github.seancorfield/next.jdbc {:mvn/version "1.3.1048"} diff --git a/docker-compose.yaml b/docker-compose.yaml index 1f385c0..e3f5d43 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -26,7 +26,7 @@ services: - jaeger foundation: - image: "foundationdb/foundationdb:7.3.57" + image: "foundationdb/foundationdb:7.3.62" environment: FDB_NETWORKING_MODE: host entrypoint: ["/usr/bin/tini", "-g", "--", "sh", "/fdb-init.bash"] diff --git a/docker/fdb.cluster b/docker/fdb.cluster index 74a077b..5c41e53 100644 --- a/docker/fdb.cluster +++ b/docker/fdb.cluster @@ -1 +1 @@ -docker:docker@172.18.0.2:4500 +docker:docker@192.168.107.3:4500 diff --git a/src/intemporal/workflow.cljc b/src/intemporal/workflow.cljc index 20d43a3..08efc8c 100644 --- a/src/intemporal/workflow.cljc +++ b/src/intemporal/workflow.cljc @@ -101,7 +101,7 @@ internal-env (merge internal/default-env base-env runtime)] ;; root task: we only enqueue workflows (with-env internal-env - (t/log! {:level :debug :data {:sym (:sym task) :env internal-env}} ["Resuming task with id" (:id task)]) + (t/log! {:level :debug :data {:sym (:sym task)}} ["Resuming task with id" (:id task)]) ;; this span creation is required in order for ;; subsequent workflow traces to have a "parent" span, otherwise ;; they won't show up correctly in jaeger diff --git a/src/intemporal/workflow/internal.cljc b/src/intemporal/workflow/internal.cljc index 62545ab..308409e 100644 --- a/src/intemporal/workflow/internal.cljc +++ b/src/intemporal/workflow/internal.cljc @@ -187,9 +187,9 @@ ;; mark invoke/replay (let [next-event {:ref id :root (or root id) :type invoke :sym sym :args args}] (when inv? - (t/log! {:level :debug :data {:task task}} ["Found replay event for task with id" (:id task)])) + (t/log! {:level :debug :data {:sym (:sym task)}} ["Found replay event for task with id" (:id task)])) (when res? - (t/log! {:level :debug :data {:task task}} ["Found result event for task with id" (:id task)])) + (t/log! {:level :debug :data {:sym (:sym task)}} ["Found result event for task with id" (:id task)])) (cond ;; do we have an invocation event? if not, save this one From e29f8bf198d1d5adf066fbb2f40acd3ba763d485 Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Mon, 22 Dec 2025 11:43:47 +0000 Subject: [PATCH 21/26] use one task per activity due to error recovery --- dev/intemporal/demo_parallelism.clj | 2 +- src/intemporal/macros.cljc | 22 ++-- src/intemporal/workflow.cljc | 106 ++++++++++-------- src/intemporal/workflow/internal.cljc | 8 +- .../containers/execute_workflow.clj | 42 ------- test/intemporal/containers/start_runner.clj | 45 -------- test/intemporal/shutdown_restart_test.clj | 39 ++++--- .../stores/lots_of_workflows_test.clj | 8 +- .../stores/multiple_shutdown_test.clj | 75 +++++++++++++ .../stores/release_reenqueue_test.clj | 2 +- test/intemporal/test_executor.clj | 42 +++++++ test/intemporal/test_utils.cljc | 8 +- test/intemporal/vthread_test.cljc | 7 +- test/intemporal/workflow_test.cljc | 2 +- 14 files changed, 232 insertions(+), 176 deletions(-) delete mode 100644 test/intemporal/containers/execute_workflow.clj delete mode 100644 test/intemporal/containers/start_runner.clj create mode 100644 test/intemporal/stores/multiple_shutdown_test.clj create mode 100644 test/intemporal/test_executor.clj diff --git a/dev/intemporal/demo_parallelism.clj b/dev/intemporal/demo_parallelism.clj index 296c627..9d42081 100644 --- a/dev/intemporal/demo_parallelism.clj +++ b/dev/intemporal/demo_parallelism.clj @@ -26,7 +26,7 @@ @(p/all proms))) (def mstore (store/make-store)) -(def ex (w/start-worker! mstore {:protocols {`ThreadActivity (->ThreadActivityImpl)}})) +(def stop-worker (w/start-worker! mstore {:protocols {`ThreadActivity (->ThreadActivityImpl)}})) ;; note that in cljs, this returns a promise (def res (w/with-env {:store mstore} diff --git a/src/intemporal/macros.cljc b/src/intemporal/macros.cljc index 93865a0..a66f574 100644 --- a/src/intemporal/macros.cljc +++ b/src/intemporal/macros.cljc @@ -104,11 +104,11 @@ ;; an embedded workflow engine doesn't need to have a task per invocation (t/log! {:level :debug :_data {:env i/*env* :task task#}} ["Invoking task with id " id#]) (trace! {:name (format "activity: %s" (symbol fvar#)) :attributes {:task-id id#}} - (let [res# (i/resume-task i/*env* store# protos# task#)] - (macros/case - :cljs res# - :clj (deref res#)))))))) - ;(w/enqueue-and-wait i/*env* task#))))) + (w/enqueue-and-wait i/*env* task#) + #_(let [res# (i/resume-task i/*env* store# protos# task#)] + (macros/case + :cljs res# + :clj (deref res#)))))))) (defmacro stub-protocol "Stub a protocol definition. Opts are currently unused. @@ -159,8 +159,8 @@ id#)] (t/log! {:level :debug :_data {:env i/*env* :task task#}} ["Invoking task with id" id#]) - (i/resume-task i/*env* store# protos# task#)))))))) - ;(w/enqueue-and-wait i/*env* task#)))))))) + ;(i/resume-task i/*env* store# protos# task#)))))))) + (w/enqueue-and-wait i/*env* task#)))))))) :clj #_{:clj-kondo/ignore [:unresolved-symbol]} @@ -203,11 +203,15 @@ (t/log! {:level :debug :_data {:env i/*env* :task task#}} ["Invoking task with id" id#]) (if (:vthread? i/*env*) + (trace-async! {:name (format "activity: %s" aid#) :attributes {:task-id id# :protocol (-> ~proto :var symbol)}} @(i/resume-task i/*env* store# protos# task#)) + #_ + (trace! {:name (format "activity: %s" aid#) :attributes {:task-id id# :protocol (-> ~proto :var symbol)}} + (w/enqueue-and-wait i/*env* task#)) (trace! {:name (format "activity: %s" aid#) :attributes {:task-id id# :protocol (-> ~proto :var symbol)}} - @(i/resume-task i/*env* store# protos# task#))))))))))) - ;(w/enqueue-and-wait i/*env* task#))))))))) + ;@(i/resume-task i/*env* store# protos# task#))))))))))) + (w/enqueue-and-wait i/*env* task#))))))))))) (defmacro with-failure "Runs `body`, ensuring that if it fails, compensation will always run. diff --git a/src/intemporal/workflow.cljc b/src/intemporal/workflow.cljc index 08efc8c..bd1f5c2 100644 --- a/src/intemporal/workflow.cljc +++ b/src/intemporal/workflow.cljc @@ -43,44 +43,52 @@ (defprotocol ITaskExecutor (submit [this f] "Submits the function `f` for execution") (shutdown [this grace-period-ms] "Shuts down the task executor") - (running? [this] "Indicates if the executor is running")) + (terminated? [this] "Indicates if the executor has terminated") + (shutting-down? [this] "Indicates if the executor has entered shutdown state")) (defn make-task-executor "Creates an object that satisfies `ITaskExecutor`." [] - (let [run? (atom true)] + (let [terminated? (atom false) + shutdown? (atom false)] #?(:cljs (reify ITaskExecutor (submit [_ f] - (when @run? + (when (not @terminated?) (p/vthread (f)))) (shutdown [_ grace-period-ms] (t/log! {:level :debug} ["Executor shutdown"]) - (reset! run? false)) - (running? [_] - @run?)) + (reset! terminated? true) + (reset! shutdown? true)) + (terminated? [_] @terminated?) + (shutting-down? [_] @shutdown?)) :clj - (let [factory (-> (Thread/ofVirtual) - (.name "Task Thread") - (.factory)) - exec (Executors/newThreadPerTaskExecutor factory) - running? (atom true)] + (let [factory (-> (Thread/ofVirtual) + (.name "Task Thread") + (.factory)) + exec (Executors/newThreadPerTaskExecutor factory)] (reify ITaskExecutor (submit [_ f] (.submit exec ^Runnable f)) (shutdown [_ grace-period-ms] (try + ;; reject tasks (.shutdown exec) + (reset! shutdown? true) (t/log! {:level :debug} ["Executor shutdown"]) + ;; await ongoing tasks (when-not (.awaitTermination exec grace-period-ms TimeUnit/MILLISECONDS) (t/log! {:level :debug} ["Executor shutdown grace period over, shutting down NOW"]) (.shutdownNow exec)) ;; in case we got interrupted exception, make sure to set the flag + ;; so ongoing ops fail (finally - (reset! running? false)))) - (running? [_] - @running?) + (reset! terminated? true)))) + (terminated? [_] + @terminated?) + (shutting-down? [_] + @shutdown?) ;; allow expressions like (with-open [executor (w/start-poller .... AutoCloseable (close [this] @@ -88,16 +96,17 @@ (defn- worker-execute-fn "Executes a given protocol, activity or workflow `task`" - [store protocols {:keys [type id root runtime fvar] :as task} task-counter shutting-down?] + [store protocols {:keys [type id root runtime fvar] :as task} task-counter terminated? shutting-down?] (let [runtime (:runtime task) - base-env {:store store - :type type - :ref id - :id id - :root (or root id) - :protos protocols - :next-id (fn [] (str (or root id) "-" (swap! task-counter inc))) - :shutdown? shutting-down?} + base-env {:store store + :type type + :ref id + :id id + :root (or root id) + :protos protocols + :next-id (fn [] (str (or root id) "-" (swap! task-counter inc))) + :terminated? terminated? + :shutdown? shutting-down?} internal-env (merge internal/default-env base-env runtime)] ;; root task: we only enqueue workflows (with-env internal-env @@ -114,27 +123,27 @@ (defn- worker-poll-fn "Continously polls for task while `task-executor` is active." [store protocols task-executor polling-ms] - (let [task-counter (atom 0) - shutting-down? (fn [] (not (running? task-executor)))] + (let [task-counter (atom 0) + stopped? (fn [] (terminated? task-executor)) + shutdown? (fn [] (shutting-down? task-executor))] #_{:clj-kondo/ignore [:loop-without-recur :invalid-arity]} - @(p/loop [] - (-> (p/delay polling-ms) - (p/chain (fn [_] - (loop [] - (t/log! {:level :debug} ["Polling for tasks"]) - (when-let [task (store/dequeue-task store)] - (t/log! {:level :debug :_data {:task task}} ["Dequeued task with id" (:id task)]) - (submit task-executor (fn [] - (worker-execute-fn store protocols task task-counter shutting-down?))) - (when (running? task-executor) - (recur)))) - (when (running? task-executor) - (p/recur)))) - (p/catch (fn [e] - (t/log! {:level :warn :data {:exception e}} ["Caught error during task polling, continuing"]) - (when (running? task-executor) - (p/recur)))))))) + (-> (p/delay polling-ms) + (p/chain (fn [_] + (loop [] + (t/log! {:level :debug} ["Polling for tasks"]) + (when-let [task (store/dequeue-task store)] + (t/log! {:level :debug :_data {:task task}} ["Dequeued task with id" (:id task)]) + (submit task-executor (fn [] + (worker-execute-fn store protocols task task-counter stopped? shutdown?))) + (when-not (stopped?) + (recur)))) + (when-not (stopped?) + (p/recur)))) + (p/catch (fn [e] + (t/log! {:level :warn :data {:exception e}} ["Caught error during task polling, continuing"]) + (when-not (stopped?) + (p/recur)))))))) (defn start-poller! "Starts a poller that will submit tasks to the `task-executor`. @@ -159,8 +168,7 @@ (start-worker! store {})) ([store & {:keys [protocols polling-ms] :or {protocols {} polling-ms 100}}] (let [run? (atom true) - task-counter (atom 0) - uid (random-uuid)] + task-counter (atom 0)] (internal/libthread "Worker" #_{:clj-kondo/ignore [:loop-without-recur :invalid-arity]} @(p/loop [] @@ -169,7 +177,9 @@ (when-let [task (store/dequeue-task store)] (t/log! {:level :debug :data {:sym (:sym task)}} ["Dequeued task with id" (:id task)]) (internal/libthread (str "Worker-" (:id task)) - (worker-execute-fn store protocols task task-counter (fn [] (not @run?))))) + (worker-execute-fn store protocols task task-counter + (fn [] (not @run?)) + (fn [] (not @run?))))) (when @run? (p/recur))))))) @@ -195,6 +205,6 @@ [] (let [thunks (-> internal/*env* :compensations)] (trace! {:name "compensations" :attributes {:fn-count (count @thunks)}} - (doseq [f @thunks] - (swap! thunks pop) - (f))))) \ No newline at end of file + (doseq [f @thunks] + (swap! thunks pop) + (f))))) \ No newline at end of file diff --git a/src/intemporal/workflow/internal.cljc b/src/intemporal/workflow/internal.cljc index 308409e..1d8f9d0 100644 --- a/src/intemporal/workflow/internal.cljc +++ b/src/intemporal/workflow/internal.cljc @@ -166,7 +166,7 @@ (defn resume-fn-task "Resumes a generic fn call task" - [{:keys [vthread? shutdown?] :as env} store protos {:keys [type proto id root sym fvar args] :as task} [invoke success failure]] + [{:keys [vthread? terminated? shutdown?] :as env} store protos {:keys [type proto id root sym fvar args] :as task} [invoke success failure]] (when (and (= :proto-activity type) (nil? (get protos proto))) (throw (ex-info (str "Protocol implementation for " @@ -181,7 +181,7 @@ (t/log! {:level :debug :sym sym} ["Resuming try/catch task with id" id]) (try - (let [shutting-down? (fn [] (and (ifn? shutdown?) (shutdown?))) ;; TODO fix this hack + (let [terminated? (fn [] (and (ifn? terminated?) (terminated?))) [inv? res?] (all-events store id)] ;; mark invoke/replay @@ -208,7 +208,7 @@ ;; TODO assert r is serializable! ;; we check for shutdown because in js runtime, there is no thread interruption ;; at this point, if we are shutting down it means we exhausted the grace period - (let [panic? (shutting-down?)] + (let [panic? (terminated?)] (try (if panic? (task<-panic store id (error/panic "Worker shutting down during invocation result handling")) @@ -221,7 +221,7 @@ (t/log! {:level :debug :data {:sym sym :result r}} ["Shutting down, interrupted result" id]) (t/log! {:level :debug :data {:sym sym :result r}} ["Got actual function result for task" id])))))) handle-fail (bfn [e] - (if (shutting-down?) + (if (terminated?) (do (t/log! {:level :warn :data {:exception e}} ["Exception caught during shutdown, panicking task"]) (task<-panic store id (error/panic "Worker shutting down during invocation failure handling"))) diff --git a/test/intemporal/containers/execute_workflow.clj b/test/intemporal/containers/execute_workflow.clj deleted file mode 100644 index 9564abd..0000000 --- a/test/intemporal/containers/execute_workflow.clj +++ /dev/null @@ -1,42 +0,0 @@ -(ns ^:integration ^:fdb ^:sql intemporal.containers.execute-workflow - (:gen-class) - (:require [clojure.string :as str] - [clojure.tools.logging :as log] - [intemporal.workflow :as w] - [intemporal.macros :refer [stub-function defn-workflow]] - [intemporal.containers.start-runner :as runner])) - -(defn foo [ a] - (Thread/sleep (long (rand-int 100))) - [:fun a]) - -(defn-workflow test-workflow [] - (let [pr (stub-function foo) - prr (pr :pr)] - prr)) - -;; serves as an entrypoint for a docker container that can be killed -;; some other container will have to execute the poller/runner -(defn -main [& args] - (let [store-type (-> args first keyword) - workflow-sym (or (-> args second) 'my-workflow) - workflow-fn (requiring-resolve (symbol (str/trim workflow-sym))) - store (get runner/stores store-type)] - (prn args) - (when (nil? store) - (throw (ex-info (format "Unknown store type: %s" store-type) {:store-type store-type}))) - - (when (nil? workflow-fn) - (throw (ex-info (format "Unknown worfklow fn : %s" workflow-sym) {:workflow workflow-sym}))) - - - (w/with-env {:store @store} - (try - (let [res (workflow-fn)] - (println "Response: " res)) - (finally - (log/info "Ready")))))) - -(comment - ;; clj -A:dev:jdbc:fdb -m intemporal.containers.execute-workflow postgres intemporal.containers.execute-workflow/test-workflow - (-main "postgres" "intemporal.containers.execute-workflow/test-workflow")) \ No newline at end of file diff --git a/test/intemporal/containers/start_runner.clj b/test/intemporal/containers/start_runner.clj deleted file mode 100644 index a9cb097..0000000 --- a/test/intemporal/containers/start_runner.clj +++ /dev/null @@ -1,45 +0,0 @@ -(ns ^:integration ^:fdb ^:sql intemporal.containers.start-runner - (:gen-class) - (:require [clojure.tools.logging :as log] - [intemporal.store.foundationdb :as fdb] - [intemporal.store.jdbc :as jdbc] - [intemporal.workflow :as w]) - (:import (sun.misc Signal SignalHandler))) - -(def stores {:fdb (delay (fdb/make-store {:cluster-file-path (or (System/getenv "FDB_CLUSTERFILE") - "docker/fdb.cluster")})) - :postgres (delay (jdbc/make-store {:jdbcUrl (or (System/getenv "JDBC_URL") - "jdbc:postgresql://localhost:5432/root?user=root&password=root") - :migration-dir "migrations/postgres"}))}) - -;; serves as an entrypoint for a docker container that can be killed -;; some other container will have to execute the poller/runner -;; clj -A:dev:jdbc:fdb -m intemporal.containers.start-runner postgres -(defn -main [& args] - (let [store-type (-> args first keyword) - store (get stores store-type)] - (when (nil? store) - (throw (ex-info (format "Unknown store type: %s" store-type) {:store-type store-type}))) - (log/info "Starting poller for store" store-type) - - (let [ex (w/start-poller! @store {})] - (log/info "Ready: poller started for store" store-type) - (doto (Thread. (fn [] - (Signal/handle (Signal. "TERM") - (reify SignalHandler - (handle [this sig] - (log/info "Received SIGTERM, shutting down") - (w/shutdown ex 0)))) - (Signal/handle (Signal. "INT") - (reify SignalHandler - (handle [this sig] - (log/info "Received INT, quitting") - (System/exit 0)))) - (log/info "Waiting for SIGTERM/SIGINT...") - (Thread/sleep Integer/MAX_VALUE))) - - (.setDaemon false) - (.start))))) - -(comment - (-main "postgres")) \ No newline at end of file diff --git a/test/intemporal/shutdown_restart_test.clj b/test/intemporal/shutdown_restart_test.clj index 6b54200..825e051 100644 --- a/test/intemporal/shutdown_restart_test.clj +++ b/test/intemporal/shutdown_restart_test.clj @@ -4,7 +4,8 @@ [intemporal.workflow :as w] [matcher-combinators.test :refer [match?]] [intemporal.macros :refer [stub-protocol defn-workflow]] - [intemporal.test-utils :as tu :refer [with-result]]) + [intemporal.test-utils :as tu :refer [with-result]] + [intemporal.test-executor :as te]) (:import (java.util.concurrent CountDownLatch))) ;(t/use-fixtures :once tu/with-trace-logging) @@ -13,39 +14,42 @@ (def executor-shutdown? (CountDownLatch. 1)) (defprotocol MyActivities - (foo [this a])) + (foo [this a]) + (foo2 [this a])) (defrecord MyActivitiesImpl [] MyActivities (foo [this a] (.countDown activity-invoked?) (.await executor-shutdown?) - :foo)) + :foo) + (foo2 [this a] a)) (defn-workflow my-workflow [k] (let [stub (stub-protocol MyActivities {}) - prr (foo stub :pr)] - prr)) + r1 (foo stub :pr) + r2 (foo2 stub :pr)] + [r1 r2])) ;;;; test proper (deftest executor-shutdown-test (testing "failure: task validation fails" - (let [mstore (store/make-store {}) - executor (w/start-poller! mstore {:protocols {`MyActivities (->MyActivitiesImpl)} - :polling-ms 500})] + (let [mstore (store/make-store {}) + custom-ex (te/make-test-executor (fn [] (.countDown executor-shutdown?)) nil) + executor (w/start-poller! mstore custom-ex {:protocols {`MyActivities (->MyActivitiesImpl)} + :polling-ms 500})] (testing "shutdown of ongoing workflow" (future ;; ensure activity is inflight (.await activity-invoked?) + ;; immediately countdown the latch (w/shutdown executor 0) - ;; proceed activity, it will fail - (.countDown executor-shutdown?) - (is (not (w/running? executor)))) + (is (w/shutting-down? executor))) (with-result [res (w/with-env {:store mstore} - (my-workflow :ok))] + (my-workflow :ok))] (is (instance? Exception res)) @@ -53,14 +57,17 @@ (tu/print-tables mstore) (testing "workflow task" - (let [[w1] (store/list-tasks mstore)] + (let [tasks (store/list-tasks mstore) + [w1] tasks] (is (match? {:type :workflow :sym 'intemporal.shutdown-restart-test/my-workflow- :state :pending} w1)) - (testing "workflow events" - (let [[e1 e2 e3] (store/list-events mstore)] + (testing "workflow events: workflow has not finished" + (let [[e1 e2] (store/list-events mstore)] (is (match? {:type :intemporal.workflow/invoke :sym 'intemporal.shutdown-restart-test/my-workflow-} e1)) (is (match? {:type :intemporal.protocol/invoke :sym 'intemporal.shutdown-restart-test/foo} e2)) - (is (nil? e3)))) + + (let [[w1] (store/list-tasks mstore)] + (is (match? {:type :workflow :sym 'intemporal.shutdown-restart-test/my-workflow- :state :pending} w1))))) (testing "workflow resumes" (with-open [_ (w/start-poller! mstore {:protocols {`MyActivities (->MyActivitiesImpl)} diff --git a/test/intemporal/stores/lots_of_workflows_test.clj b/test/intemporal/stores/lots_of_workflows_test.clj index 61034bb..6d5d96b 100644 --- a/test/intemporal/stores/lots_of_workflows_test.clj +++ b/test/intemporal/stores/lots_of_workflows_test.clj @@ -61,11 +61,11 @@ ;; lets wait for all pending (try (wait (not (contains? (into #{} (map :state (store/list-tasks store))) :new)) - (w/shutdown ex 5000)) + (w/shutdown ex 10000)) (testing "workflows are all completed" - (let [wflows (store/list-tasks store)] - (is (= iterations (count wflows))) - (is (= #{:success} (set (map :state wflows)))))) + (let [tasks (store/list-tasks store)] + (is (= (* 2 iterations) (count tasks))) + (is (= #{:success} (set (map :state tasks)))))) (finally (w/shutdown ex 0))))))))) diff --git a/test/intemporal/stores/multiple_shutdown_test.clj b/test/intemporal/stores/multiple_shutdown_test.clj new file mode 100644 index 0000000..449309d --- /dev/null +++ b/test/intemporal/stores/multiple_shutdown_test.clj @@ -0,0 +1,75 @@ +(ns intemporal.stores.multiple-shutdown-test + (:require [clojure.test :as t :refer [deftest is testing]] + [intemporal.store :as store] + [intemporal.store.foundationdb :as fdb] + [intemporal.store.jdbc :as jdbc] + [intemporal.workflow :as w] + [matcher-combinators.test :refer [match?]] + [intemporal.macros :refer [stub-protocol defn-workflow]] + [intemporal.test-utils :as tu :refer [with-result]])) + +(t/use-fixtures :once tu/with-trace-logging) + +(def stores (delay {:memory (store/make-store) + :fdb (fdb/make-store {:cluster-file-path "docker/fdb.cluster"}) + :postgres (jdbc/make-store {:jdbcUrl "jdbc:postgresql://localhost:5432/root?user=root&password=root" + :migration-dir "migrations/postgres"})})) + +(defprotocol MyActivities + (sleep [this time])) + +(defrecord MyActivitiesImpl [] + MyActivities + (sleep [this a] + (Thread/sleep (long a)))) + +(defn-workflow my-workflow [steps max-sleep] + (let [stub (stub-protocol MyActivities {})] + (dotimes [_ steps] + (sleep stub (rand-int max-sleep))) + :done)) + +;;;; test proper + +(deftest executor-shutdown-test + (testing "workflow eventually finishes" + (let [store (store/make-store {}) + polling-ms 100 + executor (w/start-poller! store {:protocols {`MyActivities (->MyActivitiesImpl)} + :polling-ms polling-ms}) + steps 3 + max-timeout 500 + max-wait (+ 1000 ;; leeway + (* steps polling-ms) + (* steps max-timeout)) + future-res (future + (w/with-env {:store store} + (my-workflow steps max-timeout)))] + + (testing "shutdown of ongoing workflow" + (let [start (System/currentTimeMillis)] + (loop [ex executor] + (store/reenqueue-pending-tasks store #(println "XXX" %)) + + (cond + (future-done? future-res) + (do + (is (= :done (deref future-res))) + (w/shutdown ex 0) + (tu/print-tables store)) + + (> (- (System/currentTimeMillis) start) max-wait) + (do + (future-cancel future-res) + (w/shutdown ex 0) + (tu/print-tables store) + (throw (ex-info "Timeout done, future not finished" {:timeout max-wait}))) + + :else + (do + ;; give a step a chance to finish + (w/shutdown ex max-wait) + (recur (w/start-poller! store {:protocols {`MyActivities (->MyActivitiesImpl)} + :polling-ms 100})))))))))) + + diff --git a/test/intemporal/stores/release_reenqueue_test.clj b/test/intemporal/stores/release_reenqueue_test.clj index a20cc26..ef2da8f 100644 --- a/test/intemporal/stores/release_reenqueue_test.clj +++ b/test/intemporal/stores/release_reenqueue_test.clj @@ -53,7 +53,7 @@ (w/shutdown executor 0) ;; proceed activity, it will fail (.countDown executor-shutdown?) - (is (not (w/running? executor)))) + (is (w/shutting-down? executor))) (with-result [res (w/with-env {:store store} (my-workflow :ok))] diff --git a/test/intemporal/test_executor.clj b/test/intemporal/test_executor.clj new file mode 100644 index 0000000..31a38c7 --- /dev/null +++ b/test/intemporal/test_executor.clj @@ -0,0 +1,42 @@ +(ns intemporal.test-executor + (:require [intemporal.workflow :as w] + [taoensso.telemere :as t]) + (:import (java.lang AutoCloseable) + (java.util.concurrent Executors TimeUnit))) + +(defn make-test-executor [shutdown-fn? terminated-fn?] + (let [factory (-> (Thread/ofVirtual) + (.name "Task Thread") + (.factory)) + exec (Executors/newThreadPerTaskExecutor factory) + shutdown? (atom false) + terminated? (atom false)] + (reify + w/ITaskExecutor + (submit [_ f] + (.submit exec ^Runnable f)) + (shutdown [_ grace-period-ms] + (try + ;; reject tasks + (.shutdown exec) + (when (ifn? shutdown-fn?) + (shutdown-fn?)) + (reset! shutdown? true) + (t/log! {:level :debug} ["Executor shutdown"]) + ;; await ongoing tasks + (when-not (.awaitTermination exec grace-period-ms TimeUnit/MILLISECONDS) + (t/log! {:level :debug} ["Executor shutdown grace period over, shutting down NOW"]) + (.shutdownNow exec)) + ;; in case we got interrupted exception, make sure to set the flag + ;; so ongoing ops fail + (finally + (when (ifn? terminated-fn?) + (terminated-fn?)) + (reset! terminated? true)))) + (terminated? [_] + @terminated?) + (shutting-down? [_] + @shutdown?) + AutoCloseable + (close [this] + (w/shutdown this 0))))) \ No newline at end of file diff --git a/test/intemporal/test_utils.cljc b/test/intemporal/test_utils.cljc index d2e04fc..d672247 100644 --- a/test/intemporal/test_utils.cljc +++ b/test/intemporal/test_utils.cljc @@ -64,8 +64,12 @@ (let [tasks (store/list-tasks store) events (->> (store/list-events store) (sort-by :id))] - (pprint/print-table tasks) - (pprint/print-table events))) + (locking *out* + (println "==================== TASKS") + (pprint/print-table tasks) + (println "==================== EVENTS") + (pprint/print-table events) + (flush)))) (defn wait-for-task "Waits for the task with given id to reach terminal state" diff --git a/test/intemporal/vthread_test.cljc b/test/intemporal/vthread_test.cljc index 29125e3..7d751c1 100644 --- a/test/intemporal/vthread_test.cljc +++ b/test/intemporal/vthread_test.cljc @@ -40,17 +40,18 @@ (p/all proms))) (deftest workflow-with-vthread-test - (let [sleep-time (+ 3000 (rand-int 500))] + (let [sleep-time (+ 1000 (rand-int 500))] (testing "workflow" (let [mstore (store/make-store) - executor (w/start-poller! mstore {:protocols {`ThreadActivity (->ThreadActivityImpl)}}) + executor (w/start-poller! mstore {:protocols {`ThreadActivity (->ThreadActivityImpl)} + :polling-ms 10}) start (store/now)] ;; cljs runtimes return promises ;; clj runtime will run synchronously (with-result [v (w/with-env {:store mstore} - (my-workflow sleep-time))] + (my-workflow sleep-time))] (testing "result" (is (= (range 10) diff --git a/test/intemporal/workflow_test.cljc b/test/intemporal/workflow_test.cljc index 82e754f..4d64941 100644 --- a/test/intemporal/workflow_test.cljc +++ b/test/intemporal/workflow_test.cljc @@ -109,7 +109,7 @@ (is (match? {:type :workflow :sym 'intemporal.workflow-test/my-workflow- :state :success} w1))) (testing "workflow uuid" - (is (every? #(= @uuid-store %) (map :id tasks))) + (is (some #(= @uuid-store %) (map :id tasks))) (is (= @uuid-store workflow-id))))) (w/shutdown ex 1000))))) From 9cc624e98534a2182586cec2d21a47e16e93b72a Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Tue, 23 Dec 2025 12:54:45 +0000 Subject: [PATCH 22/26] fix bad ex state --- src/intemporal/error.cljc | 13 +++- src/intemporal/workflow.cljc | 22 ++++-- src/intemporal/workflow/internal.cljc | 15 +++- .../stores/multiple_shutdown_test.clj | 78 +++++++++---------- 4 files changed, 80 insertions(+), 48 deletions(-) diff --git a/src/intemporal/error.cljc b/src/intemporal/error.cljc index fc51bc7..85ca2a6 100644 --- a/src/intemporal/error.cljc +++ b/src/intemporal/error.cljc @@ -1,4 +1,15 @@ -(ns intemporal.error) +(ns intemporal.error + #?(:clj (:import [java.lang InterruptedException] + [java.util.concurrent RejectedExecutionException]))) + + +(defn interrupted? [e] + #?(:clj (instance? InterruptedException e) + :cljs false)) + +(defn rejected? [e] + #?(:clj (instance? RejectedExecutionException e) + :cljs false)) (defn internal-error? [ex] (when-let [t (-> ex ex-data ::type)] diff --git a/src/intemporal/workflow.cljc b/src/intemporal/workflow.cljc index bd1f5c2..1d05a24 100644 --- a/src/intemporal/workflow.cljc +++ b/src/intemporal/workflow.cljc @@ -1,13 +1,15 @@ (ns intemporal.workflow (:require [intemporal.store :as store] [intemporal.workflow.internal :as internal] + [intemporal.error :as error] [promesa.core :as p] [taoensso.telemere :as t]) #?(:cljs (:require-macros #_:clj-kondo/ignore [intemporal.workflow.internal :refer [with-env-internal trace! trace-async!]] [intemporal.workflow :refer [with-env]])) - #?(:clj (:require [intemporal.workflow.internal :refer [trace! trace-async!]] + #?(:clj (:require [intemporal.error :as error] + [intemporal.workflow.internal :refer [trace! trace-async!]] [steffan-westcott.clj-otel.context :as otctx])) #?(:clj (:import [java.util.concurrent Executors TimeUnit] [java.lang AutoCloseable]))) @@ -131,11 +133,21 @@ (-> (p/delay polling-ms) (p/chain (fn [_] (loop [] - (t/log! {:level :debug} ["Polling for tasks"]) - (when-let [task (store/dequeue-task store)] + ;(t/log! {:level :debug} ["Polling for tasks..."]) + ;; TODO add another check for shutting-down? + (when-let [task (and + (not (shutting-down? task-executor)) + (store/dequeue-task store))] (t/log! {:level :debug :_data {:task task}} ["Dequeued task with id" (:id task)]) - (submit task-executor (fn [] - (worker-execute-fn store protocols task task-counter stopped? shutdown?))) + (try + (submit task-executor (fn [] + (worker-execute-fn store protocols task task-counter stopped? shutdown?))) + (catch #?(:clj Exception :cljs js/Error) e + ;; dequeued updated the state atomically (so other txs dont do the same) + ;; but if the executor stopped in the meantime we need to revert the task's state + (when (error/rejected? e) + (t/log! {:level :warn} ["Task execution rejected, reverting state to :new"]) + (store/enqueue-task store (assoc task :state :new))))) (when-not (stopped?) (recur)))) (when-not (stopped?) diff --git a/src/intemporal/workflow/internal.cljc b/src/intemporal/workflow/internal.cljc index 1d8f9d0..2f54631 100644 --- a/src/intemporal/workflow/internal.cljc +++ b/src/intemporal/workflow/internal.cljc @@ -221,14 +221,27 @@ (t/log! {:level :debug :data {:sym sym :result r}} ["Shutting down, interrupted result" id]) (t/log! {:level :debug :data {:sym sym :result r}} ["Got actual function result for task" id])))))) handle-fail (bfn [e] - (if (terminated?) + (cond + ;; if its a java.lang.InterruptedException it means + ;; we killed the executor + ;; - we must leave the task pending (assuming its idempotent) + (error/interrupted? e) + (t/log! {:level :debug :data {:sym sym :exception e}} ["Exception caught during actual function invocation for task" id]) + + ;; executor has terminated, it means we exhausted the graceful shutdown period + ;; panic the task + (terminated?) (do (t/log! {:level :warn :data {:exception e}} ["Exception caught during shutdown, panicking task"]) (task<-panic store id (error/panic "Worker shutting down during invocation failure handling"))) + + ;; regular task failure + :else (do (t/log! {:level :debug :data {:sym sym :exception e}} ["Exception caught during actual function invocation for task" id]) (task<-event store id (cond-> (assoc next-failure :error e) (error/internal-error? e) (assoc :type ::failure))))) + ;; finally, return error (p/rejected e)) retval (cond ;; are we replaying a result? diff --git a/test/intemporal/stores/multiple_shutdown_test.clj b/test/intemporal/stores/multiple_shutdown_test.clj index 449309d..ef304cf 100644 --- a/test/intemporal/stores/multiple_shutdown_test.clj +++ b/test/intemporal/stores/multiple_shutdown_test.clj @@ -4,72 +4,68 @@ [intemporal.store.foundationdb :as fdb] [intemporal.store.jdbc :as jdbc] [intemporal.workflow :as w] - [matcher-combinators.test :refer [match?]] [intemporal.macros :refer [stub-protocol defn-workflow]] - [intemporal.test-utils :as tu :refer [with-result]])) + [intemporal.test-utils :as tu] + [matcher-combinators.test :refer [match?]] + [matcher-combinators.matchers :as m])) -(t/use-fixtures :once tu/with-trace-logging) +;(t/use-fixtures :once tu/with-trace-logging) (def stores (delay {:memory (store/make-store) :fdb (fdb/make-store {:cluster-file-path "docker/fdb.cluster"}) :postgres (jdbc/make-store {:jdbcUrl "jdbc:postgresql://localhost:5432/root?user=root&password=root" :migration-dir "migrations/postgres"})})) +(def activity-counter (atom 0)) + (defprotocol MyActivities (sleep [this time])) (defrecord MyActivitiesImpl [] MyActivities (sleep [this a] + (swap! activity-counter inc) (Thread/sleep (long a)))) (defn-workflow my-workflow [steps max-sleep] (let [stub (stub-protocol MyActivities {})] - (dotimes [_ steps] - (sleep stub (rand-int max-sleep))) + (dotimes [i steps] + (sleep stub max-sleep)) :done)) ;;;; test proper (deftest executor-shutdown-test (testing "workflow eventually finishes" - (let [store (store/make-store {}) - polling-ms 100 - executor (w/start-poller! store {:protocols {`MyActivities (->MyActivitiesImpl)} - :polling-ms polling-ms}) - steps 3 - max-timeout 500 - max-wait (+ 1000 ;; leeway - (* steps polling-ms) - (* steps max-timeout)) - future-res (future - (w/with-env {:store store} - (my-workflow steps max-timeout)))] + (let [store (store/make-store {}) + polling-ms 500 + make-poller (fn [] (w/start-poller! store {:protocols {`MyActivities (->MyActivitiesImpl)} + :polling-ms polling-ms})) + executor (atom (make-poller)) - (testing "shutdown of ongoing workflow" - (let [start (System/currentTimeMillis)] - (loop [ex executor] - (store/reenqueue-pending-tasks store #(println "XXX" %)) + steps 2 + max-timeout 500 - (cond - (future-done? future-res) - (do - (is (= :done (deref future-res))) - (w/shutdown ex 0) - (tu/print-tables store)) - - (> (- (System/currentTimeMillis) start) max-wait) - (do - (future-cancel future-res) - (w/shutdown ex 0) - (tu/print-tables store) - (throw (ex-info "Timeout done, future not finished" {:timeout max-wait}))) - - :else - (do - ;; give a step a chance to finish - (w/shutdown ex max-wait) - (recur (w/start-poller! store {:protocols {`MyActivities (->MyActivitiesImpl)} - :polling-ms 100})))))))))) + workflow-id "f100ded0-0000-4000-a000-000000000000" + future-res (future + (w/with-env {:store store + :id workflow-id} + (my-workflow steps max-timeout))) + reenqueued (atom [])] + (testing "shutdown of ongoing workflow" + (add-watch activity-counter ::watch (fn [_ _ _ v] + (when (and (zero? (mod v 2)) + (empty? @reenqueued)) + (w/shutdown @executor max-timeout) + (store/reenqueue-pending-tasks store #(swap! reenqueued conj %)) + (reset! executor (make-poller))))) + (try + (is (= :done (deref future-res 10000 ::error))) + (finally + (testing "workflow was re-enqueued" + (is (match? (m/embeds [{:type :workflow :sym 'intemporal.stores.multiple-shutdown-test/my-workflow-}]) + @reenqueued))) + (w/shutdown @executor 0) + (tu/print-tables store))))))) From 7dae1b301240f7580fa1be996fa1566c07e6f92c Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Tue, 23 Dec 2025 23:07:43 +0000 Subject: [PATCH 23/26] replace with phaser --- src/intemporal/store/jdbc.clj | 3 +- src/intemporal/workflow/internal.cljc | 2 +- test/intemporal/shutdown_restart_test.clj | 2 +- test/intemporal/store_test.cljc | 4 +- test/intemporal/stores/basic_test.clj | 6 +-- .../stores/lots_of_workflows_test.clj | 4 +- .../stores/multiple_shutdown_test.clj | 2 +- .../stores/release_reenqueue_test.clj | 41 ++++++++++--------- test/intemporal/stores/saga_test.clj | 4 +- test/intemporal/test_utils.cljc | 6 +-- test/intemporal/vthread_recovery_test.clj | 4 +- 11 files changed, 41 insertions(+), 37 deletions(-) diff --git a/src/intemporal/store/jdbc.clj b/src/intemporal/store/jdbc.clj index 052af29..06abfd2 100644 --- a/src/intemporal/store/jdbc.clj +++ b/src/intemporal/store/jdbc.clj @@ -203,7 +203,8 @@ (reenqueue-pending-tasks [this f] (let [tasks? (jdbc/with-transaction [tx db-spec] (let [tasks (jdbc/execute! tx ["select * from tasks where state='pending' and (owner is null or owner=?)" owner] default-opts)] - (jdbc/execute-one! tx ["update tasks set state='new', owner=? where id = ANY(?)" owner (into-array (mapv :id tasks))]) + (jdbc/execute-one! tx ["update tasks set state='new', owner=? where id = ANY(?)" owner + (into-array String (mapv :id tasks))]) (doseq [row tasks] (f (db->task row))) tasks))] diff --git a/src/intemporal/workflow/internal.cljc b/src/intemporal/workflow/internal.cljc index 2f54631..28f8009 100644 --- a/src/intemporal/workflow/internal.cljc +++ b/src/intemporal/workflow/internal.cljc @@ -226,7 +226,7 @@ ;; we killed the executor ;; - we must leave the task pending (assuming its idempotent) (error/interrupted? e) - (t/log! {:level :debug :data {:sym sym :exception e}} ["Exception caught during actual function invocation for task" id]) + (t/log! {:level :debug :data {:sym sym}} ["InterruptedException caught during actual function invocation for task" id]) ;; executor has terminated, it means we exhausted the graceful shutdown period ;; panic the task diff --git a/test/intemporal/shutdown_restart_test.clj b/test/intemporal/shutdown_restart_test.clj index 825e051..91ab757 100644 --- a/test/intemporal/shutdown_restart_test.clj +++ b/test/intemporal/shutdown_restart_test.clj @@ -33,7 +33,7 @@ ;;;; test proper -(deftest executor-shutdown-test +(deftest shutdown-restart-test (testing "failure: task validation fails" (let [mstore (store/make-store {}) custom-ex (te/make-test-executor (fn [] (.countDown executor-shutdown?)) nil) diff --git a/test/intemporal/store_test.cljc b/test/intemporal/store_test.cljc index c7ab769..a1c3638 100644 --- a/test/intemporal/store_test.cljc +++ b/test/intemporal/store_test.cljc @@ -1,11 +1,11 @@ (ns intemporal.store-test - (:require [clojure.test :as t :refer [deftest is testing use-fixtures]] + (:require [clojure.test :as t :refer [deftest is testing]] [intemporal.test-utils :as tu] [intemporal.store :as s] [matcher-combinators.test :refer [match?]] [promesa.core :as p])) -(use-fixtures :once tu/with-trace-logging) +(t/use-fixtures :once tu/with-trace-logging) (defn- is-promise-ok [prom] (-> prom diff --git a/test/intemporal/stores/basic_test.clj b/test/intemporal/stores/basic_test.clj index 71dfb36..5b2ee44 100644 --- a/test/intemporal/stores/basic_test.clj +++ b/test/intemporal/stores/basic_test.clj @@ -1,5 +1,5 @@ (ns ^:integration ^:fdb ^:sql intemporal.stores.basic-test - (:require [clojure.test :refer [deftest is testing use-fixtures]] + (:require [clojure.test :as t :refer [deftest is testing]] [intemporal.store :as store] [intemporal.store.foundationdb :as fdb] [intemporal.store.jdbc :as jdbc] @@ -8,14 +8,14 @@ [intemporal.matchers :refer [nilable?]] [matcher-combinators.test :refer [match?]])) -(use-fixtures :once tu/with-trace-logging) +(t/use-fixtures :once tu/with-trace-logging) (def stores (delay {:memory (store/make-store) :fdb (fdb/make-store {:cluster-file-path "docker/fdb.cluster"}) :postgres (jdbc/make-store {:jdbcUrl "jdbc:postgresql://localhost:5432/root?user=root&password=root" :migration-dir "migrations/postgres"})})) -(deftest stores-test +(deftest basic-test (doseq [[label store] @stores] (testing (format "store: %s" label) (let [evt {:ref "some-ref" :root "some-root" diff --git a/test/intemporal/stores/lots_of_workflows_test.clj b/test/intemporal/stores/lots_of_workflows_test.clj index 6d5d96b..b256a6d 100644 --- a/test/intemporal/stores/lots_of_workflows_test.clj +++ b/test/intemporal/stores/lots_of_workflows_test.clj @@ -32,7 +32,7 @@ (def iterations 100) (def latch (CountDownLatch. iterations)) -(deftest stores-test +(deftest lots-of-workflows-test (with-redefs [tu/wait-default-timeout 60000] (doseq [[label store] @stores] (testing (format "store: %s" label) @@ -61,7 +61,7 @@ ;; lets wait for all pending (try (wait (not (contains? (into #{} (map :state (store/list-tasks store))) :new)) - (w/shutdown ex 10000)) + (w/shutdown ex 20000)) (testing "workflows are all completed" (let [tasks (store/list-tasks store)] diff --git a/test/intemporal/stores/multiple_shutdown_test.clj b/test/intemporal/stores/multiple_shutdown_test.clj index ef304cf..03144f6 100644 --- a/test/intemporal/stores/multiple_shutdown_test.clj +++ b/test/intemporal/stores/multiple_shutdown_test.clj @@ -9,7 +9,7 @@ [matcher-combinators.test :refer [match?]] [matcher-combinators.matchers :as m])) -;(t/use-fixtures :once tu/with-trace-logging) +(t/use-fixtures :once tu/with-trace-logging) (def stores (delay {:memory (store/make-store) :fdb (fdb/make-store {:cluster-file-path "docker/fdb.cluster"}) diff --git a/test/intemporal/stores/release_reenqueue_test.clj b/test/intemporal/stores/release_reenqueue_test.clj index ef2da8f..a41c32d 100644 --- a/test/intemporal/stores/release_reenqueue_test.clj +++ b/test/intemporal/stores/release_reenqueue_test.clj @@ -1,22 +1,22 @@ (ns ^:integration ^:fdb ^:sql intemporal.stores.release-reenqueue-test - (:require [clojure.test :refer [deftest is testing use-fixtures]] + (:require [clojure.test :as t :refer [deftest is testing]] [intemporal.store :as store] [intemporal.store.foundationdb :as fdb] [intemporal.store.jdbc :as jdbc] [intemporal.workflow :as w] [intemporal.macros :refer [stub-protocol defn-workflow]] [intemporal.test-utils :as tu :refer [with-result]]) - (:import (java.util.concurrent CountDownLatch))) + (:import (java.util.concurrent Phaser))) -(use-fixtures :once tu/with-trace-logging) +(t/use-fixtures :once tu/with-trace-logging) (def stores (delay {:memory (store/make-store) :fdb (fdb/make-store {:cluster-file-path "docker/fdb.cluster"}) :postgres (jdbc/make-store {:jdbcUrl "jdbc:postgresql://localhost:5432/root?user=root&password=root" :migration-dir "migrations/postgres"})})) -(def activity-invoked? (CountDownLatch. 1)) -(def executor-shutdown? (CountDownLatch. 1)) +(def activity-invoked? (Phaser. 1)) +(def executor-shutdown? (Phaser. 1)) (defprotocol MyActivities (foo [this a])) @@ -24,8 +24,8 @@ (defrecord MyActivitiesImpl [] MyActivities (foo [this a] - (.countDown activity-invoked?) - (.await executor-shutdown?) + (.arrive activity-invoked?) + (.awaitAdvance executor-shutdown? (.getPhase executor-shutdown?)) :foo)) (defn-workflow my-workflow [k] @@ -43,25 +43,28 @@ (testing (format "store: %s" label) (let [executor (w/start-poller! store {:protocols {`MyActivities (->MyActivitiesImpl)} - :polling-ms 500})] + :polling-ms 100})] (testing "shutdown of ongoing workflow" ;; give it some time so the poller can pick it up but just once - (future - ;; ensure activity is inflight - (.await activity-invoked?) - (w/shutdown executor 0) - ;; proceed activity, it will fail - (.countDown executor-shutdown?) - (is (w/shutting-down? executor))) + (let [fut (future + ;; ensure activity is inflight + (.awaitAdvance activity-invoked? (.getPhase activity-invoked?)) + (w/shutdown executor 0) + ;; proceed activity, it will fail + (.arrive executor-shutdown?) + :done)] - (with-result [res (w/with-env {:store store} - (my-workflow :ok))] + (with-result [res (w/with-env {:store store} + (my-workflow :ok))] - (is (instance? Exception res)))) + (is (instance? Exception res))) + + (is (= :done (deref fut 1000 ::error))))) (testing "Tasks are pending" (let [[task] (store/list-tasks store)] + (tu/print-tables store) (is (= :pending (:state task))))) (testing "Tasks are released" @@ -70,7 +73,7 @@ (is (nil? (:owner task))))) (testing "Tasks are reenqueued" - (store/reenqueue-pending-tasks store prn) + (store/reenqueue-pending-tasks store (constantly nil)) (let [[task] (store/list-tasks store)] (is (= store/default-owner (:owner task))) (is (= :new (:state task))))))))) diff --git a/test/intemporal/stores/saga_test.clj b/test/intemporal/stores/saga_test.clj index ae67ed3..03b95d0 100644 --- a/test/intemporal/stores/saga_test.clj +++ b/test/intemporal/stores/saga_test.clj @@ -1,5 +1,5 @@ (ns ^:integration ^:fdb ^:sql intemporal.stores.saga-test - (:require [clojure.test :refer [deftest is testing use-fixtures]] + (:require [clojure.test :as t :refer [deftest is testing]] [intemporal.store :as store] [intemporal.store.foundationdb :as fdb] [intemporal.store.jdbc :as jdbc] @@ -9,7 +9,7 @@ [spy.core :as spy] [spy.protocol :as pspy])) -(use-fixtures :once tu/with-trace-logging) +(t/use-fixtures :once tu/with-trace-logging) (defprotocol ProtocolActivity (some-io [this val]) diff --git a/test/intemporal/test_utils.cljc b/test/intemporal/test_utils.cljc index d672247..3dfa5e0 100644 --- a/test/intemporal/test_utils.cljc +++ b/test/intemporal/test_utils.cljc @@ -65,9 +65,9 @@ events (->> (store/list-events store) (sort-by :id))] (locking *out* - (println "==================== TASKS") + (print "==================== TASKS") (pprint/print-table tasks) - (println "==================== EVENTS") + (println "\n==================== EVENTS") (pprint/print-table events) (flush)))) @@ -164,7 +164,7 @@ (recur))))))) (defn setup-telemere [] - #?(:clj (clojure.pprint/pprint (telemere/check-interop))) + ;#?(:clj (clojure.pprint/pprint (telemere/check-interop))) (telemere/set-min-level! :trace) (telemere/remove-handler! ::custom) #?(:clj (telemere/add-handler! :default/open-telemetry (tot/handler:open-telemetry))) diff --git a/test/intemporal/vthread_recovery_test.clj b/test/intemporal/vthread_recovery_test.clj index 016d352..bbb6c23 100644 --- a/test/intemporal/vthread_recovery_test.clj +++ b/test/intemporal/vthread_recovery_test.clj @@ -1,6 +1,6 @@ (ns intemporal.vthread-recovery-test (:require [clojure.java.io :as io] - [clojure.test :refer [deftest is testing use-fixtures]] + [clojure.test :as t :refer [deftest is testing]] [intemporal.store :as store] [intemporal.workflow :as w] [intemporal.macros :refer [stub-protocol vthread defn-workflow]] @@ -10,7 +10,7 @@ ;;;; ;; demo - recovery of a crashed process -(use-fixtures :once tu/with-trace-logging) +(t/use-fixtures :once tu/with-trace-logging) (defprotocol ThreadActivity (with-thread [this id])) From 963f3576eb6ff41696f07a4b649a22bb11a1a0f2 Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Wed, 24 Dec 2025 14:15:04 +0000 Subject: [PATCH 24/26] add hikari --- deps.edn | 1 + docker/fdb.cluster | 2 +- src/intemporal/store/jdbc.clj | 40 ++++++++++--------- .../stores/lots_of_workflows_test.clj | 40 +++++++++---------- 4 files changed, 43 insertions(+), 40 deletions(-) diff --git a/deps.edn b/deps.edn index 6468ae7..65872da 100644 --- a/deps.edn +++ b/deps.edn @@ -40,6 +40,7 @@ :jdbc {:extra-deps {com.github.seancorfield/next.jdbc {:mvn/version "1.3.1048"} org.postgresql/postgresql {:mvn/version "42.7.7"} + hikari-cp/hikari-cp {:mvn/version "3.3.0"} migratus/migratus {:mvn/version "1.6.4"}}} :doc {:extra-paths ["doc"]} :cljs {:extra-deps {org.clojure/clojurescript {:mvn/version "1.12.42"} diff --git a/docker/fdb.cluster b/docker/fdb.cluster index 5c41e53..e7aea03 100644 --- a/docker/fdb.cluster +++ b/docker/fdb.cluster @@ -1 +1 @@ -docker:docker@192.168.107.3:4500 +docker:docker@192.168.97.2:4500 diff --git a/src/intemporal/store/jdbc.clj b/src/intemporal/store/jdbc.clj index 06abfd2..274d9bc 100644 --- a/src/intemporal/store/jdbc.clj +++ b/src/intemporal/store/jdbc.clj @@ -1,5 +1,6 @@ (ns intemporal.store.jdbc - (:require [intemporal.store :as store] + (:require [hikari-cp.core :as hikari] + [intemporal.store :as store] [intemporal.workflow.internal :as i] [intemporal.store.internal :as si :refer [serialize deserialize]] [migratus.core :as migratus] @@ -61,9 +62,12 @@ (defn make-store "Creates a new Postgres-based store." - [{:keys [owner migration-dir migrate? watch-polling-ms] + [{:keys [owner migration-dir migrate? watch-polling-ms jdbcUrl] :or {owner store/default-owner migrate? true watch-polling-ms 100} :as opts}] - (let [db-spec (dissoc opts :migration-dir :migrate? :watch-polling-ms) + (let [db-spec (-> opts + (dissoc :migration-dir :migrate? :watch-polling-ms) + (assoc :jdbc-url jdbcUrl)) + datasource (hikari/make-datasource db-spec) config {:store :database :migration-dir migration-dir :db db-spec} @@ -80,7 +84,7 @@ store/HistoryStore (list-events [this] - (->> (jdbc/with-transaction [tx db-spec] + (->> (jdbc/with-transaction [tx datasource] (jdbc/execute! tx ["select * from events"] default-opts)) (map db->event))) @@ -91,29 +95,29 @@ (let [args (serialize args) result (serialize result) - res (jdbc/with-transaction [tx db-spec] + res (jdbc/with-transaction [tx datasource] (jdbc/execute-one! tx ["INSERT INTO events(type, ref, root, sym, args, result) values (?,?,?,?,?,?) RETURNING id" (kw->db type) ref root (str sym) args result] default-opts))] (assoc event :id (:id res)))) (all-events [this task-id] - (->> (jdbc/with-transaction [tx db-spec] + (->> (jdbc/with-transaction [tx datasource] (jdbc/execute! tx ["select * from events where ref=?" task-id] default-opts)) (map db->event))) (clear-events [this] - (jdbc/with-transaction [tx db-spec] + (jdbc/with-transaction [tx datasource] (jdbc/execute! tx ["delete from events"]))) store/TaskStore (list-tasks [this] - (->> (jdbc/with-transaction [tx db-spec] + (->> (jdbc/with-transaction [tx datasource] (jdbc/execute! tx ["select * from tasks where (owner is null or owner=?)" owner] default-opts)) (map db->task))) (task<-panic [this task-id error] - (jdbc/with-transaction [tx db-spec] + (jdbc/with-transaction [tx datasource] (let [updated-task {:result (serialize error)}] (jdbc/execute-one! tx (builder/for-update "tasks" updated-task {:id task-id} default-opts))))) @@ -121,7 +125,7 @@ ;; some redundancy between :result in task and event ;; note that we save the event first, because update-task can trigger some watchers ;; and they would expect the event to be present in the history - (jdbc/with-transaction [tx db-spec] + (jdbc/with-transaction [tx datasource] (let [evt {:ref ref :root root :type type :sym sym :args args} expected-state (cond (some? args) :new @@ -147,13 +151,13 @@ updated-evt)))) (find-task [this id] - (some-> (jdbc/with-transaction [tx db-spec] + (some-> (jdbc/with-transaction [tx datasource] (jdbc/execute-one! tx ["select * from tasks where id=?" id] default-opts)) (db->task))) (watch-task [this id f] (let [query-state! (fn [] - (jdbc/with-transaction [tx db-spec] + (jdbc/with-transaction [tx datasource] (jdbc/execute-one! tx ["select state from tasks where id=?" id] default-opts))) state (query-state!) watch? (atom true)] @@ -161,7 +165,7 @@ (while (and @watch? state) (Thread/sleep (long watch-polling-ms)) (when (not= state (query-state!)) - (let [task (some-> (jdbc/with-transaction [tx db-spec] + (let [task (some-> (jdbc/with-transaction [tx datasource] (jdbc/execute-one! tx ["select * from tasks where id=?" id] default-opts)) (db->task))] (when (and task (f task)) @@ -197,11 +201,11 @@ (wrap-result resolved))))))))) (release-pending-tasks [this] - (jdbc/with-transaction [tx db-spec] + (jdbc/with-transaction [tx datasource] (jdbc/execute-one! tx ["update tasks set owner=null where owner=?" owner]))) (reenqueue-pending-tasks [this f] - (let [tasks? (jdbc/with-transaction [tx db-spec] + (let [tasks? (jdbc/with-transaction [tx datasource] (let [tasks (jdbc/execute! tx ["select * from tasks where state='pending' and (owner is null or owner=?)" owner] default-opts)] (jdbc/execute-one! tx ["update tasks set state='new', owner=? where id = ANY(?)" owner (into-array String (mapv :id tasks))]) @@ -225,7 +229,7 @@ args (serialize args) result (serialize result) runtime (serialize runtime)] - (jdbc/with-transaction [tx db-spec] + (jdbc/with-transaction [tx datasource] (jdbc/execute! tx ["INSERT INTO tasks(id,owner,proto,type,ref,root,sym,args,result,state,lease_end,runtime) values (?,?,?,?,?,?,?,?,?,?,?,?) RETURNING id" id owner proto? (kw->db type) (kw->db ref) (kw->db root) (str sym) args result (kw->db state) lease-end runtime]))) task+owner)) @@ -236,7 +240,7 @@ (dequeue-task [this {:keys [lease-ms]}] ;; TODO check owner (let [query "select * from tasks where (owner=? or owner is null) and (state='new' or lease_end < now()) order by id asc limit 1" - found? (jdbc/with-transaction [tx db-spec] + found? (jdbc/with-transaction [tx datasource] (when-let [task (some-> (jdbc/execute-one! tx [query owner] default-opts) (db->task))] (let [lease-epoch (when lease-ms @@ -254,7 +258,7 @@ found?)) (clear-tasks [this] - (jdbc/with-transaction [tx db-spec] + (jdbc/with-transaction [tx datasource] (jdbc/execute! tx ["delete from tasks"])))))) #_:clj-kondo/ignore diff --git a/test/intemporal/stores/lots_of_workflows_test.clj b/test/intemporal/stores/lots_of_workflows_test.clj index b256a6d..de8638e 100644 --- a/test/intemporal/stores/lots_of_workflows_test.clj +++ b/test/intemporal/stores/lots_of_workflows_test.clj @@ -6,8 +6,7 @@ [intemporal.workflow :as w] [intemporal.macros :refer [stub-protocol defn-workflow]] [intemporal.test-utils :as tu :refer [wait]] - [promesa.core :as p]) - (:import (java.util.concurrent CountDownLatch))) + [promesa.core :as p])) (defprotocol MyActivities (foo [this a])) @@ -23,45 +22,44 @@ prr (foo pr :pr)] prr)) +(def iterations 100) + (def stores (delay {:memory (store/make-store) :fdb (fdb/make-store {:cluster-file-path "docker/fdb.cluster"}) :postgres (jdbc/make-store {:jdbcUrl "jdbc:postgresql://localhost:5432/root?user=root&password=root" - :migration-dir "migrations/postgres"})})) + :migration-dir "migrations/postgres" + :maximum-pool-size 20})})) -(def iterations 100) -(def latch (CountDownLatch. iterations)) - (deftest lots-of-workflows-test - (with-redefs [tu/wait-default-timeout 60000] + (with-redefs [tu/wait-default-timeout 10000] (doseq [[label store] @stores] (testing (format "store: %s" label) - (testing "clear" - (store/clear-events store) - (store/clear-tasks store)) + (store/clear-events store) + (store/clear-tasks store) (testing "multiple iterations" - (w/with-env {:store store} - (dotimes [_ iterations] - ;; workflows are blocking, we wrap in a virtual thread - (p/vthread + (dotimes [_ iterations] + (p/vthread + (w/with-env {:store store} + ;; workflows are blocking, we wrap in a virtual thread (my-workflow)))) ;; check that all tasks are enqueued - (wait (= iterations (count (store/list-tasks store))) - (let [wflows (store/list-tasks store)] - (testing "workflows are all new" - (is (= iterations (count wflows))) - (is (= #{:new} (set (map :state wflows)))))))) + (let [wflows (store/list-tasks store)] + (testing "workflows are all new" + (is (= iterations (count wflows))) + (is (= #{:new} (set (map :state wflows)))))))) (testing "enqueue all jobs" - (let [ex (w/start-poller! store {:protocols {`MyActivities (->MyActivitiesImpl)}})] + (let [ex (w/start-poller! store {:protocols {`MyActivities (->MyActivitiesImpl)} + :polling-ms 100})] ;; lets wait for all pending (try (wait (not (contains? (into #{} (map :state (store/list-tasks store))) :new)) - (w/shutdown ex 20000)) + (w/shutdown ex 10000)) (testing "workflows are all completed" (let [tasks (store/list-tasks store)] From 4eb386f9f0a3920eb2c01898778ade6873940413 Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Wed, 24 Dec 2025 15:03:42 +0000 Subject: [PATCH 25/26] log info --- test/intemporal/test_utils.cljc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/intemporal/test_utils.cljc b/test/intemporal/test_utils.cljc index 3dfa5e0..3c6f3c4 100644 --- a/test/intemporal/test_utils.cljc +++ b/test/intemporal/test_utils.cljc @@ -165,7 +165,7 @@ (defn setup-telemere [] ;#?(:clj (clojure.pprint/pprint (telemere/check-interop))) - (telemere/set-min-level! :trace) + (telemere/set-min-level! :info) (telemere/remove-handler! ::custom) #?(:clj (telemere/add-handler! :default/open-telemetry (tot/handler:open-telemetry))) (telemere/add-handler! ::custom From db7f44cca3dfd99b05d3fca8f39ccd0606df94d3 Mon Sep 17 00:00:00 2001 From: Miguel Ping Date: Wed, 24 Dec 2025 15:46:06 +0000 Subject: [PATCH 26/26] add basic test --- docker/fdb.cluster | 2 +- src/intemporal/store/jdbc.clj | 1 + .../intemporal/stores/basic_workflow_test.clj | 47 +++++++++++++++++++ .../stores/lots_of_workflows_test.clj | 4 +- 4 files changed, 52 insertions(+), 2 deletions(-) create mode 100644 test/intemporal/stores/basic_workflow_test.clj diff --git a/docker/fdb.cluster b/docker/fdb.cluster index e7aea03..5c41e53 100644 --- a/docker/fdb.cluster +++ b/docker/fdb.cluster @@ -1 +1 @@ -docker:docker@192.168.97.2:4500 +docker:docker@192.168.107.3:4500 diff --git a/src/intemporal/store/jdbc.clj b/src/intemporal/store/jdbc.clj index 274d9bc..07dc171 100644 --- a/src/intemporal/store/jdbc.clj +++ b/src/intemporal/store/jdbc.clj @@ -239,6 +239,7 @@ (dequeue-task [this {:keys [lease-ms]}] ;; TODO check owner + ;; TODO select for update skip locked (let [query "select * from tasks where (owner=? or owner is null) and (state='new' or lease_end < now()) order by id asc limit 1" found? (jdbc/with-transaction [tx datasource] (when-let [task (some-> (jdbc/execute-one! tx [query owner] default-opts) diff --git a/test/intemporal/stores/basic_workflow_test.clj b/test/intemporal/stores/basic_workflow_test.clj new file mode 100644 index 0000000..f5e2615 --- /dev/null +++ b/test/intemporal/stores/basic_workflow_test.clj @@ -0,0 +1,47 @@ +(ns ^:integration ^:fdb ^:sql intemporal.stores.basic-workflow-test + (:require [clojure.test :as t :refer [deftest is testing]] + [intemporal.store :as store] + [intemporal.store.foundationdb :as fdb] + [intemporal.store.jdbc :as jdbc] + [intemporal.workflow :as w] + [intemporal.macros :as macros :refer [stub-protocol defn-workflow]] + [intemporal.test-utils :as tu])) + +(t/use-fixtures :once tu/with-trace-logging) + +(defprotocol ProtocolActivity + (some-io [this val])) + +(def example-impl + (reify + ProtocolActivity + (some-io [_ val] val))) + +;;;; +;; workflow registration + +(defn-workflow run-workflow [] + (let [stub (stub-protocol ProtocolActivity)] + (some-io stub :ok))) + +(def stores (delay {:memory (store/make-store) + :fdb (fdb/make-store {:cluster-file-path "docker/fdb.cluster"}) + :postgres (jdbc/make-store {:jdbcUrl "jdbc:postgresql://localhost:5432/root?user=root&password=root" + :migration-dir "migrations/postgres" + :polling-ms 10})})) + +(deftest basic-workflow-test + (doseq [[label store] @stores] + (testing (format "store: %s" label) + + (testing "running a workflow" + (store/clear-events store) + (store/clear-tasks store) + + (let [ex (w/start-poller! store {:protocols {`ProtocolActivity example-impl} + :polling-ms 10})] + (try + (is (= :ok (w/with-env {:store store} + (run-workflow)))) + (finally + (w/shutdown ex 0)))))))) diff --git a/test/intemporal/stores/lots_of_workflows_test.clj b/test/intemporal/stores/lots_of_workflows_test.clj index de8638e..bb1f98d 100644 --- a/test/intemporal/stores/lots_of_workflows_test.clj +++ b/test/intemporal/stores/lots_of_workflows_test.clj @@ -6,6 +6,7 @@ [intemporal.workflow :as w] [intemporal.macros :refer [stub-protocol defn-workflow]] [intemporal.test-utils :as tu :refer [wait]] + [matcher-combinators.test :refer [match?]] [promesa.core :as p])) (defprotocol MyActivities @@ -64,6 +65,7 @@ (testing "workflows are all completed" (let [tasks (store/list-tasks store)] (is (= (* 2 iterations) (count tasks))) - (is (= #{:success} (set (map :state tasks)))))) + (is (match? {:success (* 2 iterations)} + (frequencies (map :state tasks)))))) (finally (w/shutdown ex 0)))))))))