diff --git a/.gitignore b/.gitignore
index f2786d7..8b5bf0f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,12 @@
 config/*.toml
 .idea
 .vscode/
+
+temp/
+.holy-lambda/
+?/
+.cpcache/
+.clj-kondo/
+.lsp/
+.DS_Store
+native-configuration/
diff --git a/headless/Dockerfile.amd b/headless/Dockerfile.amd
new file mode 100644
index 0000000..f152f26
--- /dev/null
+++ b/headless/Dockerfile.amd
@@ -0,0 +1,5 @@
+FROM scratch
+
+COPY dist/bin/amd64/output-serial-gc headless
+
+ENTRYPOINT ["./headless"]
\ No newline at end of file
diff --git a/headless/Dockerfile.arm b/headless/Dockerfile.arm
new file mode 100644
index 0000000..a9382d6
--- /dev/null
+++ b/headless/Dockerfile.arm
@@ -0,0 +1,13 @@
+# Stage 1: source of the /lib and /lib64 trees copied into the runtime image below.
+FROM dlbears/scratch-arm-dlls:latest AS dlls
+
+# Stage 2: empty runtime image. `scratch` has no shell and no package manager,
+# so `RUN` steps (e.g. `apk add gcompat`) cannot execute here; every runtime
+# library has to be copied in from the `dlls` stage instead.
+FROM scratch
+
+COPY --from=dlls /lib/ /lib
+COPY --from=dlls /lib64/ /lib64
+COPY dist/bin/arm64/output-noop-gc headless
+
+ENTRYPOINT ["./headless"]
\ No newline at end of file
diff --git a/headless/Dockerfile.script.amd b/headless/Dockerfile.script.amd
new file mode 100644
index 0000000..fdfcb9f
--- /dev/null
+++ b/headless/Dockerfile.script.amd
@@ -0,0 +1,36 @@
+FROM debian:stretch AS runner
+# NOTE(review): debian:stretch is end-of-life; confirm its apt repositories are still reachable.
+USER root
+ARG TARGETARCH
+ENV GRAAL_VERSION=22.3.0
+ENV JAVA_VERSION=11
+# Install chromedriver, curl and git in the same layer as `apt-get update`
+# so a cached, stale package index is never reused for a later install.
+RUN apt-get update && apt-get install -y chromium-driver curl git
+
+# Install script interpreter
+RUN curl -sLO https://raw.githubusercontent.com/babashka/babashka/master/install
+RUN chmod a+x install
+RUN ["/bin/bash", "./install", "--static"]
+
+# Install JVM (for loading jsoup pod and other java deps)
+RUN cd / && mkdir graal
+WORKDIR /graal
+
+RUN curl -sLO https://github.com/graalvm/graalvm-ce-builds/releases/download/vm-${GRAAL_VERSION}/graalvm-ce-java${JAVA_VERSION}-linux-amd64-${GRAAL_VERSION}.tar.gz
+RUN tar -xzf graalvm-ce-java${JAVA_VERSION}-linux-amd64-${GRAAL_VERSION}.tar.gz
+
+RUN cd / && mkdir babashka
+WORKDIR /babashka
+
+COPY dist/jar/pod-jaydeesimon-jsoup-0.1-standalone.jar dist/jar/pod-jaydeesimon-jsoup-0.1-standalone.jar
+COPY bb.edn bb.edn
+COPY deps.edn deps.edn
+
+ENV PATH="/graal/graalvm-ce-java${JAVA_VERSION}-${GRAAL_VERSION}/bin:${PATH}"
+ENV JAVA_HOME="/graal/graalvm-ce-java${JAVA_VERSION}-${GRAAL_VERSION}"
+RUN bb prepare
+
+COPY src/headless/find.cljc src/headless/find.cljc
+
+ENTRYPOINT ["bb", "./src/headless/find.cljc"]
diff --git a/headless/Dockerfile.script.arm b/headless/Dockerfile.script.arm
new file mode 100644
index 0000000..8275324
--- /dev/null
+++ b/headless/Dockerfile.script.arm
@@ -0,0 +1,35 @@
+FROM debian:stretch AS runner
+# NOTE(review): debian:stretch is end-of-life; confirm its apt repositories are still reachable.
+USER root
+ARG TARGETARCH
+ENV GRAAL_VERSION=22.3.0
+ENV JAVA_VERSION=11
+# Install chromedriver, curl and git in the same layer as `apt-get update`
+# so a cached, stale package index is never reused for a later install.
+RUN apt-get update && apt-get install -y chromium-driver curl git
+
+# Install script interpreter
+RUN curl -sLO https://raw.githubusercontent.com/babashka/babashka/master/install
+RUN chmod a+x install
+RUN ["/bin/bash", "./install", "--static"]
+
+# Install JVM (for loading jsoup pod and other java deps)
+RUN cd / && mkdir graal
+WORKDIR /graal
+RUN curl -sLO https://github.com/graalvm/graalvm-ce-builds/releases/download/vm-${GRAAL_VERSION}/graalvm-ce-java${JAVA_VERSION}-linux-aarch64-${GRAAL_VERSION}.tar.gz
+RUN tar -xzf graalvm-ce-java${JAVA_VERSION}-linux-aarch64-${GRAAL_VERSION}.tar.gz
+
+RUN cd / && mkdir babashka
+WORKDIR /babashka
+
+COPY dist/jar/pod-jaydeesimon-jsoup-0.1-standalone.jar dist/jar/pod-jaydeesimon-jsoup-0.1-standalone.jar
+COPY bb.edn bb.edn
+COPY deps.edn deps.edn
+
+ENV PATH="/graal/graalvm-ce-java${JAVA_VERSION}-${GRAAL_VERSION}/bin:${PATH}"
+ENV JAVA_HOME="/graal/graalvm-ce-java${JAVA_VERSION}-${GRAAL_VERSION}"
+RUN bb prepare
+
+COPY src/headless/find.cljc src/headless/find.cljc
+
+ENTRYPOINT ["bb", "./src/headless/find.cljc"]
diff --git a/headless/bb.edn b/headless/bb.edn
new file mode 100644
index 0000000..224390e
--- /dev/null
+++ b/headless/bb.edn
@@ -0,0 +1,76 @@
+{:paths ["src" "jsoup"]
+ :deps {etaoin/etaoin {:mvn/version "1.0.38"}
+        org.clojars.askonomm/ruuter {:mvn/version "1.3.2"}
+        io.github.FieryCod/holy-lambda-babashka-tasks
+        {:git/url "https://github.com/FieryCod/holy-lambda"
+         :deps/root "./modules/holy-lambda-babashka-tasks"
+         :sha "1469bb96b85c2c65a52df9e3a4914dde1b4c816f"}
+        io.github.FieryCod/holy-lambda {:mvn/version "0.6.6"}}
+
+ :min-bb-version "0.3.7"
+ :holy-lambda/options {;; User should use docker for local development and use `HL_NO_DOCKER` environment variable
+                       ;; set to "true" for CI based deployments.
+                       ;; For CI based deployments user should base their builder image on `fierycod/graalvm-native-image:ce`
+                       :docker {;; Check https://docs.docker.com/network/
+                                ;; Network setting for future versions of HL will propagate to AWS SAM as well
+                                ;; Options: "host"|"bridge"|"overlay"|"none"|nil|"macvlan"
+                                :network "host"
+
+                                ;; HL runs bb tasks in docker context which means that local libraries will not work out-of-the-box.
+                                ;;
+                                ;; To make local libraries work with HL you have to:
+                                ;; 1. Mount your local library folder as a docker volume.
+                                ;; 2. Modify the `deps.edn` to use library path available in docker context.
+                                ;; Preferably use `deps.edn` alias and reference its name in `:build:clj-name`
+                                ;; ----------------------------------------------------------------------------
+                                ;; Single volume definition:
+                                ;;
+                                ;; {:docker "/path-where-to-mount-local-library-on-docker"
+                                ;; :host "relative-local-library-path"}
+                                ;;:volumes [{:docker "/holy-lambda"
+                                ;; :host "./"}]
+
+                                ;; GraalVM Community holy-lambda compatible docker image
+                                ;; You can always build your own GraalVM image with enterprise edition
+                                :image "dlbears/graalvm-native-image:ee"}
+                       :build {:compile-cmd "clj -T:build uberjar"
+                               :graalvm-home nil}
+                       :runtime
+                       {;; :pods {org.babashka/aws "0.0.5"}
+
+                        ;; For `:native` runtime you can provide your own bootstrap file
+                        ;; :bootstrap-file "bootstrap"
+
+                        ;; For `:native` runtime you can provide some native resources which will be available during lambda execution
+                        ;; Resources are packed as is.
+                        :native-deps "resources"
+
+                        ;; Specify custom arguments for native image generation
+                        ;; Works only on `:native` runtime.
+                        ;; Check https://www.graalvm.org/reference-manual/native-image/Options/
+                        :native-image-args
+                        ["--verbose"
+                         ;"-H:+StaticExecutableWithDynamicLibC" ; aarch64 and x86
+                         ;"--static" ; x86 only
+                         ;"--libc=musl" ; x86 only
+                         "--gc=epsilon" ; [ g1 (x86 only), serial, epsilon ]
+                         "--no-fallback"
+                         "--enable-url-protocols=http,https"
+                         "--report-unsupported-elements-at-runtime"
+                         "-H:+AllowIncompleteClasspath"
+                         "--no-server"
+                         "--initialize-at-build-time"
+                         "--initialize-at-run-time=org.httpkit.client.ClientSslEngineFactory\\$SSLHolder,org.apache.http.impl.auth.NTLMEngineImpl"
+                         "--trace-object-instantiation=java.security.SecureRandom"]}}
+ :tasks {:requires ([holy-lambda.tasks])
+
+         hl:docker:run holy-lambda.tasks/hl:docker:run
+
+         hl:native:conf holy-lambda.tasks/hl:native:conf
+         hl:native:executable holy-lambda.tasks/hl:native:executable
+         hl:babashka:sync holy-lambda.tasks/hl:babashka:sync
+         hl:sync holy-lambda.tasks/hl:sync
+         hl:compile holy-lambda.tasks/hl:compile
+         hl:doctor holy-lambda.tasks/hl:doctor
+         hl:clean holy-lambda.tasks/hl:clean
+         hl:version holy-lambda.tasks/hl:version}}
\ No newline at end of file
diff --git a/headless/build.clj b/headless/build.clj
new file mode 100644
index 0000000..35421f5
--- /dev/null
+++ b/headless/build.clj
@@ -0,0 +1,25 @@
+(ns build
+  (:require
+   [clojure.tools.build.api :as b]))
+
+(def class-dir ".holy-lambda/target/classes")
+(def basis (b/create-basis {:project "deps.edn"}))
+
+;; Delete the output of previous builds (.holy-lambda/target and .holy-lambda/build)
+(defn clean [_]
+  (b/delete {:path ".holy-lambda/target"})
+  (b/delete {:path ".holy-lambda/build"}))
+
+
+;; Compile clojure code into standalone jar
+(defn uberjar [_]
+  (b/copy-dir {:src-dirs ["src" "resources"]
+               :target-dir class-dir})
+  (b/compile-clj {:basis basis
+                  :src-dirs ["src"]
+                  :ns-compile ['headless.find] ;; Only package this namespace
+                  :class-dir class-dir})
+  (b/uber {:class-dir class-dir
+           :main 'headless.find
+           :basis basis
+           :uber-file ".holy-lambda/build/output.jar"}))
\ No newline at end of file
diff --git a/headless/deps.edn b/headless/deps.edn
new file mode 100644
index 0000000..ab2db8d
--- /dev/null
+++ b/headless/deps.edn
@@ -0,0 +1,19 @@
+
+{:deps {etaoin/etaoin {:mvn/version "1.0.38"}
+        nrepl/bencode {:mvn/version "1.1.0"}
+        org.jsoup/jsoup {:mvn/version "1.15.3"}
+        org.clojars.askonomm/ruuter {:mvn/version "1.3.2"}
+        http-kit/http-kit {:mvn/version "2.6.0"}
+        org.clojure/core.match {:mvn/version "1.0.0"}
+        org.babashka/cli {:mvn/version "0.5.40"}
+        babashka/babashka.pods {:mvn/version "0.1.0"}
+        org.clojure/clojure {:mvn/version "1.10.3"}
+        io.github.FieryCod/holy-lambda {:mvn/version "0.6.6"}
+        com.github.clj-easy/graal-build-time {:mvn/version "0.1.4"}}
+
+ :paths ["src" "resources" "jsoup"]
+ :aliases
+ {:build {:deps {io.github.clojure/tools.build {:git/tag "v0.8.3" :git/sha "0d20256"}}
+          :ns-default build
+          :exec-fn build/uberjar ; build.clj defines `clean` and `uberjar`; there is no `uber`
+          :jvm-opts ["-Dclojure.compiler.direct-linking=true" "-Dclojure.spec.skip-macros=true"]}}}
\ No newline at end of file
diff --git a/headless/dist/bin/amd64/output-noop-gc b/headless/dist/bin/amd64/output-noop-gc
new file mode 100755
index 0000000..a2509e2
Binary files /dev/null and b/headless/dist/bin/amd64/output-noop-gc differ
diff --git a/headless/dist/bin/amd64/output-serial-gc b/headless/dist/bin/amd64/output-serial-gc
new file mode 100755
index 0000000..0eee876
Binary files /dev/null and b/headless/dist/bin/amd64/output-serial-gc differ
diff --git a/headless/dist/bin/arm64/jsoupnative b/headless/dist/bin/arm64/jsoupnative
new file mode 100755
index 0000000..de04a29
Binary files /dev/null and b/headless/dist/bin/arm64/jsoupnative differ
diff --git a/headless/dist/bin/arm64/output-noop-gc b/headless/dist/bin/arm64/output-noop-gc
new file mode 100755
index 0000000..578fc93
Binary files /dev/null and b/headless/dist/bin/arm64/output-noop-gc differ
diff --git a/headless/dist/jar/output.jar b/headless/dist/jar/output.jar
new file mode 100644
index 0000000..2be55bd
Binary files /dev/null and b/headless/dist/jar/output.jar differ
diff --git a/headless/dist/jar/pod-jaydeesimon-jsoup-0.1-standalone.jar b/headless/dist/jar/pod-jaydeesimon-jsoup-0.1-standalone.jar
new file mode 100644
index 0000000..815f6d7
Binary files /dev/null and b/headless/dist/jar/pod-jaydeesimon-jsoup-0.1-standalone.jar differ
diff --git a/headless/jsoup/pod/jaydeesimon/jsoup.clj b/headless/jsoup/pod/jaydeesimon/jsoup.clj
new file mode 100644
index 0000000..31b9056
--- /dev/null
+++ b/headless/jsoup/pod/jaydeesimon/jsoup.clj
@@ -0,0 +1,114 @@
+(ns pod.jaydeesimon.jsoup
+  (:refer-clojure :exclude [read read-string])
+  (:require [bencode.core :as bencode]
+            [clojure.edn :as edn])
+  (:import (org.jsoup Jsoup)
+           (java.io PushbackInputStream)
+           (org.jsoup.nodes Element Attribute))
+  (:gen-class))
+
+(set! *warn-on-reflection* true)
+
+(def stdin (PushbackInputStream. System/in))
+
+(defn write [v]
+  (bencode/write-bencode System/out v)
+  (.flush System/out))
+
+;; Decode a bencode byte string explicitly as UTF-8 instead of the platform default charset
+(defn read-string [^"[B" v]
+  (String. v "UTF-8"))
+
+(defn read []
+  (bencode/read-bencode stdin))
+
+(defn select [html css-query]
+  (let [elements (-> (Jsoup/parse html)
+                     (.select ^String css-query))]
+    (map (fn [element]
+           {:id (.id ^Element element)
+            :class-names (.classNames ^Element element)
+            :tag-name (.normalName ^Element element)
+            :attrs (->> (.attributes ^Element element)
+                        .iterator
+                        iterator-seq
+                        (map (juxt (memfn ^Attribute getKey) (memfn ^Attribute getValue)))
+                        (into {}))
+            :own-text (.ownText ^Element element)
+            :text (.text ^Element element)
+            :whole-text (.wholeText ^Element element)
+            :inner-html (.html ^Element element)
+            :outer-html (.outerHtml ^Element element)})
+         elements)))
+
+(def lookup
+  {'pod.jaydeesimon.jsoup/select select})
+
+;; Copied from https://github.com/babashka/pod-babashka-hsqldb/blob/master/src/pod/babashka/hsqldb.clj#L33
+(defn -main [& _args]
+  (loop []
+    (let [message (try (read)
+                       (catch java.io.EOFException _
+                         ::EOF))]
+      (when-not (identical? ::EOF message)
+        (let [op (get message "op")
+              op (read-string op)
+              op (keyword op)
+              id (some-> (get message "id")
+                         read-string)
+              id (or id "unknown")]
+          (case op
+            :describe (do (write {"format" "edn"
+                                  "namespaces" [{"name" "pod.jaydeesimon.jsoup"
+                                                 "vars" [{"name" "select"}]}]
+                                  "id" id
+                                  "ops" {"shutdown" {}}})
+                          (recur))
+            :invoke (do (try
+                          (let [var (-> (get message "var")
+                                        read-string
+                                        symbol)
+                                args (get message "args")
+                                args (read-string args)
+                                args (edn/read-string args)]
+                            (if-let [f (lookup var)]
+                              (let [value (pr-str (apply f args))
+                                    reply {"value" value
+                                           "id" id
+                                           "status" ["done"]}]
+                                (write reply))
+                              (throw (ex-info (str "Var not found: " var) {}))))
+                          (catch Throwable e
+                            (binding [*out* *err*]
+                              (println e))
+                            (let [reply {"ex-message" (ex-message e)
+                                         "ex-data" (pr-str
+                                                    (assoc (ex-data e)
+                                                           :type (class e)))
+                                         "id" id
+                                         "status" ["done" "error"]}]
+                              (write reply))))
+                        (recur))
+            :shutdown (System/exit 0)
+            (recur)))))))
+
+(comment
+
+  ;; Run these commands in Babashka
+  (require '[babashka.pods :as pods])
+
+  ;; for the uberjar
+  (pods/load-pod ["java" "-jar" "target/uberjar/pod-jaydeesimon-jsoup-0.1-standalone.jar"])
+
+  ;; for the graalvm compiled binary
+  (pods/load-pod "./pod-jaydeesimon-jsoup")
+
+  (require '[pod.jaydeesimon.jsoup :as jsoup])
+
+  (-> (curl/get "https://clojure.org")
+      :body
+      (jsoup/select "div.clj-header-message")
+      first
+      :text)
+
+  )
diff --git a/headless/main.go b/headless/main.go
new file mode 100644
index 0000000..649140c
--- /dev/null
+++ b/headless/main.go
@@ -0,0 +1,78 @@
+package main
+
+import "os/exec"
+import "fmt"
+import "errors"
+
+// FinderOptions holds the arguments that find passes to the babashka finder script.
+type FinderOptions struct {
+	Url      string // Required
+	MatchBy  string // Required, Either [ regex, css, xpath, js ]
+	Matcher  string // Required, grammar/code to match against
+	Secret   string // Optional, if using MatchBy regex or js, otherwise, required.
+	Timeout  uint   // Optional, NewFinder replaces 0 with 5 (seconds).
+	Strategy string // Optional, defaults to fallback, Either [ fallback, static, webdriver ]
+}
+
+// NewFinder validates its arguments and returns the options with defaults applied.
+// It returns an error when Url, MatchBy, Matcher or a required Secret is missing or invalid.
+// An empty or unknown Strategy becomes "fallback"; MatchBy "js" and "xpath" always
+// force Strategy "webdriver".
+func NewFinder(Url string, MatchBy string, Matcher string, Secret string, Timeout uint, Strategy string) (*FinderOptions, error) {
+	hop := FinderOptions{Url, MatchBy, Matcher, Secret, Timeout, Strategy}
+	if hop.Url == "" {
+		return nil, errors.New("Url must be assigned a non-empty value.")
+	}
+	if hop.MatchBy == "" || (hop.MatchBy != "regex" && hop.MatchBy != "css" && hop.MatchBy != "xpath" && hop.MatchBy != "js") {
+		return nil, errors.New(`
+		MatchBy must be assigned a valid value.
+		Valid values: [ "regex", "css", "xpath", "js" ]`)
+	}
+	if hop.Matcher == "" {
+		return nil, errors.New("Matcher required.")
+	}
+	if hop.Secret == "" && hop.MatchBy != "regex" && hop.MatchBy != "js" {
+		return nil, errors.New("Secret required.")
+	}
+	if hop.Strategy == "" || (hop.Strategy != "fallback" && hop.Strategy != "static" && hop.Strategy != "webdriver") {
+		hop.Strategy = "fallback"
+	}
+	if hop.Timeout == 0 {
+		hop.Timeout = 5
+	}
+	if hop.MatchBy == "js" || hop.MatchBy == "xpath" {
+		hop.Strategy = "webdriver"
+	}
+	return &hop, nil
+}
+
+// find runs `bb src/headless/find.cljc` with opt rendered as command-line flags,
+// echoes the script's stdout and returns it. A failure to run the script is
+// printed, not returned.
+func find(opt *FinderOptions) (string) {
+	out, bberr := exec.Command("bb", "src/headless/find.cljc",
+		"--url", opt.Url,
+		"--match-by", opt.MatchBy,
+		"--matcher", opt.Matcher,
+		"--secret", opt.Secret,
+		// fmt.Sprint renders the number in decimal; string(uint) would yield a single rune.
+		"--timeout", fmt.Sprint(opt.Timeout),
+		"--strategy", opt.Strategy).Output()
+	if bberr != nil {
+		fmt.Print(bberr.Error())
+	}
+	fmt.Print(string(out))
+	return string(out) // Last line contains the JSON encoded output
+}
+
+// main runs one hard-coded example lookup against https://clojure.org.
+func main() {
+	finder_opts, arg_err := NewFinder("https://clojure.org", "css", "div.clj-header-message", "Clojure is a robust, practical, and fast programming language with a set of useful features that together form a simple, coherent, and powerful tool.", 10, "")
+	if arg_err != nil {
+		fmt.Print(arg_err.Error())
+		// finder_opts is nil when validation fails; stop before find dereferences it.
+		return
+	}
+
+	find(finder_opts)
+}
diff --git a/headless/resources/native-agents-payloads/1-static.edn b/headless/resources/native-agents-payloads/1-static.edn
new file mode 100644
index 0000000..dfca688
--- /dev/null
+++ b/headless/resources/native-agents-payloads/1-static.edn
@@ -0,0 +1,31 @@
+{:name "headless.find.-main" ; Qualified path of Lambda Handler
+ :request {:event {:batch [{:url "https://gist.github.com/synycboom/9f64460e3fcf5d0a03af2a9570e993e3",
+                            :matcher "match-this-text",
+                            :match-by "regex",
+                            :strategy "static"},
+                           {:url "https://twitter.com/wesbos/status/1592190991208947713?s=20&t=cZcZCmR70R-5evuevNMoNA",
+                            :matcher "const func =>",
+                            :secret "const func =>",
+                            :match-by "regex",
+                            :strategy "static"},
+                           {:url "https://javascript30.com",
+                            :matcher "a.hero__title > span.word",
+                            :secret "JavaScript",
+                            :match-by "css",
+                            :strategy "static"},
+                           {:url "https://www.amazon.com/Mistel-Bluetooth-Mechanical-Ergonomic-DoubleShot/dp/B08GWZCC34/?_encoding=UTF8&pd_rd_w=DKTzY&content-id=amzn1.sym.e4bd6ac6-9035-4a04-92a6-fc4ad60e09ad&pf_rd_p=e4bd6ac6-9035-4a04-92a6-fc4ad60e09ad&pf_rd_r=BTX6XS1VZPV55X89ARW1&pd_rd_wg=BKYIN&pd_rd_r=84d172c6-ae2f-446e-8ebe-632efddff1ed&ref_=pd_gw_ci_mcx_mr_hp_atf_m",
+                            :match-by "css",
+                            :matcher "span#productTitle",
+                            :secret "Mistel BAROCCO MD770 RGB Wired + Wireless Bluetooth TKL Split Mechanical Keyboard with Cherry MX Brown Switch, Ergonomic Gaming Keyboard, Yellow Letter Glaze Blue PBT DoubleShot Keycaps, Macro Support",
+                            :strategy "static"},
+                           {:url "https://clojure.org",
+                            :matcher "golang",
+                            :match-by "regex",
+                            :strategy "static"},
+                           {:url "https://clojure.org",
+                            :matcher "div.clj-header-message",
+                            :match-by "css",
+                            :secret "wrong secret",
+                            :strategy "static"}]},
+           :ctx {}}, ; Execution paths to trace (for native image)
+ :propagate true} ; Should stop the agent when invocation error occurs?
\ No newline at end of file
diff --git a/headless/src/headless/find.cljc b/headless/src/headless/find.cljc
new file mode 100644
index 0000000..817884c
--- /dev/null
+++ b/headless/src/headless/find.cljc
@@ -0,0 +1,238 @@
+#?(:bb (do
+         (println "Running in bb")
+         (require '[babashka.pods :as pods])
+         (pods/load-pod ["java" "-jar" "dist/jar/pod-jaydeesimon-jsoup-0.1-standalone.jar"])))
+(ns headless.find
+  (:gen-class)
+  ;; `get` (http-kit client) and `find` (defined below) intentionally replace the core vars.
+  (:refer-clojure :exclude [find get])
+  (:require
+   [babashka.cli :as cli]
+   [etaoin.api :as e] ;; WebDriver API
+   [ruuter.core :as ruuter] ;; HTTP router
+   [cheshire.core :as json]
+   [org.httpkit.server :as http]
+   [org.httpkit.client :refer [get]]
+   [clojure.core.match :refer [match]] ;; Pattern Matching
+   [clojure.java.io :as io])
+  ;; Imports must live inside `ns`: a bare `import` evaluated before this form
+  ;; only affects whichever namespace is current while the file starts loading.
+  #?@(:bb []
+      :clj [(:import (org.jsoup Jsoup)
+                     (org.jsoup.select Elements)
+                     (org.jsoup.nodes Element))]))
+
+;; Debug variables
+;; (set! *warn-on-reflection* true)
+
+;; JSOUP Interop
+#?(:bb nil
+   :clj (defn get-page [l]
+          (.get (Jsoup/connect l))))
+#?(:bb nil
+   :clj (defn get-elems [page css]
+          (.select page css)))
+
+;; Default driver opts (Currently only chrome/chromium)
+(def chrome-driver-opts {:capabilities {:chromeOptions {:args ["--headless" "--no-sandbox"]}}})
+
+;; TODO: refactor strategies to work independent of the driver implementation
+;; - Example driver with driver opts used as defaults
+;; (defn start-default-chrome [& args] (e/chrome chrome-driver-opts) )
+
+;; Finder API Strategies - Static
+(def static-css-strategy
+  #?(:bb (fn [url selector]
+           (try (->
+                 (-> @(get url) :body)
+                 (pod.jaydeesimon.jsoup/select selector)
+                 first
+                 :text)
+                (catch Exception _ (do (println (str "Error - Failed to get url -> " url)) false))))
+     :clj (fn [url selector]
+            (try (let [html (get-page url)
+                       el (first (get-elems html selector))
+                       res (if (nil? el) false (.text el))]
+                   res)
+                 (catch Exception e (do (println (str "Error - Failed to get url -> " url)) false)))))) ;; TODO: Allow for custom Element attribute after selector instead of only Element.text
+
+(def static-regex-strategy
+  (fn [url regex]
+    (let [src (-> @(get url) :body)
+          ptrn (re-pattern regex)
+          res (try (re-find ptrn src) (catch NullPointerException e false))]
+      (if (nil? res) false res))))
+
+;; Finder API Strategies - WebDriver
+(def webdriver-js-strategy
+  (fn [url js]
+    (e/with-driver :chrome chrome-driver-opts driver
+      (e/go driver url)
+      (e/js-execute driver js))))
+
+(def webdriver-xpath-strategy
+  (fn [url xpath]
+    (e/with-driver :chrome chrome-driver-opts driver
+      (e/go driver url)
+      (e/get-element-text driver {:xpath xpath}))))
+
+(def webdriver-css-strategy
+  (fn [url selector]
+    (e/with-driver :chrome chrome-driver-opts driver
+      (e/go driver url)
+      (e/get-element-text driver {:css selector}))))
+
+(def webdriver-regex-strategy
+  (fn [url regex]
+    (e/with-driver :chrome chrome-driver-opts driver
+      (e/go driver url)
+      (let [src (e/get-source driver)
+            ptrn (re-pattern regex)]
+        (re-find ptrn src)))))
+
+;; Finder API Strategies - Fallback (static first, webdriver last)
+(defn fallback-regex
+  ([url matcher] (let [static-res? (boolean (static-regex-strategy url matcher))]
+                   (if static-res?
+                     true
+                     (boolean (webdriver-regex-strategy url matcher)))))
+  ([url matcher secret] (let [static-res? (= secret (static-regex-strategy url matcher))]
+                          (if static-res?
+                            true
+                            (= secret (webdriver-regex-strategy url matcher))))))
+
+(defn fallback-css [url matcher secret]
+  (let [static-res? (= secret (static-css-strategy url matcher))]
+    (if static-res?
+      true
+      (= secret (webdriver-css-strategy url matcher)))))
+
+;; Option Validation and Dispatch
+;; Returns {:status 400 ...} when url, matcher or match-by is missing/empty or the
+;; strategy/match-by/secret combination is not supported, otherwise {:status 200 ...}
+;; whose body reports whether a match was found. The body is a JSON string unless
+;; :batch? is set, in which case it is left as a map for find-batch to encode.
+(defn dispatch [opts]
+  ;; Option Validation -- this is the `if` test so that a failure actually short-circuits
+  (if (or
+       (= "" (opts :url)) (nil? (opts :url))
+       (= "" (opts :matcher)) (nil? (opts :matcher))
+       (= "" (opts :match-by)) (nil? (opts :match-by)))
+    {:status 400 :body ((if (opts :batch?) identity json/generate-string) {:match false :message (str "[error] url, matcher, and match-by must all contain valid values.")})}
+    ;; Pattern Matching Dispatch
+    (let [{:keys [url matcher secret batch?]} opts
+          result (match [(merge opts (if (nil? secret) {:secret ""} {}))]
+                   [{:strategy "webdriver" :match-by "js" :secret ""}] (boolean (webdriver-js-strategy url matcher))
+                   [{:strategy "fallback" :match-by "regex" :secret ""}] (fallback-regex url matcher)
+                   [{:strategy "static" :match-by "regex" :secret ""}] (boolean (static-regex-strategy url matcher))
+                   [{:strategy "webdriver" :match-by "regex" :secret ""}] (boolean (webdriver-regex-strategy url matcher))
+                   [{:secret ""}] "Finder: Invalid configuration, secret must be non empty"
+                   [{:strategy "fallback" :match-by "css"}] (fallback-css url matcher secret)
+                   [{:strategy "fallback" :match-by "regex"}] (fallback-regex url matcher secret)
+                   [{:strategy "static" :match-by "regex"}] (= secret (static-regex-strategy url matcher))
+                   [{:strategy "static" :match-by "css"}] (= secret (static-css-strategy url matcher))
+                   [{:strategy "webdriver" :match-by "regex"}] (= secret (webdriver-regex-strategy url matcher))
+                   [{:strategy "webdriver" :match-by "css"}] (= secret (webdriver-css-strategy url matcher))
+                   [{:strategy "webdriver" :match-by "xpath"}] (= secret (webdriver-xpath-strategy url matcher))
+                   [{:strategy "webdriver" :match-by "js"}] (= secret (webdriver-js-strategy url matcher))
+                   :else "Finder: Invalid configuration, unknown.")]
+      ;; Result Coercion
+      (if (string? result)
+        {:status 400 :body ((if batch? identity json/generate-string) {:match false :message (str "[error] " result)})}
+        {:status 200 :body ((if batch? identity json/generate-string) {:match result :message (if result "match found" "no match")})}))))
+
+;; Batch vars and helpers
+(def default-batch-size 5)
+(defn safe-parse-int [s d]
+  (try (Integer/parseInt s)
+       (catch Exception _ d)))
+
+;; Modified dispatch and find handlers for batch processing
+(defn batch-dispatch [opts]
+  (:body (dispatch (merge opts {:batch? true}))))
+(defn find-batch [{:keys [batch batch-size] :or {batch [] batch-size default-batch-size}}] ;; Allow specifying batch-size, and integrate browser instance pooling
+  (let [;; Validate batch-size arg
+        bs (if (int? batch-size) batch-size (safe-parse-int batch-size default-batch-size))
+        ;; Start parallel batches from single thread, maybe parallelizable with fold?
+        proc (flatten (transduce
+                       (comp
+                        (partition-all bs)
+                        (map (fn [v]
+                               (doall (pmap batch-dispatch v)))))
+                       conj
+                       batch))
+        ;; Stringify result to JSON
+        body (json/generate-string proc)]
+    ;; HTTP Response
+    {:status 200
+     :body body}))
+
+;; Find api CLI & HTTP entrypoint
+(defn find [opts]
+  (try
+    (if (contains? opts :batch) (find-batch opts) (dispatch opts))
+    (catch Exception e
+      (println (str "[DEBUG] find crash, error -> " (pr-str e)))
+      {:status 500 :body "Internal Server Error."})))
+
+;; HTTP Request middleware
+(defn json->edn [reader] (json/parse-stream reader true))
+(defn parse-json-middleware [request]
+  (-> request
+      :body
+      io/reader
+      json->edn))
+
+;; HTTP routes
+(def routes
+  [{:path "/health"
+    :method :get
+    :response {:status 200
+               :headers {"Content-Type" "text/html"}
+               :body "Ok"}}
+   {:path "/v1/find"
+    :method :get
+    :response (fn [req]
+                (-> req
+                    parse-json-middleware
+                    find))}])
+
+;; Server entrypoint
+(defn start-server [{:keys [port address] :or {port 8080 address "0.0.0.0"}}]
+  (http/run-server
+   #(ruuter/route routes %)
+   {:port (int port)
+    :address address})
+  (println "Server started on " address ":" port)
+  @(promise) ;; Hold thread for HTTP server
+  )
+
+;; Lambda entrypoint
+(defn lambda [{:keys [event ctx]}]
+  (println (str "Event " (pr-str event) "\nCtx " (pr-str ctx)))
+  {:statusCode 200
+   :headers {"content-type" "application/json"}
+   :body (:body (headless.find/find event))})
+
+;; Main (manifold) entrypoint
+;; Accepts either a {:cli parsed-opts} map (see the :bb call at the bottom of this file),
+;; raw CLI argument strings, or a lambda request map with :event and :ctx.
+(defn -main [& args]
+  (when-not (nil? (first args))
+    (try
+      (let [arg (first args)
+            ;; `contains?` throws on strings, so only probe for :cli on maps
+            wrapped-cli? (and (map? arg) (contains? arg :cli))]
+        (if (or wrapped-cli? (string? arg))
+          (let [cli-arg (if wrapped-cli? (:cli arg) (cli/parse-opts args))]
+            (if (contains? cli-arg :server)
+              (start-server cli-arg)
+              (:body (find cli-arg))))
+          (lambda arg)))
+      (catch Exception e
+        (println (str "[DEBUG] main crash, error -> " (pr-str e)))
+        (if (string? (first args))
+          {:status 500 :body (str "Internal Server Error." (pr-str e))}
+          "CLI Internal Error.")))))
+
+#?(:bb (-main {:cli (cli/parse-opts *command-line-args*)}))
\ No newline at end of file