diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 00000000..bdaab28a --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,39 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + - name: Publish package + uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.gitignore b/.gitignore index c2b62d1b..5fbd6c8e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,61 +1,4 @@ -.DS_Store -*.secrets -.ipynb_checkpoints -data/* -*.log -*.swp -HMP_Dataset - -**/.ipynb_checkpoints/* -**/.virtual_documents/* -assets/.METADATA/job_run.* -assets/job_run -__pypackages__/ -__pycache__/ -*.py[cod] -*$py.class -*.so -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -*.manifest -*.spec -pip-log.txt -pip-delete-this-directory.txt -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ -*.mo -*.pot -local_settings.py -db.sqlite3 -db.sqlite3-journal -instance/ -.webassets-cache -.scrapy -docs/_build/ -.pybuilder/ -profile_default/ -ipython_config.py -celerybeat-schedule -celerybeat.pid -.spyderproject -.spyproject -.ropeproject -.mypy_cache/ -.pyre/ -.pytype/ -cython_debug/ venv/ .venv/ +__pycache__ +.ipynb_checkpoints/ diff --git a/GettingStarted.md b/GettingStarted.md new file mode 100644 index 00000000..92762c9c --- /dev/null +++ b/GettingStarted.md @@ -0,0 +1,757 @@ +# Getting Started with CLAIMED + +The [CLAIMED framework](https://github.com/claimed-framework) enables ease-of-use development and deployment of cloud native data processing applications on Kubernetes using operators and workflows. + +A central tool of CLAIMED is the **Claimed Component Compiler (C3)** which creates a docker image with all dependencies, pushes the container to a registry, and creates a kubernetes-job.yaml as well as a kubeflow-pipeline-component.yaml. +This page explains how to apply operators, combine them to workflows, and how to build them yourself using C3. + +If you like CLAIMED, just give us a [star](https://github.com/claimed-framework/component-library) on our [main project](https://github.com/claimed-framework/component-library). + + +## Content + +**[1. Apply operators](#1-apply-operators)** + +**[2. Operator library](#2-operator-library)** + +**[3. Create workflows](#3-create-workflows)** + +**[4. Create operators](#4-create-operators)** + +**[5. Create grid wrapper](#5-create-grid-wrapper)** + +--- + +## 1. Apply operators + +An operator is a single processing step. 
You can run the script locally with the [CLAIMED CLI](https://github.com/claimed-framework/cli) using the following command: +```shell +claimed --component /: -- -- ... +``` + +Besides CLAIMED CLI, you can use an operator in [workflows](#3-create-workflows), or deploy a kubernetes job using the `job.yaml` which is explained in the following. + + +### 1.1 Specify the job + +First, update the variable values in the `job.yaml`. +You can delete a variable to use the default value, if one is defined. +The default values are listed in the KubeFlow component `yaml` file. + +#### Secrets + +You can use key-value secrets for passing credentials to the job. Save the secrets to the cluster and replace the `value: ...` with the following pattern in the `job.yaml`: + +```yaml + containers: + env: + - name: + valueFrom: + secretKeyRef: + name: + key: + +# Example for an access key + containers: + env: + - name: access_key_id + valueFrom: + secretKeyRef: + name: cos-secret + key: access_key_id +``` + +#### Container registry + +If the container image is saved in a non-public registry, add an image pull secret to the container specs. Check `image: ...` in the `job.yaml` to find the location of the container image. If it includes a non-public registry like [icr.io](), you need to provide the image pull secret at the end of the file: + +```yaml + spec: + containers: + - name: example-script + image: icr.io/namespace/claimed-example-script:0.1 + ...: + imagePullSecrets: + - name: +``` + +#### Storage + +You can provide access to a Kubernetes/OpenShift persistent volume by specifying it in the `job.yaml`. +OpenShift clusters require specifying the security context on the pod/template spec level. +You get the group ID for the volume from your administrator. +You can use `/opt/app-root/src/` to mount the `mount_dir` in the working directory of the pod. + +```yaml + spec: + containers: + ...: + volumeMounts: + - name: data + mountPath: /opt/app-root/src/ + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumes: + - name: data + persistentVolumeClaim: + claimName: + securityContext: + supplementalGroups: [] +``` + +#### Error handling + +If a pod fails, it is restarted by the job until it finishes successfully. You can specify the error handling in the `job.yaml`. +First, `backoffLimit` limits the number of restarts (default: 5). Second, `restartPolicy` defines if a failed pod restarts (`OnFailure`) or if a new pod is created while the failed pod stops with the error (`Never`). + +```yaml +spec: + backoffLimit: 1 + template: + spec: + ...: + restartPolicy: Never +``` + +#### Example + +The following is an exemplary `example_script.job.yaml` that includes a `imagePullSecret` and mounts a persistent volume claim from a cluster. +Variables that are not defined are using the default value. + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: example-script +spec: + template: + spec: + containers: + - name: example-script + image: docker.io/user/claimed-example-script:0.1 + command: ["/opt/app-root/bin/ipython","/opt/app-root/src/example_script.py"] + env: + - name: input_path + value: "data/" + - name: num_values + value: "5" + volumeMounts: + - name: pvc-data + mountPath: /opt/app-root/src/data/ + volumes: + - name: pvc-data + persistentVolumeClaim: + claimName: pvc-name + restartPolicy: OnFailure + imagePullSecrets: + - name: user-pull-secret +``` + + +### 1.2 Cluster CLI login + +You can start jobs with the `kubectl` (Kubernetes) or `oc` (OpenShift) CLI. 
If you're using Kubernetes, the login procedure includes multiple steps which are detailed in the [Kubernetes docs](https://kubernetes.io/docs/tasks/access-application-cluster/access-cluster/).

Logging into an OpenShift cluster is easier. You can use a token, which you can generate via the browser UI, or your username. You might want to add `--insecure-skip-tls-verify` when errors occur.

```sh
# Login via token (Browser login > Your name > Copy login command > Display token)
oc login --token= --server= --insecure-skip-tls-verify

# Login via user name
oc login -u

# Optional: Change default project
oc project
```

### 1.3 Start and manage jobs

After specifying the `job.yaml` and logging into the cluster, you can start or stop a job via the CLI. If you're using an OpenShift cluster, you simply replace `kubectl` with `oc` in the commands.

```sh
# start job
kubectl apply -f .job.yaml

# kill job
kubectl delete -f .job.yaml
```
Note that calling `kubectl apply` twice can lead to an error because jobs have unique names. If a job with the same name is still running, you might need to kill it before restarting.

The job creates a pod which is accessible via the browser UI or via the CLI using the standard kubectl commands.
```sh
# list all pods in the current project
kubectl get pods

# get logs of a pod
kubectl logs -f

# pod description
kubectl describe pod
```

---

## 2. Operator library

Reusable code is a key idea of CLAIMED, and operator libraries make it easier to share single processing steps.
Because each operator includes a docker image with specified dependencies, operators can be easily reused in different workflows.

Public operators are accessible from the [CLAIMED component library](https://github.com/claimed-framework/component-library/tree/main/component-library).

You can run a public operator locally using the [claimed-cli](https://github.com/claimed-framework/cli) or copy the Kubernetes job.yaml file to run the operator on a Kubernetes/OpenShift cluster.
You can also use the operators in workflows as explained in the next section.

---

## 3. Create workflows

Multiple operators can be combined into a workflow, e.g., a KubeFlow pipeline or a CWL workflow. For this, C3 creates `.yaml` files which define a KFP component and `.cwl` files for a CWL step.

### KubeFlow Pipeline

After initializing your operators, you can combine them in a pipeline function:

```python
# pip install kfp

import kfp.components as comp
import kfp
import kfp.dsl as dsl

# initialize operator from yaml file
file_op = comp.load_component_from_file('.yaml')
# initialize operator from remote file
web_op = comp.load_component_from_url('https://raw.githubusercontent.com/claimed-framework/component-library/main/component-library/.yaml')

@dsl.pipeline(
    name="my_pipeline",
    description="Description",
)
def my_pipeline(
    parameter1: str = "value",
    parameter2: int = 1,
    parameter3: str = "value",
):
    step1 = file_op(
        parameter1=parameter1,
        parameter2=parameter2,
    )

    step2 = web_op(
        parameter1=parameter1,
        parameter3=parameter3,
    )

    step2.after(step1)

kfp.compiler.Compiler().compile(pipeline_func=my_pipeline, package_path='my_pipeline.yaml')
```

When running the script, the KFP compiler generates a `.yaml` file which can be uploaded to the KubeFlow UI to start the pipeline.
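Alternatively, you can submit the compiled pipeline programmatically with the KFP SDK client. A minimal sketch, assuming the client can reach your KubeFlow Pipelines endpoint (the host URL and the argument values below are placeholders):

```python
# pip install kfp

import kfp

# connect to the KubeFlow Pipelines API (the host is an example URL)
client = kfp.Client(host='https://<your-kubeflow-host>/pipeline')

# submit the compiled pipeline together with its pipeline arguments
client.create_run_from_pipeline_package(
    'my_pipeline.yaml',
    arguments={'parameter1': 'value', 'parameter2': 1},
)
```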
See the [KubeFlow Docs](https://www.kubeflow.org/docs/components/pipelines/v1/sdk/build-pipeline/) for more details on running pipelines with the SDK client.

If you're using an OpenShift cluster, you might want to use the Tekton compiler.

```python
# pip install kfp-tekton

from kfp_tekton.compiler import TektonCompiler

TektonCompiler().compile(pipeline_func=my_pipeline, package_path='my_pipeline.yaml')
```

If you are using a different Tekton version, you can use the following code to save an adjusted yaml file for version `v1beta1`:

```python
# pip install kfp-tekton pyyaml

import yaml
from kfp_tekton.compiler import TektonCompiler

# Read dict to update apiVersion
_, pipeline_dict = TektonCompiler().prepare_workflow(my_pipeline)
pipeline_dict['apiVersion'] = 'tekton.dev/v1beta1'
# write pipeline to yaml
with open('my_pipeline.yaml', 'w') as f:
    yaml.dump(pipeline_dict, f)
```

#### Timeout in KubeFlow Tekton

The default timeout of a KFP Tekton pipeline is 60 minutes. The default value can be changed in the Tekton config by the [administrators](https://tekton.dev/docs/pipelines/pipelineruns/#configuring-a-failure-timeout). Otherwise, you can update the timeout in the yaml with the following code:

```python
# Read dict to update apiVersion and timeouts
_, pipeline_dict = TektonCompiler().prepare_workflow(my_pipeline)
pipeline_dict['spec']['timeouts'] = {'pipeline': "0"}  # 0 = no timeout
# write pipeline to yaml
with open('my_pipeline.yaml', 'w') as f:
    yaml.dump(pipeline_dict, f)
```

#### Shared volumes

Data is not shared by default between different steps.
You can add a volume to each step for data sharing.
First, you create a PersistentVolumeClaim (PVC) in the Kubernetes project that is running KubeFlow.
If you want to run multiple steps in parallel, this PVC must support ReadWriteMany, otherwise ReadWriteOnce is sufficient.
Next, you can mount this PVC to each step with the following code:

```python
mount_folder = "/opt/app-root/src/"

# Init the KFP component
step = my_kfp_op(...)

step.add_pvolumes({mount_folder: dsl.PipelineVolume(pvc='')})
```

You can include the working directory in the mount path to use relative paths (`/opt/app-root/src/` for Python and `home/docker` for R).
Otherwise, you can use absolute paths in your scripts/variables `//...`.

#### Secrets

You can use key-value secrets in KubeFlow as well to avoid publishing sensitive information in pod configs and logs.
You can add the secrets in the Kubernetes project that is running KubeFlow.
Then, you can add secrets to a specific step in the pipeline with the following code:

```python
from kubernetes.client import V1EnvVar, V1EnvVarSource, V1SecretKeySelector

# Init the KFP component
step = my_kfp_op(...)

# Add a secret as env variable
secret_env_var = V1EnvVar(
    name='',
    value_from=V1EnvVarSource(secret_key_ref=V1SecretKeySelector(name='', key='')
))
step.add_env_variable(secret_env_var)
```

The secret will be set as an env variable and loaded by the common C3 interface.
Therefore, it is important that KubeFlow does not overwrite this env variable.
You need to adjust the command in the KFP component yaml by deleting the variable:
```yaml
# Original command with secret_variable
command:
  ...
  python ./.py log_level="${0}" ="${1}" other_variable="${2}" ...
  ...

# Adjusted command
command:
  ...
  python ./.py log_level="${0}" other_variable="${2}" ...
  ...
+``` +Further, it is important, that the variable has a default value and is optional +(You can simply add `default: ""` to the variable in the KFP component yaml without recompiling your script). + + +### CWL workflows + +You can run workflows locally with CWL. This requires the cwltool package: +```shell +pip install cwltool +``` + +You can create a CWL workflow by combining multiple CWL steps: + +```text +cwlVersion: v1.0 +class: Workflow + +inputs: + parameter1: string + parameter2: string + parameter3: string + parameter4: string +outputs: [] + +steps: + .cwl: + run: ./path/to/.cwl + in: + parameter1: parameter1 + parameter2: parameter2 + parameter3: parameter3 + out: [] + .cwl: + run: ./path/to/.cwl + in: + parameter3: parameter3 + parameter4: parameter4 + out: [] +``` + +Run the CWL workflow in your terminal with: +```shell +cwltool .cwl --parameter1 --parameter2 --parameter3 --parameter4 +``` + +--- + +## 4. Create operators + +### 4.1 Download C3 + +You can install C3 via pip: +```sh +pip install claimed +``` + +### 4.2 C3 requirements + +Your operator script has to follow certain requirements to be processed by C3. Currently supported are python scripts and ipython notebooks. + +#### Python scripts + +- The operator name is the python file: `my_operator_name.py` -> `claimed-my-operator-name` +- The operator description is the first doc string in the script: `"""Operator description"""` +- The required pip packages are listed in comments starting with pip install: `# pip install ` or `# pip install -r ~/requierments.txt` +- The interface is defined by environment variables `my_parameter = os.getenv('my_parameter')`. +- You can cast a specific type by wrapping `os.getenv()` with `int()`, `float()`, `bool()`. The default type is string. Only these four types are currently supported. You can use `None` as a default value but not pass the `NoneType` via the `job.yaml`. +- Output paths for KubeFlow can be defined with `os.environ['my_output_parameter'] = ...'`. Note that operators cannot return values but always have to save outputs in files. + +You can optionally install future tools with `dnf` by adding a comment `# dnf `. + +#### iPython notebooks + +- The operator name is the notebook file: `my_operator_name.ipynb` -> `claimed-my-operator-name` +- The notebook is converted by `nbconvert` to a python script before creating the operator by merging all cells. +- Markdown cells are converted into doc strings. shell commands with `!...` are converted into `get_ipython().run_line_magic()`. +- The requirements of python scripts apply to the notebook code (The operator description can be the first markdown cell). + +#### R scripts + +- The operator name is the python file: `my_operator_name.R` -> `claimed-my-operator-name` +- The operator description is currently fixed to `"R script"`. +- The required R packages are installed with: `install.packages(, repos=)` +- The interface is defined by environment variables `my_parameter <- Sys.getenv('my_parameter', 'optional_default_value')`. +- You can cast a specific type by wrapping `Sys.getenv()` with `as.numeric()` or `as.logical()`. The default type is string. Only these three types are currently supported. You can use `NULL` as a default value but not pass `NULL` via the `job.yaml`. +- Output paths for KubeFlow can be defined with `Sys.setenv()`. Note that operators cannot return values but always have to save outputs in files. + +You can optionally install future tools with `apt` by adding a comment `# apt `. 
#### Example

The following is an example python script `example_script.py` that can be compiled by C3.

```py
"""
This is the operator description.
The file name becomes the operator name.
"""

# Add dependencies by comments starting with "pip install".
# You can add multiple comments if the packages require a specific order.
# pip install numpy

import os
import logging
import numpy as np

# A comment one line above os.getenv is the description of this variable.
input_path = os.getenv('input_path')

# You can cast a specific type with int(), float(), or bool().
num_values = int(os.getenv('num_values', 5))

# Output paths are starting with "output_".
output_path = os.getenv('output_path', None)


def my_function(n_random):
    """
    The compiler only includes the first doc string. This text is not included.
    """
    random_values = np.random.randn(n_random)
    # You can use logging in operators.
    # C3 adds a logger and a parameter log_level (default: 'INFO') to the operator.
    logging.info(f'Random values: {random_values}')


if __name__ == '__main__':
    my_function(num_values)
```

### 4.3 Docker engine

C3 requires a running Docker engine to build the container image. A popular app is [Docker Desktop](https://www.docker.com/products/docker-desktop/). However, Docker Desktop requires licences for commercial usage in companies. An open source alternative is [Rancher Desktop](https://rancherdesktop.io) (macOS/Windows/Linux), which includes the Docker engine and a UI. A CLI alternative for macOS and Linux is [Colima](https://github.com/abiosoft/colima), which creates a Linux VM for docker.

```sh
# Install Colima with homebrew
brew install docker docker-compose colima

# Start docker VM
colima start

# Stop docker VM
colima stop
```

### 4.4 Container registry

C3 creates a container image for the operator which has to be stored in a container registry. A simple solution for non-commercial usage is Docker Hub, but it has limited private usage.
Alternatives to a professional plan from Docker Hub are the [IBM Cloud registry](https://www.ibm.com/products/container-registry) or [Amazon ECR](https://aws.amazon.com/ecr/).

After starting the Docker engine, you need to log in to the registry with docker.

```sh
docker login -u -p /
```

### 4.5 Compile an operator with C3

With a running Docker engine and your operator script matching the C3 requirements, you can execute the C3 compiler by running `c3_create_operator`:

```sh
c3_create_operator --repository "/" ".py" "" ""
```

You need to provide the repository with `--repository` or `-r`. You can specify the version of the container image (default: "0.1") with `--version` or `-v`.
The first positional argument is the path to the python script or the ipython notebook. Optionally, you can define additional files that are copied to the container image in the following positional arguments. You can use wildcards for additional files. E.g., `*` would copy all files in the current directory to the container image. (Hidden files and directories must be specified. Be aware of `data/` folders and others before including all files.)
Note that the docker build messages are suppressed by default. If you want to display the docker logs, you can add `--log_level DEBUG`.

View all arguments by running:
```sh
c3_create_operator --help
```

C3 generates the container image that is pushed to the registry, a `.yaml` file for KubeFlow, a `.job.yaml` for Kubernetes, and a `.cwl` file for CWL.
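For instance, compiling the `example_script.py` from above could look like this (the registry path and version are placeholders for your own setup):

```sh
# builds and pushes docker.io/<user>/claimed-example-script:0.2
c3_create_operator --repository "docker.io/<user>" --version "0.2" "example_script.py"

# files generated next to the script:
#   example_script.yaml       # KubeFlow Pipelines component
#   example_script.job.yaml   # Kubernetes job
#   example_script.cwl        # CWL CommandLineTool
```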
### 4.6 CLAIMED Containerless Operators

CLAIMED containerless operators allow you to execute scripts as fully functional workflow components without the need for traditional containerization.

After installing the CLAIMED component compiler via `pip install claimed c3`, you can compile a script into a containerless operator just as you would for containerized components like Docker, Kubernetes (jobs, pods, deployments), Kubeflow, or Apache Airflow.

Using the command `c3_create_containerless_operator my_script.py`, your script is transformed into a standalone, executable operator. An example of a containerless operator can be found in the [containerless-bootstrap repository](https://github.com/claimed-framework/containerless-bootstrap). These operators can be executed seamlessly using the claimed CLI, replacing the container registry path with the `containerless` prefix. For instance, running `claimed --component containerless/claimed-util-cos:latest --cos_connection cos://access_key_id:secret_access_key@s3.us-east.cloud-object-storage.appdomain.cloud/some_bucket/some_path --operation put --local_path some_file.zip` enables cloud object storage operations with the 'claimed-util-cos' operator without requiring a container runtime. This approach significantly reduces overhead and speeds up execution while maintaining compatibility with established workflow orchestration frameworks.


---

## 5. Create grid wrapper

You can use grid computing to parallelize an operator.
Grid computing requires that the code is parallelizable, e.g., by processing different files.
For this, the code gets wrapped by a coordinator script: the grid wrapper.

### 5.1 C3 grid computing requirements

You can use the same code for the grid wrapper as for an operator by adding an extra function which is passed to C3.
The grid wrapper executes this function for each batch and passes specific parameters to the function:
the first parameter is the batch id, followed by all variables defined in the operator interface.
You need to adapt the variables based on the batch, e.g., by adding the batch id to input and output paths.

```python
def grid_process(batch_id, parameter1, parameter2, *args, **kwargs):
    # update operator parameters based on batch id
    parameter1 = parameter1 + batch_id
    parameter2 = os.path.join(parameter2, batch_id)

    # execute operator code with adapted parameters
    my_function(parameter1, parameter2)
```

You might want to add `*args, **kwargs` to avoid errors if not all interface variables are used in the grid process.
Note that the operator script is imported by the grid wrapper script. Therefore, all code in the script is executed.
If the script is also used as a single operator, it is recommended to check for `__main__` to avoid execution when the code is imported by the grid wrapper.

```python
if __name__ == '__main__':
    my_function(parameter1, parameter2)
```

Note that grid computing is currently not implemented for R scripts.

### 5.2 Compile a grid wrapper with C3

The compilation is similar to an operator. Additionally, the name of the grid process is passed to `c3_create_gridwrapper` using `--process` or `-p` (default: `"grid_process"`),
and a backend for the coordinator is selected with `--backend` or `-b` (default: `"local"`).
+ +```sh +c3_create_gridwrapper -r "/" -p "grid_process" -b "local" ".py" "" "" +``` + +C3 supports three backends for the coordination: Coordinator files on a shared local storage (`"local"`), on COS (`"cos"`), or as a key-value storage on S3 (`"s3kv"`). + +Note, that the backend `"legacy_cos"` also handles downloading and uploading files from COS. We removed this functionality to simplify the grid wrapper. + +The grid wrapper creates a temporary file `gw_.py` which is copied to the container image and deleted. +Similar to an operator, `gw_.yaml`, `gw_.cwl`, and `gw_.job.yaml` are created. + + +### 5.3 Apply grid wrappers + +The grid wrapper uses coordinator files to split up the batch processes between different pods. +Therefore, each pod needs access to a shared persistent volume, see [storage](#storage). +Alternatively, you can use the COS or S3kv grid wrapper which uses a coordinator in S3. + +The grid wrapper adds specific variables to the `job.yaml`, that define the batches and some coordination settings. + +First, you can define the list of batch ids in a file and pass `gw_batch_file` to the grid wrapper. +You can use either a `txt` file with a comma-separated list of strings, a `json` file with the keys being the batch ids, or a `csv` file with `gw_batch_file_col_name` being the column with the batch ids. +`gw_batch_file` can be a local path, a path within the coordinator bucket or a COS connection to a file (`cos://:@///`). + +Second, you need to define a `gw_coordinator_path` or `gw_coordinator_connection`. +The `gw_coordinator_path` is used in the `local` version. It is a path to a persistent and shared directory that is used by the pods to lock batches and mark them as processed. +`gw_coordinator_connection` is used in the `cos` and `s3kv` version. It defines a connection to a directory on COS: `cos://:@//`. +The coordinator uses files with specific suffixes: `.lock`, `.processed`, and `.err`. +`gw_lock_timeout` defines the time in seconds until other pods remove the `.lock` file from batches that might be struggling (default `10800`). +If your processes run very long, you can increase `gw_lock_timeout` to avoid duplicated processing of batches. +By default, pods skip batches with `.err` files. You can set `gw_ignore_error_files` to `True` after you fixed the error. + +The grid wrapper currently does not support [secrets](#secrets) for the access key and secret within a connection. + +Lastly, you want to add the number of parallel pods by adding `parallelism : ` to the `job.yaml`. + +```yaml +spec: + parallelism: 10 +``` + +In KubeFlow pipelines, you can call the grid wrapper multiple times via a `for` loop. Note that the following step needs to wait for all parallel processes to finish. + +```python +process_parallel_instances = 10 + +@dsl.pipeline(...) +def preprocessing_val_pipeline(...): + step1 = first_op() + step3 = following_op() + + for i in range(process_parallel_instances): + step2 = grid_wrapper_op(...) + + step2.after(step1) + step3.after(step2) +``` + +#### Local example + +The local grid wrapper requires a local storage for coordination like the PVC in the following example. 
+ +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: cgw-my-operator +spec: + parallelism: 10 + template: + spec: + containers: + - name: cgw-my-operator + image: us.icr.io/geodn/claimed-cgw-my-operator:0.01 + command: ["/opt/app-root/bin/python","/opt/app-root/src/claimed_cgw_my_operator.py"] + env: + - name: gw_batch_file + value: "data/schedule.json" + - name: gw_coordinator_path + value: 'gw_coordinator' + - name: my_operator_data_path + value: 'data/*' + - name: my_operator_target_path + value: 'data/output/' + - name: my_operator_parameter + value: "100" + volumeMounts: + - name: pvc-data + mountPath: /opt/app-root/src/data/ + volumes: + - name: pvc-data + persistentVolumeClaim: + claimName: pvc-name + restartPolicy: Never + imagePullSecrets: + - name: image-pull-secret +``` + +#### COS example + +The COS grid wrapper uses a COS bucket for downloading and uploading batch data and coordination. + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: cgw-my-operator +spec: + parallelism: 10 + template: + spec: + containers: + - name: cgw-my-operator + image: us.icr.io/geodn/claimed-cgw-my-operator:0.01 + command: ["/opt/app-root/bin/python","/opt/app-root/src/claimed_cgw_my_operator.py"] + env: + - name: gw_file_path_pattern + value: 'data/*' + - name: gw_group_by + value: '[-10:-4]' + - name: gw_source_access_key_id + valueFrom: + secretKeyRef: + name: cos-secret + key: access_key_id + - name: gw_source_secret_access_key + valueFrom: + secretKeyRef: + name: cos-secret + key: secret_access_key + - name: gw_source_endpoint + value: 'https://s3.cloud-object-storage.cloud' + - name: gw_source_bucket + value: 'my-bucket' + - name: gw_target_path + value: 'cos_results' + - name: gw_coordinator_path + value: 'gw_coordinator' + - name: my_operator_data_path + value: 'input' + - name: my_operator_target_path + value: 'target' + - name: my_operator_parameter + value: "100" + restartPolicy: Never + imagePullSecrets: + - name: image-pull-secret +``` + +### 5.4 Simple Grid Wrapper +Although CLAIMED grid wrappers with the different coordinator plugins are very powerful, sometimes it is also overwhelming. Therefore we created the simple_grid_wrapper plugin which allows you to just point as many parallel workers as you like to a directory of files. Those files are randomly processed by each worker, making sure there is only one worker processing a file. Once all files are processed, the results are renamed to original_file_name.PROCESSED.ext. Please have a look at the examples folder to create your own simple grid wrapper. Here are the commands, given you are in the examples folder of this repository: + +``` +(pip install claimed c3) +c3_create_gridwrapper simple_grid_wrapper_example.py -b simple_grid_wrapper +export CLAIMED_DATA_PATH=/path/to/your/c3/examples +claimed --component local/claimed-gw-simple-grid-wrapper-example:0.1 --log_level "INFO" --sgw_source_folder /opt/app-root/src/data/simple_grid_wrapper_source --sgw_target_folder /opt/app-root/src/data/simple_grid_wrapper_target + +# you can also store the results in the source folder +claimed --component local/claimed-gw-simple-grid-wrapper-example:0.1 --log_level "INFO" --sgw_source_folder /opt/app-root/src/data/simple_grid_wrapper_source_and_target --sgw_target_folder /opt/app-root/src/data/simple_grid_wrapper_source_and_target +``` + +### 5.5 Folder Grid Wrapper +It's exactly like the simple grid wrapper but here you lock folder instead of files. 
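For reference, the wrapped component in `examples/folder_grid_wrapper_example/folder_grid_wrapper_example.py` implements `grid_process` over one locked folder at a time:

```python
from pathlib import Path

def grid_process(source_folder: str, target_folder: str) -> None:
    src_dir = Path(source_folder)
    tgt_dir = Path(target_folder)

    # process every text file in the locked source folder and write the result to the target folder
    for src_file in sorted(src_dir.glob("*.txt")):
        text = src_file.read_text(encoding="utf-8")
        updated = text.replace("test", "test processed")
        (tgt_dir / src_file.name).write_text(updated, encoding="utf-8")
```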
+Here are the commands, given you are in the examples/folder_grid_wrapper_example folder of this repository: +``` +c3_create_gridwrapper folder_grid_wrapper_example.py -b folder_grid_wrapper +export CLAIMED_DATA_PATH=/path/to/your/c3/examples +claimed --component local/claimed-gw-folder-grid-wrapper-example:0.1 --log_level "INFO" --sgw_source_folder /opt/app-root/src/data/folder_grid_wrapper_source --sgw_target_folder /opt/app-root/src/data/folder_grid_wrapper_target +``` +CLAIMED_DATA_PATH specifies the root directory that contains both the source and target folders used by the folder grid wrapper. +For example, if +``` +CLAIMED_DATA_PATH=/c3/examples/folder_grid_wrapper_example +``` +then the directory structure should look like this: +``` +/c3/examples/folder_grid_wrapper_example/ +├── folder_grid_wrapper_source/ +├── folder_grid_wrapper_target/ +``` diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 00000000..d6456956 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 80196b98..68c5fd10 100644 --- a/README.md +++ b/README.md @@ -3,95 +3,48 @@ -# CLAIMED - It's time to concentrate on your code only +# C3 - the CLAIMED Component Compiler -For more information, please visit the project's [website](https://claimed-framework.github.io/) +**TL;DR** +- takes arbitrary assets (Jupyter notebooks, python scripts, R scripts) as input +- automatically creates container images and pushes to container registries +- automatically installs all required dependencies into the container image +- creates KubeFlow Pipeline components (target workflow execution engines are pluggable) +- creates Kubernetes job configs for execution on Kubernetes/Openshift clusters +- can be triggered from CICD pipelines -[![OpenSSF Best Practices](https://bestpractices.coreinfrastructure.org/projects/6718/badge)](https://bestpractices.coreinfrastructure.org/projects/6718) -[![GitHub](https://img.shields.io/badge/issue_tracking-github-blue.svg)](https://github.com/claimed-framework/component-library/issues) - - - ---- -**TL;DR** [Video on YouTube](https://www.youtube.com/watch?v=IhrIzLgY-Cg) -- set of re-usable coarse-grained components (just a bunch of code) -- think of tasks, not functions (e.g., read from a database, transform data, train model, deploy model, store result to cloud object storage) -- write once, runs everywhere: Kubeflow, Docker/Podman, CLI, KNative, Kubernetes, CodeEngine -- orchestrate with anything: shell script, Kubeflow, CWL, ... -- persistence layer / queue agnostic: Cloud Object Storage, file systems, PVC, Kafka, MQTT -- just use Python, bash or R - no other skills required (no Kubeflow component YAML, maven, Java, Dockerfile) -- 1st class citizen in JupyterLab and the Elyra Pipeline Editor (creating a low code / no code IDE for data science) -- upstream repository to IBM Watson Studio Pipelines contributed components in IBM Cloud Pak for Data - -[FAQ](FAQ.md) - ---- - -CLAIMED is a operator component framework for artificial intelligence, machine learning, "extract, transform, load" processes, and data science. The goal is to enable low-code/no-code rapid prototyping style programming to seamlessly CI/CD into production. 
The open source CLAIMED component library provides ready-made components for various business domains, supports multiple computer languages, works on different data flow editors and command line tools, and runs on various execution engines including Kubernetes (Job), Knative (Pod), Kubeflow (KFP component) and on the CLI using docker or podman. In addition to the open source library, clients create their private operator libraries using in conjunction with the open source one. -Core of CLAIMED is C3, the Claimed Component Compiler which takes source code in the form of jupyter notebooks or source files and compiles it to various target runtime environments, allowing to add additional targets as plugins. Currently, C3 supports python notebooks and python scripts as input and Kubernetes/OpenShift, Kubeflow Pipeline Components/RedHat OpenShift DataScience Pipelines, and plain docker/podmain (via CLAIMED CLI) -C3 does the following steps: -- extract meta information (name, description, interface, dependencies (requirements)) from source -- create (docker) container with dependencies and source added -- push container to registry -- create kubernetes-job.yaml -- create kubeflow-pipeline-component.yaml - -To demonstrate its utility, we constructed a workflow composed exclusively of this library's components. To display the capabilities of this library, we made use of a publicly available Computed Tomography (CT) scan dataset [covidata]. We created a deep learning model, which is supposed to classify exams as either COVID-19 positive or negative. We built the pipeline with Elyra's Pipeline Visual Editor, with support for local, Airflow, and Kubeflow execution [https://arxiv.org/abs/2103.03281](https://arxiv.org/abs/2103.03281). - - -![Low Code / No Code pipeline creation tool for data science](https://github.com/IBM/claimed/raw/master/images/elyra_pipeline.png) -*Low Code / No Code pipeline creation tool for data science* - - **Bring the latest and greatest libraries to the hands of everybody.** -![AIX360/LIME highlights a poor deep learning covid classification model looking at bones only](https://github.com/IBM/claimed/raw/master/images/elyra_lime.png) -*AIX360/LIME highlights a poor deep learning covid classification model looking at bones only* - -Components of this library can be exported as: -1. Kubeflow pipeline components -2. Apache Airflow components -3. Standalone graphical components for the Elyra pipeline editor -4. Standalone components to be run from the command line -5. Standalone components to be run as docker containers -6. Standalone components to be run as Kubernetes Service -7. Standalone components to be run as KNative Application or Job -8. Components to consume from or publish to Queue Managers like Kafka or MQTT -9. Components deployed to Kubernets wrapped into DAPR (as service or message consumer/producer) - -![Visually create pipelines from notebooks and run everywhere](https://github.com/IBM/claimed/raw/master/images/elyra_graphical_export.png) -*Visually create pipelines from notebooks and run them everywhere* - -Each notebook is following a similar format. - -1. The first cell contains a description of the component itself. -2. The second cell installs all dependencies using pip3. -3. The third cell imports all dependencies. -4. The fourth cell contains a list of dependencies, input parameters, and return values as Python comments -5. The fifth cell reads the input parameters from environment variables. 
- - -![Export notebooks and files as runtime components for different engines](https://github.com/IBM/claimed/raw/master/images/elyra_cli_export.png) -*Export notebooks and files as runtime components for different engines* +To learn more on how this library works in practice, please have a look at the following [video](https://www.youtube.com/watch?v=FuV2oG55C5s) +## Related work +[Ploomber](https://github.com/ploomber/ploomber) -To learn more on how this library works in practice, please have a look at the following [video](https://www.youtube.com/watch?v=FuV2oG55C5s) +[Orchest](https://www.orchest.io/) +## Getting started -## Achievements +### Install -🏆 [IEEE OSS AWARD 2023](https://arxiv.org/abs/2307.06824) +```sh +pip install claimed +``` -Kienzler, R., Khan, R., Nilmeier, J., Nesic, I., & Haddad, I. (IEEE OSS 2023). Claimed -- the open source framework for building coarse-grained operators for accelerated discovery in science. [arXiv:2307.06824](https://arxiv.org/abs/2307.06824) +### Usage -## Related work -[Ploomber](https://github.com/ploomber/ploomber) +Just run the following command with your python script or notebook: +```sh +c3_create_operator ".py" --repository "/" +``` -[Orchest](https://www.orchest.io/) +Your code needs to follow certain requirements which are explained in [Getting Started](https://github.com/claimed-framework/c3/blob/main/GettingStarted.md). -[covidata] Joseph Paul Cohen et al. *COVID-19 Image Data Collection: Prospective Predictions Are the Future*, arXiv:2006.11988, 2020 ## Getting Help +```sh +c3_create_operator --help +``` + We welcome your questions, ideas, and feedback. Please create an [issue](https://github.com/claimed-framework/component-library/issues) or a [discussion thread](https://github.com/claimed-framework/component-library/discussions). Please see [VULNERABILITIES.md](VULNERABILITIES.md) for reporting vulnerabilities. @@ -99,31 +52,9 @@ Please see [VULNERABILITIES.md](VULNERABILITIES.md) for reporting vulnerabilitie Interested in helping make CLAIMED better? We encourage you to take a look at our [Contributing](CONTRIBUTING.md) page. -## License -This software is released under Apache License v2.0 - - ## Credits CLAIMED is supported by the EU’s Horizon Europe program under Grant Agreement number 101131841 and also received funding from the Swiss State Secretariat for Education, Research and Innovation (SERI) and the UK Research and Innovation (UKRI). - - -# Install - -```bash -pip install claimed -``` - -This package installs the [CLAIMED Component Library (CCL)](https://pypi.org/project/claimed/), [CLAIMED Component Compiler (C3)](https://pypi.org/project/claimed-c3/) and the [CLAIMED CLI tool](https://pypi.org/project/claimed-cli/) which can be used to run operators locally. - - -# Build & Publish -```bash -python -m build # might require a 'pip install build' -python -m twine upload --repository pypi dist/* # might require a 'pip install twine' -rm -r dist -``` - - - +## License +This software is released under Apache License v2.0. 
diff --git a/examples/example_rscript.R b/examples/example_rscript.R new file mode 100644 index 00000000..05bc0745 --- /dev/null +++ b/examples/example_rscript.R @@ -0,0 +1,15 @@ +# Reading env variables + +name <- Sys.getenv('name', 'world') + +default <- Sys.getenv('default', "default") + +number <- as.numeric(Sys.getenv('number', 10)) + +print(paste("hello", name)) + +print(number) + +# Install packages +install.packages('readr') +library(readr) diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_example.py b/examples/folder_grid_wrapper_example/folder_grid_wrapper_example.py new file mode 100644 index 00000000..03a9d79c --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_example.py @@ -0,0 +1,10 @@ +from pathlib import Path + +def grid_process(source_folder: str, target_folder: str) -> None: + src_dir = Path(source_folder) + tgt_dir = Path(target_folder) + + for src_file in sorted(src_dir.glob("*.txt")): + text = src_file.read_text(encoding="utf-8") + updated = text.replace("test", "test processed") + (tgt_dir / src_file.name).write_text(updated, encoding="utf-8") \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_1/1.txt b/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_1/1.txt new file mode 100644 index 00000000..30d74d25 --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_1/1.txt @@ -0,0 +1 @@ +test \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_1/2.txt b/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_1/2.txt new file mode 100644 index 00000000..30d74d25 --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_1/2.txt @@ -0,0 +1 @@ +test \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_1/3.txt b/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_1/3.txt new file mode 100644 index 00000000..30d74d25 --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_1/3.txt @@ -0,0 +1 @@ +test \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_2/1.txt b/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_2/1.txt new file mode 100644 index 00000000..30d74d25 --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_2/1.txt @@ -0,0 +1 @@ +test \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_2/2.txt b/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_2/2.txt new file mode 100644 index 00000000..30d74d25 --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_2/2.txt @@ -0,0 +1 @@ +test \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_2/3.txt b/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_2/3.txt new file mode 100644 index 00000000..30d74d25 --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_2/3.txt @@ -0,0 +1 @@ +test \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_3/1.txt b/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_3/1.txt new file mode 100644 index 
00000000..30d74d25 --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_3/1.txt @@ -0,0 +1 @@ +test \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_3/2.txt b/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_3/2.txt new file mode 100644 index 00000000..30d74d25 --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_3/2.txt @@ -0,0 +1 @@ +test \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_3/3.txt b/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_3/3.txt new file mode 100644 index 00000000..30d74d25 --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_3/3.txt @@ -0,0 +1 @@ +test \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_1.PROCESSED/1.txt b/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_1.PROCESSED/1.txt new file mode 100644 index 00000000..02b2ffde --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_1.PROCESSED/1.txt @@ -0,0 +1 @@ +test processed \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_1.PROCESSED/2.txt b/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_1.PROCESSED/2.txt new file mode 100644 index 00000000..02b2ffde --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_1.PROCESSED/2.txt @@ -0,0 +1 @@ +test processed \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_1.PROCESSED/3.txt b/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_1.PROCESSED/3.txt new file mode 100644 index 00000000..02b2ffde --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_1.PROCESSED/3.txt @@ -0,0 +1 @@ +test processed \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_2.PROCESSED/1.txt b/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_2.PROCESSED/1.txt new file mode 100644 index 00000000..02b2ffde --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_2.PROCESSED/1.txt @@ -0,0 +1 @@ +test processed \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_2.PROCESSED/2.txt b/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_2.PROCESSED/2.txt new file mode 100644 index 00000000..02b2ffde --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_2.PROCESSED/2.txt @@ -0,0 +1 @@ +test processed \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_2.PROCESSED/3.txt b/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_2.PROCESSED/3.txt new file mode 100644 index 00000000..02b2ffde --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_2.PROCESSED/3.txt @@ -0,0 +1 @@ +test processed \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_3.PROCESSED/1.txt b/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_3.PROCESSED/1.txt new file mode 100644 index 
00000000..02b2ffde --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_3.PROCESSED/1.txt @@ -0,0 +1 @@ +test processed \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_3.PROCESSED/2.txt b/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_3.PROCESSED/2.txt new file mode 100644 index 00000000..02b2ffde --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_3.PROCESSED/2.txt @@ -0,0 +1 @@ +test processed \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_3.PROCESSED/3.txt b/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_3.PROCESSED/3.txt new file mode 100644 index 00000000..02b2ffde --- /dev/null +++ b/examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_3.PROCESSED/3.txt @@ -0,0 +1 @@ +test processed \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/gw_folder_grid_wrapper_example.cwl b/examples/folder_grid_wrapper_example/gw_folder_grid_wrapper_example.cwl new file mode 100644 index 00000000..d7571613 --- /dev/null +++ b/examples/folder_grid_wrapper_example/gw_folder_grid_wrapper_example.cwl @@ -0,0 +1,33 @@ +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: "claimed" + +inputs: + component: + type: string + default: local/claimed-gw-folder-grid-wrapper-example:0.1 + inputBinding: + position: 1 + prefix: --component + log_level: + type: string + default: "INFO" + inputBinding: + position: 2 + prefix: --log_level + sgw_source_folder: + type: string + default: None + inputBinding: + position: 3 + prefix: --sgw_source_folder + sgw_target_folder: + type: string + default: "sgw_source_folder" + inputBinding: + position: 4 + prefix: --sgw_target_folder + + +outputs: [] diff --git a/examples/folder_grid_wrapper_example/gw_folder_grid_wrapper_example.job.yaml b/examples/folder_grid_wrapper_example/gw_folder_grid_wrapper_example.job.yaml new file mode 100644 index 00000000..1260f050 --- /dev/null +++ b/examples/folder_grid_wrapper_example/gw_folder_grid_wrapper_example.job.yaml @@ -0,0 +1,22 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: gw-folder-grid-wrapper-example +spec: + template: + spec: + containers: + - name: gw-folder-grid-wrapper-example + image: local/claimed-gw-folder-grid-wrapper-example:0.1 + workingDir: /opt/app-root/src/ + command: ["/opt/app-root/bin/python","examples/folder_grid_wrapper_example/claimed_gw_folder_grid_wrapper_example.py"] + env: + - name: log_level + value: value_of_log_level + - name: sgw_source_folder + value: value_of_sgw_source_folder + - name: sgw_target_folder + value: value_of_sgw_target_folder + restartPolicy: OnFailure + imagePullSecrets: + - name: image_pull_secret \ No newline at end of file diff --git a/examples/folder_grid_wrapper_example/gw_folder_grid_wrapper_example.yaml b/examples/folder_grid_wrapper_example/gw_folder_grid_wrapper_example.yaml new file mode 100644 index 00000000..c56a36c7 --- /dev/null +++ b/examples/folder_grid_wrapper_example/gw_folder_grid_wrapper_example.yaml @@ -0,0 +1,23 @@ +name: gw-folder-grid-wrapper-example +description: "component_folder_grid_wrapper_example got wrapped by folder_grid_wrapper, which wraps any CLAIMED component and implements folder-level locking. This folder grid wrapper scans immediate subdirectories of sgw_source_folder and for each folder the grid_process function is called once. 
Locking is achieved by creating files in the target directory using the pattern .{STATUS} where STATUS in: LOCKED PROCESSED FAILED CLAIMED component description: component-folder-grid-wrapper-example – CLAIMED V0.1" + +inputs: +- {name: log_level, type: String, description: "update log level", default: "INFO"} +- {name: sgw_source_folder, type: String, description: "folder containing input data in single files or subfolders"} +- {name: sgw_target_folder, type: String, description: "Default: sgw_source_folder. If equal, entries containing LOCKED or PROCESSED or FAILED are ignored.", default: "sgw_source_folder"} + + +outputs: + + +implementation: + container: + image: local/claimed-gw-folder-grid-wrapper-example:0.1 + command: + - sh + - -ec + - | + python ./examples/folder_grid_wrapper_example/claimed_gw_folder_grid_wrapper_example.py log_level="${0}" sgw_source_folder="${1}" sgw_target_folder="${2}" + - {inputValue: log_level} + - {inputValue: sgw_source_folder} + - {inputValue: sgw_target_folder} diff --git a/examples/gw_simple_grid_wrapper_example.cwl b/examples/gw_simple_grid_wrapper_example.cwl new file mode 100644 index 00000000..9ae91cab --- /dev/null +++ b/examples/gw_simple_grid_wrapper_example.cwl @@ -0,0 +1,33 @@ +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: "claimed" + +inputs: + component: + type: string + default: local/claimed-gw-simple-grid-wrapper-example:0.1 + inputBinding: + position: 1 + prefix: --component + log_level: + type: string + default: "INFO" + inputBinding: + position: 2 + prefix: --log_level + sgw_source_folder: + type: string + default: None + inputBinding: + position: 3 + prefix: --sgw_source_folder + sgw_target_folder: + type: string + default: "sgw_source_folder" + inputBinding: + position: 4 + prefix: --sgw_target_folder + + +outputs: [] diff --git a/examples/gw_simple_grid_wrapper_example.job.yaml b/examples/gw_simple_grid_wrapper_example.job.yaml new file mode 100644 index 00000000..cbb8e7e4 --- /dev/null +++ b/examples/gw_simple_grid_wrapper_example.job.yaml @@ -0,0 +1,22 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: gw-simple-grid-wrapper-example +spec: + template: + spec: + containers: + - name: gw-simple-grid-wrapper-example + image: local/claimed-gw-simple-grid-wrapper-example:0.1 + workingDir: /opt/app-root/src/ + command: ["/opt/app-root/bin/python","claimed_gw_simple_grid_wrapper_example.py"] + env: + - name: log_level + value: value_of_log_level + - name: sgw_source_folder + value: value_of_sgw_source_folder + - name: sgw_target_folder + value: value_of_sgw_target_folder + restartPolicy: OnFailure + imagePullSecrets: + - name: image_pull_secret \ No newline at end of file diff --git a/examples/gw_simple_grid_wrapper_example.py b/examples/gw_simple_grid_wrapper_example.py new file mode 100644 index 00000000..1560e48b --- /dev/null +++ b/examples/gw_simple_grid_wrapper_example.py @@ -0,0 +1,105 @@ +""" +component_simple_grid_wrapper_example got wrapped by grid_wrapper, which wraps any CLAIMED component and implements the generic grid computing pattern https://romeokienzler.medium.com/the-generic-grid-computing-pattern-transforms-any-sequential-workflow-step-into-a-transient-grid-c7f3ca7459c8 +This simple grid wrapper just scans a folder and for each file the grid_process function is called. 
Locking is achieved the following way: +Given source file1.ext is processed, simple_grid_wrapper creates files in the target_directory following the pattern file1.{STATUS}.ext where STATUS in: +LOCKED +PROCESSED +FAILED + + +CLAIMED component description: component-simple-grid-wrapper-example +""" + +# pip install pandas + +# component dependencies +# + +import os +import json +import random +import logging +import time +import glob +from pathlib import Path +import pandas as pd + +# import component code +from component_simple_grid_wrapper_example import * + + +#folder containing input data in single files +sgw_source_folder = os.environ.get('sgw_source_folder') + +#folder to store the output data in single files. Default: sgw_source_folder, in case sgw_source_folder==sgw_target_folder, files containing .LOCKED., .PROCESSED., .FAILED. are ignored +sgw_target_folder = os.environ.get('sgw_target_folder', sgw_source_folder) + +# component interface + + +def get_next_batch(): + files = os.listdir(sgw_source_folder) + if sgw_source_folder == sgw_target_folder: + files = [ + f for f in files + if not any(keyword in f for keyword in ["LOCKED", "PROCESSED", "FAILED"]) + ] + + # Filter files and check if corresponding target file exists + filtered_files = [] + for file in files: + file_name, file_ext = os.path.splitext(file) + + # Create target file names with LOCKED, PROCESSED, FAILED extensions + target_file_locked = f"{file_name}.LOCKED{file_ext}" + target_file_processed = f"{file_name}.PROCESSED{file_ext}" + target_file_failed = f"{file_name}.FAILED{file_ext}" + + # Check if any of the target files exists + if not any( + os.path.exists(os.path.join(sgw_target_folder, target_file)) + for target_file in [target_file_locked, target_file_processed, target_file_failed] + ): + filtered_files.append(file) + + if filtered_files: + return random.choice(filtered_files) + else: + return None + + +def process_wrapper(sub_process): + sgw_target_folder_path = Path(sgw_target_folder) + sgw_target_folder_path.mkdir(exist_ok=True, parents=True) + + while True: + file_to_process = get_next_batch() + logging.info(f"Processing batch: {file_to_process}") + if file_to_process is None: + break + + file_name = Path(file_to_process).stem + file_ext = Path(file_to_process).suffix + locked_file = sgw_target_folder+f"/{file_name}.LOCKED{file_ext}" + locked_file_path = Path(locked_file) + + try: + locked_file_path.touch() + sub_process(sgw_source_folder +'/'+ file_to_process, locked_file) + processed_file = sgw_target_folder+f"/{file_name}.PROCESSED{file_ext}" + locked_file_path.rename(processed_file) + + except Exception as e: + failed_file = sgw_target_folder+f"/{file_name}.FAILED{file_ext}" + locked_file_path.rename(failed_file) + + with open(failed_file, 'w') as f: + f.write(f"Exception occurred: {str(e)}\n") + + logging.error(f"Processing failed for {file_to_process}: {str(e)}") + + logging.info("Finished processing all batches.") + + +if __name__ == '__main__': + process_wrapper(grid_process) diff --git a/examples/gw_simple_grid_wrapper_example.yaml b/examples/gw_simple_grid_wrapper_example.yaml new file mode 100644 index 00000000..da527cdd --- /dev/null +++ b/examples/gw_simple_grid_wrapper_example.yaml @@ -0,0 +1,23 @@ +name: gw-simple-grid-wrapper-example +description: "component_simple_grid_wrapper_example got wrapped by grid_wrapper, which wraps any CLAIMED component and implements the generic grid computing pattern 
https://romeokienzler.medium.com/the-generic-grid-computing-pattern-transforms-any-sequential-workflow-step-into-a-transient-grid-c7f3ca7459c8 This simple grid wrapper just scans a folder and for each file the grid_process function is called. Locking is achieved the following way: Given source file1.ext is processed, simple_grid_wrapper creates files in the target_directory following the pattern file1.{STATUS}.ext where STATUS in: LOCKED PROCESSED FAILED CLAIMED component description: component-simple-grid-wrapper-example – CLAIMED V0.1" + +inputs: +- {name: log_level, type: String, description: "update log level", default: "INFO"} +- {name: sgw_source_folder, type: String, description: "folder containing input data in single files"} +- {name: sgw_target_folder, type: String, description: "folder to store the output data in single files. Default: sgw_source_folder, in case sgw_source_folder==sgw_target_folder, files containing .LOCKED., .PROCESSED., .FAILED. are ignored", default: "sgw_source_folder"} + + +outputs: + + +implementation: + container: + image: local/claimed-gw-simple-grid-wrapper-example:0.1 + command: + - sh + - -ec + - | + python ./claimed_gw_simple_grid_wrapper_example.py log_level="${0}" sgw_source_folder="${1}" sgw_target_folder="${2}" + - {inputValue: log_level} + - {inputValue: sgw_source_folder} + - {inputValue: sgw_target_folder} diff --git a/examples/operator_example.cwl b/examples/operator_example.cwl new file mode 100644 index 00000000..ed1a5f1b --- /dev/null +++ b/examples/operator_example.cwl @@ -0,0 +1,47 @@ +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: "claimed" + +inputs: + component: + type: string + default: us.ico.io/geodn/claimed-operator-example:0.2 + inputBinding: + position: 1 + prefix: --component + log_level: + type: string + default: "INFO" + inputBinding: + position: 2 + prefix: --log_level + input_path: + type: string + default: None + inputBinding: + position: 3 + prefix: --input_path + with_default: + type: string + default: "default_value" + inputBinding: + position: 4 + prefix: --with_default + num_values: + type: string + default: "5" + inputBinding: + position: 5 + prefix: --num_values + output_path: + type: string + default: "None" + inputBinding: + position: 6 + prefix: --output_path + + + + +outputs: [] \ No newline at end of file diff --git a/examples/operator_example.ipynb b/examples/operator_example.ipynb new file mode 100644 index 00000000..c32380ad --- /dev/null +++ b/examples/operator_example.ipynb @@ -0,0 +1,81 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# operator_example" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Please update the description of the operator in this markdown cell" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add any requirements (Hint: pip install -r requirements.txt is supported as well)\n", + "# commenting out the pip install command is supported as well\n", + "#!pip install numpy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import logging\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Add the operator interface.\n", + "# A comment one line above os.getenv is the description of this variable.\n", + "input_path = os.getenv('input_path')\n", + "\n", + "# If you specify a 
default value, this parameter gets marked as optional\n", + "with_default = os.getenv('with_default', 'default_value')\n", + "\n", + "# You can cast to a specific type with int(), float(), or bool() - this type information propagates down to the execution engines (e.g., Kubeflow)\n", + "num_values = int(os.getenv('num_values', 5))\n", + "\n", + "# Output paths are starting with \"output_\".\n", + "output_path = os.getenv('output_path', None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Add your code.\n", + "random_values = np.random.rand(num_values)\n", + "\n", + "# C3 adds setup code to your notebook which initializes the logging.\n", + "# You can just use logging.debug(), logging.info(), logging.warning() in your code.\n", + "logging.info(f'Random values: {random_values}')" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/operator_example.job.yaml b/examples/operator_example.job.yaml new file mode 100644 index 00000000..3a1d8366 --- /dev/null +++ b/examples/operator_example.job.yaml @@ -0,0 +1,26 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: operator-example +spec: + template: + spec: + containers: + - name: operator-example + image: us.ico.io/geodn/claimed-operator-example:0.2 + workingDir: /opt/app-root/src/ + command: ["/opt/app-root/bin/python","claimed_operator_example.py"] + env: + - name: log_level + value: value_of_log_level + - name: input_path + value: value_of_input_path + - name: with_default + value: value_of_with_default + - name: num_values + value: value_of_num_values + - name: output_path + value: value_of_output_path + restartPolicy: OnFailure + imagePullSecrets: + - name: image_pull_secret \ No newline at end of file diff --git a/examples/operator_example.py b/examples/operator_example.py new file mode 100644 index 00000000..abff15c9 --- /dev/null +++ b/examples/operator_example.py @@ -0,0 +1,59 @@ +# TODO: Rename the file to the desired operator name. +""" +TODO: Update the description of the operator in the first doc string. +This is the operator description. +The file name becomes the operator name. +""" + +# TODO: Update the required pip packages. +# pip install numpy + +import os +import logging +import numpy as np + +# TODO: Add the operator interface. +# A comment one line above os.getenv is the description of this variable. +input_path = os.getenv('input_path') + +# If you specify a default value, this parameter gets marked as optional +with_default = os.getenv('with_default', 'default_value') + +# You can cast to a specific type with int(), float(), or bool() - this type information propagates down to the execution engines (e.g., Kubeflow) +num_values = int(os.getenv('num_values', 5)) + +# Output paths are starting with "output_". +output_path = os.getenv('output_path', None) + + +# You can call a function from an additional file (must be in the same directory) or add your code here. +def main(num_values, *args, **kwargs): + # TODO: Add your code. + random_values = np.random.rand(num_values) + # C3 adds setup code to your script which initializes the logging. + # You can just use logging.debug(), logging.info(), logging.warning() in your code. + logging.info(f'Random values: {random_values}') + + +# It is recommended to use a main block to avoid unexpected code execution. +if __name__ == '__main__': + main(num_values) + + +# TODO: Add a grid process if you want to parallelize your code. 
+def grid_process(batch_id, input_path, with_default, num_values, output_path): + """ + A process for the c3 grid wrapper. The process gets the batch name as the first positional argument, + followed by all interface variables. This is only possible if the code can be processed in parallel, + e.g., by splitting up input files. + """ + + # You might need to update the variables based on the batch + input_path += batch_id + '*.json' + output_path += batch_id + '_data.csv' + + # Execute the processing with adjusted variables + main(num_values, input_path, output_path) + + # optionally return a string or list with output files + return output_path \ No newline at end of file diff --git a/examples/operator_example.yaml b/examples/operator_example.yaml new file mode 100644 index 00000000..7fb4b75f --- /dev/null +++ b/examples/operator_example.yaml @@ -0,0 +1,27 @@ +name: operator-example +description: "TODO: Update the description of the operator in the first doc string. This is the operator description. The file name becomes the operator name. – CLAIMED V0.1" + +inputs: +- {name: log_level, type: String, description: "update log level", default: "INFO"} +- {name: input_path, type: String, description: "A comment one line above os.getenv is the description of this variable."} +- {name: with_default, type: String, description: "If you specify a default value, this parameter gets marked as optional", default: "default_value"} +- {name: num_values, type: Integer, description: "You can cast to a specific type with int(), float(), or bool() - this type information propagates down to the execution engines (e.g., Kubeflow)", default: "5"} +- {name: output_path, type: String, description: "Output paths are starting with 'output_'.", default: "None"} + + +outputs: + + +implementation: + container: + image: us.ico.io/geodn/claimed-operator-example:0.2 + command: + - sh + - -ec + - | + python ./claimed_operator_example.py log_level="${0}" input_path="${1}" with_default="${2}" num_values="${3}" output_path="${4}" + - {inputValue: log_level} + - {inputValue: input_path} + - {inputValue: with_default} + - {inputValue: num_values} + - {inputValue: output_path} diff --git a/examples/pipeline_example.py b/examples/pipeline_example.py new file mode 100644 index 00000000..4b772158 --- /dev/null +++ b/examples/pipeline_example.py @@ -0,0 +1,54 @@ +""" +# TODO: Update description +Tekton pipeline for with the following steps: +1. 
Step 1 +""" + +# TODO: Install kfp +# pip install kfp +# pip install kfp-tekton + +import kfp +import kfp.dsl as dsl +import kfp.components as comp +from kfp_tekton.compiler import TektonCompiler + +# TODO: Add your pipeline components based on the kfp yaml file from CLAIMED +# initialize operator from yaml file +component_op = comp.load_component_from_file('.yaml') +# initialize operator from remote file +web_op = comp.load_component_from_url('https://raw.githubusercontent.com/claimed-framework/component-library/main/component-library/.yaml') + + +# TODO: Update pipeline description, function name, and parameters +pipeline_name = 'my_pipeline' +# Pipeline function +@dsl.pipeline( + name=pipeline_name, + description="Pipeline description" +) +def my_pipeline( + parameter1: str = "default_value", + parameter2: str = "default_value", +): + # TODO: Add the components and the required parameters + step1 = component_op( + parameter1=parameter1, + ) + step2 = web_op( + parameter2=parameter2, + ) + + # TODO: You can call multiple steps and create the dependencies + step2.after(step1) + +# TODO: Update pipeline function +# Kubernetes +kfp.compiler.Compiler().compile(pipeline_func=my_pipeline, package_path=f'{pipeline_name}.yaml') +# OpenShift with Tekton +TektonCompiler().compile(my_pipeline, f'{pipeline_name}.yaml') + +print(f'Saved pipeline in {pipeline_name}.yaml') + +# TODO: Run script with python +# TODO: Upload the yaml to KubeFlow diff --git a/examples/simple_grid_wrapper_example.py b/examples/simple_grid_wrapper_example.py new file mode 100644 index 00000000..5a2e23b0 --- /dev/null +++ b/examples/simple_grid_wrapper_example.py @@ -0,0 +1,6 @@ +# append processed to each line +def grid_process(source_file, target_file): + with open(source_file, 'r') as src, open(target_file, 'w') as tgt: + for line in src: + processed_line = line.strip() + ' processed\n' + tgt.write(processed_line) \ No newline at end of file diff --git a/examples/simple_grid_wrapper_source/1.txt b/examples/simple_grid_wrapper_source/1.txt new file mode 100644 index 00000000..9daeafb9 --- /dev/null +++ b/examples/simple_grid_wrapper_source/1.txt @@ -0,0 +1 @@ +test diff --git a/examples/simple_grid_wrapper_source/2.txt b/examples/simple_grid_wrapper_source/2.txt new file mode 100644 index 00000000..9daeafb9 --- /dev/null +++ b/examples/simple_grid_wrapper_source/2.txt @@ -0,0 +1 @@ +test diff --git a/examples/simple_grid_wrapper_source/3.txt b/examples/simple_grid_wrapper_source/3.txt new file mode 100644 index 00000000..9daeafb9 --- /dev/null +++ b/examples/simple_grid_wrapper_source/3.txt @@ -0,0 +1 @@ +test diff --git a/examples/simple_grid_wrapper_source_and_target/1.txt b/examples/simple_grid_wrapper_source_and_target/1.txt new file mode 100644 index 00000000..9daeafb9 --- /dev/null +++ b/examples/simple_grid_wrapper_source_and_target/1.txt @@ -0,0 +1 @@ +test diff --git a/examples/simple_grid_wrapper_source_and_target/2.txt b/examples/simple_grid_wrapper_source_and_target/2.txt new file mode 100644 index 00000000..9daeafb9 --- /dev/null +++ b/examples/simple_grid_wrapper_source_and_target/2.txt @@ -0,0 +1 @@ +test diff --git a/examples/simple_grid_wrapper_source_and_target/3.txt b/examples/simple_grid_wrapper_source_and_target/3.txt new file mode 100644 index 00000000..9daeafb9 --- /dev/null +++ b/examples/simple_grid_wrapper_source_and_target/3.txt @@ -0,0 +1 @@ +test diff --git a/examples/simple_grid_wrapper_target/1.PROCESSED.txt b/examples/simple_grid_wrapper_target/1.PROCESSED.txt new file mode 100644 index 
00000000..a7ebda21 --- /dev/null +++ b/examples/simple_grid_wrapper_target/1.PROCESSED.txt @@ -0,0 +1 @@ +test processed diff --git a/examples/simple_grid_wrapper_target/2.PROCESSED.txt b/examples/simple_grid_wrapper_target/2.PROCESSED.txt new file mode 100644 index 00000000..a7ebda21 --- /dev/null +++ b/examples/simple_grid_wrapper_target/2.PROCESSED.txt @@ -0,0 +1 @@ +test processed diff --git a/examples/simple_grid_wrapper_target/3.PROCESSED.txt b/examples/simple_grid_wrapper_target/3.PROCESSED.txt new file mode 100644 index 00000000..a7ebda21 --- /dev/null +++ b/examples/simple_grid_wrapper_target/3.PROCESSED.txt @@ -0,0 +1 @@ +test processed diff --git a/examples/workflow_example.cwl b/examples/workflow_example.cwl new file mode 100644 index 00000000..40bf2a09 --- /dev/null +++ b/examples/workflow_example.cwl @@ -0,0 +1,29 @@ +#!/usr/bin/env cwl-runner + + +cwlVersion: v1.2 + +# What type of CWL process we have in this document. +#class: CommandLineTool + +class: Workflow + +inputs: + num_values: string + + +outputs: [] + +steps: + example1: + run: operator_example.cwl + in: + num_values: num_values + out: [] + + example2: + run: operator_example.cwl + in: + num_values: num_values + out: [] + diff --git a/pyproject.toml b/pyproject.toml index 717fa386..d8440d0c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,11 +3,13 @@ requires = ["setuptools>=61.0", "setuptools_scm[toml]>=6.2"] build-backend = "setuptools.build_meta" [tool.setuptools_scm] -version_file = "src/_version.py" +version_file = "src/c3/_version.py" [project] -name="claimed" -version = "0.2.1" +name = "claimed-c3" +dynamic = ["version"] +# test pypi version: +# version = "0.2.15" authors = [ { name="The CLAIMED authors", email="claimed-framework@proton.me"}, ] @@ -15,29 +17,34 @@ maintainers = [ { name="Romeo Kienzler", email="claimed-framework@proton.me"}, { name="Benedikt Blumenstiel"}, ] -description = "The CLAIMED framework includes a library of reusable operators, a CLI and the CLAIMED Component Compiler (C3)." 
+description = "The CLAIMED component compiler (C3) generates container images, KFP components, Kubernetes jobs, CWL Tasks, CLI applications" readme = "README.md" -requires-python=">=3.9" +requires-python = ">=3.7" license = {file = "LICENSE.txt"} -keywords = ["CLAIMED", "component-library"] +keywords = ["CLAIMED", "compiler", "KubeFlow", "Kubernetes"] classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", ] - dependencies = [ - "aiobotocore", - "botocore", - "s3fs", - "claimed-c3", - "claimed-cli", + 'nbconvert >= 7.9.2', + 'ipython >= 8.16.1', + 'traitlets >= 5.11.2', + 'pandas', ] -[tool.setuptools.packages.find] -include = ["claimed*"] - [project.urls] -"Homepage" = "https://github.com/claimed-framework/component-library" -"Bug Tracker" = "https://github.com/claimed-framework/component-library/issues" +"Homepage" = "https://github.com/claimed-framework/c3" +"Bug Tracker" = "https://github.com/claimed-framework/c3/issues" + +[project.scripts] +c3_create_operator = "c3.create_operator:main" +c3_create_containerless_operator = "c3.create_containerless_operator:main" +c3_create_gridwrapper = "c3.create_gridwrapper:main" + +[tool.setuptools.packages.find] +where = ["src"] +[tool.setuptools.package-data] +"c3.templates" = ["*"] diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/c3/__init__.py b/src/c3/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/c3/create_containerless_operator.py b/src/c3/create_containerless_operator.py new file mode 100644 index 00000000..b72f48d2 --- /dev/null +++ b/src/c3/create_containerless_operator.py @@ -0,0 +1,94 @@ +import argparse +import os +import sys +import logging +import subprocess +import re +from c3.create_operator import create_cwl_component +from c3.pythonscript import Pythonscript +from c3.templates import component_setup_code_wo_logging, python_component_setup_code + +def create_containerless_operator( + file_path, + version, + skip_logging = False + ): + + if version is None: + version = 'latest' + + logging.debug(f'Called create_containerless_operator {version} with {file_path}') + + filename, file_extension = os.path.splitext(file_path) + + if file_extension != '.py': + raise NotImplementedError('Containerless operators currently only support python scripts') + + all_pip_packages_found = '' + with open(file_path, 'r') as file: + for line in file: + if re.search('pip ', line): + pip_packages = re.sub('[#, ,!]*pip[ ]*install[ ]*', '', line) + logging.debug(f'PIP packages found: {pip_packages}') + all_pip_packages_found += (f' {pip_packages}') + logging.info(f'all PIP packages found: {all_pip_packages_found}') + + + # prepend init code to script + target_code = 'runnable.py' + + if os.path.exists(target_code): + os.remove(target_code) + + with open(file_path, 'r') as f: + script = f.read() + if skip_logging: + script = component_setup_code_wo_logging + script + else: + script = python_component_setup_code + script + with open(target_code, 'w') as f: + f.write(script) + + subprocess.run(';'.join(['rm -Rf claimedenv','python -m venv claimedenv', + 'source ./claimedenv/bin/activate', + f'pip install {all_pip_packages_found.strip()}', + 'pip list', + f'zip -r claimed-{filename}:{version}.zip {target_code} claimedenv', + 'rm -Rf claimedenv', + f'rm {target_code}']), shell=True) + script_data = Pythonscript(file_path) + inputs = script_data.get_inputs() + outputs = 
script_data.get_outputs() + + create_cwl_component(filename, "containerless", version, file_path, inputs, outputs) + + + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('FILE_PATH', type=str, + help='Path to python script or notebook') + parser.add_argument('ADDITIONAL_FILES', type=str, nargs='*', default=None, + help='Paths to additional files to include in the container image') + parser.add_argument('-v', '--version', type=str, default=None, + help='Container image version. Auto-increases the version number if not provided (default 0.1)') + parser.add_argument('-l', '--log_level', type=str, default='INFO') + args = parser.parse_args() + + # Init logging + root = logging.getLogger() + root.setLevel(args.log_level) + handler = logging.StreamHandler(sys.stdout) + formatter = logging.Formatter('%(levelname)s - %(message)s') + handler.setFormatter(formatter) + handler.setLevel(args.log_level) + root.addHandler(handler) + + create_containerless_operator( + file_path=args.FILE_PATH, + version=args.version, + ) + +if __name__ == '__main__': + main() diff --git a/src/c3/create_gridwrapper.py b/src/c3/create_gridwrapper.py new file mode 100644 index 00000000..e8184ea3 --- /dev/null +++ b/src/c3/create_gridwrapper.py @@ -0,0 +1,256 @@ +import logging +import os +import argparse +import sys +from string import Template +from c3.pythonscript import Pythonscript +from c3.utils import convert_notebook +from c3.create_operator import create_operator +from c3.templates import component_setup_code_wo_logging +import c3 + + +def wrap_component(component_path, + component_description, + component_dependencies, + component_interface, + component_inputs, + component_process, + backend, + ): + # get component name from path + component_name = os.path.splitext(os.path.basename(component_path))[0] + + logging.info(f'Using backend: {backend}') + + backends = { + 'local': c3.templates.grid_wrapper_template, + 'cos': c3.templates.cos_grid_wrapper_template, + 'legacy_cos': c3.templates.legacy_cos_grid_wrapper_template, + 's3kv': c3.templates.s3kv_grid_wrapper_template, + 'grid_wrapper': c3.templates.grid_wrapper_template, + 'cos_grid_wrapper': c3.templates.cos_grid_wrapper_template, + 'legacy_cos_grid_wrapper': c3.templates.legacy_cos_grid_wrapper_template, + 's3kv_grid_wrapper': c3.templates.s3kv_grid_wrapper_template, + 'simple_grid_wrapper': c3.templates.simple_grid_wrapper_template, + 'folder_grid_wrapper': c3.templates.folder_grid_wrapper_template, + } + gw_template = backends.get(backend) + + logging.debug(f'Using backend template: {gw_template}') + + grid_wrapper_code = gw_template.substitute( + component_name=component_name, + component_description=component_description, + component_dependencies=component_dependencies, + component_inputs=component_inputs, + component_interface=component_interface, + component_process=component_process, + ) + + # Write edited code to file + grid_wrapper_file = f'gw_{component_name}.py' + grid_wrapper_file_path = os.path.join(os.path.dirname(component_path), grid_wrapper_file) + # remove 'component_' from gw path + grid_wrapper_file_path = grid_wrapper_file_path.replace('component_', '') + with open(grid_wrapper_file_path, 'w') as f: + f.write(grid_wrapper_code) + + logging.info(f'Saved wrapped component to {grid_wrapper_file_path}') + + return grid_wrapper_file_path + + +def get_component_elements(file_path): + # get required elements from component code + py = Pythonscript(file_path) + # convert description into a string with a single line + 
description = (py.get_description().replace('\n', ' ').replace('"', '\'')) + inputs = py.get_inputs() + outputs = py.get_outputs() + dependencies = py.get_requirements() + + # combine dependencies list + dependencies = '\n# '.join(dependencies) + + # generate interface code from inputs + interface = '' + type_to_func = {'String': '', 'Boolean': 'bool', 'Integer': 'int', 'Float': 'float'} + for variable, d in inputs.items(): + interface += f"# {d['description']}\n" + if (d['type'] == 'String' and d['default'] is not None and + (d['default'] == '' or d['default'][0] not in '\'\"')): + # Add quotation marks + d['default'] = "'" + d['default'] + "'" + interface += f"component_{variable} = {type_to_func[d['type']]}(os.getenv('{variable}', {d['default']}))\n" + + # TODO: Implement output interface + if len(outputs) > 0: + logging.warning('Found output paths in the component code which is currently not supported.') + + # generate kwargs for the subprocesses + process_inputs = ', '.join([f'{i}=component_{i}' for i in inputs.keys()]) + # use log level from grid wrapper + process_inputs = process_inputs.replace('component_log_level', 'log_level') + + return description, interface, process_inputs, dependencies + + +# Adding code +def edit_component_code(file_path, component_process): + file_name = os.path.basename(file_path) + if file_path.endswith('.ipynb'): + logging.info('Convert notebook to python script') + target_file = convert_notebook(file_path) + file_path = target_file + file_name = os.path.basename(file_path) + else: + # write edited code to different file + target_file = os.path.join(os.path.dirname(file_path), 'component_' + file_name.replace('-', '_')) + + target_file_name = os.path.basename(target_file) + + with open(file_path, 'r') as f: + script = f.read() + assert component_process in script, (f'Did not find the grid process {component_process} in the script. ' + f'Please provide the grid process in the arguments `-p `.') + # Add code for logging and cli parameters to the beginning of the script + script = component_setup_code_wo_logging + script + # replace old filename with new file name + script = script.replace(file_name, target_file_name) + with open(target_file, 'w') as f: + f.write(script) + + if '__main__' not in script: + logging.warning('No __main__ found in component code. Grid wrapper will import functions from component, ' + 'which can lead to unexpected behaviour without using __main__.') + + logging.info('Saved component python script in ' + target_file) + + return target_file + + +def apply_grid_wrapper(file_path, component_process, backend): + assert file_path.endswith('.py') or file_path.endswith('.ipynb'), \ + "Please provide a component file path to a python script or notebook." 
+ + file_path = edit_component_code(file_path, component_process) + + description, interface, inputs, dependencies = get_component_elements(file_path) + + component_elements = dict( + component_path=file_path, + component_description=description, + component_dependencies=dependencies, + component_interface=interface, + component_inputs=inputs, + component_process=component_process + ) + + logging.debug('Wrap component with parameters:') + for component, value in component_elements.items(): + logging.debug(component + ':\n' + str(value) + '\n') + + logging.info('Wrap component') + grid_wrapper_file_path = wrap_component(backend=backend, **component_elements) + return grid_wrapper_file_path, file_path + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('FILE_PATH', type=str, + help='Path to python script or notebook') + parser.add_argument('ADDITIONAL_FILES', type=str, nargs='*', + help='List of paths to additional files to include in the container image') + parser.add_argument('-p', '--component_process', type=str, default='grid_process', + help='Name of the component sub process that is executed for each batch.') + parser.add_argument('-b', '--backend', type=str, default='local', + help='Define backend. Default: local. Others: cos, s3kv, legacy_cos (with automatic file download/upload)') + parser.add_argument('-r', '--repository', type=str, default=None, + help='Container registry address, e.g. docker.io/') + parser.add_argument('-v', '--version', type=str, default=None, + help='Container image version. Auto-increases the version number if not provided (default 0.1)') + parser.add_argument('--rename', type=str, nargs='?', default=None, const='', + help='Rename existing yaml files (argument without value leads to modified_{file name})') + parser.add_argument('--overwrite', action='store_true', help='Overwrite existing yaml files') + parser.add_argument('-l', '--log_level', type=str, default='INFO') + parser.add_argument('--dockerfile_template_path', type=str, default='', + help='Path to custom dockerfile template') + parser.add_argument('--dockerfile', type=str, default='Dockerfile.generated', + help='Name or path of the generated dockerfile.') + parser.add_argument('--local_mode', action='store_true', + help='Continue processing after docker errors.') + parser.add_argument('--no-cache', action='store_true', help='Not using cache for docker build.') + parser.add_argument('--skip-logging', action='store_true', + help='Exclude logging code from component setup code') + parser.add_argument('--keep-generated-files', action='store_true', + help='Do not delete temporary generated files.') + parser.add_argument('--platform', type=str, default='linux/amd64', + help='Select image platform, default is linux/amd64. 
Alternatively, select linux/arm64.') + parser.add_argument('--image_version', type=str, default='python3.12', + help='Select python or R version (defaults to python3.12).') + + args = parser.parse_args() + + # Init logging + root = logging.getLogger() + root.setLevel(args.log_level) + handler = logging.StreamHandler(sys.stdout) + formatter = logging.Formatter('%(levelname)s - %(message)s') + handler.setFormatter(formatter) + handler.setLevel(args.log_level) + root.addHandler(handler) + + grid_wrapper_file_path = component_path = '' + try: + grid_wrapper_file_path, component_path = apply_grid_wrapper( + file_path=args.FILE_PATH, + component_process=args.component_process, + backend=args.backend, + ) + + logging.info('Generate CLAIMED operator for grid wrapper') + + # Add component path and init file path to additional_files + args.ADDITIONAL_FILES.append(component_path) + + # Update dockerfile template if specified + if args.dockerfile_template_path != '': + logging.info(f'Uses custom dockerfile template from {args.dockerfile_template_path}') + with open(args.dockerfile_template_path, 'r') as f: + custom_dockerfile_template = Template(f.read()) + else: + custom_dockerfile_template = None + + create_operator( + file_path=grid_wrapper_file_path, + repository=args.repository, + version=args.version, + custom_dockerfile_template=custom_dockerfile_template, + additional_files=args.ADDITIONAL_FILES, + log_level=args.log_level, + local_mode=args.local_mode, + no_cache=args.no_cache, + overwrite_files=args.overwrite, + rename_files=args.rename, + skip_logging=args.skip_logging, + keep_generated_files=args.keep_generated_files, + platform=args.platform, + dockerfile=args.dockerfile, + image_version=args.image_version, + ) + except Exception as err: + logging.error('Error while generating CLAIMED grid wrapper. 
' + 'Consider using `--log_level DEBUG` and `--keep-generated-files` for debugging.') + raise err + finally: + if not args.keep_generated_files: + logging.info('Remove local component file and grid wrapper code.') + if os.path.isfile(grid_wrapper_file_path): + os.remove(grid_wrapper_file_path) + if os.path.isfile(component_path): + os.remove(component_path) + + +if __name__ == '__main__': + main() diff --git a/src/c3/create_operator.py b/src/c3/create_operator.py new file mode 100644 index 00000000..0e2bb738 --- /dev/null +++ b/src/c3/create_operator.py @@ -0,0 +1,525 @@ + +import os +import sys +import logging +import shutil +import argparse +import subprocess +import glob +import re +import json +from pathlib import Path +from string import Template +from typing import Optional +from c3.pythonscript import Pythonscript +from c3.notebook import Notebook +from c3.rscript import Rscript +from c3.utils import convert_notebook, get_image_version +from c3.templates import (python_component_setup_code, component_setup_code_wo_logging, r_component_setup_code, + python_dockerfile_template, r_dockerfile_template, + kfp_component_template, kubernetes_job_template, cwl_component_template) + +CLAIMED_VERSION = 'V0.1' + + +def create_dockerfile(dockerfile_template, dockerfile, requirements, target_code, target_dir, additional_files, + working_dir, command, image_version): + # Check for requirements file + for i in range(len(requirements)): + if '-r ' in requirements[i]: + r_file_search = re.search('-r ~?\/?([^\s]*\.txt)', requirements[i]) + if len(r_file_search.groups()): + # Get file from regex + requirements_file = r_file_search.groups()[0] + if requirements_file not in additional_files and os.path.isfile(requirements_file): + # Add missing requirements text file to additional files + additional_files.append(r_file_search.groups()[0]) + if '/' not in requirements[i]: + # Add missing home directory to the command `pip install -r ~/requirements.txt` + requirements[i] = requirements[i].replace('-r ', '-r ~/') + + requirements_docker = list(map(lambda s: 'RUN ' + s, requirements)) + requirements_docker = '\n'.join(requirements_docker) + additional_files_docker = list(map(lambda s: f"ADD {s} {working_dir}{s}", additional_files)) + additional_files_docker = '\n'.join(additional_files_docker) + + # Select base image + if 'python' in command: + base_image = f"registry.access.redhat.com/ubi8/python-{image_version.strip('python').replace('.', '')}" + elif command == 'Rscript': + if 'python' in image_version: + # Using default R version + image_version = 'R4.3.2' + base_image = f"r-base:{image_version.strip('Rr:')}" + else: + raise ValueError(f'Unrecognized command {command}') + logging.info(f'Using base image {base_image}') + + docker_file = dockerfile_template.substitute( + base_image=base_image, + requirements_docker=requirements_docker, + target_code=target_code, + target_dir=target_dir, + additional_files_docker=additional_files_docker, + working_dir=working_dir, + command=os.path.basename(command), + ) + + logging.info('Create Dockerfile') + with open(dockerfile, "w") as text_file: + text_file.write(docker_file) + logging.debug(f'{dockerfile}:\n' + docker_file) + + +def create_kfp_component(name, description, repository, version, command, target_code, target_dir, file_path, inputs, outputs): + + inputs_list = str() + for input, options in inputs.items(): + inputs_list += f'- {{name: {input}, type: {options["type"]}, description: "{options["description"]}"' + if options['default'] is not None: + if not 
options["default"].startswith('"'): + options["default"] = f'"{options["default"]}"' + inputs_list += f', default: {options["default"]}' + inputs_list += '}\n' + + outputs_list = str() + for output, options in outputs.items(): + outputs_list += f'- {{name: {output}, type: String, description: "{options["description"]}"}}\n' + + parameter_list = str() + for index, key in enumerate(list(inputs.keys()) + list(outputs.keys())): + parameter_list += f'{key}="${{{index}}}" ' + + parameter_values = str() + for input_key in inputs.keys(): + parameter_values += f" - {{inputValue: {input_key}}}\n" + for output_key in outputs.keys(): + parameter_values += f" - {{outputPath: {output_key}}}\n" + + yaml = kfp_component_template.substitute( + name=name, + description=description, + repository=repository, + version=version, + inputs=inputs_list, + outputs=outputs_list, + command=os.path.basename(command), + target_dir=target_dir, + target_code=target_code, + parameter_list=parameter_list, + parameter_values=parameter_values, + ) + + logging.debug('KubeFlow component yaml:\n' + yaml) + target_yaml_path = str(Path(file_path).with_suffix('.yaml')) + + logging.info(f'Write KubeFlow component yaml to {target_yaml_path}') + with open(target_yaml_path, "w") as text_file: + text_file.write(yaml) + + +def create_kubernetes_job(name, repository, version, target_code, target_dir, command, working_dir, file_path, inputs): + # get environment entries + env_entries = str() + for key in list(inputs.keys()): + env_entries += f" - name: {key}\n value: value_of_{key}\n" + env_entries = env_entries.rstrip() + + job_yaml = kubernetes_job_template.substitute( + name=name, + repository=repository, + version=version, + target_code=target_code, + target_dir=target_dir, + env_entries=env_entries, + command=command, + working_dir=working_dir, + ) + + logging.debug('Kubernetes job yaml:\n' + job_yaml) + target_job_yaml_path = str(Path(file_path).with_suffix('.job.yaml')) + + logging.info(f'Write kubernetes job yaml to {target_job_yaml_path}') + with open(target_job_yaml_path, "w") as text_file: + text_file.write(job_yaml) + + +def create_cwl_component(name, repository, version, file_path, inputs, outputs): + type_dict = {'String': 'string', 'Integer': 'int', 'Float': 'float', 'Boolean': 'bool'} + # get environment entries + i = 1 + input_envs = str() + for input, options in inputs.items(): + i += 1 + # Convert string default value to CWL types + default_value = options['default'] if options['type'] == 'String' and options['default'] != '"None"' \ + else options['default'].strip('"\'') + input_envs += (f" {input}:\n type: {type_dict[options['type']]}\n default: {default_value}\n " + f"inputBinding:\n position: {i}\n prefix: --{input}\n") + + if len(outputs) == 0: + output_envs = '[]' + else: + output_envs = '\n' + for output, options in outputs.items(): + i += 1 + output_envs += (f" {output}:\n type: string\n " + f"inputBinding:\n position: {i}\n prefix: --{output}\n") + + cwl = cwl_component_template.substitute( + name=name, + repository=repository, + version=version, + inputs=input_envs, + outputs=output_envs, + ) + + logging.debug('CWL component:\n' + cwl) + target_cwl_path = str(Path(file_path).with_suffix('.cwl')) + + logging.info(f'Write cwl component to {target_cwl_path}') + with open(target_cwl_path, "w") as text_file: + text_file.write(cwl) + + +def check_existing_files(file_path, rename_files, overwrite_files): + if rename_files is None and overwrite_files: + # Overwrite potential files + return + + target_job_yaml_path = 
Path(file_path).with_suffix('.job.yaml') + + # Check for existing job yaml + if target_job_yaml_path.is_file(): + if rename_files is None: + # Ask user + rename_files = input(f'\nFound an existing Kubernetes job file at {target_job_yaml_path}.\n' + f'ENTER to overwrite the file, write Y to rename the file to ' + f'modified_{target_job_yaml_path.name}, or provide a custom name:\n') + if rename_files.strip() == '': + # Overwrite file + return + elif rename_files.lower() == 'y': + # Default file name + new_file_name = 'modified_' + Path(file_path).name + else: + # Rename to custom name + new_file_name = rename_files + + modified_path = (target_job_yaml_path.parent / new_file_name).with_suffix('.job.yaml') + # Check if modified path exists and potentially overwrite + if modified_path.exists(): + if overwrite_files: + logging.info(f'Overwriting modified path {modified_path}.') + else: + overwrite = input(f'Modified path {modified_path} already exists. ENTER to overwrite the file.') + if overwrite != '': + logging.error(f'Abort creating operator. Please rename file manually and rerun the script.') + raise FileExistsError + + os.rename(str(target_job_yaml_path), str(modified_path)) + logging.info(f'Renamed Kubernetes job file to {modified_path}') + # TODO: Should we check other files too? Currently assuming no modification for yaml and cwl. + + +def print_claimed_command(name, repository, version, inputs): + claimed_command = f"claimed --component {repository}/claimed-{name}:{version}" + for input, options in inputs.items(): + claimed_command += f" --{input} {options['default']}" + logging.info(f'Run operators locally with claimed-cli:\n{claimed_command}') + + +def remove_temporary_files(file_path, target_code): + logging.info(f'Remove local files') + # remove temporary files + if file_path != target_code: + os.remove(target_code) + if os.path.isfile('Dockerfile'): + os.remove('Dockerfile') + + +def create_operator(file_path: str, + repository: str, + version: str, + custom_dockerfile_template: Optional[Template], + additional_files: str = None, + log_level='INFO', + local_mode=False, + no_cache=False, + rename_files=None, + overwrite_files=False, + skip_logging=False, + keep_generated_files=False, + platform='linux/amd64', + dockerfile='Dockerfile.generated', + image_version='python3.12', + ): + logging.info('Parameters: ') + logging.info('file_path: ' + file_path) + logging.info('repository: ' + str(repository)) + logging.info('version: ' + str(version)) + logging.info('additional_files: ' + '; '.join(additional_files)) + + if file_path.endswith('.py'): + # use temp file for processing + target_code = 'claimed_' + os.path.basename(file_path) + # Copy file to current working directory + shutil.copy(file_path, target_code) + # Add code for logging and cli parameters to the beginning of the script + with open(target_code, 'r') as f: + script = f.read() + if skip_logging: + script = component_setup_code_wo_logging + script + else: + script = python_component_setup_code + script + with open(target_code, 'w') as f: + f.write(script) + # getting parameter from the script + script_data = Pythonscript(target_code) + dockerfile_template = custom_dockerfile_template or python_dockerfile_template + command = '/opt/app-root/bin/python' + working_dir = '/opt/app-root/src/' + + elif file_path.endswith('.ipynb'): + # use temp file for processing + target_code = 'claimed_' + os.path.basename(file_path) + # Copy file to current working directory + shutil.copy(file_path, target_code) + with open(target_code, 'r') 
as json_file: + notebook = json.load(json_file) + # Add code for logging and cli parameters to the beginning of the notebook + notebook['cells'].insert(0, { + 'cell_type': 'code', 'execution_count': None, 'metadata': {}, 'outputs': [], + 'source': component_setup_code_wo_logging if skip_logging else python_component_setup_code}) + with open(target_code, 'w') as json_file: + json.dump(notebook, json_file) + # getting parameter from the script + script_data = Notebook(target_code) + dockerfile_template = custom_dockerfile_template or python_dockerfile_template + command = '/opt/app-root/bin/ipython' + working_dir = '/opt/app-root/src/' + + elif file_path.lower().endswith('.r'): + # use temp file for processing + target_code = 'claimed_' + os.path.basename(file_path) + # Copy file to current working directory + shutil.copy(file_path, target_code) + # Add code for logging and cli parameters to the beginning of the script + with open(target_code, 'r') as f: + script = f.read() + script = r_component_setup_code + script + with open(target_code, 'w') as f: + f.write(script) + # getting parameter from the script + script_data = Rscript(target_code) + dockerfile_template = custom_dockerfile_template or r_dockerfile_template + command = 'Rscript' + working_dir = '/home/docker/' + else: + raise NotImplementedError('Please provide a file_path to a jupyter notebook, python script, or R script.') + + name = script_data.get_name() + # convert description into a string with a single line + description = ('"' + script_data.get_description().replace('\n', ' ').replace('"', '\'') + + ' – CLAIMED ' + CLAIMED_VERSION + '"') + inputs = script_data.get_inputs() + outputs = script_data.get_outputs() + requirements = script_data.get_requirements() + # Strip 'claimed-' from name of copied temp file + if name.startswith('claimed-'): + name = name[8:] + target_dir = os.path.dirname(file_path) + # Check that the main file is within the cwd + if '../' in target_dir: + raise PermissionError(f"Forbidden path outside the docker build context: {target_dir}. " + f"Change the current working directory to include the file.") + elif target_dir != '': + target_dir += '/' + + logging.info('Operator name: ' + name) + logging.info('Description: ' + description) + logging.info('Inputs:\n' + ('\n'.join([f'{k}: {v}' for k, v in inputs.items()]))) + logging.info('Outputs:\n' + ('\n'.join([f'{k}: {v}' for k, v in outputs.items()]))) + logging.info('Requirements: ' + '; '.join(requirements)) + logging.debug(f'Target code: {target_code}') + logging.debug(f'Target directory: {target_dir}') + + # Load all additional files + logging.debug('Looking for additional files:') + additional_files_found = [] + for file_pattern in additional_files: + if '../' in file_pattern: + # Check that additional file are within the cwd + raise PermissionError(f"Forbidden path outside the docker build context: {file_pattern}. " + f"Change the current working directory to include all additional files.") + # Include files based on wildcards + files_found = glob.glob(file_pattern) + if len(files_found) == 0: + raise FileNotFoundError(f'No additional files for path {file_pattern}.') + additional_files_found.extend(files_found) + logging.debug(f'Searched for "{file_pattern}". 
Found {", ".join(files_found)}') + logging.info(f'Found {len(additional_files_found)} additional files and directories\n' + f'{", ".join(additional_files_found)}') + + create_dockerfile(dockerfile_template, dockerfile, requirements, target_code, target_dir, additional_files_found, + working_dir, command, image_version) + + if version is None: + # auto increase version based on registered images + version = get_image_version(repository, name) + + if repository is None: + if not local_mode: + logging.warning('No repository provided. The container image is only saved locally. Add `-r ` ' + 'to push the image to a container registry or run `--local_mode` to suppress this warning.') + local_mode = True + repository = 'local' + + if subprocess.run('docker buildx', shell=True, stdout=subprocess.PIPE).returncode == 0: + # Using docker buildx + logging.debug('Using docker buildx') + build_command = f'docker buildx build -f {dockerfile}' + else: + logging.debug('Using docker build. Consider installing docker-buildx.') + build_command = f'docker build -f {dockerfile}' + + logging.info(f'Building container image claimed-{name}:{version}') + try: + # Run docker build + subprocess.run( + f"{build_command} --platform {platform} -t claimed-{name}:{version} . {'--no-cache' if no_cache else ''}", + stdout=None if log_level == 'DEBUG' else subprocess.PIPE, check=True, shell=True + ) + if repository is not None: + # Run docker tag + logging.debug(f'Tagging images with "latest" and "{version}"') + subprocess.run( + f"docker tag claimed-{name}:{version} {repository}/claimed-{name}:{version}", + stdout=None if log_level == 'DEBUG' else subprocess.PIPE, check=True, shell=True, + ) + subprocess.run( + f"docker tag claimed-{name}:{version} {repository}/claimed-{name}:latest", + stdout=None if log_level == 'DEBUG' else subprocess.PIPE, check=True, shell=True, + ) + except Exception as err: + logging.error('Docker build failed. Consider running C3 with `--log_level DEBUG` to see the docker build logs.') + if not keep_generated_files: + remove_temporary_files(file_path, target_code) + raise err + logging.info(f'Successfully built image claimed-{name}:{version}') + + if local_mode: + logging.info(f'No repository provided, skip docker push.') + else: + logging.info(f'Pushing images to registry {repository}') + try: + # Run docker push + subprocess.run( + f"docker push {repository}/claimed-{name}:latest", + stdout=None if log_level == 'DEBUG' else subprocess.PIPE, check=True, shell=True, + ) + subprocess.run( + f"docker push {repository}/claimed-{name}:{version}", + stdout=None if log_level == 'DEBUG' else subprocess.PIPE, check=True, shell=True, + ) + logging.info('Successfully pushed image to registry') + except Exception as err: + logging.error(f'Could not push images to namespace {repository}. 
' + f'Please check if docker is logged in or select a namespace with access.') + if not keep_generated_files: + remove_temporary_files(file_path, target_code) + raise err + + # Check for existing files and optionally modify them before overwriting + try: + check_existing_files(file_path, rename_files, overwrite_files) + except Exception as err: + if not keep_generated_files: + remove_temporary_files(file_path, target_code) + raise err + + # Create application scripts + create_kfp_component(name, description, repository, version, command, target_code, target_dir, file_path, inputs, + outputs) + + create_kubernetes_job(name, repository, version, target_code, target_dir, command, working_dir, file_path, inputs) + + create_cwl_component(name, repository, version, file_path, inputs, outputs) + + print_claimed_command(name, repository, version, inputs) + + # Remove temp files + if not keep_generated_files: + remove_temporary_files(file_path, target_code) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('FILE_PATH', type=str, + help='Path to python script or notebook') + parser.add_argument('ADDITIONAL_FILES', type=str, nargs='*', + help='Paths to additional files to include in the container image') + parser.add_argument('-r', '--repository', type=str, default=None, + help='Container registry address, e.g. docker.io/') + parser.add_argument('-v', '--version', type=str, default=None, + help='Container image version. Auto-increases the version number if not provided (default 0.1)') + parser.add_argument('--rename', type=str, nargs='?', default=None, const='', + help='Rename existing yaml files (argument without value leads to modified_{file name})') + parser.add_argument('--overwrite', action='store_true', help='Overwrite existing yaml files') + parser.add_argument('-l', '--log_level', type=str, default='INFO') + parser.add_argument('--dockerfile_template_path', type=str, default='', + help='Path to custom dockerfile template') + parser.add_argument('--dockerfile', type=str, default='Dockerfile.generated', + help='Name or path of the generated dockerfile.') + parser.add_argument('--local_mode', action='store_true', + help='Continue processing after docker errors.') + parser.add_argument('--no-cache', action='store_true', help='Not using cache for docker build.') + parser.add_argument('--skip-logging', action='store_true', + help='Exclude logging code from component setup code') + parser.add_argument('--keep-generated-files', action='store_true', + help='Do not delete temporary generated files.') + parser.add_argument('--platform', type=str, default='linux/amd64', + help='Select image platform, default is linux/amd64. 
Alternativly, select linux/arm64".') + parser.add_argument('--image_version', type=str, default='python3.12', + help='Select python or R version (defaults to python3.12).') + + args = parser.parse_args() + + # Init logging + root = logging.getLogger() + root.setLevel(args.log_level) + handler = logging.StreamHandler(sys.stdout) + formatter = logging.Formatter('%(levelname)s - %(message)s') + handler.setFormatter(formatter) + handler.setLevel(args.log_level) + root.addHandler(handler) + + # Update dockerfile template if specified + if args.dockerfile_template_path != '': + logging.info(f'Uses custom dockerfile template from {args.dockerfile_template_path}') + with open(args.dockerfile_template_path, 'r') as f: + custom_dockerfile_template = Template(f.read()) + else: + custom_dockerfile_template = None + + create_operator( + file_path=args.FILE_PATH, + repository=args.repository, + version=args.version, + custom_dockerfile_template=custom_dockerfile_template, + additional_files=args.ADDITIONAL_FILES, + log_level=args.log_level, + local_mode=args.local_mode, + no_cache=args.no_cache, + overwrite_files=args.overwrite, + rename_files=args.rename, + skip_logging=args.skip_logging, + keep_generated_files=args.keep_generated_files, + platform=args.platform, + dockerfile=args.dockerfile, + image_version=args.image_version, + ) + + +if __name__ == '__main__': + main() diff --git a/src/c3/notebook.py b/src/c3/notebook.py new file mode 100644 index 00000000..1a5a25bb --- /dev/null +++ b/src/c3/notebook.py @@ -0,0 +1,98 @@ +import json +import re +import os +import logging +from c3.parser import ContentParser, NotebookReader + + +class Notebook(): + def __init__(self, path): + self.path = path + with open(path) as json_file: + self.notebook = json.load(json_file) + + self.name = os.path.basename(path)[:-6].replace('_', '-').lower() + + if self.notebook['cells'][1]['cell_type'] == self.notebook['cells'][2]['cell_type'] == 'markdown': + # backwards compatibility (v0.1 description was included in second cell, merge first two markdown cells) + logging.info('Merge first two markdown cells for description. ' + 'The file name is used as the operator name, not the first markdown cell.') + self.description = self.notebook['cells'][1]['source'][0] + '\n' + self.notebook['cells'][2]['source'][0] + else: + # Using second cell because first cell was added for setup code + self.description = self.notebook['cells'][1]['source'][0] + + self.inputs = self._get_input_vars() + self.outputs = self._get_output_vars() + + def _get_input_vars(self): + cp = ContentParser() + env_names = cp.parse(self.path)['inputs'] + return_value = dict() + notebook_code_lines = list(NotebookReader(self.path).read_next_code_line()) + for env_name, default in env_names.items(): + comment_line = str() + for line in notebook_code_lines: + if re.search("[\"']" + env_name + "[\"']", line): + if not comment_line.strip().startswith('#'): + # previous line was no description, reset comment_line. 
+ comment_line = '' + if comment_line == '': + logging.debug(f'Interface: No description for variable {env_name} provided.') + if re.search(r'=\s*int\(\s*os', line): + type = 'Integer' + elif re.search(r'=\s*float\(\s*os', line): + type = 'Float' + elif re.search(r'=\s*bool\(\s*os', line): + type = 'Boolean' + else: + type = 'String' + return_value[env_name] = { + 'description': comment_line.replace('#', '').replace("\"", "\'").strip(), + 'type': type, + 'default': default + } + break + comment_line = line + return return_value + + def _get_output_vars(self): + cp = ContentParser() + output_names = cp.parse(self.path)['outputs'] + # TODO: Does not check for description code + return_value = {name: { + 'description': f'Output path for {name}', + 'type': 'String', + } for name in output_names} + return return_value + + def get_requirements(self): + requirements = [] + notebook_code_lines = list(NotebookReader(self.path).read_next_code_line()) + # Add dnf install + for line in notebook_code_lines: + if re.search(r'[\s#]*dnf\s*.[^#]*', line): + if '-y' not in line: + # Adding default repo + line += ' -y' + requirements.append(line.replace('#', '').strip()) + + # Add pip install + pattern = r"^[# !]*(pip[ ]*install)[ ]*(.[^#]*)" + for line in notebook_code_lines: + result = re.findall(pattern, line) + if len(result) == 1: + requirements.append((result[0][0] + ' ' + result[0][1].strip())) + return requirements + + def get_name(self): + return self.name + + def get_description(self): + return self.description + + def get_inputs(self): + return self.inputs + + def get_outputs(self): + return self.outputs diff --git a/src/c3/operator_utils.py b/src/c3/operator_utils.py new file mode 100644 index 00000000..5f524872 --- /dev/null +++ b/src/c3/operator_utils.py @@ -0,0 +1,43 @@ +import contextlib +import logging +import os + +# converts string in form [cos|s3]://access_key_id:secret_access_key@endpoint/bucket/path to +# access_key_id, secret_access_key, endpoint, path - path includes bucket name +def explode_connection_string(cs): + if cs is None: + return None + if cs.startswith('cos') or cs.startswith('s3'): + buffer=cs.split('://')[1] + access_key_id=buffer.split('@')[0].split(':')[0] + secret_access_key=buffer.split('@')[0].split(':')[1] + endpoint=f"https://{buffer.split('@')[1].split('/')[0]}" + path='/'.join(buffer.split('@')[1].split('/')[1:]) + return (access_key_id, secret_access_key, endpoint, path) + else: + return (None, None, None, cs) + # TODO consider cs as secret and grab connection string from kubernetes + + +def run_and_log(cos_conn, log_folder, task_id, command_array): + log_root_name = time.time() + job_id = ('-').join(command_array).replace('/','-') # TODO get a unique job id + job_id = re.sub(r'[^a-zA-Z0-9]', '-', job_id) + task_id = re.sub(r'[^a-zA-Z0-9]', '-', task_id) + std_out_log_name = f'{job_id}-{task_id}-{log_root_name}-stdout.log' + std_err_log_name = f'{job_id}-{task_id}-{log_root_name}-stderr.log' + with open(std_out_log_name,'w') as so: + with open(std_err_log_name,'w') as se: + with contextlib.redirect_stdout(so): + with contextlib.redirect_stderr(se): + logging.info('-----INVOKING TASK-----------------------------------') + logging.info(f'Task ID: {task_id}') + logging.info(f'Command: {command_array}') + result = subprocess.run(command_array, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=os.environ.copy()) + output = result.stdout.decode('utf-8') + logging.info("Output:", output) + logging.info("Return code:", result.returncode) + 
cos_conn.put(std_out_log_name,os.path.join(log_folder,std_out_log_name)) + cos_conn.put(std_err_log_name,os.path.join(log_folder,std_err_log_name)) + os.remove(std_out_log_name) + os.remove(std_err_log_name) \ No newline at end of file diff --git a/src/c3/parser.py b/src/c3/parser.py new file mode 100644 index 00000000..1be4307d --- /dev/null +++ b/src/c3/parser.py @@ -0,0 +1,211 @@ +# +# Copyright 2018-2021 Elyra Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import re + +from traitlets.config import LoggingConfigurable + +from typing import TypeVar, List, Dict + +# Setup forward reference for type hint on return from class factory method. See +# https://stackoverflow.com/questions/39205527/can-you-annotate-return-type-when-value-is-instance-of-cls/39205612#39205612 +F = TypeVar('F', bound='FileReader') + + +class FileReader(LoggingConfigurable): + """ + Base class for parsing a file for resources according to operation type. Subclasses set + their own parser member variable according to their implementation language. + """ + + def __init__(self, filepath: str): + self._filepath = filepath + + @property + def filepath(self): + return self._filepath + + @property + def language(self) -> str: + file_extension = os.path.splitext(self._filepath)[-1].lower() + if file_extension == '.py': + return 'python' + elif file_extension == '.r': + return 'r' + else: + return None + + def read_next_code_line(self) -> List[str]: + """ + Implements a generator for lines of code in the specified filepath. Subclasses + may override if explicit line-by-line parsing is not feasible, e.g. with Notebooks. + """ + with open(self._filepath) as f: + for line in f: + yield line.strip() + + +class NotebookReader(FileReader): + def __init__(self, filepath: str): + super().__init__(filepath) + import nbformat + + with open(self._filepath) as f: + self._notebook = nbformat.read(f, as_version=4) + self._language = None + + try: + self._language = self._notebook['metadata']['language_info']['name'].lower() + + except KeyError: + self.log.warning(f'No language metadata found in {self._filepath}') + pass + + @property + def language(self) -> str: + return self._language + + def read_next_code_line(self) -> List[str]: + for cell in self._notebook.cells: + if cell.source and cell.cell_type == "code": + for line in cell.source.split('\n'): + yield line + + +class ScriptParser(): + """ + Base class for parsing individual lines of code. Subclasses implement a search_expressions() + function that returns language-specific regexes to match against code lines. 
+ """ + + _comment_char = "#" + + def _get_line_without_comments(self, line): + if self._comment_char in line: + index = line.find(self._comment_char) + line = line[:index] + return line.strip() + + def parse_environment_variables(self, line): + # Parse a line fed from file and match each regex in regex dictionary + line = self._get_line_without_comments(line) + if not line: + return [] + + matches = [] + for key, value in self.search_expressions().items(): + for pattern in value: + regex = re.compile(pattern) + for match in regex.finditer(line): + matches.append((key, match)) + return matches + + +class PythonScriptParser(ScriptParser): + def search_expressions(self) -> Dict[str, List]: + # First regex matches envvar assignments that use os.getenv("name", "value") with ow w/o default provided + # Second regex matches envvar assignments that use os.environ.get("name", "value") with or w/o default provided + # Both name and value are captured if possible + inputs = [r"os\.getenv\([\"']([a-zA-Z_]+[A-Za-z0-9_]*)[\"']*(?:\s*\,\s*[\"']?(.[^#]*)?[\"']?)?\).*", + r"os\.environ\.get\([\"']([a-zA-Z_]+[A-Za-z0-9_]*)[\"']*(?:\s*\,\s*[\"']?(.[^#]*)?[\"']?)?\).*"] + # regex matches setting envvars assignments that use + outputs = [r"\s*os\.environ\[[\"']([a-zA-Z_]+[A-Za-z0-9_]*)[\"']].*"] + + regex_dict = dict(inputs=inputs, outputs=outputs) + return regex_dict + + +class RScriptParser(ScriptParser): + def search_expressions(self) -> Dict[str, List]: + + + # Tests for matches of the form: var <- Sys.getenv("key", "optional default") + inputs = [r".*Sys\.getenv\([\"']*([a-zA-Z_]+[A-Za-z0-9_]*)[\"']*(?:\s*\,\s*[\"']?(.[^#]*)?[\"']?)?\).*"] + # Tests for matches of the form: var <- Sys.getenv("key", "optional default") + outputs = [r"\s*Sys\.setenv\([\"']*([a-zA-Z_]+[A-Za-z0-9_]*)[\"']*(?:\s*\,\s*[\"']?(.[^#]*)?[\"']?)?\).*"] + + regex_dict = dict(inputs=inputs, outputs=outputs) + return regex_dict + + +class ContentParser(LoggingConfigurable): + parsers = { + 'python': PythonScriptParser(), + 'r': RScriptParser() + } + + def parse(self, filepath: str) -> dict: + """Returns a model dictionary of all the regex matches for each key in the regex dictionary""" + + properties = {"inputs": {}, "outputs": []} + reader = self._get_reader(filepath) + parser = self._get_parser(reader.language) + + if not parser: + return properties + + for line in reader.read_next_code_line(): + matches = parser.parse_environment_variables(line) + for key, match in matches: + if key == "inputs": + default_value = match.group(2) + if default_value: + # The default value match can end with an additional ', ", or ) which is removed + default_value = re.sub(r"['\")]?$", '', default_value, count=1) + properties[key][match.group(1)] = default_value + else: + properties[key].append(match.group(1)) + + return properties + + def _validate_file(self, filepath: str): + """ + Validate file exists and is file (e.g. 
not a directory) + """ + if not os.path.exists(filepath): + raise FileNotFoundError(f'No such file or directory: {filepath}') + if not os.path.isfile(filepath): + raise IsADirectoryError(f'Is a directory: {filepath}') + + def _get_reader(self, filepath: str): + """ + Find the proper reader based on the file extension + """ + file_extension = os.path.splitext(filepath)[-1] + + self._validate_file(filepath) + + if file_extension == '.ipynb': + return NotebookReader(filepath) + elif file_extension.lower() in ['.py', '.r']: + return FileReader(filepath) + else: + raise ValueError(f'File type {file_extension} is not supported.') + + def _get_parser(self, language: str): + """ + Find the proper parser based on content language + """ + parser = None + if language: + parser = self.parsers.get(language) + + if not parser: + self.log.warning(f'Content parser for {language} is not available.') + pass + + return parser diff --git a/src/c3/pythonscript.py b/src/c3/pythonscript.py new file mode 100644 index 00000000..eeed2226 --- /dev/null +++ b/src/c3/pythonscript.py @@ -0,0 +1,96 @@ + +import logging +import os +import re +from c3.parser import ContentParser + + +class Pythonscript: + def __init__(self, path): + + self.path = path + with open(path, 'r') as f: + self.script = f.read() + + self.name = os.path.basename(path)[:-3].replace('_', '-').lower() + if '"""' not in self.script: + logging.warning('Please provide a description of the operator in the first doc string.') + self.description = self.name + else: + self.description = self.script.split('"""')[1].strip() + self.inputs = self._get_input_vars() + self.outputs = self._get_output_vars() + + def _get_input_vars(self): + cp = ContentParser() + env_names = cp.parse(self.path)['inputs'] + return_value = dict() + for env_name, default in env_names.items(): + comment_line = str() + for line in self.script.split('\n'): + if re.search("[\"']" + env_name + "[\"']", line): + # Check the description for current variable + if not comment_line.strip().startswith('#'): + # previous line was no description, reset comment_line. 
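+                        # Note: only a '#' comment on the line directly above the env variable read is treated as its description.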
+ comment_line = '' + if comment_line == '': + logging.debug(f'Interface: No description for variable {env_name} provided.') + if re.search(r'=\s*int\(\s*os', line): + type = 'Integer' + default = default.strip('\"\'') + elif re.search(r'=\s*float\(\s*os', line): + type = 'Float' + default = default.strip('\"\'') + elif re.search(r'=\s*bool\(\s*os', line): + type = 'Boolean' + default = default.strip('\"\'') + else: + type = 'String' + return_value[env_name] = { + 'description': comment_line.replace('#', '').replace("\"", "\'").strip(), + 'type': type, + 'default': default + } + break + comment_line = line + return return_value + + def _get_output_vars(self): + cp = ContentParser() + output_names = cp.parse(self.path)['outputs'] + # TODO: Does not check for description code + return_value = {name: { + 'description': f'Output path for {name}', + 'type': 'String', + } for name in output_names} + return return_value + + def get_requirements(self): + requirements = [] + # Add dnf install + for line in self.script.split('\n'): + if re.search(r'[\s#]*dnf\s*.[^#]*', line): + if '-y' not in line: + # Adding default repo + line += ' -y' + requirements.append(line.replace('#', '').strip()) + + # Add pip install + pattern = r"^[# !]*(pip[ ]*install)[ ]*(.[^#]*)" + for line in self.script.split('\n'): + result = re.findall(pattern, line) + if len(result) == 1: + requirements.append((result[0][0] + ' ' + result[0][1].strip())) + return requirements + + def get_name(self): + return self.name + + def get_description(self): + return self.description + + def get_inputs(self): + return self.inputs + + def get_outputs(self): + return self.outputs diff --git a/src/c3/rscript.py b/src/c3/rscript.py new file mode 100644 index 00000000..9e6cc93e --- /dev/null +++ b/src/c3/rscript.py @@ -0,0 +1,88 @@ + +import logging +import os +import re +from c3.parser import ContentParser + + +class Rscript: + def __init__(self, path): + + self.path = path + with open(path, 'r') as f: + self.script = f.read() + + self.name = os.path.basename(path)[:-2].replace('_', '-').lower() + # TODO: Currently does not support a description + self.description = self.name + self.inputs = self._get_input_vars() + self.outputs = self._get_output_vars() + + def _get_input_vars(self): + cp = ContentParser() + env_names = cp.parse(self.path)['inputs'] + return_value = dict() + for env_name, default in env_names.items(): + comment_line = str() + for line in self.script.split('\n'): + if re.search("[\"']" + env_name + "[\"']", line): + # Check the description for current variable + if not comment_line.strip().startswith('#'): + # previous line was no description, reset comment_line. 
+ comment_line = '' + if comment_line == '': + logging.debug(f'Interface: No description for variable {env_name} provided.') + if re.search(r'=\s*as.numeric\(\s*os', line): + type = 'Float' # double in R + elif re.search(r'=\s*bool\(\s*os', line): + type = 'Boolean' # logical in R + else: + type = 'String' # character in R + + return_value[env_name] = { + 'description': comment_line.replace('#', '').replace("\"", "\'").strip(), + 'type': type, + 'default': default + } + break + comment_line = line + return return_value + + def _get_output_vars(self): + cp = ContentParser() + output_names = cp.parse(self.path)['outputs'] + # TODO: Does not check for description + return_value = {name: {'description': 'output path'} for name in output_names} + return return_value + + def get_requirements(self): + requirements = [] + # Add apt install commands + for line in self.script.split('\n'): + if re.search(r'[\s#]*apt\s*[A-Za-z0-9_-]*', line): + if '-y' not in line: + # Adding default repo + line += ' -y' + requirements.append(line.replace('#', '').strip()) + + # Add Rscript install.packages commands + for line in self.script.split('\n'): + if re.search(r'[\s#]*install\.packages\(.*\)', line): + if 'http://' not in line: + # Adding default repo + line = line.rstrip(') ') + ", repos='http://cran.us.r-project.org')" + command = f"Rscript -e \"{line.replace('#', '').strip()}\"" + requirements.append(command) + return requirements + + def get_name(self): + return self.name + + def get_description(self): + return self.description + + def get_inputs(self): + return self.inputs + + def get_outputs(self): + return self.outputs diff --git a/src/c3/templates/R_dockerfile_template b/src/c3/templates/R_dockerfile_template new file mode 100644 index 00000000..e60449e5 --- /dev/null +++ b/src/c3/templates/R_dockerfile_template @@ -0,0 +1,11 @@ +FROM ${base_image} +USER root +RUN apt update +${requirements_docker} +ADD ${target_code} ${working_dir}${target_dir} +${additional_files_docker} +RUN chmod -R 777 ${working_dir} +RUN chmod -R 777 /usr/local/lib/R/ +USER docker +WORKDIR "${working_dir}" +CMD ["${command}", "${target_dir}${target_code}"] \ No newline at end of file diff --git a/src/c3/templates/__init__.py b/src/c3/templates/__init__.py new file mode 100644 index 00000000..94a3b13f --- /dev/null +++ b/src/c3/templates/__init__.py @@ -0,0 +1,66 @@ + +import os +from string import Template +from pathlib import Path + +# template file names +PYTHON_COMPONENT_SETUP_CODE = 'component_setup_code.py' +R_COMPONENT_SETUP_CODE = 'component_setup_code.R' +PYTHON_COMPONENT_SETUP_CODE_WO_LOGGING = 'component_setup_code_wo_logging.py' +PYTHON_DOCKERFILE_FILE = 'python_dockerfile_template' +R_DOCKERFILE_FILE = 'R_dockerfile_template' +KFP_COMPONENT_FILE = 'kfp_component_template.yaml' +KUBERNETES_JOB_FILE = 'kubernetes_job_template.job.yaml' +CWL_COMPONENT_FILE = 'cwl_component_template.cwl' +GRID_WRAPPER_FILE = 'grid_wrapper_template.py' +COS_GRID_WRAPPER_FILE = 'cos_grid_wrapper_template.py' +LEGACY_COS_GRID_WRAPPER_FILE = 'legacy_cos_grid_wrapper_template.py' +S3KV_GRID_WRAPPER_FILE = 's3kv_grid_wrapper_template.py' +SIMPLE_GRID_WRAPPER_FILE = 'simple_grid_wrapper_template.py' +FOLDER_GRID_WRAPPER_FILE = 'folder_grid_wrapper_template.py' + +# load templates +template_path = Path(os.path.dirname(__file__)) + +with open(template_path / PYTHON_COMPONENT_SETUP_CODE, 'r') as f: + python_component_setup_code = f.read() + +with open(template_path / R_COMPONENT_SETUP_CODE, 'r') as f: + r_component_setup_code = f.read() + +with 
open(template_path / PYTHON_COMPONENT_SETUP_CODE_WO_LOGGING, 'r') as f: + component_setup_code_wo_logging = f.read() + +with open(template_path / PYTHON_DOCKERFILE_FILE, 'r') as f: + python_dockerfile_template = Template(f.read()) + +with open(template_path / R_DOCKERFILE_FILE, 'r') as f: + r_dockerfile_template = Template(f.read()) + +with open(template_path / KFP_COMPONENT_FILE, 'r') as f: + kfp_component_template = Template(f.read()) + +with open(template_path / KUBERNETES_JOB_FILE, 'r') as f: + kubernetes_job_template = Template(f.read()) + +with open(template_path / CWL_COMPONENT_FILE, 'r') as f: + cwl_component_template = Template(f.read()) + +with open(template_path / GRID_WRAPPER_FILE, 'r') as f: + grid_wrapper_template = Template(f.read()) + +with open(template_path / COS_GRID_WRAPPER_FILE, 'r') as f: + cos_grid_wrapper_template = Template(f.read()) + +with open(template_path / LEGACY_COS_GRID_WRAPPER_FILE, 'r') as f: + legacy_cos_grid_wrapper_template = Template(f.read()) + +with open(template_path / S3KV_GRID_WRAPPER_FILE, 'r') as f: + s3kv_grid_wrapper_template = Template(f.read()) + +with open(template_path / SIMPLE_GRID_WRAPPER_FILE, 'r') as f: + simple_grid_wrapper_template = Template(f.read()) + +with open(template_path / FOLDER_GRID_WRAPPER_FILE, 'r') as f: + folder_grid_wrapper_template = Template(f.read()) + \ No newline at end of file diff --git a/src/c3/templates/component_setup_code.R b/src/c3/templates/component_setup_code.R new file mode 100644 index 00000000..daa3f847 --- /dev/null +++ b/src/c3/templates/component_setup_code.R @@ -0,0 +1,14 @@ + +args = commandArgs(trailingOnly=TRUE) + +for (parameter in args) { + key_value <- unlist(strsplit(parameter, split="=")) + if (length(key_value) == 2) { + print(parameter) + key <- key_value[1] + value <- key_value[2] + eval(parse(text=paste0('Sys.setenv(',key,'="',value,'")'))) + } else { + print(paste('Could not find key value pair for argument ', parameter)) + } +} diff --git a/src/c3/templates/component_setup_code.py b/src/c3/templates/component_setup_code.py new file mode 100644 index 00000000..348cae7b --- /dev/null +++ b/src/c3/templates/component_setup_code.py @@ -0,0 +1,35 @@ +# default code for each operator +import os +import sys +import re +import logging + +# init logger +root = logging.getLogger() +root.setLevel('INFO') +handler = logging.StreamHandler(sys.stdout) +handler.setLevel('INFO') +formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') +handler.setFormatter(formatter) +root.addHandler(handler) +logging.basicConfig(level=logging.CRITICAL) + +# get parameters from args +parameters = list(filter( + lambda s: s.find('=') > -1 and bool(re.match(r'[A-Za-z0-9_]*=[.\/A-Za-z0-9]*', s)), + sys.argv + )) + +# set parameters to env variables +for parameter in parameters: + variable = parameter.split('=')[0] + value = parameter.split('=', 1)[-1] + logging.info(f'Parameter: {variable} = "{value}"') + os.environ[variable] = value + +# update log level +log_level = os.environ.get('log_level', 'INFO') +if log_level !='INFO': + logging.info(f'Updating log level to {log_level}') + root.setLevel(log_level) + handler.setLevel(log_level) diff --git a/src/c3/templates/component_setup_code_wo_logging.py b/src/c3/templates/component_setup_code_wo_logging.py new file mode 100644 index 00000000..e8b67a7b --- /dev/null +++ b/src/c3/templates/component_setup_code_wo_logging.py @@ -0,0 +1,17 @@ +import os +import re +import sys +import logging + +# get parameters from args +parameters = list(filter( 
+ lambda s: s.find('=') > -1 and bool(re.match(r'[A-Za-z0-9_]*=[.\/A-Za-z0-9]*', s)), + sys.argv + )) + +# set parameters to env variables +for parameter in parameters: + variable = parameter.split('=')[0] + value = parameter.split('=', 1)[-1] + logging.debug(f'Parameter: {variable} = "{value}"') + os.environ[variable] = value diff --git a/src/c3/templates/cos_grid_wrapper_template.py b/src/c3/templates/cos_grid_wrapper_template.py new file mode 100644 index 00000000..30fa86d9 --- /dev/null +++ b/src/c3/templates/cos_grid_wrapper_template.py @@ -0,0 +1,217 @@ +""" +${component_name} got wrapped by cos_grid_wrapper, which wraps any CLAIMED component and implements the generic grid computing pattern for cos files https://romeokienzler.medium.com/the-generic-grid-computing-pattern-transforms-any-sequential-workflow-step-into-a-transient-grid-c7f3ca7459c8 + +CLAIMED component description: ${component_description} +""" + +# pip install s3fs pandas +# component dependencies +# ${component_dependencies} + +import os +import json +import random +import logging +import shutil +import time +import glob +import s3fs +from datetime import datetime +from pathlib import Path +import pandas as pd + + +# import component code +from ${component_name} import * + + +def explode_connection_string(cs): + if cs is None: + return None, None, None, None + elif cs.startswith('cos') or cs.startswith('s3'): + buffer=cs.split('://', 1)[1] + access_key_id=buffer.split('@')[0].split(':')[0] + secret_access_key=buffer.split('@')[0].split(':')[1] + endpoint = f"https://{buffer.split('@')[1].split('/')[0]}" + path=buffer.split('@')[1].split('/', 1)[1] + return (access_key_id, secret_access_key, endpoint, path) + else: + return (None, None, None, cs) + # TODO consider cs as secret and grab connection string from kubernetes + + +# File containing batches. Provided as a comma-separated list of strings or keys in a json dict. All batch file names must contain the batch name. 
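+# Illustrative example (placeholder values): gw_batch_file='cos://access_key:secret_key@s3.example.com/bucket/batches.csv', or a plain file path.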
+gw_batch_file = os.environ.get('gw_batch_file', None) +(gw_batch_file_access_key_id, gw_batch_file_secret_access_key, gw_batch_file_endpoint, gw_batch_file) = explode_connection_string(gw_batch_file) +# Optional column name for a csv batch file (default: 'filename') +gw_batch_file_col_name = os.environ.get('gw_batch_file_col_name', 'filename') +# cos gw_coordinator_connection +gw_coordinator_connection = os.environ.get('gw_coordinator_connection') +(gw_coordinator_access_key_id, gw_coordinator_secret_access_key, gw_coordinator_endpoint, gw_coordinator_path) = explode_connection_string(gw_coordinator_connection) +# timeout in seconds to remove lock file from struggling job (default 3 hours) +gw_lock_timeout = int(os.environ.get('gw_lock_timeout', 10800)) +# ignore error files and rerun batches with errors +gw_ignore_error_files = bool(os.environ.get('gw_ignore_error_files', False)) +# maximal wait time for staggering start +gw_max_time_wait_staggering = int(os.environ.get('gw_max_time_wait_staggering', 60)) + +# coordinator file suffix +suffix_lock = '.lock' +suffix_processed = '.processed' +suffix_error = '.err' + +# component interface +${component_interface} + +# Init s3 +s3coordinator = s3fs.S3FileSystem( + anon=False, + key=gw_coordinator_access_key_id, + secret=gw_coordinator_secret_access_key, + client_kwargs={'endpoint_url': gw_coordinator_endpoint}) +gw_coordinator_path = Path(gw_coordinator_path) + +if gw_batch_file_access_key_id is not None: + s3batch_file = s3fs.S3FileSystem( + anon=False, + key=gw_batch_file_access_key_id, + secret=gw_batch_file_secret_access_key, + client_kwargs={'endpoint_url': gw_batch_file_endpoint}) +else: + logging.debug('Loading batch file from source s3.') + s3batch_file = s3coordinator + + +def load_batches_from_file(batch_file): + if batch_file.endswith('.json'): + # Load batches from keys of a json file + logging.info(f'Loading batches from json file: {batch_file}') + with open(batch_file, 'r') as f: + batch_dict = json.load(f) + batches = batch_dict.keys() + + elif batch_file.endswith('.csv'): + # Load batches from keys of a csv file + logging.info(f'Loading batches from csv file: {batch_file}') + df = pd.read_csv(batch_file, header='infer') + assert gw_batch_file_col_name in df.columns, \ + f'gw_batch_file_col_name {gw_batch_file_col_name} not in columns of batch file {batch_file}' + batches = df[gw_batch_file_col_name].to_list() + + elif batch_file.endswith('.txt'): + # Load batches from comma-separated txt file + logging.info(f'Loading comma-separated batch strings from file: {batch_file}') + with open(batch_file, 'r') as f: + batch_string = f.read() + batches = [b.strip() for b in batch_string.split(',')] + else: + raise ValueError(f'C3 only supports batch files of type ' + f'json (batches = dict keys), ' + f'csv (batches = column values), or ' + f'txt (batches = comma-seperated list).') + + logging.info(f'Loaded {len(batches)} batches') + logging.debug(f'List of batches: {batches}') + assert len(batches) > 0, f"batch_file {batch_file} has no batches." 
+ return batches + + +def perform_process(process, batch): + logging.debug(f'Check coordinator files for batch {batch}.') + # Init coordinator files + lock_file = str(gw_coordinator_path / (batch + suffix_lock)) + processed_file = str(gw_coordinator_path / (batch + suffix_processed)) + error_file = str(gw_coordinator_path / (batch + suffix_error)) + + if s3coordinator.exists(lock_file): + # Remove strugglers + last_modified = s3coordinator.info(lock_file)['LastModified'] + if (datetime.now(last_modified.tzinfo) - last_modified).total_seconds() > gw_lock_timeout: + logging.info(f'Lock file {lock_file} is expired.') + s3coordinator.rm(lock_file) + else: + logging.debug(f'Batch {batch} is locked.') + return + + if s3coordinator.exists(processed_file): + logging.debug(f'Batch {batch} is processed.') + return + + if s3coordinator.exists(error_file): + if gw_ignore_error_files: + logging.info(f'Ignoring previous error in batch {batch} and rerun.') + else: + logging.debug(f'Batch {batch} has error.') + return + + logging.debug(f'Locking batch {batch}.') + s3coordinator.touch(lock_file) + + # processing files with custom process + logging.info(f'Processing batch {batch}.') + try: + target_files = process(batch, ${component_inputs}) + except Exception as err: + logging.exception(err) + # Write error to file + with s3coordinator.open(error_file, 'w') as f: + f.write(f"{type(err).__name__} in batch {batch}: {err}") + s3coordinator.rm(lock_file) + logging.error(f'Continue processing.') + return + + logging.info(f'Finished Batch {batch}.') + s3coordinator.touch(processed_file) + # Remove lock file + if s3coordinator.exists(lock_file): + s3coordinator.rm(lock_file) + else: + logging.warning(f'Lock file {lock_file} was removed by another process. ' + f'Consider increasing gw_lock_timeout to avoid repeated processing (currently {gw_lock_timeout}s).') + + +def process_wrapper(sub_process): + delay = random.randint(0, gw_max_time_wait_staggering) + logging.info(f'Staggering start, waiting for {delay} seconds') + time.sleep(delay) + + # Init coordinator dir + s3coordinator.makedirs(gw_coordinator_path, exist_ok=True) + + # Download batch file + if s3batch_file.exists(gw_batch_file): + s3batch_file.get(gw_batch_file, gw_batch_file) + if not os.path.isfile(gw_batch_file): + # Download batch file from s3 coordinator + cos_gw_batch_file = str(gw_coordinator_path.split([0]) / gw_batch_file) + if s3batch_file.exists(cos_gw_batch_file): + s3batch_file.get(gw_batch_file, gw_batch_file) + else: + raise ValueError("Cannot identify batches. Provide valid gw_batch_file " + "(local path, path within coordinator bucket, or s3 connection to batch file).") + + # Get batches + batches = load_batches_from_file(gw_batch_file) + + # Iterate over all batches + for batch in batches: + perform_process(sub_process, batch) + + # Check and log status of batches + processed_status = sum(s3coordinator.exists(gw_coordinator_path / (batch + suffix_processed)) for batch in batches) + lock_status = sum(s3coordinator.exists(gw_coordinator_path / (batch + suffix_lock)) for batch in batches) + error_status = sum(s3coordinator.exists(gw_coordinator_path / (batch + suffix_error)) for batch in batches) + + logging.info(f'Finished current process. Status batches: ' + f'{processed_status} processed / {lock_status} locked / {error_status} errors / {len(batches)} total') + + if error_status: + logging.error(f'Found errors! 
Resolve errors and rerun operator with gw_ignore_error_files=True.') + # Print all error messages + for error_file in s3coordinator.glob(str(gw_coordinator_path / ('**/*' + suffix_error))): + with s3coordinator.open(error_file, 'r') as f: + logging.error(f.read()) + + +if __name__ == '__main__': + process_wrapper(${component_process}) diff --git a/src/c3/templates/cwl_component_template.cwl b/src/c3/templates/cwl_component_template.cwl new file mode 100644 index 00000000..f5106075 --- /dev/null +++ b/src/c3/templates/cwl_component_template.cwl @@ -0,0 +1,15 @@ +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: "claimed" + +inputs: + component: + type: string + default: ${repository}/claimed-${name}:${version} + inputBinding: + position: 1 + prefix: --component +${inputs} + +outputs: ${outputs} diff --git a/src/c3/templates/folder_grid_wrapper_template.py b/src/c3/templates/folder_grid_wrapper_template.py new file mode 100644 index 00000000..900ace74 --- /dev/null +++ b/src/c3/templates/folder_grid_wrapper_template.py @@ -0,0 +1,137 @@ +""" +${component_name} got wrapped by folder_grid_wrapper, which wraps any CLAIMED component and implements folder-level locking. +This folder grid wrapper scans immediate subdirectories of sgw_source_folder and for each folder the ${component_process} function is called once. +Locking is achieved by creating files in the target directory using the pattern .{STATUS} where STATUS in: +LOCKED +PROCESSED +FAILED + + +CLAIMED component description: ${component_description} +""" + +# pip install pandas + +# component dependencies +# ${component_dependencies} + +import os +import json +import random +import logging +from pathlib import Path +import pandas as pd + +# import component code +from ${component_name} import * + +# folder containing input data in single files or subfolders +sgw_source_folder = os.environ.get('sgw_source_folder') + +# folder to store the output markers and results +# Default: sgw_source_folder. If equal, entries containing LOCKED or PROCESSED or FAILED are ignored. 
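+# Illustration: a subfolder 'batch1' gets markers batch1.LOCKED / batch1.PROCESSED / batch1.FAILED, a file 'a.tif' gets a.LOCKED.tif / a.PROCESSED.tif / a.FAILED.tif (see _marker_paths below).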
+sgw_target_folder = os.environ.get('sgw_target_folder', sgw_source_folder) + +# component interface +${component_interface} + +def _marker_paths(entry_name: str, is_dir: bool): + """Return (LOCKED, PROCESSED, FAILED) marker paths for a file or a folder.""" + tgt = Path(sgw_target_folder) + if is_dir: + # folder markers are directories + return ( + tgt / f"{entry_name}.LOCKED", + tgt / f"{entry_name}.PROCESSED", + tgt / f"{entry_name}.FAILED", + ) + # file markers are files + base, ext = os.path.splitext(entry_name) + return ( + tgt / f"{base}.LOCKED{ext}", + tgt / f"{base}.PROCESSED{ext}", + tgt / f"{base}.FAILED{ext}", + ) + +def _claimed_any(locked, processed, failed) -> bool: + return locked.exists() or processed.exists() or failed.exists() + +def get_next_batch(): + """Pick a random unclaimed entry from source, supporting files and folders.""" + filtered = [] + with os.scandir(sgw_source_folder) as it: + for e in it: + name = e.name + + # If source and target are the same, skip marker entries + if sgw_source_folder == sgw_target_folder and ( + "LOCKED" in name or "PROCESSED" in name or "FAILED" in name + ): + continue + + locked, processed, failed = _marker_paths(name, e.is_dir()) + if not _claimed_any(locked, processed, failed): + filtered.append((name, e.is_dir())) + + if filtered: + return random.choice(filtered) # (name, is_dir) + return None + +def _try_acquire_lock(name: str, is_dir: bool): + """Create the LOCKED marker atomically and return its Path, or None if already claimed.""" + locked, _, _ = _marker_paths(name, is_dir) + try: + if is_dir: + # atomic directory creation is a good folder lock + locked.mkdir() + else: + # atomic file creation + fd = os.open(str(locked), os.O_CREAT | os.O_EXCL | os.O_WRONLY) + os.close(fd) + return locked + except FileExistsError: + return None + +def process_wrapper(sub_process): + sgw_target_folder_path = Path(sgw_target_folder) + sgw_target_folder_path.mkdir(exist_ok=True, parents=True) + + while True: + nxt = get_next_batch() + if nxt is None: + break + + entry_name, is_dir = nxt + src_path = str(Path(sgw_source_folder) / entry_name) + locked, processed, failed = _marker_paths(entry_name, is_dir) + logging.info(f"Processing: {src_path}") + + # Acquire the lock. If we lose the race, pick another entry. + lock_path = _try_acquire_lock(entry_name, is_dir) + if lock_path is None: + continue + + try: + # Call user component. For folders, src_path points to the folder. + # The second argument remains the marker path, same as before. 
+ sub_process(src_path, str(lock_path)) + + # Success marker + lock_path.rename(processed) + + except Exception as e: + # Failure marker + lock_path.rename(failed) + if is_dir: + # Put the error message inside the FAILED directory + errfile = Path(failed) / "error.txt" + errfile.write_text(f"Exception occurred: {str(e)}\n", encoding="utf-8") + else: + # For files, FAILED is itself a file; overwrite with the error text + Path(failed).write_text(f"Exception occurred: {str(e)}\n", encoding="utf-8") + logging.error(f"Processing failed for {src_path}: {str(e)}") + + logging.info("Finished processing all batches.") + +if __name__ == '__main__': + process_wrapper(${component_process}) \ No newline at end of file diff --git a/src/c3/templates/grid_wrapper_template.py b/src/c3/templates/grid_wrapper_template.py new file mode 100644 index 00000000..9a418be7 --- /dev/null +++ b/src/c3/templates/grid_wrapper_template.py @@ -0,0 +1,205 @@ +""" +${component_name} got wrapped by grid_wrapper, which wraps any CLAIMED component and implements the generic grid computing pattern https://romeokienzler.medium.com/the-generic-grid-computing-pattern-transforms-any-sequential-workflow-step-into-a-transient-grid-c7f3ca7459c8 + +CLAIMED component description: ${component_description} +""" + +# pip install pandas + +# component dependencies +# ${component_dependencies} + +import os +import json +import random +import logging +import time +import glob +from pathlib import Path +import pandas as pd + +# import component code +from ${component_name} import * + + +# File with batches. Provided as a comma-separated list of strings, keys in a json dict or single column CSV with 'filename' has header. +gw_batch_file = os.environ.get('gw_batch_file', None) +# Optional column name for a csv batch file (default: 'filename') +gw_batch_file_col_name = os.environ.get('gw_batch_file_col_name', 'filename') +# file path pattern like your/path/**/*.tif. Multiple patterns can be separated with commas. Is ignored if gw_batch_file is provided. +gw_file_path_pattern = os.environ.get('gw_file_path_pattern', None) +# pattern for grouping file paths into batches like ".split('.')[-1]". Is ignored if gw_batch_file is provided. 
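+# The expression is appended to str(path) and evaluated, e.g. 'data/part1/file.tif' with gw_group_by=".split('/')[-2]" yields batch 'part1' (illustrative example).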
+gw_group_by = os.environ.get('gw_group_by', None) +# path to grid wrapper coordinator directory +gw_coordinator_path = os.environ.get('gw_coordinator_path') +gw_coordinator_path = Path(gw_coordinator_path) + +# timeout in seconds to remove lock file from struggling job (default 3 hours) +gw_lock_timeout = int(os.environ.get('gw_lock_timeout', 10800)) +# ignore error files and rerun batches with errors +gw_ignore_error_files = bool(os.environ.get('gw_ignore_error_files', False)) +# maximal wait time for staggering start +gw_max_time_wait_staggering = int(os.environ.get('gw_max_time_wait_staggering', 60)) + +# coordinator file suffix +suffix_lock = '.lock' +suffix_processed = '.processed' +suffix_error = '.err' + +# component interface +${component_interface} + +def load_batches_from_file(batch_file): + if batch_file.endswith('.json'): + # Load batches from keys of a json file + logging.info(f'Loading batches from json file: {batch_file}') + with open(batch_file, 'r') as f: + batch_dict = json.load(f) + batches = batch_dict.keys() + + elif batch_file.endswith('.csv'): + # Load batches from keys of a csv file + logging.info(f'Loading batches from csv file: {batch_file}') + df = pd.read_csv(batch_file, header='infer') + assert gw_batch_file_col_name in df.columns, \ + f'gw_batch_file_col_name {gw_batch_file_col_name} not in columns of batch file {batch_file}' + batches = df[gw_batch_file_col_name].to_list() + + elif batch_file.endswith('.txt'): + # Load batches from comma-separated txt file + logging.info(f'Loading comma-separated batch strings from file: {batch_file}') + with open(batch_file, 'r') as f: + batch_string = f.read() + batches = [b.strip() for b in batch_string.split(',')] + else: + raise ValueError(f'C3 only supports batch files of type ' + f'json (batches = dict keys), ' + f'csv (batches = column values), or ' + f'txt (batches = comma-seperated list).') + + logging.info(f'Loaded {len(batches)} batches') + logging.debug(f'List of batches: {batches}') + assert len(batches) > 0, f"batch_file {batch_file} has no batches." + return batches + + +def identify_batches_from_pattern(file_path_patterns, group_by): + logging.info(f'Start identifying files and batches') + batches = set() + all_files = [] + + # Iterate over comma-separated paths + for file_path_pattern in file_path_patterns.split(','): + logging.info(f'Get file paths from pattern: {file_path_pattern}') + files = glob.glob(file_path_pattern.strip()) + assert len(files) > 0, f"Found no files with file_path_pattern {file_path_pattern}." 
+ all_files.extend(files) + + # get batches by applying the group by function to all file paths + for path_string in all_files: + part = eval('str(path_string)' + group_by, {"group_by": group_by, "path_string": path_string}) + assert part != '', f'Could not extract batch with path_string {path_string} and group_by {group_by}' + batches.add(part) + + logging.info(f'Identified {len(batches)} batches') + logging.debug(f'List of batches: {batches}') + + return batches + + +def perform_process(process, batch): + logging.debug(f'Check coordinator files for batch {batch}.') + # init coordinator files + lock_file = gw_coordinator_path / (batch + suffix_lock) + error_file = gw_coordinator_path / (batch + suffix_error) + processed_file = gw_coordinator_path / (batch + suffix_processed) + + if lock_file.exists(): + # remove strugglers + if lock_file.stat().st_mtime < time.time() - gw_lock_timeout: + logging.debug(f'Lock file {lock_file} is expired.') + lock_file.unlink() + else: + logging.debug(f'Batch {batch} is locked.') + return + + if processed_file.exists(): + logging.debug(f'Batch {batch} is processed.') + return + + if error_file.exists(): + if gw_ignore_error_files: + logging.info(f'Ignoring previous error in batch {batch} and rerun.') + else: + logging.debug(f'Batch {batch} has error.') + return + + logging.debug(f'Locking batch {batch}.') + lock_file.parent.mkdir(parents=True, exist_ok=True) + lock_file.touch() + + # processing files with custom process + logging.info(f'Processing batch {batch}.') + try: + target_files = process(batch, ${component_inputs}) + except Exception as err: + logging.exception(err) + # Write error to file + with open(error_file, 'w') as f: + f.write(f"{type(err).__name__} in batch {batch}: {err}") + lock_file.unlink() + logging.error(f'Continue processing.') + return + + logging.info(f'Finished Batch {batch}.') + processed_file.touch() + + # Remove lock file + if lock_file.exists(): + lock_file.unlink() + else: + logging.warning(f'Lock file {lock_file} was removed by another process. ' + f'Consider increasing gw_lock_timeout to avoid repeated processing (currently {gw_lock_timeout}s).') + + + +def process_wrapper(sub_process): + delay = random.randint(0, gw_max_time_wait_staggering) + logging.info(f'Staggering start, waiting for {delay} seconds') + time.sleep(delay) + + # Init coordinator dir + gw_coordinator_path.mkdir(exist_ok=True, parents=True) + + # get batches + if gw_batch_file is not None and os.path.isfile(gw_batch_file): + batches = load_batches_from_file(gw_batch_file) + elif gw_file_path_pattern is not None and gw_group_by is not None: + logging.warning("gw_file_path_pattern and gw_group_by are legacy and might be removed in a future release.") + batches = identify_batches_from_pattern(gw_file_path_pattern, gw_group_by) + else: + raise ValueError("Cannot identify batches. " + "Provide valid gw_batch_file or gw_file_path_pattern and gw_group_by.") + + # Iterate over all batches + for batch in batches: + perform_process(sub_process, batch) + + # Check and log status of batches + processed_status = sum((gw_coordinator_path / (batch + suffix_processed)).exists() for batch in batches) + lock_status = sum((gw_coordinator_path / (batch + suffix_lock)).exists() for batch in batches) + error_status = sum((gw_coordinator_path / (batch + suffix_error)).exists() for batch in batches) + + logging.info(f'Finished current process. 
Status batches: ' + f'{processed_status} processed / {lock_status} locked / {error_status} errors / {len(batches)} total') + + if error_status: + logging.error(f'Found errors! Resolve errors and rerun operator with gw_ignore_error_files=True.') + # print all error messages + for error_file in gw_coordinator_path.glob('**/*' + suffix_error): + with open(error_file, 'r') as f: + logging.error(f.read()) + + +if __name__ == '__main__': + process_wrapper(${component_process}) diff --git a/src/c3/templates/kfp_component_template.yaml b/src/c3/templates/kfp_component_template.yaml new file mode 100644 index 00000000..d5031586 --- /dev/null +++ b/src/c3/templates/kfp_component_template.yaml @@ -0,0 +1,18 @@ +name: ${name} +description: ${description} + +inputs: +${inputs} + +outputs: +${outputs} + +implementation: + container: + image: ${repository}/claimed-${name}:${version} + command: + - sh + - -ec + - | + ${command} ./${target_dir}${target_code} ${parameter_list} +${parameter_values} \ No newline at end of file diff --git a/src/c3/templates/kubernetes_job_template.job.yaml b/src/c3/templates/kubernetes_job_template.job.yaml new file mode 100644 index 00000000..413c417d --- /dev/null +++ b/src/c3/templates/kubernetes_job_template.job.yaml @@ -0,0 +1,17 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: ${name} +spec: + template: + spec: + containers: + - name: ${name} + image: ${repository}/claimed-${name}:${version} + workingDir: ${working_dir} + command: ["${command}","${target_dir}${target_code}"] + env: +${env_entries} + restartPolicy: OnFailure + imagePullSecrets: + - name: image_pull_secret \ No newline at end of file diff --git a/src/c3/templates/legacy_cos_grid_wrapper_template.py b/src/c3/templates/legacy_cos_grid_wrapper_template.py new file mode 100644 index 00000000..f68a2094 --- /dev/null +++ b/src/c3/templates/legacy_cos_grid_wrapper_template.py @@ -0,0 +1,352 @@ +""" +${component_name} got wrapped by cos_grid_wrapper, which wraps any CLAIMED component and implements the generic grid computing pattern for cos files https://romeokienzler.medium.com/the-generic-grid-computing-pattern-transforms-any-sequential-workflow-step-into-a-transient-grid-c7f3ca7459c8 + +CLAIMED component description: ${component_description} +""" + +# pip install s3fs pandas +# component dependencies +# ${component_dependencies} + +import os +import json +import random +import logging +import shutil +import time +import glob +import s3fs +from datetime import datetime +from pathlib import Path +import pandas as pd + + +# import component code +from ${component_name} import * + + +def explode_connection_string(cs): + if cs is None: + return None, None, None, None + elif cs.startswith('cos') or cs.startswith('s3'): + buffer=cs.split('://', 1)[1] + access_key_id=buffer.split('@')[0].split(':')[0] + secret_access_key=buffer.split('@')[0].split(':')[1] + endpoint = f"https://{buffer.split('@')[1].split('/')[0]}" + path=buffer.split('@')[1].split('/', 1)[1] + return (access_key_id, secret_access_key, endpoint, path) + else: + return (None, None, None, cs) + # TODO consider cs as secret and grab connection string from kubernetes + + +# File containing batches. Provided as a comma-separated list of strings or keys in a json dict. All batch file names must contain the batch name. 
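+# If no credentials are embedded in gw_batch_file, it is read via the source connection (gw_source_connection) relative to the source bucket.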
+gw_batch_file = os.environ.get('gw_batch_file', None) +(gw_batch_file_access_key_id, gw_batch_file_secret_access_key, gw_batch_file_endpoint, gw_batch_file) = explode_connection_string(gw_batch_file) +# Optional column name for a csv batch file (default: 'filename') +gw_batch_file_col_name = os.environ.get('gw_batch_file_col_name', 'filename') +# file path pattern like your/path/**/*.tif. Multiple patterns can be separated with commas. It is ignored if gw_batch_file is provided. +gw_file_path_pattern = os.environ.get('gw_file_path_pattern', None) +# pattern for grouping file paths into batches like ".split('.')[-2]". It is ignored if gw_batch_file is provided. +gw_group_by = os.environ.get('gw_group_by', None) + +# comma-separated list of additional cos files to copy +gw_additional_source_files = os.environ.get('gw_additional_source_files', '') +# download source cos files to local input path +gw_local_input_path = os.environ.get('gw_local_input_path', 'input') +# upload local target files to target cos path +gw_local_target_path = os.environ.get('gw_local_target_path', 'target') + +# cos gw_source_connection +gw_source_connection = os.environ.get('gw_source_connection') +(gw_source_access_key_id, gw_source_secret_access_key, gw_source_endpoint, gw_source_path) = explode_connection_string(gw_source_connection) + +# cos gw_target_connection +gw_target_connection = os.environ.get('gw_target_connection') +(gw_target_access_key_id, gw_target_secret_access_key, gw_target_endpoint, gw_target_path) = explode_connection_string(gw_target_connection) + +# cos gw_coordinator_connection +gw_coordinator_connection = os.environ.get('gw_coordinator_connection') +(gw_coordinator_access_key_id, gw_coordinator_secret_access_key, gw_coordinator_endpoint, gw_coordinator_path) = explode_connection_string(gw_coordinator_connection) + +# lock file suffix +gw_lock_file_suffix = os.environ.get('gw_lock_file_suffix', '.lock') +# processed file suffix +gw_processed_file_suffix = os.environ.get('gw_lock_file_suffix', '.processed') +# error file suffix +gw_error_file_suffix = os.environ.get('gw_error_file_suffix', '.err') +# timeout in seconds to remove lock file from struggling job (default 3 hours) +gw_lock_timeout = int(os.environ.get('gw_lock_timeout', 10800)) +# ignore error files and rerun batches with errors +gw_ignore_error_files = bool(os.environ.get('gw_ignore_error_files', False)) +# maximal wait time for staggering start +gw_max_time_wait_staggering = int(os.environ.get('gw_max_time_wait_staggering', 60)) + + +# component interface +${component_interface} + +# init s3 +s3source = s3fs.S3FileSystem( + anon=False, + key=gw_source_access_key_id, + secret=gw_source_secret_access_key, + client_kwargs={'endpoint_url': gw_source_endpoint}) + +gw_source_path = Path(gw_source_path) + +if gw_target_connection is not None: + s3target = s3fs.S3FileSystem( + anon=False, + key=gw_target_access_key_id, + secret=gw_target_secret_access_key, + client_kwargs={'endpoint_url': gw_target_endpoint}) + gw_target_path = Path(gw_target_path) +else: + logging.debug('Using source path as target path.') + gw_target_path = gw_source_path + s3target = s3source + +if gw_coordinator_connection is not None: + s3coordinator = s3fs.S3FileSystem( + anon=False, + key=gw_coordinator_access_key_id, + secret=gw_coordinator_secret_access_key, + client_kwargs={'endpoint_url': gw_coordinator_endpoint}) + gw_coordinator_path = Path(gw_coordinator_path) +else: + logging.debug('Using source bucket as coordinator bucket.') + gw_coordinator_path = 
gw_source_path + s3coordinator = s3source + +if gw_batch_file_access_key_id is not None: + s3batch_file = s3fs.S3FileSystem( + anon=False, + key=gw_batch_file_access_key_id, + secret=gw_batch_file_secret_access_key, + client_kwargs={'endpoint_url': gw_batch_file_endpoint}) +else: + logging.debug('Loading batch file from source s3.') + s3batch_file = s3source + gw_batch_file = str(gw_source_path / gw_batch_file) + + +def load_batches_from_file(batch_file): + if batch_file.endswith('.json'): + # load batches from keys of a json file + logging.info(f'Loading batches from json file: {batch_file}') + with open(batch_file, 'r') as f: + batch_dict = json.load(f) + batches = batch_dict.keys() + + elif batch_file.endswith('.csv'): + # load batches from keys of a csv file + logging.info(f'Loading batches from csv file: {batch_file}') + df = pd.read_csv(batch_file, header='infer') + assert gw_batch_file_col_name in df.columns, \ + f'gw_batch_file_col_name {gw_batch_file_col_name} not in columns of batch file {batch_file}' + batches = df[gw_batch_file_col_name].to_list() + + elif batch_file.endswith('.txt'): + # Load batches from comma-separated txt file + logging.info(f'Loading comma-separated batch strings from file: {batch_file}') + with open(batch_file, 'r') as f: + batch_string = f.read() + batches = [b.strip() for b in batch_string.split(',')] + else: + raise ValueError(f'C3 only supports batch files of type ' + f'json (batches = dict keys), ' + f'csv (batches = column values), or ' + f'txt (batches = comma-seperated list).') + + logging.info(f'Loaded {len(batches)} batches') + logging.debug(f'List of batches: {batches}') + assert len(batches) > 0, f"batch_file {batch_file} has no batches." + return batches + + +def get_files_from_pattern(file_path_patterns): + logging.info(f'Start identifying files') + all_files = [] + + # Iterate over comma-separated paths + for file_path_pattern in file_path_patterns.split(','): + logging.info(f'Get file paths from pattern: {file_path_pattern}') + files = s3source.glob(str(gw_source_path / file_path_pattern.strip())) + if len(files) == 0: + logging.warning(f"Found no files with file_path_pattern {file_path_pattern}.") + all_files.extend(files) + logging.info(f'Found {len(all_files)} cos files') + return all_files + +def identify_batches_from_pattern(file_path_patterns, group_by): + logging.info(f'Start identifying files and batches') + batches = set() + all_files = get_files_from_pattern(file_path_patterns) + + # get batches by applying the group by function to all file paths + for path_string in all_files: + part = eval('str(path_string)' + group_by, {"group_by": group_by, "path_string": path_string}) + assert part != '', f'Could not extract batch with path_string {path_string} and group_by {group_by}' + batches.add(part) + + logging.info(f'Identified {len(batches)} batches') + logging.debug(f'List of batches: {batches}') + + return batches, all_files + + +def perform_process(process, batch, cos_files): + logging.debug(f'Check coordinator files for batch {batch}.') + # init coordinator files + coordinator_dir = gw_coordinator_path + lock_file = str(coordinator_dir / (batch + gw_lock_file_suffix)) + processed_file = str(coordinator_dir / (batch + gw_processed_file_suffix)) + error_file = str(coordinator_dir / (batch + gw_error_file_suffix)) + + if s3coordinator.exists(lock_file): + # remove strugglers + last_modified = s3coordinator.info(lock_file)['LastModified'] + if (datetime.now(last_modified.tzinfo) - last_modified).total_seconds() > gw_lock_timeout: + 
logging.info(f'Lock file {lock_file} is expired.') + s3coordinator.rm(lock_file) + else: + logging.debug(f'Batch {batch} is locked.') + return + + if s3coordinator.exists(processed_file): + logging.debug(f'Batch {batch} is processed.') + return + + if s3coordinator.exists(error_file): + if gw_ignore_error_files: + logging.info(f'Ignoring previous error in batch {batch} and rerun.') + else: + logging.debug(f'Batch {batch} has error.') + return + + logging.debug(f'Locking batch {batch}.') + s3coordinator.touch(lock_file) + logging.info(f'Processing batch {batch}.') + + # Create input and target directories + input_path = Path(gw_local_input_path) + target_path = Path(gw_local_target_path) + assert not input_path.exists(), (f'gw_local_input_path ({gw_local_input_path}) already exists. ' + f'Please provide a new input path.') + assert not target_path.exists(), (f'gw_local_target_path ({gw_local_target_path}) already exists. ' + f'Please provide a new target path.') + input_path.mkdir(parents=True) + target_path.mkdir(parents=True) + + # Download cos files to local input folder + batch_fileset = list(filter(lambda file: batch in file, cos_files)) + if gw_additional_source_files != '': + additional_source_files = [f.strip() for f in gw_additional_source_files.split(',')] + batch_fileset.extend(additional_source_files) + logging.info(f'Downloading {len(batch_fileset)} files from COS') + for cos_file in batch_fileset: + local_file = str(input_path / cos_file.split('/', 1)[-1]) + logging.debug(f'Downloading {cos_file} to {local_file}') + s3source.get(cos_file, local_file) + + # processing files with custom process + try: + target_files = process(batch, ${component_inputs}) + except Exception as err: + logging.exception(err) + # Write error to file + with s3coordinator.open(error_file, 'w') as f: + f.write(f"{type(err).__name__} in batch {batch}: {err}") + s3coordinator.rm(lock_file) + logging.error(f'Continue processing.') + return + + # optional verify target files + if target_files is not None: + if isinstance(target_files, str): + target_files = [target_files] + for target_file in target_files: + if not os.path.exists(target_file): + logging.error(f'Target file {target_file} does not exist for batch {batch}.') + if any([not str(t).startswith(gw_local_target_path) for t in target_files]): + logging.warning('Some target files are not in target path. Only files in target path are uploaded.') + else: + logging.info(f'Cannot verify batch {batch} (target files not provided). Using files in target_path.') + + # upload files in target path + local_target_files = list(target_path.glob('*')) + logging.info(f'Uploading {len(local_target_files)} target files to COS.') + for local_file in local_target_files: + cos_file = gw_target_path / local_file.relative_to(target_path) + logging.debug(f'Uploading {local_file} to {cos_file}') + s3target.put(str(local_file), str(cos_file)) + + logging.info(f'Remove local input and target files.') + shutil.rmtree(input_path) + shutil.rmtree(target_path) + + logging.info(f'Finished Batch {batch}.') + s3coordinator.touch(processed_file) + # Remove lock file + if s3coordinator.exists(lock_file): + s3coordinator.rm(lock_file) + else: + logging.warning(f'Lock file {lock_file} was removed by another process. 
'
+                        f'Consider increasing gw_lock_timeout (currently {gw_lock_timeout}s) to avoid repeated processing.')
+
+
+def process_wrapper(sub_process):
+    delay = random.randint(0, gw_max_time_wait_staggering)
+    logging.info(f'Staggering start, waiting for {delay} seconds')
+    time.sleep(delay)
+
+    # Init coordinator dir
+    coordinator_dir = gw_coordinator_path
+    s3coordinator.makedirs(coordinator_dir, exist_ok=True)
+
+    # get batches
+    cos_gw_batch_file = str(gw_source_path / gw_batch_file)
+    if (gw_batch_file is not None and (os.path.isfile(gw_batch_file) or s3source.exists(cos_gw_batch_file))):
+        if not os.path.isfile(gw_batch_file):
+            # Download batch file from s3
+            if s3batch_file.exists(gw_batch_file):
+                s3batch_file.get(gw_batch_file, gw_batch_file)
+            else:
+                s3batch_file.get(str(gw_source_path / gw_batch_file), gw_batch_file)
+        batches = load_batches_from_file(gw_batch_file)
+        if gw_file_path_pattern:
+            cos_files = get_files_from_pattern(gw_file_path_pattern)
+        else:
+            logging.warning('gw_file_path_pattern is not provided. '
+                            'Grid wrapper expects the wrapped operator to handle COS files instead of the automatic download and upload.')
+            cos_files = []
+    elif gw_file_path_pattern is not None and gw_group_by is not None:
+        batches, cos_files = identify_batches_from_pattern(gw_file_path_pattern, gw_group_by)
+    else:
+        raise ValueError("Cannot identify batches. "
+                         "Provide valid gw_batch_file (local path or path within source bucket) "
+                         "or gw_file_path_pattern and gw_group_by.")
+
+    # Iterate over all batches
+    for batch in batches:
+        perform_process(sub_process, batch, cos_files)
+
+    # Check and log status of batches
+    processed_status = [s3coordinator.exists(coordinator_dir / (batch + gw_processed_file_suffix)) for batch in batches]
+    lock_status = [s3coordinator.exists(coordinator_dir / (batch + gw_lock_file_suffix)) for batch in batches]
+    error_status = [s3coordinator.exists(coordinator_dir / (batch + gw_error_file_suffix)) for batch in batches]
+
+    logging.info(f'Finished current process. Status batches: '
+                 f'{sum(processed_status)} processed / {sum(lock_status)} locked / {sum(error_status)} errors / {len(processed_status)} total')
+
+    if sum(error_status):
+        logging.error(f'Found errors! 
Resolve errors and rerun operator with gw_ignore_error_files=True.') + # print all error messages + for error_file in s3coordinator.glob(str(coordinator_dir / ('**/*' + gw_error_file_suffix))): + with s3coordinator.open(error_file, 'r') as f: + logging.error(f.read()) + + +if __name__ == '__main__': + process_wrapper(${component_process}) diff --git a/src/c3/templates/python_dockerfile_template b/src/c3/templates/python_dockerfile_template new file mode 100644 index 00000000..d4498650 --- /dev/null +++ b/src/c3/templates/python_dockerfile_template @@ -0,0 +1,11 @@ +FROM ${base_image} +USER root +${additional_files_docker} +RUN pip install --upgrade pip +RUN pip install ipython nbformat +${requirements_docker} +ADD ${target_code} ${working_dir}${target_dir} +RUN chmod -R 777 ${working_dir} +USER default +WORKDIR "${working_dir}" +CMD ["${command}", "${target_dir}${target_code}"] diff --git a/src/c3/templates/s3kv_grid_wrapper_template.py b/src/c3/templates/s3kv_grid_wrapper_template.py new file mode 100644 index 00000000..799be82b --- /dev/null +++ b/src/c3/templates/s3kv_grid_wrapper_template.py @@ -0,0 +1,643 @@ +""" +${component_name} got wrapped by grid_wrapper, which wraps any CLAIMED component and implements the generic grid computing pattern https://romeokienzler.medium.com/the-generic-grid-computing-pattern-transforms-any-sequential-workflow-step-into-a-transient-grid-c7f3ca7459c8 + +CLAIMED component description: ${component_description} +""" + +# pip install s3fs boto3 pandas +# component dependencies +# ${component_dependencies} + +import os +import json +import random +import logging +import time +import glob +from pathlib import Path +import pandas as pd +import s3fs +from hashlib import sha256 + + + +# import component code +from ${component_name} import * + +#------------------REMOVE once pip install for s3kv is fixed +import os +import time +from datetime import datetime +import shutil +import boto3 +import json + + +class S3KV: + def __init__(self, s3_endpoint_url:str, bucket_name: str, + aws_access_key_id: str = None, aws_secret_access_key: str = None , enable_local_cache=True): + """ + Initializes the S3KV object with the given S3 bucket, AWS credentials, and Elasticsearch host. + + :param s3_endpoint_url: The s3 endpoint. + :param bucket_name: The name of the S3 bucket to use for storing the key-value data. + :param aws_access_key_id: (Optional) AWS access key ID. + :param aws_secret_access_key: (Optional) AWS secret access key. + """ + self.bucket_name = bucket_name + self.enable_local_cache = enable_local_cache + self.s3_client = boto3.client( + 's3', + endpoint_url=s3_endpoint_url, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key + ) + + if not os.path.exists('/tmp/s3kv_cache'): + os.makedirs('/tmp/s3kv_cache') + + def _get_object_key(self, key: str) -> str: + """ + Constructs the S3 object key for the given key. + + :param key: The key used to access the value in the S3 bucket. + :return: The S3 object key for the given key. + """ + return f"s3kv/{key}.json" + + def cache_all_keys(self): + """ + Saves all keys to the local /tmp directory as they are being added. + """ + keys = self.list_keys() + for key in keys: + value = self.get(key) + if value is not None: + with open(f'/tmp/s3kv_cache/{key}.json', 'w') as f: + json.dump(value, f) + + def get_from_cache(self, key: str) -> dict: + """ + Retrieves a key from the local cache if present, and clears old cache entries. + + :param key: The key to retrieve from the cache. 
+ :return: The value associated with the given key if present in the cache, else None. + """ + self.clear_old_cache() + cache_path = f'/tmp/s3kv_cache/{key}.json' + if os.path.exists(cache_path): + with open(cache_path, 'r') as f: + return json.load(f) + else: + return None + + + def add(self, key: str, value: dict, metadata: dict = None): + """ + Adds a new key-value pair to the S3KV database, caches it locally, and sends metadata to Elasticsearch. + + :param key: The key to be added. + :param value: The value corresponding to the key. + :param metadata: (Optional) Metadata associated with the data (will be sent to Elasticsearch). + """ + s3_object_key = self._get_object_key(key) + serialized_value = json.dumps(value) + self.s3_client.put_object(Bucket=self.bucket_name, Key=s3_object_key, Body=serialized_value) + + with open(f'/tmp/s3kv_cache/{key}.json', 'w') as f: + json.dump(value, f) + + + + def delete(self, key: str): + """ + Deletes a key-value pair from the S3KV database. + + :param key: The key to be deleted. + """ + s3_object_key = self._get_object_key(key) + self.s3_client.delete_object(Bucket=self.bucket_name, Key=s3_object_key) + + cache_path = f'/tmp/s3kv_cache/{key}.json' + if os.path.exists(cache_path): + os.remove(cache_path) + + + def get(self, key: str, default: dict = None) -> dict: + """ + Retrieves the value associated with the given key from the S3KV database. + + :param key: The key whose value is to be retrieved. + :param default: (Optional) The default value to return if the key does not exist. + :return: The value associated with the given key, or the default value if the key does not exist. + """ + s3_object_key = self._get_object_key(key) + try: + response = self.s3_client.get_object(Bucket=self.bucket_name, Key=s3_object_key) + value = response['Body'].read() + return json.loads(value) + except self.s3_client.exceptions.NoSuchKey: + return default + + + def list_keys(self) -> list: + """ + Lists all the keys in the S3KV database. + + :return: A list of all keys in the database. + """ + response = self.s3_client.list_objects_v2(Bucket=self.bucket_name, Prefix="") + keys = [obj['Key'][5:-5] for obj in response.get('Contents', []) if obj['Key'].endswith('.json')] + return keys + + + def clear_cache(self): + """ + Clears the local cache by removing all cached JSON files. + """ + cache_directory = '/tmp/s3kv_cache' + if os.path.exists(cache_directory): + shutil.rmtree(cache_directory) + os.makedirs('/tmp/s3kv_cache') + + + def clear_old_cache(self, max_days: int = 7): + """ + Clears the cache for keys that have been in the cache for longer than a specific number of days. + + :param max_days: The maximum number of days a key can stay in the cache before being cleared. + """ + cache_directory = '/tmp/s3kv_cache' + current_time = time.time() + + if os.path.exists(cache_directory): + for filename in os.listdir(cache_directory): + file_path = os.path.join(cache_directory, filename) + if os.path.isfile(file_path): + file_age = current_time - os.path.getmtime(file_path) + if file_age > max_days * 86400: # Convert days to seconds + os.remove(file_path) + + + def clear_cache_for_key(self, key: str): + """ + Clears the local cache for a specific key in the S3KV database. + + :param key: The key for which to clear the local cache. + """ + cache_path = f'/tmp/s3kv_cache/{key}.json' + if os.path.exists(cache_path): + os.remove(cache_path) + + + def key_exists(self, key: str) -> bool: + """ + Checks if a key exists in the S3KV database. + + :param key: The key to check. 
+ :return: True if the key exists, False otherwise. + """ + s3_object_key = self._get_object_key(key) + try: + self.s3_client.head_object(Bucket=self.bucket_name, Key=s3_object_key) + return True + except Exception as e: + # Return false even if response is unauthorized or similar + return False + + + def list_keys_with_prefix(self, prefix: str) -> list: + """ + Lists all the keys in the S3KV database that have a specific prefix. + + :param prefix: The prefix to filter the keys. + :return: A list of keys in the database that have the specified prefix. + """ + response = self.s3_client.list_objects_v2(Bucket=self.bucket_name, Prefix=prefix) + keys = [obj['Key'][5:-5] for obj in response.get('Contents', []) if obj['Key'].endswith('.json')] + return keys + + + def copy_key(self, source_key: str, destination_key: str): + """ + Copies the value of one key to another key in the S3KV database. + + :param source_key: The key whose value will be copied. + :param destination_key: The key to which the value will be copied. + """ + source_s3_object_key = self._get_object_key(source_key) + destination_s3_object_key = self._get_object_key(destination_key) + + response = self.s3_client.get_object(Bucket=self.bucket_name, Key=source_s3_object_key) + value = response['Body'].read() + + self.s3_client.put_object(Bucket=self.bucket_name, Key=destination_s3_object_key, Body=value) + + # Copy the key in the local cache if it exists + source_cache_path = f'/tmp/s3kv_cache/{source_key}.json' + destination_cache_path = f'/tmp/s3kv_cache/{destination_key}.json' + if os.path.exists(source_cache_path): + shutil.copy(source_cache_path, destination_cache_path) + + + def get_key_size(self, key: str) -> int: + """ + Gets the size (file size) of a key in the S3KV database. + + :param key: The key whose size will be retrieved. + :return: The size (file size) of the key in bytes, or 0 if the key does not exist. + """ + s3_object_key = self._get_object_key(key) + try: + response = self.s3_client.head_object(Bucket=self.bucket_name, Key=s3_object_key) + return response['ContentLength'] + except self.s3_client.exceptions.NoSuchKey: + return 0 + + + def get_key_last_updated_time(self, key: str) -> float: + """ + Gets the last updated time of a key in the S3KV database. + + :param key: The key whose last updated time will be retrieved. + :return: The last updated time of the key as a floating-point timestamp, or 0 if the key does not exist. + """ + s3_object_key = self._get_object_key(key) + try: + response = self.s3_client.head_object(Bucket=self.bucket_name, Key=s3_object_key) + last_modified = response['LastModified'] + st = time.mktime(last_modified.timetuple()) + + return datetime.fromtimestamp(st) + + except self.s3_client.exceptions.NoSuchKey: + return 0 + + + def set_bucket_policy(self): + """ + Sets a bucket policy to grant read and write access to specific keys used by the S3KV library. + """ + policy = { + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "S3KVReadWriteAccess", + "Effect": "Allow", + "Principal": { + "AWS": "*" + }, + "Action": [ + "s3:GetObject", + "s3:PutObject" + ], + "Resource": f"arn:aws:s3:::{self.bucket_name}/s3kv/*" + } + ] + } + + policy_json = json.dumps(policy) + self.s3_client.put_bucket_policy(Bucket=self.bucket_name, Policy=policy_json) + + + def tag_key(self, key: str, tags: dict): + """ + Tags a key in the S3KV database with the provided tags. + + :param key: The key to be tagged. + :param tags: A dictionary containing the tags to be added to the key. 
+ For example, {'TagKey1': 'TagValue1', 'TagKey2': 'TagValue2'} + """ + s3_object_key = self._get_object_key(key) + + # Convert the tags dictionary to a format compatible with the `put_object_tagging` method + tagging = {'TagSet': [{'Key': k, 'Value': v} for k, v in tags.items()]} + + # Apply the tags to the object + self.s3_client.put_object_tagging(Bucket=self.bucket_name, Key=s3_object_key, Tagging=tagging) + + + def tag_keys_with_prefix(self, prefix: str, tags: dict): + """ + Tags all keys in the S3KV database with the provided prefix with the specified tags. + + :param prefix: The prefix of the keys to be tagged. + :param tags: A dictionary containing the tags to be added to the keys. + For example, {'TagKey1': 'TagValue1', 'TagKey2': 'TagValue2'} + """ + keys_to_tag = self.list_keys_with_prefix(prefix) + + for key in keys_to_tag: + self.tag_key(key, tags) + + + def merge_keys(self, source_keys: list, destination_key: str): + """ + Merges the values of source keys into the value of the destination key in the S3KV database. + + :param source_keys: A list of source keys whose values will be merged. + :param destination_key: The key whose value will be updated by merging the source values. + """ + destination_s3_object_key = self._get_object_key(destination_key) + + # Initialize an empty dictionary for the destination value + destination_value = {} + + # Retrieve and merge values from source keys + for source_key in source_keys: + source_value = self.get(source_key) + if source_value: + destination_value.update(source_value) + + # Update the destination value in the S3 bucket + serialized_value = json.dumps(destination_value) + self.s3_client.put_object(Bucket=self.bucket_name, Key=destination_s3_object_key, Body=serialized_value) + + # Update the value in the local cache if it exists + destination_cache_path = f'/tmp/s3kv_cache/{destination_key}.json' + with open(destination_cache_path, 'w') as f: + json.dump(destination_value, f) + + + + def find_keys_by_tag_value(self, tag_key: str, tag_value: str) -> list: + """ + Finds keys in the S3KV database based on the value of a specific tag. + + :param tag_key: The tag key to search for. + :param tag_value: The tag value to search for. + :return: A list of keys that have the specified tag key with the specified value. + """ + response = self.s3_client.list_objects_v2(Bucket=self.bucket_name, Prefix="s3kv/") + keys_with_tag = [] + + for obj in response.get('Contents', []): + s3_object_key = obj['Key'] + tags = self.get_tags(s3_object_key) + if tags and tag_key in tags and tags[tag_key] == tag_value: + keys_with_tag.append(s3_object_key[5:-5]) # Extract the key name + + return keys_with_tag + + def get_tags(self, s3_object_key: str) -> dict: + """ + Gets the tags of an object in the S3KV database. + + :param s3_object_key: The S3 object key whose tags will be retrieved. + :return: A dictionary containing the tags of the object. + """ + response = self.s3_client.get_object_tagging(Bucket=self.bucket_name, Key=s3_object_key) + tags = {} + for tag in response.get('TagSet', []): + tags[tag['Key']] = tag['Value'] + return tags + + + + def place_retention_lock(self, key: str, retention_days: int): + """ + Places a retention lock on a key in the S3KV database for the specified number of days. + + :param key: The key to place the retention lock on. + :param retention_days: The number of days to lock the key for. 
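+        Note (added by the editor): object retention in GOVERNANCE mode only works on
+        buckets created with S3 Object Lock enabled; on other buckets this call fails.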
+ """ + s3_object_key = self._get_object_key(key) + print(s3_object_key) + + retention_period = retention_days * 24 * 60 * 60 # Convert days to seconds + + self.s3_client.put_object_retention( + Bucket=self.bucket_name, + Key=s3_object_key, + Retention={ + 'Mode': 'GOVERNANCE', + 'RetainUntilDate': int(time.time()) + retention_period + } + ) + + + def remove_retention_lock(self, key: str): + """ + Removes the retention lock from a key in the S3KV database. + + :param key: The key to remove the retention lock from. + """ + s3_object_key = self._get_object_key(key) + + self.s3_client.put_object_retention( + Bucket=self.bucket_name, + Key=s3_object_key, + BypassGovernanceRetention=True, + Retention={ + + } + ) + + + def delete_by_tag(self, tag_key: str, tag_value: str): + """ + Deletes keys in the S3KV database based on a specific tag. + + :param tag_key: The tag key to match for deletion. + :param tag_value: The tag value to match for deletion. + """ + keys_to_delete = self.find_keys_by_tag_value(tag_key, tag_value) + + for key in keys_to_delete: + self.delete(key) + + + def apply_legal_hold(self, key: str): + """ + Applies a legal hold on a key in the S3KV database. + + :param key: The key on which to apply the legal hold. + """ + s3_object_key = self._get_object_key(key) + + self.s3_client.put_object_legal_hold( + Bucket=self.bucket_name, + Key=s3_object_key, + LegalHold={ + 'Status': 'ON' + } + ) + + + + + + def is_legal_hold_applied(self, key: str) -> bool: + """ + Checks if a key in the S3KV database is under legal hold. + + :param key: The key to check for legal hold. + :return: True if the key is under legal hold, False otherwise. + """ + s3_object_key = self._get_object_key(key) + + response = self.s3_client.get_object_legal_hold(Bucket=self.bucket_name, Key=s3_object_key) + + legal_hold_status = response.get('LegalHold', {}).get('Status') + return legal_hold_status == 'ON' + + + def release_legal_hold(self, key: str): + """ + Releases a key from legal hold in the S3KV database. + + :param key: The key to release from legal hold. + """ + s3_object_key = self._get_object_key(key) + + self.s3_client.put_object_legal_hold( + Bucket=self.bucket_name, + Key=s3_object_key, + LegalHold={ + 'Status': 'OFF' + } + ) + +#----------------------------------------------------------- + + +def explode_connection_string(cs): + if cs is None: + return None, None, None, None + if cs.startswith('cos') or cs.startswith('s3'): + buffer=cs.split('://')[1] + access_key_id=buffer.split('@')[0].split(':')[0] + secret_access_key=buffer.split('@')[0].split(':')[1] + endpoint=f"https://{buffer.split('@')[1].split('/')[0]}" + path=buffer.split('@')[1].split('/', 1)[1] + return (access_key_id, secret_access_key, endpoint, path) + else: + return (None, None, None, cs) + # TODO consider cs as secret and grab connection string from kubernetes + + + +# File with batches. Provided as a comma-separated list of strings, keys in a json dict or single column CSV with 'filename' has header. 
Either a local path or a connection string like [cos|s3]://user:pw@endpoint/path
+gw_batch_file = os.environ.get('gw_batch_file', None)
+(gw_batch_file_access_key_id, gw_batch_file_secret_access_key, gw_batch_file_endpoint, gw_batch_file) = explode_connection_string(gw_batch_file)
+# Optional column name for a csv batch file (default: 'filename')
+gw_batch_file_col_name = os.environ.get('gw_batch_file_col_name', 'filename')
+
+# cos gw_coordinator_connection
+gw_coordinator_connection = os.environ.get('gw_coordinator_connection')
+(gw_coordinator_access_key_id, gw_coordinator_secret_access_key, gw_coordinator_endpoint, gw_coordinator_path) = explode_connection_string(gw_coordinator_connection)
+
+# maximal wait time for staggering start
+gw_max_time_wait_staggering = int(os.environ.get('gw_max_time_wait_staggering',60))
+
+# component interface
+#${component_interface}
+
+def load_batches_from_file(batch_file):
+    # Download batch file from s3
+    s3_batch_file = s3fs.S3FileSystem(
+        anon=False,
+        key=gw_batch_file_access_key_id,
+        secret=gw_batch_file_secret_access_key,
+        client_kwargs={'endpoint_url': gw_batch_file_endpoint})
+    s3_batch_file.get(batch_file, batch_file)
+
+    if batch_file.endswith('.json'):
+        # load batches from keys of a json file
+        logging.info(f'Loading batches from json file: {batch_file}')
+        with open(batch_file, 'r') as f:
+            batch_dict = json.load(f)
+        batches = batch_dict.keys()
+
+    elif batch_file.endswith('.csv'):
+        # load batches from keys of a csv file
+        logging.info(f'Loading batches from csv file: {batch_file}')
+        df = pd.read_csv(batch_file, header='infer')
+        assert gw_batch_file_col_name in df.columns, \
+            f'gw_batch_file_col_name {gw_batch_file_col_name} not in columns of batch file {batch_file}'
+        batches = df[gw_batch_file_col_name].to_list()
+
+    elif batch_file.endswith('.txt'):
+        # Load batches from comma-separated txt file
+        logging.info(f'Loading comma-separated batch strings from file: {batch_file}')
+        with open(batch_file, 'r') as f:
+            batch_string = f.read()
+        batches = [b.strip() for b in batch_string.split(',')]
+    else:
+        raise ValueError(f'C3 only supports batch files of type '
+                         f'json (batches = dict keys), '
+                         f'csv (batches = column values), or '
+                         f'txt (batches = comma-separated list).')
+
+    logging.info(f'Loaded {len(batches)} batches')
+    logging.debug(f'List of batches: {batches}')
+    assert len(batches) > 0, f"batch_file {batch_file} has no batches."
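+    # Illustrative example (added comment, hypothetical file): a batch file 'batches.csv'
+    # whose gw_batch_file_col_name column (default 'filename') contains part-0001 and
+    # part-0002 yields batches == ['part-0001', 'part-0002']; a .json file contributes its
+    # top-level keys, and a .txt file a comma-separated list of batch names.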
+ return batches + + +def perform_process(process, batch, coordinator): + logging.debug(f'Check coordinator files for batch {batch}.') + + batch_id = sha256(batch.encode('utf-8')).hexdigest() # ensure no special characters break cos + logging.info(f'Generating {batch_id} for {batch}') + + if coordinator.key_exists(batch_id): + if coordinator.get(batch_id) == 'locked': + logging.debug(f'Batch {batch_id} is locked') + return + elif coordinator.get(batch_id) == 'processed': + logging.debug(f'Batch {batch_id} is processed') + return + else: + logging.debug(f'Batch {batch_id} is failed') + return + + + logging.debug(f'Locking batch {batch_id}.') + coordinator.add(batch_id,'locked') + + # processing files with custom process + logging.info(f'Processing batch {batch_id}.') + try: + process(batch, ${component_inputs}) + except Exception as err: + logging.exception(err) + coordinator.add(batch_id,f"{type(err).__name__} in batch {batch_id}: {err}") + logging.error(f'Continue processing.') + return + + logging.info(f'Finished Batch {batch_id}.') + coordinator.add(batch_id,'processed') + + +def process_wrapper(sub_process): + delay = random.randint(0, gw_max_time_wait_staggering) + logging.info(f'Staggering start, waiting for {delay} seconds') + time.sleep(delay) + + # Init coordinator + coordinator = S3KV(gw_coordinator_endpoint, + gw_coordinator_path, + gw_coordinator_access_key_id, gw_coordinator_secret_access_key, + enable_local_cache=False) + + + # get batches + batches = load_batches_from_file(gw_batch_file) + + # Iterate over all batches + for batch in batches: + perform_process(sub_process, batch, coordinator) + + # Check and log status of batches + processed_status = sum(coordinator.get(batch_id) == 'processed' for batch_id in batches) + lock_status = sum(coordinator.get(batch_id) == 'locked' for batch_id in batches) + exists_status = sum(coordinator.key_exists(batch_id) for batch_id in batches) + error_status = exists_status - processed_status - lock_status + + logging.info(f'Finished current process. Status batches: ' + f'{processed_status} processed / {lock_status} locked / {error_status} errors / {len(batches)} total') + + +if __name__ == '__main__': + process_wrapper(${component_process}) diff --git a/src/c3/templates/simple_grid_wrapper_template.py b/src/c3/templates/simple_grid_wrapper_template.py new file mode 100644 index 00000000..66908801 --- /dev/null +++ b/src/c3/templates/simple_grid_wrapper_template.py @@ -0,0 +1,105 @@ +""" +${component_name} got wrapped by grid_wrapper, which wraps any CLAIMED component and implements the generic grid computing pattern https://romeokienzler.medium.com/the-generic-grid-computing-pattern-transforms-any-sequential-workflow-step-into-a-transient-grid-c7f3ca7459c8 +This simple grid wrapper just scans a folder and for each file the grid_process function is called. 
Locking is achieved the following way: +Given source file1.ext is processed, simple_grid_wrapper creates files in the target_directory following the pattern file1.{STATUS}.ext where STATUS in: +LOCKED +PROCESSED +FAILED + + +CLAIMED component description: ${component_description} +""" + +# pip install pandas + +# component dependencies +# ${component_dependencies} + +import os +import json +import random +import logging +import time +import glob +from pathlib import Path +import pandas as pd + +# import component code +from ${component_name} import * + + +#folder containing input data in single files +sgw_source_folder = os.environ.get('sgw_source_folder') + +#folder to store the output data in single files. Default: sgw_source_folder, in case sgw_source_folder==sgw_target_folder, files containing .LOCKED., .PROCESSED., .FAILED. are ignored +sgw_target_folder = os.environ.get('sgw_target_folder', sgw_source_folder) + +# component interface +${component_interface} + +def get_next_batch(): + files = os.listdir(sgw_source_folder) + if sgw_source_folder == sgw_target_folder: + files = [ + f for f in files + if not any(keyword in f for keyword in ["LOCKED", "PROCESSED", "FAILED"]) + ] + + # Filter files and check if corresponding target file exists + filtered_files = [] + for file in files: + file_name, file_ext = os.path.splitext(file) + + # Create target file names with LOCKED, PROCESSED, FAILED extensions + target_file_locked = f"{file_name}.LOCKED{file_ext}" + target_file_processed = f"{file_name}.PROCESSED{file_ext}" + target_file_failed = f"{file_name}.FAILED{file_ext}" + + # Check if any of the target files exists + if not any( + os.path.exists(os.path.join(sgw_target_folder, target_file)) + for target_file in [target_file_locked, target_file_processed, target_file_failed] + ): + filtered_files.append(file) + + if filtered_files: + return random.choice(filtered_files) + else: + return None + + +def process_wrapper(sub_process): + sgw_target_folder_path = Path(sgw_target_folder) + sgw_target_folder_path.mkdir(exist_ok=True, parents=True) + + while True: + file_to_process = get_next_batch() + logging.info(f"Processing batch: {file_to_process}") + if file_to_process is None: + break + + file_name = Path(file_to_process).stem + file_ext = Path(file_to_process).suffix + locked_file = sgw_target_folder+f"/{file_name}.LOCKED{file_ext}" + locked_file_path = Path(locked_file) + + try: + locked_file_path.touch() + sub_process(sgw_source_folder +'/'+ file_to_process, locked_file) + processed_file = sgw_target_folder+f"/{file_name}.PROCESSED{file_ext}" + locked_file_path.rename(processed_file) + + except Exception as e: + failed_file = sgw_target_folder+f"/{file_name}.FAILED{file_ext}" + locked_file_path.rename(failed_file) + + with open(failed_file, 'w') as f: + f.write(f"Exception occurred: {str(e)}\n") + + logging.error(f"Processing failed for {file_to_process}: {str(e)}") + + logging.info("Finished processing all batches.") + + +if __name__ == '__main__': + process_wrapper(${component_process}) diff --git a/src/c3/utils.py b/src/c3/utils.py new file mode 100644 index 00000000..0bbe5442 --- /dev/null +++ b/src/c3/utils.py @@ -0,0 +1,146 @@ +import os +import logging +import nbformat +import re +import subprocess +from nbconvert.exporters import PythonExporter + + +def convert_notebook(path): + notebook = nbformat.read(path, as_version=4) + + # backwards compatibility (v0.1 description was included in second cell, merge first two markdown cells) + if notebook['cells'][0]['cell_type'] == 'markdown' 
and notebook['cells'][1]['cell_type'] == 'markdown': + logging.info('Merge first two markdown cells. File name is used as operator name, not first markdown cell.') + notebook['cells'][1]['source'] = notebook['cells'][0]['source'] + '\n' + notebook['cells'][1]['source'] + notebook['cells'] = notebook['cells'][1:] + + for cell in notebook['cells']: + if cell['cell_type'] == 'markdown': + # convert markdown to doc string + cell['cell_type'] = 'code' + cell['source'] = '"""\n' + cell['source'] + '\n"""' + cell['outputs'] = [] + cell['execution_count'] = 0 + if cell['cell_type'] == 'code' and re.search('![ ]*pip', cell['source']): + # replace !pip with #pip + cell['source'] = re.sub('![ ]*pip[ ]*install', '# pip install', cell['source']) + + # convert tp python script + (code, _) = PythonExporter().from_notebook_node(notebook) + + # add import get_ipython + code = 'from IPython import get_ipython \n' + code + + py_path = path.split('/')[-1].replace('.ipynb', '.py').replace('-', '_') + + assert not os.path.exists(py_path), f"File {py_path} already exist. Cannot convert notebook." + with open(py_path, 'w') as py_file: + py_file.write(code) + + return py_path + + +def increase_image_version(last_version): + try: + # increase last version value by 1 + version = last_version.split('.') + version[-1] = str(int(version[-1]) + 1) + version = '.'.join(version) + except: + # fails if a string value was used for the last tag + version = last_version + '.1' + logging.debug(f'Failed to increase last value, adding .1') + pass + return version + + +def pull_docker_image_tags(image): + logging.warning("The current implementation can only query local docker images. " + "Please use an argument '-v ' to avoid duplicates.") + # TODO: Add script for reading image tags from docker hub + # list images + output = subprocess.run( + ['docker', 'image', 'ls', image], + stdout=subprocess.PIPE + ).stdout.decode('utf-8') + try: + # remove header + image_list = output.splitlines()[1:] + # get list of image tags + image_tags = [line.split()[1] for line in image_list] + except: + image_tags = [] + logging.error(f"Could not load image tags from 'docker image ls' output: {output}") + pass + + # filter latest and none + image_tags = [t for t in image_tags if t not in ['latest', '']] + return image_tags + + +def pull_icr_image_tags(image): + # list images from icr + output = subprocess.run( + ['ibmcloud', 'cr', 'images', '--restrict', image.split('icr.io/', 1)[1]], + stdout=subprocess.PIPE + ).stdout.decode('utf-8') + + try: + assert 'You have no images in the namespaces' not in output + # remove header and final status + image_list = output.splitlines()[3:-2] + # get list of image tags + image_tags = [line.split()[1] for line in image_list] + except: + image_tags = [] + logging.warning(f"Could not load image tags from 'ibmcloud cr images' output: {output}") + pass + + # filter latest and none + image_tags = [t for t in image_tags if t not in ['latest', '']] + return image_tags + + +def get_image_version(repository, name): + """ + Get current version of the image from the registry and increase the version by 1. + Defaults to 0.1 if no image is found in the registry. 
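+    Example (added note): if the registry already holds the numeric tags 0.1 and 0.3,
+    the highest tag 0.3 is bumped to 0.4; a non-numeric tag such as 0.1beta becomes 0.1beta.1.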
+ """ + if repository is None: + logging.debug('Using 0.1 as local version.') + return '0.1' + + logging.debug(f'Get image version from registry.') + if 'docker.io' in repository: + logging.debug('Get image tags from docker.') + image_tags = pull_docker_image_tags(f'{repository}/claimed-{name}') + elif 'icr.io' in repository: + logging.debug('Get image tags from ibmcloud container registry.') + image_tags = pull_icr_image_tags(f'{repository}/claimed-{name}') + else: + logging.warning('Unrecognised container registry, using docker to query image tags.') + image_tags = pull_docker_image_tags(f'{repository}/claimed-{name}') + logging.debug(f'Image tags: {image_tags}') + + def check_only_numbers(test_str): + return set(test_str) <= set('.0123456789') + + if len(image_tags) == 0: + # default version + version = '0.1' + logging.info(f'Using default version {version}. No prior image tag found for {repository}/claimed-{name}.') + + elif not check_only_numbers(image_tags[0]): + # increase last version + version = increase_image_version(image_tags[0]) + logging.info(f'Using version {version} based on last version {image_tags[0]}.') + + else: + # find the highest numerical version + image_tags = list(filter(check_only_numbers, image_tags)) + image_tags.sort(key=lambda s: list(map(int, s.split('.')))) + version = increase_image_version(image_tags[-1]) + logging.info(f'Using version {version} based on highest previous version {image_tags[-1]}.') + + return version diff --git a/src/setup.py b/src/setup.py new file mode 100644 index 00000000..aa3a8d25 --- /dev/null +++ b/src/setup.py @@ -0,0 +1,10 @@ +from setuptools import setup, find_packages + +setup( + name='c3', + packages=find_packages(), + install_requires=[ + 'ipython', + 'nbconvert', + ], +) diff --git a/tests/example_notebook.ipynb b/tests/example_notebook.ipynb new file mode 100644 index 00000000..da2883d5 --- /dev/null +++ b/tests/example_notebook.ipynb @@ -0,0 +1,114 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Test description" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install numpy\n", + "\n", + "! pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hello\n" + ] + } + ], + "source": [ + "%%bash\n", + "echo hello\n", + "echo world" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# A comment one line above os.getenv is the description of this variable.\n", + "input_path = os.getenv('input_path')\n", + "\n", + "# You can change the type by using int(), float(), or bool().\n", + "batch_size = int(os.getenv('batch_size', 16))\n", + "\n", + "# The commas in the previous comment are deleted because the yaml file requires descriptions without commas.\n", + "debug = bool(os.getenv('debug', False))\n", + "\n", + "# Output parameters are starting with \"output_\"\n", + "output_path = os.getenv('output_path')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def your_function(*args):\n", + " \"\"\"\n", + " The compiler only includes the first doc string. 
Therefore, this text is not included.\n", + " \"\"\"\n", + "\n", + " print(args)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "your_function(input_path, batch_size, debug, output_path)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/example_rscript.R b/tests/example_rscript.R new file mode 100644 index 00000000..4f4e19d9 --- /dev/null +++ b/tests/example_rscript.R @@ -0,0 +1,17 @@ +# Reading env variables + +name <- Sys.getenv('name') + +default <- Sys.getenv('default', "default") + +number <- as.numeric(Sys.getenv('number', 10)) + +print(paste("hello", name)) + +print(number) + +# apt install libgdal-dev + +# Install packages +install.packages('readr') +library(readr) diff --git a/tests/example_script.py b/tests/example_script.py new file mode 100644 index 00000000..8d709e14 --- /dev/null +++ b/tests/example_script.py @@ -0,0 +1,44 @@ +""" +This is the operator description. +""" + +# pip install numpy + +#!pip install pandas + +# dnf update + +import os +import numpy as np + +# A comment one line above os.getenv is the description of this variable. +input_path = os.environ.get('input_path', '') # ('not this') + +# type casting to int(), float(), or bool() +batch_size = int(os.environ.get('batch_size', 16)) # (not this) + +# Commas in the previous comment are deleted because the yaml file requires descriptions without commas. +debug = bool(os.getenv('debug', False)) + +output_path = os.getenv('output_path', 'default_value') + + +def main(*args): + """ + The compiler only includes the first doc string.This text should not be included. + """ + _ = np.random.randn(5) + + os.environ['test_output'] = 'test' + + print(args) + + +def process(batch, *args): + # process function for grid wrapper + print('Execute batch:', batch) + main(batch, *args, input_path, batch_size, debug, output_path) + + +if __name__ == '__main__': + main(input_path, batch_size, debug, output_path) diff --git a/tests/test_compiler.py b/tests/test_compiler.py new file mode 100644 index 00000000..b3fd2772 --- /dev/null +++ b/tests/test_compiler.py @@ -0,0 +1,173 @@ +import os +import subprocess +import pytest +from typing import Any, Dict, List, Optional, Union +from pathlib import Path +from src.c3.utils import convert_notebook, increase_image_version, get_image_version +from src.c3.pythonscript import Pythonscript + +TEST_NOTEBOOK_PATH = 'example_notebook.ipynb' +TEST_SCRIPT_PATH = 'example_script.py' +TEST_RSCRIPT_PATH = 'example_rscript.R' +DUMMY_REPO = 'test' + +test_convert_notebook_input = [ + ( + TEST_NOTEBOOK_PATH, + ['input_path', 'batch_size', 'debug', 'output_path'] + ) +] + +@pytest.mark.parametrize( + "notebook_path, env_values", + test_convert_notebook_input, +) +def test_convert_notebook( + notebook_path: str, + env_values: List, +): + # convert notebook + script_path = convert_notebook(notebook_path) + + assert os.path.isfile(script_path), f"Error! 
No file {script_path}" + + # check if script runs with errors + for env in env_values: + os.environ[env] = '0' + subprocess.run(['python', script_path], check=True) + + # check if converted script is processable for create_operator + py = Pythonscript(script_path) + name = py.get_name() + assert isinstance(name, str), "Name is not a string." + description = py.get_description() + assert isinstance(description, str), "Description is not a string." + inputs = py.get_inputs() + assert isinstance(inputs, dict), "Inputs is not a dict." + outputs = py.get_outputs() + assert isinstance(outputs, dict), "Ouputs is not a dict." + requirements = py.get_requirements() + assert isinstance(requirements, list), "Requirements is not a list." + + # remove temporary file + os.remove(script_path) + + +test_get_remote_version_input = [ + ('us.icr.io/geodn', 'sleep',), + ('docker.io/romeokienzler', 'predict-image-endpoint',), +] + + +@pytest.mark.parametrize( + "repository, name", + test_get_remote_version_input, +) +def test_get_remote_version( + repository: str, + name: str, +): + # testing icr.io requires 'ibmcloud login' + version = get_image_version(repository, name) + assert version != '0.1', \ + f"get_image_version returns default version 0.1" + + +test_increase_version_input = [ + ('0.1', '0.2'), + ('2.1.13', '2.1.14'), + ('0.1beta', '0.1beta.1'), + ('0.1beta.1', '0.1beta.2'), +] + + +@pytest.mark.parametrize( + "last_version, expected_version", + test_increase_version_input, +) +def test_increase_version( + last_version: str, + expected_version: str, +): + new_version = increase_image_version(last_version) + assert new_version == expected_version, \ + f"Mismatch between new version {new_version} and expected version {expected_version}" + + +test_create_operator_input = [ + ( + TEST_SCRIPT_PATH, + DUMMY_REPO, + [TEST_NOTEBOOK_PATH], + ), + ( + TEST_RSCRIPT_PATH, + DUMMY_REPO, + [], + ), + ( + TEST_NOTEBOOK_PATH, + DUMMY_REPO, + [], + ), +] +@pytest.mark.parametrize( + "file_path, repository, args", + test_create_operator_input, +) +def test_create_operator( + file_path: str, + repository: str, + args: List, +): + subprocess.run(['python', '../src/c3/create_operator.py', file_path, *args, '-r', repository, + '--local_mode', '-v', 'test', '--log_level', 'DEBUG', '--overwrite'], + check=True) + + file = Path(file_path) + file.with_suffix('.yaml').unlink() + file.with_suffix('.job.yaml').unlink() + file.with_suffix('.cwl').unlink() + image_name = f"{repository}/claimed-{file_path.rsplit('.')[0].replace('_', '-')}:test" + subprocess.run(['docker', 'run', image_name], + check=True) + + +test_create_gridwrapper_input = [ + ( + TEST_SCRIPT_PATH, + 'process', + [TEST_NOTEBOOK_PATH], + ), + ( + TEST_SCRIPT_PATH, + 'process', + [TEST_NOTEBOOK_PATH, '--backend', 'cos'], + ), + ( + TEST_NOTEBOOK_PATH, + 'your_function', + [], + ), +] +@pytest.mark.parametrize( + "file_path, process, args", + test_create_gridwrapper_input, +) +def test_create_gridwrapper( + file_path: str, + process: str, + args: List, +): + subprocess.run(['python', '../src/c3/create_gridwrapper.py', file_path, *args, '--overwrite', + '-p', process, '--local_mode', '-v', 'test', '--log_level', 'DEBUG'], check=True) + + file = Path(file_path) + gw_file = file.parent / f'gw_{file.stem}.py' + + gw_file.with_suffix('.yaml').unlink() + gw_file.with_suffix('.job.yaml').unlink() + gw_file.with_suffix('.cwl').unlink() + image_name = f"claimed-gw-{file_path.rsplit('.')[0].replace('_', '-')}:test" + # TODO: Modify subprocess call to test grid wrapper + # 
subprocess.run(['docker', 'run', image_name], check=True) diff --git a/tests/test_operator_utils.py b/tests/test_operator_utils.py new file mode 100644 index 00000000..67d1f4b6 --- /dev/null +++ b/tests/test_operator_utils.py @@ -0,0 +1,9 @@ +from c3.operator_utils import explode_connection_string + + +def test_explode_connection_string(): + (ac, sc, ep, p) = explode_connection_string('cos://DF)S)DFU8:!#$%^*(){}[]"><@s3.us-east.cloud-object-storage.appdomain.cloud/claimed-test/ds=335/dl=50254/dt=20220101/tm=000000/lvl=0/gh=0/S1A_IW_GRDH_1SDV_20220101T090715_20220101T090740_041265_04E78F_73F0_VH.cog') + assert ac=='DF)S)DFU8' + assert sc=='!#$%^*(){}[]"><' + assert ep=='https://s3.us-east.cloud-object-storage.appdomain.cloud' + assert p=='claimed-test/ds=335/dl=50254/dt=20220101/tm=000000/lvl=0/gh=0/S1A_IW_GRDH_1SDV_20220101T090715_20220101T090740_041265_04E78F_73F0_VH.cog' \ No newline at end of file