diff --git a/build/lib/c3/__init__.py b/build/lib/c3/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/build/lib/c3/create_containerless_operator.py b/build/lib/c3/create_containerless_operator.py deleted file mode 100644 index b72f48d2..00000000 --- a/build/lib/c3/create_containerless_operator.py +++ /dev/null @@ -1,94 +0,0 @@ -import argparse -import os -import sys -import logging -import subprocess -import re -from c3.create_operator import create_cwl_component -from c3.pythonscript import Pythonscript -from c3.templates import component_setup_code_wo_logging, python_component_setup_code - -def create_containerless_operator( - file_path, - version, - skip_logging = False - ): - - if version is None: - version = 'latest' - - logging.debug(f'Called create_containerless_operator {version} with {file_path}') - - filename, file_extension = os.path.splitext(file_path) - - if file_extension != '.py': - raise NotImplementedError('Containerless operators currenly only support python scripts') - - all_pip_packages_found = '' - with open(file_path, 'r') as file: - for line in file: - if re.search('pip ', line): - pip_packages = re.sub('[#, ,!]*pip[ ]*install[ ]*', '', line) - logging.debug(f'PIP packages found: {pip_packages}') - all_pip_packages_found += (f' {pip_packages}') - logging.info(f'all PIP packages found: {all_pip_packages_found}') - - - # prepend init code to script - target_code = 'runnable.py' - - if os.path.exists(target_code): - os.remove(target_code) - - with open(file_path, 'r') as f: - script = f.read() - if skip_logging: - script = component_setup_code_wo_logging + script - else: - script = python_component_setup_code + script - with open(target_code, 'w') as f: - f.write(script) - - subprocess.run(';'.join(['rm -Rf claimedenv','python -m venv claimedenv', - 'source ./claimedenv/bin/activate', - f'pip install {all_pip_packages_found.strip()}', - 'pip list', - f'zip -r claimed-{filename}:{version}.zip {target_code} claimedenv', - 'rm -Rf claimedenv', - f'rm {target_code}']), shell=True) - script_data = Pythonscript(file_path) - inputs = script_data.get_inputs() - outputs = script_data.get_outputs() - - create_cwl_component(filename, "containerless", version, file_path, inputs, outputs) - - - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('FILE_PATH', type=str, - help='Path to python script or notebook') - parser.add_argument('ADDITIONAL_FILES', type=str, nargs='*', default=None, - help='Paths to additional files to include in the container image') - parser.add_argument('-v', '--version', type=str, default=None, - help='Container image version. 
Auto-increases the version number if not provided (default 0.1)') - parser.add_argument('-l', '--log_level', type=str, default='INFO') - args = parser.parse_args() - - # Init logging - root = logging.getLogger() - root.setLevel(args.log_level) - handler = logging.StreamHandler(sys.stdout) - formatter = logging.Formatter('%(levelname)s - %(message)s') - handler.setFormatter(formatter) - handler.setLevel(args.log_level) - root.addHandler(handler) - - create_containerless_operator( - file_path=args.FILE_PATH, - version=args.version, - ) - -if __name__ == '__main__': - main() diff --git a/build/lib/c3/create_gridwrapper.py b/build/lib/c3/create_gridwrapper.py deleted file mode 100644 index e8184ea3..00000000 --- a/build/lib/c3/create_gridwrapper.py +++ /dev/null @@ -1,256 +0,0 @@ -import logging -import os -import argparse -import sys -from string import Template -from c3.pythonscript import Pythonscript -from c3.utils import convert_notebook -from c3.create_operator import create_operator -from c3.templates import component_setup_code_wo_logging -import c3 - - -def wrap_component(component_path, - component_description, - component_dependencies, - component_interface, - component_inputs, - component_process, - backend, - ): - # get component name from path - component_name = os.path.splitext(os.path.basename(component_path))[0] - - logging.info(f'Using backend: {backend}') - - backends = { - 'local': c3.templates.grid_wrapper_template, - 'cos': c3.templates.cos_grid_wrapper_template, - 'legacy_cos': c3.templates.legacy_cos_grid_wrapper_template, - 's3kv': c3.templates.s3kv_grid_wrapper_template, - 'grid_wrapper': c3.templates.grid_wrapper_template, - 'cos_grid_wrapper': c3.templates.cos_grid_wrapper_template, - 'legacy_cos_grid_wrapper': c3.templates.legacy_cos_grid_wrapper_template, - 's3kv_grid_wrapper': c3.templates.s3kv_grid_wrapper_template, - 'simple_grid_wrapper': c3.templates.simple_grid_wrapper_template, - 'folder_grid_wrapper': c3.templates.folder_grid_wrapper_template, - } - gw_template = backends.get(backend) - - logging.debug(f'Using backend template: {gw_template}') - - grid_wrapper_code = gw_template.substitute( - component_name=component_name, - component_description=component_description, - component_dependencies=component_dependencies, - component_inputs=component_inputs, - component_interface=component_interface, - component_process=component_process, - ) - - # Write edited code to file - grid_wrapper_file = f'gw_{component_name}.py' - grid_wrapper_file_path = os.path.join(os.path.dirname(component_path), grid_wrapper_file) - # remove 'component_' from gw path - grid_wrapper_file_path = grid_wrapper_file_path.replace('component_', '') - with open(grid_wrapper_file_path, 'w') as f: - f.write(grid_wrapper_code) - - logging.info(f'Saved wrapped component to {grid_wrapper_file_path}') - - return grid_wrapper_file_path - - -def get_component_elements(file_path): - # get required elements from component code - py = Pythonscript(file_path) - # convert description into a string with a single line - description = (py.get_description().replace('\n', ' ').replace('"', '\'')) - inputs = py.get_inputs() - outputs = py.get_outputs() - dependencies = py.get_requirements() - - # combine dependencies list - dependencies = '\n# '.join(dependencies) - - # generate interface code from inputs - interface = '' - type_to_func = {'String': '', 'Boolean': 'bool', 'Integer': 'int', 'Float': 'float'} - for variable, d in inputs.items(): - interface += f"# {d['description']}\n" - if (d['type'] == 
'String' and d['default'] is not None and - (d['default'] == '' or d['default'][0] not in '\'\"')): - # Add quotation marks - d['default'] = "'" + d['default'] + "'" - interface += f"component_{variable} = {type_to_func[d['type']]}(os.getenv('{variable}', {d['default']}))\n" - - # TODO: Implement output interface - if len(outputs) > 0: - logging.warning('Found output paths in the component code which is currently not supported.') - - # generate kwargs for the subprocesses - process_inputs = ', '.join([f'{i}=component_{i}' for i in inputs.keys()]) - # use log level from grid wrapper - process_inputs = process_inputs.replace('component_log_level', 'log_level') - - return description, interface, process_inputs, dependencies - - -# Adding code -def edit_component_code(file_path, component_process): - file_name = os.path.basename(file_path) - if file_path.endswith('.ipynb'): - logging.info('Convert notebook to python script') - target_file = convert_notebook(file_path) - file_path = target_file - file_name = os.path.basename(file_path) - else: - # write edited code to different file - target_file = os.path.join(os.path.dirname(file_path), 'component_' + file_name.replace('-', '_')) - - target_file_name = os.path.basename(target_file) - - with open(file_path, 'r') as f: - script = f.read() - assert component_process in script, (f'Did not find the grid process {component_process} in the script. ' - f'Please provide the grid process in the arguments `-p `.') - # Add code for logging and cli parameters to the beginning of the script - script = component_setup_code_wo_logging + script - # replace old filename with new file name - script = script.replace(file_name, target_file_name) - with open(target_file, 'w') as f: - f.write(script) - - if '__main__' not in script: - logging.warning('No __main__ found in component code. Grid wrapper will import functions from component, ' - 'which can lead to unexpected behaviour without using __main__.') - - logging.info('Saved component python script in ' + target_file) - - return target_file - - -def apply_grid_wrapper(file_path, component_process, backend): - assert file_path.endswith('.py') or file_path.endswith('.ipynb'), \ - "Please provide a component file path to a python script or notebook." - - file_path = edit_component_code(file_path, component_process) - - description, interface, inputs, dependencies = get_component_elements(file_path) - - component_elements = dict( - component_path=file_path, - component_description=description, - component_dependencies=dependencies, - component_interface=interface, - component_inputs=inputs, - component_process=component_process - ) - - logging.debug('Wrap component with parameters:') - for component, value in component_elements.items(): - logging.debug(component + ':\n' + str(value) + '\n') - - logging.info('Wrap component') - grid_wrapper_file_path = wrap_component(backend=backend, **component_elements) - return grid_wrapper_file_path, file_path - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('FILE_PATH', type=str, - help='Path to python script or notebook') - parser.add_argument('ADDITIONAL_FILES', type=str, nargs='*', - help='List of paths to additional files to include in the container image') - parser.add_argument('-p', '--component_process', type=str, default='grid_process', - help='Name of the component sub process that is executed for each batch.') - parser.add_argument('-b', '--backend', type=str, default='local', - help='Define backend. Default: local. 
Others: cos, s3kv, legacy_cos (with automatic file download/upload)') - parser.add_argument('-r', '--repository', type=str, default=None, - help='Container registry address, e.g. docker.io/') - parser.add_argument('-v', '--version', type=str, default=None, - help='Container image version. Auto-increases the version number if not provided (default 0.1)') - parser.add_argument('--rename', type=str, nargs='?', default=None, const='', - help='Rename existing yaml files (argument without value leads to modified_{file name})') - parser.add_argument('--overwrite', action='store_true', help='Overwrite existing yaml files') - parser.add_argument('-l', '--log_level', type=str, default='INFO') - parser.add_argument('--dockerfile_template_path', type=str, default='', - help='Path to custom dockerfile template') - parser.add_argument('--dockerfile', type=str, default='Dockerfile.generated', - help='Name or path of the generated dockerfile.') - parser.add_argument('--local_mode', action='store_true', - help='Continue processing after docker errors.') - parser.add_argument('--no-cache', action='store_true', help='Not using cache for docker build.') - parser.add_argument('--skip-logging', action='store_true', - help='Exclude logging code from component setup code') - parser.add_argument('--keep-generated-files', action='store_true', - help='Do not delete temporary generated files.') - parser.add_argument('--platform', type=str, default='linux/amd64', - help='Select image platform, default is linux/amd64. Alternativly, select linux/arm64".') - parser.add_argument('--image_version', type=str, default='python3.12', - help='Select python or R version (defaults to python3.12).') - - args = parser.parse_args() - - # Init logging - root = logging.getLogger() - root.setLevel(args.log_level) - handler = logging.StreamHandler(sys.stdout) - formatter = logging.Formatter('%(levelname)s - %(message)s') - handler.setFormatter(formatter) - handler.setLevel(args.log_level) - root.addHandler(handler) - - grid_wrapper_file_path = component_path = '' - try: - grid_wrapper_file_path, component_path = apply_grid_wrapper( - file_path=args.FILE_PATH, - component_process=args.component_process, - backend=args.backend, - ) - - logging.info('Generate CLAIMED operator for grid wrapper') - - # Add component path and init file path to additional_files - args.ADDITIONAL_FILES.append(component_path) - - # Update dockerfile template if specified - if args.dockerfile_template_path != '': - logging.info(f'Uses custom dockerfile template from {args.dockerfile_template_path}') - with open(args.dockerfile_template_path, 'r') as f: - custom_dockerfile_template = Template(f.read()) - else: - custom_dockerfile_template = None - - create_operator( - file_path=grid_wrapper_file_path, - repository=args.repository, - version=args.version, - custom_dockerfile_template=custom_dockerfile_template, - additional_files=args.ADDITIONAL_FILES, - log_level=args.log_level, - local_mode=args.local_mode, - no_cache=args.no_cache, - overwrite_files=args.overwrite, - rename_files=args.rename, - skip_logging=args.skip_logging, - keep_generated_files=args.keep_generated_files, - platform=args.platform, - dockerfile=args.dockerfile, - image_version=args.image_version, - ) - except Exception as err: - logging.error('Error while generating CLAIMED grid wrapper. 
' - 'Consider using `--log_level DEBUG` and `--keep-generated-files` for debugging.') - raise err - finally: - if not args.keep_generated_files: - logging.info('Remove local component file and grid wrapper code.') - if os.path.isfile(grid_wrapper_file_path): - os.remove(grid_wrapper_file_path) - if os.path.isfile(component_path): - os.remove(component_path) - - -if __name__ == '__main__': - main() diff --git a/build/lib/c3/create_operator.py b/build/lib/c3/create_operator.py deleted file mode 100644 index 0e2bb738..00000000 --- a/build/lib/c3/create_operator.py +++ /dev/null @@ -1,525 +0,0 @@ - -import os -import sys -import logging -import shutil -import argparse -import subprocess -import glob -import re -import json -from pathlib import Path -from string import Template -from typing import Optional -from c3.pythonscript import Pythonscript -from c3.notebook import Notebook -from c3.rscript import Rscript -from c3.utils import convert_notebook, get_image_version -from c3.templates import (python_component_setup_code, component_setup_code_wo_logging, r_component_setup_code, - python_dockerfile_template, r_dockerfile_template, - kfp_component_template, kubernetes_job_template, cwl_component_template) - -CLAIMED_VERSION = 'V0.1' - - -def create_dockerfile(dockerfile_template, dockerfile, requirements, target_code, target_dir, additional_files, - working_dir, command, image_version): - # Check for requirements file - for i in range(len(requirements)): - if '-r ' in requirements[i]: - r_file_search = re.search('-r ~?\/?([^\s]*\.txt)', requirements[i]) - if len(r_file_search.groups()): - # Get file from regex - requirements_file = r_file_search.groups()[0] - if requirements_file not in additional_files and os.path.isfile(requirements_file): - # Add missing requirements text file to additional files - additional_files.append(r_file_search.groups()[0]) - if '/' not in requirements[i]: - # Add missing home directory to the command `pip install -r ~/requirements.txt` - requirements[i] = requirements[i].replace('-r ', '-r ~/') - - requirements_docker = list(map(lambda s: 'RUN ' + s, requirements)) - requirements_docker = '\n'.join(requirements_docker) - additional_files_docker = list(map(lambda s: f"ADD {s} {working_dir}{s}", additional_files)) - additional_files_docker = '\n'.join(additional_files_docker) - - # Select base image - if 'python' in command: - base_image = f"registry.access.redhat.com/ubi8/python-{image_version.strip('python').replace('.', '')}" - elif command == 'Rscript': - if 'python' in image_version: - # Using default R version - image_version = 'R4.3.2' - base_image = f"r-base:{image_version.strip('Rr:')}" - else: - raise ValueError(f'Unrecognized command {command}') - logging.info(f'Using base image {base_image}') - - docker_file = dockerfile_template.substitute( - base_image=base_image, - requirements_docker=requirements_docker, - target_code=target_code, - target_dir=target_dir, - additional_files_docker=additional_files_docker, - working_dir=working_dir, - command=os.path.basename(command), - ) - - logging.info('Create Dockerfile') - with open(dockerfile, "w") as text_file: - text_file.write(docker_file) - logging.debug(f'{dockerfile}:\n' + docker_file) - - -def create_kfp_component(name, description, repository, version, command, target_code, target_dir, file_path, inputs, outputs): - - inputs_list = str() - for input, options in inputs.items(): - inputs_list += f'- {{name: {input}, type: {options["type"]}, description: "{options["description"]}"' - if options['default'] is 
not None: - if not options["default"].startswith('"'): - options["default"] = f'"{options["default"]}"' - inputs_list += f', default: {options["default"]}' - inputs_list += '}\n' - - outputs_list = str() - for output, options in outputs.items(): - outputs_list += f'- {{name: {output}, type: String, description: "{options["description"]}"}}\n' - - parameter_list = str() - for index, key in enumerate(list(inputs.keys()) + list(outputs.keys())): - parameter_list += f'{key}="${{{index}}}" ' - - parameter_values = str() - for input_key in inputs.keys(): - parameter_values += f" - {{inputValue: {input_key}}}\n" - for output_key in outputs.keys(): - parameter_values += f" - {{outputPath: {output_key}}}\n" - - yaml = kfp_component_template.substitute( - name=name, - description=description, - repository=repository, - version=version, - inputs=inputs_list, - outputs=outputs_list, - command=os.path.basename(command), - target_dir=target_dir, - target_code=target_code, - parameter_list=parameter_list, - parameter_values=parameter_values, - ) - - logging.debug('KubeFlow component yaml:\n' + yaml) - target_yaml_path = str(Path(file_path).with_suffix('.yaml')) - - logging.info(f'Write KubeFlow component yaml to {target_yaml_path}') - with open(target_yaml_path, "w") as text_file: - text_file.write(yaml) - - -def create_kubernetes_job(name, repository, version, target_code, target_dir, command, working_dir, file_path, inputs): - # get environment entries - env_entries = str() - for key in list(inputs.keys()): - env_entries += f" - name: {key}\n value: value_of_{key}\n" - env_entries = env_entries.rstrip() - - job_yaml = kubernetes_job_template.substitute( - name=name, - repository=repository, - version=version, - target_code=target_code, - target_dir=target_dir, - env_entries=env_entries, - command=command, - working_dir=working_dir, - ) - - logging.debug('Kubernetes job yaml:\n' + job_yaml) - target_job_yaml_path = str(Path(file_path).with_suffix('.job.yaml')) - - logging.info(f'Write kubernetes job yaml to {target_job_yaml_path}') - with open(target_job_yaml_path, "w") as text_file: - text_file.write(job_yaml) - - -def create_cwl_component(name, repository, version, file_path, inputs, outputs): - type_dict = {'String': 'string', 'Integer': 'int', 'Float': 'float', 'Boolean': 'bool'} - # get environment entries - i = 1 - input_envs = str() - for input, options in inputs.items(): - i += 1 - # Convert string default value to CWL types - default_value = options['default'] if options['type'] == 'String' and options['default'] != '"None"' \ - else options['default'].strip('"\'') - input_envs += (f" {input}:\n type: {type_dict[options['type']]}\n default: {default_value}\n " - f"inputBinding:\n position: {i}\n prefix: --{input}\n") - - if len(outputs) == 0: - output_envs = '[]' - else: - output_envs = '\n' - for output, options in outputs.items(): - i += 1 - output_envs += (f" {output}:\n type: string\n " - f"inputBinding:\n position: {i}\n prefix: --{output}\n") - - cwl = cwl_component_template.substitute( - name=name, - repository=repository, - version=version, - inputs=input_envs, - outputs=output_envs, - ) - - logging.debug('CWL component:\n' + cwl) - target_cwl_path = str(Path(file_path).with_suffix('.cwl')) - - logging.info(f'Write cwl component to {target_cwl_path}') - with open(target_cwl_path, "w") as text_file: - text_file.write(cwl) - - -def check_existing_files(file_path, rename_files, overwrite_files): - if rename_files is None and overwrite_files: - # Overwrite potential files - return - - 
target_job_yaml_path = Path(file_path).with_suffix('.job.yaml') - - # Check for existing job yaml - if target_job_yaml_path.is_file(): - if rename_files is None: - # Ask user - rename_files = input(f'\nFound a existing Kubernetes job file at {target_job_yaml_path}.\n' - f'ENTER to overwrite the file, write Y to rename the file to ' - f'modified_{target_job_yaml_path.name}, or provide a custom name:\n') - if rename_files.strip() == '': - # Overwrite file - return - elif rename_files.lower() == 'y': - # Default file name - new_file_name = 'modified_' + Path(file_path).name - else: - # Rename to custom name - new_file_name = rename_files - - modified_path = (target_job_yaml_path.parent / new_file_name).with_suffix('.job.yaml') - # Check if modified path exists and potentially overwrite - if modified_path.exists(): - if overwrite_files: - logging.info(f'Overwriting modified path {modified_path}.') - else: - overwrite = input(f'Modified path {modified_path} already exists. ENTER to overwrite the file.') - if overwrite != '': - logging.error(f'Abort creating operator. Please rename file manually and rerun the script.') - raise FileExistsError - - os.rename(str(target_job_yaml_path), str(modified_path)) - logging.info(f'Renamed Kubernetes job file to {modified_path}') - # TODO: Should we check other files too? Currently assuming no modification for yaml and cwl. - - -def print_claimed_command(name, repository, version, inputs): - claimed_command = f"claimed --component {repository}/claimed-{name}:{version}" - for input, options in inputs.items(): - claimed_command += f" --{input} {options['default']}" - logging.info(f'Run operators locally with claimed-cli:\n{claimed_command}') - - -def remove_temporary_files(file_path, target_code): - logging.info(f'Remove local files') - # remove temporary files - if file_path != target_code: - os.remove(target_code) - if os.path.isfile('Dockerfile'): - os.remove('Dockerfile') - - -def create_operator(file_path: str, - repository: str, - version: str, - custom_dockerfile_template: Optional[Template], - additional_files: str = None, - log_level='INFO', - local_mode=False, - no_cache=False, - rename_files=None, - overwrite_files=False, - skip_logging=False, - keep_generated_files=False, - platform='linux/amd64', - dockerfile='Dockerfile.generated', - image_version='python3.12', - ): - logging.info('Parameters: ') - logging.info('file_path: ' + file_path) - logging.info('repository: ' + str(repository)) - logging.info('version: ' + str(version)) - logging.info('additional_files: ' + '; '.join(additional_files)) - - if file_path.endswith('.py'): - # use temp file for processing - target_code = 'claimed_' + os.path.basename(file_path) - # Copy file to current working directory - shutil.copy(file_path, target_code) - # Add code for logging and cli parameters to the beginning of the script - with open(target_code, 'r') as f: - script = f.read() - if skip_logging: - script = component_setup_code_wo_logging + script - else: - script = python_component_setup_code + script - with open(target_code, 'w') as f: - f.write(script) - # getting parameter from the script - script_data = Pythonscript(target_code) - dockerfile_template = custom_dockerfile_template or python_dockerfile_template - command = '/opt/app-root/bin/python' - working_dir = '/opt/app-root/src/' - - elif file_path.endswith('.ipynb'): - # use temp file for processing - target_code = 'claimed_' + os.path.basename(file_path) - # Copy file to current working directory - shutil.copy(file_path, target_code) - with 
open(target_code, 'r') as json_file: - notebook = json.load(json_file) - # Add code for logging and cli parameters to the beginning of the notebook - notebook['cells'].insert(0, { - 'cell_type': 'code', 'execution_count': None, 'metadata': {}, 'outputs': [], - 'source': component_setup_code_wo_logging if skip_logging else python_component_setup_code}) - with open(target_code, 'w') as json_file: - json.dump(notebook, json_file) - # getting parameter from the script - script_data = Notebook(target_code) - dockerfile_template = custom_dockerfile_template or python_dockerfile_template - command = '/opt/app-root/bin/ipython' - working_dir = '/opt/app-root/src/' - - elif file_path.lower().endswith('.r'): - # use temp file for processing - target_code = 'claimed_' + os.path.basename(file_path) - # Copy file to current working directory - shutil.copy(file_path, target_code) - # Add code for logging and cli parameters to the beginning of the script - with open(target_code, 'r') as f: - script = f.read() - script = r_component_setup_code + script - with open(target_code, 'w') as f: - f.write(script) - # getting parameter from the script - script_data = Rscript(target_code) - dockerfile_template = custom_dockerfile_template or r_dockerfile_template - command = 'Rscript' - working_dir = '/home/docker/' - else: - raise NotImplementedError('Please provide a file_path to a jupyter notebook, python script, or R script.') - - name = script_data.get_name() - # convert description into a string with a single line - description = ('"' + script_data.get_description().replace('\n', ' ').replace('"', '\'') + - ' – CLAIMED ' + CLAIMED_VERSION + '"') - inputs = script_data.get_inputs() - outputs = script_data.get_outputs() - requirements = script_data.get_requirements() - # Strip 'claimed-' from name of copied temp file - if name.startswith('claimed-'): - name = name[8:] - target_dir = os.path.dirname(file_path) - # Check that the main file is within the cwd - if '../' in target_dir: - raise PermissionError(f"Forbidden path outside the docker build context: {target_dir}. " - f"Change the current working directory to include the file.") - elif target_dir != '': - target_dir += '/' - - logging.info('Operator name: ' + name) - logging.info('Description: ' + description) - logging.info('Inputs:\n' + ('\n'.join([f'{k}: {v}' for k, v in inputs.items()]))) - logging.info('Outputs:\n' + ('\n'.join([f'{k}: {v}' for k, v in outputs.items()]))) - logging.info('Requirements: ' + '; '.join(requirements)) - logging.debug(f'Target code: {target_code}') - logging.debug(f'Target directory: {target_dir}') - - # Load all additional files - logging.debug('Looking for additional files:') - additional_files_found = [] - for file_pattern in additional_files: - if '../' in file_pattern: - # Check that additional file are within the cwd - raise PermissionError(f"Forbidden path outside the docker build context: {file_pattern}. " - f"Change the current working directory to include all additional files.") - # Include files based on wildcards - files_found = glob.glob(file_pattern) - if len(files_found) == 0: - raise FileNotFoundError(f'No additional files for path {file_pattern}.') - additional_files_found.extend(files_found) - logging.debug(f'Searched for "{file_pattern}". 
Found {", ".join(files_found)}') - logging.info(f'Found {len(additional_files_found)} additional files and directories\n' - f'{", ".join(additional_files_found)}') - - create_dockerfile(dockerfile_template, dockerfile, requirements, target_code, target_dir, additional_files_found, - working_dir, command, image_version) - - if version is None: - # auto increase version based on registered images - version = get_image_version(repository, name) - - if repository is None: - if not local_mode: - logging.warning('No repository provided. The container image is only saved locally. Add `-r ` ' - 'to push the image to a container registry or run `--local_mode` to suppress this warning.') - local_mode = True - repository = 'local' - - if subprocess.run('docker buildx', shell=True, stdout=subprocess.PIPE).returncode == 0: - # Using docker buildx - logging.debug('Using docker buildx') - build_command = f'docker buildx build -f {dockerfile}' - else: - logging.debug('Using docker build. Consider installing docker-buildx.') - build_command = f'docker build -f {dockerfile}' - - logging.info(f'Building container image claimed-{name}:{version}') - try: - # Run docker build - subprocess.run( - f"{build_command} --platform {platform} -t claimed-{name}:{version} . {'--no-cache' if no_cache else ''}", - stdout=None if log_level == 'DEBUG' else subprocess.PIPE, check=True, shell=True - ) - if repository is not None: - # Run docker tag - logging.debug(f'Tagging images with "latest" and "{version}"') - subprocess.run( - f"docker tag claimed-{name}:{version} {repository}/claimed-{name}:{version}", - stdout=None if log_level == 'DEBUG' else subprocess.PIPE, check=True, shell=True, - ) - subprocess.run( - f"docker tag claimed-{name}:{version} {repository}/claimed-{name}:latest", - stdout=None if log_level == 'DEBUG' else subprocess.PIPE, check=True, shell=True, - ) - except Exception as err: - logging.error('Docker build failed. Consider running C3 with `--log_level DEBUG` to see the docker build logs.') - if not keep_generated_files: - remove_temporary_files(file_path, target_code) - raise err - logging.info(f'Successfully built image claimed-{name}:{version}') - - if local_mode: - logging.info(f'No repository provided, skip docker push.') - else: - logging.info(f'Pushing images to registry {repository}') - try: - # Run docker push - subprocess.run( - f"docker push {repository}/claimed-{name}:latest", - stdout=None if log_level == 'DEBUG' else subprocess.PIPE, check=True, shell=True, - ) - subprocess.run( - f"docker push {repository}/claimed-{name}:{version}", - stdout=None if log_level == 'DEBUG' else subprocess.PIPE, check=True, shell=True, - ) - logging.info('Successfully pushed image to registry') - except Exception as err: - logging.error(f'Could not push images to namespace {repository}. 
' - f'Please check if docker is logged in or select a namespace with access.') - if not keep_generated_files: - remove_temporary_files(file_path, target_code) - raise err - - # Check for existing files and optionally modify them before overwriting - try: - check_existing_files(file_path, rename_files, overwrite_files) - except Exception as err: - if not keep_generated_files: - remove_temporary_files(file_path, target_code) - raise err - - # Create application scripts - create_kfp_component(name, description, repository, version, command, target_code, target_dir, file_path, inputs, - outputs) - - create_kubernetes_job(name, repository, version, target_code, target_dir, command, working_dir, file_path, inputs) - - create_cwl_component(name, repository, version, file_path, inputs, outputs) - - print_claimed_command(name, repository, version, inputs) - - # Remove temp files - if not keep_generated_files: - remove_temporary_files(file_path, target_code) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('FILE_PATH', type=str, - help='Path to python script or notebook') - parser.add_argument('ADDITIONAL_FILES', type=str, nargs='*', - help='Paths to additional files to include in the container image') - parser.add_argument('-r', '--repository', type=str, default=None, - help='Container registry address, e.g. docker.io/') - parser.add_argument('-v', '--version', type=str, default=None, - help='Container image version. Auto-increases the version number if not provided (default 0.1)') - parser.add_argument('--rename', type=str, nargs='?', default=None, const='', - help='Rename existing yaml files (argument without value leads to modified_{file name})') - parser.add_argument('--overwrite', action='store_true', help='Overwrite existing yaml files') - parser.add_argument('-l', '--log_level', type=str, default='INFO') - parser.add_argument('--dockerfile_template_path', type=str, default='', - help='Path to custom dockerfile template') - parser.add_argument('--dockerfile', type=str, default='Dockerfile.generated', - help='Name or path of the generated dockerfile.') - parser.add_argument('--local_mode', action='store_true', - help='Continue processing after docker errors.') - parser.add_argument('--no-cache', action='store_true', help='Not using cache for docker build.') - parser.add_argument('--skip-logging', action='store_true', - help='Exclude logging code from component setup code') - parser.add_argument('--keep-generated-files', action='store_true', - help='Do not delete temporary generated files.') - parser.add_argument('--platform', type=str, default='linux/amd64', - help='Select image platform, default is linux/amd64. 
Alternativly, select linux/arm64".') - parser.add_argument('--image_version', type=str, default='python3.12', - help='Select python or R version (defaults to python3.12).') - - args = parser.parse_args() - - # Init logging - root = logging.getLogger() - root.setLevel(args.log_level) - handler = logging.StreamHandler(sys.stdout) - formatter = logging.Formatter('%(levelname)s - %(message)s') - handler.setFormatter(formatter) - handler.setLevel(args.log_level) - root.addHandler(handler) - - # Update dockerfile template if specified - if args.dockerfile_template_path != '': - logging.info(f'Uses custom dockerfile template from {args.dockerfile_template_path}') - with open(args.dockerfile_template_path, 'r') as f: - custom_dockerfile_template = Template(f.read()) - else: - custom_dockerfile_template = None - - create_operator( - file_path=args.FILE_PATH, - repository=args.repository, - version=args.version, - custom_dockerfile_template=custom_dockerfile_template, - additional_files=args.ADDITIONAL_FILES, - log_level=args.log_level, - local_mode=args.local_mode, - no_cache=args.no_cache, - overwrite_files=args.overwrite, - rename_files=args.rename, - skip_logging=args.skip_logging, - keep_generated_files=args.keep_generated_files, - platform=args.platform, - dockerfile=args.dockerfile, - image_version=args.image_version, - ) - - -if __name__ == '__main__': - main() diff --git a/build/lib/c3/notebook.py b/build/lib/c3/notebook.py deleted file mode 100644 index 1a5a25bb..00000000 --- a/build/lib/c3/notebook.py +++ /dev/null @@ -1,98 +0,0 @@ -import json -import re -import os -import logging -from c3.parser import ContentParser, NotebookReader - - -class Notebook(): - def __init__(self, path): - self.path = path - with open(path) as json_file: - self.notebook = json.load(json_file) - - self.name = os.path.basename(path)[:-6].replace('_', '-').lower() - - if self.notebook['cells'][1]['cell_type'] == self.notebook['cells'][2]['cell_type'] == 'markdown': - # backwards compatibility (v0.1 description was included in second cell, merge first two markdown cells) - logging.info('Merge first two markdown cells for description. ' - 'The file name is used as the operator name, not the first markdown cell.') - self.description = self.notebook['cells'][1]['source'][0] + '\n' + self.notebook['cells'][2]['source'][0] - else: - # Using second cell because first cell was added for setup code - self.description = self.notebook['cells'][1]['source'][0] - - self.inputs = self._get_input_vars() - self.outputs = self._get_output_vars() - - def _get_input_vars(self): - cp = ContentParser() - env_names = cp.parse(self.path)['inputs'] - return_value = dict() - notebook_code_lines = list(NotebookReader(self.path).read_next_code_line()) - for env_name, default in env_names.items(): - comment_line = str() - for line in notebook_code_lines: - if re.search("[\"']" + env_name + "[\"']", line): - if not comment_line.strip().startswith('#'): - # previous line was no description, reset comment_line. 
- comment_line = '' - if comment_line == '': - logging.debug(f'Interface: No description for variable {env_name} provided.') - if re.search(r'=\s*int\(\s*os', line): - type = 'Integer' - elif re.search(r'=\s*float\(\s*os', line): - type = 'Float' - elif re.search(r'=\s*bool\(\s*os', line): - type = 'Boolean' - else: - type = 'String' - return_value[env_name] = { - 'description': comment_line.replace('#', '').replace("\"", "\'").strip(), - 'type': type, - 'default': default - } - break - comment_line = line - return return_value - - def _get_output_vars(self): - cp = ContentParser() - output_names = cp.parse(self.path)['outputs'] - # TODO: Does not check for description code - return_value = {name: { - 'description': f'Output path for {name}', - 'type': 'String', - } for name in output_names} - return return_value - - def get_requirements(self): - requirements = [] - notebook_code_lines = list(NotebookReader(self.path).read_next_code_line()) - # Add dnf install - for line in notebook_code_lines: - if re.search(r'[\s#]*dnf\s*.[^#]*', line): - if '-y' not in line: - # Adding default repo - line += ' -y' - requirements.append(line.replace('#', '').strip()) - - # Add pip install - pattern = r"^[# !]*(pip[ ]*install)[ ]*(.[^#]*)" - for line in notebook_code_lines: - result = re.findall(pattern, line) - if len(result) == 1: - requirements.append((result[0][0] + ' ' + result[0][1].strip())) - return requirements - - def get_name(self): - return self.name - - def get_description(self): - return self.description - - def get_inputs(self): - return self.inputs - - def get_outputs(self): - return self.outputs diff --git a/build/lib/c3/operator_utils.py b/build/lib/c3/operator_utils.py deleted file mode 100644 index 5f524872..00000000 --- a/build/lib/c3/operator_utils.py +++ /dev/null @@ -1,43 +0,0 @@ -import contextlib -import logging -import os - -# converts string in form [cos|s3]://access_key_id:secret_access_key@endpoint/bucket/path to -# access_key_id, secret_access_key, endpoint, path - path includes bucket name -def explode_connection_string(cs): - if cs is None: - return None - if cs.startswith('cos') or cs.startswith('s3'): - buffer=cs.split('://')[1] - access_key_id=buffer.split('@')[0].split(':')[0] - secret_access_key=buffer.split('@')[0].split(':')[1] - endpoint=f"https://{buffer.split('@')[1].split('/')[0]}" - path='/'.join(buffer.split('@')[1].split('/')[1:]) - return (access_key_id, secret_access_key, endpoint, path) - else: - return (None, None, None, cs) - # TODO consider cs as secret and grab connection string from kubernetes - - -def run_and_log(cos_conn, log_folder, task_id, command_array): - log_root_name = time.time() - job_id = ('-').join(command_array).replace('/','-') # TODO get a unique job id - job_id = re.sub(r'[^a-zA-Z0-9]', '-', job_id) - task_id = re.sub(r'[^a-zA-Z0-9]', '-', task_id) - std_out_log_name = f'{job_id}-{task_id}-{log_root_name}-stdout.log' - std_err_log_name = f'{job_id}-{task_id}-{log_root_name}-stderr.log' - with open(std_out_log_name,'w') as so: - with open(std_err_log_name,'w') as se: - with contextlib.redirect_stdout(so): - with contextlib.redirect_stderr(se): - logging.info('-----INVOKING TASK-----------------------------------') - logging.info(f'Task ID: {task_id}') - logging.info(f'Command: {command_array}') - result = subprocess.run(command_array, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=os.environ.copy()) - output = result.stdout.decode('utf-8') - logging.info("Output:", output) - logging.info("Return code:", result.returncode) - 
cos_conn.put(std_out_log_name,os.path.join(log_folder,std_out_log_name)) - cos_conn.put(std_err_log_name,os.path.join(log_folder,std_err_log_name)) - os.remove(std_out_log_name) - os.remove(std_err_log_name) \ No newline at end of file diff --git a/build/lib/c3/parser.py b/build/lib/c3/parser.py deleted file mode 100644 index 1be4307d..00000000 --- a/build/lib/c3/parser.py +++ /dev/null @@ -1,211 +0,0 @@ -# -# Copyright 2018-2021 Elyra Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os -import re - -from traitlets.config import LoggingConfigurable - -from typing import TypeVar, List, Dict - -# Setup forward reference for type hint on return from class factory method. See -# https://stackoverflow.com/questions/39205527/can-you-annotate-return-type-when-value-is-instance-of-cls/39205612#39205612 -F = TypeVar('F', bound='FileReader') - - -class FileReader(LoggingConfigurable): - """ - Base class for parsing a file for resources according to operation type. Subclasses set - their own parser member variable according to their implementation language. - """ - - def __init__(self, filepath: str): - self._filepath = filepath - - @property - def filepath(self): - return self._filepath - - @property - def language(self) -> str: - file_extension = os.path.splitext(self._filepath)[-1].lower() - if file_extension == '.py': - return 'python' - elif file_extension == '.r': - return 'r' - else: - return None - - def read_next_code_line(self) -> List[str]: - """ - Implements a generator for lines of code in the specified filepath. Subclasses - may override if explicit line-by-line parsing is not feasible, e.g. with Notebooks. - """ - with open(self._filepath) as f: - for line in f: - yield line.strip() - - -class NotebookReader(FileReader): - def __init__(self, filepath: str): - super().__init__(filepath) - import nbformat - - with open(self._filepath) as f: - self._notebook = nbformat.read(f, as_version=4) - self._language = None - - try: - self._language = self._notebook['metadata']['language_info']['name'].lower() - - except KeyError: - self.log.warning(f'No language metadata found in {self._filepath}') - pass - - @property - def language(self) -> str: - return self._language - - def read_next_code_line(self) -> List[str]: - for cell in self._notebook.cells: - if cell.source and cell.cell_type == "code": - for line in cell.source.split('\n'): - yield line - - -class ScriptParser(): - """ - Base class for parsing individual lines of code. Subclasses implement a search_expressions() - function that returns language-specific regexes to match against code lines. 
- """ - - _comment_char = "#" - - def _get_line_without_comments(self, line): - if self._comment_char in line: - index = line.find(self._comment_char) - line = line[:index] - return line.strip() - - def parse_environment_variables(self, line): - # Parse a line fed from file and match each regex in regex dictionary - line = self._get_line_without_comments(line) - if not line: - return [] - - matches = [] - for key, value in self.search_expressions().items(): - for pattern in value: - regex = re.compile(pattern) - for match in regex.finditer(line): - matches.append((key, match)) - return matches - - -class PythonScriptParser(ScriptParser): - def search_expressions(self) -> Dict[str, List]: - # First regex matches envvar assignments that use os.getenv("name", "value") with ow w/o default provided - # Second regex matches envvar assignments that use os.environ.get("name", "value") with or w/o default provided - # Both name and value are captured if possible - inputs = [r"os\.getenv\([\"']([a-zA-Z_]+[A-Za-z0-9_]*)[\"']*(?:\s*\,\s*[\"']?(.[^#]*)?[\"']?)?\).*", - r"os\.environ\.get\([\"']([a-zA-Z_]+[A-Za-z0-9_]*)[\"']*(?:\s*\,\s*[\"']?(.[^#]*)?[\"']?)?\).*"] - # regex matches setting envvars assignments that use - outputs = [r"\s*os\.environ\[[\"']([a-zA-Z_]+[A-Za-z0-9_]*)[\"']].*"] - - regex_dict = dict(inputs=inputs, outputs=outputs) - return regex_dict - - -class RScriptParser(ScriptParser): - def search_expressions(self) -> Dict[str, List]: - - - # Tests for matches of the form: var <- Sys.getenv("key", "optional default") - inputs = [r".*Sys\.getenv\([\"']*([a-zA-Z_]+[A-Za-z0-9_]*)[\"']*(?:\s*\,\s*[\"']?(.[^#]*)?[\"']?)?\).*"] - # Tests for matches of the form: var <- Sys.getenv("key", "optional default") - outputs = [r"\s*Sys\.setenv\([\"']*([a-zA-Z_]+[A-Za-z0-9_]*)[\"']*(?:\s*\,\s*[\"']?(.[^#]*)?[\"']?)?\).*"] - - regex_dict = dict(inputs=inputs, outputs=outputs) - return regex_dict - - -class ContentParser(LoggingConfigurable): - parsers = { - 'python': PythonScriptParser(), - 'r': RScriptParser() - } - - def parse(self, filepath: str) -> dict: - """Returns a model dictionary of all the regex matches for each key in the regex dictionary""" - - properties = {"inputs": {}, "outputs": []} - reader = self._get_reader(filepath) - parser = self._get_parser(reader.language) - - if not parser: - return properties - - for line in reader.read_next_code_line(): - matches = parser.parse_environment_variables(line) - for key, match in matches: - if key == "inputs": - default_value = match.group(2) - if default_value: - # The default value match can end with an additional ', ", or ) which is removed - default_value = re.sub(r"['\")]?$", '', default_value, count=1) - properties[key][match.group(1)] = default_value - else: - properties[key].append(match.group(1)) - - return properties - - def _validate_file(self, filepath: str): - """ - Validate file exists and is file (e.g. 
not a directory) - """ - if not os.path.exists(filepath): - raise FileNotFoundError(f'No such file or directory: {filepath}') - if not os.path.isfile(filepath): - raise IsADirectoryError(f'Is a directory: {filepath}') - - def _get_reader(self, filepath: str): - """ - Find the proper reader based on the file extension - """ - file_extension = os.path.splitext(filepath)[-1] - - self._validate_file(filepath) - - if file_extension == '.ipynb': - return NotebookReader(filepath) - elif file_extension.lower() in ['.py', '.r']: - return FileReader(filepath) - else: - raise ValueError(f'File type {file_extension} is not supported.') - - def _get_parser(self, language: str): - """ - Find the proper parser based on content language - """ - parser = None - if language: - parser = self.parsers.get(language) - - if not parser: - self.log.warning(f'Content parser for {language} is not available.') - pass - - return parser diff --git a/build/lib/c3/pythonscript.py b/build/lib/c3/pythonscript.py deleted file mode 100644 index eeed2226..00000000 --- a/build/lib/c3/pythonscript.py +++ /dev/null @@ -1,96 +0,0 @@ - -import logging -import os -import re -from c3.parser import ContentParser - - -class Pythonscript: - def __init__(self, path): - - self.path = path - with open(path, 'r') as f: - self.script = f.read() - - self.name = os.path.basename(path)[:-3].replace('_', '-').lower() - if '"""' not in self.script: - logging.warning('Please provide a description of the operator in the first doc string.') - self.description = self.name - else: - self.description = self.script.split('"""')[1].strip() - self.inputs = self._get_input_vars() - self.outputs = self._get_output_vars() - - def _get_input_vars(self): - cp = ContentParser() - env_names = cp.parse(self.path)['inputs'] - return_value = dict() - for env_name, default in env_names.items(): - comment_line = str() - for line in self.script.split('\n'): - if re.search("[\"']" + env_name + "[\"']", line): - # Check the description for current variable - if not comment_line.strip().startswith('#'): - # previous line was no description, reset comment_line. 
- comment_line = '' - if comment_line == '': - logging.debug(f'Interface: No description for variable {env_name} provided.') - if re.search(r'=\s*int\(\s*os', line): - type = 'Integer' - default = default.strip('\"\'') - elif re.search(r'=\s*float\(\s*os', line): - type = 'Float' - default = default.strip('\"\'') - elif re.search(r'=\s*bool\(\s*os', line): - type = 'Boolean' - default = default.strip('\"\'') - else: - type = 'String' - return_value[env_name] = { - 'description': comment_line.replace('#', '').replace("\"", "\'").strip(), - 'type': type, - 'default': default - } - break - comment_line = line - return return_value - - def _get_output_vars(self): - cp = ContentParser() - output_names = cp.parse(self.path)['outputs'] - # TODO: Does not check for description code - return_value = {name: { - 'description': f'Output path for {name}', - 'type': 'String', - } for name in output_names} - return return_value - - def get_requirements(self): - requirements = [] - # Add dnf install - for line in self.script.split('\n'): - if re.search(r'[\s#]*dnf\s*.[^#]*', line): - if '-y' not in line: - # Adding default repo - line += ' -y' - requirements.append(line.replace('#', '').strip()) - - # Add pip install - pattern = r"^[# !]*(pip[ ]*install)[ ]*(.[^#]*)" - for line in self.script.split('\n'): - result = re.findall(pattern, line) - if len(result) == 1: - requirements.append((result[0][0] + ' ' + result[0][1].strip())) - return requirements - - def get_name(self): - return self.name - - def get_description(self): - return self.description - - def get_inputs(self): - return self.inputs - - def get_outputs(self): - return self.outputs diff --git a/build/lib/c3/rscript.py b/build/lib/c3/rscript.py deleted file mode 100644 index 9e6cc93e..00000000 --- a/build/lib/c3/rscript.py +++ /dev/null @@ -1,88 +0,0 @@ - -import logging -import os -import re -from c3.parser import ContentParser - - -class Rscript: - def __init__(self, path): - - self.path = path - with open(path, 'r') as f: - self.script = f.read() - - self.name = os.path.basename(path)[:-2].replace('_', '-').lower() - # TODO: Currently does not support a description - self.description = self.name - self.inputs = self._get_input_vars() - self.outputs = self._get_output_vars() - - def _get_input_vars(self): - cp = ContentParser() - env_names = cp.parse(self.path)['inputs'] - return_value = dict() - for env_name, default in env_names.items(): - comment_line = str() - for line in self.script.split('\n'): - if re.search("[\"']" + env_name + "[\"']", line): - # Check the description for current variable - if not comment_line.strip().startswith('#'): - # previous line was no description, reset comment_line. 
- comment_line = '' - if comment_line == '': - logging.debug(f'Interface: No description for variable {env_name} provided.') - if re.search(r'=\s*as.numeric\(\s*os', line): - type = 'Float' # double in R - elif re.search(r'=\s*bool\(\s*os', line): - type = 'Boolean' # logical in R - else: - type = 'String' # character in R - - return_value[env_name] = { - 'description': comment_line.replace('#', '').replace("\"", "\'").strip(), - 'type': type, - 'default': default - } - break - comment_line = line - return return_value - - def _get_output_vars(self): - cp = ContentParser() - output_names = cp.parse(self.path)['outputs'] - # TODO: Does not check for description - return_value = {name: {'description': 'output path'} for name in output_names} - return return_value - - def get_requirements(self): - requirements = [] - # Add apt install commands - for line in self.script.split('\n'): - if re.search(r'[\s#]*apt\s*[A-Za-z0-9_-]*', line): - if '-y' not in line: - # Adding default repo - line += ' -y' - requirements.append(line.replace('#', '').strip()) - - # Add Rscript install.packages commands - for line in self.script.split('\n'): - if re.search(r'[\s#]*install\.packages\(.*\)', line): - if 'http://' not in line: - # Adding default repo - line = line.rstrip(') ') + ", repos='http://cran.us.r-project.org')" - command = f"Rscript -e \"{line.replace('#', '').strip()}\"" - requirements.append(command) - return requirements - - def get_name(self): - return self.name - - def get_description(self): - return self.description - - def get_inputs(self): - return self.inputs - - def get_outputs(self): - return self.outputs diff --git a/build/lib/c3/templates/R_dockerfile_template b/build/lib/c3/templates/R_dockerfile_template deleted file mode 100644 index e60449e5..00000000 --- a/build/lib/c3/templates/R_dockerfile_template +++ /dev/null @@ -1,11 +0,0 @@ -FROM ${base_image} -USER root -RUN apt update -${requirements_docker} -ADD ${target_code} ${working_dir}${target_dir} -${additional_files_docker} -RUN chmod -R 777 ${working_dir} -RUN chmod -R 777 /usr/local/lib/R/ -USER docker -WORKDIR "${working_dir}" -CMD ["${command}", "${target_dir}${target_code}"] \ No newline at end of file diff --git a/build/lib/c3/templates/__init__.py b/build/lib/c3/templates/__init__.py deleted file mode 100644 index 94a3b13f..00000000 --- a/build/lib/c3/templates/__init__.py +++ /dev/null @@ -1,66 +0,0 @@ - -import os -from string import Template -from pathlib import Path - -# template file names -PYTHON_COMPONENT_SETUP_CODE = 'component_setup_code.py' -R_COMPONENT_SETUP_CODE = 'component_setup_code.R' -PYTHON_COMPONENT_SETUP_CODE_WO_LOGGING = 'component_setup_code_wo_logging.py' -PYTHON_DOCKERFILE_FILE = 'python_dockerfile_template' -R_DOCKERFILE_FILE = 'R_dockerfile_template' -KFP_COMPONENT_FILE = 'kfp_component_template.yaml' -KUBERNETES_JOB_FILE = 'kubernetes_job_template.job.yaml' -CWL_COMPONENT_FILE = 'cwl_component_template.cwl' -GRID_WRAPPER_FILE = 'grid_wrapper_template.py' -COS_GRID_WRAPPER_FILE = 'cos_grid_wrapper_template.py' -LEGACY_COS_GRID_WRAPPER_FILE = 'legacy_cos_grid_wrapper_template.py' -S3KV_GRID_WRAPPER_FILE = 's3kv_grid_wrapper_template.py' -SIMPLE_GRID_WRAPPER_FILE = 'simple_grid_wrapper_template.py' -FOLDER_GRID_WRAPPER_FILE = 'folder_grid_wrapper_template.py' - -# load templates -template_path = Path(os.path.dirname(__file__)) - -with open(template_path / PYTHON_COMPONENT_SETUP_CODE, 'r') as f: - python_component_setup_code = f.read() - -with open(template_path / R_COMPONENT_SETUP_CODE, 'r') as f: - 
r_component_setup_code = f.read() - -with open(template_path / PYTHON_COMPONENT_SETUP_CODE_WO_LOGGING, 'r') as f: - component_setup_code_wo_logging = f.read() - -with open(template_path / PYTHON_DOCKERFILE_FILE, 'r') as f: - python_dockerfile_template = Template(f.read()) - -with open(template_path / R_DOCKERFILE_FILE, 'r') as f: - r_dockerfile_template = Template(f.read()) - -with open(template_path / KFP_COMPONENT_FILE, 'r') as f: - kfp_component_template = Template(f.read()) - -with open(template_path / KUBERNETES_JOB_FILE, 'r') as f: - kubernetes_job_template = Template(f.read()) - -with open(template_path / CWL_COMPONENT_FILE, 'r') as f: - cwl_component_template = Template(f.read()) - -with open(template_path / GRID_WRAPPER_FILE, 'r') as f: - grid_wrapper_template = Template(f.read()) - -with open(template_path / COS_GRID_WRAPPER_FILE, 'r') as f: - cos_grid_wrapper_template = Template(f.read()) - -with open(template_path / LEGACY_COS_GRID_WRAPPER_FILE, 'r') as f: - legacy_cos_grid_wrapper_template = Template(f.read()) - -with open(template_path / S3KV_GRID_WRAPPER_FILE, 'r') as f: - s3kv_grid_wrapper_template = Template(f.read()) - -with open(template_path / SIMPLE_GRID_WRAPPER_FILE, 'r') as f: - simple_grid_wrapper_template = Template(f.read()) - -with open(template_path / FOLDER_GRID_WRAPPER_FILE, 'r') as f: - folder_grid_wrapper_template = Template(f.read()) - \ No newline at end of file diff --git a/build/lib/c3/templates/component_setup_code.R b/build/lib/c3/templates/component_setup_code.R deleted file mode 100644 index daa3f847..00000000 --- a/build/lib/c3/templates/component_setup_code.R +++ /dev/null @@ -1,14 +0,0 @@ - -args = commandArgs(trailingOnly=TRUE) - -for (parameter in args) { - key_value <- unlist(strsplit(parameter, split="=")) - if (length(key_value) == 2) { - print(parameter) - key <- key_value[1] - value <- key_value[2] - eval(parse(text=paste0('Sys.setenv(',key,'="',value,'")'))) - } else { - print(paste('Could not find key value pair for argument ', parameter)) - } -} diff --git a/build/lib/c3/templates/component_setup_code.py b/build/lib/c3/templates/component_setup_code.py deleted file mode 100644 index 348cae7b..00000000 --- a/build/lib/c3/templates/component_setup_code.py +++ /dev/null @@ -1,35 +0,0 @@ -# default code for each operator -import os -import sys -import re -import logging - -# init logger -root = logging.getLogger() -root.setLevel('INFO') -handler = logging.StreamHandler(sys.stdout) -handler.setLevel('INFO') -formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') -handler.setFormatter(formatter) -root.addHandler(handler) -logging.basicConfig(level=logging.CRITICAL) - -# get parameters from args -parameters = list(filter( - lambda s: s.find('=') > -1 and bool(re.match(r'[A-Za-z0-9_]*=[.\/A-Za-z0-9]*', s)), - sys.argv - )) - -# set parameters to env variables -for parameter in parameters: - variable = parameter.split('=')[0] - value = parameter.split('=', 1)[-1] - logging.info(f'Parameter: {variable} = "{value}"') - os.environ[variable] = value - -# update log level -log_level = os.environ.get('log_level', 'INFO') -if log_level !='INFO': - logging.info(f'Updating log level to {log_level}') - root.setLevel(log_level) - handler.setLevel(log_level) diff --git a/build/lib/c3/templates/component_setup_code_wo_logging.py b/build/lib/c3/templates/component_setup_code_wo_logging.py deleted file mode 100644 index e8b67a7b..00000000 --- a/build/lib/c3/templates/component_setup_code_wo_logging.py +++ /dev/null @@ -1,17 +0,0 
@@ -import os -import re -import sys -import logging - -# get parameters from args -parameters = list(filter( - lambda s: s.find('=') > -1 and bool(re.match(r'[A-Za-z0-9_]*=[.\/A-Za-z0-9]*', s)), - sys.argv - )) - -# set parameters to env variables -for parameter in parameters: - variable = parameter.split('=')[0] - value = parameter.split('=', 1)[-1] - logging.debug(f'Parameter: {variable} = "{value}"') - os.environ[variable] = value diff --git a/build/lib/c3/templates/cos_grid_wrapper_template.py b/build/lib/c3/templates/cos_grid_wrapper_template.py deleted file mode 100644 index 30fa86d9..00000000 --- a/build/lib/c3/templates/cos_grid_wrapper_template.py +++ /dev/null @@ -1,217 +0,0 @@ -""" -${component_name} got wrapped by cos_grid_wrapper, which wraps any CLAIMED component and implements the generic grid computing pattern for cos files https://romeokienzler.medium.com/the-generic-grid-computing-pattern-transforms-any-sequential-workflow-step-into-a-transient-grid-c7f3ca7459c8 - -CLAIMED component description: ${component_description} -""" - -# pip install s3fs pandas -# component dependencies -# ${component_dependencies} - -import os -import json -import random -import logging -import shutil -import time -import glob -import s3fs -from datetime import datetime -from pathlib import Path -import pandas as pd - - -# import component code -from ${component_name} import * - - -def explode_connection_string(cs): - if cs is None: - return None, None, None, None - elif cs.startswith('cos') or cs.startswith('s3'): - buffer=cs.split('://', 1)[1] - access_key_id=buffer.split('@')[0].split(':')[0] - secret_access_key=buffer.split('@')[0].split(':')[1] - endpoint = f"https://{buffer.split('@')[1].split('/')[0]}" - path=buffer.split('@')[1].split('/', 1)[1] - return (access_key_id, secret_access_key, endpoint, path) - else: - return (None, None, None, cs) - # TODO consider cs as secret and grab connection string from kubernetes - - -# File containing batches. Provided as a comma-separated list of strings or keys in a json dict. All batch file names must contain the batch name. 
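For illustration only (hypothetical credentials, endpoint, and bucket), explode_connection_string above splits a COS/S3 connection string into its four parts, so the gw_batch_file read below can be given either as a plain path or as a full connection string:
# Hypothetical example of the accepted connection string format:
#   'cos://ACCESS_KEY:SECRET_KEY@s3.example.com/my-bucket/batches.json'
#   -> ('ACCESS_KEY', 'SECRET_KEY', 'https://s3.example.com', 'my-bucket/batches.json')
# A plain path such as '/data/batches.json' passes through unchanged:
#   -> (None, None, None, '/data/batches.json')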
-gw_batch_file = os.environ.get('gw_batch_file', None) -(gw_batch_file_access_key_id, gw_batch_file_secret_access_key, gw_batch_file_endpoint, gw_batch_file) = explode_connection_string(gw_batch_file) -# Optional column name for a csv batch file (default: 'filename') -gw_batch_file_col_name = os.environ.get('gw_batch_file_col_name', 'filename') -# cos gw_coordinator_connection -gw_coordinator_connection = os.environ.get('gw_coordinator_connection') -(gw_coordinator_access_key_id, gw_coordinator_secret_access_key, gw_coordinator_endpoint, gw_coordinator_path) = explode_connection_string(gw_coordinator_connection) -# timeout in seconds to remove lock file from struggling job (default 3 hours) -gw_lock_timeout = int(os.environ.get('gw_lock_timeout', 10800)) -# ignore error files and rerun batches with errors -gw_ignore_error_files = bool(os.environ.get('gw_ignore_error_files', False)) -# maximal wait time for staggering start -gw_max_time_wait_staggering = int(os.environ.get('gw_max_time_wait_staggering', 60)) - -# coordinator file suffix -suffix_lock = '.lock' -suffix_processed = '.processed' -suffix_error = '.err' - -# component interface -${component_interface} - -# Init s3 -s3coordinator = s3fs.S3FileSystem( - anon=False, - key=gw_coordinator_access_key_id, - secret=gw_coordinator_secret_access_key, - client_kwargs={'endpoint_url': gw_coordinator_endpoint}) -gw_coordinator_path = Path(gw_coordinator_path) - -if gw_batch_file_access_key_id is not None: - s3batch_file = s3fs.S3FileSystem( - anon=False, - key=gw_batch_file_access_key_id, - secret=gw_batch_file_secret_access_key, - client_kwargs={'endpoint_url': gw_batch_file_endpoint}) -else: - logging.debug('Loading batch file from source s3.') - s3batch_file = s3coordinator - - -def load_batches_from_file(batch_file): - if batch_file.endswith('.json'): - # Load batches from keys of a json file - logging.info(f'Loading batches from json file: {batch_file}') - with open(batch_file, 'r') as f: - batch_dict = json.load(f) - batches = batch_dict.keys() - - elif batch_file.endswith('.csv'): - # Load batches from keys of a csv file - logging.info(f'Loading batches from csv file: {batch_file}') - df = pd.read_csv(batch_file, header='infer') - assert gw_batch_file_col_name in df.columns, \ - f'gw_batch_file_col_name {gw_batch_file_col_name} not in columns of batch file {batch_file}' - batches = df[gw_batch_file_col_name].to_list() - - elif batch_file.endswith('.txt'): - # Load batches from comma-separated txt file - logging.info(f'Loading comma-separated batch strings from file: {batch_file}') - with open(batch_file, 'r') as f: - batch_string = f.read() - batches = [b.strip() for b in batch_string.split(',')] - else: - raise ValueError(f'C3 only supports batch files of type ' - f'json (batches = dict keys), ' - f'csv (batches = column values), or ' - f'txt (batches = comma-seperated list).') - - logging.info(f'Loaded {len(batches)} batches') - logging.debug(f'List of batches: {batches}') - assert len(batches) > 0, f"batch_file {batch_file} has no batches." 
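Purely as a hypothetical illustration of the three batch file formats handled above (file names and contents invented):
# batches.json  {"tile_1": {...}, "tile_2": {...}}      -> batches = dict keys: 'tile_1', 'tile_2'
# batches.csv   column named by gw_batch_file_col_name  -> batches = that column's values
# batches.txt   "tile_1, tile_2, tile_3"                -> batches = ['tile_1', 'tile_2', 'tile_3']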
- return batches - - -def perform_process(process, batch): - logging.debug(f'Check coordinator files for batch {batch}.') - # Init coordinator files - lock_file = str(gw_coordinator_path / (batch + suffix_lock)) - processed_file = str(gw_coordinator_path / (batch + suffix_processed)) - error_file = str(gw_coordinator_path / (batch + suffix_error)) - - if s3coordinator.exists(lock_file): - # Remove strugglers - last_modified = s3coordinator.info(lock_file)['LastModified'] - if (datetime.now(last_modified.tzinfo) - last_modified).total_seconds() > gw_lock_timeout: - logging.info(f'Lock file {lock_file} is expired.') - s3coordinator.rm(lock_file) - else: - logging.debug(f'Batch {batch} is locked.') - return - - if s3coordinator.exists(processed_file): - logging.debug(f'Batch {batch} is processed.') - return - - if s3coordinator.exists(error_file): - if gw_ignore_error_files: - logging.info(f'Ignoring previous error in batch {batch} and rerun.') - else: - logging.debug(f'Batch {batch} has error.') - return - - logging.debug(f'Locking batch {batch}.') - s3coordinator.touch(lock_file) - - # processing files with custom process - logging.info(f'Processing batch {batch}.') - try: - target_files = process(batch, ${component_inputs}) - except Exception as err: - logging.exception(err) - # Write error to file - with s3coordinator.open(error_file, 'w') as f: - f.write(f"{type(err).__name__} in batch {batch}: {err}") - s3coordinator.rm(lock_file) - logging.error(f'Continue processing.') - return - - logging.info(f'Finished Batch {batch}.') - s3coordinator.touch(processed_file) - # Remove lock file - if s3coordinator.exists(lock_file): - s3coordinator.rm(lock_file) - else: - logging.warning(f'Lock file {lock_file} was removed by another process. ' - f'Consider increasing gw_lock_timeout to avoid repeated processing (currently {gw_lock_timeout}s).') - - -def process_wrapper(sub_process): - delay = random.randint(0, gw_max_time_wait_staggering) - logging.info(f'Staggering start, waiting for {delay} seconds') - time.sleep(delay) - - # Init coordinator dir - s3coordinator.makedirs(gw_coordinator_path, exist_ok=True) - - # Download batch file - if s3batch_file.exists(gw_batch_file): - s3batch_file.get(gw_batch_file, gw_batch_file) - if not os.path.isfile(gw_batch_file): - # Download batch file from s3 coordinator - cos_gw_batch_file = str(gw_coordinator_path.split([0]) / gw_batch_file) - if s3batch_file.exists(cos_gw_batch_file): - s3batch_file.get(gw_batch_file, gw_batch_file) - else: - raise ValueError("Cannot identify batches. Provide valid gw_batch_file " - "(local path, path within coordinator bucket, or s3 connection to batch file).") - - # Get batches - batches = load_batches_from_file(gw_batch_file) - - # Iterate over all batches - for batch in batches: - perform_process(sub_process, batch) - - # Check and log status of batches - processed_status = sum(s3coordinator.exists(gw_coordinator_path / (batch + suffix_processed)) for batch in batches) - lock_status = sum(s3coordinator.exists(gw_coordinator_path / (batch + suffix_lock)) for batch in batches) - error_status = sum(s3coordinator.exists(gw_coordinator_path / (batch + suffix_error)) for batch in batches) - - logging.info(f'Finished current process. Status batches: ' - f'{processed_status} processed / {lock_status} locked / {error_status} errors / {len(batches)} total') - - if error_status: - logging.error(f'Found errors! 
Resolve errors and rerun operator with gw_ignore_error_files=True.') - # Print all error messages - for error_file in s3coordinator.glob(str(gw_coordinator_path / ('**/*' + suffix_error))): - with s3coordinator.open(error_file, 'r') as f: - logging.error(f.read()) - - -if __name__ == '__main__': - process_wrapper(${component_process}) diff --git a/build/lib/c3/templates/cwl_component_template.cwl b/build/lib/c3/templates/cwl_component_template.cwl deleted file mode 100644 index f5106075..00000000 --- a/build/lib/c3/templates/cwl_component_template.cwl +++ /dev/null @@ -1,15 +0,0 @@ -cwlVersion: v1.2 -class: CommandLineTool - -baseCommand: "claimed" - -inputs: - component: - type: string - default: ${repository}/claimed-${name}:${version} - inputBinding: - position: 1 - prefix: --component -${inputs} - -outputs: ${outputs} diff --git a/build/lib/c3/templates/folder_grid_wrapper_template.py b/build/lib/c3/templates/folder_grid_wrapper_template.py deleted file mode 100644 index 900ace74..00000000 --- a/build/lib/c3/templates/folder_grid_wrapper_template.py +++ /dev/null @@ -1,137 +0,0 @@ -""" -${component_name} got wrapped by folder_grid_wrapper, which wraps any CLAIMED component and implements folder-level locking. -This folder grid wrapper scans immediate subdirectories of sgw_source_folder and for each folder the ${component_process} function is called once. -Locking is achieved by creating files in the target directory using the pattern .{STATUS} where STATUS in: -LOCKED -PROCESSED -FAILED - - -CLAIMED component description: ${component_description} -""" - -# pip install pandas - -# component dependencies -# ${component_dependencies} - -import os -import json -import random -import logging -from pathlib import Path -import pandas as pd - -# import component code -from ${component_name} import * - -# folder containing input data in single files or subfolders -sgw_source_folder = os.environ.get('sgw_source_folder') - -# folder to store the output markers and results -# Default: sgw_source_folder. If equal, entries containing LOCKED or PROCESSED or FAILED are ignored. 
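As a hypothetical illustration of the marker scheme implemented by _marker_paths below (entry names invented):
# Folder entry 'batch_01'   -> marker directory 'batch_01.LOCKED', later renamed to
#                              'batch_01.PROCESSED' or 'batch_01.FAILED'
# File entry 'image_01.tif' -> marker file 'image_01.LOCKED.tif', later renamed to
#                              'image_01.PROCESSED.tif' or 'image_01.FAILED.tif'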
-sgw_target_folder = os.environ.get('sgw_target_folder', sgw_source_folder) - -# component interface -${component_interface} - -def _marker_paths(entry_name: str, is_dir: bool): - """Return (LOCKED, PROCESSED, FAILED) marker paths for a file or a folder.""" - tgt = Path(sgw_target_folder) - if is_dir: - # folder markers are directories - return ( - tgt / f"{entry_name}.LOCKED", - tgt / f"{entry_name}.PROCESSED", - tgt / f"{entry_name}.FAILED", - ) - # file markers are files - base, ext = os.path.splitext(entry_name) - return ( - tgt / f"{base}.LOCKED{ext}", - tgt / f"{base}.PROCESSED{ext}", - tgt / f"{base}.FAILED{ext}", - ) - -def _claimed_any(locked, processed, failed) -> bool: - return locked.exists() or processed.exists() or failed.exists() - -def get_next_batch(): - """Pick a random unclaimed entry from source, supporting files and folders.""" - filtered = [] - with os.scandir(sgw_source_folder) as it: - for e in it: - name = e.name - - # If source and target are the same, skip marker entries - if sgw_source_folder == sgw_target_folder and ( - "LOCKED" in name or "PROCESSED" in name or "FAILED" in name - ): - continue - - locked, processed, failed = _marker_paths(name, e.is_dir()) - if not _claimed_any(locked, processed, failed): - filtered.append((name, e.is_dir())) - - if filtered: - return random.choice(filtered) # (name, is_dir) - return None - -def _try_acquire_lock(name: str, is_dir: bool): - """Create the LOCKED marker atomically and return its Path, or None if already claimed.""" - locked, _, _ = _marker_paths(name, is_dir) - try: - if is_dir: - # atomic directory creation is a good folder lock - locked.mkdir() - else: - # atomic file creation - fd = os.open(str(locked), os.O_CREAT | os.O_EXCL | os.O_WRONLY) - os.close(fd) - return locked - except FileExistsError: - return None - -def process_wrapper(sub_process): - sgw_target_folder_path = Path(sgw_target_folder) - sgw_target_folder_path.mkdir(exist_ok=True, parents=True) - - while True: - nxt = get_next_batch() - if nxt is None: - break - - entry_name, is_dir = nxt - src_path = str(Path(sgw_source_folder) / entry_name) - locked, processed, failed = _marker_paths(entry_name, is_dir) - logging.info(f"Processing: {src_path}") - - # Acquire the lock. If we lose the race, pick another entry. - lock_path = _try_acquire_lock(entry_name, is_dir) - if lock_path is None: - continue - - try: - # Call user component. For folders, src_path points to the folder. - # The second argument remains the marker path, same as before. 
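The wrapped component process is assumed to accept the source path and the marker path; a minimal hypothetical sketch of a compatible function (name and body invented):
# def process_folder(source_path, marker_path):
#     # source_path: the file or folder picked from sgw_source_folder
#     # marker_path: the LOCKED marker created for this entry (renamed to PROCESSED/FAILED afterwards)
#     ...  # user-defined processing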
- sub_process(src_path, str(lock_path)) - - # Success marker - lock_path.rename(processed) - - except Exception as e: - # Failure marker - lock_path.rename(failed) - if is_dir: - # Put the error message inside the FAILED directory - errfile = Path(failed) / "error.txt" - errfile.write_text(f"Exception occurred: {str(e)}\n", encoding="utf-8") - else: - # For files, FAILED is itself a file; overwrite with the error text - Path(failed).write_text(f"Exception occurred: {str(e)}\n", encoding="utf-8") - logging.error(f"Processing failed for {src_path}: {str(e)}") - - logging.info("Finished processing all batches.") - -if __name__ == '__main__': - process_wrapper(${component_process}) \ No newline at end of file diff --git a/build/lib/c3/templates/grid_wrapper_template.py b/build/lib/c3/templates/grid_wrapper_template.py deleted file mode 100644 index 9a418be7..00000000 --- a/build/lib/c3/templates/grid_wrapper_template.py +++ /dev/null @@ -1,205 +0,0 @@ -""" -${component_name} got wrapped by grid_wrapper, which wraps any CLAIMED component and implements the generic grid computing pattern https://romeokienzler.medium.com/the-generic-grid-computing-pattern-transforms-any-sequential-workflow-step-into-a-transient-grid-c7f3ca7459c8 - -CLAIMED component description: ${component_description} -""" - -# pip install pandas - -# component dependencies -# ${component_dependencies} - -import os -import json -import random -import logging -import time -import glob -from pathlib import Path -import pandas as pd - -# import component code -from ${component_name} import * - - -# File with batches. Provided as a comma-separated list of strings, keys in a json dict or single column CSV with 'filename' has header. -gw_batch_file = os.environ.get('gw_batch_file', None) -# Optional column name for a csv batch file (default: 'filename') -gw_batch_file_col_name = os.environ.get('gw_batch_file_col_name', 'filename') -# file path pattern like your/path/**/*.tif. Multiple patterns can be separated with commas. Is ignored if gw_batch_file is provided. -gw_file_path_pattern = os.environ.get('gw_file_path_pattern', None) -# pattern for grouping file paths into batches like ".split('.')[-1]". Is ignored if gw_batch_file is provided. 
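For illustration (hypothetical pattern and paths), the gw_group_by expression is appended to str(path_string) and evaluated in identify_batches_from_pattern below:
# gw_file_path_pattern = 'data/**/*.tif'
# gw_group_by          = ".split('/')[-1].split('_')[0]"
# 'data/2020/tile1_B02.tif' -> eval("str(path_string)" + gw_group_by) -> batch 'tile1'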
-gw_group_by = os.environ.get('gw_group_by', None) -# path to grid wrapper coordinator directory -gw_coordinator_path = os.environ.get('gw_coordinator_path') -gw_coordinator_path = Path(gw_coordinator_path) - -# timeout in seconds to remove lock file from struggling job (default 3 hours) -gw_lock_timeout = int(os.environ.get('gw_lock_timeout', 10800)) -# ignore error files and rerun batches with errors -gw_ignore_error_files = bool(os.environ.get('gw_ignore_error_files', False)) -# maximal wait time for staggering start -gw_max_time_wait_staggering = int(os.environ.get('gw_max_time_wait_staggering', 60)) - -# coordinator file suffix -suffix_lock = '.lock' -suffix_processed = '.processed' -suffix_error = '.err' - -# component interface -${component_interface} - -def load_batches_from_file(batch_file): - if batch_file.endswith('.json'): - # Load batches from keys of a json file - logging.info(f'Loading batches from json file: {batch_file}') - with open(batch_file, 'r') as f: - batch_dict = json.load(f) - batches = batch_dict.keys() - - elif batch_file.endswith('.csv'): - # Load batches from keys of a csv file - logging.info(f'Loading batches from csv file: {batch_file}') - df = pd.read_csv(batch_file, header='infer') - assert gw_batch_file_col_name in df.columns, \ - f'gw_batch_file_col_name {gw_batch_file_col_name} not in columns of batch file {batch_file}' - batches = df[gw_batch_file_col_name].to_list() - - elif batch_file.endswith('.txt'): - # Load batches from comma-separated txt file - logging.info(f'Loading comma-separated batch strings from file: {batch_file}') - with open(batch_file, 'r') as f: - batch_string = f.read() - batches = [b.strip() for b in batch_string.split(',')] - else: - raise ValueError(f'C3 only supports batch files of type ' - f'json (batches = dict keys), ' - f'csv (batches = column values), or ' - f'txt (batches = comma-seperated list).') - - logging.info(f'Loaded {len(batches)} batches') - logging.debug(f'List of batches: {batches}') - assert len(batches) > 0, f"batch_file {batch_file} has no batches." - return batches - - -def identify_batches_from_pattern(file_path_patterns, group_by): - logging.info(f'Start identifying files and batches') - batches = set() - all_files = [] - - # Iterate over comma-separated paths - for file_path_pattern in file_path_patterns.split(','): - logging.info(f'Get file paths from pattern: {file_path_pattern}') - files = glob.glob(file_path_pattern.strip()) - assert len(files) > 0, f"Found no files with file_path_pattern {file_path_pattern}." 
- all_files.extend(files) - - # get batches by applying the group by function to all file paths - for path_string in all_files: - part = eval('str(path_string)' + group_by, {"group_by": group_by, "path_string": path_string}) - assert part != '', f'Could not extract batch with path_string {path_string} and group_by {group_by}' - batches.add(part) - - logging.info(f'Identified {len(batches)} batches') - logging.debug(f'List of batches: {batches}') - - return batches - - -def perform_process(process, batch): - logging.debug(f'Check coordinator files for batch {batch}.') - # init coordinator files - lock_file = gw_coordinator_path / (batch + suffix_lock) - error_file = gw_coordinator_path / (batch + suffix_error) - processed_file = gw_coordinator_path / (batch + suffix_processed) - - if lock_file.exists(): - # remove strugglers - if lock_file.stat().st_mtime < time.time() - gw_lock_timeout: - logging.debug(f'Lock file {lock_file} is expired.') - lock_file.unlink() - else: - logging.debug(f'Batch {batch} is locked.') - return - - if processed_file.exists(): - logging.debug(f'Batch {batch} is processed.') - return - - if error_file.exists(): - if gw_ignore_error_files: - logging.info(f'Ignoring previous error in batch {batch} and rerun.') - else: - logging.debug(f'Batch {batch} has error.') - return - - logging.debug(f'Locking batch {batch}.') - lock_file.parent.mkdir(parents=True, exist_ok=True) - lock_file.touch() - - # processing files with custom process - logging.info(f'Processing batch {batch}.') - try: - target_files = process(batch, ${component_inputs}) - except Exception as err: - logging.exception(err) - # Write error to file - with open(error_file, 'w') as f: - f.write(f"{type(err).__name__} in batch {batch}: {err}") - lock_file.unlink() - logging.error(f'Continue processing.') - return - - logging.info(f'Finished Batch {batch}.') - processed_file.touch() - - # Remove lock file - if lock_file.exists(): - lock_file.unlink() - else: - logging.warning(f'Lock file {lock_file} was removed by another process. ' - f'Consider increasing gw_lock_timeout to avoid repeated processing (currently {gw_lock_timeout}s).') - - - -def process_wrapper(sub_process): - delay = random.randint(0, gw_max_time_wait_staggering) - logging.info(f'Staggering start, waiting for {delay} seconds') - time.sleep(delay) - - # Init coordinator dir - gw_coordinator_path.mkdir(exist_ok=True, parents=True) - - # get batches - if gw_batch_file is not None and os.path.isfile(gw_batch_file): - batches = load_batches_from_file(gw_batch_file) - elif gw_file_path_pattern is not None and gw_group_by is not None: - logging.warning("gw_file_path_pattern and gw_group_by are legacy and might be removed in a future release.") - batches = identify_batches_from_pattern(gw_file_path_pattern, gw_group_by) - else: - raise ValueError("Cannot identify batches. " - "Provide valid gw_batch_file or gw_file_path_pattern and gw_group_by.") - - # Iterate over all batches - for batch in batches: - perform_process(sub_process, batch) - - # Check and log status of batches - processed_status = sum((gw_coordinator_path / (batch + suffix_processed)).exists() for batch in batches) - lock_status = sum((gw_coordinator_path / (batch + suffix_lock)).exists() for batch in batches) - error_status = sum((gw_coordinator_path / (batch + suffix_error)).exists() for batch in batches) - - logging.info(f'Finished current process. 
Status batches: ' - f'{processed_status} processed / {lock_status} locked / {error_status} errors / {len(batches)} total') - - if error_status: - logging.error(f'Found errors! Resolve errors and rerun operator with gw_ignore_error_files=True.') - # print all error messages - for error_file in gw_coordinator_path.glob('**/*' + suffix_error): - with open(error_file, 'r') as f: - logging.error(f.read()) - - -if __name__ == '__main__': - process_wrapper(${component_process}) diff --git a/build/lib/c3/templates/kfp_component_template.yaml b/build/lib/c3/templates/kfp_component_template.yaml deleted file mode 100644 index d5031586..00000000 --- a/build/lib/c3/templates/kfp_component_template.yaml +++ /dev/null @@ -1,18 +0,0 @@ -name: ${name} -description: ${description} - -inputs: -${inputs} - -outputs: -${outputs} - -implementation: - container: - image: ${repository}/claimed-${name}:${version} - command: - - sh - - -ec - - | - ${command} ./${target_dir}${target_code} ${parameter_list} -${parameter_values} \ No newline at end of file diff --git a/build/lib/c3/templates/kubernetes_job_template.job.yaml b/build/lib/c3/templates/kubernetes_job_template.job.yaml deleted file mode 100644 index 413c417d..00000000 --- a/build/lib/c3/templates/kubernetes_job_template.job.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: ${name} -spec: - template: - spec: - containers: - - name: ${name} - image: ${repository}/claimed-${name}:${version} - workingDir: ${working_dir} - command: ["${command}","${target_dir}${target_code}"] - env: -${env_entries} - restartPolicy: OnFailure - imagePullSecrets: - - name: image_pull_secret \ No newline at end of file diff --git a/build/lib/c3/templates/legacy_cos_grid_wrapper_template.py b/build/lib/c3/templates/legacy_cos_grid_wrapper_template.py deleted file mode 100644 index f68a2094..00000000 --- a/build/lib/c3/templates/legacy_cos_grid_wrapper_template.py +++ /dev/null @@ -1,352 +0,0 @@ -""" -${component_name} got wrapped by cos_grid_wrapper, which wraps any CLAIMED component and implements the generic grid computing pattern for cos files https://romeokienzler.medium.com/the-generic-grid-computing-pattern-transforms-any-sequential-workflow-step-into-a-transient-grid-c7f3ca7459c8 - -CLAIMED component description: ${component_description} -""" - -# pip install s3fs pandas -# component dependencies -# ${component_dependencies} - -import os -import json -import random -import logging -import shutil -import time -import glob -import s3fs -from datetime import datetime -from pathlib import Path -import pandas as pd - - -# import component code -from ${component_name} import * - - -def explode_connection_string(cs): - if cs is None: - return None, None, None, None - elif cs.startswith('cos') or cs.startswith('s3'): - buffer=cs.split('://', 1)[1] - access_key_id=buffer.split('@')[0].split(':')[0] - secret_access_key=buffer.split('@')[0].split(':')[1] - endpoint = f"https://{buffer.split('@')[1].split('/')[0]}" - path=buffer.split('@')[1].split('/', 1)[1] - return (access_key_id, secret_access_key, endpoint, path) - else: - return (None, None, None, cs) - # TODO consider cs as secret and grab connection string from kubernetes - - -# File containing batches. Provided as a comma-separated list of strings or keys in a json dict. All batch file names must contain the batch name. 
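Purely as a hypothetical illustration of the connection strings read in the block below (keys, endpoint, and buckets invented); each is decomposed by explode_connection_string above into (access_key_id, secret_access_key, endpoint, path):
# gw_source_connection      = 'cos://KEY:SECRET@s3.example.com/source-bucket/input'
# gw_target_connection      = 'cos://KEY:SECRET@s3.example.com/target-bucket/output'
# gw_coordinator_connection = 'cos://KEY:SECRET@s3.example.com/coordinator-bucket/status'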
-gw_batch_file = os.environ.get('gw_batch_file', None) -(gw_batch_file_access_key_id, gw_batch_file_secret_access_key, gw_batch_file_endpoint, gw_batch_file) = explode_connection_string(gw_batch_file) -# Optional column name for a csv batch file (default: 'filename') -gw_batch_file_col_name = os.environ.get('gw_batch_file_col_name', 'filename') -# file path pattern like your/path/**/*.tif. Multiple patterns can be separated with commas. It is ignored if gw_batch_file is provided. -gw_file_path_pattern = os.environ.get('gw_file_path_pattern', None) -# pattern for grouping file paths into batches like ".split('.')[-2]". It is ignored if gw_batch_file is provided. -gw_group_by = os.environ.get('gw_group_by', None) - -# comma-separated list of additional cos files to copy -gw_additional_source_files = os.environ.get('gw_additional_source_files', '') -# download source cos files to local input path -gw_local_input_path = os.environ.get('gw_local_input_path', 'input') -# upload local target files to target cos path -gw_local_target_path = os.environ.get('gw_local_target_path', 'target') - -# cos gw_source_connection -gw_source_connection = os.environ.get('gw_source_connection') -(gw_source_access_key_id, gw_source_secret_access_key, gw_source_endpoint, gw_source_path) = explode_connection_string(gw_source_connection) - -# cos gw_target_connection -gw_target_connection = os.environ.get('gw_target_connection') -(gw_target_access_key_id, gw_target_secret_access_key, gw_target_endpoint, gw_target_path) = explode_connection_string(gw_target_connection) - -# cos gw_coordinator_connection -gw_coordinator_connection = os.environ.get('gw_coordinator_connection') -(gw_coordinator_access_key_id, gw_coordinator_secret_access_key, gw_coordinator_endpoint, gw_coordinator_path) = explode_connection_string(gw_coordinator_connection) - -# lock file suffix -gw_lock_file_suffix = os.environ.get('gw_lock_file_suffix', '.lock') -# processed file suffix -gw_processed_file_suffix = os.environ.get('gw_processed_file_suffix', '.processed') -# error file suffix -gw_error_file_suffix = os.environ.get('gw_error_file_suffix', '.err') -# timeout in seconds to remove lock file from struggling job (default 3 hours) -gw_lock_timeout = int(os.environ.get('gw_lock_timeout', 10800)) -# ignore error files and rerun batches with errors -gw_ignore_error_files = bool(os.environ.get('gw_ignore_error_files', False)) -# maximal wait time for staggering start -gw_max_time_wait_staggering = int(os.environ.get('gw_max_time_wait_staggering', 60)) - - -# component interface -${component_interface} - -# init s3 -s3source = s3fs.S3FileSystem( - anon=False, - key=gw_source_access_key_id, - secret=gw_source_secret_access_key, - client_kwargs={'endpoint_url': gw_source_endpoint}) - -gw_source_path = Path(gw_source_path) - -if gw_target_connection is not None: - s3target = s3fs.S3FileSystem( - anon=False, - key=gw_target_access_key_id, - secret=gw_target_secret_access_key, - client_kwargs={'endpoint_url': gw_target_endpoint}) - gw_target_path = Path(gw_target_path) -else: - logging.debug('Using source path as target path.') - gw_target_path = gw_source_path - s3target = s3source - -if gw_coordinator_connection is not None: - s3coordinator = s3fs.S3FileSystem( - anon=False, - key=gw_coordinator_access_key_id, - secret=gw_coordinator_secret_access_key, - client_kwargs={'endpoint_url': gw_coordinator_endpoint}) - gw_coordinator_path = Path(gw_coordinator_path) -else: - logging.debug('Using source bucket as coordinator bucket.') - gw_coordinator_path =
gw_source_path - s3coordinator = s3source - -if gw_batch_file_access_key_id is not None: - s3batch_file = s3fs.S3FileSystem( - anon=False, - key=gw_batch_file_access_key_id, - secret=gw_batch_file_secret_access_key, - client_kwargs={'endpoint_url': gw_batch_file_endpoint}) -else: - logging.debug('Loading batch file from source s3.') - s3batch_file = s3source - gw_batch_file = str(gw_source_path / gw_batch_file) - - -def load_batches_from_file(batch_file): - if batch_file.endswith('.json'): - # load batches from keys of a json file - logging.info(f'Loading batches from json file: {batch_file}') - with open(batch_file, 'r') as f: - batch_dict = json.load(f) - batches = batch_dict.keys() - - elif batch_file.endswith('.csv'): - # load batches from keys of a csv file - logging.info(f'Loading batches from csv file: {batch_file}') - df = pd.read_csv(batch_file, header='infer') - assert gw_batch_file_col_name in df.columns, \ - f'gw_batch_file_col_name {gw_batch_file_col_name} not in columns of batch file {batch_file}' - batches = df[gw_batch_file_col_name].to_list() - - elif batch_file.endswith('.txt'): - # Load batches from comma-separated txt file - logging.info(f'Loading comma-separated batch strings from file: {batch_file}') - with open(batch_file, 'r') as f: - batch_string = f.read() - batches = [b.strip() for b in batch_string.split(',')] - else: - raise ValueError(f'C3 only supports batch files of type ' - f'json (batches = dict keys), ' - f'csv (batches = column values), or ' - f'txt (batches = comma-seperated list).') - - logging.info(f'Loaded {len(batches)} batches') - logging.debug(f'List of batches: {batches}') - assert len(batches) > 0, f"batch_file {batch_file} has no batches." - return batches - - -def get_files_from_pattern(file_path_patterns): - logging.info(f'Start identifying files') - all_files = [] - - # Iterate over comma-separated paths - for file_path_pattern in file_path_patterns.split(','): - logging.info(f'Get file paths from pattern: {file_path_pattern}') - files = s3source.glob(str(gw_source_path / file_path_pattern.strip())) - if len(files) == 0: - logging.warning(f"Found no files with file_path_pattern {file_path_pattern}.") - all_files.extend(files) - logging.info(f'Found {len(all_files)} cos files') - return all_files - -def identify_batches_from_pattern(file_path_patterns, group_by): - logging.info(f'Start identifying files and batches') - batches = set() - all_files = get_files_from_pattern(file_path_patterns) - - # get batches by applying the group by function to all file paths - for path_string in all_files: - part = eval('str(path_string)' + group_by, {"group_by": group_by, "path_string": path_string}) - assert part != '', f'Could not extract batch with path_string {path_string} and group_by {group_by}' - batches.add(part) - - logging.info(f'Identified {len(batches)} batches') - logging.debug(f'List of batches: {batches}') - - return batches, all_files - - -def perform_process(process, batch, cos_files): - logging.debug(f'Check coordinator files for batch {batch}.') - # init coordinator files - coordinator_dir = gw_coordinator_path - lock_file = str(coordinator_dir / (batch + gw_lock_file_suffix)) - processed_file = str(coordinator_dir / (batch + gw_processed_file_suffix)) - error_file = str(coordinator_dir / (batch + gw_error_file_suffix)) - - if s3coordinator.exists(lock_file): - # remove strugglers - last_modified = s3coordinator.info(lock_file)['LastModified'] - if (datetime.now(last_modified.tzinfo) - last_modified).total_seconds() > gw_lock_timeout: - 
logging.info(f'Lock file {lock_file} is expired.') - s3coordinator.rm(lock_file) - else: - logging.debug(f'Batch {batch} is locked.') - return - - if s3coordinator.exists(processed_file): - logging.debug(f'Batch {batch} is processed.') - return - - if s3coordinator.exists(error_file): - if gw_ignore_error_files: - logging.info(f'Ignoring previous error in batch {batch} and rerun.') - else: - logging.debug(f'Batch {batch} has error.') - return - - logging.debug(f'Locking batch {batch}.') - s3coordinator.touch(lock_file) - logging.info(f'Processing batch {batch}.') - - # Create input and target directories - input_path = Path(gw_local_input_path) - target_path = Path(gw_local_target_path) - assert not input_path.exists(), (f'gw_local_input_path ({gw_local_input_path}) already exists. ' - f'Please provide a new input path.') - assert not target_path.exists(), (f'gw_local_target_path ({gw_local_target_path}) already exists. ' - f'Please provide a new target path.') - input_path.mkdir(parents=True) - target_path.mkdir(parents=True) - - # Download cos files to local input folder - batch_fileset = list(filter(lambda file: batch in file, cos_files)) - if gw_additional_source_files != '': - additional_source_files = [f.strip() for f in gw_additional_source_files.split(',')] - batch_fileset.extend(additional_source_files) - logging.info(f'Downloading {len(batch_fileset)} files from COS') - for cos_file in batch_fileset: - local_file = str(input_path / cos_file.split('/', 1)[-1]) - logging.debug(f'Downloading {cos_file} to {local_file}') - s3source.get(cos_file, local_file) - - # processing files with custom process - try: - target_files = process(batch, ${component_inputs}) - except Exception as err: - logging.exception(err) - # Write error to file - with s3coordinator.open(error_file, 'w') as f: - f.write(f"{type(err).__name__} in batch {batch}: {err}") - s3coordinator.rm(lock_file) - logging.error(f'Continue processing.') - return - - # optional verify target files - if target_files is not None: - if isinstance(target_files, str): - target_files = [target_files] - for target_file in target_files: - if not os.path.exists(target_file): - logging.error(f'Target file {target_file} does not exist for batch {batch}.') - if any([not str(t).startswith(gw_local_target_path) for t in target_files]): - logging.warning('Some target files are not in target path. Only files in target path are uploaded.') - else: - logging.info(f'Cannot verify batch {batch} (target files not provided). Using files in target_path.') - - # upload files in target path - local_target_files = list(target_path.glob('*')) - logging.info(f'Uploading {len(local_target_files)} target files to COS.') - for local_file in local_target_files: - cos_file = gw_target_path / local_file.relative_to(target_path) - logging.debug(f'Uploading {local_file} to {cos_file}') - s3target.put(str(local_file), str(cos_file)) - - logging.info(f'Remove local input and target files.') - shutil.rmtree(input_path) - shutil.rmtree(target_path) - - logging.info(f'Finished Batch {batch}.') - s3coordinator.touch(processed_file) - # Remove lock file - if s3coordinator.exists(lock_file): - s3coordinator.rm(lock_file) - else: - logging.warning(f'Lock file {lock_file} was removed by another process. 
' - f'Consider increasing gw_lock_timeout to avoid repeated processing (currently {gw_lock_timeout}s).') - - -def process_wrapper(sub_process): - delay = random.randint(0, gw_max_time_wait_staggering) - logging.info(f'Staggering start, waiting for {delay} seconds') - time.sleep(delay) - - # Init coordinator dir - coordinator_dir = gw_coordinator_path - s3coordinator.makedirs(coordinator_dir, exist_ok=True) - - # get batches - cos_gw_batch_file = str(gw_source_path / gw_batch_file) - if (gw_batch_file is not None and (os.path.isfile(gw_batch_file) or s3source.exists(cos_gw_batch_file))): - if not os.path.isfile(gw_batch_file): - # Download batch file from s3 - if s3batch_file.exists(gw_batch_file): - s3batch_file.get(gw_batch_file, gw_batch_file) - else: - s3batch_file.get(str(gw_source_path / gw_batch_file), gw_batch_file) - batches = load_batches_from_file(gw_batch_file) - if gw_file_path_pattern: - cos_files = get_files_from_pattern(gw_file_path_pattern) - else: - logging.warning('gw_file_path_pattern is not provided. ' - 'Grid wrapper expects the wrapped operator to handle COS files instead of the automatic download and upload.') - cos_files = [] - elif gw_file_path_pattern is not None and gw_group_by is not None: - batches, cos_files = identify_batches_from_pattern(gw_file_path_pattern, gw_group_by) - else: - raise ValueError("Cannot identify batches. " - "Provide valid gw_batch_file (local path or path within source bucket) " - "or gw_file_path_pattern and gw_group_by.") - - # Iterate over all batches - for batch in batches: - perform_process(sub_process, batch, cos_files) - - # Check and log status of batches - processed_status = [s3coordinator.exists(coordinator_dir / (batch + gw_processed_file_suffix)) for batch in batches] - lock_status = [s3coordinator.exists(coordinator_dir / (batch + gw_lock_file_suffix)) for batch in batches] - error_status = [s3coordinator.exists(coordinator_dir / (batch + gw_error_file_suffix)) for batch in batches] - - logging.info(f'Finished current process. Status batches: ' - f'{sum(processed_status)} processed / {sum(lock_status)} locked / {sum(error_status)} errors / {len(processed_status)} total') - - if sum(error_status): - logging.error(f'Found errors!
Resolve errors and rerun operator with gw_ignore_error_files=True.') - # print all error messages - for error_file in s3coordinator.glob(str(coordinator_dir / ('**/*' + gw_error_file_suffix))): - with s3coordinator.open(error_file, 'r') as f: - logging.error(f.read()) - - -if __name__ == '__main__': - process_wrapper(${component_process}) diff --git a/build/lib/c3/templates/python_dockerfile_template b/build/lib/c3/templates/python_dockerfile_template deleted file mode 100644 index d4498650..00000000 --- a/build/lib/c3/templates/python_dockerfile_template +++ /dev/null @@ -1,11 +0,0 @@ -FROM ${base_image} -USER root -${additional_files_docker} -RUN pip install --upgrade pip -RUN pip install ipython nbformat -${requirements_docker} -ADD ${target_code} ${working_dir}${target_dir} -RUN chmod -R 777 ${working_dir} -USER default -WORKDIR "${working_dir}" -CMD ["${command}", "${target_dir}${target_code}"] diff --git a/build/lib/c3/templates/s3kv_grid_wrapper_template.py b/build/lib/c3/templates/s3kv_grid_wrapper_template.py deleted file mode 100644 index 799be82b..00000000 --- a/build/lib/c3/templates/s3kv_grid_wrapper_template.py +++ /dev/null @@ -1,643 +0,0 @@ -""" -${component_name} got wrapped by grid_wrapper, which wraps any CLAIMED component and implements the generic grid computing pattern https://romeokienzler.medium.com/the-generic-grid-computing-pattern-transforms-any-sequential-workflow-step-into-a-transient-grid-c7f3ca7459c8 - -CLAIMED component description: ${component_description} -""" - -# pip install s3fs boto3 pandas -# component dependencies -# ${component_dependencies} - -import os -import json -import random -import logging -import time -import glob -from pathlib import Path -import pandas as pd -import s3fs -from hashlib import sha256 - - - -# import component code -from ${component_name} import * - -#------------------REMOVE once pip install for s3kv is fixed -import os -import time -from datetime import datetime -import shutil -import boto3 -import json - - -class S3KV: - def __init__(self, s3_endpoint_url:str, bucket_name: str, - aws_access_key_id: str = None, aws_secret_access_key: str = None , enable_local_cache=True): - """ - Initializes the S3KV object with the given S3 bucket, AWS credentials, and Elasticsearch host. - - :param s3_endpoint_url: The s3 endpoint. - :param bucket_name: The name of the S3 bucket to use for storing the key-value data. - :param aws_access_key_id: (Optional) AWS access key ID. - :param aws_secret_access_key: (Optional) AWS secret access key. - """ - self.bucket_name = bucket_name - self.enable_local_cache = enable_local_cache - self.s3_client = boto3.client( - 's3', - endpoint_url=s3_endpoint_url, - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key - ) - - if not os.path.exists('/tmp/s3kv_cache'): - os.makedirs('/tmp/s3kv_cache') - - def _get_object_key(self, key: str) -> str: - """ - Constructs the S3 object key for the given key. - - :param key: The key used to access the value in the S3 bucket. - :return: The S3 object key for the given key. - """ - return f"s3kv/{key}.json" - - def cache_all_keys(self): - """ - Saves all keys to the local /tmp directory as they are being added. - """ - keys = self.list_keys() - for key in keys: - value = self.get(key) - if value is not None: - with open(f'/tmp/s3kv_cache/{key}.json', 'w') as f: - json.dump(value, f) - - def get_from_cache(self, key: str) -> dict: - """ - Retrieves a key from the local cache if present, and clears old cache entries. 
- - :param key: The key to retrieve from the cache. - :return: The value associated with the given key if present in the cache, else None. - """ - self.clear_old_cache() - cache_path = f'/tmp/s3kv_cache/{key}.json' - if os.path.exists(cache_path): - with open(cache_path, 'r') as f: - return json.load(f) - else: - return None - - - def add(self, key: str, value: dict, metadata: dict = None): - """ - Adds a new key-value pair to the S3KV database, caches it locally, and sends metadata to Elasticsearch. - - :param key: The key to be added. - :param value: The value corresponding to the key. - :param metadata: (Optional) Metadata associated with the data (will be sent to Elasticsearch). - """ - s3_object_key = self._get_object_key(key) - serialized_value = json.dumps(value) - self.s3_client.put_object(Bucket=self.bucket_name, Key=s3_object_key, Body=serialized_value) - - with open(f'/tmp/s3kv_cache/{key}.json', 'w') as f: - json.dump(value, f) - - - - def delete(self, key: str): - """ - Deletes a key-value pair from the S3KV database. - - :param key: The key to be deleted. - """ - s3_object_key = self._get_object_key(key) - self.s3_client.delete_object(Bucket=self.bucket_name, Key=s3_object_key) - - cache_path = f'/tmp/s3kv_cache/{key}.json' - if os.path.exists(cache_path): - os.remove(cache_path) - - - def get(self, key: str, default: dict = None) -> dict: - """ - Retrieves the value associated with the given key from the S3KV database. - - :param key: The key whose value is to be retrieved. - :param default: (Optional) The default value to return if the key does not exist. - :return: The value associated with the given key, or the default value if the key does not exist. - """ - s3_object_key = self._get_object_key(key) - try: - response = self.s3_client.get_object(Bucket=self.bucket_name, Key=s3_object_key) - value = response['Body'].read() - return json.loads(value) - except self.s3_client.exceptions.NoSuchKey: - return default - - - def list_keys(self) -> list: - """ - Lists all the keys in the S3KV database. - - :return: A list of all keys in the database. - """ - response = self.s3_client.list_objects_v2(Bucket=self.bucket_name, Prefix="") - keys = [obj['Key'][5:-5] for obj in response.get('Contents', []) if obj['Key'].endswith('.json')] - return keys - - - def clear_cache(self): - """ - Clears the local cache by removing all cached JSON files. - """ - cache_directory = '/tmp/s3kv_cache' - if os.path.exists(cache_directory): - shutil.rmtree(cache_directory) - os.makedirs('/tmp/s3kv_cache') - - - def clear_old_cache(self, max_days: int = 7): - """ - Clears the cache for keys that have been in the cache for longer than a specific number of days. - - :param max_days: The maximum number of days a key can stay in the cache before being cleared. - """ - cache_directory = '/tmp/s3kv_cache' - current_time = time.time() - - if os.path.exists(cache_directory): - for filename in os.listdir(cache_directory): - file_path = os.path.join(cache_directory, filename) - if os.path.isfile(file_path): - file_age = current_time - os.path.getmtime(file_path) - if file_age > max_days * 86400: # Convert days to seconds - os.remove(file_path) - - - def clear_cache_for_key(self, key: str): - """ - Clears the local cache for a specific key in the S3KV database. - - :param key: The key for which to clear the local cache. 
- """ - cache_path = f'/tmp/s3kv_cache/{key}.json' - if os.path.exists(cache_path): - os.remove(cache_path) - - - def key_exists(self, key: str) -> bool: - """ - Checks if a key exists in the S3KV database. - - :param key: The key to check. - :return: True if the key exists, False otherwise. - """ - s3_object_key = self._get_object_key(key) - try: - self.s3_client.head_object(Bucket=self.bucket_name, Key=s3_object_key) - return True - except Exception as e: - # Return false even if response is unauthorized or similar - return False - - - def list_keys_with_prefix(self, prefix: str) -> list: - """ - Lists all the keys in the S3KV database that have a specific prefix. - - :param prefix: The prefix to filter the keys. - :return: A list of keys in the database that have the specified prefix. - """ - response = self.s3_client.list_objects_v2(Bucket=self.bucket_name, Prefix=prefix) - keys = [obj['Key'][5:-5] for obj in response.get('Contents', []) if obj['Key'].endswith('.json')] - return keys - - - def copy_key(self, source_key: str, destination_key: str): - """ - Copies the value of one key to another key in the S3KV database. - - :param source_key: The key whose value will be copied. - :param destination_key: The key to which the value will be copied. - """ - source_s3_object_key = self._get_object_key(source_key) - destination_s3_object_key = self._get_object_key(destination_key) - - response = self.s3_client.get_object(Bucket=self.bucket_name, Key=source_s3_object_key) - value = response['Body'].read() - - self.s3_client.put_object(Bucket=self.bucket_name, Key=destination_s3_object_key, Body=value) - - # Copy the key in the local cache if it exists - source_cache_path = f'/tmp/s3kv_cache/{source_key}.json' - destination_cache_path = f'/tmp/s3kv_cache/{destination_key}.json' - if os.path.exists(source_cache_path): - shutil.copy(source_cache_path, destination_cache_path) - - - def get_key_size(self, key: str) -> int: - """ - Gets the size (file size) of a key in the S3KV database. - - :param key: The key whose size will be retrieved. - :return: The size (file size) of the key in bytes, or 0 if the key does not exist. - """ - s3_object_key = self._get_object_key(key) - try: - response = self.s3_client.head_object(Bucket=self.bucket_name, Key=s3_object_key) - return response['ContentLength'] - except self.s3_client.exceptions.NoSuchKey: - return 0 - - - def get_key_last_updated_time(self, key: str) -> float: - """ - Gets the last updated time of a key in the S3KV database. - - :param key: The key whose last updated time will be retrieved. - :return: The last updated time of the key as a floating-point timestamp, or 0 if the key does not exist. - """ - s3_object_key = self._get_object_key(key) - try: - response = self.s3_client.head_object(Bucket=self.bucket_name, Key=s3_object_key) - last_modified = response['LastModified'] - st = time.mktime(last_modified.timetuple()) - - return datetime.fromtimestamp(st) - - except self.s3_client.exceptions.NoSuchKey: - return 0 - - - def set_bucket_policy(self): - """ - Sets a bucket policy to grant read and write access to specific keys used by the S3KV library. 
- """ - policy = { - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "S3KVReadWriteAccess", - "Effect": "Allow", - "Principal": { - "AWS": "*" - }, - "Action": [ - "s3:GetObject", - "s3:PutObject" - ], - "Resource": f"arn:aws:s3:::{self.bucket_name}/s3kv/*" - } - ] - } - - policy_json = json.dumps(policy) - self.s3_client.put_bucket_policy(Bucket=self.bucket_name, Policy=policy_json) - - - def tag_key(self, key: str, tags: dict): - """ - Tags a key in the S3KV database with the provided tags. - - :param key: The key to be tagged. - :param tags: A dictionary containing the tags to be added to the key. - For example, {'TagKey1': 'TagValue1', 'TagKey2': 'TagValue2'} - """ - s3_object_key = self._get_object_key(key) - - # Convert the tags dictionary to a format compatible with the `put_object_tagging` method - tagging = {'TagSet': [{'Key': k, 'Value': v} for k, v in tags.items()]} - - # Apply the tags to the object - self.s3_client.put_object_tagging(Bucket=self.bucket_name, Key=s3_object_key, Tagging=tagging) - - - def tag_keys_with_prefix(self, prefix: str, tags: dict): - """ - Tags all keys in the S3KV database with the provided prefix with the specified tags. - - :param prefix: The prefix of the keys to be tagged. - :param tags: A dictionary containing the tags to be added to the keys. - For example, {'TagKey1': 'TagValue1', 'TagKey2': 'TagValue2'} - """ - keys_to_tag = self.list_keys_with_prefix(prefix) - - for key in keys_to_tag: - self.tag_key(key, tags) - - - def merge_keys(self, source_keys: list, destination_key: str): - """ - Merges the values of source keys into the value of the destination key in the S3KV database. - - :param source_keys: A list of source keys whose values will be merged. - :param destination_key: The key whose value will be updated by merging the source values. - """ - destination_s3_object_key = self._get_object_key(destination_key) - - # Initialize an empty dictionary for the destination value - destination_value = {} - - # Retrieve and merge values from source keys - for source_key in source_keys: - source_value = self.get(source_key) - if source_value: - destination_value.update(source_value) - - # Update the destination value in the S3 bucket - serialized_value = json.dumps(destination_value) - self.s3_client.put_object(Bucket=self.bucket_name, Key=destination_s3_object_key, Body=serialized_value) - - # Update the value in the local cache if it exists - destination_cache_path = f'/tmp/s3kv_cache/{destination_key}.json' - with open(destination_cache_path, 'w') as f: - json.dump(destination_value, f) - - - - def find_keys_by_tag_value(self, tag_key: str, tag_value: str) -> list: - """ - Finds keys in the S3KV database based on the value of a specific tag. - - :param tag_key: The tag key to search for. - :param tag_value: The tag value to search for. - :return: A list of keys that have the specified tag key with the specified value. - """ - response = self.s3_client.list_objects_v2(Bucket=self.bucket_name, Prefix="s3kv/") - keys_with_tag = [] - - for obj in response.get('Contents', []): - s3_object_key = obj['Key'] - tags = self.get_tags(s3_object_key) - if tags and tag_key in tags and tags[tag_key] == tag_value: - keys_with_tag.append(s3_object_key[5:-5]) # Extract the key name - - return keys_with_tag - - def get_tags(self, s3_object_key: str) -> dict: - """ - Gets the tags of an object in the S3KV database. - - :param s3_object_key: The S3 object key whose tags will be retrieved. - :return: A dictionary containing the tags of the object. 
- """ - response = self.s3_client.get_object_tagging(Bucket=self.bucket_name, Key=s3_object_key) - tags = {} - for tag in response.get('TagSet', []): - tags[tag['Key']] = tag['Value'] - return tags - - - - def place_retention_lock(self, key: str, retention_days: int): - """ - Places a retention lock on a key in the S3KV database for the specified number of days. - - :param key: The key to place the retention lock on. - :param retention_days: The number of days to lock the key for. - """ - s3_object_key = self._get_object_key(key) - print(s3_object_key) - - retention_period = retention_days * 24 * 60 * 60 # Convert days to seconds - - self.s3_client.put_object_retention( - Bucket=self.bucket_name, - Key=s3_object_key, - Retention={ - 'Mode': 'GOVERNANCE', - 'RetainUntilDate': int(time.time()) + retention_period - } - ) - - - def remove_retention_lock(self, key: str): - """ - Removes the retention lock from a key in the S3KV database. - - :param key: The key to remove the retention lock from. - """ - s3_object_key = self._get_object_key(key) - - self.s3_client.put_object_retention( - Bucket=self.bucket_name, - Key=s3_object_key, - BypassGovernanceRetention=True, - Retention={ - - } - ) - - - def delete_by_tag(self, tag_key: str, tag_value: str): - """ - Deletes keys in the S3KV database based on a specific tag. - - :param tag_key: The tag key to match for deletion. - :param tag_value: The tag value to match for deletion. - """ - keys_to_delete = self.find_keys_by_tag_value(tag_key, tag_value) - - for key in keys_to_delete: - self.delete(key) - - - def apply_legal_hold(self, key: str): - """ - Applies a legal hold on a key in the S3KV database. - - :param key: The key on which to apply the legal hold. - """ - s3_object_key = self._get_object_key(key) - - self.s3_client.put_object_legal_hold( - Bucket=self.bucket_name, - Key=s3_object_key, - LegalHold={ - 'Status': 'ON' - } - ) - - - - - - def is_legal_hold_applied(self, key: str) -> bool: - """ - Checks if a key in the S3KV database is under legal hold. - - :param key: The key to check for legal hold. - :return: True if the key is under legal hold, False otherwise. - """ - s3_object_key = self._get_object_key(key) - - response = self.s3_client.get_object_legal_hold(Bucket=self.bucket_name, Key=s3_object_key) - - legal_hold_status = response.get('LegalHold', {}).get('Status') - return legal_hold_status == 'ON' - - - def release_legal_hold(self, key: str): - """ - Releases a key from legal hold in the S3KV database. - - :param key: The key to release from legal hold. - """ - s3_object_key = self._get_object_key(key) - - self.s3_client.put_object_legal_hold( - Bucket=self.bucket_name, - Key=s3_object_key, - LegalHold={ - 'Status': 'OFF' - } - ) - -#----------------------------------------------------------- - - -def explode_connection_string(cs): - if cs is None: - return None, None, None, None - if cs.startswith('cos') or cs.startswith('s3'): - buffer=cs.split('://')[1] - access_key_id=buffer.split('@')[0].split(':')[0] - secret_access_key=buffer.split('@')[0].split(':')[1] - endpoint=f"https://{buffer.split('@')[1].split('/')[0]}" - path=buffer.split('@')[1].split('/', 1)[1] - return (access_key_id, secret_access_key, endpoint, path) - else: - return (None, None, None, cs) - # TODO consider cs as secret and grab connection string from kubernetes - - - -# File with batches. Provided as a comma-separated list of strings, keys in a json dict or single column CSV with 'filename' has header. 
Either local path as [cos|s3]://user:pw@endpoint/path -gw_batch_file = os.environ.get('gw_batch_file', None) -(gw_batch_file_access_key_id, gw_batch_file_secret_access_key, gw_batch_file_endpoint, gw_batch_file) = explode_connection_string(gw_batch_file) -# Optional column name for a csv batch file (default: 'filename') -gw_batch_file_col_name = os.environ.get('gw_batch_file_col_name', 'filename') - -# cos gw_coordinator_connection -gw_coordinator_connection = os.environ.get('gw_coordinator_connection') -(gw_coordinator_access_key_id, gw_coordinator_secret_access_key, gw_coordinator_endpoint, gw_coordinator_path) = explode_connection_string(gw_coordinator_connection) - -# maximal wait time for staggering start -gw_max_time_wait_staggering = int(os.environ.get('gw_max_time_wait_staggering',60)) - -# component interface -#${component_interface} - -def load_batches_from_file(batch_file): - # Download batch file from s3 - s3_batch_file = s3fs.S3FileSystem( - anon=False, - key=gw_batch_file_access_key_id, - secret=gw_batch_file_secret_access_key, - client_kwargs={'endpoint_url': gw_batch_file_endpoint}) - s3_batch_file.get(batch_file, batch_file) - - if batch_file.endswith('.json'): - # load batches from keys of a json file - logging.info(f'Loading batches from json file: {batch_file}') - with open(batch_file, 'r') as f: - batch_dict = json.load(f) - batches = batch_dict.keys() - - elif batch_file.endswith('.csv'): - # load batches from keys of a csv file - logging.info(f'Loading batches from csv file: {batch_file}') - df = pd.read_csv(batch_file, header='infer') - assert gw_batch_file_col_name in df.columns, \ - f'gw_batch_file_col_name {gw_batch_file_col_name} not in columns of batch file {batch_file}' - batches = df[gw_batch_file_col_name].to_list() - - elif batch_file.endswith('.txt'): - # Load batches from comma-separated txt file - logging.info(f'Loading comma-separated batch strings from file: {batch_file}') - with open(batch_file, 'r') as f: - batch_string = f.read() - batches = [b.strip() for b in batch_string.split(',')] - else: - raise ValueError(f'C3 only supports batch files of type ' - f'json (batches = dict keys), ' - f'csv (batches = column values), or ' - f'txt (batches = comma-seperated list).') - - logging.info(f'Loaded {len(batches)} batches') - logging.debug(f'List of batches: {batches}') - assert len(batches) > 0, f"batch_file {batch_file} has no batches." 
- return batches - - -def perform_process(process, batch, coordinator): - logging.debug(f'Check coordinator files for batch {batch}.') - - batch_id = sha256(batch.encode('utf-8')).hexdigest() # ensure no special characters break cos - logging.info(f'Generating {batch_id} for {batch}') - - if coordinator.key_exists(batch_id): - if coordinator.get(batch_id) == 'locked': - logging.debug(f'Batch {batch_id} is locked') - return - elif coordinator.get(batch_id) == 'processed': - logging.debug(f'Batch {batch_id} is processed') - return - else: - logging.debug(f'Batch {batch_id} is failed') - return - - - logging.debug(f'Locking batch {batch_id}.') - coordinator.add(batch_id,'locked') - - # processing files with custom process - logging.info(f'Processing batch {batch_id}.') - try: - process(batch, ${component_inputs}) - except Exception as err: - logging.exception(err) - coordinator.add(batch_id,f"{type(err).__name__} in batch {batch_id}: {err}") - logging.error(f'Continue processing.') - return - - logging.info(f'Finished Batch {batch_id}.') - coordinator.add(batch_id,'processed') - - -def process_wrapper(sub_process): - delay = random.randint(0, gw_max_time_wait_staggering) - logging.info(f'Staggering start, waiting for {delay} seconds') - time.sleep(delay) - - # Init coordinator - coordinator = S3KV(gw_coordinator_endpoint, - gw_coordinator_path, - gw_coordinator_access_key_id, gw_coordinator_secret_access_key, - enable_local_cache=False) - - - # get batches - batches = load_batches_from_file(gw_batch_file) - - # Iterate over all batches - for batch in batches: - perform_process(sub_process, batch, coordinator) - - # Check and log status of batches - processed_status = sum(coordinator.get(batch_id) == 'processed' for batch_id in batches) - lock_status = sum(coordinator.get(batch_id) == 'locked' for batch_id in batches) - exists_status = sum(coordinator.key_exists(batch_id) for batch_id in batches) - error_status = exists_status - processed_status - lock_status - - logging.info(f'Finished current process. Status batches: ' - f'{processed_status} processed / {lock_status} locked / {error_status} errors / {len(batches)} total') - - -if __name__ == '__main__': - process_wrapper(${component_process}) diff --git a/build/lib/c3/templates/simple_grid_wrapper_template.py b/build/lib/c3/templates/simple_grid_wrapper_template.py deleted file mode 100644 index 66908801..00000000 --- a/build/lib/c3/templates/simple_grid_wrapper_template.py +++ /dev/null @@ -1,105 +0,0 @@ -""" -${component_name} got wrapped by grid_wrapper, which wraps any CLAIMED component and implements the generic grid computing pattern https://romeokienzler.medium.com/the-generic-grid-computing-pattern-transforms-any-sequential-workflow-step-into-a-transient-grid-c7f3ca7459c8 -This simple grid wrapper just scans a folder and for each file the grid_process function is called. 
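# --- Editor's sketch (not part of the original templates) --------------------
# Simplified illustration of the coordination protocol used by perform_process()
# above: each batch name is hashed to a sha256 id and moves through the states
# 'locked' -> 'processed' (or an error message on failure). The dict below is a
# made-up stand-in for the S3KV-backed coordinator.
from hashlib import sha256

coordinator = {}                                  # stand-in: batch_id -> status

def demo_process(batch: str) -> None:
    batch_id = sha256(batch.encode('utf-8')).hexdigest()
    if batch_id in coordinator:                   # locked, processed, or failed earlier
        return
    coordinator[batch_id] = 'locked'
    try:
        pass                                      # the wrapped component would run here
    except Exception as err:
        coordinator[batch_id] = f'{type(err).__name__}: {err}'
        return
    coordinator[batch_id] = 'processed'

demo_process('part-0001')
print(list(coordinator.values()))                 # ['processed']
# -----------------------------------------------------------------------------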
Locking is achieved the following way: -Given source file1.ext is processed, simple_grid_wrapper creates files in the target_directory following the pattern file1.{STATUS}.ext where STATUS in: -LOCKED -PROCESSED -FAILED - - -CLAIMED component description: ${component_description} -""" - -# pip install pandas - -# component dependencies -# ${component_dependencies} - -import os -import json -import random -import logging -import time -import glob -from pathlib import Path -import pandas as pd - -# import component code -from ${component_name} import * - - -#folder containing input data in single files -sgw_source_folder = os.environ.get('sgw_source_folder') - -#folder to store the output data in single files. Default: sgw_source_folder, in case sgw_source_folder==sgw_target_folder, files containing .LOCKED., .PROCESSED., .FAILED. are ignored -sgw_target_folder = os.environ.get('sgw_target_folder', sgw_source_folder) - -# component interface -${component_interface} - -def get_next_batch(): - files = os.listdir(sgw_source_folder) - if sgw_source_folder == sgw_target_folder: - files = [ - f for f in files - if not any(keyword in f for keyword in ["LOCKED", "PROCESSED", "FAILED"]) - ] - - # Filter files and check if corresponding target file exists - filtered_files = [] - for file in files: - file_name, file_ext = os.path.splitext(file) - - # Create target file names with LOCKED, PROCESSED, FAILED extensions - target_file_locked = f"{file_name}.LOCKED{file_ext}" - target_file_processed = f"{file_name}.PROCESSED{file_ext}" - target_file_failed = f"{file_name}.FAILED{file_ext}" - - # Check if any of the target files exists - if not any( - os.path.exists(os.path.join(sgw_target_folder, target_file)) - for target_file in [target_file_locked, target_file_processed, target_file_failed] - ): - filtered_files.append(file) - - if filtered_files: - return random.choice(filtered_files) - else: - return None - - -def process_wrapper(sub_process): - sgw_target_folder_path = Path(sgw_target_folder) - sgw_target_folder_path.mkdir(exist_ok=True, parents=True) - - while True: - file_to_process = get_next_batch() - logging.info(f"Processing batch: {file_to_process}") - if file_to_process is None: - break - - file_name = Path(file_to_process).stem - file_ext = Path(file_to_process).suffix - locked_file = sgw_target_folder+f"/{file_name}.LOCKED{file_ext}" - locked_file_path = Path(locked_file) - - try: - locked_file_path.touch() - sub_process(sgw_source_folder +'/'+ file_to_process, locked_file) - processed_file = sgw_target_folder+f"/{file_name}.PROCESSED{file_ext}" - locked_file_path.rename(processed_file) - - except Exception as e: - failed_file = sgw_target_folder+f"/{file_name}.FAILED{file_ext}" - locked_file_path.rename(failed_file) - - with open(failed_file, 'w') as f: - f.write(f"Exception occurred: {str(e)}\n") - - logging.error(f"Processing failed for {file_to_process}: {str(e)}") - - logging.info("Finished processing all batches.") - - -if __name__ == '__main__': - process_wrapper(${component_process}) diff --git a/build/lib/c3/utils.py b/build/lib/c3/utils.py deleted file mode 100644 index 0bbe5442..00000000 --- a/build/lib/c3/utils.py +++ /dev/null @@ -1,146 +0,0 @@ -import os -import logging -import nbformat -import re -import subprocess -from nbconvert.exporters import PythonExporter - - -def convert_notebook(path): - notebook = nbformat.read(path, as_version=4) - - # backwards compatibility (v0.1 description was included in second cell, merge first two markdown cells) - if 
notebook['cells'][0]['cell_type'] == 'markdown' and notebook['cells'][1]['cell_type'] == 'markdown': - logging.info('Merge first two markdown cells. File name is used as operator name, not first markdown cell.') - notebook['cells'][1]['source'] = notebook['cells'][0]['source'] + '\n' + notebook['cells'][1]['source'] - notebook['cells'] = notebook['cells'][1:] - - for cell in notebook['cells']: - if cell['cell_type'] == 'markdown': - # convert markdown to doc string - cell['cell_type'] = 'code' - cell['source'] = '"""\n' + cell['source'] + '\n"""' - cell['outputs'] = [] - cell['execution_count'] = 0 - if cell['cell_type'] == 'code' and re.search('![ ]*pip', cell['source']): - # replace !pip with #pip - cell['source'] = re.sub('![ ]*pip[ ]*install', '# pip install', cell['source']) - - # convert tp python script - (code, _) = PythonExporter().from_notebook_node(notebook) - - # add import get_ipython - code = 'from IPython import get_ipython \n' + code - - py_path = path.split('/')[-1].replace('.ipynb', '.py').replace('-', '_') - - assert not os.path.exists(py_path), f"File {py_path} already exist. Cannot convert notebook." - with open(py_path, 'w') as py_file: - py_file.write(code) - - return py_path - - -def increase_image_version(last_version): - try: - # increase last version value by 1 - version = last_version.split('.') - version[-1] = str(int(version[-1]) + 1) - version = '.'.join(version) - except: - # fails if a string value was used for the last tag - version = last_version + '.1' - logging.debug(f'Failed to increase last value, adding .1') - pass - return version - - -def pull_docker_image_tags(image): - logging.warning("The current implementation can only query local docker images. " - "Please use an argument '-v ' to avoid duplicates.") - # TODO: Add script for reading image tags from docker hub - # list images - output = subprocess.run( - ['docker', 'image', 'ls', image], - stdout=subprocess.PIPE - ).stdout.decode('utf-8') - try: - # remove header - image_list = output.splitlines()[1:] - # get list of image tags - image_tags = [line.split()[1] for line in image_list] - except: - image_tags = [] - logging.error(f"Could not load image tags from 'docker image ls' output: {output}") - pass - - # filter latest and none - image_tags = [t for t in image_tags if t not in ['latest', '']] - return image_tags - - -def pull_icr_image_tags(image): - # list images from icr - output = subprocess.run( - ['ibmcloud', 'cr', 'images', '--restrict', image.split('icr.io/', 1)[1]], - stdout=subprocess.PIPE - ).stdout.decode('utf-8') - - try: - assert 'You have no images in the namespaces' not in output - # remove header and final status - image_list = output.splitlines()[3:-2] - # get list of image tags - image_tags = [line.split()[1] for line in image_list] - except: - image_tags = [] - logging.warning(f"Could not load image tags from 'ibmcloud cr images' output: {output}") - pass - - # filter latest and none - image_tags = [t for t in image_tags if t not in ['latest', '']] - return image_tags - - -def get_image_version(repository, name): - """ - Get current version of the image from the registry and increase the version by 1. - Defaults to 0.1 if no image is found in the registry. 
- """ - if repository is None: - logging.debug('Using 0.1 as local version.') - return '0.1' - - logging.debug(f'Get image version from registry.') - if 'docker.io' in repository: - logging.debug('Get image tags from docker.') - image_tags = pull_docker_image_tags(f'{repository}/claimed-{name}') - elif 'icr.io' in repository: - logging.debug('Get image tags from ibmcloud container registry.') - image_tags = pull_icr_image_tags(f'{repository}/claimed-{name}') - else: - logging.warning('Unrecognised container registry, using docker to query image tags.') - image_tags = pull_docker_image_tags(f'{repository}/claimed-{name}') - logging.debug(f'Image tags: {image_tags}') - - def check_only_numbers(test_str): - return set(test_str) <= set('.0123456789') - - if len(image_tags) == 0: - # default version - version = '0.1' - logging.info(f'Using default version {version}. No prior image tag found for {repository}/claimed-{name}.') - - elif not check_only_numbers(image_tags[0]): - # increase last version - version = increase_image_version(image_tags[0]) - logging.info(f'Using version {version} based on last version {image_tags[0]}.') - - else: - # find the highest numerical version - image_tags = list(filter(check_only_numbers, image_tags)) - image_tags.sort(key=lambda s: list(map(int, s.split('.')))) - version = increase_image_version(image_tags[-1]) - logging.info(f'Using version {version} based on highest previous version {image_tags[-1]}.') - - return version diff --git a/build/lib/claimed/__init__.py b/build/lib/claimed/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/build/lib/claimed/claimed.py b/build/lib/claimed/claimed.py deleted file mode 100644 index c19c11b1..00000000 --- a/build/lib/claimed/claimed.py +++ /dev/null @@ -1,12 +0,0 @@ -import subprocess -import sys -import os - - -def main(): - dir_path = os.path.dirname(os.path.realpath(__file__)) - return subprocess.call(f'{dir_path}/scripts/claimed ' + ' '.join(sys.argv[1:]), shell=True) - - -if __name__ == '__main__': - main() diff --git a/build/lib/mlx/__init__.py b/build/lib/mlx/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/build/lib/mlx/cos_backend.py b/build/lib/mlx/cos_backend.py deleted file mode 100644 index 03454120..00000000 --- a/build/lib/mlx/cos_backend.py +++ /dev/null @@ -1,111 +0,0 @@ -import json -import boto3 -import jsonschema -from jsonschema import validate -import os - -class COSKVStore: - def __init__(self, bucket_name, schema, cos_client=None): - """ - Initialize the COS Key-Value store. - :param bucket_name: Name of the COS bucket. - :param schema: JSON Schema to validate values. - :param cos_client: Optional COS client instance (for dependency injection). - """ - self.bucket_name = bucket_name - self.schema = schema - self.cos_client = cos_client or boto3.client('s3') - - def put(self, key, value): - """ - Store a value in COS after validating against the JSON schema. - :param key: The key under which the value is stored. - :param value: The value to store (must be JSON-serializable). - """ - try: - validate(instance=value, schema=self.schema) - except jsonschema.exceptions.ValidationError as e: - raise ValueError(f"Validation error: {e.message}") - self.cos_client.put_object( - Bucket=self.bucket_name, - Key=key, - Body=json.dumps(value) - ) - - def get(self, key): - """ - Retrieve a value from COS. - :param key: The key to retrieve. - :return: The stored value as a dictionary. 
- """ - try: - response = self.cos_client.get_object(Bucket=self.bucket_name, Key=key) - return json.loads(response['Body'].read().decode('utf-8')) - except self.cos_client.exceptions.NoSuchKey: - raise KeyError(f"Key '{key}' not found in bucket '{self.bucket_name}'") - - def delete(self, key): - """ - Delete a key-value pair from COS. - :param key: The key to delete. - """ - self.cos_client.delete_object(Bucket=self.bucket_name, Key=key) - - def list_keys(self): - """ - List all keys in the COS bucket. - :return: A list of keys. - """ - response = self.cos_client.list_objects_v2(Bucket=self.bucket_name) - return [obj['Key'] for obj in response.get('Contents', [])] - -def load_schemas(schema_folder): - """ - Loads all JSON schemas from the given folder. - :param schema_folder: Path to the folder containing JSON schema files. - :return: A dictionary of schema names and their corresponding JSON objects. - """ - schemas = {} - for filename in os.listdir(schema_folder): - if filename.endswith(".json"): - filepath = os.path.join(schema_folder, filename) - with open(filepath, 'r') as f: - schemas[filename[:-5]] = json.load(f) - return schemas - -# Example Usage -if __name__ == "__main__": - schema_folder = "../schemas" - schemas = load_schemas(schema_folder) - - if "example_schema" in schemas: - example_schema = schemas["example_schema"] - cos_store = COSKVStore("my-cos-bucket", example_schema) - - test_data = { - "id": "model_123", - "name": "MyModel", - "framework": "TensorFlow", - "version": "2.10", - "description": "A simple neural network model.", - "metrics": { - "accuracy": 0.95, - "loss": 0.1 - } - } - cos_store.put("model_123", test_data) - print(cos_store.get("model_123")) - - #Example data without the metric key, which is not required - test_data_no_metrics = { - "id": "model_456", - "name": "MyOtherModel", - "framework": "PyTorch", - "version": "1.12", - "description": "Another neural network model.", - } - cos_store.put("model_456", test_data_no_metrics) - print(cos_store.get("model_456")) - - else: - print("Schema 'example_schema' not found.") \ No newline at end of file diff --git a/build/lib/mlx/s3_kv_store.py b/build/lib/mlx/s3_kv_store.py deleted file mode 100644 index 40b327cd..00000000 --- a/build/lib/mlx/s3_kv_store.py +++ /dev/null @@ -1,271 +0,0 @@ -import json -import posixpath -import re -import argparse -from typing import Optional, Dict, List, Any, Tuple -from urllib.parse import quote, unquote -import boto3 -from botocore.exceptions import ClientError - -INDEX_SEPARATOR = "__i__" -KV_SEPARATOR = "=" -FILENAME_SUFFIX = ".json" - - -def _encode_component(s: str) -> str: - return quote(s, safe="") - - -def _decode_component(s: str) -> str: - return unquote(s) - - -def _build_filename(key: str, indexes: Optional[Dict[str, str]] = None) -> str: - parts = [_encode_component(key)] - if indexes: - for k in sorted(indexes.keys()): - v = indexes[k] - parts.append(f"{_encode_component(k)}{KV_SEPARATOR}{_encode_component(str(v))}") - return INDEX_SEPARATOR.join(parts) + FILENAME_SUFFIX - - -def _parse_filename(filename: str) -> Tuple[str, Dict[str, str]]: - if not filename.endswith(FILENAME_SUFFIX): - raise ValueError("invalid filename (missing .json suffix)") - core = filename[:-len(FILENAME_SUFFIX)] - parts = core.split(INDEX_SEPARATOR) - if not parts: - raise ValueError("invalid filename") - key = _decode_component(parts[0]) - indexes: Dict[str, str] = {} - for p in parts[1:]: - if KV_SEPARATOR not in p: - continue - k_enc, v_enc = p.split(KV_SEPARATOR, 1) - k = 
_decode_component(k_enc) - v = _decode_component(v_enc) - indexes[k] = v - return key, indexes - - -class S3KVStore: - def __init__(self, bucket: str, store_name: str, s3_client: Optional[Any] = None, endpoint_url: Optional[str] = None, aws_access_key_id: Optional[str] = None, aws_secret_access_key: Optional[str] = None): - self.bucket = bucket - self.store_name = store_name.strip("/") - if s3_client is None: - self.s3 = boto3.client( - "s3", - endpoint_url=endpoint_url, - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - ) - else: - self.s3 = s3_client - - def _prefix(self) -> str: - return f"{self.store_name}/" if self.store_name else "" - - def _s3_key_for_filename(self, filename: str) -> str: - return posixpath.join(self._prefix(), filename) - - def list(self, prefix: Optional[str] = None, max_keys: int = 1000) -> List[Dict[str, Any]]: - s3_prefix = self._prefix() - continuation_token = None - results: List[Dict[str, Any]] = [] - - while True: - kwargs = {"Bucket": self.bucket, "Prefix": s3_prefix, "MaxKeys": max_keys} - if continuation_token: - kwargs["ContinuationToken"] = continuation_token - resp = self.s3.list_objects_v2(**kwargs) - contents = resp.get("Contents", []) - for obj in contents: - full_key = obj["Key"] - filename = posixpath.basename(full_key) - try: - logical_key, indexes = _parse_filename(filename) - except ValueError: - continue - if prefix and not logical_key.startswith(prefix): - continue - results.append({ - "s3_key": full_key, - "filename": filename, - "key": logical_key, - "indexes": indexes, - "size": obj.get("Size", 0), - "last_modified": obj.get("LastModified"), - }) - if not resp.get("IsTruncated"): - break - continuation_token = resp.get("NextContinuationToken") - - return results - - def _match_indexes(self, item_indexes: Dict[str, str], filt: Dict[str, Any]) -> bool: - for fk, fv in filt.items(): - if fk not in item_indexes: - return False - val = item_indexes[fk] - if isinstance(fv, (list, tuple, set)): - if val not in {str(x) for x in fv}: - return False - elif isinstance(fv, re.Pattern): - if not fv.search(val): - return False - else: - if val != str(fv): - return False - return True - - def get(self, key: str, index_filter: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: - matches = self._find_objects_for_key(key, index_filter=index_filter) - if not matches: - raise KeyError(f"key not found: {key} (filter={index_filter})") - if len(matches) > 1: - raise ValueError(f"multiple objects match key={key}; refine using index_filter: {matches}") - s3_key = matches[0]["s3_key"] - try: - resp = self.s3.get_object(Bucket=self.bucket, Key=s3_key) - body = resp["Body"].read() - return json.loads(body.decode("utf-8")) - except ClientError as e: - raise IOError(f"s3 get_object failed: {e}") - - def put(self, key: str, value: Dict[str, Any], indexes: Optional[Dict[str, Any]] = None, overwrite: bool = False) -> str: - if overwrite: - existing = self._find_objects_for_key(key) - for obj in existing: - self.s3.delete_object(Bucket=self.bucket, Key=obj["s3_key"]) - - filename = _build_filename(key, {k: str(v) for k, v in (indexes or {}).items()}) - s3_key = self._s3_key_for_filename(filename) - if not overwrite: - try: - self.s3.head_object(Bucket=self.bucket, Key=s3_key) - raise FileExistsError(f"object already exists: {s3_key}") - except ClientError as e: - if e.response["Error"]["Code"] not in ("404", "NotFound", "NoSuchKey"): - raise - - payload = json.dumps(value, ensure_ascii=False).encode("utf-8") - 
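# --- Editor's sketch (not part of the original file) -------------------------
# How _build_filename()/_parse_filename() above encode a logical key plus its
# index key=value pairs into an object name: each part is percent-encoded and
# joined with '__i__', then suffixed with '.json'. Key and index values are
# made-up examples.
from urllib.parse import quote, unquote

logical_key = 'model/123'
indexes = {'stage': 'prod', 'region': 'eu-de'}

parts = [quote(logical_key, safe='')] + [
    f"{quote(k, safe='')}={quote(str(v), safe='')}" for k, v in sorted(indexes.items())
]
filename = '__i__'.join(parts) + '.json'
print(filename)                      # model%2F123__i__region=eu-de__i__stage=prod.json

core = filename[:-len('.json')]      # decoding reverses the steps
encoded_key, *index_parts = core.split('__i__')
print(unquote(encoded_key))                                     # model/123
print({unquote(k): unquote(v)
       for k, v in (p.split('=', 1) for p in index_parts)})     # {'region': 'eu-de', 'stage': 'prod'}
# -----------------------------------------------------------------------------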
self.s3.put_object(Bucket=self.bucket, Key=s3_key, Body=payload, ContentType="application/json")
-        return s3_key
-
-    def update(self, key: str, value: Dict[str, Any], index_filter: Optional[Dict[str, Any]] = None, new_indexes: Optional[Dict[str, Any]] = None) -> str:
-        matches = self._find_objects_for_key(key, index_filter=index_filter)
-        if not matches:
-            raise KeyError(f"no object matches key={key} index_filter={index_filter}")
-        if len(matches) > 1:
-            raise ValueError(f"multiple objects match key={key} index_filter={index_filter}: {matches}")
-
-        old = matches[0]
-        target_indexes = new_indexes if new_indexes is not None else old["indexes"]
-        new_filename = _build_filename(key, {k: str(v) for k, v in (target_indexes or {}).items()})
-        new_s3_key = self._s3_key_for_filename(new_filename)
-        payload = json.dumps(value, ensure_ascii=False).encode("utf-8")
-        self.s3.put_object(Bucket=self.bucket, Key=new_s3_key, Body=payload, ContentType="application/json")
-        if old["s3_key"] != new_s3_key:
-            self.s3.delete_object(Bucket=self.bucket, Key=old["s3_key"])
-        return new_s3_key
-
-    def delete(self, key: str, index_filter: Optional[Dict[str, Any]] = None) -> int:
-        matches = self._find_objects_for_key(key, index_filter=index_filter)
-        count = 0
-        for obj in matches:
-            self.s3.delete_object(Bucket=self.bucket, Key=obj["s3_key"])
-            count += 1
-        return count
-
-    def search(self, index_filter: Dict[str, Any]) -> List[Dict[str, Any]]:
-        all_items = self.list()
-        return [it for it in all_items if self._match_indexes(it["indexes"], index_filter)]
-
-    def _find_objects_for_key(self, key: str, index_filter: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
-        candidates = self.list(prefix=key)
-        if index_filter is None:
-            return candidates
-        return [c for c in candidates if self._match_indexes(c["indexes"], index_filter)]
-
-
-# ---------------- CLI ----------------
-def main():
-    parser = argparse.ArgumentParser(description="S3 KV Store CLI")
-    parser.add_argument("bucket")
-    parser.add_argument("store")
-    parser.add_argument("--endpoint")
-
-    sub = parser.add_subparsers(dest="cmd", required=True)
-
-    # put
-    sp = sub.add_parser("put")
-    sp.add_argument("key")
-    sp.add_argument("--indexes", type=json.loads, default="{}")
-    sp.add_argument("--value")
-    sp.add_argument("--value-file")
-    sp.add_argument("--overwrite", action="store_true")
-
-    # get
-    sp = sub.add_parser("get")
-    sp.add_argument("key")
-    sp.add_argument("--filter", type=json.loads, default="{}")
-
-    # update
-    sp = sub.add_parser("update")
-    sp.add_argument("key")
-    sp.add_argument("--filter", type=json.loads, default="{}")
-    sp.add_argument("--new-indexes", type=json.loads, default=None)
-    sp.add_argument("--value")
-    sp.add_argument("--value-file")
-
-    # delete
-    sp = sub.add_parser("delete")
-    sp.add_argument("key")
-    sp.add_argument("--filter", type=json.loads, default="{}")
-
-    # list
-    sp = sub.add_parser("list")
-    sp.add_argument("--prefix")
-
-    # search
-    sp = sub.add_parser("search")
-    sp.add_argument("--filter", type=json.loads, required=True)
-
-    args = parser.parse_args()
-    store = S3KVStore(bucket=args.bucket, store_name=args.store, endpoint_url=args.endpoint)
-
-    if args.cmd == "put":
-        if args.value_file:
-            value = json.load(open(args.value_file))
-        else:
-            value = json.loads(args.value)
-        key = store.put(args.key, value, indexes=args.indexes, overwrite=args.overwrite)
-        print(key)
-
-    elif args.cmd == "get":
-        value = store.get(args.key, index_filter=args.filter)
-        print(json.dumps(value, indent=2))
-
-    elif args.cmd ==
"update": - if args.value_file: - value = json.load(open(args.value_file)) - else: - value = json.loads(args.value) - key = store.update(args.key, value, index_filter=args.filter, new_indexes=args.new_indexes) - print(key) - - elif args.cmd == "delete": - count = store.delete(args.key, index_filter=args.filter) - print(f"Deleted {count} object(s)") - - elif args.cmd == "list": - items = store.list(prefix=args.prefix) - print(json.dumps(items, indent=2, default=str)) - - elif args.cmd == "search": - items = store.search(args.filter) - print(json.dumps(items, indent=2, default=str)) - - -if __name__ == "__main__": - main() diff --git a/build/lib/scripts/claimed b/build/lib/scripts/claimed deleted file mode 100755 index 1aefe327..00000000 --- a/build/lib/scripts/claimed +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/bash -if [ -z "$1" ]; then - echo "Usage: $0 --component component-name/image-location [--component-parameters-name component-parameters-value ...]" - echo "Example: claimed --component blumenstiel/claimed-generate-random-numbers --num_random 5" - echo "or (deprecated)" - echo "Usage: $0 component-name/image-location [component-parameters-name=component-parameters-value ...]" - echo "Example: claimed blumenstiel/claimed-generate-random-numbers num_random=5" - exit 1 -fi - -# Create operator and gridwrapper with C3 -if [ $1 = "create" ]; then - if ! command -v c3_create_operator &> /dev/null; then - echo "CLAIMED C3 not found. Install with 'pip install claimed-c3'" - exit 1 - fi - if [ $2 = "operator" ]; then - c3_create_operator ${@:3} - elif [ $2 = "gridwrapper" ]; then - c3_create_gridwrapper ${@:3} - else - echo "C3 can only create 'operator' and 'gridwrapper'." - exit 1 - fi - exit 0 -fi - -# Nothing above matched, so we assume we want to run a component - -envs="" -if [[ $1 == "--component" ]]; then - image=$2 - shift 2 - for var in "$@"; do - if [[ $var == "--"* ]]; then - envs="${envs} --env ${var:2}=" - else - envs="${envs}${var}" - fi - done -else - echo "Assuming arguments contain = sign, you need to provide the --component to run, aborting" - exit 1 -fi - - - -if [[ "$image" != */* ]]; then - image=docker.io/claimed/$image -fi - -if [[ "$image" != *:* ]]; then - image=$image:latest -fi - -if [[ "$image" != *containerless* ]]; then - if [ -z ${CLAIMED_DATA_PATH+x} ]; then - echo "CLAIMED_DATA_PATH variable not set, not mounting /data to the CLAIMED component" - docker run $envs $image - else - echo "CLAIMED_DATA_PATH variable is set, mounting $CLAIMED_DATA_PATH to /opt/app-root/src/data" - docker run $envs -u 0 -v `echo $CLAIMED_DATA_PATH`:/opt/app-root/src/data:z $image - fi -else - echo "Entering containerless operation" - if [ -z ${CLAIMED_CONTAINERLESS_OPERATOR_PATH+x} ]; then - echo "CLAIMED_CONTAINERLESS_OPERATOR_PATH not set, aborting" - exit 1 - else - containerlesscomponentpath=`sed "s/containerless//g" <<< "$image"` - containerlesscomponentpath=`sed "s/:/./g" <<< "$containerlesscomponentpath"` - containerlesscomponent=$containerlesscomponentpath"/runnable.py" - command="python "$CLAIMED_CONTAINERLESS_OPERATOR_PATH"/"$containerlesscomponent" "$envs - echo "Executing: "$command - source $CLAIMED_CONTAINERLESS_OPERATOR_PATH'/'$containerlesscomponentpath"/claimedenv/bin/activate" - chmod 755 $CLAIMED_CONTAINERLESS_OPERATOR_PATH"/"$containerlesscomponent - $command - fi -fi diff --git a/dist/claimed-0.2.2-py3-none-any.whl b/dist/claimed-0.2.2-py3-none-any.whl deleted file mode 100644 index b911fbfc..00000000 Binary files a/dist/claimed-0.2.2-py3-none-any.whl and /dev/null 
differ diff --git a/dist/claimed-0.2.2.tar.gz b/dist/claimed-0.2.2.tar.gz deleted file mode 100644 index bdf8e335..00000000 Binary files a/dist/claimed-0.2.2.tar.gz and /dev/null differ diff --git a/src/claimed.egg-info/PKG-INFO b/src/claimed.egg-info/PKG-INFO deleted file mode 100644 index 9363b56a..00000000 --- a/src/claimed.egg-info/PKG-INFO +++ /dev/null @@ -1,289 +0,0 @@ -Metadata-Version: 2.4 -Name: claimed -Version: 0.2.2 -Summary: The CLAIMED framework -Home-page: https://github.com/claimed-framework/component-library -Author: The CLAIMED authors -Author-email: The CLAIMED authors -Maintainer: Benedikt Blumenstiel -Maintainer-email: Romeo Kienzler -License: Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. 
The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright 2025 The Linux Foundation AI - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - -Project-URL: Homepage, https://github.com/claimed-framework/c3 -Project-URL: Bug Tracker, https://github.com/claimed-framework/c3/issues -Keywords: CLAIMED,compiler,KubeFlow,Kubernetes -Classifier: Programming Language :: Python :: 3 -Classifier: License :: OSI Approved :: Apache Software License -Classifier: Operating System :: OS Independent -Requires-Python: >=3.7 -Description-Content-Type: text/markdown -License-File: LICENSE -Requires-Dist: nbconvert>=7.9.2 -Requires-Dist: ipython>=8.16.1 -Requires-Dist: traitlets>=5.11.2 -Requires-Dist: pandas -Requires-Dist: boto3 -Dynamic: author -Dynamic: home-page -Dynamic: license-file - -[![OpenSSF Best Practices](https://bestpractices.coreinfrastructure.org/projects/6718/badge)](https://bestpractices.coreinfrastructure.org/projects/6718) -[![GitHub](https://img.shields.io/badge/issue_tracking-github-blue.svg)](https://github.com/claimed-framework/component-library/issues) - - - -# C3 - the CLAIMED Component Compiler - -**TL;DR** -- takes arbitrary assets (Jupyter notebooks, python scripts, R scripts) as input -- automatically creates container images and pushes to container registries -- automatically installs all required dependencies into the container image -- creates KubeFlow Pipeline components (target workflow execution engines are pluggable) -- creates Kubernetes job configs for execution on Kubernetes/Openshift clusters -- can be triggered from CICD pipelines - - -To learn more on how this library works in practice, please have a look at the following [video](https://www.youtube.com/watch?v=FuV2oG55C5s) - -## Related work -[Ploomber](https://github.com/ploomber/ploomber) - -[Orchest](https://www.orchest.io/) - -## Getting started - -### Install - -```sh -pip install claimed -``` - -### Usage - -Just run the following command with your python script or notebook: -```sh -c3_create_operator ".py" --repository "/" -``` - -Your code needs to follow certain requirements which are explained in [Getting Started](https://github.com/claimed-framework/c3/blob/main/GettingStarted.md). - - -## Getting Help - -```sh -c3_create_operator --help -``` - -We welcome your questions, ideas, and feedback. Please create an [issue](https://github.com/claimed-framework/component-library/issues) or a [discussion thread](https://github.com/claimed-framework/component-library/discussions). 
-Please see [VULNERABILITIES.md](VULNERABILITIES.md) for reporting vulnerabilities. - -## Contributing to CLAIMED -Interested in helping make CLAIMED better? We encourage you to take a look at our -[Contributing](CONTRIBUTING.md) page. - -## Credits - -CLAIMED is supported by the EU’s Horizon Europe program under Grant Agreement number 101131841 and also received funding from the Swiss State Secretariat for Education, Research and Innovation (SERI) and the UK Research and Innovation (UKRI). - -## License -This software is released under Apache License v2.0. diff --git a/src/claimed.egg-info/SOURCES.txt b/src/claimed.egg-info/SOURCES.txt deleted file mode 100644 index 9552be5d..00000000 --- a/src/claimed.egg-info/SOURCES.txt +++ /dev/null @@ -1,314 +0,0 @@ -.gitignore -CHANGELOG.md -CODEOWNERS -CODE_OF_CONDUCT.md -CONTRIBUTING.md -FAQ.md -GettingStarted.md -LICENSE -MAINTAINERS.md -OWNERS -README.md -SECURITY.md -SUPPORT.md -VULNERABILITIES.md -contribution_process.md -pyproject.toml -release_process.md -setup.py -test_requirements.txt -.github/build_operators.sh -.github/build_operators_commits.txt -.github/pull_request_template.md -.github/ISSUE_TEMPLATE/-report-a-vulnerability-.md -.github/ISSUE_TEMPLATE/bug_report.md -.github/ISSUE_TEMPLATE/feature_request.md -.github/ISSUE_TEMPLATE/issue-report.md -.github/workflows/build_operators.yaml -.github/workflows/build_push_container_image.yaml -.github/workflows/integration-test.yml -.github/workflows/publish-pypi.yml -.github/workflows/pylint.yml -.github/workflows/python-app.yml -.github/workflows/python-package-conda.yml -.github/workflows/python-publish.yml -artwork/LICENSE -artwork/README.md -artwork/black/claimed-black.png -artwork/black/claimed-black.svg -artwork/color/claimed-color.png -artwork/color/claimed-color.svg -artwork/white/claimed-white.png -artwork/white/claimed-white.svg -claimed/__init__.py -claimed/claimed_utils.py -claimed/generic-notebook-runner.ipynb -claimed/run_tests.py -claimed/voila-notebook-runner.ipynb -claimed/akfire/README.md -claimed/akfire/akfire_claimed_dag.py -claimed/akfire/build_components.sh -claimed/akfire/config.json -claimed/akfire/operators/create_training_zarr.cwl -claimed/akfire/operators/create_training_zarr.py -claimed/akfire/operators/logistic_prediction.cwl -claimed/akfire/operators/logistic_prediction.py -claimed/akfire/operators/optimize_xgb_hyperparameters_from_df.cwl -claimed/akfire/operators/optimize_xgb_hyperparameters_from_df.py -claimed/akfire/operators/train_logistic.cwl -claimed/akfire/operators/train_logistic.py -claimed/akfire/operators/training_xgboost.cwl -claimed/akfire/operators/training_xgboost.py -claimed/akfire/operators/xgboost_prediction.cwl -claimed/akfire/operators/xgboost_prediction.py -claimed/analyze/spark-ts-trends.ipynb -claimed/analyze/spark-ts-trends.yaml -claimed/anomaly/anomaly-score-unsupervised.ipynb -claimed/anomaly/anomaly-score-unsupervised/Dockerfile -claimed/anomaly/anomaly-score-unsupervised/build.sh -claimed/anomaly/anomaly-score-unsupervised/test-anomaly-score-unsupervised.ipynb -claimed/anomaly/anomaly-score-unsupervised/watsoniotp.broken.phase_aligned.pickle -claimed/anomaly/anomaly-score-unsupervised/watsoniotp.healthy.phase_aligned.pickle -claimed/anomaly/anomaly-score-unsupervised/dapr/publish_event.sh -claimed/anomaly/anomaly-score-unsupervised/dapr/pubsub.yaml -claimed/anomaly/anomaly-score-unsupervised/dapr/start_sidecar.sh -claimed/anomaly/anomaly-score-unsupervised/dapr/start_subscriber.sh 
-claimed/anomaly/anomaly-score-unsupervised/dapr/statestore.yaml -claimed/anomaly/anomaly-score-unsupervised/dapr/subscriber.py -claimed/anomaly/anomaly-score-unsupervised/dapr/subscription.yaml -claimed/checkpoint/pull_asset.ipynb -claimed/checkpoint/pull_asset.yaml -claimed/checkpoint/store_asset.ipynb -claimed/checkpoint/store_asset.yaml -claimed/deploy/README.md -claimed/deploy/condition-blessing.ipynb -claimed/deploy/condition-blessing.yaml -claimed/deploy/deploy-kfserving.ipynb -claimed/deploy/deploy_watson_machine_learning.ipynb -claimed/deploy/deploy_watson_machine_learning.yaml -claimed/deploy/deploy_wml_pmml.ipynb -claimed/deploy/deploy_wml_pmml.yaml -claimed/examples/alert_for_content_in_url.cwl -claimed/examples/alert_for_content_in_url.ipynb -claimed/examples/alert_for_content_in_url.job.yaml -claimed/examples/alert_for_content_in_url.yaml -claimed/examples/fibonacci.cwl -claimed/examples/fibonacci.job.yaml -claimed/examples/fibonacci.py -claimed/examples/fibonacci.yaml -claimed/examples/hello_world.ipynb -claimed/examples/hello_world.job.yaml -claimed/examples/hello_world.yaml -claimed/filter/README.md -claimed/filter/filter.cwl -claimed/filter/filter.ipynb -claimed/filter/filter.job.yaml -claimed/filter/filter.yaml -claimed/filter/filter_docker.cwl -claimed/filter/spark-sample.ipynb -claimed/geo/gdal.ipynb -claimed/input/README.md -claimed/input/input-Xview-download.cwl -claimed/input/input-Xview-download.ipynb -claimed/input/input-Xview-download.job.yaml -claimed/input/input-Xview-download.yaml -claimed/input/input-climate-copernicus.ipynb -claimed/input/input-climate-copernicus.yaml -claimed/input/input-codenet-LangClass.ipynb -claimed/input/input-codenet-LangClass.yaml -claimed/input/input-cos-zarr.ipynb -claimed/input/input-covid-chestxray.ipynb -claimed/input/input-covid-chestxray.yaml -claimed/input/input-from-mongodb.ipynb -claimed/input/input-hmp.ipynb -claimed/input/input-hmp.yaml -claimed/input/input-mqtt.ipynb -claimed/input/input-pardata.ipynb -claimed/input/input-pardata.yaml -claimed/input/input-pei.ipynb -claimed/input/input-postgresql.ipynb -claimed/input/input-postgresql.yaml -claimed/input/input-rki-covid19-deaths.ipynb -claimed/input/input-swissmedic.ipynb -claimed/input/input-url.ipynb -claimed/input/input-url.yaml -claimed/input/input-webcam.ipynb -claimed/input/ls-cos.ipynb -claimed/metric/README.md -claimed/metric/metric-aif360.ipynb -claimed/metric/metric-aif360.yaml -claimed/metric/metric-aix360-lime.ipynb -claimed/metric/metric-aix360-lime.yaml -claimed/metric/metric-confusion-matrix.ipynb -claimed/metric/metric-confusion-matrix.yaml -claimed/monitoring/README.md -claimed/monitoring/notification-email.ipynb -claimed/monitoring/notification-email.yaml -claimed/nlp/nlp-classify-text-simple.ipynb -claimed/nlp/nlp-classify-text-simple.yaml -claimed/output/output-elastic.ipynb -claimed/output/output-rdbms-sqlalchemy.ipynb -claimed/output/output-rdbms-sqlalchemy.yaml -claimed/output/upload-to-cos-http-adapter.ipynb -claimed/output/upload-to-cos.cwl -claimed/output/upload-to-cos.ipynb -claimed/output/upload-to-cos.job.yaml -claimed/output/upload-to-cos.yaml -claimed/predict/README.md -claimed/predict/image-endpoint-tester.ipynb -claimed/predict/image-endpoint.ipynb -claimed/predict/predict-images.ipynb -claimed/predict/predict-images.yaml -claimed/predict/tvn2.ipynb -claimed/predict/yolo.ipynb -claimed/segment-anything/generate-masks.ipynb -claimed/segment-anything/generate-masks.yaml -claimed/segment-anything/get-masks.ipynb 
-claimed/segment-anything/get-masks.yaml -claimed/sim/wrf.ipynb -claimed/sim/wrf.yaml -claimed/train/README.md -claimed/train/nvflare.ipynb -claimed/train/spark-train-lr.ipynb -claimed/train/spark-train-lr.yaml -claimed/train/train-mobilenet_v2.ipynb -claimed/train/train-mobilenet_v2.yaml -claimed/transform/README.md -claimed/transform/alchemy-sql-query.ipynb -claimed/transform/cloud-object-store-housekeeping.ipynb -claimed/transform/cpdconfig.yaml -claimed/transform/ibm-sql-query-cpd-manual.yaml -claimed/transform/ibm-sql-query-cpd-test.ipynb -claimed/transform/ibm-sql-query-cpd.ipynb -claimed/transform/ibm-sql-query-cpd.yaml -claimed/transform/ibm-sql-query-test.ipynb -claimed/transform/ibm-sql-query.config -claimed/transform/ibm-sql-query.dockerfile -claimed/transform/ibm-sql-query.ipynb -claimed/transform/ibm-sql-query.secrets.template -claimed/transform/ibm-sql-query.yaml -claimed/transform/image-tiling-with-metadata_adjustment.cwl -claimed/transform/image-tiling-with-metadata_adjustment.cwl:Zone.Identifier -claimed/transform/image-tiling-with-metadata_adjustment.ipynb -claimed/transform/image-tiling-with-metadata_adjustment.ipynb:Zone.Identifier -claimed/transform/image-tiling-with-metadata_adjustment.job.yaml -claimed/transform/image-tiling-with-metadata_adjustment.job.yaml:Zone.Identifier -claimed/transform/image-tiling-with-metadata_adjustment.yaml -claimed/transform/image-tiling-with-metadata_adjustment.yaml:Zone.Identifier -claimed/transform/spark-condense-parquet.ipynb -claimed/transform/spark-condense-parquet.yaml -claimed/transform/spark-csv-to-parquet.ipynb -claimed/transform/spark-csv-to-parquet.yaml -claimed/transform/spark-json-to-parquet.ipynb -claimed/transform/spark-json-to-parquet.yaml -claimed/transform/spark-parquet-to-csv.ipynb -claimed/transform/spark-sql-interactive.ipynb -claimed/transform/spark-sql.ipynb -claimed/transform/spark-sql.yaml -claimed/transform/transform-apply.ipynb -claimed/transform/transform-images.ipynb -claimed/transform/transform-images.yaml -claimed/transform/spark-sql-interactive/Dockerfile -claimed/transform/spark-sql-interactive/app.py -claimed/transform/spark-sql-interactive/build.sh -claimed/util/__init__.py -claimed/util/cgw-util-cos-sync.ipynb -claimed/util/cosutils.ipynb -claimed/util/cosutils.py -claimed/util/sparksql-interactive.ipynb -claimed/visualize/map-from-coordinates.ipynb -claimed/visualize/timeseries-runchart.ipynb -claimed/visualize/timeseries-runchart.yaml -claimed/visualize/visualize-with-quickchart-mongodb.ipynb -claimed/visualize/visualize-with-quickchart.ipynb -docs/mmd/c3.mmd -docs/mmd/c3_examplified.mmd -docs/mmd/gridwrapper.mmd -docs/mmd/sotab4claimed.mmd -examples/example_rscript.R -examples/gw_simple_grid_wrapper_example.cwl -examples/gw_simple_grid_wrapper_example.job.yaml -examples/gw_simple_grid_wrapper_example.py -examples/gw_simple_grid_wrapper_example.yaml -examples/operator_example.cwl -examples/operator_example.ipynb -examples/operator_example.job.yaml -examples/operator_example.py -examples/operator_example.yaml -examples/pipeline_example.py -examples/simple_grid_wrapper_example.py -examples/workflow_example.cwl -examples/folder_grid_wrapper_example/folder_grid_wrapper_example.py -examples/folder_grid_wrapper_example/gw_folder_grid_wrapper_example.cwl -examples/folder_grid_wrapper_example/gw_folder_grid_wrapper_example.job.yaml -examples/folder_grid_wrapper_example/gw_folder_grid_wrapper_example.yaml -examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_1/1.txt 
-examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_1/2.txt -examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_1/3.txt -examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_2/1.txt -examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_2/2.txt -examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_2/3.txt -examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_3/1.txt -examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_3/2.txt -examples/folder_grid_wrapper_example/folder_grid_wrapper_source/folder_3/3.txt -examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_1.PROCESSED/1.txt -examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_1.PROCESSED/2.txt -examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_1.PROCESSED/3.txt -examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_2.PROCESSED/1.txt -examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_2.PROCESSED/2.txt -examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_2.PROCESSED/3.txt -examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_3.PROCESSED/1.txt -examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_3.PROCESSED/2.txt -examples/folder_grid_wrapper_example/folder_grid_wrapper_target/folder_3.PROCESSED/3.txt -examples/simple_grid_wrapper_source/1.txt -examples/simple_grid_wrapper_source/2.txt -examples/simple_grid_wrapper_source/3.txt -examples/simple_grid_wrapper_source_and_target/1.txt -examples/simple_grid_wrapper_source_and_target/2.txt -examples/simple_grid_wrapper_source_and_target/3.txt -examples/simple_grid_wrapper_target/1.PROCESSED.txt -examples/simple_grid_wrapper_target/2.PROCESSED.txt -examples/simple_grid_wrapper_target/3.PROCESSED.txt -src/setup.py -src/c3/__init__.py -src/c3/create_containerless_operator.py -src/c3/create_gridwrapper.py -src/c3/create_operator.py -src/c3/notebook.py -src/c3/operator_utils.py -src/c3/parser.py -src/c3/pythonscript.py -src/c3/rscript.py -src/c3/utils.py -src/c3/templates/R_dockerfile_template -src/c3/templates/__init__.py -src/c3/templates/component_setup_code.R -src/c3/templates/component_setup_code.py -src/c3/templates/component_setup_code_wo_logging.py -src/c3/templates/cos_grid_wrapper_template.py -src/c3/templates/cwl_component_template.cwl -src/c3/templates/folder_grid_wrapper_template.py -src/c3/templates/grid_wrapper_template.py -src/c3/templates/kfp_component_template.yaml -src/c3/templates/kubernetes_job_template.job.yaml -src/c3/templates/legacy_cos_grid_wrapper_template.py -src/c3/templates/python_dockerfile_template -src/c3/templates/s3kv_grid_wrapper_template.py -src/c3/templates/simple_grid_wrapper_template.py -src/claimed/__init__.py -src/claimed/claimed.py -src/claimed.egg-info/PKG-INFO -src/claimed.egg-info/SOURCES.txt -src/claimed.egg-info/dependency_links.txt -src/claimed.egg-info/entry_points.txt -src/claimed.egg-info/not-zip-safe -src/claimed.egg-info/requires.txt -src/claimed.egg-info/top_level.txt -src/mlx/__init__.py -src/mlx/cos_backend.py -src/mlx/s3_kv_store.py -tests/example_notebook.ipynb -tests/example_rscript.R -tests/example_script.py -tests/test_compiler.py -tests/test_operator_utils.py \ No newline at end of file diff --git a/src/claimed.egg-info/dependency_links.txt b/src/claimed.egg-info/dependency_links.txt deleted file mode 100644 index 8b137891..00000000 --- 
a/src/claimed.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/claimed.egg-info/entry_points.txt b/src/claimed.egg-info/entry_points.txt deleted file mode 100644 index 7bad7810..00000000 --- a/src/claimed.egg-info/entry_points.txt +++ /dev/null @@ -1,5 +0,0 @@ -[console_scripts] -c3_create_containerless_operator = c3.create_containerless_operator:main -c3_create_gridwrapper = c3.create_gridwrapper:main -c3_create_operator = c3.create_operator:main -claimed = claimed.claimed:main diff --git a/src/claimed.egg-info/not-zip-safe b/src/claimed.egg-info/not-zip-safe deleted file mode 100644 index 8b137891..00000000 --- a/src/claimed.egg-info/not-zip-safe +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/claimed.egg-info/requires.txt b/src/claimed.egg-info/requires.txt deleted file mode 100644 index d1fee80a..00000000 --- a/src/claimed.egg-info/requires.txt +++ /dev/null @@ -1,5 +0,0 @@ -nbconvert>=7.9.2 -ipython>=8.16.1 -traitlets>=5.11.2 -pandas -boto3 diff --git a/src/claimed.egg-info/top_level.txt b/src/claimed.egg-info/top_level.txt deleted file mode 100644 index 92a82100..00000000 --- a/src/claimed.egg-info/top_level.txt +++ /dev/null @@ -1,3 +0,0 @@ -c3 -claimed -mlx