diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 00000000..cc3e6fc6
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,38 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Describe the bug**
+
+A clear and concise description of what the bug is.
+
+**To Reproduce**
+
+Steps to reproduce the behavior:
+1. Go to '...'
+2. Click on '....'
+3. Scroll down to '....'
+4. See error
+
+**Expected behavior**
+
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+
+If applicable, add screenshots to help explain your problem.
+
+**Environment**
+
+ - OS: [e.g. iOS]
+ - Browser [e.g. chrome, safari]
+ - MLX Version [e.g. 0.1.0]
+
+**Additional context**
+
+Add any other context about the problem here.
diff --git a/.github/workflows/publish-pypi.yml b/.github/workflows/publish-pypi.yml
new file mode 100644
index 00000000..935daa9a
--- /dev/null
+++ b/.github/workflows/publish-pypi.yml
@@ -0,0 +1,37 @@
+name: Build and publish Python package
+
+on:
+  push:
+    tags:
+      - 'v*'  # publish only on tag pushes like v1.2.3
+
+jobs:
+  build-and-publish:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # necessary for setuptools_scm to see tags
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+
+      - name: Install build & twine
+        run: python -m pip install --upgrade pip build twine setuptools_scm
+
+      - name: Build sdist and wheel
+        run: python -m build --sdist --wheel
+
+      - name: Check artifacts
+        run: ls -l dist
+
+      - name: Publish to PyPI
+        env:
+          TWINE_USERNAME: __token__
+          TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
+        run: |
+          python -m twine check dist/*
+          python -m twine upload dist/* --non-interactive
diff --git a/.gitignore b/.gitignore
index c1e7a4aa..5fbd6c8e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
 venv/
 .venv/
 __pycache__
-.ipynb_checkpoints/
\ No newline at end of file
+.ipynb_checkpoints/
diff --git a/MAINTAINERS.md b/MAINTAINERS.md
new file mode 100644
index 00000000..10f62353
--- /dev/null
+++ b/MAINTAINERS.md
@@ -0,0 +1,8 @@
+# MAINTAINERS
+
+The following is the current list of maintainers of this project.
+
+The maintainers are listed in alphabetical order.
+
+- [@GlennVerhaag](https://github.com/GlennVerhaag) (Glenn Verhaag)
+- [@romeokienzler](https://github.com/romeokienzler) (Romeo Kienzler)
\ No newline at end of file
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 00000000..6d386309
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1,12 @@
+approvers:
+  - animeshsingh
+  - ckadner
+  - Tomcli
+  - drewbutlerbb4
+  - yhwang
+reviewers:
+  - animeshsingh
+  - ckadner
+  - Tomcli
+  - drewbutlerbb4
+  - yhwang
diff --git a/README.md b/README.md
index b6f6a552..68c5fd10 100644
--- a/README.md
+++ b/README.md
@@ -57,4 +57,4 @@ Interested in helping make CLAIMED better? We encourage you to take a look at ou
 CLAIMED is supported by the EU’s Horizon Europe program under Grant Agreement number 101131841 and also received funding from the Swiss State Secretariat for Education, Research and Innovation (SERI) and the UK Research and Innovation (UKRI).
 
 ## License
-This software is released under Apache License v2.0.
\ No newline at end of file
+This software is released under Apache License v2.0.
diff --git a/pyproject.toml b/pyproject.toml
index 72b8216a..d8440d0c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,4 +47,4 @@ c3_create_gridwrapper = "c3.create_gridwrapper:main"
 where = ["src"]
 
 [tool.setuptools.package-data]
-"c3.templates" = ["*"]
\ No newline at end of file
+"c3.templates" = ["*"]
diff --git a/src/mlx/__init__.py b/src/mlx/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/mlx/cos_backend.py b/src/mlx/cos_backend.py
new file mode 100644
index 00000000..03454120
--- /dev/null
+++ b/src/mlx/cos_backend.py
@@ -0,0 +1,111 @@
+import json
+import boto3
+import jsonschema
+from jsonschema import validate
+import os
+
+class COSKVStore:
+    def __init__(self, bucket_name, schema, cos_client=None):
+        """
+        Initialize the COS Key-Value store.
+        :param bucket_name: Name of the COS bucket.
+        :param schema: JSON Schema to validate values.
+        :param cos_client: Optional COS client instance (for dependency injection).
+        """
+        self.bucket_name = bucket_name
+        self.schema = schema
+        self.cos_client = cos_client or boto3.client('s3')
+
+    def put(self, key, value):
+        """
+        Store a value in COS after validating against the JSON schema.
+        :param key: The key under which the value is stored.
+        :param value: The value to store (must be JSON-serializable).
+        """
+        try:
+            validate(instance=value, schema=self.schema)
+        except jsonschema.exceptions.ValidationError as e:
+            raise ValueError(f"Validation error: {e.message}")
+        self.cos_client.put_object(
+            Bucket=self.bucket_name,
+            Key=key,
+            Body=json.dumps(value)
+        )
+
+    def get(self, key):
+        """
+        Retrieve a value from COS.
+        :param key: The key to retrieve.
+        :return: The stored value as a dictionary.
+        """
+        try:
+            response = self.cos_client.get_object(Bucket=self.bucket_name, Key=key)
+            return json.loads(response['Body'].read().decode('utf-8'))
+        except self.cos_client.exceptions.NoSuchKey:
+            raise KeyError(f"Key '{key}' not found in bucket '{self.bucket_name}'")
+
+    def delete(self, key):
+        """
+        Delete a key-value pair from COS.
+        :param key: The key to delete.
+        """
+        self.cos_client.delete_object(Bucket=self.bucket_name, Key=key)
+
+    def list_keys(self):
+        """
+        List all keys in the COS bucket.
+        :return: A list of keys.
+        """
+        response = self.cos_client.list_objects_v2(Bucket=self.bucket_name)
+        return [obj['Key'] for obj in response.get('Contents', [])]
+
+def load_schemas(schema_folder):
+    """
+    Loads all JSON schemas from the given folder.
+    :param schema_folder: Path to the folder containing JSON schema files.
+    :return: A dictionary of schema names and their corresponding JSON objects.
+ """ + schemas = {} + for filename in os.listdir(schema_folder): + if filename.endswith(".json"): + filepath = os.path.join(schema_folder, filename) + with open(filepath, 'r') as f: + schemas[filename[:-5]] = json.load(f) + return schemas + +# Example Usage +if __name__ == "__main__": + schema_folder = "../schemas" + schemas = load_schemas(schema_folder) + + if "example_schema" in schemas: + example_schema = schemas["example_schema"] + cos_store = COSKVStore("my-cos-bucket", example_schema) + + test_data = { + "id": "model_123", + "name": "MyModel", + "framework": "TensorFlow", + "version": "2.10", + "description": "A simple neural network model.", + "metrics": { + "accuracy": 0.95, + "loss": 0.1 + } + } + cos_store.put("model_123", test_data) + print(cos_store.get("model_123")) + + #Example data without the metric key, which is not required + test_data_no_metrics = { + "id": "model_456", + "name": "MyOtherModel", + "framework": "PyTorch", + "version": "1.12", + "description": "Another neural network model.", + } + cos_store.put("model_456", test_data_no_metrics) + print(cos_store.get("model_456")) + + else: + print("Schema 'example_schema' not found.") \ No newline at end of file diff --git a/src/mlx/s3_kv_store.py b/src/mlx/s3_kv_store.py new file mode 100644 index 00000000..40b327cd --- /dev/null +++ b/src/mlx/s3_kv_store.py @@ -0,0 +1,271 @@ +import json +import posixpath +import re +import argparse +from typing import Optional, Dict, List, Any, Tuple +from urllib.parse import quote, unquote +import boto3 +from botocore.exceptions import ClientError + +INDEX_SEPARATOR = "__i__" +KV_SEPARATOR = "=" +FILENAME_SUFFIX = ".json" + + +def _encode_component(s: str) -> str: + return quote(s, safe="") + + +def _decode_component(s: str) -> str: + return unquote(s) + + +def _build_filename(key: str, indexes: Optional[Dict[str, str]] = None) -> str: + parts = [_encode_component(key)] + if indexes: + for k in sorted(indexes.keys()): + v = indexes[k] + parts.append(f"{_encode_component(k)}{KV_SEPARATOR}{_encode_component(str(v))}") + return INDEX_SEPARATOR.join(parts) + FILENAME_SUFFIX + + +def _parse_filename(filename: str) -> Tuple[str, Dict[str, str]]: + if not filename.endswith(FILENAME_SUFFIX): + raise ValueError("invalid filename (missing .json suffix)") + core = filename[:-len(FILENAME_SUFFIX)] + parts = core.split(INDEX_SEPARATOR) + if not parts: + raise ValueError("invalid filename") + key = _decode_component(parts[0]) + indexes: Dict[str, str] = {} + for p in parts[1:]: + if KV_SEPARATOR not in p: + continue + k_enc, v_enc = p.split(KV_SEPARATOR, 1) + k = _decode_component(k_enc) + v = _decode_component(v_enc) + indexes[k] = v + return key, indexes + + +class S3KVStore: + def __init__(self, bucket: str, store_name: str, s3_client: Optional[Any] = None, endpoint_url: Optional[str] = None, aws_access_key_id: Optional[str] = None, aws_secret_access_key: Optional[str] = None): + self.bucket = bucket + self.store_name = store_name.strip("/") + if s3_client is None: + self.s3 = boto3.client( + "s3", + endpoint_url=endpoint_url, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + ) + else: + self.s3 = s3_client + + def _prefix(self) -> str: + return f"{self.store_name}/" if self.store_name else "" + + def _s3_key_for_filename(self, filename: str) -> str: + return posixpath.join(self._prefix(), filename) + + def list(self, prefix: Optional[str] = None, max_keys: int = 1000) -> List[Dict[str, Any]]: + s3_prefix = self._prefix() + continuation_token = None + 
+        results: List[Dict[str, Any]] = []
+
+        while True:
+            kwargs = {"Bucket": self.bucket, "Prefix": s3_prefix, "MaxKeys": max_keys}
+            if continuation_token:
+                kwargs["ContinuationToken"] = continuation_token
+            resp = self.s3.list_objects_v2(**kwargs)
+            contents = resp.get("Contents", [])
+            for obj in contents:
+                full_key = obj["Key"]
+                filename = posixpath.basename(full_key)
+                try:
+                    logical_key, indexes = _parse_filename(filename)
+                except ValueError:
+                    continue
+                if prefix and not logical_key.startswith(prefix):
+                    continue
+                results.append({
+                    "s3_key": full_key,
+                    "filename": filename,
+                    "key": logical_key,
+                    "indexes": indexes,
+                    "size": obj.get("Size", 0),
+                    "last_modified": obj.get("LastModified"),
+                })
+            if not resp.get("IsTruncated"):
+                break
+            continuation_token = resp.get("NextContinuationToken")
+
+        return results
+
+    def _match_indexes(self, item_indexes: Dict[str, str], filt: Dict[str, Any]) -> bool:
+        for fk, fv in filt.items():
+            if fk not in item_indexes:
+                return False
+            val = item_indexes[fk]
+            if isinstance(fv, (list, tuple, set)):
+                if val not in {str(x) for x in fv}:
+                    return False
+            elif isinstance(fv, re.Pattern):
+                if not fv.search(val):
+                    return False
+            else:
+                if val != str(fv):
+                    return False
+        return True
+
+    def get(self, key: str, index_filter: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        matches = self._find_objects_for_key(key, index_filter=index_filter)
+        if not matches:
+            raise KeyError(f"key not found: {key} (filter={index_filter})")
+        if len(matches) > 1:
+            raise ValueError(f"multiple objects match key={key}; refine using index_filter: {matches}")
+        s3_key = matches[0]["s3_key"]
+        try:
+            resp = self.s3.get_object(Bucket=self.bucket, Key=s3_key)
+            body = resp["Body"].read()
+            return json.loads(body.decode("utf-8"))
+        except ClientError as e:
+            raise IOError(f"s3 get_object failed: {e}")
+
+    def put(self, key: str, value: Dict[str, Any], indexes: Optional[Dict[str, Any]] = None, overwrite: bool = False) -> str:
+        if overwrite:
+            existing = self._find_objects_for_key(key)
+            for obj in existing:
+                self.s3.delete_object(Bucket=self.bucket, Key=obj["s3_key"])
+
+        filename = _build_filename(key, {k: str(v) for k, v in (indexes or {}).items()})
+        s3_key = self._s3_key_for_filename(filename)
+        if not overwrite:
+            try:
+                self.s3.head_object(Bucket=self.bucket, Key=s3_key)
+                raise FileExistsError(f"object already exists: {s3_key}")
+            except ClientError as e:
+                if e.response["Error"]["Code"] not in ("404", "NotFound", "NoSuchKey"):
+                    raise
+
+        payload = json.dumps(value, ensure_ascii=False).encode("utf-8")
+        self.s3.put_object(Bucket=self.bucket, Key=s3_key, Body=payload, ContentType="application/json")
+        return s3_key
+
+    def update(self, key: str, value: Dict[str, Any], index_filter: Optional[Dict[str, Any]] = None, new_indexes: Optional[Dict[str, Any]] = None) -> str:
+        matches = self._find_objects_for_key(key, index_filter=index_filter)
+        if not matches:
+            raise KeyError(f"no object matches key={key} index_filter={index_filter}")
+        if len(matches) > 1:
+            raise ValueError(f"multiple objects match key={key} index_filter={index_filter}: {matches}")
+
+        old = matches[0]
+        target_indexes = new_indexes if new_indexes is not None else old["indexes"]
+        new_filename = _build_filename(key, {k: str(v) for k, v in (target_indexes or {}).items()})
+        new_s3_key = self._s3_key_for_filename(new_filename)
+        payload = json.dumps(value, ensure_ascii=False).encode("utf-8")
+        self.s3.put_object(Bucket=self.bucket, Key=new_s3_key, Body=payload, ContentType="application/json")
if old["s3_key"] != new_s3_key: + self.s3.delete_object(Bucket=self.bucket, Key=old["s3_key"]) + return new_s3_key + + def delete(self, key: str, index_filter: Optional[Dict[str, Any]] = None) -> int: + matches = self._find_objects_for_key(key, index_filter=index_filter) + count = 0 + for obj in matches: + self.s3.delete_object(Bucket=self.bucket, Key=obj["s3_key"]) + count += 1 + return count + + def search(self, index_filter: Dict[str, Any]) -> List[Dict[str, Any]]: + all_items = self.list() + return [it for it in all_items if self._match_indexes(it["indexes"], index_filter)] + + def _find_objects_for_key(self, key: str, index_filter: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]: + candidates = self.list(prefix=key) + if index_filter is None: + return candidates + return [c for c in candidates if self._match_indexes(c["indexes"], index_filter)] + + +# ---------------- CLI ---------------- +def main(): + parser = argparse.ArgumentParser(description="S3 KV Store CLI") + parser.add_argument("bucket") + parser.add_argument("store") + parser.add_argument("--endpoint") + + sub = parser.add_subparsers(dest="cmd", required=True) + + # put + sp = sub.add_parser("put") + sp.add_argument("key") + sp.add_argument("--indexes", type=json.loads, default="{}") + sp.add_argument("--value") + sp.add_argument("--value-file") + sp.add_argument("--overwrite", action="store_true") + + # get + sp = sub.add_parser("get") + sp.add_argument("key") + sp.add_argument("--filter", type=json.loads, default="{}") + + # update + sp = sub.add_parser("update") + sp.add_argument("key") + sp.add_argument("--filter", type=json.loads, default="{}") + sp.add_argument("--new-indexes", type=json.loads, default=None) + sp.add_argument("--value") + sp.add_argument("--value-file") + + # delete + sp = sub.add_parser("delete") + sp.add_argument("key") + sp.add_argument("--filter", type=json.loads, default="{}") + + # list + sp = sub.add_parser("list") + sp.add_argument("--prefix") + + # search + sp = sub.add_parser("search") + sp.add_argument("--filter", type=json.loads, required=True) + + args = parser.parse_args() + store = MLX(bucket=args.bucket, store_name=args.store, endpoint_url=args.endpoint) + + if args.cmd == "put": + if args.value_file: + value = json.load(open(args.value_file)) + else: + value = json.loads(args.value) + key = store.put(args.key, value, indexes=args.indexes, overwrite=args.overwrite) + print(key) + + elif args.cmd == "get": + value = store.get(args.key, index_filter=args.filter) + print(json.dumps(value, indent=2)) + + elif args.cmd == "update": + if args.value_file: + value = json.load(open(args.value_file)) + else: + value = json.loads(args.value) + key = store.update(args.key, value, index_filter=args.filter, new_indexes=args.new_indexes) + print(key) + + elif args.cmd == "delete": + count = store.delete(args.key, index_filter=args.filter) + print(f"Deleted {count} object(s)") + + elif args.cmd == "list": + items = store.list(prefix=args.prefix) + print(json.dumps(items, indent=2, default=str)) + + elif args.cmd == "search": + items = store.search(args.filter) + print(json.dumps(items, indent=2, default=str)) + + +if __name__ == "__main__": + main() diff --git a/src/schema/dataset.json b/src/schema/dataset.json new file mode 100644 index 00000000..f08c87f8 --- /dev/null +++ b/src/schema/dataset.json @@ -0,0 +1,13 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "id": { "type": "string" }, + "name": { "type": "string" }, + "description": { 
"type": "string" }, + "size": { "type": "integer" }, + "format": { "type": "string" }, + "source": { "type": "string" } + }, + "required": ["id", "name", "description", "size", "format", "source"] +} diff --git a/src/schema/job.json b/src/schema/job.json new file mode 100644 index 00000000..f1bfe1ef --- /dev/null +++ b/src/schema/job.json @@ -0,0 +1,65 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://example.com/job-queue.schema.json", + "title": "JobQueueItem", + "description": "Schema for a job queue item", + "type": "object", + "required": ["id", "status", "created_at", "payload"], + "properties": { + "id": { + "type": "string", + "description": "Unique job identifier (UUID or similar)" + }, + "status": { + "type": "string", + "enum": ["queued", "running", "completed", "failed", "cancelled"], + "description": "Current status of the job" + }, + "priority": { + "type": "integer", + "description": "Optional priority (higher = more important)", + "default": 0 + }, + "created_at": { + "type": "string", + "format": "date-time", + "description": "Timestamp when the job was created" + }, + "started_at": { + "type": ["string", "null"], + "format": "date-time", + "description": "Timestamp when the job started" + }, + "finished_at": { + "type": ["string", "null"], + "format": "date-time", + "description": "Timestamp when the job finished" + }, + "retries": { + "type": "integer", + "minimum": 0, + "description": "How many times the job has been retried", + "default": 0 + }, + "max_retries": { + "type": "integer", + "minimum": 0, + "description": "Maximum number of retries allowed", + "default": 3 + }, + "error_message": { + "type": ["string", "null"], + "description": "Error message if the job failed" + }, + "payload": { + "type": "object", + "description": "Job-specific input data (opaque to queue)", + "additionalProperties": true + }, + "metadata": { + "type": "object", + "description": "Optional metadata for tracking, tags, etc.", + "additionalProperties": true + } + } +} diff --git a/src/schema/model.json b/src/schema/model.json new file mode 100644 index 00000000..8938ee8e --- /dev/null +++ b/src/schema/model.json @@ -0,0 +1,13 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "id": { "type": "string" }, + "name": { "type": "string" }, + "framework": { "type": "string" }, + "version": { "type": "string" }, + "description": { "type": "string" }, + "metrics": { "type": "object" } + }, + "required": ["id", "name", "framework", "version", "description"] +} diff --git a/src/schema/notebook.json b/src/schema/notebook.json new file mode 100644 index 00000000..547ea620 --- /dev/null +++ b/src/schema/notebook.json @@ -0,0 +1,12 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "id": { "type": "string" }, + "name": { "type": "string" }, + "description": { "type": "string" }, + "language": { "type": "string" }, + "kernel": { "type": "string" } + }, + "required": ["id", "name", "description", "language", "kernel"] +} diff --git a/src/schema/pipeline.json b/src/schema/pipeline.json new file mode 100644 index 00000000..e9aef1b0 --- /dev/null +++ b/src/schema/pipeline.json @@ -0,0 +1,14 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "id": { "type": "string" }, + "name": { "type": "string" }, + "description": { "type": "string" }, + "components": { + "type": "array", + "items": { "type": "string" } + } + }, + 
"required": ["id", "name", "description", "components"] +} \ No newline at end of file diff --git a/src/schema/pipeline_operator.json b/src/schema/pipeline_operator.json new file mode 100644 index 00000000..80004b03 --- /dev/null +++ b/src/schema/pipeline_operator.json @@ -0,0 +1,11 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "id": { "type": "string" }, + "name": { "type": "string" }, + "type": { "type": "string" }, + "parameters": { "type": "object" } + }, + "required": ["id", "name", "type"] +}