From 288c043473af9116336bb9f2842f62edc8b707e4 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 18 Aug 2025 17:05:53 +0200 Subject: [PATCH 1/6] introduce new component --- .../process_task_results/config.vsh.yaml | 120 +++++++++--------- src/reporting/process_task_results/main.nf | 44 +++++-- 2 files changed, 94 insertions(+), 70 deletions(-) diff --git a/src/reporting/process_task_results/config.vsh.yaml b/src/reporting/process_task_results/config.vsh.yaml index e1703bf52..677cb5e85 100644 --- a/src/reporting/process_task_results/config.vsh.yaml +++ b/src/reporting/process_task_results/config.vsh.yaml @@ -44,9 +44,66 @@ argument_groups: description: Nextflow execution trace file example: resources_test/openproblems/task_results_v4/raw/trace.txt + - name: Dataset filtering + description: | + Use these arguments to filter datasets by name. By default, all datasets are + run. If `--datasets_include` is defined, only those datasets are run. If + `--datasets_exclude` is defined, all datasets except those specified are run. + These arguments are mutually exclusive, so only `--datasets_include` OR + `--datasets_exclude` can be set but not both. + arguments: + - name: "--datasets_include" + type: string + multiple: true + description: | + A list of dataset ids to include. If specified, only these datasets will be run. + - name: "--datasets_exclude" + type: string + multiple: true + description: | + A list of dataset ids to exclude. If specified, all datasets except the ones listed will be run. + + - name: Method filtering + description: | + Use these arguments to filter methods by name. By default, all methods are + run. If `--methods_include` is defined, only those methods are run. If + `--methods_exclude` is defined, all methods except those specified are run. + These arguments are mutually exclusive, so only `--methods_include` OR + `--methods_exclude` can be set but not both. 
+ arguments: + - name: "--methods_include" + type: string + multiple: true + description: | + A list of method ids to include. If specified, only these methods will be run. + - name: "--methods_exclude" + type: string + multiple: true + description: | + A list of method ids to exclude. If specified, all methods except the ones listed will be run. + + - name: Metric filtering + description: | + Use these arguments to filter metrics by name. By default, all metrics are + run. If `--metrics_include` is defined, only those metrics are run. If + `--metrics_exclude` is defined, all metrics except those specified are run. + These arguments are mutually exclusive, so only `--metrics_include` OR + `--metrics_exclude` can be set but not both. + arguments: + - name: "--metrics_include" + type: string + multiple: true + description: | + A list of metric ids to include. If specified, only these metrics will be run. + - name: "--metrics_exclude" + type: string + multiple: true + description: | + A list of metric ids to exclude. If specified, all metrics except the ones listed will be run. 
+ - name: Outputs arguments: - - name: "--output_combined" + - name: "--output_data" type: file required: true direction: output @@ -65,66 +122,6 @@ argument_groups: info: format: type: html - - name: "--output_task_info" - type: file - required: true - direction: output - description: Task info JSON file - default: task_info.json - info: - format: - type: json - schema: /common/schemas/results_v4/task_info.json - - name: "--output_dataset_info" - type: file - required: true - direction: output - description: Dataset info JSON file - default: dataset_info.json - info: - format: - type: json - schema: /common/schemas/results_v4/dataset_info.json - - name: "--output_method_info" - type: file - required: true - direction: output - description: Method info JSON file - default: method_info.json - info: - format: - type: json - schema: /common/schemas/results_v4/method_info.json - - name: "--output_metric_info" - type: file - required: true - direction: output - description: Metric info JSON file - default: metric_info.json - info: - format: - type: json - schema: /common/schemas/results_v4/metric_info.json - - name: "--output_results" - type: file - required: true - direction: output - description: Results JSON file - default: results.json - info: - format: - type: json - schema: /common/schemas/results_v4/results.json - - name: "--output_quality_control" - type: file - required: true - direction: output - description: Quality control JSON file - default: quality_control.json - info: - format: - type: json - schema: /common/schemas/results_v4/quality_control.json resources: - type: nextflow_script @@ -137,6 +134,7 @@ dependencies: - name: reporting/get_metric_info - name: reporting/get_dataset_info - name: reporting/get_task_info + - name: reporting/filter_results - name: reporting/generate_qc - name: reporting/combine_output - name: reporting/render_report diff --git a/src/reporting/process_task_results/main.nf b/src/reporting/process_task_results/main.nf index 
1fc64f389..97f5b1220 100644 --- a/src/reporting/process_task_results/main.nf +++ b/src/reporting/process_task_results/main.nf @@ -52,13 +52,45 @@ workflow run_wf { "input_trace": "input_trace", "input_dataset_info": "output_dataset", "input_method_info": "output_method", - "input_metric_info": "output_metric" + "input_metric_info": "output_metric", + "datasets_include": "datasets_include", + "datasets_exclude": "datasets_exclude", + "methods_include": "methods_include", + "methods_exclude": "methods_exclude", + "metrics_include": "metrics_include", + "metrics_exclude": "metrics_exclude" ], toState: [ "output_results": "output" ] ) + | filter_results.run( + runIf: { id, state -> + // Only run filtering if there are include/exclude lists defined + return state.datasets_exclude || state.methods_exclude || state.metrics_exclude || + state.datasets_include || state.methods_include || state.metrics_include + }, + fromState: [ + "input_dataset_info": "output_dataset", + "input_method_info": "output_method", + "input_metric_info": "output_metric", + "input_results": "output_results", + "datasets_include": "datasets_include", + "datasets_exclude": "datasets_exclude", + "methods_include": "methods_include", + "methods_exclude": "methods_exclude", + "metrics_include": "metrics_include", + "metrics_exclude": "metrics_exclude" + ], + toState: [ + "output_dataset": "output_dataset_info", + "output_method": "output_method_info", + "output_metric": "output_metric_info", + "output_results": "output_results" + ] + ) + | generate_qc.run( fromState: [ "input_task_info": "output_task", @@ -90,14 +122,8 @@ workflow run_wf { ) | setState([ - "output_combined": "output_combined", - "output_report": "output_report", - "output_task_info": "output_task", - "output_dataset_info": "output_dataset", - "output_method_info": "output_method", - "output_metric_info": "output_metric", - "output_results": "output_results", - "output_quality_control": "output_qc" + "output_data": "output_combined", + 
"output_report": "output_report" ]) emit: From ffd0a733a31bb3455b60c6bac825c012b8ff6304 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 18 Aug 2025 22:36:49 +0200 Subject: [PATCH 2/6] implement component --- src/reporting/filter_results/config.vsh.yaml | 162 ++++++++++ src/reporting/filter_results/script.py | 298 +++++++++++++++++++ 2 files changed, 460 insertions(+) create mode 100644 src/reporting/filter_results/config.vsh.yaml create mode 100644 src/reporting/filter_results/script.py diff --git a/src/reporting/filter_results/config.vsh.yaml b/src/reporting/filter_results/config.vsh.yaml new file mode 100644 index 000000000..09e22ba7a --- /dev/null +++ b/src/reporting/filter_results/config.vsh.yaml @@ -0,0 +1,162 @@ +name: filter_results +namespace: reporting +description: Filter dataset, method, metric info and results based on include/exclude criteria + +argument_groups: + - name: Inputs + arguments: + - name: --input_dataset_info + type: file + description: JSON file containing dataset information + required: true + example: resources_test/openproblems/task_results_v4/processed/dataset_info.json + + - name: --input_method_info + type: file + description: JSON file containing method information + required: true + example: resources_test/openproblems/task_results_v4/processed/method_info.json + + - name: --input_metric_info + type: file + description: JSON file containing metric information + required: true + example: resources_test/openproblems/task_results_v4/processed/metric_info.json + + - name: --input_results + type: file + description: JSON file containing results + required: true + example: resources_test/openproblems/task_results_v4/processed/results.json + + - name: Dataset filtering + description: | + Use these arguments to filter datasets by name. By default, all datasets are + included. If `--datasets_include` is defined, only those datasets are included. 
If + `--datasets_exclude` is defined, all datasets except those specified are included. + These arguments are mutually exclusive, so only `--datasets_include` OR + `--datasets_exclude` can be set but not both. + arguments: + - name: "--datasets_include" + type: string + multiple: true + description: | + A list of dataset ids to include. If specified, only these datasets will be included. + - name: "--datasets_exclude" + type: string + multiple: true + description: | + A list of dataset ids to exclude. If specified, all datasets except the ones listed will be included. + + - name: Method filtering + description: | + Use these arguments to filter methods by name. By default, all methods are + included. If `--methods_include` is defined, only those methods are included. If + `--methods_exclude` is defined, all methods except those specified are included. + These arguments are mutually exclusive, so only `--methods_include` OR + `--methods_exclude` can be set but not both. + arguments: + - name: "--methods_include" + type: string + multiple: true + description: | + A list of method ids to include. If specified, only these methods will be included. + - name: "--methods_exclude" + type: string + multiple: true + description: | + A list of method ids to exclude. If specified, all methods except the ones listed will be included. + + - name: Metric filtering + description: | + Use these arguments to filter metrics by name. By default, all metrics are + included. If `--metrics_include` is defined, only those metrics are included. If + `--metrics_exclude` is defined, all metrics except those specified are included. + These arguments are mutually exclusive, so only `--metrics_include` OR + `--metrics_exclude` can be set but not both. + arguments: + - name: "--metrics_include" + type: string + multiple: true + description: | + A list of metric ids to include. If specified, only these metrics will be included. 
+ - name: "--metrics_exclude" + type: string + multiple: true + description: | + A list of metric ids to exclude. If specified, all metrics except the ones listed will be included. + + - name: Outputs + arguments: + - name: --output_dataset_info + type: file + direction: output + default: filtered_dataset_info.json + description: Filtered dataset info JSON file + info: + format: + type: json + schema: /common/schemas/results_v4/dataset_info.json + example: resources_test/openproblems/task_results_v4/processed/filtered_dataset_info.json + + - name: --output_method_info + type: file + direction: output + default: filtered_method_info.json + description: Filtered method info JSON file + info: + format: + type: json + schema: /common/schemas/results_v4/method_info.json + example: resources_test/openproblems/task_results_v4/processed/filtered_method_info.json + + - name: --output_metric_info + type: file + direction: output + default: filtered_metric_info.json + description: Filtered metric info JSON file + info: + format: + type: json + schema: /common/schemas/results_v4/metric_info.json + example: resources_test/openproblems/task_results_v4/processed/filtered_metric_info.json + + - name: --output_results + type: file + direction: output + default: filtered_results.json + description: Filtered results JSON file + info: + format: + type: json + schema: /common/schemas/results_v4/results.json + example: resources_test/openproblems/task_results_v4/processed/filtered_results.json + +resources: + - type: python_script + path: script.py + - path: /common/schemas + dest: schemas + +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/openproblems/task_results_v4 + dest: resources_test/openproblems/task_results_v4 + +engines: + - type: docker + image: openproblems/base_python:1 + setup: + - type: apt + packages: + - nodejs + - npm + - type: docker + run: npm install -g ajv-cli + +runners: + - type: executable 
+ - type: nextflow + directives: + label: [lowmem, lowtime, lowcpu] diff --git a/src/reporting/filter_results/script.py b/src/reporting/filter_results/script.py new file mode 100644 index 000000000..4c8d410e4 --- /dev/null +++ b/src/reporting/filter_results/script.py @@ -0,0 +1,298 @@ + +## VIASH START +par = { + "input_dataset_info": "resources_test/openproblems/task_results_v4/processed/dataset_info.json", + "input_method_info": "resources_test/openproblems/task_results_v4/processed/method_info.json", + "input_metric_info": "resources_test/openproblems/task_results_v4/processed/metric_info.json", + "input_results": "resources_test/openproblems/task_results_v4/processed/results.json", + "output_dataset_info": "resources_test/openproblems/task_results_v4/processed/filtered_dataset_info.json", + "output_method_info": "resources_test/openproblems/task_results_v4/processed/filtered_method_info.json", + "output_metric_info": "resources_test/openproblems/task_results_v4/processed/filtered_metric_info.json", + "output_results": "resources_test/openproblems/task_results_v4/processed/filtered_results.json", + "datasets_exclude": ["cellxgene_census/tabula_sapiens", "cellxgene_census/mouse_pancreas_atlas"], + "datasets_include": None, + "methods_exclude": None, + "methods_include": None, + "metrics_exclude": None, + "metrics_include": None +} +meta = { + "resources_dir": "target/executable/reporting/filter_results" +} +## VIASH END + +import json +import subprocess +import sys +from pathlib import Path +from typing import List, Dict, Any, Optional + + +def validate_filtering_args(): + """Validate that include/exclude arguments are mutually exclusive.""" + if par["datasets_include"] and par["datasets_exclude"]: + raise ValueError("Cannot specify both --datasets_include and --datasets_exclude") + + if par["methods_include"] and par["methods_exclude"]: + raise ValueError("Cannot specify both --methods_include and --methods_exclude") + + if par["metrics_include"] and 
par["metrics_exclude"]: + raise ValueError("Cannot specify both --metrics_include and --metrics_exclude") + + +def apply_name_filter( + data_list: List[Dict[str, Any]], + include_list: Optional[List[str]] = None, + exclude_list: Optional[List[str]] = None, + item_type: str = "item" +) -> List[Dict[str, Any]]: + """Apply filtering to a list based on name field.""" + if not data_list: + return data_list + + original_count = len(data_list) + item_names = [item["name"] for item in data_list] + + if include_list: + items_to_include = set(item_names) & set(include_list) + if not items_to_include: + print(f"Warning: None of the specified {item_type}s to include were found in the data", + file=sys.stderr) + return [] + + missing_items = set(include_list) - set(item_names) + if missing_items: + print(f"Warning: The following {item_type}s specified in include list were not found: " + + ", ".join(missing_items), file=sys.stderr) + + filtered_data = [item for item in data_list if item["name"] in items_to_include] + print(f">>> Included {len(filtered_data)} out of {original_count} {item_type}s") + return filtered_data + + elif exclude_list: + items_to_exclude = set(item_names) & set(exclude_list) + + missing_items = set(exclude_list) - set(item_names) + if missing_items: + print(f"Warning: The following {item_type}s specified in exclude list were not found: " + + ", ".join(missing_items), file=sys.stderr) + + filtered_data = [item for item in data_list if item["name"] not in items_to_exclude] + print(f">>> Excluded {len(items_to_exclude)} {item_type}s, keeping {len(filtered_data)} out of {original_count} {item_type}s") + return filtered_data + + # No filtering applied + return data_list + + +def filter_results_data( + results_data: List[Dict[str, Any]], + dataset_names: List[str], + method_names: List[str], + metric_names: List[str] +) -> List[Dict[str, Any]]: + """Filter results based on dataset, method, and metric filters.""" + if not results_data: + return results_data + + 
original_count = len(results_data) + + # Filter result entries based on dataset_name, method_name, and metric_names + filtered_results = [] + for result in results_data: + dataset_keep = result["dataset_name"] in dataset_names + method_keep = result["method_name"] in method_names + + # Check whether this result should be kept + if dataset_keep and method_keep: + filtered_result = result.copy() + + filtered_metrics = [ + (i, name) + for i, name in enumerate(result["metric_names"]) + if name in metric_names + ] + + # store metric names + filtered_result["metric_names"] = [name for _, name in filtered_metrics] + + # store metric values + filtered_result["metric_values"] = [result["metric_values"][i] for i, _ in filtered_metrics] + + # store metric components + new_metric_components = [] + for component in result.get("metric_components", []): + new_component = component.copy() + new_component["metric_names"] = [name for name in component["metric_names"] if name in metric_names] + + # if metric_names are not empty + if new_component["metric_names"]: + new_metric_components.append(new_component) + filtered_result["metric_components"] = new_metric_components + + filtered_results.append(filtered_result) + + print(f">>> Filtered results: keeping {len(filtered_results)} out of {original_count} result entries") + return filtered_results + + +def validate_json_against_schema(json_file: str, schema_file: str, name: str) -> tuple[bool, str]: + """Validate a JSON file against its schema using ajv-cli. 
+ + Returns: + tuple[bool, str]: (is_valid, error_message) + """ + try: + cmd = [ + "ajv", "validate", + "--spec", "draft2020", + "-s", schema_file, + "-r", str(Path(meta["resources_dir"]) / "schemas" / "results_v4" / "core.json"), + "-d", json_file + ] + + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode == 0: + print(f"✓ {name} validation passed") + return True, "" + else: + error_msg = "" + if result.stderr: + error_msg += f"stderr: {result.stderr.strip()}" + if result.stdout: + error_msg += f"\nstdout: {result.stdout.strip()}" + if not error_msg: + error_msg = "Unknown validation error" + + return False, error_msg + + except FileNotFoundError: + return False, "ajv-cli not found. Cannot validate schema" + + +print("====== Filter results ======") + +# Validation +print("\n>>> Validating arguments...") +validate_filtering_args() + +# Read input files +print("\n>>> Reading input files...") + +print(f'Reading dataset info from "{par["input_dataset_info"]}"...') +with open(par["input_dataset_info"], "r") as f: + dataset_info = json.load(f) + +print(f'Reading method info from "{par["input_method_info"]}"...') +with open(par["input_method_info"], "r") as f: + method_info = json.load(f) + +print(f'Reading metric info from "{par["input_metric_info"]}"...') +with open(par["input_metric_info"], "r") as f: + metric_info = json.load(f) + +print(f'Reading results from "{par["input_results"]}"...') +with open(par["input_results"], "r") as f: + results = json.load(f) + +# Apply filters +print("\n>>> Applying filters...") + +print("Filtering datasets...") +filtered_dataset_info = apply_name_filter( + dataset_info, + par["datasets_include"], + par["datasets_exclude"], + "dataset" +) + +print("Filtering methods...") +filtered_method_info = apply_name_filter( + method_info, + par["methods_include"], + par["methods_exclude"], + "method" +) + +print("Filtering metrics...") +filtered_metric_info = apply_name_filter( + metric_info, + 
par["metrics_include"], + par["metrics_exclude"], + "metric" +) + +# Get names for results filtering +filtered_dataset_names = [item["name"] for item in filtered_dataset_info] +filtered_method_names = [item["name"] for item in filtered_method_info] +filtered_metric_names = [item["name"] for item in filtered_metric_info] + +print("Filtering results...") +filtered_results = filter_results_data( + results, + filtered_dataset_names, + filtered_method_names, + filtered_metric_names +) + +# Write and validate output files +print("\n>>> Writing and validating output files...") +results_schemas_dir = Path(meta["resources_dir"]) / "schemas" / "results_v4" + +validation_files = [ + { + "data": filtered_dataset_info, + "schema": "dataset_info.json", + "file": par["output_dataset_info"], + "name": "dataset info" + }, + { + "data": filtered_method_info, + "schema": "method_info.json", + "file": par["output_method_info"], + "name": "method info" + }, + { + "data": filtered_metric_info, + "schema": "metric_info.json", + "file": par["output_metric_info"], + "name": "metric info" + }, + { + "data": filtered_results, + "schema": "results.json", + "file": par["output_results"], + "name": "results" + } +] + +all_valid = True +for validation in validation_files: + print(f'Writing {validation["name"]} to "{validation["file"]}"...') + with open(validation["file"], "w") as f: + json.dump(validation["data"], f, indent=2, ensure_ascii=False) + + print(f'Validating {validation["name"]}...') + schema_file = str(results_schemas_dir / validation["schema"]) + is_valid, error_msg = validate_json_against_schema( + validation["file"], + schema_file, + validation["name"] + ) + if not is_valid: + print(f'✗ {validation["name"]} validation failed') + print(f"Validation error: {error_msg}", file=sys.stderr) + all_valid = False + +if not all_valid: + raise RuntimeError("One or more output files do not conform to their schemas") + +# Summary +print("\n>>> Summary of filtering results:") +print(f"Datasets: 
{len(filtered_dataset_info)} (from {len(dataset_info)})") +print(f"Methods: {len(filtered_method_info)} (from {len(method_info)})") +print(f"Metrics: {len(filtered_metric_info)} (from {len(metric_info)})") +print(f"Results: {len(filtered_results)} (from {len(results)})") + +print("\n>>> Done!") From 06cf1800bef500859300df0608a48c906d8cd94a Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 18 Aug 2025 22:38:56 +0200 Subject: [PATCH 3/6] format code --- src/reporting/filter_results/script.py | 163 +++++++++++++------------ 1 file changed, 88 insertions(+), 75 deletions(-) diff --git a/src/reporting/filter_results/script.py b/src/reporting/filter_results/script.py index 4c8d410e4..1d267a001 100644 --- a/src/reporting/filter_results/script.py +++ b/src/reporting/filter_results/script.py @@ -1,4 +1,3 @@ - ## VIASH START par = { "input_dataset_info": "resources_test/openproblems/task_results_v4/processed/dataset_info.json", @@ -9,16 +8,17 @@ "output_method_info": "resources_test/openproblems/task_results_v4/processed/filtered_method_info.json", "output_metric_info": "resources_test/openproblems/task_results_v4/processed/filtered_metric_info.json", "output_results": "resources_test/openproblems/task_results_v4/processed/filtered_results.json", - "datasets_exclude": ["cellxgene_census/tabula_sapiens", "cellxgene_census/mouse_pancreas_atlas"], + "datasets_exclude": [ + "cellxgene_census/tabula_sapiens", + "cellxgene_census/mouse_pancreas_atlas", + ], "datasets_include": None, "methods_exclude": None, "methods_include": None, "metrics_exclude": None, - "metrics_include": None -} -meta = { - "resources_dir": "target/executable/reporting/filter_results" + "metrics_include": None, } +meta = {"resources_dir": "target/executable/reporting/filter_results"} ## VIASH END import json @@ -31,56 +31,70 @@ def validate_filtering_args(): """Validate that include/exclude arguments are mutually exclusive.""" if par["datasets_include"] and par["datasets_exclude"]: - raise 
ValueError("Cannot specify both --datasets_include and --datasets_exclude") - + raise ValueError( + "Cannot specify both --datasets_include and --datasets_exclude" + ) + if par["methods_include"] and par["methods_exclude"]: raise ValueError("Cannot specify both --methods_include and --methods_exclude") - + if par["metrics_include"] and par["metrics_exclude"]: raise ValueError("Cannot specify both --metrics_include and --metrics_exclude") def apply_name_filter( - data_list: List[Dict[str, Any]], + data_list: List[Dict[str, Any]], include_list: Optional[List[str]] = None, exclude_list: Optional[List[str]] = None, - item_type: str = "item" + item_type: str = "item", ) -> List[Dict[str, Any]]: """Apply filtering to a list based on name field.""" if not data_list: return data_list - + original_count = len(data_list) item_names = [item["name"] for item in data_list] - + if include_list: items_to_include = set(item_names) & set(include_list) if not items_to_include: - print(f"Warning: None of the specified {item_type}s to include were found in the data", - file=sys.stderr) + print( + f"Warning: None of the specified {item_type}s to include were found in the data", + file=sys.stderr, + ) return [] - + missing_items = set(include_list) - set(item_names) if missing_items: - print(f"Warning: The following {item_type}s specified in include list were not found: " + - ", ".join(missing_items), file=sys.stderr) - + print( + f"Warning: The following {item_type}s specified in include list were not found: " + + ", ".join(missing_items), + file=sys.stderr, + ) + filtered_data = [item for item in data_list if item["name"] in items_to_include] print(f">>> Included {len(filtered_data)} out of {original_count} {item_type}s") return filtered_data - + elif exclude_list: items_to_exclude = set(item_names) & set(exclude_list) - + missing_items = set(exclude_list) - set(item_names) if missing_items: - print(f"Warning: The following {item_type}s specified in exclude list were not found: " + - 
", ".join(missing_items), file=sys.stderr) - - filtered_data = [item for item in data_list if item["name"] not in items_to_exclude] - print(f">>> Excluded {len(items_to_exclude)} {item_type}s, keeping {len(filtered_data)} out of {original_count} {item_type}s") + print( + f"Warning: The following {item_type}s specified in exclude list were not found: " + + ", ".join(missing_items), + file=sys.stderr, + ) + + filtered_data = [ + item for item in data_list if item["name"] not in items_to_exclude + ] + print( + f">>> Excluded {len(items_to_exclude)} {item_type}s, keeping {len(filtered_data)} out of {original_count} {item_type}s" + ) return filtered_data - + # No filtering applied return data_list @@ -88,15 +102,15 @@ def apply_name_filter( def filter_results_data( results_data: List[Dict[str, Any]], dataset_names: List[str], - method_names: List[str], - metric_names: List[str] + method_names: List[str], + metric_names: List[str], ) -> List[Dict[str, Any]]: """Filter results based on dataset, method, and metric filters.""" if not results_data: return results_data - + original_count = len(results_data) - + # Filter result entries based on dataset_name, method_name, and metric_names filtered_results = [] for result in results_data: @@ -106,53 +120,66 @@ def filter_results_data( # Check whether this result should be kept if dataset_keep and method_keep: filtered_result = result.copy() - + filtered_metrics = [ (i, name) for i, name in enumerate(result["metric_names"]) if name in metric_names ] - + # store metric names filtered_result["metric_names"] = [name for _, name in filtered_metrics] - + # store metric values - filtered_result["metric_values"] = [result["metric_values"][i] for i, _ in filtered_metrics] - + filtered_result["metric_values"] = [ + result["metric_values"][i] for i, _ in filtered_metrics + ] + # store metric components new_metric_components = [] for component in result.get("metric_components", []): new_component = component.copy() - 
new_component["metric_names"] = [name for name in component["metric_names"] if name in metric_names] - + new_component["metric_names"] = [ + name for name in component["metric_names"] if name in metric_names + ] + # if metric_names are not empty if new_component["metric_names"]: new_metric_components.append(new_component) filtered_result["metric_components"] = new_metric_components filtered_results.append(filtered_result) - - print(f">>> Filtered results: keeping {len(filtered_results)} out of {original_count} result entries") + + print( + f">>> Filtered results: keeping {len(filtered_results)} out of {original_count} result entries" + ) return filtered_results -def validate_json_against_schema(json_file: str, schema_file: str, name: str) -> tuple[bool, str]: +def validate_json_against_schema( + json_file: str, schema_file: str, name: str +) -> tuple[bool, str]: """Validate a JSON file against its schema using ajv-cli. - + Returns: tuple[bool, str]: (is_valid, error_message) """ try: cmd = [ - "ajv", "validate", - "--spec", "draft2020", - "-s", schema_file, - "-r", str(Path(meta["resources_dir"]) / "schemas" / "results_v4" / "core.json"), - "-d", json_file + "ajv", + "validate", + "--spec", + "draft2020", + "-s", + schema_file, + "-r", + str(Path(meta["resources_dir"]) / "schemas" / "results_v4" / "core.json"), + "-d", + json_file, ] - + result = subprocess.run(cmd, capture_output=True, text=True) - + if result.returncode == 0: print(f"✓ {name} validation passed") return True, "" @@ -164,9 +191,9 @@ def validate_json_against_schema(json_file: str, schema_file: str, name: str) -> error_msg += f"\nstdout: {result.stdout.strip()}" if not error_msg: error_msg = "Unknown validation error" - + return False, error_msg - + except FileNotFoundError: return False, "ajv-cli not found. 
Cannot validate schema" @@ -201,26 +228,17 @@ def validate_json_against_schema(json_file: str, schema_file: str, name: str) -> print("Filtering datasets...") filtered_dataset_info = apply_name_filter( - dataset_info, - par["datasets_include"], - par["datasets_exclude"], - "dataset" + dataset_info, par["datasets_include"], par["datasets_exclude"], "dataset" ) print("Filtering methods...") filtered_method_info = apply_name_filter( - method_info, - par["methods_include"], - par["methods_exclude"], - "method" + method_info, par["methods_include"], par["methods_exclude"], "method" ) print("Filtering metrics...") filtered_metric_info = apply_name_filter( - metric_info, - par["metrics_include"], - par["metrics_exclude"], - "metric" + metric_info, par["metrics_include"], par["metrics_exclude"], "metric" ) # Get names for results filtering @@ -230,10 +248,7 @@ def validate_json_against_schema(json_file: str, schema_file: str, name: str) -> print("Filtering results...") filtered_results = filter_results_data( - results, - filtered_dataset_names, - filtered_method_names, - filtered_metric_names + results, filtered_dataset_names, filtered_method_names, filtered_metric_names ) # Write and validate output files @@ -245,26 +260,26 @@ def validate_json_against_schema(json_file: str, schema_file: str, name: str) -> "data": filtered_dataset_info, "schema": "dataset_info.json", "file": par["output_dataset_info"], - "name": "dataset info" + "name": "dataset info", }, { "data": filtered_method_info, "schema": "method_info.json", "file": par["output_method_info"], - "name": "method info" + "name": "method info", }, { "data": filtered_metric_info, "schema": "metric_info.json", "file": par["output_metric_info"], - "name": "metric info" + "name": "metric info", }, { "data": filtered_results, "schema": "results.json", "file": par["output_results"], - "name": "results" - } + "name": "results", + }, ] all_valid = True @@ -276,9 +291,7 @@ def validate_json_against_schema(json_file: str, 
schema_file: str, name: str) -> print(f'Validating {validation["name"]}...') schema_file = str(results_schemas_dir / validation["schema"]) is_valid, error_msg = validate_json_against_schema( - validation["file"], - schema_file, - validation["name"] + validation["file"], schema_file, validation["name"] ) if not is_valid: print(f'✗ {validation["name"]} validation failed') From 564400d75a34b01fb879e7a24bdcd41a3120b39b Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 19 Aug 2025 17:10:41 +0200 Subject: [PATCH 4/6] Update src/reporting/process_task_results/main.nf --- src/reporting/process_task_results/main.nf | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/reporting/process_task_results/main.nf b/src/reporting/process_task_results/main.nf index 97f5b1220..f02ec5658 100644 --- a/src/reporting/process_task_results/main.nf +++ b/src/reporting/process_task_results/main.nf @@ -52,13 +52,7 @@ workflow run_wf { "input_trace": "input_trace", "input_dataset_info": "output_dataset", "input_method_info": "output_method", - "input_metric_info": "output_metric", - "datasets_include": "datasets_include", - "datasets_exclude": "datasets_exclude", - "methods_include": "methods_include", - "methods_exclude": "methods_exclude", - "metrics_include": "metrics_include", - "metrics_exclude": "metrics_exclude" + "input_metric_info": "output_metric" ], toState: [ "output_results": "output" From 58d9a5473966ffc8d899ff5d59d5f2395a3e91c3 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 19 Aug 2025 17:12:09 +0200 Subject: [PATCH 5/6] Apply suggestions from code review Co-authored-by: Luke Zappia --- src/reporting/filter_results/script.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/reporting/filter_results/script.py b/src/reporting/filter_results/script.py index 1d267a001..952e5fab7 100644 --- a/src/reporting/filter_results/script.py +++ b/src/reporting/filter_results/script.py @@ -73,7 +73,7 @@ def 
apply_name_filter( ) filtered_data = [item for item in data_list if item["name"] in items_to_include] - print(f">>> Included {len(filtered_data)} out of {original_count} {item_type}s") + print(f"Included {len(filtered_data)} out of {original_count} {item_type}s") return filtered_data elif exclude_list: @@ -91,7 +91,7 @@ def apply_name_filter( item for item in data_list if item["name"] not in items_to_exclude ] print( - f">>> Excluded {len(items_to_exclude)} {item_type}s, keeping {len(filtered_data)} out of {original_count} {item_type}s" + f"Excluded {len(items_to_exclude)} {item_type}s, keeping {len(filtered_data)} out of {original_count} {item_type}s" ) return filtered_data @@ -151,7 +151,7 @@ def filter_results_data( filtered_results.append(filtered_result) print( - f">>> Filtered results: keeping {len(filtered_results)} out of {original_count} result entries" + f"Filtered results: keeping {len(filtered_results)} out of {original_count} result entries" ) return filtered_results @@ -295,7 +295,7 @@ def validate_json_against_schema( ) if not is_valid: print(f'✗ {validation["name"]} validation failed') - print(f"Validation error: {error_msg}", file=sys.stderr) + print(f"Validation error: {error_msg}") all_valid = False if not all_valid: From 87827474e4bd2a6415639ce61a28eab8a3c3cace Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 19 Aug 2025 17:27:02 +0200 Subject: [PATCH 6/6] add back previously removed arguments --- .../process_task_results/config.vsh.yaml | 62 ++++++++++++++++++- src/reporting/process_task_results/main.nf | 10 ++- 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/src/reporting/process_task_results/config.vsh.yaml b/src/reporting/process_task_results/config.vsh.yaml index 677cb5e85..c3dcd3699 100644 --- a/src/reporting/process_task_results/config.vsh.yaml +++ b/src/reporting/process_task_results/config.vsh.yaml @@ -103,7 +103,7 @@ argument_groups: - name: Outputs arguments: - - name: "--output_data" + - name: 
"--output_combined" type: file required: true direction: output @@ -122,6 +122,66 @@ argument_groups: info: format: type: html + - name: "--output_task_info" + type: file + required: true + direction: output + description: Task info JSON file + default: task_info.json + info: + format: + type: json + schema: /common/schemas/results_v4/task_info.json + - name: "--output_dataset_info" + type: file + required: true + direction: output + description: Dataset info JSON file + default: dataset_info.json + info: + format: + type: json + schema: /common/schemas/results_v4/dataset_info.json + - name: "--output_method_info" + type: file + required: true + direction: output + description: Method info JSON file + default: method_info.json + info: + format: + type: json + schema: /common/schemas/results_v4/method_info.json + - name: "--output_metric_info" + type: file + required: true + direction: output + description: Metric info JSON file + default: metric_info.json + info: + format: + type: json + schema: /common/schemas/results_v4/metric_info.json + - name: "--output_results" + type: file + required: true + direction: output + description: Results JSON file + default: results.json + info: + format: + type: json + schema: /common/schemas/results_v4/results.json + - name: "--output_quality_control" + type: file + required: true + direction: output + description: Quality control JSON file + default: quality_control.json + info: + format: + type: json + schema: /common/schemas/results_v4/quality_control.json resources: - type: nextflow_script diff --git a/src/reporting/process_task_results/main.nf b/src/reporting/process_task_results/main.nf index f02ec5658..059960d65 100644 --- a/src/reporting/process_task_results/main.nf +++ b/src/reporting/process_task_results/main.nf @@ -116,8 +116,14 @@ workflow run_wf { ) | setState([ - "output_data": "output_combined", - "output_report": "output_report" + "output_combined": "output_combined", + "output_report": "output_report", + 
"output_task_info": "output_task", + "output_dataset_info": "output_dataset", + "output_method_info": "output_method", + "output_metric_info": "output_metric", + "output_results": "output_results", + "output_quality_control": "output_qc" ]) emit: