From aca85f517ad16ea0d879c4d75ea6859a6ecc4c86 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 7 Jul 2025 14:22:53 -0400 Subject: [PATCH 1/8] fixed #141 --- geofetch/geofetch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/geofetch/geofetch.py b/geofetch/geofetch.py index b490428..7eb4247 100755 --- a/geofetch/geofetch.py +++ b/geofetch/geofetch.py @@ -1087,7 +1087,7 @@ def _find_genome(metadata_list: list) -> list: sample_genome = "" for key in proj_gen_keys: sample_genome = " ".join([sample_genome, sample[1][key]]) - metadata_list[sample[0]][NEW_GENOME_COL_NAME] = sample_genome + metadata_list[sample[0]][NEW_GENOME_COL_NAME] = sample_genome.strip() return metadata_list def _write_raw_annotation_new( From 7b4fa636738e237fcc7bc785a01a223694a208e6 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 7 Jul 2025 14:36:44 -0400 Subject: [PATCH 2/8] fixed #135 --- geofetch/cli.py | 8 +++----- geofetch/sraconvert.py | 7 +++++-- requirements/requirements-all.txt | 1 + 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/geofetch/cli.py b/geofetch/cli.py index 168b2a3..a56407c 100644 --- a/geofetch/cli.py +++ b/geofetch/cli.py @@ -2,6 +2,7 @@ import os import logmuse +from ubiquerg import VersionInHelpParser from geofetch._version import __version__ @@ -15,7 +16,7 @@ def _parse_cmdl(cmdl): """ parser """ - parser = argparse.ArgumentParser( + parser = VersionInHelpParser( description="Automatic GEO and SRA data downloader", usage="""geofetch [] @@ -26,15 +27,12 @@ def _parse_cmdl(cmdl): geofetch -i GSE67303 --processed --geo-folder -m """, + version=__version__, ) processed_group = parser.add_argument_group("processed") raw_group = parser.add_argument_group("raw") - parser.add_argument( - "-V", "--version", action="version", version=f"%(prog)s {__version__}" - ) - # Required parser.add_argument( "-i", diff --git a/geofetch/sraconvert.py b/geofetch/sraconvert.py index 6e64a9a..7b05a34 100755 --- a/geofetch/sraconvert.py +++ 
b/geofetch/sraconvert.py @@ -2,7 +2,7 @@ import os import sys -from argparse import ArgumentParser +from ubiquerg import VersionInHelpParser import logmuse import pypiper @@ -15,7 +15,7 @@ def _parse_cmdl(cmdl): provides convenience functions for converting or deleting sra data in various formats. """ - parser = ArgumentParser(description=description) + parser = VersionInHelpParser(description=description) # parser = pypiper.add_pypiper_args(parser, args=["output-parent"]) parser.add_argument( "-m", @@ -72,6 +72,9 @@ def _parse_cmdl(cmdl): help="Name for sample to run", metavar="SAMPLE_NAME", ) + parser.add_argument( + "-V", "--version", action="version", version=f"%(prog)s {__version__}" + ) parser.add_argument("-r", "--srr", required=True, nargs="+", help="SRR files") diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index f9b4326..9440999 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -7,3 +7,4 @@ pandas>=1.5.3 peppy>=0.40.6 rich>=12.5.1 coloredlogs>=15.0.1 +piper>=0.14.4 From 5908c88647c7c3eba39abb051c70825b8b58984c Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 7 Jul 2025 14:37:51 -0400 Subject: [PATCH 3/8] updated github actions --- .github/workflows/black.yml | 4 ++-- .github/workflows/python-publish.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml index 8b48ddf..052e2ec 100644 --- a/.github/workflows/black.yml +++ b/.github/workflows/black.yml @@ -6,6 +6,6 @@ jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 - uses: psf/black@stable diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 59c6af8..6598590 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -24,4 +24,4 @@ jobs: run: | python 
setup.py sdist bdist_wheel - name: Publish package distributions to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 \ No newline at end of file + uses: pypa/gh-action-pypi-publish@release/v1 From 9ee1df24288ebd95f1beae805ae2b9a83fdf4bd6 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 7 Jul 2025 17:35:33 -0400 Subject: [PATCH 4/8] fixed #143 --- geofetch/geofetch.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/geofetch/geofetch.py b/geofetch/geofetch.py index 7eb4247..726a73b 100755 --- a/geofetch/geofetch.py +++ b/geofetch/geofetch.py @@ -867,6 +867,8 @@ def _expand_metadata_list(self, metadata_list: list) -> list: _LOGGER.info("Expanding metadata list...") list_of_keys = _get_list_of_keys(metadata_list) for key_in_list in list_of_keys: + if key_in_list == "Sample_characteristics_ch1": + pass metadata_list = self._expand_metadata_list_item(metadata_list, key_in_list) return metadata_list @@ -881,7 +883,13 @@ def _expand_metadata_list_item(self, metadata_list: list, dict_key: str): """ try: element_is_list = any( - isinstance(list_item.get(dict_key), list) for list_item in metadata_list + isinstance(list_item.get(dict_key), list) + or ( + len(list_item.get(dict_key).split(": ")) == 2 + if list_item.get(dict_key) + else False + ) + for list_item in metadata_list ) # # checking if some items have two keys: @@ -900,6 +908,8 @@ def _expand_metadata_list_item(self, metadata_list: list, dict_key: str): metadata_list[n_elem][dict_key] = [ metadata_list[n_elem][dict_key] ] + else: + pass just_string = False this_string = "" From f2dff6d22ef98b4eee110560a3944ad599c1f5d5 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 10 Jul 2025 01:09:41 -0400 Subject: [PATCH 5/8] fixed sra convert functionality --- .looper.yaml | 4 ++ MANIFEST.in | 9 ++-- geofetch/_version.py | 2 +- geofetch/const.py | 9 +++- geofetch/geofetch.py | 54 +++++++++++++++++-- .../config_processed_template.yaml | 0 geofetch/{ => templates}/config_template.yaml 
| 0 .../templates/looper_config_template.yaml | 4 ++ .../{ => templates}/looper_sra_convert.yaml | 19 +------ .../templates/pipeline_interface_convert.yaml | 12 +++++ .../templates/resources.tsv | 0 .../templates/sra_convert_schema.yaml | 0 manual_testing.py | 14 +++++ pipeline_interface_convert.yaml | 12 ----- pipeline_interface_convert_v1.yaml | 31 ----------- 15 files changed, 99 insertions(+), 71 deletions(-) create mode 100644 .looper.yaml rename geofetch/{ => templates}/config_processed_template.yaml (100%) rename geofetch/{ => templates}/config_template.yaml (100%) create mode 100644 geofetch/templates/looper_config_template.yaml rename geofetch/{ => templates}/looper_sra_convert.yaml (50%) create mode 100644 geofetch/templates/pipeline_interface_convert.yaml rename resources.tsv => geofetch/templates/resources.tsv (100%) rename sra_convert_schema.yaml => geofetch/templates/sra_convert_schema.yaml (100%) create mode 100644 manual_testing.py delete mode 100644 pipeline_interface_convert.yaml delete mode 100644 pipeline_interface_convert_v1.yaml diff --git a/.looper.yaml b/.looper.yaml new file mode 100644 index 0000000..a694f52 --- /dev/null +++ b/.looper.yaml @@ -0,0 +1,4 @@ +pep_config: /home/bnt4me/virginia/repos/geofetch/red_algae/GSE67303_PEP/GSE67303_PEP.yaml +output_dir: /home/bnt4me/virginia/repos/geofetch/red_algae/GSE67303_PEP +pipeline_interfaces: + - /home/bnt4me/virginia/repos/geofetch/pipeline_interface_convert.yaml \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in index 4f3018a..a9b1f4a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,9 @@ include requirements/* include README.md include docs/img/geofetch_logo.svg -include geofetch/config_template.yaml -include geofetch/config_processed_template.yaml -include geofetch/looper_sra_convert.yaml +include geofetch/templates/* +include geofetch/templates/config_template.yaml +include geofetch/templates/config_processed_template.yaml +include 
geofetch/templates/looper_sra_convert.yaml +include geofetch/templates/looper_config_template.yaml +include geofetch/templates/pipeline_interface_convert.yaml diff --git a/geofetch/_version.py b/geofetch/_version.py index 6ece8ad..5e44a42 100644 --- a/geofetch/_version.py +++ b/geofetch/_version.py @@ -1 +1 @@ -__version__ = "0.12.7" +__version__ = "0.12.8" diff --git a/geofetch/const.py b/geofetch/const.py index 2267223..e018920 100644 --- a/geofetch/const.py +++ b/geofetch/const.py @@ -44,9 +44,14 @@ NEW_GENOME_COL_NAME = "ref_genome" +TEMPLATES_DIR = "templates" CONFIG_PROCESSED_TEMPLATE_NAME = "config_processed_template.yaml" CONFIG_RAW_TEMPLATE_NAME = "config_template.yaml" -CONFIG_SRA_TEMPLATE = "looper_sra_convert.yaml" +CONFIG_SRA_TEMPLATE_NAME = "looper_sra_convert.yaml" +PIPELINE_INTERFACE_CONVERT_TEMPLATE_NAME = "pipeline_interface_convert.yaml" +LOOPER_SRA_CONVERT = "looper_config_template.yaml" +# SRA_CONVERT_SCHEMA_NAME = "sra_convert_schema.yaml" +# RESOURCES_NAME = "resources.tsv" # const for Finder: RETMAX = 10000000 # once it should be increased @@ -63,3 +68,5 @@ '+AND+("{start_date}"[Publication%20Date]%20:%20"{end_date}"[Publication%20Date])' ) THREE_MONTH_FILTER = '+AND+"published+last+3+months"[Filter]' + +LOOPER_CONFIG_FILE_NAME = "looper_config.yaml" diff --git a/geofetch/geofetch.py b/geofetch/geofetch.py index 726a73b..e841716 100755 --- a/geofetch/geofetch.py +++ b/geofetch/geofetch.py @@ -20,7 +20,7 @@ from geofetch.const import ( CONFIG_PROCESSED_TEMPLATE_NAME, CONFIG_RAW_TEMPLATE_NAME, - CONFIG_SRA_TEMPLATE, + CONFIG_SRA_TEMPLATE_NAME, EXP_SUPP_METADATA_FILE, EXPERIMENT_PATTERN, FILE_RAW_NAME_SAMPLE_PATTERN, @@ -34,6 +34,10 @@ SAMPLE_SUPP_METADATA_FILE, SER_SUPP_FILE_PATTERN, SUPP_FILE_PATTERN, + TEMPLATES_DIR, + PIPELINE_INTERFACE_CONVERT_TEMPLATE_NAME, + LOOPER_SRA_CONVERT, + LOOPER_CONFIG_FILE_NAME, ) from geofetch.utils import ( Accession, @@ -1171,11 +1175,43 @@ def _write_raw_annotation_new( if len(subannot_dict) > 0: 
self._write_subannotation(subannot_dict, proj_root_subsample) - self._write(proj_root_yaml, template, msg_pre=" Config file: ") + self._write(proj_root_yaml, template, msg_pre="Config file: ") if self.add_dotfile: _create_dot_yaml(dot_yaml_path, yaml_name) + if self.add_convert_modifier: + geofetchdir = os.path.dirname(__file__) + pipeline_interface_convert_path = os.path.join( + geofetchdir, TEMPLATES_DIR, PIPELINE_INTERFACE_CONVERT_TEMPLATE_NAME + ) + + looper_config_template_path = os.path.join( + geofetchdir, TEMPLATES_DIR, LOOPER_SRA_CONVERT + ) + + with open(looper_config_template_path, "r") as template_file: + template_looper = template_file.read() + + template_values = { + "pep_config": proj_root_yaml, + "output_dir": os.path.join(self.metadata_root_full, "output_dir"), + "pipeline_interface_convert": pipeline_interface_convert_path, + } + + for k, v in template_values.items(): + placeholder = "{" + str(k) + "}" + template_looper = template_looper.replace(placeholder, str(v)) + + looper_config_file = os.path.join( + self.metadata_root_full, + LOOPER_CONFIG_FILE_NAME, + ) + + self._write( + looper_config_file, template_looper, msg_pre="Looper config file: " + ) + else: meta_df = pd.DataFrame.from_dict(metadata_dict, orient="index") @@ -1214,8 +1250,11 @@ def _create_config_processed( :param meta_in_series: :return: generated, complete config file content """ + geofetchdir = os.path.dirname(__file__) - config_template = os.path.join(geofetchdir, CONFIG_PROCESSED_TEMPLATE_NAME) + config_template = os.path.join( + geofetchdir, TEMPLATES_DIR, CONFIG_PROCESSED_TEMPLATE_NAME + ) with open(config_template, "r") as template_file: template = template_file.read() meta_list_str = [ @@ -1270,9 +1309,13 @@ def _create_config_raw( else: sample_modifier_str = "" if not self.config_template: - self.config_template = os.path.join(geofetchdir, CONFIG_RAW_TEMPLATE_NAME) + self.config_template = os.path.join( + geofetchdir, TEMPLATES_DIR, CONFIG_RAW_TEMPLATE_NAME + ) if 
self.add_convert_modifier: - sra_convert_path = os.path.join(geofetchdir, CONFIG_SRA_TEMPLATE) + sra_convert_path = os.path.join( + geofetchdir, TEMPLATES_DIR, CONFIG_SRA_TEMPLATE_NAME + ) with open(sra_convert_path, "r") as template_file: sra_convert_template = template_file.read() else: @@ -1301,6 +1344,7 @@ def _create_config_raw( for k, v in template_values.items(): placeholder = "{" + str(k) + "}" template = template.replace(placeholder, str(v)) + return template @staticmethod diff --git a/geofetch/config_processed_template.yaml b/geofetch/templates/config_processed_template.yaml similarity index 100% rename from geofetch/config_processed_template.yaml rename to geofetch/templates/config_processed_template.yaml diff --git a/geofetch/config_template.yaml b/geofetch/templates/config_template.yaml similarity index 100% rename from geofetch/config_template.yaml rename to geofetch/templates/config_template.yaml diff --git a/geofetch/templates/looper_config_template.yaml b/geofetch/templates/looper_config_template.yaml new file mode 100644 index 0000000..21a4dd0 --- /dev/null +++ b/geofetch/templates/looper_config_template.yaml @@ -0,0 +1,4 @@ +pep_config: {pep_config} +output_dir: {output_dir} +pipeline_interfaces: + - {pipeline_interface_convert} \ No newline at end of file diff --git a/geofetch/looper_sra_convert.yaml b/geofetch/templates/looper_sra_convert.yaml similarity index 50% rename from geofetch/looper_sra_convert.yaml rename to geofetch/templates/looper_sra_convert.yaml index 94525f1..861e423 100644 --- a/geofetch/looper_sra_convert.yaml +++ b/geofetch/templates/looper_sra_convert.yaml @@ -4,7 +4,7 @@ derive: attributes: [read1, read2, SRR_files] sources: - SRA: "${SRABAM}/{srr}.bam" + SRA: "${SRARAW}/{srr}/{srr}.sra" FQ: "${SRAFQ}/{srr}.fastq.gz" FQ1: "${SRAFQ}/{srr}_1.fastq.gz" FQ2: "${SRAFQ}/{srr}_2.fastq.gz" @@ -26,20 +26,3 @@ read_type: "SINGLE" then: read1: FQ1 - -project_modifiers: - amend: - sra_convert: - looper: - results_subdir: 
sra_convert_results - sample_modifiers: - append: - SRR_files: SRA - pipeline_interfaces: ${CODE}/geofetch/pipeline_interface_convert.yaml - derive: - attributes: [read1, read2, SRR_files] - sources: - SRA: "${SRARAW}/{srr}/{srr}.sra" - FQ: "${SRAFQ}/{srr}.fastq.gz" - FQ1: "${SRAFQ}/{srr}_1.fastq.gz" - FQ2: "${SRAFQ}/{srr}_2.fastq.gz" diff --git a/geofetch/templates/pipeline_interface_convert.yaml b/geofetch/templates/pipeline_interface_convert.yaml new file mode 100644 index 0000000..9ec2217 --- /dev/null +++ b/geofetch/templates/pipeline_interface_convert.yaml @@ -0,0 +1,12 @@ +pipeline_name: sra_convert +path: sraconvert +input_schema: ./sra_convert_schema.yaml +sample_interface: + command_template: > + {pipeline.path} --srr {sample.SRR_files} + {% if sample.SRX is defined %} --sample-name {sample.SRX} {% endif %} + {% if project.fqfolder is defined %} --fqfolder {project.fqfolder} {% endif %} + -O {looper.results_subdir} +compute: + bulker_crate: databio/sra_convert + size_dependent_variables: ./resources.tsv diff --git a/resources.tsv b/geofetch/templates/resources.tsv similarity index 100% rename from resources.tsv rename to geofetch/templates/resources.tsv diff --git a/sra_convert_schema.yaml b/geofetch/templates/sra_convert_schema.yaml similarity index 100% rename from sra_convert_schema.yaml rename to geofetch/templates/sra_convert_schema.yaml diff --git a/manual_testing.py b/manual_testing.py new file mode 100644 index 0000000..0164911 --- /dev/null +++ b/manual_testing.py @@ -0,0 +1,14 @@ +from geofetch import Geofetcher + + +if __name__ == "__main__": + # Create a Geofetcher instance + geofetcher = Geofetcher( + "GSE15805", "./test/", "./test/", just_metadata=True, processed=True + ) + + # Fetch the data + data = geofetcher.fetch_all("GSE15805") + + # Print the fetched data + print(data) diff --git a/pipeline_interface_convert.yaml b/pipeline_interface_convert.yaml deleted file mode 100644 index 33e51cc..0000000 --- a/pipeline_interface_convert.yaml +++ 
/dev/null @@ -1,12 +0,0 @@ -pipeline_name: sra_convert -pipeline_type: sample -path: sraconvert -input_schema: sra_convert_schema.yaml -command_template: > - {pipeline.path} --srr {sample.SRR_files} - {% if sample.SRX is defined %} --sample-name {sample.SRX} {% endif %} - {% if project.fqfolder is defined %} --fqfolder {project.fqfolder} {% endif %} - -O {looper.results_subdir} -compute: - bulker_crate: databio/sra_convert - size_dependent_variables: resources.tsv diff --git a/pipeline_interface_convert_v1.yaml b/pipeline_interface_convert_v1.yaml deleted file mode 100644 index bbaddc9..0000000 --- a/pipeline_interface_convert_v1.yaml +++ /dev/null @@ -1,31 +0,0 @@ -protocol_mapping: - "*": convert - -pipelines: - convert: - name: convert - path: sraconvert - required_input_files: SRR_files - arguments: - "--srr": SRR_files - resources: - default: - file_size: "0" - cores: "1" - mem: "4000" - time: "0-06:00:00" - package1: - file_size: "15" - cores: "1" - mem: "4000" - time: "0-12:00:00" - package2: - file_size: "100" - cores: "1" - mem: "4000" - time: "1-00:00:00" - package3: - file_size: "300" - cores: "1" - mem: "4000" - time: "2-00:00:00" From 24710d4eeeef3e6bc19858211f20aca732ea10e2 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 10 Jul 2025 01:35:22 -0400 Subject: [PATCH 6/8] lint --- geofetch/__init__.py | 2 +- geofetch/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/geofetch/__init__.py b/geofetch/__init__.py index 2e3620e..e1b75f9 100644 --- a/geofetch/__init__.py +++ b/geofetch/__init__.py @@ -1,4 +1,4 @@ -""" Package-level data """ +"""Package-level data""" import coloredlogs import logmuse diff --git a/geofetch/utils.py b/geofetch/utils.py index c006e7b..66bdd88 100644 --- a/geofetch/utils.py +++ b/geofetch/utils.py @@ -1,4 +1,4 @@ -""" Independently-importable utilities to circumvent true scripts. 
""" +"""Independently-importable utilities to circumvent true scripts.""" import csv import logging From b6c6ffed1a3e93e073310103f26fe8554b4dcdfe Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 10 Jul 2025 01:36:49 -0400 Subject: [PATCH 7/8] Removed unused files --- .looper.yaml | 4 ---- manual_testing.py | 14 -------------- 2 files changed, 18 deletions(-) delete mode 100644 .looper.yaml delete mode 100644 manual_testing.py diff --git a/.looper.yaml b/.looper.yaml deleted file mode 100644 index a694f52..0000000 --- a/.looper.yaml +++ /dev/null @@ -1,4 +0,0 @@ -pep_config: /home/bnt4me/virginia/repos/geofetch/red_algae/GSE67303_PEP/GSE67303_PEP.yaml -output_dir: /home/bnt4me/virginia/repos/geofetch/red_algae/GSE67303_PEP -pipeline_interfaces: - - /home/bnt4me/virginia/repos/geofetch/pipeline_interface_convert.yaml \ No newline at end of file diff --git a/manual_testing.py b/manual_testing.py deleted file mode 100644 index 0164911..0000000 --- a/manual_testing.py +++ /dev/null @@ -1,14 +0,0 @@ -from geofetch import Geofetcher - - -if __name__ == "__main__": - # Create a Geofetcher instance - geofetcher = Geofetcher( - "GSE15805", "./test/", "./test/", just_metadata=True, processed=True - ) - - # Fetch the data - data = geofetcher.fetch_all("GSE15805") - - # Print the fetched data - print(data) From a0ff9dc5f23df845c8868a67d690f0af71fd2ee1 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 10 Jul 2025 01:41:44 -0400 Subject: [PATCH 8/8] Updated Readme --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 3e4ca98..9188cd1 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,12 @@ or install the latest version from the GitHub repository: pip install git+https://github.com/pepkit/geofetch.git ``` +## All GEO projects (GSE + GSM) in PEP format. + +All GEO projects are available in PEPhub under geo namespace: https://pephub.databio.org/geo/ . 
+Users can search for GEO projects using the search bar, or download an archive with all GEO PEPs from the archive section of the namespace: +[https://pephub.databio.org/geo?view=archive](https://pephub.databio.org/geo?view=archive) + ## How to cite: https://doi.org/10.1093/bioinformatics/btad069