diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml index 8b48ddf..052e2ec 100644 --- a/.github/workflows/black.yml +++ b/.github/workflows/black.yml @@ -6,6 +6,6 @@ jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 - uses: psf/black@stable diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 59c6af8..6598590 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -24,4 +24,4 @@ jobs: run: | python setup.py sdist bdist_wheel - name: Publish package distributions to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 \ No newline at end of file + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/MANIFEST.in b/MANIFEST.in index 4f3018a..a9b1f4a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,9 @@ include requirements/* include README.md include docs/img/geofetch_logo.svg -include geofetch/config_template.yaml -include geofetch/config_processed_template.yaml -include geofetch/looper_sra_convert.yaml +include geofetch/templates/* +include geofetch/templates/config_template.yaml +include geofetch/templates/config_processed_template.yaml +include geofetch/templates/looper_sra_convert.yaml +include geofetch/templates/looper_config_template.yaml +include geofetch/templates/pipeline_interface_convert.yaml diff --git a/README.md b/README.md index 3e4ca98..9188cd1 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,12 @@ or install the latest version from the GitHub repository: pip install git+https://github.com/pepkit/geofetch.git ``` +## All GEO projects (GSE + GSM) in PEP format. + +All GEO projects are available in PEPhub under geo namespace: https://pephub.databio.org/geo/ . 
+Users can search for GEO projects using the search bar, or download an archive with all GEO PEPs from the archive section of the namespace: +[https://pephub.databio.org/geo?view=archive](https://pephub.databio.org/geo?view=archive) + ## How to cite: https://doi.org/10.1093/bioinformatics/btad069 diff --git a/geofetch/__init__.py b/geofetch/__init__.py index 2e3620e..e1b75f9 100644 --- a/geofetch/__init__.py +++ b/geofetch/__init__.py @@ -1,4 +1,4 @@ -""" Package-level data """ +"""Package-level data""" import coloredlogs import logmuse diff --git a/geofetch/_version.py b/geofetch/_version.py index 6ece8ad..5e44a42 100644 --- a/geofetch/_version.py +++ b/geofetch/_version.py @@ -1 +1 @@ -__version__ = "0.12.7" +__version__ = "0.12.8" diff --git a/geofetch/cli.py b/geofetch/cli.py index 168b2a3..a56407c 100644 --- a/geofetch/cli.py +++ b/geofetch/cli.py @@ -2,6 +2,7 @@ import os import logmuse +from ubiquerg import VersionInHelpParser from geofetch._version import __version__ @@ -15,7 +16,7 @@ def _parse_cmdl(cmdl): """ parser """ - parser = argparse.ArgumentParser( + parser = VersionInHelpParser( description="Automatic GEO and SRA data downloader", usage="""geofetch [] @@ -26,15 +27,12 @@ def _parse_cmdl(cmdl): geofetch -i GSE67303 --processed --geo-folder -m """, + version=__version__, ) processed_group = parser.add_argument_group("processed") raw_group = parser.add_argument_group("raw") - parser.add_argument( - "-V", "--version", action="version", version=f"%(prog)s {__version__}" - ) - # Required parser.add_argument( "-i", diff --git a/geofetch/const.py b/geofetch/const.py index 2267223..e018920 100644 --- a/geofetch/const.py +++ b/geofetch/const.py @@ -44,9 +44,14 @@ NEW_GENOME_COL_NAME = "ref_genome" +TEMPLATES_DIR = "templates" CONFIG_PROCESSED_TEMPLATE_NAME = "config_processed_template.yaml" CONFIG_RAW_TEMPLATE_NAME = "config_template.yaml" -CONFIG_SRA_TEMPLATE = "looper_sra_convert.yaml" +CONFIG_SRA_TEMPLATE_NAME = "looper_sra_convert.yaml" 
+PIPELINE_INTERFACE_CONVERT_TEMPLATE_NAME = "pipeline_interface_convert.yaml" +LOOPER_SRA_CONVERT = "looper_config_template.yaml" +# SRA_CONVERT_SCHEMA_NAME = "sra_convert_schema.yaml" +# RESOURCES_NAME = "resources.tsv" # const for Finder: RETMAX = 10000000 # once it should be increased @@ -63,3 +68,5 @@ '+AND+("{start_date}"[Publication%20Date]%20:%20"{end_date}"[Publication%20Date])' ) THREE_MONTH_FILTER = '+AND+"published+last+3+months"[Filter]' + +LOOPER_CONFIG_FILE_NAME = "looper_config.yaml" diff --git a/geofetch/geofetch.py b/geofetch/geofetch.py index b490428..e841716 100755 --- a/geofetch/geofetch.py +++ b/geofetch/geofetch.py @@ -20,7 +20,7 @@ from geofetch.const import ( CONFIG_PROCESSED_TEMPLATE_NAME, CONFIG_RAW_TEMPLATE_NAME, - CONFIG_SRA_TEMPLATE, + CONFIG_SRA_TEMPLATE_NAME, EXP_SUPP_METADATA_FILE, EXPERIMENT_PATTERN, FILE_RAW_NAME_SAMPLE_PATTERN, @@ -34,6 +34,10 @@ SAMPLE_SUPP_METADATA_FILE, SER_SUPP_FILE_PATTERN, SUPP_FILE_PATTERN, + TEMPLATES_DIR, + PIPELINE_INTERFACE_CONVERT_TEMPLATE_NAME, + LOOPER_SRA_CONVERT, + LOOPER_CONFIG_FILE_NAME, ) from geofetch.utils import ( Accession, @@ -867,6 +871,8 @@ def _expand_metadata_list(self, metadata_list: list) -> list: _LOGGER.info("Expanding metadata list...") list_of_keys = _get_list_of_keys(metadata_list) for key_in_list in list_of_keys: + if key_in_list == "Sample_characteristics_ch1": + pass metadata_list = self._expand_metadata_list_item(metadata_list, key_in_list) return metadata_list @@ -881,7 +887,13 @@ def _expand_metadata_list_item(self, metadata_list: list, dict_key: str): """ try: element_is_list = any( - isinstance(list_item.get(dict_key), list) for list_item in metadata_list + isinstance(list_item.get(dict_key), list) + or ( + len(list_item.get(dict_key).split(": ")) == 2 + if list_item.get(dict_key) + else False + ) + for list_item in metadata_list ) # # checking if some items have two keys: @@ -900,6 +912,8 @@ def _expand_metadata_list_item(self, metadata_list: list, dict_key: str): 
metadata_list[n_elem][dict_key] = [ metadata_list[n_elem][dict_key] ] + else: + pass just_string = False this_string = "" @@ -1087,7 +1101,7 @@ def _find_genome(metadata_list: list) -> list: sample_genome = "" for key in proj_gen_keys: sample_genome = " ".join([sample_genome, sample[1][key]]) - metadata_list[sample[0]][NEW_GENOME_COL_NAME] = sample_genome + metadata_list[sample[0]][NEW_GENOME_COL_NAME] = sample_genome.strip() return metadata_list def _write_raw_annotation_new( @@ -1161,11 +1175,43 @@ def _write_raw_annotation_new( if len(subannot_dict) > 0: self._write_subannotation(subannot_dict, proj_root_subsample) - self._write(proj_root_yaml, template, msg_pre=" Config file: ") + self._write(proj_root_yaml, template, msg_pre="Config file: ") if self.add_dotfile: _create_dot_yaml(dot_yaml_path, yaml_name) + if self.add_convert_modifier: + geofetchdir = os.path.dirname(__file__) + pipeline_interface_convert_path = os.path.join( + geofetchdir, TEMPLATES_DIR, PIPELINE_INTERFACE_CONVERT_TEMPLATE_NAME + ) + + looper_config_template_path = os.path.join( + geofetchdir, TEMPLATES_DIR, LOOPER_SRA_CONVERT + ) + + with open(looper_config_template_path, "r") as template_file: + template_looper = template_file.read() + + template_values = { + "pep_config": proj_root_yaml, + "output_dir": os.path.join(self.metadata_root_full, "output_dir"), + "pipeline_interface_convert": pipeline_interface_convert_path, + } + + for k, v in template_values.items(): + placeholder = "{" + str(k) + "}" + template_looper = template_looper.replace(placeholder, str(v)) + + looper_config_file = os.path.join( + self.metadata_root_full, + LOOPER_CONFIG_FILE_NAME, + ) + + self._write( + looper_config_file, template_looper, msg_pre="Looper config file: " + ) + else: meta_df = pd.DataFrame.from_dict(metadata_dict, orient="index") @@ -1204,8 +1250,11 @@ def _create_config_processed( :param meta_in_series: :return: generated, complete config file content """ + geofetchdir = os.path.dirname(__file__) - 
config_template = os.path.join(geofetchdir, CONFIG_PROCESSED_TEMPLATE_NAME) + config_template = os.path.join( + geofetchdir, TEMPLATES_DIR, CONFIG_PROCESSED_TEMPLATE_NAME + ) with open(config_template, "r") as template_file: template = template_file.read() meta_list_str = [ @@ -1260,9 +1309,13 @@ def _create_config_raw( else: sample_modifier_str = "" if not self.config_template: - self.config_template = os.path.join(geofetchdir, CONFIG_RAW_TEMPLATE_NAME) + self.config_template = os.path.join( + geofetchdir, TEMPLATES_DIR, CONFIG_RAW_TEMPLATE_NAME + ) if self.add_convert_modifier: - sra_convert_path = os.path.join(geofetchdir, CONFIG_SRA_TEMPLATE) + sra_convert_path = os.path.join( + geofetchdir, TEMPLATES_DIR, CONFIG_SRA_TEMPLATE_NAME + ) with open(sra_convert_path, "r") as template_file: sra_convert_template = template_file.read() else: @@ -1291,6 +1344,7 @@ def _create_config_raw( for k, v in template_values.items(): placeholder = "{" + str(k) + "}" template = template.replace(placeholder, str(v)) + return template @staticmethod diff --git a/geofetch/sraconvert.py b/geofetch/sraconvert.py index 6e64a9a..7b05a34 100755 --- a/geofetch/sraconvert.py +++ b/geofetch/sraconvert.py @@ -2,7 +2,7 @@ import os import sys -from argparse import ArgumentParser +from ubiquerg import VersionInHelpParser import logmuse import pypiper @@ -15,7 +15,7 @@ def _parse_cmdl(cmdl): provides convenience functions for converting or deleting sra data in various formats. 
""" - parser = ArgumentParser(description=description) + parser = VersionInHelpParser(description=description) # parser = pypiper.add_pypiper_args(parser, args=["output-parent"]) parser.add_argument( "-m", @@ -72,6 +72,9 @@ def _parse_cmdl(cmdl): help="Name for sample to run", metavar="SAMPLE_NAME", ) + parser.add_argument( + "-V", "--version", action="version", version=f"%(prog)s {__version__}" + ) parser.add_argument("-r", "--srr", required=True, nargs="+", help="SRR files") diff --git a/geofetch/config_processed_template.yaml b/geofetch/templates/config_processed_template.yaml similarity index 100% rename from geofetch/config_processed_template.yaml rename to geofetch/templates/config_processed_template.yaml diff --git a/geofetch/config_template.yaml b/geofetch/templates/config_template.yaml similarity index 100% rename from geofetch/config_template.yaml rename to geofetch/templates/config_template.yaml diff --git a/geofetch/templates/looper_config_template.yaml b/geofetch/templates/looper_config_template.yaml new file mode 100644 index 0000000..21a4dd0 --- /dev/null +++ b/geofetch/templates/looper_config_template.yaml @@ -0,0 +1,4 @@ +pep_config: {pep_config} +output_dir: {output_dir} +pipeline_interfaces: + - {pipeline_interface_convert} \ No newline at end of file diff --git a/geofetch/looper_sra_convert.yaml b/geofetch/templates/looper_sra_convert.yaml similarity index 50% rename from geofetch/looper_sra_convert.yaml rename to geofetch/templates/looper_sra_convert.yaml index 94525f1..861e423 100644 --- a/geofetch/looper_sra_convert.yaml +++ b/geofetch/templates/looper_sra_convert.yaml @@ -4,7 +4,7 @@ derive: attributes: [read1, read2, SRR_files] sources: - SRA: "${SRABAM}/{srr}.bam" + SRA: "${SRARAW}/{srr}/{srr}.sra" FQ: "${SRAFQ}/{srr}.fastq.gz" FQ1: "${SRAFQ}/{srr}_1.fastq.gz" FQ2: "${SRAFQ}/{srr}_2.fastq.gz" @@ -26,20 +26,3 @@ read_type: "SINGLE" then: read1: FQ1 - -project_modifiers: - amend: - sra_convert: - looper: - results_subdir: sra_convert_results 
- sample_modifiers: - append: - SRR_files: SRA - pipeline_interfaces: ${CODE}/geofetch/pipeline_interface_convert.yaml - derive: - attributes: [read1, read2, SRR_files] - sources: - SRA: "${SRARAW}/{srr}/{srr}.sra" - FQ: "${SRAFQ}/{srr}.fastq.gz" - FQ1: "${SRAFQ}/{srr}_1.fastq.gz" - FQ2: "${SRAFQ}/{srr}_2.fastq.gz" diff --git a/geofetch/templates/pipeline_interface_convert.yaml b/geofetch/templates/pipeline_interface_convert.yaml new file mode 100644 index 0000000..9ec2217 --- /dev/null +++ b/geofetch/templates/pipeline_interface_convert.yaml @@ -0,0 +1,12 @@ +pipeline_name: sra_convert +path: sraconvert +input_schema: ./sra_convert_schema.yaml +sample_interface: + command_template: > + {pipeline.path} --srr {sample.SRR_files} + {% if sample.SRX is defined %} --sample-name {sample.SRX} {% endif %} + {% if project.fqfolder is defined %} --fqfolder {project.fqfolder} {% endif %} + -O {looper.results_subdir} +compute: + bulker_crate: databio/sra_convert + size_dependent_variables: ./resources.tsv diff --git a/resources.tsv b/geofetch/templates/resources.tsv similarity index 100% rename from resources.tsv rename to geofetch/templates/resources.tsv diff --git a/sra_convert_schema.yaml b/geofetch/templates/sra_convert_schema.yaml similarity index 100% rename from sra_convert_schema.yaml rename to geofetch/templates/sra_convert_schema.yaml diff --git a/geofetch/utils.py b/geofetch/utils.py index c006e7b..66bdd88 100644 --- a/geofetch/utils.py +++ b/geofetch/utils.py @@ -1,4 +1,4 @@ -""" Independently-importable utilities to circumvent true scripts. 
""" +"""Independently-importable utilities to circumvent true scripts.""" import csv import logging diff --git a/pipeline_interface_convert.yaml b/pipeline_interface_convert.yaml deleted file mode 100644 index 33e51cc..0000000 --- a/pipeline_interface_convert.yaml +++ /dev/null @@ -1,12 +0,0 @@ -pipeline_name: sra_convert -pipeline_type: sample -path: sraconvert -input_schema: sra_convert_schema.yaml -command_template: > - {pipeline.path} --srr {sample.SRR_files} - {% if sample.SRX is defined %} --sample-name {sample.SRX} {% endif %} - {% if project.fqfolder is defined %} --fqfolder {project.fqfolder} {% endif %} - -O {looper.results_subdir} -compute: - bulker_crate: databio/sra_convert - size_dependent_variables: resources.tsv diff --git a/pipeline_interface_convert_v1.yaml b/pipeline_interface_convert_v1.yaml deleted file mode 100644 index bbaddc9..0000000 --- a/pipeline_interface_convert_v1.yaml +++ /dev/null @@ -1,31 +0,0 @@ -protocol_mapping: - "*": convert - -pipelines: - convert: - name: convert - path: sraconvert - required_input_files: SRR_files - arguments: - "--srr": SRR_files - resources: - default: - file_size: "0" - cores: "1" - mem: "4000" - time: "0-06:00:00" - package1: - file_size: "15" - cores: "1" - mem: "4000" - time: "0-12:00:00" - package2: - file_size: "100" - cores: "1" - mem: "4000" - time: "1-00:00:00" - package3: - file_size: "300" - cores: "1" - mem: "4000" - time: "2-00:00:00" diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index f9b4326..9440999 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -7,3 +7,4 @@ pandas>=1.5.3 peppy>=0.40.6 rich>=12.5.1 coloredlogs>=15.0.1 +piper>=0.14.4