Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/black.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
- uses: psf/black@stable
2 changes: 1 addition & 1 deletion .github/workflows/python-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@ jobs:
run: |
python setup.py sdist bdist_wheel
- name: Publish package distributions to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
uses: pypa/gh-action-pypi-publish@release/v1
9 changes: 6 additions & 3 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
include requirements/*
include README.md
include docs/img/geofetch_logo.svg
include geofetch/config_template.yaml
include geofetch/config_processed_template.yaml
include geofetch/looper_sra_convert.yaml
include geofetch/templates/*
include geofetch/templates/config_template.yaml
include geofetch/templates/config_processed_template.yaml
include geofetch/templates/looper_sra_convert.yaml
include geofetch/templates/looper_config_template.yaml
include geofetch/templates/pipeline_interface_convert.yaml
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@ or install the latest version from the GitHub repository:
pip install git+https://github.com/pepkit/geofetch.git
```

## All GEO projects (GSE + GSM) in PEP format.

All GEO projects are available on PEPhub under the `geo` namespace: [https://pephub.databio.org/geo/](https://pephub.databio.org/geo/).
Users can search for GEO projects using the search bar, or download an archive containing all GEO PEPs from the archive section of the namespace:
[https://pephub.databio.org/geo?view=archive](https://pephub.databio.org/geo?view=archive)


## How to cite:
https://doi.org/10.1093/bioinformatics/btad069
Expand Down
2 changes: 1 addition & 1 deletion geofetch/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Package-level data """
"""Package-level data"""

import coloredlogs
import logmuse
Expand Down
2 changes: 1 addition & 1 deletion geofetch/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.12.7"
__version__ = "0.12.8"
8 changes: 3 additions & 5 deletions geofetch/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os

import logmuse
from ubiquerg import VersionInHelpParser

from geofetch._version import __version__

Expand All @@ -15,7 +16,7 @@ def _parse_cmdl(cmdl):
"""
parser
"""
parser = argparse.ArgumentParser(
parser = VersionInHelpParser(
description="Automatic GEO and SRA data downloader",
usage="""geofetch [<args>]

Expand All @@ -26,15 +27,12 @@ def _parse_cmdl(cmdl):
geofetch -i GSE67303 --processed --geo-folder <folder> -m <folder>

""",
version=__version__,
)

processed_group = parser.add_argument_group("processed")
raw_group = parser.add_argument_group("raw")

parser.add_argument(
"-V", "--version", action="version", version=f"%(prog)s {__version__}"
)

# Required
parser.add_argument(
"-i",
Expand Down
9 changes: 8 additions & 1 deletion geofetch/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,14 @@

NEW_GENOME_COL_NAME = "ref_genome"

TEMPLATES_DIR = "templates"
CONFIG_PROCESSED_TEMPLATE_NAME = "config_processed_template.yaml"
CONFIG_RAW_TEMPLATE_NAME = "config_template.yaml"
CONFIG_SRA_TEMPLATE = "looper_sra_convert.yaml"
CONFIG_SRA_TEMPLATE_NAME = "looper_sra_convert.yaml"
PIPELINE_INTERFACE_CONVERT_TEMPLATE_NAME = "pipeline_interface_convert.yaml"
LOOPER_SRA_CONVERT = "looper_config_template.yaml"
# SRA_CONVERT_SCHEMA_NAME = "sra_convert_schema.yaml"
# RESOURCES_NAME = "resources.tsv"

# const for Finder:
RETMAX = 10000000 # once it should be increased
Expand All @@ -63,3 +68,5 @@
'+AND+("{start_date}"[Publication%20Date]%20:%20"{end_date}"[Publication%20Date])'
)
THREE_MONTH_FILTER = '+AND+"published+last+3+months"[Filter]'

LOOPER_CONFIG_FILE_NAME = "looper_config.yaml"
68 changes: 61 additions & 7 deletions geofetch/geofetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from geofetch.const import (
CONFIG_PROCESSED_TEMPLATE_NAME,
CONFIG_RAW_TEMPLATE_NAME,
CONFIG_SRA_TEMPLATE,
CONFIG_SRA_TEMPLATE_NAME,
EXP_SUPP_METADATA_FILE,
EXPERIMENT_PATTERN,
FILE_RAW_NAME_SAMPLE_PATTERN,
Expand All @@ -34,6 +34,10 @@
SAMPLE_SUPP_METADATA_FILE,
SER_SUPP_FILE_PATTERN,
SUPP_FILE_PATTERN,
TEMPLATES_DIR,
PIPELINE_INTERFACE_CONVERT_TEMPLATE_NAME,
LOOPER_SRA_CONVERT,
LOOPER_CONFIG_FILE_NAME,
)
from geofetch.utils import (
Accession,
Expand Down Expand Up @@ -867,6 +871,8 @@ def _expand_metadata_list(self, metadata_list: list) -> list:
_LOGGER.info("Expanding metadata list...")
list_of_keys = _get_list_of_keys(metadata_list)
for key_in_list in list_of_keys:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is minor, but I'd recommend simplifying variable names like this to `key` instead of `key_in_list`. Given the logic, the "in list" part is implied, so the name is a bit redundant.

if key_in_list == "Sample_characteristics_ch1":
pass
metadata_list = self._expand_metadata_list_item(metadata_list, key_in_list)
return metadata_list

Expand All @@ -881,7 +887,13 @@ def _expand_metadata_list_item(self, metadata_list: list, dict_key: str):
"""
try:
element_is_list = any(
isinstance(list_item.get(dict_key), list) for list_item in metadata_list
isinstance(list_item.get(dict_key), list)
or (
len(list_item.get(dict_key).split(": ")) == 2
if list_item.get(dict_key)
else False
)
for list_item in metadata_list
)

# # checking if some items have two keys:
Expand All @@ -900,6 +912,8 @@ def _expand_metadata_list_item(self, metadata_list: list, dict_key: str):
metadata_list[n_elem][dict_key] = [
metadata_list[n_elem][dict_key]
]
else:
pass

just_string = False
this_string = ""
Expand Down Expand Up @@ -1087,7 +1101,7 @@ def _find_genome(metadata_list: list) -> list:
sample_genome = ""
for key in proj_gen_keys:
sample_genome = " ".join([sample_genome, sample[1][key]])
metadata_list[sample[0]][NEW_GENOME_COL_NAME] = sample_genome
metadata_list[sample[0]][NEW_GENOME_COL_NAME] = sample_genome.strip()
return metadata_list

def _write_raw_annotation_new(
Expand Down Expand Up @@ -1161,11 +1175,43 @@ def _write_raw_annotation_new(
if len(subannot_dict) > 0:
self._write_subannotation(subannot_dict, proj_root_subsample)

self._write(proj_root_yaml, template, msg_pre=" Config file: ")
self._write(proj_root_yaml, template, msg_pre="Config file: ")

if self.add_dotfile:
_create_dot_yaml(dot_yaml_path, yaml_name)

if self.add_convert_modifier:
geofetchdir = os.path.dirname(__file__)
pipeline_interface_convert_path = os.path.join(
geofetchdir, TEMPLATES_DIR, PIPELINE_INTERFACE_CONVERT_TEMPLATE_NAME
)

looper_config_template_path = os.path.join(
geofetchdir, TEMPLATES_DIR, LOOPER_SRA_CONVERT
)

with open(looper_config_template_path, "r") as template_file:
template_looper = template_file.read()

template_values = {
"pep_config": proj_root_yaml,
"output_dir": os.path.join(self.metadata_root_full, "output_dir"),
"pipeline_interface_convert": pipeline_interface_convert_path,
}

for k, v in template_values.items():
placeholder = "{" + str(k) + "}"
template_looper = template_looper.replace(placeholder, str(v))

looper_config_file = os.path.join(
self.metadata_root_full,
LOOPER_CONFIG_FILE_NAME,
)

self._write(
looper_config_file, template_looper, msg_pre="Looper config file: "
)

else:
meta_df = pd.DataFrame.from_dict(metadata_dict, orient="index")

Expand Down Expand Up @@ -1204,8 +1250,11 @@ def _create_config_processed(
:param meta_in_series:
:return: generated, complete config file content
"""

geofetchdir = os.path.dirname(__file__)
config_template = os.path.join(geofetchdir, CONFIG_PROCESSED_TEMPLATE_NAME)
config_template = os.path.join(
geofetchdir, TEMPLATES_DIR, CONFIG_PROCESSED_TEMPLATE_NAME
)
with open(config_template, "r") as template_file:
template = template_file.read()
meta_list_str = [
Expand Down Expand Up @@ -1260,9 +1309,13 @@ def _create_config_raw(
else:
sample_modifier_str = ""
if not self.config_template:
self.config_template = os.path.join(geofetchdir, CONFIG_RAW_TEMPLATE_NAME)
self.config_template = os.path.join(
geofetchdir, TEMPLATES_DIR, CONFIG_RAW_TEMPLATE_NAME
)
if self.add_convert_modifier:
sra_convert_path = os.path.join(geofetchdir, CONFIG_SRA_TEMPLATE)
sra_convert_path = os.path.join(
geofetchdir, TEMPLATES_DIR, CONFIG_SRA_TEMPLATE_NAME
)
with open(sra_convert_path, "r") as template_file:
sra_convert_template = template_file.read()
else:
Expand Down Expand Up @@ -1291,6 +1344,7 @@ def _create_config_raw(
for k, v in template_values.items():
placeholder = "{" + str(k) + "}"
template = template.replace(placeholder, str(v))

return template

@staticmethod
Expand Down
7 changes: 5 additions & 2 deletions geofetch/sraconvert.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os
import sys
from argparse import ArgumentParser
from ubiquerg import VersionInHelpParser
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did we change from using ArgumentParser to a class from ubiquerg?


import logmuse
import pypiper
Expand All @@ -15,7 +15,7 @@ def _parse_cmdl(cmdl):
provides convenience functions for converting or deleting sra data in
various formats.
"""
parser = ArgumentParser(description=description)
parser = VersionInHelpParser(description=description)
# parser = pypiper.add_pypiper_args(parser, args=["output-parent"])
parser.add_argument(
"-m",
Expand Down Expand Up @@ -72,6 +72,9 @@ def _parse_cmdl(cmdl):
help="Name for sample to run",
metavar="SAMPLE_NAME",
)
parser.add_argument(
"-V", "--version", action="version", version=f"%(prog)s {__version__}"
)

parser.add_argument("-r", "--srr", required=True, nargs="+", help="SRR files")

Expand Down
File renamed without changes.
4 changes: 4 additions & 0 deletions geofetch/templates/looper_config_template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
pep_config: {pep_config}
output_dir: {output_dir}
pipeline_interfaces:
- {pipeline_interface_convert}
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
derive:
attributes: [read1, read2, SRR_files]
sources:
SRA: "${SRABAM}/{srr}.bam"
SRA: "${SRARAW}/{srr}/{srr}.sra"
FQ: "${SRAFQ}/{srr}.fastq.gz"
FQ1: "${SRAFQ}/{srr}_1.fastq.gz"
FQ2: "${SRAFQ}/{srr}_2.fastq.gz"
Expand All @@ -26,20 +26,3 @@
read_type: "SINGLE"
then:
read1: FQ1

project_modifiers:
amend:
sra_convert:
looper:
results_subdir: sra_convert_results
sample_modifiers:
append:
SRR_files: SRA
pipeline_interfaces: ${CODE}/geofetch/pipeline_interface_convert.yaml
derive:
attributes: [read1, read2, SRR_files]
sources:
SRA: "${SRARAW}/{srr}/{srr}.sra"
FQ: "${SRAFQ}/{srr}.fastq.gz"
FQ1: "${SRAFQ}/{srr}_1.fastq.gz"
FQ2: "${SRAFQ}/{srr}_2.fastq.gz"
12 changes: 12 additions & 0 deletions geofetch/templates/pipeline_interface_convert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
pipeline_name: sra_convert
path: sraconvert
input_schema: ./sra_convert_schema.yaml
sample_interface:
command_template: >
{pipeline.path} --srr {sample.SRR_files}
{% if sample.SRX is defined %} --sample-name {sample.SRX} {% endif %}
{% if project.fqfolder is defined %} --fqfolder {project.fqfolder} {% endif %}
-O {looper.results_subdir}
compute:
bulker_crate: databio/sra_convert
size_dependent_variables: ./resources.tsv
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion geofetch/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Independently-importable utilities to circumvent true scripts. """
"""Independently-importable utilities to circumvent true scripts."""

import csv
import logging
Expand Down
12 changes: 0 additions & 12 deletions pipeline_interface_convert.yaml

This file was deleted.

31 changes: 0 additions & 31 deletions pipeline_interface_convert_v1.yaml

This file was deleted.

1 change: 1 addition & 0 deletions requirements/requirements-all.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ pandas>=1.5.3
peppy>=0.40.6
rich>=12.5.1
coloredlogs>=15.0.1
piper>=0.14.4
Loading