From 714038533e0a87c578c5fcee17066c75a2a352db Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 9 Jun 2025 13:22:42 +0300 Subject: [PATCH 1/5] fix typo --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 43e0a8b..d6461cc 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,6 @@ X9,0,3,2,16,7,1,23,12,10,9,1,2,134,40,390,289,29,372,27,81,150,90,9,88,32,287,88 ``` Sample_id,taxon,rel_abund_perc -Sample_id,taxon,rel_abund_perc X1,Pseudomonadota,85.03558294577552 X1,Bacillota,10.72121619814011 X1,Other (<4.0%),4.243200856084384 From bb95d84e18b031eb45700d71af14b9cfa7262d6b Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 23 Jun 2025 15:30:30 +0300 Subject: [PATCH 2/5] add diversity --- krakenparser/diversity.py | 114 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 krakenparser/diversity.py diff --git a/krakenparser/diversity.py b/krakenparser/diversity.py new file mode 100644 index 0000000..2fe68f5 --- /dev/null +++ b/krakenparser/diversity.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python + +import pandas as pd +import numpy as np +import sys +import shutil +import argparse +from pathlib import Path +from skbio.diversity import beta_diversity +from skbio.stats import subsample_counts + + +# Define Shannon index +def shannon_index(counts): + counts = np.array(counts) + counts = counts[counts > 0] + proportions = counts / counts.sum() + return -np.sum(proportions * np.log(proportions)) + + +# Define Pielou's evenness +def pielou_evenness(counts): + counts = np.array(counts) + counts = counts[counts > 0] + H = shannon_index(counts) + S = len(counts) + return H / np.log(S) if S > 1 else 0 + + +# Define Chao1 richness estimator +def chao1_index(counts): + counts = np.array(counts) + S_obs = np.sum(counts > 0) + F1 = np.sum(counts == 1) + F2 = np.sum(counts == 2) + if F2 == 0: + return S_obs + F1 * (F1 - 1) / 2 + return S_obs + (F1 * F1) / (2 * F2) + + +def calc_alpha_div(df, output_path): + results = [] + for sample_id, row in df.iterrows(): + counts = row.values + results.append( + { + "Sample": sample_id, + "Shannon": shannon_index(counts), + "Pielou": pielou_evenness(counts), + "Chao1": chao1_index(counts), + } + ) + alpha_df = pd.DataFrame(results).set_index("Sample") + alpha_df.to_csv(output_path / "alpha_div.csv") + + +def calc_beta_div(df, output_path, rarefaction_depth): + rarefied_counts = [] + sample_ids = [] + + for sample, row in df.iterrows(): + counts = row.values.astype(int) + if counts.sum() >= rarefaction_depth: + rarefied = subsample_counts(counts, n=rarefaction_depth) + rarefied_counts.append(rarefied) + sample_ids.append(sample) + + if len(rarefied_counts) < 2: + raise ValueError("Not enough samples passed the rarefaction threshold.") + + bray_df = beta_diversity( + "braycurtis", rarefied_counts, ids=sample_ids + ).to_data_frame() + jaccard_df = beta_diversity( + "jaccard", rarefied_counts, ids=sample_ids + ).to_data_frame() + + bray_df.to_csv(output_path / "beta_div_bray.csv") + jaccard_df.to_csv(output_path / "beta_div_jaccard.csv") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Calculate α & β-diversities.") + parser.add_argument( + "-i", + "--input", + required=True, + help="Input total count table CSV (species level).", + ) + parser.add_argument("-o", "--output", required=True, help="Output directory path.") + parser.add_argument( + "-d", + "--depth", + type=int, + default=1000, + help="Rarefaction depth for β diversity (default: 1000).", + ) + args = parser.parse_args() + + input_file = Path(args.input) + output_dir = Path(args.output) + output_dir.mkdir(parents=True, exist_ok=True) + + df = pd.read_csv(input_file, index_col=0) + + calc_alpha_div(df, output_dir) + calc_beta_div(df, output_dir, args.depth) + print( + f"α & β-diversities have been successfully calculated and saved to '{output_dir}'." + ) + + pycache_dir = Path(__file__).resolve().parent / "__pycache__" + if pycache_dir.exists() and pycache_dir.is_dir(): + shutil.rmtree(pycache_dir) From 187dfd2cc2bc7ff7c97fc1a4f7463c9bf8c156fd Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 23 Jun 2025 15:31:21 +0300 Subject: [PATCH 3/5] upd readme --- README.md | 104 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 84 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index d6461cc..308916b 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ X9,0,3,2,16,7,1,23,12,10,9,1,2,134,40,390,289,29,372,27,81,150,90,9,88,32,287,88 ### Relative abundance output -`counts_phylum.csv` parsed from 7 kraken2 reports of metagenomic samples using `KrakenParser`: +`ra_phylum.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`: ``` Sample_id,taxon,rel_abund_perc @@ -58,6 +58,43 @@ X9,Bacillota,12.473649123439218 X9,Other (<4.0%),1.8979510606688494 ``` +### α-diversity output + +`alpha_div.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`: + +``` +Sample,Shannon,Pielou,Chao1 +X1,3.911345447107001,0.5269245043289149,2274.533185840708 +X2,3.9944130792536563,0.4906424221265042,4155.0 +... +X8,3.442077115880119,0.42753293021330063,4177.251358695652 +X9,4.033664950188261,0.5050385978575492,3492.16 +``` + +### β-diversity output + +`beta_div_bray.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`: + +``` +,X1,X2,...,X8,X9 +X1,0.0,0.398,...,0.61,0.353 +X2,0.398,0.0,...,0.723,0.388 +... +X8,0.61,0.723,...,0.0,0.665 +X9,0.353,0.388,...,0.665,0.0 +``` + +`beta_div_jaccard.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`: + +``` +,X1,X2,...,X8,X9 +X1,0.0,0.7073170731707317,...,0.8223938223938224,0.7232472324723247 +X2,0.7073170731707317,0.0,...,0.835016835016835,0.7352941176470589 +... +X8,0.8223938223938224,0.835016835016835,...,0.0,0.8066914498141264 +X9,0.7232472324723247,0.7352941176470589,...,0.8066914498141264,0.0 +``` + ### Visualization examples gallery |[Stacked Barplot](https://github.com/PopovIILab/KrakenParser/wiki/Stacked-Barplot-API)|[Streamgraph](https://github.com/PopovIILab/KrakenParser/wiki/Streamgraph-API)| @@ -81,6 +118,7 @@ This will: 4. Process extracted text files 5. Convert them into CSV format 6. Calculate relative abundance +7. Calculate α & β-diversities ### **Input Requirements** - The Kraken2 reports must be inside a **subdirectory** (e.g., `data/kreports`). @@ -159,6 +197,22 @@ KrakenParser --relabund -i data/counts/csv/counts_phylum.csv -o data/counts/csv_ This will group all the taxa that have abundance <3.5 into "Other <3.5%" group. Other parameters are welcome! +### **Step 7: Calculate α & β-diversities** +```bash +KrakenParser --diversity -i data/counts/csv/counts_species.csv -o data/diversity +#Having troubles? Run KrakenParser --diversity -h +``` + +This calculates α & β-diversities and saves them as CSV format to directory provided in the output. + +If user wants to use another depth for β-diversity calculations: +```bash +KrakenParser --diversity -i data/counts/csv/counts_species.csv -o data/diversity --depth 750 +#Having troubles? Run KrakenParser --diversity -h +``` + +Other parameters are welcome! + ## Arguments Breakdown ### **KrakenParser** (Main Pipeline) - Automates the entire workflow. @@ -189,29 +243,39 @@ This will group all the taxa that have abundance <3.5 into "Other <3.5%" group. - Calculates relative abundance based on total abundance CSV. - Optionally can group low abundant taxa. +### **--diversity** (Step 7) +- Calculates α & β-diversities based on total species abundance CSV. +- Shannon, Pielou & Chao1 indices for α-diversity +- Bray-Curtis & Jaccard indices for β-diversity +- Uses 1000 depth for β-diversity as default (can be adjusted with -d) + ## Example Output Structure After running the full pipeline, the output directory will look like this: ``` data/ -├─ kreports/ # Input Kraken2 reports -├─ mpa/ # Converted MPA files -├─ COMBINED.txt # Merged MPA file -└─ counts/ - ├─ txt/ # Extracted taxonomic levels in TXT - │ ├─ counts_species.txt - │ ├─ counts_genus.txt - │ ├─ counts_family.txt - │ ├─ ... - └─ csv/ # Total abundance CSV output - │ ├─ counts_species.csv - │ ├─ counts_genus.csv - │ ├─ counts_family.csv - │ ├─ ... - └─ csv_relabund/ # Relative abundance CSV output - ├─ counts_species.csv - ├─ counts_genus.csv - ├─ counts_family.csv - ├─ ... +├─ kreports/ # Input Kraken2 reports +├─ mpa/ # Converted MPA files +├─ COMBINED.txt # Merged MPA file +├─ counts/ +│ ├─ txt/ # Extracted taxonomic levels in TXT +│ │ ├─ counts_species.txt +│ │ ├─ counts_genus.txt +│ │ ├─ counts_family.txt +│ │ ├─ ... +│ └─ csv/ # Total abundance CSV output +│ │ ├─ counts_species.csv +│ │ ├─ counts_genus.csv +│ │ ├─ counts_family.csv +│ │ ├─ ... +├─ rel_abund/ # Relative abundance CSV output +│ ├─ ra_species.csv +│ ├─ ra_genus.csv +│ ├─ ra_family.csv +│ ├─ ... +└─ diversity/ + ├─ alpha_div.csv + ├─ beta_div_bray.csv + └─ beta_div_jaccard.csv ``` ## Conclusion From aeca82878acbba13ea15732ee6fba4e7de132713 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 23 Jun 2025 15:34:14 +0300 Subject: [PATCH 4/5] add diversity function --- krakenparser/decombine.sh | 1 - krakenparser/kraken2csv.sh | 12 ++++++++++-- krakenparser/krakenparser.py | 6 ++++++ krakenparser/processing_script.py | 2 ++ krakenparser/version.py | 2 +- 5 files changed, 19 insertions(+), 4 deletions(-) diff --git a/krakenparser/decombine.sh b/krakenparser/decombine.sh index 6053a8b..0832018 100755 --- a/krakenparser/decombine.sh +++ b/krakenparser/decombine.sh @@ -55,7 +55,6 @@ fi # Create destination directories mkdir -p "${DESTINATION_DIR}/txt" mkdir -p "${DESTINATION_DIR}/csv" -mkdir -p "${DESTINATION_DIR}/csv_relabund" # Process input file and generate output files grep -E "s__" "${SOURCE_FILE}" \ diff --git a/krakenparser/kraken2csv.sh b/krakenparser/kraken2csv.sh index bb2729d..5b53f75 100755 --- a/krakenparser/kraken2csv.sh +++ b/krakenparser/kraken2csv.sh @@ -62,7 +62,6 @@ for file in "$COUNTS_DIR"/txt/counts_*.txt; do echo "Error: Failed to process $file" exit 1 fi - echo "Processed $file successfully." done # PART 5: CONVERT TXT FILES TO CSV @@ -74,9 +73,18 @@ done # PART 6: CALCULATE RELATIVE ABUNDANCE +mkdir -p "$PARENT_DIR/rel_abund" + for file in "$COUNTS_DIR"/csv/counts_*.csv; do - CSV_RA_FILE="$COUNTS_DIR/csv_relabund/$(basename "$file")" + base=$(basename "$file") + new_name="${base/counts_/ra_}" + CSV_RA_FILE="$PARENT_DIR/rel_abund/$new_name" python "$SCRIPT_DIR/relabund.py" -i "$file" -o "$CSV_RA_FILE" done +# PART 7: CALCULATE α & β-DIVERSITIES + +CSV_SPECIES_FILE="$COUNTS_DIR/csv/counts_species.csv" +python "$SCRIPT_DIR/diversity.py" -i "$CSV_SPECIES_FILE" -o "$PARENT_DIR/diversity" + echo "All steps completed successfully!" \ No newline at end of file diff --git a/krakenparser/krakenparser.py b/krakenparser/krakenparser.py index 30b68c3..5eb9c71 100755 --- a/krakenparser/krakenparser.py +++ b/krakenparser/krakenparser.py @@ -54,6 +54,11 @@ def main(): action="store_true", help="Calculate relative abundance", ) + parser.add_argument( + "--diversity", + action="store_true", + help="Calculate α & β-diversities", + ) parser.add_argument( "-V", "--version", action="version", version=f"%(prog)s {__version__}" ) @@ -72,6 +77,7 @@ def main(): "process": package_dir / "processing_script.py", "txt2csv": package_dir / "convert2csv.py", "relabund": package_dir / "relabund.py", + "diversity": package_dir / "diversity.py", } if "-h" in sys.argv or "--help" in sys.argv: diff --git a/krakenparser/processing_script.py b/krakenparser/processing_script.py index 1bc42d7..911d306 100755 --- a/krakenparser/processing_script.py +++ b/krakenparser/processing_script.py @@ -35,6 +35,8 @@ def process_files(source_file, destination_file): with open(destination_file, "w") as file: file.write(updated_content) + print(f"Processed {destination_file} successfully.") + # Get the path to the current directory (same location as the script) current_dir = Path(__file__).resolve().parent pycache_dir = current_dir / "__pycache__" diff --git a/krakenparser/version.py b/krakenparser/version.py index 3d18726..906d362 100755 --- a/krakenparser/version.py +++ b/krakenparser/version.py @@ -1 +1 @@ -__version__ = "0.5.0" +__version__ = "0.6.0" From e3a99b90de7cc7d0291b305e5fb0a6745f33f394 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 23 Jun 2025 15:38:30 +0300 Subject: [PATCH 5/5] upd tests --- tests/test_full_pipeline.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_full_pipeline.py b/tests/test_full_pipeline.py index 182a1e7..360c781 100644 --- a/tests/test_full_pipeline.py +++ b/tests/test_full_pipeline.py @@ -42,8 +42,8 @@ def test_full_pipeline_end_to_end(demo_run): assert csv_path.stat().st_size > 0, f"counts_{rank}.csv is empty" # 3. Assert relative‐abundance CSVs exist and are non‐empty - rel_dir = run_dir / "counts" / "csv_relabund" - assert rel_dir.exists(), "csv_relabund directory is missing" - rel_species = rel_dir / "counts_species.csv" - assert rel_species.exists(), "Missing counts_species.csv in csv_relabund" - assert rel_species.stat().st_size > 0, "counts_species.csv is empty" + rel_dir = run_dir / "rel_abund" + assert rel_dir.exists(), "rel_abund directory is missing" + rel_species = rel_dir / "ra_species.csv" + assert rel_species.exists(), "Missing ra_species.csv in csv_relabund" + assert rel_species.stat().st_size > 0, "ra_species.csv is empty"