From 714038533e0a87c578c5fcee17066c75a2a352db Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Mon, 9 Jun 2025 13:22:42 +0300
Subject: [PATCH 1/5] fix typo

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index 43e0a8b..d6461cc 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,6 @@ X9,0,3,2,16,7,1,23,12,10,9,1,2,134,40,390,289,29,372,27,81,150,90,9,88,32,287,88
 
 ```
 Sample_id,taxon,rel_abund_perc
-Sample_id,taxon,rel_abund_perc
 X1,Pseudomonadota,85.03558294577552
 X1,Bacillota,10.72121619814011
 X1,Other (<4.0%),4.243200856084384

From bb95d84e18b031eb45700d71af14b9cfa7262d6b Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Mon, 23 Jun 2025 15:30:30 +0300
Subject: [PATCH 2/5] add diversity

---
 krakenparser/diversity.py | 114 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 krakenparser/diversity.py

diff --git a/krakenparser/diversity.py b/krakenparser/diversity.py
new file mode 100644
index 0000000..2fe68f5
--- /dev/null
+++ b/krakenparser/diversity.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python
+
+import pandas as pd
+import numpy as np
+import sys
+import shutil
+import argparse
+from pathlib import Path
+from skbio.diversity import beta_diversity
+from skbio.stats import subsample_counts
+
+
+# Define Shannon index
+def shannon_index(counts):
+    counts = np.array(counts)
+    counts = counts[counts > 0]
+    proportions = counts / counts.sum()
+    return -np.sum(proportions * np.log(proportions))
+
+
+# Define Pielou's evenness
+def pielou_evenness(counts):
+    counts = np.array(counts)
+    counts = counts[counts > 0]
+    H = shannon_index(counts)
+    S = len(counts)
+    return H / np.log(S) if S > 1 else 0
+
+
+# Define Chao1 richness estimator
+def chao1_index(counts):
+    counts = np.array(counts)
+    S_obs = np.sum(counts > 0)
+    F1 = np.sum(counts == 1)
+    F2 = np.sum(counts == 2)
+    if F2 == 0:
+        return S_obs + F1 * (F1 - 1) / 2
+    return S_obs + (F1 * F1) / (2 * F2)
+
+
+def calc_alpha_div(df, output_path):
+    results = []
+    for sample_id, row in df.iterrows():
+        counts = row.values
+        results.append(
+            {
+                "Sample": sample_id,
+                "Shannon": shannon_index(counts),
+                "Pielou": pielou_evenness(counts),
+                "Chao1": chao1_index(counts),
+            }
+        )
+    alpha_df = pd.DataFrame(results).set_index("Sample")
+    alpha_df.to_csv(output_path / "alpha_div.csv")
+
+
+def calc_beta_div(df, output_path, rarefaction_depth):
+    rarefied_counts = []
+    sample_ids = []
+
+    for sample, row in df.iterrows():
+        counts = row.values.astype(int)
+        if counts.sum() >= rarefaction_depth:
+            rarefied = subsample_counts(counts, n=rarefaction_depth)
+            rarefied_counts.append(rarefied)
+            sample_ids.append(sample)
+
+    if len(rarefied_counts) < 2:
+        raise ValueError("Not enough samples passed the rarefaction threshold.")
+
+    bray_df = beta_diversity(
+        "braycurtis", rarefied_counts, ids=sample_ids
+    ).to_data_frame()
+    jaccard_df = beta_diversity(
+        "jaccard", rarefied_counts, ids=sample_ids
+    ).to_data_frame()
+
+    bray_df.to_csv(output_path / "beta_div_bray.csv")
+    jaccard_df.to_csv(output_path / "beta_div_jaccard.csv")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Calculate α & β-diversities.")
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        help="Input total count table CSV (species level).",
+    )
+    parser.add_argument("-o", "--output", required=True, help="Output directory path.")
+    parser.add_argument(
+        "-d",
+        "--depth",
+        type=int,
+        default=1000,
+        help="Rarefaction depth for β diversity (default: 1000).",
+    )
+    args = parser.parse_args()
+
+    input_file = Path(args.input)
+    output_dir = Path(args.output)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    df = pd.read_csv(input_file, index_col=0)
+
+    calc_alpha_div(df, output_dir)
+    calc_beta_div(df, output_dir, args.depth)
+    print(
+        f"α & β-diversities have been successfully calculated and saved to '{output_dir}'."
+    )
+
+    pycache_dir = Path(__file__).resolve().parent / "__pycache__"
+    if pycache_dir.exists() and pycache_dir.is_dir():
+        shutil.rmtree(pycache_dir)

From 187dfd2cc2bc7ff7c97fc1a4f7463c9bf8c156fd Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Mon, 23 Jun 2025 15:31:21 +0300
Subject: [PATCH 3/5] upd readme

---
 README.md | 104 +++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 84 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index d6461cc..308916b 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ X9,0,3,2,16,7,1,23,12,10,9,1,2,134,40,390,289,29,372,27,81,150,90,9,88,32,287,88
 
 ### Relative abundance output
 
-`counts_phylum.csv` parsed from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
+`ra_phylum.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
 
 ```
 Sample_id,taxon,rel_abund_perc
@@ -58,6 +58,43 @@ X9,Bacillota,12.473649123439218
 X9,Other (<4.0%),1.8979510606688494
 ```
 
+### α-diversity output
+
+`alpha_div.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
+
+```
+Sample,Shannon,Pielou,Chao1
+X1,3.911345447107001,0.5269245043289149,2274.533185840708
+X2,3.9944130792536563,0.4906424221265042,4155.0
+...
+X8,3.442077115880119,0.42753293021330063,4177.251358695652
+X9,4.033664950188261,0.5050385978575492,3492.16
+```
+
+### β-diversity output
+
+`beta_div_bray.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
+
+```
+,X1,X2,...,X8,X9
+X1,0.0,0.398,...,0.61,0.353
+X2,0.398,0.0,...,0.723,0.388
+...
+X8,0.61,0.723,...,0.0,0.665
+X9,0.353,0.388,...,0.665,0.0
+```
+
+`beta_div_jaccard.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
+
+```
+,X1,X2,...,X8,X9
+X1,0.0,0.7073170731707317,...,0.8223938223938224,0.7232472324723247
+X2,0.7073170731707317,0.0,...,0.835016835016835,0.7352941176470589
+...
+X8,0.8223938223938224,0.835016835016835,...,0.0,0.8066914498141264
+X9,0.7232472324723247,0.7352941176470589,...,0.8066914498141264,0.0
+```
+
 ### Visualization examples gallery
 
 |[Stacked Barplot](https://github.com/PopovIILab/KrakenParser/wiki/Stacked-Barplot-API)|[Streamgraph](https://github.com/PopovIILab/KrakenParser/wiki/Streamgraph-API)|
@@ -81,6 +118,7 @@ This will:
 4. Process extracted text files
 5. Convert them into CSV format
 6. Calculate relative abundance
+7. Calculate α & β-diversities
 
 ### **Input Requirements**
 - The Kraken2 reports must be inside a **subdirectory** (e.g., `data/kreports`).
@@ -159,6 +197,22 @@ KrakenParser --relabund -i data/counts/csv/counts_phylum.csv -o data/counts/csv_
 
 This will group all the taxa that have abundance <3.5 into "Other <3.5%" group. Other parameters are welcome!
 
+### **Step 7: Calculate α & β-diversities**
+```bash
+KrakenParser --diversity -i data/counts/csv/counts_species.csv -o data/diversity
+#Having troubles? Run KrakenParser --diversity -h
+```
+
+This calculates α & β-diversities and saves them as CSV format to directory provided in the output.
+
+If user wants to use another depth for β-diversity calculations:
+```bash
+KrakenParser --diversity -i data/counts/csv/counts_species.csv -o data/diversity --depth 750
+#Having troubles? Run KrakenParser --diversity -h
+```
+
+Other parameters are welcome!
+
 ## Arguments Breakdown
 ### **KrakenParser** (Main Pipeline)
 - Automates the entire workflow.
@@ -189,29 +243,39 @@ This will group all the taxa that have abundance <3.5 into "Other <3.5%" group.
 - Calculates relative abundance based on total abundance CSV.
 - Optionally can group low abundant taxa.
 
+### **--diversity** (Step 7)
+- Calculates α & β-diversities based on total species abundance CSV.
+- Shannon, Pielou & Chao1 indices for α-diversity
+- Bray-Curtis & Jaccard indices for β-diversity
+- Uses 1000 depth for β-diversity as default (can be adjusted with -d)
+
 ## Example Output Structure
 After running the full pipeline, the output directory will look like this:
 ```
 data/
-├─ kreports/           # Input Kraken2 reports
-├─ mpa/                # Converted MPA files
-├─ COMBINED.txt        # Merged MPA file
-└─ counts/
-   ├─ txt/             # Extracted taxonomic levels in TXT
-   │  ├─ counts_species.txt
-   │  ├─ counts_genus.txt
-   │  ├─ counts_family.txt
-   │  ├─ ...
-   └─ csv/             # Total abundance CSV output
-   │  ├─ counts_species.csv
-   │  ├─ counts_genus.csv
-   │  ├─ counts_family.csv
-   │  ├─ ...
-   └─ csv_relabund/    # Relative abundance CSV output
-      ├─ counts_species.csv
-      ├─ counts_genus.csv
-      ├─ counts_family.csv
-      ├─ ...
+├─ kreports/               # Input Kraken2 reports
+├─ mpa/                    # Converted MPA files
+├─ COMBINED.txt            # Merged MPA file
+├─ counts/
+│  ├─ txt/                 # Extracted taxonomic levels in TXT
+│  │  ├─ counts_species.txt
+│  │  ├─ counts_genus.txt
+│  │  ├─ counts_family.txt
+│  │  ├─ ...
+│  └─ csv/                 # Total abundance CSV output
+│  │  ├─ counts_species.csv
+│  │  ├─ counts_genus.csv
+│  │  ├─ counts_family.csv
+│  │  ├─ ...
+├─ rel_abund/              # Relative abundance CSV output
+│  ├─ ra_species.csv
+│  ├─ ra_genus.csv
+│  ├─ ra_family.csv
+│  ├─ ...
+└─ diversity/
+   ├─ alpha_div.csv
+   ├─ beta_div_bray.csv
+   └─ beta_div_jaccard.csv
 ```
 
 ## Conclusion

From aeca82878acbba13ea15732ee6fba4e7de132713 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Mon, 23 Jun 2025 15:34:14 +0300
Subject: [PATCH 4/5] add diversity function

---
 krakenparser/decombine.sh         |  1 -
 krakenparser/kraken2csv.sh        | 12 ++++++++++--
 krakenparser/krakenparser.py      |  6 ++++++
 krakenparser/processing_script.py |  2 ++
 krakenparser/version.py           |  2 +-
 5 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/krakenparser/decombine.sh b/krakenparser/decombine.sh
index 6053a8b..0832018 100755
--- a/krakenparser/decombine.sh
+++ b/krakenparser/decombine.sh
@@ -55,7 +55,6 @@ fi
 # Create destination directories
 mkdir -p "${DESTINATION_DIR}/txt"
 mkdir -p "${DESTINATION_DIR}/csv"
-mkdir -p "${DESTINATION_DIR}/csv_relabund"
 
 # Process input file and generate output files
 grep -E "s__" "${SOURCE_FILE}" \
diff --git a/krakenparser/kraken2csv.sh b/krakenparser/kraken2csv.sh
index bb2729d..5b53f75 100755
--- a/krakenparser/kraken2csv.sh
+++ b/krakenparser/kraken2csv.sh
@@ -62,7 +62,6 @@ for file in "$COUNTS_DIR"/txt/counts_*.txt; do
         echo "Error: Failed to process $file"
         exit 1
     fi
-    echo "Processed $file successfully."
 done
 
 # PART 5: CONVERT TXT FILES TO CSV
@@ -74,9 +73,18 @@ done
 
 # PART 6: CALCULATE RELATIVE ABUNDANCE
 
+mkdir -p "$PARENT_DIR/rel_abund"
+
 for file in "$COUNTS_DIR"/csv/counts_*.csv; do
-    CSV_RA_FILE="$COUNTS_DIR/csv_relabund/$(basename "$file")"
+    base=$(basename "$file")
+    new_name="${base/counts_/ra_}"
+    CSV_RA_FILE="$PARENT_DIR/rel_abund/$new_name"
     python "$SCRIPT_DIR/relabund.py" -i "$file" -o "$CSV_RA_FILE"
 done
 
+# PART 7: CALCULATE α & β-DIVERSITIES
+
+CSV_SPECIES_FILE="$COUNTS_DIR/csv/counts_species.csv"
+python "$SCRIPT_DIR/diversity.py" -i "$CSV_SPECIES_FILE" -o "$PARENT_DIR/diversity"
+
 echo "All steps completed successfully!"
\ No newline at end of file
diff --git a/krakenparser/krakenparser.py b/krakenparser/krakenparser.py
index 30b68c3..5eb9c71 100755
--- a/krakenparser/krakenparser.py
+++ b/krakenparser/krakenparser.py
@@ -54,6 +54,11 @@ def main():
         action="store_true",
         help="Calculate relative abundance",
     )
+    parser.add_argument(
+        "--diversity",
+        action="store_true",
+        help="Calculate α & β-diversities",
+    )
     parser.add_argument(
         "-V", "--version", action="version", version=f"%(prog)s {__version__}"
     )
@@ -72,6 +77,7 @@ def main():
         "process": package_dir / "processing_script.py",
         "txt2csv": package_dir / "convert2csv.py",
         "relabund": package_dir / "relabund.py",
+        "diversity": package_dir / "diversity.py",
     }
 
     if "-h" in sys.argv or "--help" in sys.argv:
diff --git a/krakenparser/processing_script.py b/krakenparser/processing_script.py
index 1bc42d7..911d306 100755
--- a/krakenparser/processing_script.py
+++ b/krakenparser/processing_script.py
@@ -35,6 +35,8 @@ def process_files(source_file, destination_file):
     with open(destination_file, "w") as file:
         file.write(updated_content)
 
+    print(f"Processed {destination_file} successfully.")
+
     # Get the path to the current directory (same location as the script)
     current_dir = Path(__file__).resolve().parent
     pycache_dir = current_dir / "__pycache__"
diff --git a/krakenparser/version.py b/krakenparser/version.py
index 3d18726..906d362 100755
--- a/krakenparser/version.py
+++ b/krakenparser/version.py
@@ -1 +1 @@
-__version__ = "0.5.0"
+__version__ = "0.6.0"

From e3a99b90de7cc7d0291b305e5fb0a6745f33f394 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Mon, 23 Jun 2025 15:38:30 +0300
Subject: [PATCH 5/5] upd tests

---
 tests/test_full_pipeline.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_full_pipeline.py b/tests/test_full_pipeline.py
index 182a1e7..360c781 100644
--- a/tests/test_full_pipeline.py
+++ b/tests/test_full_pipeline.py
@@ -42,8 +42,8 @@ def test_full_pipeline_end_to_end(demo_run):
         assert csv_path.stat().st_size > 0, f"counts_{rank}.csv is empty"
 
     # 3. Assert relative‐abundance CSVs exist and are non‐empty
-    rel_dir = run_dir / "counts" / "csv_relabund"
-    assert rel_dir.exists(), "csv_relabund directory is missing"
-    rel_species = rel_dir / "counts_species.csv"
-    assert rel_species.exists(), "Missing counts_species.csv in csv_relabund"
-    assert rel_species.stat().st_size > 0, "counts_species.csv is empty"
+    rel_dir = run_dir / "rel_abund"
+    assert rel_dir.exists(), "rel_abund directory is missing"
+    rel_species = rel_dir / "ra_species.csv"
+    assert rel_species.exists(), "Missing ra_species.csv in csv_relabund"
+    assert rel_species.stat().st_size > 0, "ra_species.csv is empty"