GW-HIVE · BVishal-Geek · Jan 15, 2026
diff --git a/flask_backend/models/BreastCancer_SVM_v1/BCO.json b/flask_backend/models/BreastCancer_SVM_v1/BCO.json
@@ -0,0 +1,199 @@
+{
+    "object_id": "https://biocomputeobject.org/HIVE_000006/DRAFT",
+    "spec_version": "https://w3id.org/ieee/ieee-2791-schema/2791object.json",
+    "etag": "",
+    "provenance_domain": {
+        "name": "BreastCancer_SVM",
+        "version": "1.0",
+        "license": "https://spdx.org/licenses/CC-BY-4.0.html",
+        "created": "2025-12-15T19:01:57",
+        "modified": "2025-12-15T19:42:32",
+        "derived_from": "",
+        "contributors": [
+            {
+                "name": "Vishal Bakshi",
+                "affiliation": "George Washington University",
+                "email": "vishal.bakshi@gwu.edu",
+                "contribution": [
+                    "createdBy"
+                ],
+                "orcid": ""
+            },
+            {
+                "name": "Patrick McNeely",
+                "affiliation": "George Washington University",
+                "email": "pmcneely@email.gwu.edu",
+                "contribution": [
+                    "createdBy"
+                ],
+                "orcid": ""
+            },
+            {
+                "name": "Lori Krammer",
+                "affiliation": "George Washington University",
+                "email": "lorikrammer@email.gwu.edu",
+                "contribution": [
+                    "createdBy"
+                ],
+                "orcid": ""
+            }
+        ],
+        "review": []
+    },
+    "usability_domain": [
+        "This pipeline is intended to support biomedical researchers, data scientists, and clinical informaticians interested in estimating the percentage of a breast cancer patient's single-cell RNA-seq observations that are predicted to respond to chemotherapy or to combination therapy (anti-PD-L1 plus chemotherapy), using pre-trained SVM models."
+    ],
+    "description_domain": {
+        "keywords": [
+            "python",
+            "machine learning",
+            "single-cell RNA-seq"
+        ],
+        "xref": [],
+        "platform": [
+            "VS Code"
+        ],
+        "pipeline_steps": [
+            {
+                "step_number": 1,
+                "name": "model inference",
+                "description": "Run inference on the new participant's single-cell RNA-seq data using a pre-trained SVM model.",
+                "version": "1.0",
+                "input_list": [
+                    {
+                        "filename": "chemo_model.pkl",
+                        "access_time": "2025-12-15T04:00:00.000Z",
+                        "uri": "https://github.com/GW-HIVE/PredictMod/blob/main/flask_backend/models/BreastCancer_SVM_v1/chemo_model.pkl",
+                        "sha1_checksum": ""
+                    },
+                    {
+                        "filename": "combo_model.pkl",
+                        "access_time": "2025-12-15T04:00:00.000Z",
+                        "uri": "https://github.com/GW-HIVE/PredictMod/blob/main/flask_backend/models/BreastCancer_SVM_v1/combo_model.pkl",
+                        "sha1_checksum": ""
+                    },
+                    {
+                        "filename": "test_script.py",
+                        "access_time": "2025-12-15T04:00:00.000Z",
+                        "uri": "https://github.com/GW-HIVE/PredictMod/blob/main/flask_backend/models/BreastCancer_SVM_v1/test_script.py",
+                        "sha1_checksum": ""
+                    },
+                    {
+                        "filename": "testset_P020.csv",
+                        "access_time": "2025-12-15T04:00:00.000Z",
+                        "uri": "https://github.com/GW-HIVE/PredictMod/blob/main/flask_backend/models/BreastCancer_SVM_v1/testset_P020.csv",
+                        "sha1_checksum": ""
+                    }
+                ],
+                "output_file": [
+                    {
+                        "filename": "NA",
+                        "access_time": "2025-12-15T04:00:00.000Z",
+                        "uri": "NA",
+                        "sha1_checksum": ""
+                    }
+                ]
+            }
+        ]
+    },
+    "parametric_domain": [],
+    "io_domain": {
+        "input_subdomain": [
+            {
+                "uri": {
+                    "filename": "chemo_model.pkl",
+                    "uri": "https://github.com/GW-HIVE/PredictMod/blob/main/flask_backend/models/BreastCancer_SVM_v1/chemo_model.pkl",
+                    "access_time": "2025-12-15T04:00:00.000Z"
+                }
+            },
+            {
+                "uri": {
+                    "filename": "combo_model.pkl",
+                    "uri": "https://github.com/GW-HIVE/PredictMod/blob/main/flask_backend/models/BreastCancer_SVM_v1/combo_model.pkl",
+                    "access_time": "2025-12-15T04:00:00.000Z"
+                }
+            },
+            {
+                "uri": {
+                    "filename": "test_script.py",
+                    "uri": "https://github.com/GW-HIVE/PredictMod/blob/main/flask_backend/models/BreastCancer_SVM_v1/test_script.py",
+                    "access_time": "2025-12-15T04:00:00.000Z"
+                }
+            },
+            {
+                "uri": {
+                    "filename": "testset_P020.csv",
+                    "uri": "https://github.com/GW-HIVE/PredictMod/blob/main/flask_backend/models/BreastCancer_SVM_v1/testset_P020.csv",
+                    "access_time": "2025-12-15T04:00:00.000Z"
+                }
+            },
+            {
+                "uri": {
+                    "filename": "testset_P025.csv",
+                    "uri": "https://github.com/GW-HIVE/PredictMod/blob/main/flask_backend/models/BreastCancer_SVM_v1/testset_P025.csv",
+                    "access_time": "2025-12-15T04:00:00.000Z"
+                }
+            }
+        ],
+        "output_subdomain": [
+            {
+                "mediatype": "text/plain",
+                "uri": {
+                    "filename": "NA",
+                    "uri": "NA",
+                    "access_time": "2025-12-15T04:00:00.000Z"
+                }
+            }
+        ]
+    },
+    "execution_domain": {
+        "script": [],
+        "script_driver": "",
+        "software_prerequisites": [
+            {
+                "name": "Python",
+                "version": "3.11.13",
+                "uri": {
+                    "filename": "",
+                    "uri": "https://www.python.org/",
+                    "access_time": "2025-12-15T04:00:00.000Z",
+                    "sha1_checksum": ""
+                }
+            },
+            {
+                "name": "pandas",
+                "version": "2.2.2",
+                "uri": {
+                    "filename": "",
+                    "uri": "https://pandas.pydata.org/",
+                    "access_time": "2025-12-15T04:00:00.000Z",
+                    "sha1_checksum": ""
+                }
+            },
+            {
+                "name": "numpy",
+                "version": "2.0.2",
+                "uri": {
+                    "filename": "",
+                    "uri": "https://numpy.org/",
+                    "access_time": "2025-12-15T04:00:00.000Z",
+                    "sha1_checksum": ""
+                }
+            },
+            {
+                "name": "scikit-learn",
+                "version": "2.4.2",
+                "uri": {
+                    "filename": "",
+                    "uri": "https://scikit-learn.org/",
+                    "access_time": "2025-12-15T04:00:00.000Z",
+                    "sha1_checksum": ""
+                }
+            }
+        ],
+        "external_data_endpoints": [],
+        "environment_variables": {}
+    },
+    "extension_domain": [],
+    "error_domain": {}
+}
diff --git a/flask_backend/models/BreastCancer_SVM_v1/README.md b/flask_backend/models/BreastCancer_SVM_v1/README.md
@@ -0,0 +1,118 @@
+# 🧬 Breast Cancer Treatment Response Prediction  
+### Single-Cell RNA-seq–based Machine Learning Pipeline
+
+---
+
+## 📌 Project Overview
+
+Triple-Negative Breast Cancer (TNBC) patients typically receive either **Chemotherapy** or **Anti–PD-L1 + Chemotherapy**. However, only a subset of patients respond effectively to each treatment.
+
+This project builds **two machine learning models** using **pre-treatment single-cell RNA-seq data** to:
+
+- Predict response likelihood for **Chemotherapy**
+- Predict response likelihood for **Anti–PD-L1 + Chemotherapy**
+- Aggregate **cell-level predictions** into a **patient-level response estimate**
+
+The system is designed for **real-world inference**, where each patient may contribute **multiple single-cell observations**.
+
+---
+
+## 🎯 Objective
+
+- Use **pre-treatment immune cell features** to predict treatment response
+- Build **separate, treatment-specific models**
+- Prevent **data leakage** by respecting patient-level grouping
+- Provide a **deployable and interpretable inference pipeline**
+
+---
+
+## 📊 Data Source
+
+**Primary Publication**  
+*Single-cell analyses reveal key immune cell subsets associated with response to PD-L1 blockade in triple-negative breast cancer*  
+**PMID:** 33589889  
+**GEO Accession:** GSE169246
+
+**Data Characteristics**
+- Single-cell RNA sequencing (scRNA-seq)
+- Pre-treatment immune cells
+- Multiple cells per patient
+- Cell-level annotations linked to patient response
+
+---
+
+## 🧪 Dataset Structure
+
+Each row corresponds to **one immune cell**.
+
+| Column | Description |
+|------|-------------|
+| Expression | Aggregate gene expression score |
+| nUMI | Total RNA molecules per cell |
+| nGene | Number of detected genes |
+| percent_mito | Mitochondrial gene fraction (cell stress) |
+| percent_hsp | Heat shock protein expression |
+| percent_ig | Immunoglobulin expression |
+| percent_rp | Ribosomal protein expression |
+| PDCD1 | PD-1 gene expression |
+| Origin | Tissue source |
+| Response | Clinical response label |
+| Patient_code | Unique patient identifier |
+| Timeline | Pre-treatment / Post-treatment |
+
+---
+
+## 🧠 Modeling Strategy
+
+### Why two models?
+
+The two treatment-specific models were trained on cells from **different tissue distributions**:
+
+- **Chemo model**
+  - `breast`, `liver`
+- **Anti–PD-L1 + Chemo model**
+  - `chest_wall`, `liver`, `lymph_node`
+
+Training a single model would introduce **distribution shift**, so treatment-specific models are used.
+
+---
+
+## 🤖 Models Used
+
+- **Support Vector Machine (SVM)**
+  - RBF kernel
+  - Class-weight balanced
+  - StandardScaler included in pipeline
+
+**Why SVM?**
+- Suitable for small datasets
+- Handles non-linear decision boundaries
+- Stable under limited samples
+
+---
+
+## 🔍 Prediction Logic
+
+1. User uploads a CSV containing **single-cell observations**
+2. Data is filtered to **pre-treatment** rows
+3. Tissue origin is **one-hot encoded**
+4. Each model:
+   - Receives only the features it was trained on
+   - Produces **cell-level predictions**
+5. Final response score (cell-level predictions aggregated per patient):
+
+    `percentage = (Number of cells predicted as responders / Total number of cells) * 100`
+
+---
+
+## Train the model
+
+`python chemo_model_training.py --input "path/to/your/data.csv"`
+
+`python combo_model_training.py --input "path/to/your/data.csv"`
+
+## Run the model
+
+`python test_script.py --input "path/to/your/testdata.csv" --model "chemo_model.pkl"`
+
+`python test_script.py --input "path/to/your/testdata.csv" --model "combo_model.pkl"`
diff --git a/flask_backend/models/BreastCancer_SVM_v1/chemo_model.pkl b/flask_backend/models/BreastCancer_SVM_v1/chemo_model.pkl