3 changes: 2 additions & 1 deletion .gitignore
@@ -20,4 +20,5 @@ scaler.pkl
instance/
shit/
*.db
.vscode/
.vscode/
*.pkl
127 changes: 106 additions & 21 deletions app.py
@@ -1,34 +1,119 @@
from flask import render_template,request,jsonify,Flask
import numpy as np
import os
import joblib
import pandas as pd
import numpy as np
from flask import Flask, render_template, request, jsonify
from fit import main

# --- Configuration ---
MODEL_DIR = "models"
PIPE_PATH = os.path.join(MODEL_DIR, "pipe.pkl")
COLUMNS_PATH = os.path.join(MODEL_DIR, "column_names.pkl")
reverse_mapping = {0: "FALSE POSITIVE", 1: "CANDIDATE", 2: "CONFIRMED"}

# --- Self-Heal Function ---
def initialize_artifacts():
"""
Checks if model artifacts exist. If not, runs the training script.
"""
# 1. Ensure the model directory exists
os.makedirs(MODEL_DIR, exist_ok=True)

# 2. Check for missing files
pipe_exists = os.path.exists(PIPE_PATH)
columns_exists = os.path.exists(COLUMNS_PATH)

if not pipe_exists or not columns_exists:
print("--- MODEL ARTIFACTS MISSING ---")
if not pipe_exists:
print(f"Missing: {PIPE_PATH}")
if not columns_exists:
print(f"Missing: {COLUMNS_PATH}")

print("Running training routine (fit.main())... This may take a moment.")
try:
# Run the main training function from fit.py
main()
print("Training complete. Artifacts generated successfully.")
print("---------------------------------")
except Exception as e:
print(f"\nFATAL: Error during self-heal training: {e}")
print("Application cannot start without model artifacts.")
print("Please fix the training script (fit.py) and restart.")
exit(1) # Exit if training fails
else:
print("Model artifacts found. Loading...")

model = joblib.load("model.pkl")
scaler = joblib.load("scaler.pkl")
# --- Application Startup ---

reverse_mapping = {0:"FALSE POSITIVE",1:"CANDIDATE",2:"CONFIRMED"}
# Run the self-heal check *before* loading models
initialize_artifacts()

# Load models
try:
pipe = joblib.load(PIPE_PATH)
column_names = joblib.load(COLUMNS_PATH)
print("Models loaded successfully.")
except Exception as e:
print(f"\nFATAL: Error loading model artifacts: {e}")
print("Files might be corrupt. Try deleting the 'models' directory and restarting.")
exit(1) # Exit if loading fails

# Initialize Flask App
app = Flask(__name__)

@app.route("/")
def home():
return render_template("index.html")

@app.route("/predict",methods=["POST"])
def predict():
try:
data = request.json["features"]
arr = np.array(data).reshape(1,-1)
arr_scaled = scaler.transform(arr)
pred = model.predict(arr_scaled)[0]
proba_pred = model.predict_proba(arr_scaled)[0]
proba_dict = {reverse_mapping[i]: round(p,3) for i,p in enumerate(proba_pred)}
return jsonify({"prediction":reverse_mapping[pred],"probabilities":proba_dict})
except Exception as e:
return jsonify({"error":e})
return render_template("index.html")

@app.route("/about")
def about():
return render_template("about.html")
return render_template("about.html")

@app.route("/predict", methods=["POST"])
def predict():
try:
# Extract features from the JSON request
raw_features = [
request.json["orbital-period"],
request.json["transit-epoch"],
request.json["transit-depth"],
request.json["planet-radius"],
request.json["semi-major-axis"],
request.json["inclination"],
request.json["equilibrium-temp"],
request.json["insolation-flux"],
request.json["impact-parameter"],
request.json["radius-ratio"],
request.json["stellar-density"],
request.json["star-distance"],
request.json["num-transits"],
]

# Create DataFrame with correct column names
df = pd.DataFrame([raw_features], columns=column_names)

# Get prediction and probabilities
pred = int(pipe.predict(df)[0])
proba = pipe.predict_proba(df)[0]

# Format probabilities for the response
proba_dict = {
reverse_mapping[i]: round(p, 3) for i, p in enumerate(proba)
}

# Send response
return jsonify(
{"prediction": reverse_mapping[pred], "probabilities": proba_dict}
)

except KeyError as e:
print(f"Prediction Error: Missing key in request {e}")
return jsonify({"error": f"Missing feature in request: {e}"}), 400
except Exception as e:
print(f"Prediction Error: {e}")
return jsonify({"error": str(e)}), 400


if __name__ == "__main__":
app.run(debug=True)
app.run(debug=True)
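Note: the reworked /predict route reads one JSON key per feature instead of the old single "features" array. Below is a minimal client-side sketch of how the endpoint could be exercised, assuming the app is running locally on Flask's default port; the 13 field values are made-up placeholders, not real Kepler measurements.

import requests

# Hypothetical payload; the keys mirror the ones read in the new /predict route,
# the numeric values are placeholders for illustration only.
payload = {
    "orbital-period": 9.48,
    "transit-epoch": 170.53,
    "transit-depth": 615.8,
    "planet-radius": 2.26,
    "semi-major-axis": 0.085,
    "inclination": 89.66,
    "equilibrium-temp": 793.0,
    "insolation-flux": 93.6,
    "impact-parameter": 0.15,
    "radius-ratio": 0.022,
    "stellar-density": 3.2,
    "star-distance": 24.8,
    "num-transits": 142,
}

resp = requests.post("http://127.0.0.1:5000/predict", json=payload)
print(resp.status_code, resp.json())
# Expected response shape:
# {"prediction": "...", "probabilities": {"FALSE POSITIVE": ..., "CANDIDATE": ..., "CONFIRMED": ...}}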
159 changes: 159 additions & 0 deletions fit.py
@@ -0,0 +1,159 @@
import time
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report


def get_window(camps, campaign_dates):
if pd.isna(camps) or not camps:
return np.nan, np.nan

camps = str(camps).split(',') if isinstance(camps, str) else camps
starts, ends = [], []

for c in camps:
try:
camp_num = int(c.strip())
if camp_num in campaign_dates:
start, end = campaign_dates[camp_num]
starts.append(start)
ends.append(end)
except (ValueError, KeyError):
continue

return (min(starts) if starts else np.nan, max(ends) if ends else np.nan)


def load_and_prepare_data():
# Load Kepler dataset
df_raw = pd.read_csv("data/kepler_data.csv", comment="#")
feature_list = [
"koi_disposition", "koi_period", "koi_time0bk", "koi_depth", "koi_prad",
"koi_sma", "koi_incl", "koi_teq", "koi_insol", "koi_impact",
"koi_ror", "koi_srho", "koi_dor", "koi_num_transits"
]
df_1 = df_raw[feature_list].copy()

# Load K2 dataset
df_2 = pd.read_csv("data/k2_data.csv", comment="#")

# Define campaign windows
campaign_dates = {
0: (2456725.0, 2456805.0), 1: (2456808.0, 2456891.0), 2: (2456893.0, 2456975.0),
3: (2456976.0, 2457064.0), 4: (2457065.0, 2457159.0), 5: (2457159.0, 2457246.0),
6: (2457250.0, 2457338.0), 7: (2457339.0, 2457420.0), 8: (2457421.0, 2457530.0),
9: (2457504.0, 2457579.0), 10: (2457577.0, 2457653.0), 11: (2457657.0, 2457732.0),
12: (2457731.0, 2457819.0), 13: (2457820.0, 2457900.0), 14: (2457898.0, 2457942.0),
15: (2457941.0, 2458022.0), 16: (2458020.0, 2458074.0), 17: (2458074.0, 2458176.0),
18: (2458151.0, 2458201.0), 19: (2458232.0, 2458348.0)
}

# Add observation window
df_2['campaigns'] = df_2['k2_campaigns']
df_2[['obs_start_bjd', 'obs_end_bjd']] = df_2['campaigns'].apply(
lambda x: pd.Series(get_window(x, campaign_dates))
)

# Transit counting
df_2['n_min'] = np.ceil((df_2['obs_start_bjd'] - df_2['pl_tranmid']) / df_2['pl_orbper'])
df_2['n_max'] = np.floor((df_2['obs_end_bjd'] - df_2['pl_tranmid']) / df_2['pl_orbper'])
df_2['num_transits'] = (df_2['n_max'] - df_2['n_min'] + 1).clip(lower=0)

# Select and rename columns
df_2 = df_2[
["disposition", "pl_orbper", "pl_tranmid", "pl_trandep", "pl_rade",
"pl_orbsmax", "pl_orbincl", "pl_eqt", "pl_insol", "pl_imppar",
"pl_ratror", "pl_dens", "pl_ratdor", "num_transits"]
]

mapping = {
"disposition": "koi_disposition", "pl_orbper": "koi_period", "pl_tranmid": "koi_time0bk",
"pl_trandep": "koi_depth", "pl_rade": "koi_prad", "pl_orbsmax": "koi_sma",
"pl_orbincl": "koi_incl", "pl_eqt": "koi_teq", "pl_insol": "koi_insol",
"pl_imppar": "koi_impact", "pl_ratror": "koi_ror", "pl_dens": "koi_srho",
"pl_ratdor": "koi_dor", "num_transits": "koi_num_transits"
}
df_2 = df_2.rename(columns=mapping)

# Combine both datasets
df = pd.concat([df_1, df_2])

# Prepare input/output
X = df.iloc[:, 1:].to_numpy()
y = df["koi_disposition"].map({
"FALSE POSITIVE": 0, "CANDIDATE": 1, "CONFIRMED": 2, "REFUTED": 0
}).to_numpy()

return X, y, df.columns[1:]


def build_pipeline():
rf = RandomForestClassifier(
n_estimators=1000, max_depth=None, random_state=542, class_weight="balanced"
)
xgb = XGBClassifier(
n_estimators=1000, max_depth=None, learning_rate=0.5, random_state=9
)
estimators = [("rf", rf), ("xgb", xgb)]

final_estimator = LogisticRegression(
random_state=891, class_weight="balanced", C=0.1,
penalty="l2", solver="saga", max_iter=5000
)

mv = StackingClassifier(
estimators=estimators, final_estimator=final_estimator,
cv=5, passthrough=True, n_jobs=-1
)

pipe = Pipeline([
("impute", SimpleImputer(strategy="mean")),
("scale", StandardScaler()),
("smote", SMOTE()),
("model", mv)
])
return pipe

def eval(y_test, x_test, estimator):
y_true = y_test
y_pred = estimator.predict(x_test)
return classification_report(y_true, y_pred)

def main():
X, y, column_name = load_and_prepare_data()

x_train, x_test, y_train, y_test = train_test_split(
X, y, test_size=1/3, shuffle=True, random_state=91, stratify=y
)

pipe_mv = build_pipeline()

print("Starting model training. It will take some time, sit tight......")
t1 = time.time()
pipe_mv.fit(x_train, y_train)
t2 = time.time()

print("Model trained successfully")
minutes, seconds = np.divmod(t2 - t1, 60)
print(f"Time Elapsed: {minutes:.0f} M {seconds:.2f} S")


print(eval(y_test, x_test, pipe_mv))

joblib.dump(pipe_mv, "models/pipe.pkl")
joblib.dump(column_name, "models/column_names.pkl")
print("Model and column names saved successfully.")


if __name__ == "__main__":
main()
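Note: the num_transits feature in load_and_prepare_data is derived entirely from the observation window and the ephemeris: the first and last transit indices inside the window are ceil((obs_start - tranmid) / period) and floor((obs_end - tranmid) / period), and their difference plus one (clipped at zero) is the transit count. Below is a small standalone sketch of that arithmetic with made-up numbers, reusing get_window from fit.py (assumes fit.py's dependencies are installed).

import numpy as np
from fit import get_window

# Subset of the campaign windows defined in load_and_prepare_data (BJD start/end).
campaign_dates = {5: (2457159.0, 2457246.0), 6: (2457250.0, 2457338.0)}

# A target observed in campaigns 5 and 6 gets the earliest start and the latest end.
obs_start, obs_end = get_window("5, 6", campaign_dates)
print(obs_start, obs_end)  # 2457159.0 2457338.0

# Made-up ephemeris: transit midpoint (BJD) and orbital period (days).
tranmid, period = 2457100.0, 12.5

# Same counting rule as the n_min / n_max / num_transits columns in fit.py.
n_min = np.ceil((obs_start - tranmid) / period)   # first transit index inside the window
n_max = np.floor((obs_end - tranmid) / period)    # last transit index inside the window
num_transits = max(n_max - n_min + 1, 0)
print(num_transits)  # 15.0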