3 changes: 2 additions & 1 deletion .gitignore
@@ -20,4 +20,5 @@ scaler.pkl
instance/
shit/
*.db
.vscode/
.vscode/
*.pkl
127 changes: 106 additions & 21 deletions app.py
@@ -1,34 +1,119 @@
from flask import render_template,request,jsonify,Flask
import numpy as np
import os
import joblib
import pandas as pd
import numpy as np
from flask import Flask, render_template, request, jsonify
from fit import main

# --- Configuration ---
MODEL_DIR = "models"
PIPE_PATH = os.path.join(MODEL_DIR, "pipe.pkl")
COLUMNS_PATH = os.path.join(MODEL_DIR, "column_names.pkl")
reverse_mapping = {0: "FALSE POSITIVE", 1: "CANDIDATE", 2: "CONFIRMED"}

# --- Self-Heal Function ---
def initialize_artifacts():
"""
Checks if model artifacts exist. If not, runs the training script.
"""
# 1. Ensure the model directory exists
os.makedirs(MODEL_DIR, exist_ok=True)

# 2. Check for missing files
pipe_exists = os.path.exists(PIPE_PATH)
columns_exists = os.path.exists(COLUMNS_PATH)

if not pipe_exists or not columns_exists:
print("--- MODEL ARTIFACTS MISSING ---")
if not pipe_exists:
print(f"Missing: {PIPE_PATH}")
if not columns_exists:
print(f"Missing: {COLUMNS_PATH}")

print("Running training routine (fit.main())... This may take a moment.")
try:
# Run the main training function from fit.py
main()
print("Training complete. Artifacts generated successfully.")
print("---------------------------------")
except Exception as e:
print(f"\nFATAL: Error during self-heal training: {e}")
print("Application cannot start without model artifacts.")
print("Please fix the training script (fit.py) and restart.")
exit(1) # Exit if training fails
else:
print("Model artifacts found. Loading...")

model = joblib.load("model.pkl")
scaler = joblib.load("scaler.pkl")
# --- Application Startup ---

reverse_mapping = {0:"FALSE POSITIVE",1:"CANDIDATE",2:"CONFIRMED"}
# Run the self-heal check *before* loading models
initialize_artifacts()

# Load models
try:
pipe = joblib.load(PIPE_PATH)
column_names = joblib.load(COLUMNS_PATH)
print("Models loaded successfully.")
except Exception as e:
print(f"\nFATAL: Error loading model artifacts: {e}")
print("Files might be corrupt. Try deleting the 'models' directory and restarting.")
exit(1) # Exit if loading fails

# Initialize Flask App
app = Flask(__name__)

@app.route("/")
def home():
return render_template("index.html")

@app.route("/predict",methods=["POST"])
def predict():
try:
data = request.json["features"]
arr = np.array(data).reshape(1,-1)
arr_scaled = scaler.transform(arr)
pred = model.predict(arr_scaled)[0]
proba_pred = model.predict_proba(arr_scaled)[0]
proba_dict = {reverse_mapping[i]: round(p,3) for i,p in enumerate(proba_pred)}
return jsonify({"prediction":reverse_mapping[pred],"probabilities":proba_dict})
except Exception as e:
return jsonify({"error":e})
return render_template("index.html")

@app.route("/about")
def about():
return render_template("about.html")
return render_template("about.html")

@app.route("/predict", methods=["POST"])
def predict():
try:
# Extract features from the JSON request
raw_features = [
request.json["orbital-period"],
request.json["transit-epoch"],
request.json["transit-depth"],
request.json["planet-radius"],
request.json["semi-major-axis"],
request.json["inclination"],
request.json["equilibrium-temp"],
request.json["insolation-flux"],
request.json["impact-parameter"],
request.json["radius-ratio"],
request.json["stellar-density"],
request.json["star-distance"],
request.json["num-transits"],
]

# Create DataFrame with correct column names
df = pd.DataFrame([raw_features], columns=column_names)

# Get prediction and probabilities
pred = int(pipe.predict(df)[0])
proba = pipe.predict_proba(df)[0]

# Format probabilities for the response
proba_dict = {
reverse_mapping[i]: round(p, 3) for i, p in enumerate(proba)
}

# Send response
return jsonify(
{"prediction": reverse_mapping[pred], "probabilities": proba_dict}
)

except KeyError as e:
print(f"Prediction Error: Missing key in request {e}")
return jsonify({"error": f"Missing feature in request: {e}"}), 400
except Exception as e:
print(f"Prediction Error: {e}")
return jsonify({"error": str(e)}), 400


if __name__ == "__main__":
app.run(debug=True)
app.run(debug=True)
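Note: the reworked /predict route reads one JSON key per feature instead of the old single "features" array. Below is a minimal client-side sketch of how the endpoint could be exercised, assuming the app is running locally on Flask's default port; the 13 field values are made-up placeholders, not real Kepler measurements.

import requests

# Hypothetical payload; the keys mirror the ones read in the new /predict route,
# the numeric values are placeholders for illustration only.
payload = {
    "orbital-period": 9.48,
    "transit-epoch": 170.53,
    "transit-depth": 615.8,
    "planet-radius": 2.26,
    "semi-major-axis": 0.085,
    "inclination": 89.66,
    "equilibrium-temp": 793.0,
    "insolation-flux": 93.6,
    "impact-parameter": 0.15,
    "radius-ratio": 0.022,
    "stellar-density": 3.2,
    "star-distance": 24.8,
    "num-transits": 142,
}

resp = requests.post("http://127.0.0.1:5000/predict", json=payload)
print(resp.status_code, resp.json())
# Expected response shape:
# {"prediction": "...", "probabilities": {"FALSE POSITIVE": ..., "CANDIDATE": ..., "CONFIRMED": ...}}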
159 changes: 159 additions & 0 deletions fit.py
@@ -0,0 +1,159 @@
import time
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report


def get_window(camps, campaign_dates):
if pd.isna(camps) or not camps:
return np.nan, np.nan

camps = str(camps).split(',') if isinstance(camps, str) else camps
starts, ends = [], []

for c in camps:
try:
camp_num = int(c.strip())
if camp_num in campaign_dates:
start, end = campaign_dates[camp_num]
starts.append(start)
ends.append(end)
except (ValueError, KeyError):
continue

return (min(starts) if starts else np.nan, max(ends) if ends else np.nan)


def load_and_prepare_data():
# Load Kepler dataset
df_raw = pd.read_csv("data/kepler_data.csv", comment="#")
feature_list = [
"koi_disposition", "koi_period", "koi_time0bk", "koi_depth", "koi_prad",
"koi_sma", "koi_incl", "koi_teq", "koi_insol", "koi_impact",
"koi_ror", "koi_srho", "koi_dor", "koi_num_transits"
]
df_1 = df_raw[feature_list].copy()

# Load K2 dataset
df_2 = pd.read_csv("data/k2_data.csv", comment="#")

# Define campaign windows
campaign_dates = {
0: (2456725.0, 2456805.0), 1: (2456808.0, 2456891.0), 2: (2456893.0, 2456975.0),
3: (2456976.0, 2457064.0), 4: (2457065.0, 2457159.0), 5: (2457159.0, 2457246.0),
6: (2457250.0, 2457338.0), 7: (2457339.0, 2457420.0), 8: (2457421.0, 2457530.0),
9: (2457504.0, 2457579.0), 10: (2457577.0, 2457653.0), 11: (2457657.0, 2457732.0),
12: (2457731.0, 2457819.0), 13: (2457820.0, 2457900.0), 14: (2457898.0, 2457942.0),
15: (2457941.0, 2458022.0), 16: (2458020.0, 2458074.0), 17: (2458074.0, 2458176.0),
18: (2458151.0, 2458201.0), 19: (2458232.0, 2458348.0)
}

# Add observation window
df_2['campaigns'] = df_2['k2_campaigns']
df_2[['obs_start_bjd', 'obs_end_bjd']] = df_2['campaigns'].apply(
lambda x: pd.Series(get_window(x, campaign_dates))
)

# Transit counting
df_2['n_min'] = np.ceil((df_2['obs_start_bjd'] - df_2['pl_tranmid']) / df_2['pl_orbper'])
df_2['n_max'] = np.floor((df_2['obs_end_bjd'] - df_2['pl_tranmid']) / df_2['pl_orbper'])
df_2['num_transits'] = (df_2['n_max'] - df_2['n_min'] + 1).clip(lower=0)

# Select and rename columns
df_2 = df_2[
["disposition", "pl_orbper", "pl_tranmid", "pl_trandep", "pl_rade",
"pl_orbsmax", "pl_orbincl", "pl_eqt", "pl_insol", "pl_imppar",
"pl_ratror", "pl_dens", "pl_ratdor", "num_transits"]
]

mapping = {
"disposition": "koi_disposition", "pl_orbper": "koi_period", "pl_tranmid": "koi_time0bk",
"pl_trandep": "koi_depth", "pl_rade": "koi_prad", "pl_orbsmax": "koi_sma",
"pl_orbincl": "koi_incl", "pl_eqt": "koi_teq", "pl_insol": "koi_insol",
"pl_imppar": "koi_impact", "pl_ratror": "koi_ror", "pl_dens": "koi_srho",
"pl_ratdor": "koi_dor", "num_transits": "koi_num_transits"
}
df_2 = df_2.rename(columns=mapping)

# Combine both datasets
df = pd.concat([df_1, df_2])

# Prepare input/output
X = df.iloc[:, 1:].to_numpy()
y = df["koi_disposition"].map({
"FALSE POSITIVE": 0, "CANDIDATE": 1, "CONFIRMED": 2, "REFUTED": 0
}).to_numpy()

return X, y, df.columns[1:]


def build_pipeline():
rf = RandomForestClassifier(
n_estimators=1000, max_depth=None, random_state=542, class_weight="balanced"
)
xgb = XGBClassifier(
n_estimators=1000, max_depth=None, learning_rate=0.5, random_state=9
)
estimators = [("rf", rf), ("xgb", xgb)]

final_estimator = LogisticRegression(
random_state=891, class_weight="balanced", C=0.1,
penalty="l2", solver="saga", max_iter=5000
)

mv = StackingClassifier(
estimators=estimators, final_estimator=final_estimator,
cv=5, passthrough=True, n_jobs=-1
)

pipe = Pipeline([
("impute", SimpleImputer(strategy="mean")),
("scale", StandardScaler()),
("smote", SMOTE()),
("model", mv)
])
return pipe

def eval(y_test, x_test, estimator):
y_true = y_test
y_pred = estimator.predict(x_test)
return classification_report(y_true, y_pred)

def main():
X, y, column_name = load_and_prepare_data()

x_train, x_test, y_train, y_test = train_test_split(
X, y, test_size=1/3, shuffle=True, random_state=91, stratify=y
)

pipe_mv = build_pipeline()

print("Starting model training. It will take some time, sit tight......")
t1 = time.time()
pipe_mv.fit(x_train, y_train)
t2 = time.time()

print("Model trained successfully")
minutes, seconds = np.divmod(t2 - t1, 60)
print(f"Time Elapsed: {minutes:.0f} M {seconds:.2f} S")


print(eval(y_test, x_test, pipe_mv))

joblib.dump(pipe_mv, "models/pipe.pkl")
joblib.dump(column_name, "models/column_names.pkl")
print("Model and column names saved successfully.")


if __name__ == "__main__":
main()
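Note: the num_transits feature in load_and_prepare_data is derived entirely from the observation window and the ephemeris: the first and last transit indices inside the window are ceil((obs_start - tranmid) / period) and floor((obs_end - tranmid) / period), and their difference plus one (clipped at zero) is the transit count. Below is a small standalone sketch of that arithmetic with made-up numbers, reusing get_window from fit.py (assumes fit.py's dependencies are installed).

import numpy as np
from fit import get_window

# Subset of the campaign windows defined in load_and_prepare_data (BJD start/end).
campaign_dates = {5: (2457159.0, 2457246.0), 6: (2457250.0, 2457338.0)}

# A target observed in campaigns 5 and 6 gets the earliest start and the latest end.
obs_start, obs_end = get_window("5, 6", campaign_dates)
print(obs_start, obs_end)  # 2457159.0 2457338.0

# Made-up ephemeris: transit midpoint (BJD) and orbital period (days).
tranmid, period = 2457100.0, 12.5

# Same counting rule as the n_min / n_max / num_transits columns in fit.py.
n_min = np.ceil((obs_start - tranmid) / period)   # first transit index inside the window
n_max = np.floor((obs_end - tranmid) / period)    # last transit index inside the window
num_transits = max(n_max - n_min + 1, 0)
print(num_transits)  # 15.0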