Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions app/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,8 @@

else:
st.error(f"Failed to load model info: {info_response.status_code}")
st.error(f"Error: {info_response.text}")
st.error(f"Base URL: {API_BASE_URL}")

except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
st.error(
Expand Down
Binary file modified models/v1/metadata.pkl
Binary file not shown.
Binary file modified models/v1/model.pkl
Binary file not shown.
Binary file modified models/v1/preprocessor.pkl
Binary file not shown.
Binary file modified models/v2/metadata.pkl
Binary file not shown.
Binary file modified models/v2/model.pkl
Binary file not shown.
43 changes: 36 additions & 7 deletions src/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,21 +114,50 @@ def save_model(model, preprocessor, metadata, version: str) -> str:


def get_feature_importance(model, preprocessor):
    """Get feature importance from the trained model, aggregated by original features.

    Transformed feature importances (e.g. one importance per one-hot column) are
    summed back onto the original input feature they came from, then expressed as
    percentages (raw importance * 100).

    Args:
        model: A fitted estimator. Must expose ``feature_importances_``
            (e.g. tree-based sklearn models); otherwise ``None`` is returned.
        preprocessor: The fitted preprocessor used before the model. If it
            provides ``get_feature_names_out()`` those names are used; otherwise
            generic ``feature_i`` placeholders are generated.

    Returns:
        dict | None: ``DataFrame.to_dict()`` of a frame with ``feature`` and
        ``importance`` columns sorted by importance (descending), or ``None``
        when the model has no ``feature_importances_`` attribute.
    """
    if not hasattr(model, 'feature_importances_'):
        return None

    # Get feature names after preprocessing.
    if hasattr(preprocessor, 'get_feature_names_out'):
        transformed_feature_names = preprocessor.get_feature_names_out()
    else:
        transformed_feature_names = [f"feature_{i}" for i in range(
            len(model.feature_importances_))]

    # Map each transformed feature back to its original feature and sum.
    original_importance = {}

    for i, transformed_name in enumerate(transformed_feature_names):
        # Raw importance expressed as a percentage.
        importance = model.feature_importances_[i] * 100

        # Extract the original feature name from the transformed name:
        #   OneHotEncoder  -> 'cat_transformer__feature_name_category'
        #   RobustScaler   -> 'num_transformer__feature_name'
        if '__' in transformed_name:
            parts = transformed_name.split('__')
            if len(parts) >= 2:
                original_feature = parts[1]
                # For one-hot columns, drop the trailing category suffix.
                # NOTE(review): this strips only the last '_' segment, so a
                # category value containing underscores (e.g. 'self_employed')
                # would leave part of the category in the feature name —
                # confirm category values are underscore-free.
                if '_' in original_feature and parts[0] == 'cat_transformer':
                    original_feature = '_'.join(original_feature.split('_')[:-1])
            else:
                original_feature = transformed_name
        else:
            original_feature = transformed_name

        # Aggregate importance under the original feature name.
        if original_feature in original_importance:
            original_importance[original_feature] += importance
        else:
            original_importance[original_feature] = importance

    # Build the result frame, most important features first.
    importance_df = pd.DataFrame([
        {'feature': feature, 'importance': importance}
        for feature, importance in original_importance.items()
    ]).sort_values('importance', ascending=False)

    return importance_df.to_dict()

Expand Down