 ================================================================
 A demo of Robust Regression on real dataset "california housing"
 ================================================================
-In this example we compare the RobustWeightedEstimator using SGDRegressor
-for regression on the real dataset california housing.
-WARNING: running this example can take some time (<1hour).
-
-We also compare with robust estimators from scikit-learn: TheilSenRegressor
-and RANSACRegressor
+In this example we compare the RobustWeightedRegressor to other scikit-learn
+regressors on the real California housing dataset.
+WARNING: running this example can take some time (<1 hour on a recent computer).

 One of the main points of this example is the importance of taking into account
 outliers in the test dataset when dealing with real datasets.

-For this example, we took a parameter so that RobustWeightedEstimator is better
+For this example, we chose a parameter so that RobustWeightedRegressor is better
 than RANSAC and TheilSen in terms of the mean squared error, and it
 is better than the SGDRegressor in terms of the median squared error.
 Depending on which criterion one wants to optimize, the parameter measuring
-robustness in RobustWeightedEstimator can change and this is not so
+robustness in RobustWeightedRegressor can change and this is not so
 straightforward when using RANSAC and TheilSenRegressor.
 """
 import matplotlib.pyplot as plt
 import numpy as np
-from sklearn_extra.robust import RobustWeightedEstimator
+from sklearn_extra.robust import RobustWeightedRegressor
 from sklearn.linear_model import (
     SGDRegressor,
     TheilSenRegressor,
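
The hunk above replaces the generic `RobustWeightedEstimator` wrapper with the
dedicated `RobustWeightedRegressor` class. For orientation, here is a minimal
sketch of the renamed class in use; the `fetch_california_housing` loader and
the train/test split are assumptions added to make the snippet self-contained,
and the hyperparameters are copied from this diff rather than tuned:

```python
# Sketch only: illustrates the renamed estimator, not this example's full
# benchmark loop. Loader and split are assumptions for self-containedness.
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn_extra.robust import RobustWeightedRegressor

X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# weighting="huber" with c=0.5 matches the configuration in this diff;
# smaller c downweights suspected outliers more aggressively.
reg = RobustWeightedRegressor(weighting="huber", c=0.5)
reg.fit(X_train, y_train)
print(reg.score(X_test, y_test))  # R^2 on the held-out split
```
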
@@ -57,19 +54,18 @@ def quadratic_loss(est, X, y, X_test, y_test):
         ),
     ),
     (
-        "RWE, Huber weights",
-        RobustWeightedEstimator(
-            SGDRegressor(
-                learning_rate="adaptive",
-                eta0=1e-6,
-                max_iter=1000,
-                n_iter_no_change=100,
-            ),
-            loss="squared_loss",
+        "RobustWeightedRegressor",
+        RobustWeightedRegressor(
             weighting="huber",
             c=0.5,
             eta0=1e-6,
             max_iter=500,
+            sgd_args={
+                "max_iter": 1000,
+                "n_iter_no_change": 100,
+                "learning_rate": "adaptive",
+                "eta0": 1e-6,
+            },
         ),
     ),
     ("RANSAC", RANSACRegressor()),
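
Read together, the removed and added lines in this hunk amount to the API
migration sketched below. This simply restates the diff as a before/after pair
(it is not meant to run as one script, since only one of the two classes exists
in a given version of the library, and it assumes the example's imports):

```python
# Old API (removed lines): wrap an explicitly constructed SGDRegressor.
old = RobustWeightedEstimator(
    SGDRegressor(
        learning_rate="adaptive",
        eta0=1e-6,
        max_iter=1000,
        n_iter_no_change=100,
    ),
    loss="squared_loss",
    weighting="huber",
    c=0.5,
    eta0=1e-6,
    max_iter=500,
)

# New API (added lines): the inner SGDRegressor is built internally and
# configured through the sgd_args dict. The duplicated eta0/max_iter at the
# top level appear to configure the meta-estimator's own reweighting loop,
# separately from the inner solver.
new = RobustWeightedRegressor(
    weighting="huber",
    c=0.5,
    eta0=1e-6,
    max_iter=500,
    sgd_args={
        "max_iter": 1000,
        "n_iter_no_change": 100,
        "learning_rate": "adaptive",
        "eta0": 1e-6,
    },
)
```
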
@@ -82,14 +78,19 @@ def quadratic_loss(est, X, y, X_test, y_test):
 for f in range(M):
     print("\rProgress: %s / %s" % (f + 1, M), end="")

+    rng = np.random.RandomState(f)
+
     # Split in a training set and a test set
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=rng
+    )

     for i, (name, est) in enumerate(estimators):
         cv = quadratic_loss(est, X_train, y_train, X_test, y_test)

         # It is preferable to use the median of the validation losses
-        # because it is possible that some outliers are present in the test set.
+        # because it is possible that some outliers are present in the
+        # test set.
         # We compute both for comparison.
         res[i, f, 0] = np.mean(cv)
         res[i, f, 1] = np.median(cv)
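
A quick numeric illustration of the comment about preferring the median of the
validation losses (the error values here are invented for illustration): a
single outlying test point dominates the mean but leaves the median almost
untouched.

```python
import numpy as np

# Invented per-sample squared errors: four typical points, one outlier.
cv = np.array([0.4, 0.5, 0.6, 0.5, 50.0])

print(np.mean(cv))    # 10.4 -- dominated by the single outlier
print(np.median(cv))  # 0.5  -- reflects the typical error
```
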