|
| 1 | +#!/usr/bin/env python |
| 2 | +"""Tests for `pytorch_tabular` package.""" |
| 3 | +import pytest |
| 4 | + |
| 5 | +from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig |
| 6 | +from pytorch_tabular.models import FTTransformerConfig |
| 7 | +from pytorch_tabular import TabularModel |
| 8 | +from pytorch_tabular.categorical_encoders import CategoricalEmbeddingTransformer |
| 9 | + |
| 10 | + |
@pytest.mark.parametrize("multi_target", [True, False])
@pytest.mark.parametrize(
    "continuous_cols",
    [
        [
            "AveRooms",
            "AveBedrms",
            "Population",
            "AveOccup",
            "Latitude",
            "Longitude",
        ],
    ],
)
@pytest.mark.parametrize("categorical_cols", [["HouseAgeBin"]])
@pytest.mark.parametrize("continuous_feature_transform", [None])
@pytest.mark.parametrize("normalize_continuous_features", [True])
@pytest.mark.parametrize("target_range", [True, False])
def test_regression(
    regression_data,
    multi_target,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_range,
):
    """Smoke-test FTTransformer on a regression task.

    Runs a one-epoch ``fast_dev_run`` fit on the California-housing fixture,
    then checks that evaluation reports MSE and that prediction returns one
    row per test row. Parametrized over single/multi target and optional
    ``target_range`` clamping.
    """
    (train, test, target) = regression_data
    if len(continuous_cols) + len(categorical_cols) == 0:
        # No features selected for this parameter combination: nothing to fit.
        return
    data_config = DataConfig(
        # Precedence note: this is (target + ["MedInc"]) if multi_target else target.
        target=target + ["MedInc"] if multi_target else target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_config_params = dict(
        task="regression",
        input_embed_dim=8,
        num_attn_blocks=1,
        num_heads=2,
    )
    if target_range:
        # Fix: the original iterated `for target in data_config.target`, which
        # shadowed (and rebound) the fixture-derived `target` list above.
        # Use a distinct name and build the (min, max) pairs in one pass.
        model_config_params["target_range"] = [
            (
                train[target_col].min().item(),
                train[target_col].max().item(),
            )
            for target_col in data_config.target
        ]
    model_config = FTTransformerConfig(**model_config_params)
    trainer_config = TrainerConfig(
        max_epochs=1, checkpoints=None, early_stopping=None, gpus=None, fast_dev_run=True
    )
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_mean_squared_error" in result[0].keys()
    pred_df = tabular_model.predict(test)
    assert pred_df.shape[0] == test.shape[0]
| 83 | + |
| 84 | + |
@pytest.mark.parametrize(
    "continuous_cols",
    [
        [f"feature_{i}" for i in range(54)],
    ],
)
@pytest.mark.parametrize("categorical_cols", [["feature_0_cat"]])
@pytest.mark.parametrize("continuous_feature_transform", [None])
@pytest.mark.parametrize("normalize_continuous_features", [True])
def test_classification(
    classification_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
):
    """Smoke-test FTTransformer on a classification task.

    Fits for a single ``fast_dev_run`` epoch on the covertype fixture and
    verifies that evaluation reports accuracy and that prediction yields one
    row per test row.
    """
    train, test, target = classification_data
    if not (continuous_cols or categorical_cols):
        # Nothing to train on for this parameter combination.
        assert True
        return
    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_config = FTTransformerConfig(
        **{
            "task": "classification",
            "input_embed_dim": 8,
            "num_attn_blocks": 1,
            "num_heads": 2,
        }
    )
    trainer_config = TrainerConfig(
        max_epochs=1, checkpoints=None, early_stopping=None, gpus=None, fast_dev_run=True
    )

    model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    model.fit(train=train, test=test)

    eval_result = model.evaluate(test)
    assert "test_accuracy" in eval_result[0].keys()
    predictions = model.predict(test)
    assert predictions.shape[0] == test.shape[0]
| 136 | + |
| 137 | + |
def test_embedding_transformer(regression_data):
    """Check CategoricalEmbeddingTransformer round-trips learned embeddings.

    Trains a tiny FTTransformer, then verifies that the transformer's mapping
    for ``HouseAgeBin`` covers every category (plus one extra entry) and that
    each mapped vector has one value per generated embedding column.
    """
    train, test, target = regression_data
    data_config = DataConfig(
        target=target,
        continuous_cols=[
            "AveRooms",
            "AveBedrms",
            "Population",
            "AveOccup",
            "Latitude",
            "Longitude",
        ],
        categorical_cols=["HouseAgeBin"],
    )
    model_config = FTTransformerConfig(
        task="regression",
        input_embed_dim=8,
        num_attn_blocks=1,
        num_heads=2,
    )
    trainer_config = TrainerConfig(
        max_epochs=1, checkpoints=None, early_stopping=None, gpus=None, fast_dev_run=True
    )
    optimizer_config = OptimizerConfig()

    model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    model.fit(train=train, test=test)

    embedder = CategoricalEmbeddingTransformer(model)
    transformed = embedder.fit_transform(train)
    embed_cols = [
        col for col in transformed.columns if "HouseAgeBin_embed_dim" in col
    ]
    mapping = embedder._mapping["HouseAgeBin"]
    # One mapping entry per observed category, plus one (e.g. for unknowns).
    assert len(mapping.keys()) == len(train["HouseAgeBin"].unique()) + 1
    # Every embedding vector must match the number of generated columns.
    assert all(vec.shape[0] == len(embed_cols) for vec in mapping.values())
| 186 | + |
| 187 | + |
| 188 | +# import numpy as np |
| 189 | +# import pandas as pd |
| 190 | +# from sklearn.datasets import fetch_california_housing, fetch_covtype |
| 191 | + |
| 192 | + |
| 193 | +# def regression_data(): |
| 194 | +# dataset = fetch_california_housing(data_home="data", as_frame=True) |
| 195 | +# df = dataset.frame.sample(5000) |
| 196 | +# df["HouseAgeBin"] = pd.qcut(df["HouseAge"], q=4) |
| 197 | +# df["HouseAgeBin"] = "age_" + df.HouseAgeBin.cat.codes.astype(str) |
| 198 | +# test_idx = df.sample(int(0.2 * len(df)), random_state=42).index |
| 199 | +# test = df[df.index.isin(test_idx)] |
| 200 | +# train = df[~df.index.isin(test_idx)] |
| 201 | +# return (train, test, dataset.target_names) |
| 202 | + |
| 203 | + |
| 204 | +# def classification_data(): |
| 205 | +# dataset = fetch_covtype(data_home="data") |
| 206 | +# data = np.hstack([dataset.data, dataset.target.reshape(-1, 1)])[:10000, :] |
| 207 | +# col_names = [f"feature_{i}" for i in range(data.shape[-1])] |
| 208 | +# col_names[-1] = "target" |
| 209 | +# data = pd.DataFrame(data, columns=col_names) |
| 210 | +# data["feature_0_cat"] = pd.qcut(data["feature_0"], q=4) |
| 211 | +# data["feature_0_cat"] = "feature_0_" + data.feature_0_cat.cat.codes.astype(str) |
| 212 | +# test_idx = data.sample(int(0.2 * len(data)), random_state=42).index |
| 213 | +# test = data[data.index.isin(test_idx)] |
| 214 | +# train = data[~data.index.isin(test_idx)] |
| 215 | +# return (train, test, ["target"]) |
| 216 | + |
| 217 | + |
| 218 | +# test_regression( |
| 219 | +# regression_data(), |
| 220 | +# multi_target=True, |
| 221 | +# continuous_cols=[ |
| 222 | +# "AveRooms", |
| 223 | +# "AveBedrms", |
| 224 | +# "Population", |
| 225 | +# "AveOccup", |
| 226 | +# "Latitude", |
| 227 | +# "Longitude", |
| 228 | +# ], |
| 229 | +# categorical_cols=[], |
| 230 | +# continuous_feature_transform="yeo-johnson", |
| 231 | +# normalize_continuous_features=False, |
| 232 | +# target_range=True, |
| 233 | +# ) |