Skip to content

Commit 9db56a9

Browse files
committed
-- added feature extractor
-- added unittests
1 parent a2c40a5 commit 9db56a9

File tree

3 files changed

+214
-32
lines changed

3 files changed

+214
-32
lines changed

examples/to_test_regression.py

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -116,18 +116,24 @@ def fake_metric(y_hat, y):
116116
optimizer=torch.optim.Adagrad,
117117
optimizer_params={},
118118
)
119-
tabular_model.save_model("examples/sample")
120-
result = tabular_model.evaluate(test)
121-
print(result)
122-
# # print(result[0]['train_loss'])
123-
new_mdl = TabularModel.load_from_checkpoint("examples/sample")
124-
# TODO test none no test loader
125-
result = new_mdl.evaluate(test)
126-
print(result)
127-
tabular_model.fit(
128-
train=train, test=test, metrics=[fake_metric], target_transform=tr, max_epochs=2
129-
)
130-
pred_df = tabular_model.predict(test, quantiles=[0.25], ret_logits=True)
131-
print(pred_df.head())
119+
120+
from pytorch_tabular.feature_extractor import DeepFeatureExtractor
121+
122+
dt = DeepFeatureExtractor(tabular_model)
123+
enc_df = dt.fit_transform(test)
124+
print(enc_df.head())
125+
# tabular_model.save_model("examples/sample")
126+
# result = tabular_model.evaluate(test)
127+
# print(result)
128+
# # # print(result[0]['train_loss'])
129+
# new_mdl = TabularModel.load_from_checkpoint("examples/sample")
130+
# # TODO test none no test loader
131+
# result = new_mdl.evaluate(test)
132+
# print(result)
133+
# tabular_model.fit(
134+
# train=train, test=test, metrics=[fake_metric], target_transform=tr, max_epochs=2
135+
# )
136+
# pred_df = tabular_model.predict(test, quantiles=[0.25], ret_logits=True)
137+
# print(pred_df.head())
132138

133139
# pred_df.to_csv("output/temp2.csv")
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# Pytorch Tabular
2+
# Author: Manu Joseph <manujoseph@gmail.com>
3+
# For license information, see LICENSE.TXT
4+
from collections import defaultdict
5+
6+
import numpy as np
7+
import pandas as pd
8+
from sklearn.base import BaseEstimator, TransformerMixin
9+
from tqdm.autonotebook import tqdm
10+
11+
from pytorch_tabular.models import NODEModel, TabNetModel
12+
from pytorch_tabular.models.mixture_density import BaseMDN
13+
14+
try:
15+
import cPickle as pickle
16+
except ImportError:
17+
import pickle
18+
19+
import torch
20+
21+
22+
class DeepFeatureExtractor(BaseEstimator, TransformerMixin):
    """Extracts learned (neural) features from a trained ``TabularModel``.

    Implements the scikit-learn transformer API (``fit``/``transform``/
    ``fit_transform``) so it can be used in sklearn pipelines: ``fit`` is a
    no-op, ``transform`` runs the trained model over the data and appends the
    requested intermediate model outputs as new dataframe columns.
    """

    def __init__(self, tabular_model, extract_keys=None, drop_original=True):
        """Initializes the Transformer which extracts the neural features.

        Args:
            tabular_model (TabularModel): The trained TabularModel object.
            extract_keys (list, optional): Keys of the model output dict to
                extract as features. Defaults to ``["backbone_features"]``.
            drop_original (bool, optional): Whether to drop the original
                columns from the transformed dataframe. Defaults to True.

        Raises:
            ValueError: If the underlying model is an MDN, NODE, or TabNet
                model, which do not expose usable backbone features.
        """
        # `assert` is stripped under `python -O`, so validate with a real
        # exception instead.
        if isinstance(tabular_model.model, (NODEModel, TabNetModel, BaseMDN)):
            raise ValueError(
                "FeatureExtractor doesn't work for Mixture Density Networks,"
                " NODE Model, & Tabnet Model"
            )
        self.tabular_model = tabular_model
        # Avoid the shared mutable-default-argument pitfall by resolving the
        # default inside the body.
        self.extract_keys = (
            ["backbone_features"] if extract_keys is None else extract_keys
        )
        self.drop_original = drop_original

    def fit(self, X, y=None):
        """Just for compatibility. Does not do anything.

        Returns:
            DeepFeatureExtractor: self, per the sklearn transformer contract.
        """
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Transforms the input rows into the trained neural features.

        Args:
            X (pd.DataFrame): DataFrame of features, shape
                (n_samples, n_features). Must contain the columns the
                model was trained on.
            y ([type], optional): Only for compatibility. Not used.
                Defaults to None.

        Returns:
            pd.DataFrame: The encoded dataframe. One column per extracted
                scalar feature (multi-dimensional outputs are expanded to
                ``{key}_{i}`` columns); original columns are dropped when
                ``drop_original`` is True.
        """
        X_encoded = X.copy(deep=True)
        orig_features = X_encoded.columns
        self.tabular_model.model.eval()
        inference_dataloader = (
            self.tabular_model.datamodule.prepare_inference_dataloader(X_encoded)
        )
        logits_predictions = defaultdict(list)
        # Inference only: disable autograd bookkeeping to save memory/time.
        with torch.no_grad():
            for batch in tqdm(inference_dataloader, desc="Generating Features..."):
                for k, v in batch.items():
                    if isinstance(v, list) and (len(v) == 0):
                        # Skipping empty list (e.g. no categorical or no
                        # continuous columns in the batch dict)
                        continue
                    batch[k] = v.to(self.tabular_model.model.device)
                _, ret_value = self.tabular_model.model.predict(
                    batch, ret_model_output=True
                )
                for k in self.extract_keys:
                    if k in ret_value:
                        logits_predictions[k].append(ret_value[k].detach().cpu())

        for k, v in logits_predictions.items():
            v = torch.cat(v, dim=0).numpy()
            if v.ndim == 1:
                v = v.reshape(-1, 1)
            for i in range(v.shape[-1]):
                # Suffix with the feature index only when there is more than
                # one dimension, so a scalar output keeps its plain key name.
                if v.shape[-1] > 1:
                    X_encoded[f"{k}_{i}"] = v[:, i]
                else:
                    X_encoded[f"{k}"] = v[:, i]

        if self.drop_original:
            X_encoded.drop(columns=orig_features, inplace=True)
        return X_encoded

    def fit_transform(self, X: pd.DataFrame, y=None):
        """Encode given columns of X based on the learned features.

        Args:
            X (pd.DataFrame): DataFrame of features, shape
                (n_samples, n_features). Must contain columns to encode.
            y ([type], optional): Only for compatibility. Not used.
                Defaults to None.

        Returns:
            pd.DataFrame: The encoded dataframe.
        """
        self.fit(X, y)
        return self.transform(X)

    def save_as_object_file(self, path):
        """Pickles the transformer state to ``path``."""
        # BUGFIX: previously checked `self._mapping`, an attribute this class
        # never defines (copied from the categorical encoder), which made the
        # guard raise AttributeError unconditionally.
        if not hasattr(self, "tabular_model"):
            raise ValueError(
                "`fit` method must be called before `save_as_object_file`."
            )
        pickle.dump(self.__dict__, open(path, "wb"))

    def load_from_object_file(self, path):
        """Restores transformer state previously saved with
        ``save_as_object_file``."""
        # NOTE(review): unpickling is only safe on trusted files.
        for k, v in pickle.load(open(path, "rb")).items():
            setattr(self, k, v)

tests/test_common.py

Lines changed: 81 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,27 @@
22
"""Tests for `pytorch_tabular` package."""
33

44
import pytest
5-
import numpy as np
65
import torch
7-
from sklearn.preprocessing import PowerTransformer
8-
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
9-
from pytorch_tabular.models import CategoryEmbeddingModelConfig, AutoIntConfig, NodeConfig, TabNetModelConfig, CategoryEmbeddingMDNConfig
6+
107
from pytorch_tabular import TabularModel
11-
from pytorch_tabular.categorical_encoders import CategoricalEmbeddingTransformer
12-
13-
MODEL_CONFIGS = [
14-
CategoryEmbeddingModelConfig,
15-
AutoIntConfig,
16-
NodeConfig,
17-
TabNetModelConfig,
18-
CategoryEmbeddingMDNConfig
8+
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
9+
from pytorch_tabular.feature_extractor import DeepFeatureExtractor
10+
from pytorch_tabular.models import (
11+
AutoIntConfig,
12+
CategoryEmbeddingModelConfig,
13+
NodeConfig,
14+
TabNetModelConfig,
15+
)
16+
17+
MODEL_CONFIG_SAVE_TEST = [
18+
CategoryEmbeddingModelConfig,
19+
AutoIntConfig,
20+
TabNetModelConfig,
21+
]
22+
23+
MODEL_CONFIG_FEATURE_EXT_TEST = [
24+
CategoryEmbeddingModelConfig,
25+
AutoIntConfig,
1926
]
2027

2128

@@ -25,7 +32,7 @@ def fake_metric(y_hat, y):
2532

2633
@pytest.mark.parametrize(
2734
"model_config_class",
28-
MODEL_CONFIGS,
35+
MODEL_CONFIG_SAVE_TEST,
2936
)
3037
@pytest.mark.parametrize(
3138
"continuous_cols",
@@ -52,7 +59,7 @@ def test_save_load(
5259
custom_metrics,
5360
custom_loss,
5461
custom_optimizer,
55-
tmpdir
62+
tmpdir,
5663
):
5764
(train, test, target) = regression_data
5865
data_config = DataConfig(
@@ -83,12 +90,67 @@ def test_save_load(
8390
)
8491

8592
result_1 = tabular_model.evaluate(test)
86-
print(result_1)
87-
tmpdir.mkdir("save_model")
88-
tabular_model.save_model("save_model")
89-
new_mdl = TabularModel.load_from_checkpoint("save_model")
93+
sv_dir = tmpdir.mkdir("save_model")
94+
tabular_model.save_model(str(sv_dir))
95+
new_mdl = TabularModel.load_from_checkpoint(str(sv_dir))
9096
result_2 = new_mdl.evaluate(test)
91-
assert result_1[0][f'test_{tabular_model.model.hparams.metrics[0]}'] == result_2[0][f'test_{new_mdl.model.hparams.metrics[0]}']
97+
assert (
98+
result_1[0][f"test_{tabular_model.model.hparams.metrics[0]}"]
99+
== result_2[0][f"test_{new_mdl.model.hparams.metrics[0]}"]
100+
)
101+
102+
103+
@pytest.mark.parametrize(
104+
"model_config_class",
105+
MODEL_CONFIG_FEATURE_EXT_TEST,
106+
)
107+
@pytest.mark.parametrize(
108+
"continuous_cols",
109+
[
110+
[
111+
"AveRooms",
112+
"AveBedrms",
113+
"Population",
114+
"AveOccup",
115+
"Latitude",
116+
"Longitude",
117+
],
118+
],
119+
)
120+
@pytest.mark.parametrize("categorical_cols", [["HouseAgeBin"]])
121+
@pytest.mark.parametrize(
    "model_config_class",
    MODEL_CONFIG_FEATURE_EXT_TEST,
)
@pytest.mark.parametrize(
    "continuous_cols",
    [
        [
            "AveRooms",
            "AveBedrms",
            "Population",
            "AveOccup",
            "Latitude",
            "Longitude",
        ],
    ],
)
@pytest.mark.parametrize("categorical_cols", [["HouseAgeBin"]])
def test_feature_extractor(
    regression_data,
    model_config_class,
    continuous_cols,
    categorical_cols,
):
    """Smoke test: training a model and running DeepFeatureExtractor over the
    test split should yield at least one backbone feature column."""
    (train, test, target) = regression_data
    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
    )
    model_config_params = dict(task="regression")
    model_config = model_config_class(**model_config_params)
    trainer_config = TrainerConfig(
        max_epochs=3, checkpoints=None, early_stopping=None, gpus=0
    )
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    tabular_model.fit(
        train=train,
        test=test,
    )
    dt = DeepFeatureExtractor(tabular_model)
    enc_df = dt.fit_transform(test)
    # Generator form short-circuits and avoids building a throwaway list
    # (the previous `any([col for col in ... if "backbone" in col])`).
    assert any("backbone" in col for col in enc_df.columns)
153+
92154

93155
# import numpy as np
94156
# import pandas as pd

0 commit comments

Comments
 (0)