
Commit 7319fe2

Add files via upload
1 parent f7696ca commit 7319fe2

7 files changed: 153 additions & 58 deletions

workspace/Datasets/dataset.py

Lines changed: 13 additions & 2 deletions
@@ -1,5 +1,4 @@
 from torch.utils.data import Dataset
-from torch import tensor
 
 class ProblemDataset(Dataset):
     def __init__(self, data, inputs_encoder, labels_encoder):
@@ -10,4 +9,16 @@ def __len__(self):
         return len(self.inputs)
 
     def __getitem__(self, index):
-        return self.inputs[index], self.labels[index]
+        return self.inputs[index], self.labels[index]
+
+def tolist(dataset):
+    inputs = []
+    labels = []
+
+    for inp, lab in dataset:
+        inputs.append(inp[0, :].numpy().tolist())
+        labels.append(lab.numpy().tolist())
+
+    labels = [item for sublist in labels for item in sublist]
+
+    return inputs, labels
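
A quick usage sketch of the new tolist helper with hypothetical data; it assumes each dataset item pairs a (3, max_len) input tensor with a one-element label tensor, which is what the encoders in this repo produce:

    from torch import tensor

    # two fake samples: rows are input_ids / attention_mask / token_type_ids
    fake_set = [
        (tensor([[101, 2023, 102], [1, 1, 1], [0, 0, 0]]), tensor([2])),
        (tensor([[101, 4937, 102], [1, 1, 1], [0, 0, 0]]), tensor([0])),
    ]

    inputs, labels = tolist(fake_set)
    print(inputs)  # [[101, 2023, 102], [101, 4937, 102]] -- the input_ids rows
    print(labels)  # [2, 0] -- the per-sample label lists, flattened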

workspace/Datasets/encoders.py

Lines changed: 3 additions & 14 deletions
@@ -25,21 +25,10 @@ def inputs_encoder(inputs):
 
         encoded_input = cat((encoded_input_ids, encoded_attention_mask, encoded_token_type_ids), dim = 0).unsqueeze(0)
         encoded_inputs.append(encoded_input)
+
+    encoded = cat(encoded_inputs)
 
-    try:
-        encoded = cat(encoded_inputs)
-        return encoded
-    except:
-        print('Number of inputs:', len(encoded_inputs))
-        max_dif = max([e.shape[2] for e in encoded_inputs])
-        print('Max shape:', max_dif)
-        for idx, e in enumerate(encoded_inputs):
-            if e.shape[2] != max_len:
-                print(idx)
-                print(e.shape[2])
-                print(inputs[idx])
-                break
-        return None
+    return encoded
 
 def labels_encoder(labels):
     encoded_labels = []
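
The deleted try/except was shape-mismatch debugging; the cleaned-up version relies on torch.cat succeeding, which it only does if every per-sample tensor shares one sequence length. That invariant presumably comes from the tokenizer call inside inputs_encoder being pinned to max_len, roughly along these lines (a sketch -- the variable names are assumptions, not this file's actual code):

    enc = tokenizer.encode_plus(
        text,
        max_length=max_len,
        padding='max_length',   # pad short texts up to max_len
        truncation=True,        # cut long texts down to max_len
        return_tensors='pt',
    )
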
workspace/Models/classic_classifier.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+from sklearn.pipeline import Pipeline
+from sklearn.feature_extraction.text import TfidfTransformer
+
+import matplotlib.pyplot as plt
+from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay
+
+class ClassicClassifier():
+    def __init__(self, clf):
+        self.text_clf = Pipeline([
+            ('tfidf', TfidfTransformer()),
+            ('clf', clf()),
+        ])
+
+    def fit(self, inputs, labels):
+        self.text_clf = self.text_clf.fit(inputs, labels)
+
+    def predict(self, inputs):
+        return self.text_clf.predict(inputs)
+
+    def evaluate(self, labels, predicts):
+        target_names = ['Easy', 'Medium', 'Hard']
+        macro_f1 = f1_score(labels, predicts, average='macro')
+        cm = confusion_matrix(labels, predicts)
+        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
+        print(f'Macro F1: {macro_f1}')
+        disp.plot()
+        plt.show()
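
Design note on the new pipeline: TfidfTransformer expects an (n_samples, n_features) count-like matrix rather than raw strings, and that is what it gets here -- tolist yields equal-length rows of token ids, so the shapes line up, though the values are BERT vocabulary ids rather than term counts, which is worth keeping in mind when reading the scores. A usage sketch, with X_train, y_train, X_test, y_test as produced by tolist in train.py:

    from sklearn.svm import SVC

    svc = ClassicClassifier(SVC)
    svc.fit(X_train, y_train)                  # tf-idf over the id rows, then SVC
    svc.evaluate(y_test, svc.predict(X_test))  # prints macro F1, plots the confusion matrix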

workspace/Models/model.py

Lines changed: 52 additions & 25 deletions
@@ -1,46 +1,47 @@
+import torch
 from torch import nn, long, argmax, optim, save
-from torch import no_grad
 from transformers import BertModel
 from torch import cuda
-from Datasets.dataloader import create_dataloader
 from loss import calc_loss
 
 import matplotlib.pyplot as plt
 from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay
 
 class BERTModule(nn.Module):
-    def __init__(self, epochs = 10, learning_rate = 1e-05, dropout_p = 0.3):
+    def __init__(self, n_classes, dropout_p = 0.3):
         super(BERTModule, self).__init__()
         self.bert = BertModel.from_pretrained('bert-base-uncased')
         # for param in self.bert.parameters():
         #     param.requires_grad = False
         self.dropout = nn.Dropout(p = dropout_p)
-        self.fc = nn.Linear(768, 3)
+        self.fc = nn.Linear(768, n_classes)
 
-        self.epochs = epochs
-        self.learning_rate = learning_rate
+        self.device = 'cuda' if cuda.is_available() else 'cpu'
+        self.to(self.device)
 
     def forward(self, ids, masks, ttis):
         _, pooled_output = self.bert(ids, attention_mask = masks, token_type_ids = ttis, return_dict = False)
-        output_2 = self.dropout(pooled_output)
-        output = self.fc(output_2)
-
+        output_drop = self.dropout(pooled_output)
+        output = self.fc(output_drop)
+
         return output
-
-    def fit(self, train_loader, test_loader):
-        self.device = 'cuda' if cuda.is_available() else 'cpu'
-        self.to(self.device)
 
-        self.train()
-
+    def fit(self, train_loader, test_loader, epochs = 10, learning_rate = 1e-05):
+        self.epochs = epochs
+        self.learning_rate = learning_rate
+
         criterion = nn.CrossEntropyLoss()
         optimizer = optim.Adam(params = self.parameters(), lr = self.learning_rate)
-
-        print('Begin training...')
+
+        self.to(self.device)
+
+        self.train()
 
         train_losses = []
         test_losses = []
 
+        print('Begin training...')
+
         for epoch in range(self.epochs):
             train_loss = 0.
 
@@ -66,9 +67,9 @@ def fit(self, train_loader, test_loader):
             optimizer.step()
 
             train_loss += loss.item()
-
+
         avg_train_loss = train_loss / len(train_loader)
-        avg_test_loss = calc_loss(self, test_loader, criterion, self.device)
+        avg_test_loss = calc_loss(self, test_loader, criterion)
 
         train_losses.append(avg_train_loss)
         test_losses.append(avg_test_loss)
@@ -88,9 +89,9 @@ def evaluate(self, dataloader):
         data_labels = []
         data_outputs = []
 
-        with no_grad():
+        with torch.no_grad():
             for inputs, labels in dataloader:
-
+
                 ids = inputs[:, 0].to(self.device, dtype=long)
                 masks = inputs[:, 1].to(self.device, dtype=long)
                 tti = inputs[:, 2].to(self.device, dtype=long)
@@ -109,15 +110,41 @@ def evaluate(self, dataloader):
                 data_labels.extend(labels.cpu().detach().numpy().tolist())
                 data_outputs.extend(outputs.cpu().detach().numpy().tolist())
 
-
         target_names = ['Easy', 'Medium', 'Hard']
         macro_f1 = f1_score(data_labels, data_outputs, average='macro')
         cm = confusion_matrix(data_labels, data_outputs)
-        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Easy', 'Medium', 'Hard'])
+        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
         print(f'Macro F1: {macro_f1}')
         disp.plot()
         plt.show()
 
+    def predict(self, text):
+        self.eval()
+
+        from Datasets.encoders import define_encoders
+        input_encoder, _ = define_encoders(max_len=300)
+
+        with torch.no_grad():
+            # encode the raw text with the same encoder used in training;
+            # there are no labels at inference time
+            input = input_encoder(text)
+
+            ids = input[:, 0].to(self.device, dtype=long)
+            masks = input[:, 1].to(self.device, dtype=long)
+            tti = input[:, 2].to(self.device, dtype=long)
+
+            assert ids.shape == masks.shape, 'Ids != Masks'
+            assert masks.shape == tti.shape, 'Masks != Ttis'
+            assert ids.shape == tti.shape, 'Ids != Ttis'
+
+            # forward pass -> softmax probabilities -> predicted class index
+            outputs = self(ids, masks, tti)
+            outputs = nn.functional.softmax(outputs, dim=1)
+            outputs = argmax(outputs, dim=1)
+
+            outputs = outputs.cpu().detach().numpy().tolist()
+
+            print(len(outputs))
 
-    def predict():
-        return 1
+            print(f'Text: {text}')
+            print(f'Difficulty: {outputs}')
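
As committed, predict prints the argmaxed class indices; mapping an index back to a difficulty name is the natural next step (a sketch -- the Easy/Medium/Hard ordering is assumed from evaluate above):

    target_names = ['Easy', 'Medium', 'Hard']
    print(f'Difficulty: {target_names[outputs[0]]}')  # outputs holds class indices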

workspace/inference.py

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+from torch import load
+
+from Datasets.test_dataset import ProblemDataset
+from Datasets.load import load_data
+from Models.model import BERTModule
+
+model_path = 'model_ep2_lr1e-05.pth'
+
+model = BERTModule(n_classes = 3)
+model.load_state_dict(load(model_path))
+
+text = """
+You want to create as many non-degenerate triangles as possible while satisfying the following requirements. Each triangle consists of 3
+distinct special points (not necessarily from different sides) as its corners. Each special point can only become the corner of at most 1
+triangle. All triangles must not intersect with each other.
+
+Determine the maximum number of non-degenerate triangles that you can create.
+"""
+
+model.predict(text=text)
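
A defensive variant worth noting (an assumption, not part of the commit): map_location keeps the checkpoint loadable on a CPU-only machine even if it was saved from a GPU run, and an explicit eval() guards any code path that bypasses predict's own call:

    model = BERTModule(n_classes = 3)
    model.load_state_dict(load(model_path, map_location=model.device))
    model.eval()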

workspace/loss.py

Lines changed: 8 additions & 7 deletions
@@ -1,15 +1,16 @@
 import torch
 from torch import long
 
-def calc_loss(model, dataloader, criterion, device):
-
+def calc_loss(model, dataloader, criterion):
     with torch.no_grad():
-        total_loss = 0.0
+        total_loss = 0.
         for inputs, labels in dataloader:
-            ids = inputs[:, 0].to(device, dtype=long)
-            masks = inputs[:, 1].to(device, dtype=long)
-            tti = inputs[:, 2].to(device, dtype=long)
-            labels = labels.squeeze().to(device, dtype=long)
+            ids = inputs[:, 0].to(model.device, dtype=long)
+            masks = inputs[:, 1].to(model.device, dtype=long)
+            tti = inputs[:, 2].to(model.device, dtype=long)
+            labels = labels.squeeze().to(model.device, dtype=long)
+
+            # print(ids.shape)
 
             outputs = model(ids, masks, tti)
 
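calc_loss runs under no_grad but never leaves train mode, so dropout stays active while the test loss is measured. A possible refinement (a sketch, not in the commit) is a small wrapper:

    def calc_loss_eval(model, dataloader, criterion):
        # disable dropout for the measurement, then restore training mode
        model.eval()
        try:
            return calc_loss(model, dataloader, criterion)
        finally:
            model.train()
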
workspace/train.py

Lines changed: 30 additions & 10 deletions
@@ -1,5 +1,3 @@
-from torch import nn, long, optim, save
-
 from Datasets.dataset import ProblemDataset
 from Models.model import BERTModule
 
@@ -11,18 +9,25 @@
 
 from show_loss import show_loss_evolution
 
+from Models.classic_classifier import ClassicClassifier
+
+from sklearn.svm import SVC
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.ensemble import RandomForestClassifier
+
+from Datasets.dataset import tolist
+
 SEED = 42
 MAX_LEN = 200
-BATCH_SIZE = 8
+TRAIN_BATCH_SIZE = 8
+TEST_BATCH_SIZE = 4
 EPOCHS = 5
 LEARNING_RATE = 1e-05
 DATA_PATH = '../data/leetcode.csv'
 
 define_seed(SEED)
 
 inputs_encoder, labels_encoder = define_encoders(MAX_LEN)
-# TODO: Error in inputs_encoder, some inputs are getting dim_size greatter than MAX_LEN
-# an code is done to print the first input that gets a tensor with different shape
 
 data = load_data(data_path=DATA_PATH)
 
@@ -37,14 +42,29 @@
     seed=SEED
 )
 
-train_loader = create_dataloader(dataset = train_set, batch_size = BATCH_SIZE, type='train')
-test_loader = create_dataloader(dataset = test_set, batch_size = BATCH_SIZE, type='test')
+train_loader = create_dataloader(dataset = train_set, batch_size = TRAIN_BATCH_SIZE, type='train')
+test_loader = create_dataloader(dataset = test_set, batch_size = TEST_BATCH_SIZE, type='test')
 
-model = BERTModule(epochs = EPOCHS, learning_rate = LEARNING_RATE)
+model = BERTModule(n_classes = 3)
 
-train_losses, test_losses = model.fit(train_loader=train_loader, test_loader=test_loader)
+train_losses, test_losses = model.fit(train_loader=train_loader, test_loader=test_loader, epochs = EPOCHS, learning_rate = LEARNING_RATE)
 
 model.evaluate(dataloader=train_loader)
 model.evaluate(dataloader=test_loader)
 
-show_loss_evolution(EPOCHS, train_losses, test_losses)
+show_loss_evolution(EPOCHS, train_losses, test_losses)
+
+X_train, y_train = tolist(train_set)
+X_test, y_test = tolist(test_set)
+
+svc = ClassicClassifier(SVC)
+gb = ClassicClassifier(GradientBoostingClassifier)
+rf = ClassicClassifier(RandomForestClassifier)
+
+svc.fit(X_train, y_train)
+gb.fit(X_train, y_train)
+rf.fit(X_train, y_train)
+
+svc.evaluate(y_test, svc.predict(X_test))
+gb.evaluate(y_test, gb.predict(X_test))
+rf.evaluate(y_test, rf.predict(X_test))
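
The three classifier blocks at the end repeat one fit/evaluate pattern; an equivalent loop form (a stylistic alternative, not part of the commit) keeps them in lockstep:

    for name, clf_cls in [('SVC', SVC),
                          ('GradientBoosting', GradientBoostingClassifier),
                          ('RandomForest', RandomForestClassifier)]:
        print(name)
        clf = ClassicClassifier(clf_cls)
        clf.fit(X_train, y_train)
        clf.evaluate(y_test, clf.predict(X_test))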
