
Commit f7696ca

Adding project files
1 parent 6a3818f commit f7696ca

File tree

11 files changed: 2072 additions & 0 deletions


data/leetcode.csv

Lines changed: 1759 additions & 0 deletions
Large diffs are not rendered by default.

workspace/Datasets/dataloader.py

Lines changed: 11 additions & 0 deletions
from torch.utils.data import DataLoader


def create_dataloader(dataset, batch_size, split):
    # Shuffle only the training split; evaluation keeps a fixed order.
    shuffle = split == 'train'

    # drop_last=True discards the final partial batch so every batch is full.
    loader = DataLoader(dataset=dataset, batch_size=batch_size, drop_last=True, shuffle=shuffle)

    return loader
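
For reference, a minimal usage sketch (the batch size is a hypothetical value; train_set and test_set come from split_data further down):

train_loader = create_dataloader(train_set, batch_size=16, split='train')  # shuffled
test_loader = create_dataloader(test_set, batch_size=16, split='test')     # fixed order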

workspace/Datasets/dataset.py

Lines changed: 13 additions & 0 deletions
from torch.utils.data import Dataset


class ProblemDataset(Dataset):
    def __init__(self, data, inputs_encoder, labels_encoder):
        # Encode all problem texts and difficulty labels up front.
        self.inputs = inputs_encoder(data['inputs'])
        self.labels = labels_encoder(data['labels'])

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        return self.inputs[index], self.labels[index]
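
A minimal sketch of how the dataset plugs into the loader and encoders defined in the other files (max_len=512 is an assumed value, not taken from the commit):

from Datasets.load import load_data
from Datasets.encoders import define_encoders

data = load_data('data/leetcode.csv')
inputs_encoder, labels_encoder = define_encoders(max_len=512)
dataset = ProblemDataset(data, inputs_encoder, labels_encoder)
print(len(dataset), dataset[0][0].shape)  # e.g. N and torch.Size([3, 512])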

workspace/Datasets/encoders.py

Lines changed: 55 additions & 0 deletions
from torch import tensor, long, cat
from transformers import BertTokenizer


def define_encoders(max_len):
    def inputs_encoder(inputs):
        # Load the tokenizer once per call and reuse it for every problem text.
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased').encode_plus
        encoded_inputs = []
        for text in inputs:
            encoding = tokenizer(
                text=text,
                add_special_tokens=True,
                padding='max_length',
                truncation='longest_first',
                max_length=max_len
            )

            input_ids = encoding['input_ids']
            attention_mask = encoding['attention_mask']
            token_type_ids = encoding['token_type_ids']

            encoded_input_ids = tensor(input_ids, dtype=long).unsqueeze(0)
            encoded_attention_mask = tensor(attention_mask, dtype=long).unsqueeze(0)
            encoded_token_type_ids = tensor(token_type_ids, dtype=long).unsqueeze(0)

            # Stack ids, attention mask and token type ids into one (1, 3, max_len) tensor.
            encoded_input = cat((encoded_input_ids, encoded_attention_mask, encoded_token_type_ids), dim=0).unsqueeze(0)
            encoded_inputs.append(encoded_input)

        try:
            return cat(encoded_inputs)
        except RuntimeError:
            # Debug aid: report the first input whose encoded length differs from max_len.
            print('Number of inputs:', len(encoded_inputs))
            max_width = max(e.shape[2] for e in encoded_inputs)
            print('Max encoded length:', max_width)
            for idx, e in enumerate(encoded_inputs):
                if e.shape[2] != max_len:
                    print(idx)
                    print(e.shape[2])
                    print(inputs[idx])
                    break
            return None

    def labels_encoder(labels):
        encoded_labels = []
        for label in labels:
            encoded_labels.append(
                tensor([label], dtype=long).unsqueeze(0)
            )

        return cat(encoded_labels)

    return inputs_encoder, labels_encoder
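
Each problem becomes a (3, max_len) stack of input ids, attention mask, and token type ids, so inputs_encoder returns an (N, 3, max_len) tensor. A small sketch, assuming a max_len of 32 and an invented problem text:

inputs_encoder, labels_encoder = define_encoders(max_len=32)
encoded = inputs_encoder(['Two Sum: find indices of two numbers adding to a target.'])
print(encoded.shape)          # torch.Size([1, 3, 32])
labels = labels_encoder([0])  # 0 = Easy, 1 = Medium, 2 = Hard, per the labels in model.py
print(labels.shape)           # torch.Size([1, 1])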

workspace/Datasets/load.py

Lines changed: 10 additions & 0 deletions
import pandas as pd


def load_data(data_path):
    # Expects a CSV with 'inputs' (problem text) and 'labels' (difficulty class) columns.
    df = pd.read_csv(data_path, encoding='utf-8')
    data = {
        'inputs': df.inputs.values.tolist(),
        'labels': df.labels.values.tolist()
    }

    return data
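
The loader assumes data/leetcode.csv carries 'inputs' and 'labels' columns, with integer difficulty labels (labels_encoder builds long tensors from them). Illustrative rows, not taken from the dataset:

inputs,labels
"Given an array of integers, return indices of the two numbers that add up to a target.",0
"Given a string s, return the longest palindromic substring in s.",1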

workspace/Datasets/seed.py

Lines changed: 9 additions & 0 deletions
import torch
import random
import numpy as np


def define_seed(seed=42):
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    # torch.use_deterministic_algorithms(True)

workspace/Datasets/split.py

Lines changed: 9 additions & 0 deletions
from torch.utils.data import random_split
from torch import Generator


def split_data(dataset, lengths, seed=42):
    assert len(lengths) == 2, "You must define the sizes of the train and test datasets"

    # A seeded generator makes the split reproducible across runs.
    train_set, test_set = random_split(dataset, lengths, generator=Generator().manual_seed(seed))

    return train_set, test_set
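
A sketch of an 80/20 split, assuming the caller derives the two lengths from the dataset size (the ratio is an assumption, not taken from the commit):

train_len = int(0.8 * len(dataset))
test_len = len(dataset) - train_len
train_set, test_set = split_data(dataset, [train_len, test_len])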

workspace/Models/model.py

Lines changed: 123 additions & 0 deletions
from torch import nn, long, argmax, optim, save
from torch import no_grad
from torch import cuda
from transformers import BertModel
from loss import calc_loss

import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay


class BERTModule(nn.Module):
    def __init__(self, epochs=10, learning_rate=1e-05, dropout_p=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Uncomment to freeze the BERT backbone and train only the classifier head:
        # for param in self.bert.parameters():
        #     param.requires_grad = False
        self.dropout = nn.Dropout(p=dropout_p)
        # 768 is the hidden size of bert-base; 3 outputs, one per difficulty class.
        self.fc = nn.Linear(768, 3)

        self.epochs = epochs
        self.learning_rate = learning_rate

    def forward(self, ids, masks, ttis):
        _, pooled_output = self.bert(ids, attention_mask=masks, token_type_ids=ttis, return_dict=False)
        output = self.fc(self.dropout(pooled_output))

        return output

    def fit(self, train_loader, test_loader):
        self.device = 'cuda' if cuda.is_available() else 'cpu'
        self.to(self.device)

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(params=self.parameters(), lr=self.learning_rate)

        print('Begin training...')

        train_losses = []
        test_losses = []

        for epoch in range(self.epochs):
            # calc_loss switches to eval mode, so re-enter train mode every epoch.
            self.train()
            train_loss = 0.

            for inputs, labels in train_loader:
                optimizer.zero_grad()

                # Each batch of inputs is a (batch, 3, max_len) tensor: ids, mask, token type ids.
                ids = inputs[:, 0].to(self.device, dtype=long)
                masks = inputs[:, 1].to(self.device, dtype=long)
                tti = inputs[:, 2].to(self.device, dtype=long)
                labels = labels.squeeze().to(self.device, dtype=long)

                assert ids.shape == masks.shape == tti.shape, 'ids, masks and token type ids must share a shape'
                assert ids.shape[0] == labels.shape[0], 'inputs and labels are incompatible'

                outputs = self(ids, masks, tti)

                loss = criterion(outputs, labels)

                loss.backward()
                optimizer.step()

                train_loss += loss.item()

            avg_train_loss = train_loss / len(train_loader)
            avg_test_loss = calc_loss(self, test_loader, criterion, self.device)

            train_losses.append(avg_train_loss)
            test_losses.append(avg_test_loss)

            print(f'Epoch {epoch + 1}/{self.epochs} Train Loss: {avg_train_loss} Test Loss: {avg_test_loss}')

        print('Ending training...')

        model_name = f'model_ep{self.epochs}_lr{self.learning_rate}.pth'
        save(self.state_dict(), model_name)

        return train_losses, test_losses

    def evaluate(self, dataloader):
        # Assumes fit() has already been called, so self.device is set.
        self.eval()

        data_labels = []
        data_outputs = []

        with no_grad():
            for inputs, labels in dataloader:
                ids = inputs[:, 0].to(self.device, dtype=long)
                masks = inputs[:, 1].to(self.device, dtype=long)
                tti = inputs[:, 2].to(self.device, dtype=long)
                labels = labels.squeeze().to(self.device, dtype=long)

                assert ids.shape == masks.shape == tti.shape, 'ids, masks and token type ids must share a shape'
                assert ids.shape[0] == labels.shape[0], 'inputs and labels are incompatible'

                outputs = self(ids, masks, tti)
                outputs = nn.functional.softmax(outputs, dim=1)
                outputs = argmax(outputs, dim=1)

                data_labels.extend(labels.cpu().numpy().tolist())
                data_outputs.extend(outputs.cpu().numpy().tolist())

        target_names = ['Easy', 'Medium', 'Hard']
        macro_f1 = f1_score(data_labels, data_outputs, average='macro')
        cm = confusion_matrix(data_labels, data_outputs)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
        print(f'Macro F1: {macro_f1}')
        disp.plot()
        plt.show()


def predict():
    # Placeholder: prediction on new problems is not implemented in this commit.
    return 1
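
Putting the pieces together, a hedged end-to-end sketch of how these modules appear meant to be wired, run from the workspace/ directory; the batch size, max_len, and split ratio are assumed values, not taken from the commit:

from Datasets.load import load_data
from Datasets.encoders import define_encoders
from Datasets.dataset import ProblemDataset
from Datasets.split import split_data
from Datasets.dataloader import create_dataloader
from Datasets.seed import define_seed
from Models.model import BERTModule
from show_loss import show_loss_evolution

define_seed(42)

data = load_data('data/leetcode.csv')
inputs_encoder, labels_encoder = define_encoders(max_len=512)
dataset = ProblemDataset(data, inputs_encoder, labels_encoder)

train_len = int(0.8 * len(dataset))
train_set, test_set = split_data(dataset, [train_len, len(dataset) - train_len])

train_loader = create_dataloader(train_set, batch_size=16, split='train')
test_loader = create_dataloader(test_set, batch_size=16, split='test')

model = BERTModule(epochs=10, learning_rate=1e-05)
train_losses, test_losses = model.fit(train_loader, test_loader)

show_loss_evolution(model.epochs, train_losses, test_losses)
model.evaluate(test_loader)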

workspace/loss.py

Lines changed: 22 additions & 0 deletions
import torch
from torch import long


def calc_loss(model, dataloader, criterion, device):
    # Switch to eval mode so dropout does not perturb the test loss,
    # then restore the previous mode before returning.
    was_training = model.training
    model.eval()

    with torch.no_grad():
        total_loss = 0.0
        for inputs, labels in dataloader:
            ids = inputs[:, 0].to(device, dtype=long)
            masks = inputs[:, 1].to(device, dtype=long)
            tti = inputs[:, 2].to(device, dtype=long)
            labels = labels.squeeze().to(device, dtype=long)

            outputs = model(ids, masks, tti)

            loss = criterion(outputs, labels)

            total_loss += loss.item()

    if was_training:
        model.train()

    average_loss = total_loss / len(dataloader)

    return average_loss
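
calc_loss can also be called on its own after training, e.g. to recompute the final test loss; continuing the end-to-end sketch above (criterion and device mirror what fit() sets up):

import torch
from torch import nn
from loss import calc_loss

criterion = nn.CrossEntropyLoss()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
final_test_loss = calc_loss(model, test_loader, criterion, device)
print(f'Final test loss: {final_test_loss}')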

workspace/show_loss.py

Lines changed: 11 additions & 0 deletions
import matplotlib.pyplot as plt


def show_loss_evolution(num_epochs, train_losses, test_losses):
    plt.plot(range(1, num_epochs + 1), train_losses, marker='o', linestyle='-', color='b', label='Train')
    plt.plot(range(1, num_epochs + 1), test_losses, marker='x', linestyle='-', color='g', label='Test')

    plt.title('Learning Curve')
    plt.xlabel('Epochs')
    plt.ylabel('Average Loss')
    plt.legend()
    plt.show()
