seq to seq 예제 1
import torch
import torch.nn as nn
# Encode both words as lists of character code points.
x = [ord(ch) for ch in 'hello']
y = [ord(ch) for ch in 'hola']
print(x)
# Expected output: [104, 101, 108, 108, 111]
print(y)
# Expected output: [104, 111, 108, 97]

# Every code point above is below 256, so 256 ids are enough for this example.
vocab_size = 256
x_data = torch.tensor(x, dtype=torch.long)
y_data = torch.tensor(y, dtype=torch.long)
class Seq2SeqNet(nn.Module):
    """Single-sequence GRU encoder-decoder trained with teacher forcing.

    One embedding table is shared by the encoder and the decoder. The
    decoder is started with token id 0 and, at every following step, is fed
    the ground-truth target token of the previous step.
    """

    def __init__(self, vocab_size, hidden_size):
        """Create the layers.

        Args:
            vocab_size: Number of distinct token ids (embedding rows and
                output logits).
            hidden_size: Width of the embeddings and of both GRU states.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.encoder = nn.GRU(hidden_size, hidden_size)
        self.decoder = nn.GRU(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def init_state(self, batch_size=1):
        """Return an all-zero GRU state of shape (1, batch_size, hidden_size)."""
        # Allocate on the model's own device so the module also works after .to(...).
        return torch.zeros(1, batch_size, self.hidden_size,
                           device=self.embedding.weight.device)

    def forward(self, inputs, targets):
        """Encode ``inputs`` and decode one logit vector per target token.

        Args:
            inputs: 1-D LongTensor of source token ids.
            targets: 1-D LongTensor of target token ids (used for teacher
                forcing and to decide how many steps to decode).

        Returns:
            Float tensor of shape (len(targets), vocab_size) holding the
            unnormalised scores for every decoding step.
        """
        state = self.init_state()
        # (src_len,) -> (src_len, 1, hidden): sequence-first layout, batch of one.
        embedded = self.embedding(inputs).unsqueeze(dim=1)
        _, state = self.encoder(embedded, state)

        # Token id 0 serves as the start-of-sequence symbol.
        decoder_input = torch.zeros(1, dtype=torch.long, device=inputs.device)
        outputs = []
        for i in range(targets.size(0)):
            step_input = self.embedding(decoder_input).unsqueeze(dim=1)
            step_output, state = self.decoder(step_input, state)
            # (1, 1, vocab) -> (vocab,). Flattening each step explicitly keeps
            # the time axis intact even for a single-token target, where a
            # bare squeeze() on the stacked result would drop it.
            outputs.append(self.fc(step_output).view(-1))
            # Teacher forcing: the next input is the true current token.
            decoder_input = targets[i].view(1)

        if not outputs:
            return torch.empty(0, self.fc.out_features, device=inputs.device)
        return torch.stack(outputs)
# Train the character-level model to map 'hello' onto 'hola'.
seq2seq = Seq2SeqNet(vocab_size, 16)
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(seq2seq.parameters(), lr=1e-3)

for epoch in range(1000):
    hypothesis = seq2seq(x_data, y_data)
    loss = loss_func(hypothesis, y_data)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the loss and the current greedy prediction every 50 epochs.
    if epoch % 50 == 0:
        print(f'epoch{epoch+1}, loss:{loss.item():.4f}')
        top = hypothesis.data.topk(k=1, dim=1)[1]
        print(list(map(chr, top.squeeze().numpy().tolist())))
'''
epoch1, loss:5.6597
['\x18', '\x18', '/', 'ñ']
epoch51, loss:4.0978
['h', 'o', 'l', 'l']
epoch101, loss:1.9054
['h', 'o', 'l', 'l']
epoch151, loss:0.9075
['h', 'o', 'l', 'a']
epoch201, loss:0.5561
['h', 'o', 'l', 'a']
epoch251, loss:0.3881
['h', 'o', 'l', 'a']
epoch301, loss:0.2807
['h', 'o', 'l', 'a']
epoch351, loss:0.2125
['h', 'o', 'l', 'a']
epoch401, loss:0.1683
['h', 'o', 'l', 'a']
epoch451, loss:0.1379
['h', 'o', 'l', 'a']
epoch501, loss:0.1158
['h', 'o', 'l', 'a']
epoch551, loss:0.0992
['h', 'o', 'l', 'a']
epoch601, loss:0.0861
['h', 'o', 'l', 'a']
epoch651, loss:0.0757
['h', 'o', 'l', 'a']
epoch701, loss:0.0672
['h', 'o', 'l', 'a']
epoch751, loss:0.0602
['h', 'o', 'l', 'a']
epoch801, loss:0.0542
['h', 'o', 'l', 'a']
epoch851, loss:0.0492
['h', 'o', 'l', 'a']
epoch901, loss:0.0448
['h', 'o', 'l', 'a']
epoch951, loss:0.0410
['h', 'o', 'l', 'a']
'''
seq to seq 예제 2
import re
import unicodedata
import numpy as np
from collections import Counter
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
# Number of sentence pairs to read from the corpus file before stopping.
num_samples = 33000
def unicode_to_ascii(s):
    """Return *s* with combining marks removed.

    The text is NFD-normalised, then every character whose Unicode category
    is ``Mn`` (non-spacing mark) is dropped. Characters that do not
    decompose are left unchanged, so the result is not guaranteed to be
    pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
def preprocess_sentence(sent):
    """Normalise one raw sentence for whitespace tokenisation.

    Steps, in order: lower-case, strip accents via ``unicode_to_ascii``,
    put a space in front of ``? . ! , ¿``, replace every run of characters
    other than ASCII letters and ``! . ?`` with one space, and collapse
    whitespace runs. Leading/trailing spaces are not stripped.
    """
    substitutions = (
        (r"([?.!,¿])", r" \1"),
        (r"[^a-zA-Z!.?]+", r" "),
        (r"\s+", " "),
    )
    text = unicode_to_ascii(sent.lower())
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
def load_preprocessed_data(path="data/fra.txt", max_samples=None):
    """Read tab-separated sentence pairs and tokenise them.

    Each usable line must contain at least two tab-separated columns: the
    source sentence and the target sentence. Any further columns are
    ignored, and lines with fewer than two columns (e.g. blank lines) are
    skipped instead of aborting the whole load.

    Args:
        path: Corpus file to read (UTF-8).
        max_samples: Maximum number of pairs to return; must be positive.
            ``None`` falls back to the module-level ``num_samples``.

    Returns:
        A tuple ``(encoder_input, decoder_input, decoder_target)`` of three
        equally long lists of token lists. ``decoder_input`` entries start
        with ``'<sos>'`` and ``decoder_target`` entries end with ``'<eos>'``.
    """
    limit = num_samples if max_samples is None else max_samples
    encoder_input, decoder_input, decoder_target = [], [], []
    with open(path, "r", encoding='UTF-8') as lines:
        for line in lines:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                # Not a sentence pair; nothing to learn from this line.
                continue
            src_tokens = preprocess_sentence(fields[0]).split()
            tar_tokens = preprocess_sentence(fields[1]).split()

            encoder_input.append(src_tokens)
            decoder_input.append(['<sos>'] + tar_tokens)
            decoder_target.append(tar_tokens + ['<eos>'])

            if len(encoder_input) >= limit:
                break
    return encoder_input, decoder_input, decoder_target
# Show the effect of the preprocessing on one English and one French sentence.
en_sent = "Have you had dinner?"
fr_sent = "Avez-vous déjà diné?"
en_clean = preprocess_sentence(en_sent)
fr_clean = preprocess_sentence(fr_sent)
print('전처리 전 영어 문장 :', en_sent)
print('전처리 후 영어 문장 :', en_clean)
print('전처리 전 프랑스어 문장 :', fr_sent)
print('전처리 후 프랑스어 문장 :', fr_clean)
print()

# Load the tokenised corpus and preview the first five entries of each list.
sents_en_in, sents_fra_in, sents_fra_out = load_preprocessed_data()
for label, sents in (
    ('인코더의 입력 :', sents_en_in),
    ('디코더의 입력 :', sents_fra_in),
    ('디코더의 레이블 :', sents_fra_out),
):
    print(label, sents[:5])
def build_vocab(sents):
    """Build a word -> integer id mapping from tokenised sentences.

    Ids 0 and 1 are reserved for ``'<PAD>'`` and ``'<UNK>'``. Corpus words
    receive ids from 2 upwards in order of decreasing frequency; words with
    equal counts keep their order of first appearance.

    Args:
        sents: Iterable of token lists.

    Returns:
        Dict mapping each token to its id.
    """
    counts = Counter(word for sent in sents for word in sent)
    word_to_index = {'<PAD>': 0, '<UNK>': 1}
    for rank, (word, _) in enumerate(counts.most_common(), start=2):
        word_to_index[word] = rank
    return word_to_index
# Source vocabulary from the English side; target vocabulary from both decoder
# lists so that it contains '<sos>' as well as '<eos>'.
src_vocab = build_vocab(sents_en_in)
tar_vocab = build_vocab(sents_fra_in + sents_fra_out)
src_vocab_size, tar_vocab_size = len(src_vocab), len(tar_vocab)
print("영어 단어 집합의 크기 : {:d}, 프랑스어 단어 집합의 크기 : {:d}".format(src_vocab_size, tar_vocab_size))

# Reverse lookups: integer id -> token.
index_to_src = dict(zip(src_vocab.values(), src_vocab.keys()))
index_to_tar = dict(zip(tar_vocab.values(), tar_vocab.keys()))
def texts_to_sequences(sents, word_to_index):
    """Convert token lists into lists of integer ids.

    Tokens missing from ``word_to_index`` are replaced by the id stored
    under ``'<UNK>'``. Progress is displayed with ``tqdm``.

    Args:
        sents: Iterable of token lists.
        word_to_index: Mapping from token to id (presumably a plain dict
            produced by ``build_vocab`` - verify against callers).

    Returns:
        List with one list of ids per input sentence.
    """
    encoded = []
    for sent in tqdm(sents):
        encoded.append([
            word_to_index[word] if word in word_to_index else word_to_index['<UNK>']
            for word in sent
        ])
    return encoded
# Integer-encode all three sentence lists with their respective vocabularies.
encoder_input = texts_to_sequences(sents_en_in, src_vocab)
decoder_input = texts_to_sequences(sents_fra_in, tar_vocab)
decoder_target = texts_to_sequences(sents_fra_out, tar_vocab)

# Preview the first five source sentences before and after encoding.
for i, (item1, item2) in enumerate(zip(sents_en_in[:5], encoder_input[:5])):
    print(f"Index: {i}, 정수 인코딩 전: {item1}, 정수 인코딩 후: {item2}")
def pad_sequences(sentences, max_len=None):
    """Right-pad integer sequences with zeros into one 2-D array.

    Args:
        sentences: Sequence of integer lists.
        max_len: Width of the result. ``None`` uses the length of the
            longest sentence (0 when ``sentences`` is empty). Sentences
            longer than ``max_len`` are truncated on the right.

    Returns:
        Integer ``numpy.ndarray`` of shape ``(len(sentences), max_len)``.
    """
    if max_len is None:
        # default=0 keeps an empty corpus from raising in max().
        max_len = max((len(sentence) for sentence in sentences), default=0)
    features = np.zeros((len(sentences), max_len), dtype=int)
    for index, sentence in enumerate(sentences):
        # Clip first so an explicit max_len shorter than the sentence does
        # not trigger a shape-mismatch error on assignment.
        clipped = sentence[:max_len]
        if len(clipped) != 0:
            features[index, :len(clipped)] = np.array(clipped)
    return features
# Pad each list to the length of its own longest sequence.
encoder_input, decoder_input, decoder_target = map(
    pad_sequences, (encoder_input, decoder_input, decoder_target)
)
print('인코더의 입력의 크기(shape) :', encoder_input.shape)
print('디코더의 입력의 크기(shape) :', decoder_input.shape)
print('디코더의 레이블의 크기(shape) :', decoder_target.shape)
# Shuffle all three arrays with the same permutation so that the validation
# set is a random sample rather than simply the tail of the corpus file.
indices = np.arange(encoder_input.shape[0])
np.random.shuffle(indices)
print('랜덤 시퀀스 :',indices)
encoder_input = encoder_input[indices]
decoder_input = decoder_input[indices]
decoder_target = decoder_target[indices]

# Hold out 10% of the samples that were actually loaded.
n_of_val = int(encoder_input.shape[0]*0.1)
print('검증 데이터의 개수 :',n_of_val)

# Split at an explicit index: a negative-slice split would yield an empty
# training set when n_of_val is 0.
split_at = encoder_input.shape[0] - n_of_val
encoder_input_train = encoder_input[:split_at]
decoder_input_train = decoder_input[:split_at]
decoder_target_train = decoder_target[:split_at]
encoder_input_test = encoder_input[split_at:]
decoder_input_test = decoder_input[split_at:]
decoder_target_test = decoder_target[split_at:]

print('훈련 source 데이터의 크기 :',encoder_input_train.shape)
print('훈련 target 데이터의 크기 :',decoder_input_train.shape)
print('훈련 target 레이블의 크기 :',decoder_target_train.shape)
print('테스트 source 데이터의 크기 :',encoder_input_test.shape)
print('테스트 target 데이터의 크기 :',decoder_input_test.shape)
print('테스트 target 레이블의 크기 :',decoder_target_test.shape)
import torch
import torch.nn as nn
import torch.optim as optim
# Width of the word-embedding vectors handed to the Encoder and Decoder.
embedding_dim = 256
# Hidden-state size of the LSTM layers in the Encoder and Decoder.
hidden_units = 256
class Encoder(nn.Module):
    """Embed source token ids and summarise them with a batch-first LSTM."""

    def __init__(self, src_vocab_size, embedding_dim, hidden_units):
        """Build the embedding table (id 0 is the padding row) and the LSTM."""
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=src_vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_units,
            batch_first=True,
        )

    def forward(self, x):
        """Return the LSTM's final ``(hidden, cell)`` states for ``x``.

        ``x`` is assumed to be a (batch, seq_len) tensor of token ids, as
        implied by ``batch_first=True`` - confirm against callers. The
        per-step outputs are discarded.
        """
        embedded = self.embedding(x)
        _, final_state = self.lstm(embedded)
        hidden, cell = final_state
        return hidden, cell
class Decoder(nn.Module):
    """Embed target token ids, run a batch-first LSTM and project to vocabulary logits."""

    def __init__(self, tar_vocab_size, embedding_dim, hidden_units):
        """Build the embedding table (id 0 is the padding row), the LSTM and the output layer."""
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=tar_vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_units,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_units, out_features=tar_vocab_size)

    def forward(self, x, hidden, cell):
        """Decode ``x`` starting from the given LSTM state.

        Returns:
            ``(output, hidden, cell)`` where ``output`` holds one
            ``tar_vocab_size``-wide score vector per input position and
            ``hidden``/``cell`` are the updated LSTM states.
        """
        embedded = self.embedding(x)
        lstm_out, next_state = self.lstm(embedded, (hidden, cell))
        hidden, cell = next_state
        logits = self.fc(lstm_out)
        return logits, hidden, cell
class Seq2Seq(nn.Module):
    """Chain an encoder and a decoder into one teacher-forced model."""

    def __init__(self, encoder, decoder):
        """Store the two sub-networks under ``encoder`` and ``decoder``."""
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg):
        """Encode ``src``, then decode ``trg`` from the encoder's final state.

        Only the decoder's first return value (its output scores) is
        returned; the decoder's updated states are dropped.
        """
        enc_hidden, enc_cell = self.encoder(src)
        decoded = self.decoder(trg, enc_hidden, enc_cell)
        return decoded[0]
# Assemble the translation model from its two halves.
encoder = Encoder(
    src_vocab_size=src_vocab_size,
    embedding_dim=embedding_dim,
    hidden_units=hidden_units,
)
decoder = Decoder(
    tar_vocab_size=tar_vocab_size,
    embedding_dim=embedding_dim,
    hidden_units=hidden_units,
)
model = Seq2Seq(encoder=encoder, decoder=decoder)

# Target id 0 is padding and must not contribute to the loss.
loss_function = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(params=model.parameters())
print(model)
def evaluation(model, dataloader, loss_function, device):
    """Compute mean batch loss and token accuracy without updating the model.

    The model is switched to eval mode and left in that mode. Target
    positions equal to 0 (padding) are excluded from the accuracy.

    Args:
        model: Callable as ``model(encoder_inputs, decoder_inputs)``.
        dataloader: Iterable of ``(encoder_inputs, decoder_inputs,
            decoder_targets)`` tensor triples; it does not need ``len()``.
        loss_function: Loss taking flattened scores and flattened targets.
        device: Device the batches are moved to.

    Returns:
        ``(mean_loss, accuracy)``. The loss is the plain average of the
        per-batch losses. Either value is ``nan`` when it is undefined (no
        batches at all, or no non-padding target tokens).
    """
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_count = 0
    num_batches = 0
    with torch.no_grad():
        for encoder_inputs, decoder_inputs, decoder_targets in dataloader:
            encoder_inputs = encoder_inputs.to(device)
            decoder_inputs = decoder_inputs.to(device)
            decoder_targets = decoder_targets.to(device)

            outputs = model(encoder_inputs, decoder_inputs)
            # Flatten every position into one row for the loss.
            loss = loss_function(outputs.reshape(-1, outputs.size(-1)),
                                 decoder_targets.reshape(-1))
            total_loss += loss.item()
            num_batches += 1

            mask = decoder_targets != 0
            predictions = outputs.argmax(dim=-1)
            total_correct += ((predictions == decoder_targets) & mask).sum().item()
            total_count += mask.sum().item()

    # nan rather than 0.0: an undefined loss must never look like a perfect
    # score to a caller that keeps the checkpoint with the lowest loss.
    mean_loss = total_loss / num_batches if num_batches else float('nan')
    accuracy = total_correct / total_count if total_count else float('nan')
    return mean_loss, accuracy
# Wrap the NumPy splits as int64 tensors.
(encoder_input_train_tensor,
 decoder_input_train_tensor,
 decoder_target_train_tensor) = (
    torch.tensor(array, dtype=torch.long)
    for array in (encoder_input_train, decoder_input_train, decoder_target_train)
)
(encoder_input_test_tensor,
 decoder_input_test_tensor,
 decoder_target_test_tensor) = (
    torch.tensor(array, dtype=torch.long)
    for array in (encoder_input_test, decoder_input_test, decoder_target_test)
)

# Only the training loader reshuffles its samples on every pass.
batch_size = 128
train_dataset = TensorDataset(
    encoder_input_train_tensor,
    decoder_input_train_tensor,
    decoder_target_train_tensor,
)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataset = TensorDataset(
    encoder_input_test_tensor,
    decoder_input_test_tensor,
    decoder_target_test_tensor,
)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

num_epochs = 100
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Train for a fixed number of epochs, saving the weights whenever the
# validation loss reaches a new minimum.
best_val_loss = float('inf')
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        encoder_inputs, decoder_inputs, decoder_targets = (
            tensor.to(device) for tensor in batch
        )

        optimizer.zero_grad()
        outputs = model(encoder_inputs, decoder_inputs)
        # Flatten all positions into rows so the loss sees (N, vocab) vs (N,).
        flat_scores = outputs.view(-1, outputs.size(-1))
        flat_targets = decoder_targets.view(-1)
        loss = loss_function(flat_scores, flat_targets)
        loss.backward()
        optimizer.step()

    # Re-score both splits in eval mode at the end of every epoch.
    train_loss, train_acc = evaluation(model, train_dataloader, loss_function, device)
    valid_loss, valid_acc = evaluation(model, valid_dataloader, loss_function, device)
    print(f'Epoch: {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}'
          f' | Valid Loss: {valid_loss:.4f} | Valid Acc: {valid_acc:.4f}')

    if valid_loss < best_val_loss:
        print(f'Validation loss improved from {best_val_loss:.4f} to {valid_loss:.4f}. 체크포인트를 저장합니다.')
        best_val_loss = valid_loss
        torch.save(model.state_dict(), 'best_model_checkpoint.pth')
# Restore the best checkpoint. weights_only=True limits unpickling to plain
# tensors/containers (and silences torch.load's FutureWarning about the unsafe
# default); map_location lets a GPU-saved checkpoint load on a CPU-only host.
best_state = torch.load('best_model_checkpoint.pth', map_location=device, weights_only=True)
model.load_state_dict(best_state)
model.to(device)

val_loss, val_accuracy = evaluation(model, valid_dataloader, loss_function, device)
print(f'Best model validation loss: {val_loss:.4f}')
print(f'Best model validation accuracy: {val_accuracy:.4f}')

# Show the ids assigned to the start and end markers of the target vocabulary.
print(tar_vocab['<sos>'])
print(tar_vocab['<eos>'])

# Reverse lookups (integer id -> token) used when turning ids back into text.
index_to_src = {v: k for k, v in src_vocab.items()}
index_to_tar = {v: k for k, v in tar_vocab.items()}
def seq_to_src(input_seq):
    """Turn source ids back into text, skipping padding (id 0).

    Every kept word is followed by one space, so a non-empty result ends
    with a trailing space. Words are looked up in the module-level
    ``index_to_src`` table.
    """
    pieces = [index_to_src[token_id] + ' ' for token_id in input_seq if token_id != 0]
    return ''.join(pieces)
def seq_to_tar(input_seq):
    """Turn target ids back into text, skipping padding, ``<sos>`` and ``<eos>``.

    Every kept word is followed by one space, so a non-empty result ends
    with a trailing space. Uses the module-level ``tar_vocab`` and
    ``index_to_tar`` tables.
    """
    pieces = [
        index_to_tar[token_id] + ' '
        for token_id in input_seq
        if not (
            token_id == 0
            or token_id == tar_vocab['<sos>']
            or token_id == tar_vocab['<eos>']
        )
    ]
    return ''.join(pieces)
# Inspect one held-out sample: encoder input, decoder input, decoder target.
for split_array in (encoder_input_test, decoder_input_test, decoder_target_test):
    print(split_array[25])
def decode_sequence(input_seq, model, src_vocab_size, tar_vocab_size, max_output_len,
                    int_to_src_token, int_to_tar_token):
    """Greedily translate one integer-encoded source sentence.

    The decoder is started with the ``'<sos>'`` id and fed its own most
    likely token at each step until it emits ``'<eos>'`` or
    ``max_output_len`` tokens have been produced. Both marker ids are
    looked up in ``int_to_tar_token`` instead of being hard-coded, so the
    function stays correct when the vocabulary ordering changes.

    Args:
        input_seq: 1-D sequence of source token ids.
        model: Object exposing ``encoder(src)`` and
            ``decoder(token, hidden, cell)`` as the ``Seq2Seq`` model does.
        src_vocab_size: Unused; kept for interface compatibility.
        tar_vocab_size: Unused; kept for interface compatibility.
        max_output_len: Upper bound on the number of generated tokens.
        int_to_src_token: Unused; kept for interface compatibility.
        int_to_tar_token: Dict mapping target ids to tokens; must contain
            the ``'<sos>'`` and ``'<eos>'`` tokens.

    Returns:
        The generated tokens joined by single spaces (``''`` if none).

    Raises:
        KeyError: If ``int_to_tar_token`` lacks ``'<sos>'`` or ``'<eos>'``.
    """
    tar_token_to_int = {token: index for index, token in int_to_tar_token.items()}
    sos_index = tar_token_to_int['<sos>']
    eos_index = tar_token_to_int['<eos>']

    # Run on whatever device the model's weights live on (CPU if it has none).
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    decoded_tokens = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        encoder_inputs = torch.tensor(input_seq, dtype=torch.long).unsqueeze(0).to(run_device)
        hidden, cell = model.encoder(encoder_inputs)
        decoder_input = torch.tensor([[sos_index]], dtype=torch.long, device=run_device)
        for _ in range(max_output_len):
            output, hidden, cell = model.decoder(decoder_input, hidden, cell)
            output_token = output.argmax(dim=-1).item()
            if output_token == eos_index:
                break
            decoded_tokens.append(output_token)
            # Feed the prediction back in as the next decoder input.
            decoder_input = torch.tensor([[output_token]], dtype=torch.long, device=run_device)
    return ' '.join(int_to_tar_token[token] for token in decoded_tokens)
# Translate a few training sentences and print them next to their references.
for seq_index in (3, 50, 100, 300, 1001):
    input_seq = encoder_input_train[seq_index]
    translated_text = decode_sequence(
        input_seq, model, src_vocab_size, tar_vocab_size, 20,
        index_to_src, index_to_tar,
    )
    source_text = seq_to_src(encoder_input_train[seq_index])
    reference_text = seq_to_tar(decoder_input_train[seq_index])

    print("입력문장 :", source_text)
    print("정답문장 :", reference_text)
    print("번역문장 :", translated_text)
    print("-" * 50)
전처리 전 영어 문장 : Have you had dinner?
전처리 후 영어 문장 : have you had dinner ?
전처리 전 프랑스어 문장 : Avez-vous déjà diné?
전처리 후 프랑스어 문장 : avez vous deja dine ?
인코더의 입력 : [['go', '.'], ['go', '.'], ['go', '.'], ['go', '.'], ['hi', '.']]
디코더의 입력 : [['<sos>', 'va', '!'], ['<sos>', 'marche', '.'], ['<sos>', 'en', 'route', '!'], ['<sos>', 'bouge', '!'], ['<sos>', 'salut', '!']]
디코더의 레이블 : [['va', '!', '<eos>'], ['marche', '.', '<eos>'], ['en', 'route', '!', '<eos>'], ['bouge', '!', '<eos>'], ['salut', '!', '<eos>']]
영어 단어 집합의 크기 : 4486, 프랑스어 단어 집합의 크기 : 7879
100%|██████████| 33000/33000 [00:00<00:00, 1377330.08it/s]
100%|██████████| 33000/33000 [00:00<00:00, 430012.62it/s]
100%|██████████| 33000/33000 [00:00<00:00, 1143485.28it/s]
Index: 0, 정수 인코딩 전: ['go', '.'], 정수 인코딩 후: [27, 2]
Index: 1, 정수 인코딩 전: ['go', '.'], 정수 인코딩 후: [27, 2]
Index: 2, 정수 인코딩 전: ['go', '.'], 정수 인코딩 후: [27, 2]
Index: 3, 정수 인코딩 전: ['go', '.'], 정수 인코딩 후: [27, 2]
Index: 4, 정수 인코딩 전: ['hi', '.'], 정수 인코딩 후: [736, 2]
인코더의 입력의 크기(shape) : (33000, 7)
디코더의 입력의 크기(shape) : (33000, 16)
디코더의 레이블의 크기(shape) : (33000, 16)
랜덤 시퀀스 : [ 1070 28491 7463 ... 18993 12704 11595]
검증 데이터의 개수 : 3300
훈련 source 데이터의 크기 : (29700, 7)
훈련 target 데이터의 크기 : (29700, 16)
훈련 target 레이블의 크기 : (29700, 16)
테스트 source 데이터의 크기 : (3300, 7)
테스트 target 데이터의 크기 : (3300, 16)
테스트 target 레이블의 크기 : (3300, 16)
Seq2Seq(
(encoder): Encoder(
(embedding): Embedding(4486, 256, padding_idx=0)
(lstm): LSTM(256, 256, batch_first=True)
)
(decoder): Decoder(
(embedding): Embedding(7879, 256, padding_idx=0)
(lstm): LSTM(256, 256, batch_first=True)
(fc): Linear(in_features=256, out_features=7879, bias=True)
)
)
Epoch: 1/100 | Train Loss: 2.9177 | Train Acc: 0.5298 | Valid Loss: 3.2859 | Valid Acc: 0.5009
Validation loss improved from inf to 3.2859. 체크포인트를 저장합니다.
Epoch: 2/100 | Train Loss: 2.2537 | Train Acc: 0.6068 | Valid Loss: 2.7700 | Valid Acc: 0.5642
Validation loss improved from 3.2859 to 2.7700. 체크포인트를 저장합니다.
Epoch: 3/100 | Train Loss: 1.8379 | Train Acc: 0.6486 | Valid Loss: 2.4960 | Valid Acc: 0.5951
Validation loss improved from 2.7700 to 2.4960. 체크포인트를 저장합니다.
Epoch: 4/100 | Train Loss: 1.5281 | Train Acc: 0.6864 | Valid Loss: 2.3431 | Valid Acc: 0.6108
Validation loss improved from 2.4960 to 2.3431. 체크포인트를 저장합니다.
Epoch: 5/100 | Train Loss: 1.2744 | Train Acc: 0.7247 | Valid Loss: 2.2489 | Valid Acc: 0.6217
Validation loss improved from 2.3431 to 2.2489. 체크포인트를 저장합니다.
Epoch: 6/100 | Train Loss: 1.0668 | Train Acc: 0.7609 | Valid Loss: 2.1640 | Valid Acc: 0.6348
Validation loss improved from 2.2489 to 2.1640. 체크포인트를 저장합니다.
Epoch: 7/100 | Train Loss: 0.8854 | Train Acc: 0.7957 | Valid Loss: 2.1130 | Valid Acc: 0.6413
Validation loss improved from 2.1640 to 2.1130. 체크포인트를 저장합니다.
Epoch: 8/100 | Train Loss: 0.7395 | Train Acc: 0.8275 | Valid Loss: 2.0769 | Valid Acc: 0.6460
Validation loss improved from 2.1130 to 2.0769. 체크포인트를 저장합니다.
Epoch: 9/100 | Train Loss: 0.6192 | Train Acc: 0.8524 | Valid Loss: 2.0538 | Valid Acc: 0.6553
Validation loss improved from 2.0769 to 2.0538. 체크포인트를 저장합니다.
Epoch: 10/100 | Train Loss: 0.5284 | Train Acc: 0.8725 | Valid Loss: 2.0402 | Valid Acc: 0.6553
Validation loss improved from 2.0538 to 2.0402. 체크포인트를 저장합니다.
Epoch: 11/100 | Train Loss: 0.4454 | Train Acc: 0.8878 | Valid Loss: 2.0429 | Valid Acc: 0.6589
Epoch: 12/100 | Train Loss: 0.3900 | Train Acc: 0.8958 | Valid Loss: 2.0511 | Valid Acc: 0.6612
Epoch: 13/100 | Train Loss: 0.3476 | Train Acc: 0.9039 | Valid Loss: 2.0516 | Valid Acc: 0.6638
Epoch: 14/100 | Train Loss: 0.3068 | Train Acc: 0.9102 | Valid Loss: 2.0711 | Valid Acc: 0.6632
...
...
...
Epoch: 93/100 | Train Loss: 0.1320 | Train Acc: 0.9288 | Valid Loss: 2.6425 | Valid Acc: 0.6622
Epoch: 94/100 | Train Loss: 0.1313 | Train Acc: 0.9290 | Valid Loss: 2.6218 | Valid Acc: 0.6640
Epoch: 95/100 | Train Loss: 0.1317 | Train Acc: 0.9288 | Valid Loss: 2.6541 | Valid Acc: 0.6624
Epoch: 96/100 | Train Loss: 0.1314 | Train Acc: 0.9287 | Valid Loss: 2.6714 | Valid Acc: 0.6630
Epoch: 97/100 | Train Loss: 0.1317 | Train Acc: 0.9287 | Valid Loss: 2.6623 | Valid Acc: 0.6634
Epoch: 98/100 | Train Loss: 0.1312 | Train Acc: 0.9284 | Valid Loss: 2.6688 | Valid Acc: 0.6616
Epoch: 99/100 | Train Loss: 0.1315 | Train Acc: 0.9282 | Valid Loss: 2.6900 | Valid Acc: 0.6624
C:\Users\hi\PycharmProjects\NL_deepPart\day4_code\Seq2SeqEx2.py:335: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
model.load_state_dict(torch.load('best_model_checkpoint.pth'))
Epoch: 100/100 | Train Loss: 0.1313 | Train Acc: 0.9286 | Valid Loss: 2.6774 | Valid Acc: 0.6611
Best model validation loss: 2.0402
Best model validation accuracy: 0.6553
3
4
[ 25 11 48 105 2 0 0]
[ 3 46 42 61 521 2 0 0 0 0 0 0 0 0 0 0]
[ 46 42 61 521 2 4 0 0 0 0 0 0 0 0 0 0]
입력문장 : go .
정답문장 : bouge !
번역문장 : va en route !
--------------------------------------------------
입력문장 : hello !
정답문장 : bonjour !
번역문장 : bonjour !
--------------------------------------------------
입력문장 : got it !
정답문장 : j ai pige !
번역문장 : ca a l air !
--------------------------------------------------
입력문장 : go home .
정답문장 : rentre a la maison .
번역문장 : rentrez a la maison .
--------------------------------------------------
입력문장 : forget me .
정답문장 : oublie moi .
번역문장 : oubliez moi .
--------------------------------------------------