Paper : https://proceedings.neurips.cc/paper/2014/file/a14ac55a4f27472c5d894ec1c3c743d2-Paper.pdf
Background
A surprising example of the power of DNNs is their ability to sort N N-bit numbers using only 2 hidden layers of quadratic size [27].
=> Why is this so impressive?
Goal
Challenge
Approach
long range temporal dependencies
Contribution
makes it possible to establish communication between source and target words (=> presumably the closer corresponding words are, the better?)
task : WMT'14 English to French MT task
Metric : BLEU (Bilingual Evaluation Understudy) (an n-gram-based comparison of machine translation output against human reference translations)
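A quick feel for the formula, as a toy single-reference sketch in Python (illustrative only, not the official WMT scoring script): the geometric mean of clipped n-gram precisions multiplied by a brevity penalty.

import math
from collections import Counter

def simple_bleu(candidate, reference, max_n=4):
    # clipped n-gram precisions for n = 1..max_n
    precisions = []
    for n in range(1, max_n + 1):
        cand = Counter(tuple(candidate[i:i + n]) for i in range(len(candidate) - n + 1))
        ref = Counter(tuple(reference[i:i + n]) for i in range(len(reference) - n + 1))
        overlap = sum((cand & ref).values())                                 # clipped counts
        precisions.append(max(overlap, 1e-9) / max(sum(cand.values()), 1))   # crude smoothing to avoid log(0)
    # brevity penalty: punish candidates shorter than the reference
    bp = 1.0 if len(candidate) > len(reference) else math.exp(1 - len(reference) / max(len(candidate), 1))
    return bp * math.exp(sum(math.log(p) for p in precisions) / max_n)

print(simple_bleu("the cat is on the mat".split(), "there is a cat on the mat".split()))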
Experiment setting
1) Decoding: directly translate the input sentence without using a reference SMT system, applying beam search decoding (see the beam search sketch below)
2) Rescoring: rescore the n-best lists of an SMT baseline (see the rescoring sketch at the end of these notes)
[Quote] We also used the LSTM to rescore the 1000-best lists produced by the baseline system [29]. To rescore an n-best list, we computed the log probability of every hypothesis with our LSTM and took an even average with their score and the LSTM’s score.
https://m.blog.naver.com/sooftware/221816126290
3) Reversing
- [Quote] While we do not have a complete explanation to this phenomenon, we believe that it is caused by the introduction of many short term dependencies to the dataset. Normally, when we concatenate a source sentence with a target sentence, each word in the source sentence is far from its corresponding word in the target sentence.
- We cannot fully explain why this helps, but we think it is because reversing introduces many short-term dependencies into the dataset (normally, corresponding source and target words end up far apart).
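The beam search decoding from 1), as a minimal sketch (my own illustration, not the paper's decoder; log_prob_fn is an assumed interface that returns {next token: log probability} for a given prefix):

import heapq

def beam_search(log_prob_fn, sos, eos, beam_size=2, max_len=20):
    beams = [(0.0, [sos])]                               # (cumulative log prob, token sequence)
    for _ in range(max_len):
        candidates = []
        for score, seq in beams:
            if seq[-1] == eos:                           # keep finished hypotheses as-is
                candidates.append((score, seq))
                continue
            for tok, logp in log_prob_fn(seq).items():
                candidates.append((score + logp, seq + [tok]))
        beams = heapq.nlargest(beam_size, candidates, key=lambda c: c[0])
        if all(seq[-1] == eos for _, seq in beams):
            break
    return beams[0][1]                                   # highest-scoring hypothesis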
- SOTA model => Durrani, N., Haddow, B., Koehn, P., & Heafield, K. (2014, June). Edinburgh’s phrase-based machine translation systems for WMT-14. In Proceedings of the Ninth Workshop on Statistical Machine Translation (pp. 97-104).
=> a statistical (phrase-based) approach with greedy decoding
https://en.wikipedia.org/wiki/Statistical_machine_translation#Phrase-based_translation
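The rescoring from 2), as a tiny sketch (the nbest list and lstm_log_prob scorer are hypothetical inputs; only the "even average" rule comes from the paper):

def rescore_nbest(nbest, lstm_log_prob):
    # nbest: list of (hypothesis, smt_score) pairs from the SMT baseline
    # lstm_log_prob: callable mapping a hypothesis to the LSTM's log probability given the source
    rescored = [(hyp, 0.5 * (smt_score + lstm_log_prob(hyp))) for hyp, smt_score in nbest]
    return max(rescored, key=lambda pair: pair[1])[0]    # best hypothesis after averaging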
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
# https://github.com/bentrevett/pytorch-seq2seq
# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
# # 1 - Sequence to Sequence Learning with Neural Networks
#
# [Sequence to Sequence Learning with Neural Networks](https://arxiv.org/abs/1409.3215) paper.
# In[1]:
import re
import torch
import torch.nn as nn
import torch.optim as optim
#from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Example,Dataset,Field, BucketIterator
import spacy
import numpy as np
import random
import math
import time
from tqdm import tqdm
# In[2]:
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# # 1. Preparing Data
# # dataset source: https://github.com/jungyeul/korean-parallel-corpora
#
# # How to extract the .tar.gz
# import tarfile
# tar = tarfile.open("korean-english-park.train.tar.gz")
# tar.extractall()
# tar.close()
# In[3]:
with open("korean-english-park.train.en", "r") as f:
train_en = f.readlines()
with open("korean-english-park.test.en", "r") as f:
test_en = f.readlines()
with open("korean-english-park.train.ko", "r") as f:
train_ko = f.readlines()
with open("korean-english-park.test.ko", "r") as f:
test_ko = f.readlines()
# In[4]:
train_en[0], train_ko[0]
# In[60]:
test_en[0], test_ko[0]
# In[5]:
spacy_kor = spacy.load('ko_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')
# In[ ]:
# ### Tokenizer function
# - These can be passed to torchtext and will take in the sentence as a string and return the sentence as a list of tokens.
# - Following the paper's input-reversal trick, we reverse the source (Korean) sentence after it has been transformed into a list of tokens.
# In[6]:
def clean_text(text):
"""
remove special characters from the input sentence to normalize it
Args:
text: (string) text string which may contain special character
Returns:
normalized sentence
"""
    text = re.sub(r'[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`…》]', '', text)
return text
# In[7]:
def tokenize_kor(text):
"""
Tokenizes korean text from a string into a list of strings (tokens) and reverses it
"""
return [tok.text for tok in spacy_kor.tokenizer(text)][::-1]
def tokenize_en(text):
"""
Tokenizes English text from a string into a list of strings (tokens)
"""
return [tok.text for tok in spacy_en.tokenizer(text)]
# In[9]:
[tok.text for tok in spacy_kor.tokenizer("나는 바보입니다 왜 이러세요")][::-1]
# ### Torchtext Fields
# - handle how data should be processed
# - field also appends the "start of sequence" and "end of sequence" tokens via the init_token and eos_token arguments, and converts all words to lowercase.
# In[10]:
SRC = Field(tokenize = tokenize_kor,
init_token = '<sos>',
eos_token = '<eos>',
lower = True)
TRG = Field(tokenize = tokenize_en,
init_token = '<sos>',
eos_token = '<eos>',
lower = True)
# In[11]:
def convert_to_dataset(SRC, TRG, train_src, train_tg):
    # convert each Korean/English sentence pair into a torchtext 'Example' with 'src' and 'trg' Fields
list_of_examples = [Example.fromlist([clean_text(ko),clean_text(en)],
fields=[('src', SRC), ('trg', TRG)]) for ko,en in zip(train_src,train_tg)]
# construct torchtext 'Dataset' using torchtext 'Example' list
dataset = Dataset(examples=list_of_examples, fields=[('src', SRC), ('trg', TRG)])
return dataset
# In[102]:
train_data = convert_to_dataset(SRC, TRG, train_ko[:10000], train_en[:10000])
val_data = convert_to_dataset(SRC, TRG, test_ko,test_en)
test_data = convert_to_dataset(SRC, TRG, test_ko,test_en)
# In[103]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")
# In[104]:
print(vars(train_data.examples[0]))
# ### Build Vocab
# - The vocabulary is used to associate each unique token with an index (an integer).
# - The vocabularies of the source and target languages are distinct.
# - The min_freq argument controls which tokens enter the vocabulary; with min_freq = 2, tokens that appear only once would be converted into an <unk> (unknown) token.
# - Here min_freq = 1 is used, so every token seen in the training set is kept.
# - our vocabulary should only be built from the training set and not the validation/test set.
# In[105]:
SRC.build_vocab(train_data, min_freq = 1)
TRG.build_vocab(train_data, min_freq = 1)
# In[106]:
print(f"Unique tokens in source (kor) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")
# ### Iterator
# : BucketIterator instead of the standard Iterator as it creates batches in such a way that it minimizes the amount of padding in both the source and target sentences.
# In[17]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# In[107]:
BATCH_SIZE = 128
train_iterator,val_iterator, test_iterator = BucketIterator.splits(
(train_data, val_data, test_data),
batch_size = BATCH_SIZE,
device = device
,sort=False)
# # 2. Building the Seq2Seq Model
# ### Encoder
# : 2 layer LSTM. (The paper we are implementing uses a 4-layer LSTM, but in the interest of training time we cut this down to 2-layers.)
#
# $$\begin{align*}
# (h_t^1, c_t^1) &= \text{EncoderLSTM}^1(e(x_t), (h_{t-1}^1, c_{t-1}^1))\\
# (h_t^2, c_t^2) &= \text{EncoderLSTM}^2(h_t^1, (h_{t-1}^2, c_{t-1}^2))
# \end{align*}$$
#
# ![](https://github.com/bentrevett/pytorch-seq2seq/blob/master/assets/seq2seq2.png?raw=1)
# In[108]:
class Encoder(nn.Module):
def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
super().__init__()
self.hid_dim = hid_dim
self.n_layers = n_layers
self.embedding = nn.Embedding(input_dim, emb_dim)
self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
self.dropout = nn.Dropout(dropout)
def forward(self, src):
#src = [src len, batch size]
embedded = self.dropout(self.embedding(src))
#embedded = [src len, batch size, emb dim]
outputs, (hidden, cell) = self.rnn(embedded)
#outputs = [src len, batch size, hid dim * n directions]
#hidden = [n layers * n directions, batch size, hid dim]
#cell = [n layers * n directions, batch size, hid dim]
#outputs are always from the top hidden layer
return hidden, cell
# ### Decoder
#
# : a 2-layer (4 in the paper) LSTM.
#
# ![](https://github.com/bentrevett/pytorch-seq2seq/blob/master/assets/seq2seq3.png?raw=1)
# In[109]:
class Decoder(nn.Module):
def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
super().__init__()
self.output_dim = output_dim
self.hid_dim = hid_dim
self.n_layers = n_layers
self.embedding = nn.Embedding(output_dim, emb_dim)
self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
self.fc_out = nn.Linear(hid_dim, output_dim)
self.dropout = nn.Dropout(dropout)
def forward(self, input, hidden, cell):
#input = [batch size]
#hidden = [n layers * n directions, batch size, hid dim]
#cell = [n layers * n directions, batch size, hid dim]
#n directions in the decoder will both always be 1, therefore:
#hidden = [n layers, batch size, hid dim]
#context = [n layers, batch size, hid dim]
input = input.unsqueeze(0)
#input = [1, batch size]
embedded = self.dropout(self.embedding(input))
#embedded = [1, batch size, emb dim]
output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
#output = [seq len, batch size, hid dim * n directions]
#hidden = [n layers * n directions, batch size, hid dim]
#cell = [n layers * n directions, batch size, hid dim]
#seq len and n directions will always be 1 in the decoder, therefore:
#output = [1, batch size, hid dim]
#hidden = [n layers, batch size, hid dim]
#cell = [n layers, batch size, hid dim]
prediction = self.fc_out(output.squeeze(0))
#prediction = [batch size, output dim]
return prediction, hidden, cell
# ### Seq2Seq
#
# For the final part of the implementation, we'll implement the seq2seq model. This will handle:
# - receiving the input/source sentence
# - using the encoder to produce the context vectors
# - using the decoder to produce the predicted output/target sentence
#
# ![](https://github.com/bentrevett/pytorch-seq2seq/blob/master/assets/seq2seq4.png?raw=1)
#
# The teacher forcing ratio is used when training our model.
#
# **Note**: our decoder loop starts at 1, not 0. This means the 0th element of our `outputs` tensor remains all zeros. So our `trg` and `outputs` look something like:
#
# $$\begin{align*}
# \text{trg} = [<sos>, &y_1, y_2, y_3, <eos>]\\
# \text{outputs} = [0, &\hat{y}_1, \hat{y}_2, \hat{y}_3, <eos>]
# \end{align*}$$
#
# Later on when we calculate the loss, we cut off the first element of each tensor to get:
#
# $$\begin{align*}
# \text{trg} = [&y_1, y_2, y_3, <eos>]\\
# \text{outputs} = [&\hat{y}_1, \hat{y}_2, \hat{y}_3, <eos>]
# \end{align*}$$
# In[110]:
class Seq2Seq(nn.Module):
def __init__(self, encoder, decoder, device):
super().__init__()
self.encoder = encoder
self.decoder = decoder
self.device = device
assert encoder.hid_dim == decoder.hid_dim, "Hidden dimensions of encoder and decoder must be equal!"
assert encoder.n_layers == decoder.n_layers, "Encoder and decoder must have equal number of layers!"
def forward(self, src, trg, teacher_forcing_ratio = 0.5):
#src = [src len, batch size]
#trg = [trg len, batch size]
#teacher_forcing_ratio is probability to use teacher forcing
#e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
batch_size = trg.shape[1]
trg_len = trg.shape[0]
trg_vocab_size = self.decoder.output_dim
#tensor to store decoder outputs
outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
#last hidden state of the encoder is used as the initial hidden state of the decoder
hidden, cell = self.encoder(src)
#first input to the decoder is the <sos> tokens
input = trg[0,:]
for t in range(1, trg_len):
#insert input token embedding, previous hidden and previous cell states
#receive output tensor (predictions) and new hidden and cell states
output, hidden, cell = self.decoder(input, hidden, cell)
#place predictions in a tensor holding predictions for each token
outputs[t] = output
#decide if we are going to use teacher forcing or not
teacher_force = random.random() < teacher_forcing_ratio
#get the highest predicted token from our predictions
top1 = output.argmax(1)
#if teacher forcing, use actual next token as next input
#if not, use predicted token
input = trg[t] if teacher_force else top1
return outputs
# # 3. Training the Seq2Seq Model
# In[111]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)
# In[112]:
def init_weights(m):
for name, param in m.named_parameters():
nn.init.uniform_(param.data, -0.08, 0.08)
model.apply(init_weights)
# In[113]:
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {count_parameters(model):,} trainable parameters')
# In[114]:
optimizer = optim.Adam(model.parameters())
# In[115]:
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)
# ### train loop
# In[116]:
def train(model, iterator, optimizer, criterion, clip):
model.train()
epoch_loss = 0
for i, batch in enumerate(iterator):
src = batch.src
trg = batch.trg
optimizer.zero_grad()
output = model(src, trg)
#trg = [trg len, batch size]
#output = [trg len, batch size, output dim]
output_dim = output.shape[-1]
output = output[1:].view(-1, output_dim)
trg = trg[1:].view(-1)
#trg = [(trg len - 1) * batch size]
#output = [(trg len - 1) * batch size, output dim]
loss = criterion(output, trg)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
optimizer.step()
epoch_loss += loss.item()
return epoch_loss / len(iterator)
# ### evaluation loop
# In[117]:
def evaluate(model, iterator, criterion):
model.eval()
epoch_loss = 0
with torch.no_grad():
for i, batch in enumerate(iterator):
src = batch.src
trg = batch.trg
output = model(src, trg, 0) #turn off teacher forcing
#trg = [trg len, batch size]
#output = [trg len, batch size, output dim]
output_dim = output.shape[-1]
output = output[1:].view(-1, output_dim)
trg = trg[1:].view(-1)
#trg = [(trg len - 1) * batch size]
#output = [(trg len - 1) * batch size, output dim]
loss = criterion(output, trg)
epoch_loss += loss.item()
return epoch_loss / len(iterator)
# In[118]:
def epoch_time(start_time, end_time):
elapsed_time = end_time - start_time
elapsed_mins = int(elapsed_time / 60)
elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
return elapsed_mins, elapsed_secs
# In[119]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')  # non-interactive backend: figures are drawn in memory, not displayed
import matplotlib.ticker as ticker

def showPlot(points):
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator()  # base=0.2
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)
# In[120]:
N_EPOCHS = 10
CLIP = 1
best_valid_loss = float('inf')
print_every=3
plot_every=3
start = time.time()
plot_losses = []
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every
for epoch in tqdm(range(N_EPOCHS)):
start_time = time.time()
train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
#valid_loss = evaluate(model, valid_iterator, criterion)
print_loss_total += train_loss
plot_loss_total += train_loss
end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)
# if epoch % print_every == 0:
# print_loss_avg = print_loss_total / print_every
# print_loss_total = 0
# print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
# iter, iter / n_iters * 100, print_loss_avg))
if epoch % plot_every == 0:
plot_loss_avg = plot_loss_total / plot_every
plot_losses.append(plot_loss_avg)
plot_loss_total = 0
showPlot(plot_losses)
# if valid_loss < best_valid_loss:
# best_valid_loss = valid_loss
# torch.save(model.state_dict(), 'tut1-model.pt')
# print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
# print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
# #print(f'\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')
# In[121]:
#model.load_state_dict(torch.load('tut1-model.pt'))
test_loss = evaluate(model, test_iterator, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')
# In[ ]:
### Print translation results
# In[157]:
for i, batch in enumerate(test_iterator):
src = batch.src
trg = batch.trg
output = model(src, trg, 0) #turn off teacher forcing
# In[191]:
src = batch.src
trg = batch.trg
output = model(src, trg, 0) #turn off teacher forcing
for src_,trg_,out_ in zip(src.transpose(1,0),trg.transpose(1,0),output.transpose(1,0)):
src_list = []
trg_list = []
pred_list = []
for token in src_:
word = SRC.vocab.itos[token]
if word not in ['<sos>','<pad>','<eos>']:
src_list.append(word)
for token in trg_:
word = TRG.vocab.itos[token]
if word not in ['<sos>','<pad>','<eos>']:
trg_list.append(word)
for token in out_:
topv, topi = token.data.topk(1)
word = TRG.vocab.itos[topi.item()]
if word not in ['<sos>','<pad>','<eos>']:
pred_list.append(word)
print("src : ", src_list[::-1])
print("trg : ", trg_list)
print("predict : ", pred_list)
print()
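# ### Corpus BLEU on the test set (rough sketch)
# The paper's metric is BLEU, but this notebook so far only prints examples. The cell below is a rough
# estimate using nltk's corpus_bleu (an extra dependency, not used elsewhere here); note that
# model(src, trg, 0) still ties the prediction length to the gold target, so treat this as a sanity
# check rather than a proper decoding-based evaluation. ids_to_tokens is a small helper added for this cell.
# In[ ]:
from nltk.translate.bleu_score import corpus_bleu

special = {'<sos>', '<eos>', '<pad>', '<unk>'}

def ids_to_tokens(ids):
    # map a 1-D tensor of token ids to words, dropping special tokens
    toks = [TRG.vocab.itos[t.item()] for t in ids]
    return [w for w in toks if w not in special]

references, hypotheses = [], []
model.eval()
with torch.no_grad():
    for batch in test_iterator:
        output = model(batch.src, batch.trg, 0)      # turn off teacher forcing
        preds = output[1:].argmax(-1)                # drop the all-zero first step -> [trg len - 1, batch size]
        for trg_col, pred_col in zip(batch.trg[1:].transpose(1, 0), preds.transpose(1, 0)):
            references.append([ids_to_tokens(trg_col)])
            hypotheses.append(ids_to_tokens(pred_col))

print(f'Corpus BLEU: {corpus_bleu(references, hypotheses):.4f}')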
# ### in detail
# In[68]:
val_iterator
# In[76]:
for i, batch in enumerate(val_iterator):
src = batch.src
trg = batch.trg
# In[79]:
i
# In[82]:
src.size(), trg.size()
# In[102]:
output = model(src, trg)
output_dim = output.shape[-1]
output = output[1:].view(-1, output_dim)
trg = trg[1:].view(-1)
# In[109]:
topv, topi = output[0].data.topk(1)
# In[110]:
topi.item()
# In[117]:
TRG.vocab.itos[topi.item()]
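# ### Greedy decoding without gold targets (minimal sketch)
# The cells above always call model(src, trg, 0), so generation still follows the gold target length.
# The helper below is an illustrative addition (not part of the original notebook): it decodes greedily
# from <sos> until <eos> or max_len, reusing the Encoder/Decoder defined earlier.
# In[ ]:
def greedy_decode(model, src_tensor, trg_field, max_len=50):
    # src_tensor = [src len, 1]: a single, already-numericalized source sentence
    model.eval()
    with torch.no_grad():
        hidden, cell = model.encoder(src_tensor)
        input = torch.tensor([trg_field.vocab.stoi[trg_field.init_token]], device=src_tensor.device)
        tokens = []
        for _ in range(max_len):
            output, hidden, cell = model.decoder(input, hidden, cell)
            top1 = output.argmax(1)                              # [1]
            word = trg_field.vocab.itos[top1.item()]
            if word == trg_field.eos_token:
                break
            tokens.append(word)
            input = top1
    return tokens

# e.g. greedy_decode(model, src[:, :1], TRG)  # src from any batch above, shape [src len, 1]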