(1) word-based embedding(word2vec, GloVe)
(2) character-based embedding(FastText)
(3) character-based w/ word-based embedding(Character-Aware Neural Language Models -> charCNN)
(nn.Embedding() 이용)
Our work questions the necessity of word embeddings (as inputs) for neural language modeling.
import torch
import torch.nn as nn
import torchvision.models as models
import numpy as np
class HighwayNetwork(nn.Module):
    """Single highway layer: ``z = t * h(x) + (1 - t) * x``.

    ``t`` is the transform gate (Linear + Sigmoid) and ``h`` is the
    candidate transform (Linear + activation). Both linear layers map
    ``input_size`` features to ``input_size`` features, so the output has
    the same shape as the input.

    Args:
        input_size: Number of input (and output) features.
        activation: Name of an activation class in ``torch.nn``
            (for example ``'ReLU'`` or ``'Tanh'``). Defaults to ``'ReLU'``.

    Raises:
        ValueError: If ``activation`` does not name a ``torch.nn`` module class.
    """

    def __init__(self, input_size, activation='ReLU'):
        super(HighwayNetwork, self).__init__()
        # transform gate (t)
        self.trans_gate = nn.Sequential(
            nn.Linear(input_size, input_size),
            nn.Sigmoid())
        # highway transform (h)
        # Resolve the activation by name so that values other than 'ReLU'
        # no longer leave self.activation undefined.
        activation_cls = getattr(nn, activation, None) if isinstance(activation, str) else None
        if not (isinstance(activation_cls, type) and issubclass(activation_cls, nn.Module)):
            raise ValueError(
                "Unsupported activation: {!r}; expected the name of a torch.nn "
                "activation class such as 'ReLU'".format(activation))
        self.activation = activation_cls()
        self.h_layer = nn.Sequential(
            nn.Linear(input_size, input_size),
            self.activation)
        # Start the gate bias at -2 so sigmoid(t) is small at initialisation
        # and the layer initially mostly carries its input through.
        nn.init.constant_(self.trans_gate[0].bias, -2.0)

    def forward(self, x):
        """Return the gated mix of the transformed input and the input itself."""
        t = self.trans_gate(x)
        h = self.h_layer(x)
        return t * h + (1 - t) * x
class LM(nn.Module):
    """Character-aware language model.

    Pipeline: character embedding -> parallel CNNs with max-over-time
    pooling -> highway layer -> 2-layer LSTM -> dropout + linear projection
    onto the word vocabulary.

    Args:
        word_vocab: Word vocabulary; only ``len(word_vocab)`` is used, as the
            size of the output layer.
        char_vocab: Character vocabulary; ``len(char_vocab) + 1`` embedding
            rows are allocated, with index 0 used as the padding index.
        max_len: Number of characters per word in the input. The pooling
            window is sized from it, so inputs must be padded to this length.
        embed_dim: Character embedding dimension.
        out_channels: Base filter count; a kernel of height ``k`` gets
            ``out_channels * k`` filters.
        kernels: Iterable of convolution kernel heights (in characters).
        hidden_size: LSTM hidden size.
        batch_size: Unused; kept so existing callers keep working.
    """

    def __init__(self, word_vocab, char_vocab, max_len, embed_dim, out_channels, kernels, hidden_size, batch_size):
        super(LM, self).__init__()
        self.word_vocab = word_vocab
        self.char_vocab = char_vocab

        # Embedding layer (index 0 is the padding index, hence the +1)
        self.embed = nn.Embedding(len(char_vocab) + 1, embed_dim, padding_idx=0)

        # CNN layer: one conv + tanh + max-pool branch per kernel height.
        # The kernel spans the full embedding width, and the pooling window
        # spans every remaining character position.
        self.cnns = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(1, out_channels * kernel, kernel_size=(kernel, embed_dim)),
                nn.Tanh(),
                nn.MaxPool2d((max_len - kernel + 1, 1)))
            for kernel in kernels
        ])

        # Highway layer over the concatenated CNN features.
        # int(...) replaces np.asscalar, which current NumPy no longer provides.
        input_size = int(out_channels * sum(kernels))
        self.highway = HighwayNetwork(input_size)

        # LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True, dropout=0.5)

        # Output layer
        self.linear = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(hidden_size, len(word_vocab)))

    def forward(self, x, h):
        """Run the model on a batch of character-id sequences.

        Args:
            x: Integer tensor indexed as (batch, word position, character
                position); the last dimension must equal ``max_len``.
            h: Initial LSTM state, passed to ``nn.LSTM`` unchanged.

        Returns:
            Tuple ``(out, h)``: ``out`` has shape
            ``(batch * seq_len, len(word_vocab))`` and ``h`` is the LSTM
            state returned by ``nn.LSTM``.
        """
        batch_size, seq_len = x.shape[0], x.shape[1]

        # Treat every word as an independent sample: (batch * seq_len, max_len).
        # reshape (not view) so that non-contiguous inputs are accepted too.
        x = x.reshape(-1, x.shape[2])
        x = self.embed(x)
        # Add the single input channel expected by Conv2d.
        x = x.unsqueeze(1)

        # Drop only the two pooled singleton axes; a bare squeeze() would also
        # drop the batch axis when batch * seq_len == 1.
        y = [cnn(x).squeeze(3).squeeze(2) for cnn in self.cnns]
        w = torch.cat(y, 1)
        w = self.highway(w)

        w = w.view(batch_size, seq_len, -1)
        out, h = self.lstm(w, h)
        out = out.reshape(batch_size * seq_len, -1)
        out = self.linear(out)
        return out, h
# Hyperparameters

# -- word level --
batch_size = 20    # number of rows the corpus is split into
seq_len = 35       # words per training sequence
hidden_size = 500  # LSTM hidden size

# -- character level --
max_len = dic['max_len'] + 2  # characters per word: real max length (19) + start and end markers
embed_dim = 15                # character embedding dimension
kernels = [1, 2, 3, 4, 5, 6]  # CNN kernel heights, in characters
out_channels = 25             # base filter count per kernel
# Training input data
# NOTE(review): the return value of to_char is discarded, so it presumably
# converts `data` in place — confirm against its definition.
to_char(data, char_vocab, max_len)
# list -> ndarray -> tensor, then split into batch_size rows of max_len-character words
data = torch.from_numpy(np.array(data)).view(batch_size, -1, max_len)
to_char는 단어를 character vocabulary(char_vocab)를 참조하여 character level로 분리하고, max_len(21 = 실제 최대 길이 19 + start, end)까지 zero-padding하는 custom 함수
data.shape  # torch.Size([20, 46797, 21]) = (batch size, total word length, characters by word)
# Input data for testing: the first seq_len words of the first batch row,
# with a leading batch axis of size 1.
input_ = data[0, :seq_len, :].unsqueeze(0)
input_
shape: [1, 35, 21] == [batch_size, word sequence, characters by word]
# Flatten to (batch size * seq_len (# of words), max_len (characters per word)).
input_ = input_.view(-1, input_.shape[2])
input_.shape
# NOTE(review): `embed` is assumed to already hold the character embeddings
# of `input_` from a cell that is not visible here — confirm.
# The row count is taken from `input_` itself: the global batch_size (20)
# times seq_len (35) would be 700, but this test input has only 35 rows.
# `embed_dim` is the hyper-parameter name used elsewhere; `emb_dim` is never defined.
embed = embed.view(input_.shape[0], 1, max_len, embed_dim)
embed.shape  # [35, 1, 21, 15]
before embed
input_
shape: [1, 35, 21]
after embed
input_
shape: [35, 1, 21, 15] -> [batch_size * word sequence, 1, characters by word, character embedding dimension]
# Settings for the stand-alone CNN walkthrough.
out_channels = 25            # base filter count; a kernel of height k gets 25 * k filters
kernels = [1, 2, 3, 4, 5]    # kernel heights, in characters
embed_dim = 15               # character embedding dimension
max_len = 21                 # characters per word

# One conv -> tanh -> max-pool branch per kernel height. The kernel covers the
# full embedding width and the pooling window covers every remaining position.
_cnns = [
    nn.Sequential(
        nn.Conv2d(1, out_channels * height, kernel_size=(height, embed_dim)),
        nn.Tanh(),
        nn.MaxPool2d((max_len - height + 1, 1)),
    )
    for height in kernels
]
cnns_ = nn.ModuleList(_cnns)
small model에서는 kernel size를 1, 2, 3, 4, 5로 설정(output channel size = 25)
각각 conv2d의 output shape
- kernel = 1: torch.Size([35, 25, 21, 1])
- kernel = 2: torch.Size([35, 50, 20, 1])
- kernel = 3: torch.Size([35, 75, 19, 1])
# Run every conv -> tanh -> max-pool branch on the embedded characters.
cnn_output = [branch(embed) for branch in cnns_]
# Print the shape of every branch output rather than a hard-coded first five,
# so the printout stays complete if the kernel list changes.
print([out.shape for out in cnn_output])
max pooling까지 한 결과: 좌측에서부터 kernel = 1, kernel = 2, ..., kernel = 5
[torch.Size([35, 25, 1, 1]), torch.Size([35, 50, 1, 1]), torch.Size([35, 75, 1, 1]), torch.Size([35, 100, 1, 1]), torch.Size([35, 125, 1, 1])]
# Drop only the two pooled singleton axes (N, C, 1, 1) -> (N, C). A bare
# squeeze() would also remove the first axis when it has size 1.
cnn_squeeze = [out.squeeze(3).squeeze(2) for out in cnn_output]
# Print every shape instead of a hard-coded first five.
print([feat.shape for feat in cnn_squeeze])
[torch.Size([35, 25]), torch.Size([35, 50]), torch.Size([35, 75]), torch.Size([35, 100]), torch.Size([35, 125])]
# Concatenate the per-kernel features along the feature axis (dim 1).
cnn_squeeze_cat = torch.cat(cnn_squeeze, dim=1)
print(cnn_squeeze_cat.size())
torch.Size([35, 375])
: 375 = 25 + 50 + 75 + 100 + 125
class HighwayNetwork(nn.Module):
    """Single highway layer: ``z = t * h(x) + (1 - t) * x``.

    ``t`` is the transform gate (Linear + Sigmoid) and ``h`` is the
    candidate transform (Linear + activation). Both linear layers map
    ``input_size`` features to ``input_size`` features, so the output has
    the same shape as the input.

    Args:
        input_size: Number of input (and output) features.
        activation: Name of an activation class in ``torch.nn``
            (for example ``'ReLU'`` or ``'Tanh'``). Defaults to ``'ReLU'``.

    Raises:
        ValueError: If ``activation`` does not name a ``torch.nn`` module class.
    """

    def __init__(self, input_size, activation='ReLU'):
        super(HighwayNetwork, self).__init__()
        # transform gate (t)
        self.trans_gate = nn.Sequential(
            nn.Linear(input_size, input_size),
            nn.Sigmoid())
        # highway transform (h)
        # Resolve the activation by name so that values other than 'ReLU'
        # no longer leave self.activation undefined.
        activation_cls = getattr(nn, activation, None) if isinstance(activation, str) else None
        if not (isinstance(activation_cls, type) and issubclass(activation_cls, nn.Module)):
            raise ValueError(
                "Unsupported activation: {!r}; expected the name of a torch.nn "
                "activation class such as 'ReLU'".format(activation))
        self.activation = activation_cls()
        self.h_layer = nn.Sequential(
            nn.Linear(input_size, input_size),
            self.activation)
        # Set the bias of the gate's linear layer to -2 so sigmoid(t) is small
        # at initialisation and the layer initially mostly carries its input through.
        nn.init.constant_(self.trans_gate[0].bias, -2.0)

    def forward(self, x):
        """Return the gated mix of the transformed input and the input itself."""
        t = self.trans_gate(x)
        h = self.h_layer(x)
        return t * h + (1 - t) * x
# transform gate: the Linear + Sigmoid branch, applied to the concatenated CNN features
# NOTE(review): `highway` is not defined in the visible code; presumably a
# HighwayNetwork instance built with input_size 375 — confirm.
transform_ = highway.trans_gate(cnn_squeeze_cat)
# highway gate: the Linear + activation branch, applied to the same input
highway_ = highway.h_layer(cnn_squeeze_cat)
shape: both [35, 375] == [batch_size * seq_len, squeezed_size]