앞 내용이 궁금하신 분들은
Glow-TTS 코드 리뷰 3편을 먼저 보고 오시면 됩니다!
(encoder): Encoder(
(drop): Dropout(p=0.1, inplace=False)
(attn_layers): ModuleList(
(0): MultiHeadAttention(
(conv_q): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(conv_k): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(conv_v): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(conv_o): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(drop): Dropout(p=0.1, inplace=False)
)
(1): MultiHeadAttention(
(conv_q): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(conv_k): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(conv_v): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(conv_o): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(drop): Dropout(p=0.1, inplace=False)
)
(2): MultiHeadAttention(
(conv_q): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(conv_k): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(conv_v): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(conv_o): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(drop): Dropout(p=0.1, inplace=False)
)
(3): MultiHeadAttention(
(conv_q): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(conv_k): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(conv_v): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(conv_o): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(drop): Dropout(p=0.1, inplace=False)
)
(4): MultiHeadAttention(
(conv_q): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(conv_k): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(conv_v): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(conv_o): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(drop): Dropout(p=0.1, inplace=False)
)
(5): MultiHeadAttention(
(conv_q): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(conv_k): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(conv_v): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(conv_o): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(drop): Dropout(p=0.1, inplace=False)
)
)
(norm_layers_1): ModuleList(
(0): LayerNorm()
(1): LayerNorm()
(2): LayerNorm()
(3): LayerNorm()
(4): LayerNorm()
(5): LayerNorm()
)
(ffn_layers): ModuleList(
(0): FFN(
(conv_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(conv_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
(drop): Dropout(p=0.1, inplace=False)
)
(1): FFN(
(conv_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(conv_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
(drop): Dropout(p=0.1, inplace=False)
)
(2): FFN(
(conv_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(conv_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
(drop): Dropout(p=0.1, inplace=False)
)
(3): FFN(
(conv_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(conv_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
(drop): Dropout(p=0.1, inplace=False)
)
(4): FFN(
(conv_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(conv_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
(drop): Dropout(p=0.1, inplace=False)
)
(5): FFN(
(conv_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(conv_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
(drop): Dropout(p=0.1, inplace=False)
)
)
(norm_layers_2): ModuleList(
(0): LayerNorm()
(1): LayerNorm()
(2): LayerNorm()
(3): LayerNorm()
(4): LayerNorm()
(5): LayerNorm()
)
)
def forward(self, x, x_lengths, g=None):
    """Encode token ids into prior statistics and predicted log-durations.

    Args:
        x: Token ids fed to ``self.emb``; the embedded result is treated
            as [b, t, h].
        x_lengths: Per-sample lengths passed to ``commons.sequence_mask``.
        g: Optional conditioning tensor. When given it is expanded along
            the last axis and concatenated (on dim 1) to the duration
            predictor input.

    Returns:
        Tuple ``(x_m, x_logs, logw, x_mask)``: the masked ``proj_m``
        output, the masked ``proj_s`` output (all zeros when
        ``self.mean_only`` is set), the output of ``self.proj_w``, and the
        mask itself.
    """
    # Scaled embedding, then swap the last two axes: [b, t, h] -> [b, h, t].
    embedded = self.emb(x) * math.sqrt(self.hidden_channels)
    hidden = embedded.transpose(1, -1)

    # Length mask with a singleton axis at dim 1, cast to the activation dtype.
    valid_positions = commons.sequence_mask(x_lengths, hidden.size(2))
    x_mask = valid_positions.unsqueeze(1).to(hidden.dtype)

    if self.prenet:
        hidden = self.pre(hidden, x_mask)
    hidden = self.encoder(hidden, x_mask)

    # The duration predictor works on a detached view, so its gradients do
    # not flow back into the encoder output.
    dp_input = hidden.detach()
    if g is not None:
        g_exp = g.expand(-1, -1, hidden.size(-1))
        dp_input = torch.cat([dp_input, g_exp], 1)

    x_m = self.proj_m(hidden) * x_mask
    if self.mean_only:
        x_logs = torch.zeros_like(x_m)
    else:
        x_logs = self.proj_s(hidden) * x_mask

    logw = self.proj_w(dp_input, x_mask)
    return x_m, x_logs, logw, x_mask
# Excerpt (presumably from the text encoder's constructor): embedding table
# with n_vocab rows of size hidden_channels.
self.emb = nn.Embedding(n_vocab, hidden_channels)
# Author's note from a sample run: 126 vectors are produced, each with
# embedding dimension 192 (hidden_channels).
# Scale the embeddings by sqrt(hidden_channels).
x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
x = torch.transpose(x, 1, -1) # [b, h, t]
# Build the length mask, add a singleton axis at dim 1, and cast it to x's dtype.
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
# Debug output: shape of the raw mask, of the mask after unsqueeze(1), and of the final x_mask.
print(f'1: {commons.sequence_mask(x_lengths, x.size(2)).shape}')
print(f'2: {torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).shape}')
print(f'3: {x_mask.shape}')
def sequence_mask(length, max_length=None):
    """Build a boolean mask of the valid (non-padded) positions per sequence.

    Args:
        length: 1-D tensor of sequence lengths, one entry per batch item.
        max_length: Width of the mask. Defaults to ``length.max()``.

    Returns:
        Bool tensor of shape ``[len(length), max_length]`` where entry
        ``(i, j)`` is True exactly when ``j < length[i]``.
    """
    if max_length is None:
        max_length = length.max()
    positions = torch.arange(max_length, dtype=length.dtype, device=length.device)
    # Broadcast [1, max_length] against [b, 1] so every position index is
    # compared with every sequence length.
    return positions.unsqueeze(0) < length.unsqueeze(1)
# Optional pre-net: a ConvReluNorm stack built with kernel_size=5, n_layers=3
# and p_dropout=0.5; all three channel arguments are hidden_channels.
if prenet:
self.pre = modules.ConvReluNorm(hidden_channels, hidden_channels, hidden_channels, kernel_size=5, n_layers=3, p_dropout=0.5)
ConvReluNorm의 hidden layer 3개(kernel_size=5)를 거치면서, 각 위치의 벡터가 조금 더 넓은 범위의 주변 정보를 갖게 됩니다.
class Encoder(nn.Module):
    """Stack of ``n_layers`` self-attention + feed-forward layers.

    Each layer applies MultiHeadAttention, dropout, a residual LayerNorm,
    then FFN, dropout and a second residual LayerNorm.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=None, block_length=None, **kwargs):
        """Store the hyper-parameters and build the per-layer sub-modules.

        Extra keyword arguments are accepted and ignored.
        """
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.block_length = block_length

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        # Sub-modules are created layer by layer (attention, norm, ffn, norm)
        # so that parameter initialisation happens in the same order.
        for _ in range(self.n_layers):
            attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                window_size=window_size,
                p_dropout=p_dropout,
                block_length=block_length,
            )
            self.attn_layers.append(attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """Run ``x`` through every layer and return the masked result.

        Args:
            x: Input activations (assumed [b, h, t] -- confirm with caller).
            x_mask: Mask multiplied into ``x``; assumed [b, 1, t], which
                makes the attention mask [b, 1, t, t].
        """
        # Outer product of the mask with itself along the last two axes.
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        for idx in range(self.n_layers):
            x = x * x_mask

            # Self-attention sub-layer with residual connection.
            attn_out = self.attn_layers[idx](x, x, attn_mask)
            attn_out = self.drop(attn_out)
            x = self.norm_layers_1[idx](x + attn_out)

            # Feed-forward sub-layer with residual connection.
            ffn_out = self.ffn_layers[idx](x, x_mask)
            ffn_out = self.drop(ffn_out)
            x = self.norm_layers_2[idx](x + ffn_out)
        x = x * x_mask
        print(f'encoder에서 최종 x : {x.shape}')
        return x
다시 forward구문으로 돌아가서 보면,,,
x_dp = torch.detach(x)
torch.detach(x)
: 기존 Tensor와 데이터(메모리)는 공유하지만, 연산 그래프에서 분리되어 gradient가 전파되지 않는 Tensor를 반환 (데이터를 복사하지는 않음)
x_m
: 길이 정보가 담긴 x_mask와 Conv1d를 거친 x를 원소별(element-wise)로 곱해준 값
self.proj_m = nn.Conv1d(hidden_channels, out_channels, 1)
x_m : proj_m(x) * x_mask
x_logs
if not self.mean_only:
x_logs = self.proj_s(x) * x_mask
# print(f'x_logs shape : {x_logs.shape}')
# print(f'self.proj_s(x) shape : {self.proj_s(x).shape}')
# print(f'x_mask : {x_mask}')
else:
x_logs = torch.zeros_like(x_m)
print(f'x_m shape : {x_m.shape}')
print(f'x_logs shape : {x_logs.shape}')
print(f'x_m : {x_m}')
print(f'x_logs : {x_logs}')
logw
logw = self.proj_w(x_dp, x_mask)
self.proj_w = DurationPredictor(hidden_channels + gin_channels, filter_channels_dp, kernel_size, p_dropout)
다시 DurationPredictor 부분을 보자면,,,
hidden_channels + gin_channels = in_channels(입력 채널 수) = 192 (여기서는 gin_channels = 0)
filter_channels_dp = filter_channels(Conv1d의 출력 채널 수, feature dimension) = 256
DurationPredictor forwarding flow
# Excerpt of the DurationPredictor layers: two Conv1d + LayerNorm stages
# (padding = kernel_size // 2) followed by a kernel-size-1 Conv1d that
# projects filter_channels down to a single output channel.
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2)
self.norm_1 = attentions.LayerNorm(filter_channels)
self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2)
self.norm_2 = attentions.LayerNorm(filter_channels)
self.proj = nn.Conv1d(filter_channels, 1, 1)
# Debug output of the configured channel sizes (values seen in the author's run).
print(f'in_channels : {self.in_channels}') # 192
print(f'filter_channels : {self.filter_channels}') # 256
def forward(self, x, x_mask):
    """Run the duration predictor, printing tensor shapes at each stage.

    Args:
        x: Input activations; multiplied by ``x_mask`` before every layer.
        x_mask: Mask applied before ``conv_1``, ``conv_2`` and ``proj``
            and once more to the final output.

    Returns:
        ``self.proj`` output multiplied by ``x_mask``.
    """
    print('시작')
    print(f'x shape : {x.shape}')
    print(f'x_mask shape : {x_mask.shape}')

    # Stage 1: conv -> relu -> norm -> dropout.
    print('1. conv_1')
    hidden = self.conv_1(x * x_mask)
    print(f'x shape : {hidden.shape}')
    hidden = self.drop(self.norm_1(torch.relu(hidden)))

    # Stage 2: same pattern with the second conv/norm pair.
    print('2. conv_2')
    hidden = self.conv_2(hidden * x_mask)
    print(f'x shape : {hidden.shape}')
    hidden = self.drop(self.norm_2(torch.relu(hidden)))

    # Stage 3: final projection.
    print('3. proj(=nn.Conv1d')
    hidden = self.proj(hidden * x_mask)
    print(f'x shape : {hidden.shape}')

    print('최종 반환값')
    masked_out = hidden * x_mask
    print(f'x * x_mask shape : {masked_out.shape}')
    return masked_out
| output | shape | 특징 |
|---|---|---|
| x_m | ([batch size, n_mel_channels(out_channels:80), max(x_lengths) per batch]) | encoder 이후 나온 x에 Conv1d(proj_m) 연산을 취하고 x_mask와 원소별(element-wise) 곱을 해 준 텐서 |
| x_logs | ([batch size, n_mel_channels(out_channels:80), max(x_lengths) per batch]) | mean_only=True인 경우 x_m과 같은 shape의 모든 값이 0인 텐서(torch.zeros_like(x_m)), 그렇지 않으면 proj_s(x) * x_mask |
| logw | ([batch size, DurationPredictor 이후(=1), max(x_lengths) per batch]) | DurationPredictor()를 활용하여 세 번의 Conv1d 연산을 진행하는데, 마지막 Conv1d layer(proj)에서 x_mask와 동일하게 채널 차원 x.size(1)을 1로 변경 |
| x_mask | ([batch size, 길이 정보(=1차원), max(x_lengths) per batch]) | 각 데이터의 실제 길이(x_lengths)까지는 1, 그 이후(batch 내 가장 긴 길이까지의 패딩 구간)는 0으로 masking한 텐서 (sequence_mask의 True/False 결과를 x.dtype으로 변환) |
이상 Encoder부분까지 코드 분석을 해보았습니다.
Decoder 부분은 추후 시간이 생기면 정리해 보도록 하겠습니다.
긴 글 읽어주셔서 감사합니다:)