Python programming: sequence types
list1 = list()        # create an empty list
list2 = list('ABCD')  # list(iterable object)
# ['A', 'B', 'C', 'D']
list3 = list(range(10))  # [0,1,2,3,4,5,6,7,8,9]

def even_generator():
    for i in range(10):
        if i % 2 == 0:
            yield i

list4 = list(even_generator())                      # [0, 2, 4, 6, 8]
list5 = list((i for i in range(10) if i % 2 == 0))  # generator expression
list6 = list(i for i in range(10) if i % 2 == 0)    # works without the inner ()
print('list1', list1)
print('list2', list2)
print('list3', list3)
print('list4', list4)
print('list5', list5)
print('list6', list6)
list1 []
list2 ['A', 'B', 'C', 'D']
list3 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
list4 [0, 2, 4, 6, 8]
list5 [0, 2, 4, 6, 8]
list6 [0, 2, 4, 6, 8]
We develop the ability to use words in the right places statistically, not from grammatical principles.
sentence = " 나는 밥을 먹었다 "  # Korean for "I ate a meal"
source_sentence = "<start>" + sentence
target_sentence = sentence + "<end>"
print("Source sentence:", source_sentence)
print("Target sentence:", target_sentence)
Source sentence: <start> 나는 밥을 먹었다
Target sentence: 나는 밥을 먹었다 <end>
$ P(w_n \mid w_1, \dots, w_{n-1}; \theta) $
A dataset where the word sequence up to the (n-1)-th word becomes x_train and the n-th word becomes y_train.
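To make the shift concrete, here is a tiny illustration (hypothetical tokens, not from the lesson) of how one tokenized sentence yields a source/target pair:

# A minimal sketch: the model reads tokens[:-1] and is trained to
# predict tokens[1:], i.e., the next word at every position.
tokens = ["<start>", "i", "ate", "a", "meal", "<end>"]

src = tokens[:-1]   # ['<start>', 'i', 'ate', 'a', 'meal']
tgt = tokens[1:]    # ['i', 'ate', 'a', 'meal', '<end>']

for x, y in zip(src, tgt):
    print(f"given ... {x!r} -> predict {y!r}")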
import os, re
import numpy as np
import tensorflow as tf
# Open the file in read mode and
# read it line by line into a list.
file_path = './lyricist/data/shakespeare.txt'
with open(file_path, "r") as f:
    raw_corpus = f.read().splitlines()

# Let's print the first 9 lines to take a look.
print(raw_corpus[:9])
['First Citizen:', 'Before we proceed any further, hear me speak.', '', 'All:', 'Speak, speak.', '', 'First Citizen:', 'You are all resolved rather to die than to famish?', '']
for idx, sentence in enumerate(raw_corpus):
    if len(sentence) == 0: continue   # skip empty sentences
    if sentence[-1] == ":": continue  # skip sentences that end with ":"
    if idx > 9: break                 # check only the first 10 lines for now
    print(sentence)
The first words that come out
And I can see this song will be about you
I can't believe that I can breathe without you
But all I need to do is carry on
The next line I write down
And there's a tear that falls between the pages
I know that pain's supposed to heal in stages
But it depends which one I'm standing on I write lines down, then rip them up
Describing love can't be this tough I could set this song on fire, send it up in smoke
I could throw it in the river and watch it sink in slowly
# Clean up the input sentence:
# 1. lowercase it and strip the surrounding whitespace
# 2. put a space on both sides of special characters
# 3. collapse multiple spaces into a single space
# 4. replace every character other than a-zA-Z?.!,¿ with a space
# 5. strip the surrounding whitespace again
# 6. add <start> at the beginning and <end> at the end
# Processing in this order prevents trouble later!
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()                   # 1
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)    # 2
    sentence = re.sub(r'[" "]+', " ", sentence)           # 3
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)  # 4
    sentence = sentence.strip()                           # 5
    sentence = '<start> ' + sentence + ' <end>'           # 6
    return sentence

# See how this sentence gets filtered.
print(preprocess_sentence("This @_is ;;;sample sentence."))
<start> this is sample sentence . <end>
Now we have a cleaning function that turns even messy sentences into tidy ones.
- It also adds \<start> and \<end>.
# Collect the cleaned sentences here
corpus = []

for sentence in raw_corpus:
    # Skip the sentences we don't want
    if len(sentence) == 0: continue
    if sentence[-1] == ":": continue
    # Clean the sentence and add it to the corpus
    preprocessed_sentence = preprocess_sentence(sentence)
    corpus.append(preprocessed_sentence)

# Let's check just 10 of the cleaned results
corpus[:10]
['<start> before we proceed any further , hear me speak . <end>',
'<start> speak , speak . <end>',
'<start> you are all resolved rather to die than to famish ? <end>',
'<start> resolved . resolved . <end>',
'<start> first , you know caius marcius is chief enemy to the people . <end>',
'<start> we know t , we know t . <end>',
'<start> let us kill him , and we ll have corn at our own price . <end>',
'<start> is t a verdict ? <end>',
'<start> no more talking on t let it be done away , away ! <end>',
'<start> one word , good citizens . <end>']
- Tokenize the cleaned data
- Build a word dictionary (called a vocabulary or dictionary)
- Convert the data to numbers
→ This is vectorization; the data converted to numbers is called a tensor.
# For tokenization we use TensorFlow's Tokenizer and pad_sequences.
# The documents below are good references for learning more:
# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer
# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences
def tokenize(corpus):
    # Build a tokenizer that keeps the 7,000 most frequent words.
    # We already cleaned the sentences, so no filters are needed.
    # Words outside the 7,000 are replaced with '<unk>'.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=7000,
        filters=' ',
        oov_token="<unk>"
    )
    # Build the tokenizer's internal vocabulary from the corpus
    tokenizer.fit_on_texts(corpus)
    # Use the fitted tokenizer to convert the corpus into sequences
    tensor = tokenizer.texts_to_sequences(corpus)
    # Make every input sequence the same length.
    # Shorter sequences get padding appended after the sentence;
    # use padding='pre' if you want the padding at the front instead.
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    print(tensor, tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(corpus)
[[ 2 143 40 ... 0 0 0]
[ 2 110 4 ... 0 0 0]
[ 2 11 50 ... 0 0 0]
...
[ 2 149 4553 ... 0 0 0]
[ 2 34 71 ... 0 0 0]
[ 2 945 34 ... 0 0 0]] <keras_preprocessing.text.Tokenizer object at 0x7fdf6cb56df0>
→ Print the tensor data up to the 3rd row and the 10th column only
print(tensor[:3, :10])
[[ 2 143 40 933 140 591 4 124 24 110]
[ 2 110 4 110 5 3 0 0 0 0]
[ 2 11 50 43 1201 316 9 201 74 9]]
for idx in tokenizer.index_word:
    print(idx, ":", tokenizer.index_word[idx])
    if idx >= 10: break
1 : <unk>
2 : <start>
3 : <end>
4 : ,
5 : .
6 : the
7 : and
8 : i
9 : to
10 : of
# Slice off the last token of each row to build the source sentences.
# That last token is much more likely to be <pad> than <end>.
src_input = tensor[:, :-1]
# Slice off <start> to build the target sentences.
tgt_input = tensor[:, 1:]

print(src_input[0])
print(tgt_input[0])
[ 2 143 40 933 140 591 4 124 24 110 5 3 0 0 0 0 0 0
0 0]
[143 40 933 140 591 4 124 24 110 5 3 0 0 0 0 0 0 0
0 0]
BUFFER_SIZE = len(src_input)
BATCH_SIZE = 256  # why 256?
steps_per_epoch = len(src_input) // BATCH_SIZE

# 7,000 words in the dictionary the tokenizer built, plus 0:<pad>,
# which is not part of it, for a total of 7,001.
VOCAB_SIZE = tokenizer.num_words + 1
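A quick check (illustrative; uses the tokenizer fitted above) that index 0 really is reserved for padding and never assigned to a word:

print(0 in tokenizer.index_word)      # False (word indices start at 1)
print(tokenizer.word_index["<unk>"])  # 1, the oov_token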
# Build a dataset from the data source we prepared.
# For datasets, see the document below.
# It is an important document; the better you know it, the more it helps.
# https://www.tensorflow.org/api_docs/python/tf/data/Dataset
dataset = tf.data.Dataset.from_tensor_slices((src_input, tgt_input))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset
<BatchDataset shapes: ((256, 20), (256, 20)), types: (tf.int32, tf.int32)>
The structure we are going to build:
- subclasses tf.keras.Model
- consists of 1 Embedding layer, 2 LSTM layers, and 1 Dense layer
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)

    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        return out

embedding_size = 256
hidden_size = 1024
model = TextGenerator(tokenizer.num_words + 1, embedding_size, hidden_size)
In the code above, embedding_size is the dimensionality of the word vectors, i.e., the size of the abstract representation each word gets.
For example, if it were 2:
- cold: [0.0, 1.0]
- hot: [1.0, 0.0]
- lukewarm: [0.5, 0.5]
The larger it is, the more abstract features it can capture, but without enough data it only causes confusion.
For this problem it is set to 256.
hidden_size, the dimensionality of the LSTM layers' hidden state, works the same way.
Here 1,024 is said to be about right (why? a fair question).
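As a toy illustration of the lookup idea (the numbers here are made up for this note, not part of the lesson), an Embedding layer is just a trainable table mapping each word index to a dense vector of length embedding_size:

import tensorflow as tf

toy_embedding = tf.keras.layers.Embedding(input_dim=5, output_dim=2)  # 5 words, 2-dim vectors
word_ids = tf.constant([[1, 3, 4]])  # one "sentence" of three word indices
vectors = toy_embedding(word_ids)    # table lookup, shape (1, 3, 2)
print(vectors.shape)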
The model is not built yet.
model.build() is called automatically once the model's input shape is determined.
# This is how to pull just one batch from the dataset.
# Don't get too absorbed in how it works for now~
for src_sample, tgt_sample in dataset.take(1): break

# Feed the single batch we fetched into the model
model(src_sample)
<tf.Tensor: shape=(256, 20, 7001), dtype=float32, numpy=
array([[[-4.31300432e-05, -1.61468924e-04, -2.28978643e-05, ...,
-1.30526796e-05, -1.65740246e-04, -1.52532608e-04],
[-3.31729389e-04, -3.35492572e-04, 9.25426939e-05, ...,
-5.40830388e-06, -3.98255826e-04, -1.54450114e-04],
[-3.56497185e-04, -5.21454436e-04, 9.67504602e-05, ...,
5.08658530e-04, -2.61457870e-04, -4.00694320e-04],
...,
[-1.85111989e-04, -4.65744082e-03, -2.55297264e-03, ...,
-1.98253267e-03, 1.08430139e-03, -1.12067943e-03],
[-2.26652643e-04, -4.91228886e-03, -2.57866457e-03, ...,
-2.21222127e-03, 1.26814318e-03, -1.42959598e-03],
[-2.72341655e-04, -5.11216559e-03, -2.58543598e-03, ...,
-2.40770658e-03, 1.43469241e-03, -1.71830074e-03]],
[[-4.31300432e-05, -1.61468924e-04, -2.28978643e-05, ...,
-1.30526796e-05, -1.65740246e-04, -1.52532608e-04],
[-5.15408523e-04, -2.02041178e-04, 1.23195961e-04, ...,
2.49126664e-04, -3.87586042e-04, -2.07115838e-04],
[-4.60794341e-04, -6.10847856e-05, 1.97986214e-04, ...,
4.49985295e-04, -6.17051148e-04, -3.10465286e-04],
...,
[ 8.63971800e-05, -4.08918737e-03, -2.27619591e-03, ...,
-2.37792335e-03, 3.57422628e-04, -1.23522384e-03],
[ 4.76920541e-05, -4.44815168e-03, -2.36615562e-03, ...,
-2.57940381e-03, 6.33708783e-04, -1.43076328e-03],
[-4.88970727e-06, -4.74188896e-03, -2.42232508e-03, ...,
-2.74938415e-03, 8.77008599e-04, -1.63904799e-03]],
[[-4.31300432e-05, -1.61468924e-04, -2.28978643e-05, ...,
-1.30526796e-05, -1.65740246e-04, -1.52532608e-04],
[-3.92017188e-04, -4.05272236e-04, -4.03737446e-04, ...,
2.08484649e-04, -1.78227972e-04, -1.51166416e-04],
[-6.78456272e-04, -8.21831811e-04, -4.12415975e-04, ...,
7.16181123e-04, -2.64001399e-04, -1.19791046e-04],
...,
[ 6.99736876e-04, -3.99236009e-03, -1.08700444e-03, ...,
-1.69491197e-03, 2.50219717e-04, -7.33265886e-04],
[ 6.50285685e-04, -4.42640856e-03, -1.28968060e-03, ...,
-1.98570453e-03, 5.35400759e-04, -1.02897582e-03],
[ 5.66453091e-04, -4.77890717e-03, -1.45729876e-03, ...,
-2.23665801e-03, 7.83600728e-04, -1.31549360e-03]],
...,
[[-4.31300432e-05, -1.61468924e-04, -2.28978643e-05, ...,
-1.30526796e-05, -1.65740246e-04, -1.52532608e-04],
[ 9.46695582e-06, -1.02088648e-04, 2.74929276e-04, ...,
2.84693117e-04, -2.84875568e-04, -2.80248147e-04],
[ 2.56561616e-04, -2.39318193e-04, 5.11636317e-04, ...,
1.42004428e-04, -4.73114924e-04, -4.22215729e-04],
...,
[ 3.18835628e-05, -5.07764611e-03, -2.44524307e-03, ...,
-2.17102608e-03, 9.32406518e-04, -1.62668424e-04],
[-3.98430202e-05, -5.27923508e-03, -2.50015478e-03, ...,
-2.35773833e-03, 1.14679523e-03, -5.46753930e-04],
[-1.08012224e-04, -5.43248793e-03, -2.52589677e-03, ...,
-2.51999241e-03, 1.33289036e-03, -9.18680802e-04]],
[[-4.31300432e-05, -1.61468924e-04, -2.28978643e-05, ...,
-1.30526796e-05, -1.65740246e-04, -1.52532608e-04],
[-2.36422085e-04, -2.61728710e-04, 2.46185751e-04, ...,
-6.08692972e-05, -3.13386961e-04, -1.68910818e-04],
[-2.05222939e-04, -2.19296082e-04, 1.04809449e-04, ...,
1.52687702e-04, -2.95088452e-04, -1.40395117e-04],
...,
[-3.88026383e-04, -4.55201278e-03, -2.38304026e-03, ...,
-2.11151317e-03, 1.44088583e-03, -1.46701431e-03],
[-4.51929373e-04, -4.85128677e-03, -2.44412408e-03, ...,
-2.32648896e-03, 1.53514405e-03, -1.73510751e-03],
[-5.07737219e-04, -5.08674514e-03, -2.47974368e-03, ...,
-2.51027266e-03, 1.62658445e-03, -1.98661070e-03]],
[[-4.31300432e-05, -1.61468924e-04, -2.28978643e-05, ...,
-1.30526796e-05, -1.65740246e-04, -1.52532608e-04],
[-9.77924428e-05, -4.37246083e-04, 3.97726777e-04, ...,
5.69955155e-05, -2.75543978e-04, -6.65865809e-05],
[-1.78766222e-05, -7.11038534e-04, 5.13010891e-04, ...,
6.48956920e-05, -4.87306388e-04, 4.01803874e-04],
...,
[ 7.31389737e-04, -3.84947122e-03, -2.41923332e-03, ...,
-2.36577378e-03, 1.51708734e-03, -6.34322700e-04],
[ 5.93748991e-04, -4.21182066e-03, -2.47242069e-03, ...,
-2.56023020e-03, 1.65442866e-03, -9.69938294e-04],
[ 4.57269518e-04, -4.51200176e-03, -2.50035292e-03, ...,
-2.72135716e-03, 1.77274307e-03, -1.29425328e-03]]],
dtype=float32)>
# Finally, print model.summary()
model.summary()
Model: "text_generator"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding (Embedding) multiple 1792256
_________________________________________________________________
lstm (LSTM) multiple 5246976
_________________________________________________________________
lstm_1 (LSTM) multiple 8392704
_________________________________________________________________
dense (Dense) multiple 7176025
=================================================================
Total params: 22,607,961
Trainable params: 22,607,961
Non-trainable params: 0
_________________________________________________________________
Output Shape???
- The model has not yet seen the length of its input sequences, so the Output Shape cannot be determined ("multiple").
The model's parameter size is measured, though:
- 22 million
- GPT-2's parameter size: 1.5 billion
- GPT-3's parameter size: about 100x that of GPT-2
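The 22 million figure can be verified by hand. The per-layer counts below use the standard Keras formulas (vocab x emb for the Embedding, 4 x (input + hidden + 1) x hidden per LSTM, hidden x vocab + vocab for the Dense layer) and reproduce the summary exactly:

vocab, emb, hid = 7001, 256, 1024

embedding = vocab * emb             # 1,792,256
lstm_1 = 4 * (emb + hid + 1) * hid  # 5,246,976
lstm_2 = 4 * (hid + hid + 1) * hid  # 8,392,704
dense = hid * vocab + vocab         # 7,176,025

print(embedding + lstm_1 + lstm_2 + dense)  # 22,607,961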
# We will learn about optimizers, losses, and so on little by little.
# If you want a head start, see the documents below:
# https://www.tensorflow.org/api_docs/python/tf/keras/optimizers
# https://www.tensorflow.org/api_docs/python/tf/keras/losses
# They are quite long, so reading them right now is not recommended.
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none'
)
model.compile(loss=loss, optimizer=optimizer)
model.fit(dataset, epochs=30)
Epoch 1/30
93/93 [==============================] - 21s 203ms/step - loss: 1.8062
Epoch 2/30
93/93 [==============================] - 20s 216ms/step - loss: 1.6858
Epoch 3/30
93/93 [==============================] - 19s 205ms/step - loss: 1.6052
Epoch 4/30
93/93 [==============================] - 19s 200ms/step - loss: 1.5281
Epoch 5/30
93/93 [==============================] - 19s 202ms/step - loss: 1.4540
Epoch 6/30
93/93 [==============================] - 19s 207ms/step - loss: 1.3776
Epoch 7/30
93/93 [==============================] - 19s 206ms/step - loss: 1.3050
Epoch 8/30
93/93 [==============================] - 19s 203ms/step - loss: 1.2305
Epoch 9/30
93/93 [==============================] - 19s 203ms/step - loss: 1.1576
Epoch 10/30
93/93 [==============================] - 19s 204ms/step - loss: 1.0847
Epoch 11/30
93/93 [==============================] - 19s 206ms/step - loss: 1.0169
Epoch 12/30
93/93 [==============================] - 19s 205ms/step - loss: 0.9519
Epoch 13/30
93/93 [==============================] - 19s 203ms/step - loss: 0.8902
Epoch 14/30
93/93 [==============================] - 19s 203ms/step - loss: 0.8373
Epoch 15/30
93/93 [==============================] - 19s 203ms/step - loss: 0.7912
Epoch 16/30
93/93 [==============================] - 19s 203ms/step - loss: 0.7523
Epoch 17/30
93/93 [==============================] - 19s 204ms/step - loss: 0.7208
Epoch 18/30
93/93 [==============================] - 19s 204ms/step - loss: 0.6963
Epoch 19/30
93/93 [==============================] - 19s 204ms/step - loss: 0.6760
Epoch 20/30
93/93 [==============================] - 19s 205ms/step - loss: 0.6597
Epoch 21/30
93/93 [==============================] - 19s 205ms/step - loss: 0.6463
Epoch 22/30
93/93 [==============================] - 19s 204ms/step - loss: 0.6353
Epoch 23/30
93/93 [==============================] - 19s 203ms/step - loss: 0.6260
Epoch 24/30
93/93 [==============================] - 19s 203ms/step - loss: 0.6183
Epoch 25/30
93/93 [==============================] - 19s 203ms/step - loss: 0.6118
Epoch 26/30
93/93 [==============================] - 19s 203ms/step - loss: 0.6053
Epoch 27/30
93/93 [==============================] - 19s 203ms/step - loss: 0.6007
Epoch 28/30
93/93 [==============================] - 19s 203ms/step - loss: 0.5963
Epoch 29/30
93/93 [==============================] - 19s 204ms/step - loss: 0.5925
Epoch 30/30
93/93 [==============================] - 19s 205ms/step - loss: 0.5893
<keras.callbacks.History at 0x7fdedafa3400>
Think of the loss as a measure of how much the model is getting wrong (a loss of 1 does not mean it is 99% correct).
Since the error keeps shrinking, we can read this as training progressing well.
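For intuition (an aside, not from the lesson): cross-entropy is -log of the probability the model assigns to the correct token, so a per-token loss of 1.0 corresponds to roughly 37% confidence, and the final 0.59 to roughly 55%:

import math

for loss_value in (1.0, 0.5893):
    print(loss_value, "->", round(math.exp(-loss_value), 3))  # implied p(correct token)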
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
    # Convert the init_sentence we received into a tensor for the test
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

    # Build the sentence by predicting one word at a time:
    # 1. feed the tensor of the sentence so far into the model
    # 2. take the word index with the highest predicted probability
    # 3. append that word index to the end of the sentence
    # 4. stop once the model predicts <end> or max_len is reached
    while True:
        # 1
        predict = model(test_tensor)
        # 2
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1]
        # 3
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        # 4
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""
    # Use the tokenizer to turn the word indices back into words one by one
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated
generate_text() takes init_sentence as an argument and first converts it into a tensor.
By default that is just the single word \<start>.
- Suppose that on the first pass through the while loop, test_tensor holds only \<start>.
- Suppose our model picks word A out of the 7,001 as its output.
- On the second pass, test_tensor holds \<start> A.
- Suppose the model then picks B.
- On the third pass, test_tensor holds \<start> A B.
- And so on..... (the rest is omitted)
generate_text(model, tokenizer, init_sentence="<start> I")
'<start> i am not well , sir , i am not well . <end> '
import glob
import os

txt_file_path = './lyricist/data/lyrics/*'
txt_list = glob.glob(txt_file_path)
raw_corpus = []

# Read every txt file and collect the lines into raw_corpus.
for txt_file in txt_list:
    with open(txt_file, "r") as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

print("Data size:", len(raw_corpus))
print("Examples:\n", raw_corpus[:3])
Data size: 187088
Examples:
['The first words that come out', 'And I can see this song will be about you', "I can't believe that I can breathe without you"]
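From here the assignment reuses the pipeline built above on the lyrics corpus. A sketch of the elided intermediate steps, assuming the preprocess_sentence() and tokenize() helpers defined earlier:

# Clean the lyrics and tokenize them with the same helpers as before.
corpus = []
for sentence in raw_corpus:
    if len(sentence) == 0: continue  # skip empty lines
    corpus.append(preprocess_sentence(sentence))

tensor, tokenizer = tokenize(corpus)  # same 7,000-word tokenizer as before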
enc_train, enc_val, dec_train, dec_val = <write your code>
  File "/tmp/ipykernel_2308/3310982766.py", line 1
    enc_train, enc_val, dec_train, dec_val = <write your code>
                                             ^
SyntaxError: invalid syntax
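One plausible way to fill in the blank (a sketch assuming scikit-learn's train_test_split and the tensor from the sketch above; not the notebook's own solution, and the NameError that follows shows the blank was still unfilled when this notebook ran):

from sklearn.model_selection import train_test_split

src_input = tensor[:, :-1]  # drop the last token, as before
tgt_input = tensor[:, 1:]   # drop <start>

enc_train, enc_val, dec_train, dec_val = train_test_split(
    src_input, tgt_input, test_size=0.2, random_state=42)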
# Check the result
print("Source Train:", enc_train.shape)
print("Target Train:", dec_train.shape)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
/tmp/ipykernel_2308/1251382030.py in <module>
      1 # Check the result
----> 2 print("Source Train:", enc_train.shape)
3 print("Target Train:", dec_train.shape)
NameError: name 'enc_train' is not defined
# Building the AI
- Design a model that brings val_loss down to about 2.2 within 10 epochs by adjusting the model's Embedding Size and Hidden Size
- Use the Loss function as-is
# Loss
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
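The NameError below shows that lyricist was never defined before the generation cell ran. A hedged sketch of what could precede it, reusing the TextGenerator class and the splits above (the sizes are starting points to tune, not the assignment's answer):

lyricist = TextGenerator(tokenizer.num_words + 1, embedding_size=256, hidden_size=1024)
lyricist.compile(loss=loss, optimizer=tf.keras.optimizers.Adam())
lyricist.fit(enc_train, dec_train,
             validation_data=(enc_val, dec_val),
             batch_size=256, epochs=10)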
generate_text(lyricist, tokenizer, init_sentence="<start> i love", max_len=20)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
/tmp/ipykernel_2308/936477000.py in <module>
----> 1 generate_text(lyricist, tokenizer, init_sentence="<start> i love", max_len=20)
NameError: name 'lyricist' is not defined
Evaluation criteria
1. Does the lyric text generation model work properly?
- Does text generation produce plausible sentences?
- Were steps such as special-character removal, tokenizer creation, and padding all carried out without omission?
- Did the text generation model's validation loss drop to 2.2 or below?