!mkdir -p ~/data
!mkdir -p ~/models
!wget https://d3s0tskafalll9.cloudfront.net/media/documents/song_lyrics.zip
!unzip song_lyrics.zip -d ~/data/lyrics
!ls ~/data/lyrics
import re
import glob
import os
import tensorflow as tf
txt_file_path = os.getenv('HOME')+'/data/lyrics/*'
txt_list = glob.glob(txt_file_path)
raw_corpus = []
for txt_file in txt_list:
    with open(txt_file, "r") as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)
print("데이터 크기:", len(raw_corpus))
print("Examples:\n", raw_corpus[:3])
for idx, sentence in enumerate(raw_corpus):
    if len(sentence) == 0: continue   # skip empty lines
    if sentence[-1] == ":": continue  # skip lines that end with ':' (e.g. part labels)

    if idx > 9: break

    print(sentence)
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()                   # lowercase and trim whitespace
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)    # put spaces around punctuation
    sentence = re.sub(r'[" "]+', " ", sentence)           # collapse runs of spaces/quotes
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)  # replace anything else with a space
    sentence = sentence.strip()
    sentence = '<start> ' + sentence + ' <end>'           # mark sentence boundaries
    return sentence
print(preprocess_sentence("This @_is ;;;sample sentence."))
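# Expected output of the check above, following directly from the regexes:
# <start> this is sample sentence . <end>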
corpus = []
for sentence in raw_corpus:
    if len(sentence) == 0: continue
    if sentence[-1] == ":": continue

    preprocessed_sentence = preprocess_sentence(sentence)
    corpus.append(preprocessed_sentence)
corpus[:10]
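# Optional sanity check, a minimal sketch using only the corpus built above:
# tokenize() below pads and truncates every sentence to 15 tokens, so it is
# worth knowing how many sentences are longer than that.
lengths = [len(sentence.split()) for sentence in corpus]
print("longest sentence (tokens):", max(lengths))
print("sentences longer than 15 tokens:", sum(1 for n in lengths if n > 15))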
def tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=10000,   # keep only the 10,000 most frequent words
        filters=' ',       # the corpus is already cleaned, so split on spaces only
        oov_token="<unk>"  # out-of-vocabulary words are mapped to <unk>
    )
    tokenizer.fit_on_texts(corpus)
    tensor = tokenizer.texts_to_sequences(corpus)
    # Pad at the end ('post') so every row is exactly 15 tokens long.
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, maxlen=15, padding='post')

    print(tensor, tokenizer)
    return tensor, tokenizer
tensor, tokenizer = tokenize(corpus)
print(tensor[:3, :10])
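# A quick round-trip sketch: sequences_to_texts maps token ids back to words,
# which is an easy way to see what the first tokenized row encodes (the
# padding zeros are stripped first so they do not clutter the output).
first_row = [idx for idx in tensor[0] if idx != 0]
print(tokenizer.sequences_to_texts([first_row]))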
for idx in tokenizer.index_word:
    print(idx, ":", tokenizer.index_word[idx])

    if idx >= 10: break
src_input = tensor[:, :-1]  # source sentences: drop the last token of every row
tgt_input = tensor[:, 1:]   # target sentences: drop <start>, i.e. the source shifted left by one

print(src_input[0])
print(tgt_input[0])
BUFFER_SIZE = len(src_input)
BATCH_SIZE = 256
steps_per_epoch = len(src_input) // BATCH_SIZE
VOCAB_SIZE = tokenizer.num_words + 1  # 10,000 kept words + 1 for the padding index 0
print(VOCAB_SIZE)
dataset = tf.data.Dataset.from_tensor_slices((src_input, tgt_input))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset
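# A small optional check: element_spec shows that each element is a
# (source, target) pair of shape (BATCH_SIZE, 14), since the split above
# removed one token from each side of the 15-token rows.
print(dataset.element_spec)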
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()

        # Embedding -> two stacked LSTMs -> projection back to vocabulary logits.
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)

    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)

        return out
embedding_size = 256
hidden_size = 1024
model = TextGenerator(VOCAB_SIZE, embedding_size, hidden_size)
# Run one batch through the model so the subclassed model gets built;
# otherwise model.summary() cannot report its parameters yet.
for src_sample, tgt_sample in dataset.take(1): break

model(src_sample)
model.summary()
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,  # the Dense layer outputs raw logits, not probabilities
    reduction='none'   # keep per-token loss values
)
model.compile(loss=loss, optimizer=optimizer)
model.fit(dataset, epochs=10)  # the number of epochs is a tunable choice
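# The notebook creates ~/models at the top but never writes to it; a minimal
# sketch for saving the trained weights there in TensorFlow checkpoint format
# (the 'lyricist' file name is an arbitrary choice):
model.save_weights(os.getenv('HOME') + '/models/lyricist')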
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
    # Encode the seed sentence into token ids.
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

    # Repeatedly feed the sequence back into the model and append the most
    # likely next word until <end> is produced or max_len is reached.
    while True:
        predict = model(test_tensor)

        tmp = 1                 # temperature; 1 leaves the logits unchanged
        predict = predict / tmp

        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1]
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)

        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated
generate_text(model, tokenizer, init_sentence="<start> i")
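# A couple of extra seed prompts (the prompts themselves are arbitrary) to see
# how the model continues different openings:
print(generate_text(model, tokenizer, init_sentence="<start> i love", max_len=20))
print(generate_text(model, tokenizer, init_sentence="<start> you are", max_len=20))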