[1] Download and prepare the dataset
import tensorflow_datasets as tfds
# Pipeline hyperparameters: shuffle buffer size and examples per batch
BUFFER_SIZE = 10000
BATCH_SIZE = 256

# Fetch the pre-tokenized (8k subword vocabulary) IMDB reviews as
# (text, label) pairs, together with the dataset metadata
dataset, info = tfds.load(
    'imdb_reviews/subwords8k',
    with_info=True,
    as_supervised=True,
)

# The subword encoder is stored on the 'text' feature of the metadata
tokenizer = info.features['text'].encoder

# Pull out the train and test splits
train_data = dataset['train']
test_data = dataset['test']

# Training pipeline: shuffle first, then batch with padding so the
# variable-length reviews in a batch share one length
train_dataset = train_data.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)

# Test pipeline: padded batches only, no shuffling
test_dataset = test_data.padded_batch(BATCH_SIZE)
[2] Build the Model
먼저, 입력이 Conv1D와 GlobalMaxPooling1D 레이어를 차례로 통과하면서 형태(shape)가 어떻게 변하는지 아래 예시를 참고한다.
import tensorflow as tf
import numpy as np
# Shape walkthrough: print the tensor shape before the convolution,
# after Conv1D, and after GlobalMaxPooling1D.

# Random 3-D input of shape (1, 20, 20)
random_input = np.random.rand(1, 20, 20)
print(random_input.shape)

# 128 filters with a window of 5 positions
conv1d = tf.keras.layers.Conv1D(128, 5, activation='relu')
conv_output = conv1d(random_input)
print(conv_output.shape)

# Global max pooling over the convolution output
gmp = tf.keras.layers.GlobalMaxPooling1D()
result = gmp(conv_output)
print(result.shape)
# output
(1, 20, 20)
(1, 16, 128)
(1, 128)
아래는 새로운 모델 아키텍처이다.
# Text classifier: embedding -> 1D convolution -> global max pooling ->
# dense hidden layer -> single sigmoid output.
model = tf.keras.models.Sequential([
    # One 64-dimensional vector per entry of the tokenizer vocabulary
    tf.keras.layers.Embedding(tokenizer.vocab_size, 64),
    # NOTE(review): only 5 filters here, while the shape demo in this file
    # uses 128 -- confirm that this small filter count is intended.
    tf.keras.layers.Conv1D(filters=5, kernel_size=5, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    # Single sigmoid unit, paired with the binary cross-entropy loss below
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# The loss identifier must be spelled exactly 'binary_crossentropy';
# Keras cannot resolve a misspelled loss name and raises an error.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Print the layer-by-layer summary of the model
model.summary()
[3] Train the model
# Train for 10 epochs on the training batches, validating against the
# test batches; the returned object holds the per-epoch metric history.
history = model.fit(
    x=train_dataset,
    epochs=10,
    validation_data=test_dataset,
)
학습 과정 플로팅 해보기
import matplotlib.pyplot as plt
# Plot utility
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch sequences of values.
        string: Name of the metric to plot; the series stored under
            ``'val_' + string`` is drawn on the same axes.
    """
    val_name = 'val_' + string
    series_names = [string, val_name]

    # Training curve first, validation curve second, so the legend
    # entries line up with the plotted lines.
    for series in series_names:
        plt.plot(history.history[series])

    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(series_names)
    plt.show()
# Draw one figure per tracked metric: accuracy first, then loss
for metric_name in ("accuracy", "loss"):
    plot_graphs(history, metric_name)
여기서는 텍스트 분류에 사용할 수 있는 또 다른 모델 아키텍처인 컨볼루션 레이어를 살펴봤다.