[1] Download the Dataset
First, download the dataset.
import requests

# Download the sarcasm dataset as a JSON file.
url = 'https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json'
filename = 'sarcasm.json'

response = requests.get(url)
with open(filename, 'wb') as f:
    f.write(response.content)

import json

# Load the JSON and collect the headlines and their sarcasm labels.
with open(filename, 'r') as f:
    datastore = json.load(f)

sentences = []
labels = []

for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
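Each record in sarcasm.json is a JSON object with 'article_link', 'headline', and 'is_sarcastic' fields. A quick inspection (an illustrative check, not part of the original listing) confirms the structure and the dataset size:

# Inspect one record and the total number of headlines (about 26,000).
print(datastore[0])
print('Total headlines:', len(sentences))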
[2] Split the Dataset
# The dataset contains roughly 26,000 headlines, so use the first 20,000 for training
# and keep the remainder for validation.
training_size = 20000
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]
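A quick sanity check on the split (illustrative only) makes it easy to catch an oversized training_size, which would leave the test split empty:

# Roughly 20,000 training vs. ~6,700 test headlines.
print(len(training_sentences), len(testing_sentences))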
[3] Data Preprocessing
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from the training sentences only.
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Convert each sentence to an integer sequence and pad/truncate to length 120.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
padded_train = pad_sequences(training_sequences, padding='post', truncating='post', maxlen=120)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
padded_test = pad_sequences(testing_sequences, padding='post', truncating='post', maxlen=120)

# Keras expects NumPy arrays for the labels.
final_train_labels = np.array(training_labels)
final_test_labels = np.array(testing_labels)
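To verify the tokenizer round-trip, a padded sequence can be decoded back into words using the inverse of word_index. This is a minimal sketch for inspection only (index_word is a helper introduced here, not part of the original code):

# Map indices back to words and decode the first padded training sequence.
index_word = {index: word for word, index in word_index.items()}

sample = padded_train[0]
print(sample.shape)  # (120,)
print(' '.join(index_word.get(i, '?') for i in sample if i != 0))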
[4] Build and Compile the Model
The model architecture is similar to the one used for the IMDB review classifier, with a single bidirectional LSTM layer stacked on top of the embedding.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(10000, 16, input_shape=(120,)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',
              metrics=['accuracy'],
              optimizer='adam')

model.summary()
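The Bidirectional wrapper runs one LSTM forward and one backward over the sequence and concatenates their final states, so LSTM(32) yields a 64-dimensional feature vector before the dense layers. A quick shape check on a dummy batch (illustrative only, using the untrained model) confirms the output:

# Pass two padded training sequences through the model.
dummy_batch = padded_train[:2]
print(model(dummy_batch).shape)  # (2, 1): one sigmoid probability per headline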
[5] Train the Model
history = model.fit(padded_train, final_train_labels,
                    epochs=10,
                    validation_data=(padded_test, final_test_labels))
import matplotlib.pyplot as plt

# Plot Utility
def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_' + string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_' + string])
    plt.show()

# Plot the accuracy and loss history
plot_graphs(history, 'accuracy')
plot_graphs(history, 'loss')
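After training, the model can score new headlines by reusing the same tokenizer and padding settings. The snippet below is a minimal sketch; the example sentences are made up for illustration and are not from the dataset:

# Score hypothetical headlines; values near 1.0 indicate sarcasm.
new_headlines = [
    'granny starting to fear spiders in the garden might be real',
    'the weather today is bright and sunny'
]
new_sequences = tokenizer.texts_to_sequences(new_headlines)
new_padded = pad_sequences(new_sequences, padding='post', truncating='post', maxlen=120)
print(model.predict(new_padded))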