[1] Download the Dataset
먼저 해당 데이터를 다운로드한다.
import requests
# Fetch the sarcasm headlines dataset and store it next to the notebook.
url = 'https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json'
filename = 'sarcasm.json'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of saving an error page as the dataset.
response.raise_for_status()
with open(filename, 'wb') as f:
    f.write(response.content)
import json
# JSON is UTF-8 by specification; do not rely on the platform default encoding.
with open(filename, 'r', encoding='utf-8') as f:
    datastore = json.load(f)

# Each record contributes its headline text and its is_sarcastic label.
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]
[2] Split the Dataset
# Number of leading examples used for training; the remainder is the test set.
# The previous value (200000) exceeded the size of the dataset (roughly 26.7k
# headlines), which left the test split empty and broke validation.
training_size = 20000

training_sentences = sentences[:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[:training_size]
testing_labels = labels[training_size:]
[3] Data preprocessing
import tensorflow as tf
import numpy as np 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Preprocessing hyperparameters, named once so the train and test paths
# cannot drift apart. VOCAB_SIZE and MAX_LENGTH correspond to the values the
# Embedding layer in the model cell is configured with (10000 and 120).
VOCAB_SIZE = 10000
MAX_LENGTH = 120
OOV_TOKEN = '<OOV>'
PADDING_TYPE = 'post'
TRUNC_TYPE = 'post'

# Fit the vocabulary on the training sentences only; the test sentences are
# encoded with that same tokenizer.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Convert sentences to integer sequences and pad/truncate them to MAX_LENGTH.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
padded_train = pad_sequences(training_sequences, padding=PADDING_TYPE,
                             truncating=TRUNC_TYPE, maxlen=MAX_LENGTH)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
padded_test = pad_sequences(testing_sequences, padding=PADDING_TYPE,
                            truncating=TRUNC_TYPE, maxlen=MAX_LENGTH)

# Labels as NumPy arrays for model.fit.
final_train_labels = np.array(training_labels)
final_test_labels = np.array(testing_labels)
# Misspelled name kept as an alias because the training cell refers to it.
fianl_test_labels = final_test_labels
[4] Build and Compile the Model
해당 모델의 아키텍처는 IMDB review 모델과 유사한 구조를 가지고 있으며, 양방향 LSTM 레이어를 1층 쌓았다.
# Assemble the classifier layer by layer.
model = tf.keras.Sequential()
# Embedding over a 10000-entry vocabulary, 16 dimensions, input length 120.
model.add(tf.keras.layers.Embedding(10000, 16, input_shape=(120,)))
# Single bidirectional LSTM layer with 32 units.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
model.add(tf.keras.layers.Dense(24, activation='relu'))
# One sigmoid unit for the binary output.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()


[5] Train the model
# Train for 10 epochs, validating on the padded test split.
# `fianl_test_labels` (sic) is the name defined in the preprocessing cell.
history = model.fit(
    x=padded_train,
    y=final_train_labels,
    validation_data=(padded_test, fianl_test_labels),
    epochs=10,
)
import matplotlib.pyplot as plt
# Plot Utility
def plot_graphs(history, string):
    """Plot one metric and its ``val_`` counterpart from a fit history.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value sequences (as returned by ``model.fit``).
        string: Metric name to plot; ``'val_' + string`` is plotted as well.
    """
    curve_names = [string, 'val_' + string]
    for curve in curve_names:
        plt.plot(history.history[curve])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(curve_names)
    plt.show()
# Draw the accuracy curve first, then the loss curve.
for metric in ('accuracy', 'loss'):
    plot_graphs(history, metric)
