[1] Download and inspect the dataset
Download the sarcasm data
import requests
url = 'https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json'
filename = 'sarcasm.json'
response = requests.get(url)
with open(filename, 'wb') as f:
    f.write(response.content)
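Note that requests does not raise on HTTP errors by itself, so a failed download would silently write an error page to disk. A minimal safeguard, reusing the url and filename defined above, is to call raise_for_status() before writing:

response = requests.get(url)
response.raise_for_status()  # raises requests.HTTPError on a 4xx/5xx status
with open(filename, 'wb') as f:
    f.write(response.content)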
Load the JSON data
import json
with open('./sarcasm.json', 'r') as f:
datastore = json.load(f)
print(len(datastore))
print(datastore[0])
print(datastore[20000])
# output
26709
{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", 'is_sarcastic': 0}
{'article_link': 'https://www.theonion.com/pediatricians-announce-2011-newborns-are-ugliest-babies-1819572977', 'headline': 'pediatricians announce 2011 newborns are ugliest babies in 30 years', 'is_sarcastic': 1}
An is_sarcastic value of 0 marks a genuine headline:
{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", 'is_sarcastic': 0}
An is_sarcastic value of 1 marks a sarcastic headline:
{'article_link': 'https://www.theonion.com/pediatricians-announce-2011-newborns-are-ugliest-babies-1819572977', 'headline': 'pediatricians announce 2011 newborns are ugliest babies in 30 years', 'is_sarcastic': 1}
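Before building a model it is worth checking how the two labels are distributed. A quick sketch with collections.Counter over the datastore loaded above (the exact counts depend on the file):

from collections import Counter

label_counts = Counter(item['is_sarcastic'] for item in datastore)
print(label_counts)  # counts of non-sarcastic (0) and sarcastic (1) headlines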
Build the dataset
sentences = []
labels = []
urls = []
for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])
print("\nsentences : ", sentences[:3])
print("\nlabels : ", labels[:3])
print("\nurls : ", urls[:3])
# output
sentences : ["former versace store clerk sues over secret 'black code' for minority shoppers", "the 'roseanne' revival catches up to our thorny political mood, for better and worse", "mom starting to fear son's web series closest thing she will have to grandchild"]
labels : [0, 0, 1]
urls : ['https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365', 'https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697']
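These lists are usually split into training and validation sets before tokenizing, so the tokenizer never sees the validation headlines. A minimal sketch using plain list slicing; training_size = 20000 is an assumed split point for this 26709-item dataset, not something fixed by the source:

training_size = 20000  # assumed hyperparameter

training_sentences = sentences[:training_size]
validation_sentences = sentences[training_size:]
training_labels = labels[:training_size]
validation_labels = labels[training_size:]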
[2] Preprocessing the headlines
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(f"number of words in word_index", {len(word_index)})
print()
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')
# output
number of words in word_index: 29657
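Because the tokenizer was created with oov_token='<OOV>', that token takes index 1 in word_index, and any word outside the fitted vocabulary is mapped to it rather than dropped. A small check with a made-up sentence (the word 'flurbish' is invented and should not be in the vocabulary):

test_seq = tokenizer.texts_to_sequences(['my grandchild will flurbish the web'])
print(test_seq)  # unseen words such as 'flurbish' appear as the <OOV> index, 1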
Check a sample
index = 2
print(f"sample headline : {sentences[index]}")
print(f"padded sequence : {padded[index]}")
print()
print(f"shape of padded sequences : {padded.shape}")
# output
sample headline : mom starting to fear son's web series closest thing she will have to grandchild
padded sequence : [ 145 838 2 907 1749 2093 582 4719 221 143 39 46
2 10736 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0]
shape of padded sequences : (26709, 40)
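As a final sanity check, tokenizer.index_word maps indices back to words, so a padded sequence can be decoded into (lowercased, punctuation-stripped) text, skipping the zero padding:

decoded = ' '.join(tokenizer.index_word[i] for i in padded[index] if i != 0)
print(decoded)  # should roughly reproduce the sample headline printed above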