import os
import numpy as np
import tensorflow as tf

imdb = tf.keras.datasets.imdb

# Download the IMDb dataset
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)

print("Number of training samples: {}, number of test samples: {}".format(len(x_train), len(x_test)))
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
17465344/17464789 [==============================] - 0s 0us/step
17473536/17464789 [==============================] - 0s 0us/step
Number of training samples: 25000, number of test samples: 25000
print(x_train[0])  # the first review
print('Label: ', y_train[0])  # label of the first review
print('Length of the 1st review: ', len(x_train[0]))
print('Length of the 2nd review: ', len(x_train[1]))
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
Label:  1
Length of the 1st review:  218
Length of the 2nd review:  189

The reviews are not raw text; they have already been encoded as sequences of integers.
word_to_index = imdb.get_word_index()
index_to_word = {index:word for word, index in word_to_index.items()}
print(index_to_word[1])      # prints 'the'
print(word_to_index['the'])  # prints 1
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
1646592/1641221 [==============================] - 0s 0us/step
1654784/1641221 [==============================] - 0s 0us/step
the
1
The word_to_index and index_to_word mappings used for encoding the IMDb text need to be adjusted as shown below, because that is how the dataset provider set them up:

- word_to_index is sorted in descending order of word frequency in the IMDb text data.
- The actual encoding indices are shifted back by 3 relative to the provided word_to_index, to make room for special tokens.

# The actual encoding indices are offset by 3 from the provided word_to_index.
word_to_index = {k:(v+3) for k,v in word_to_index.items()}

word_to_index["<PAD>"] = 0
word_to_index["<BOS>"] = 1
word_to_index["<UNK>"] = 2   # unknown
word_to_index["<UNUSED>"] = 3

index_to_word = {index:word for word, index in word_to_index.items()}

print(index_to_word[1])      # prints '<BOS>'
print(word_to_index['the'])  # prints 4
print(index_to_word[4])      # prints 'the'
<BOS>
4
the
### Decoding the reviews
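The cell below calls a `get_decoded_sentence` helper that is not defined in this excerpt. A minimal sketch of what it might look like, assuming the shifted `index_to_word` built above (the name and exact behavior are assumptions, not the original definition):

```python
def get_decoded_sentence(encoded_sentence, index_to_word):
    # Skip the leading <BOS> token and map each index back to its word;
    # indices missing from the dictionary fall back to '<UNK>'.
    return ' '.join(index_to_word.get(index, '<UNK>') for index in encoded_sentence[1:])
```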
```python
print(get_decoded_sentence(x_train[0], index_to_word))
print('Label: ', y_train[0])  # label of the first review
```
this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert \ is an amazing actor and now the same being director \ father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for \ and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also \ to the two little boy's that played the \ of norman and paul they were just brilliant children are often left out of the \ list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you think the whole story was so lovely because it was true and was someone's life after all that was shared with us all
Label:  1
total_data_text = list(x_train) + list(x_test)
# Build a list of sentence lengths over the whole text dataset,
num_tokens = [len(tokens) for tokens in total_data_text]
num_tokens = np.array(num_tokens)
# then compute the mean, maximum, and standard deviation of the sentence lengths.
print('Mean sentence length: ', np.mean(num_tokens))
print('Max sentence length: ', np.max(num_tokens))
print('Sentence length std: ', np.std(num_tokens))

# For example, if we take the maximum length to be (mean + 2 * std),
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
maxlen = int(max_tokens)
print('pad_sequences maxlen : ', maxlen)
print('Fraction of all sentences within the maxlen setting: {}'.format(np.sum(num_tokens < max_tokens) / len(num_tokens)))
Mean sentence length:  234.75892
Max sentence length:  2494
Sentence length std:  172.91149458735703
pad_sequences maxlen :  580
Fraction of all sentences within the maxlen setting: 0.94536

In this case, maxlen=580.
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train,
                                                        value=word_to_index["<PAD>"],
                                                        padding='post',  # or 'pre'
                                                        maxlen=maxlen)

x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test,
                                                       value=word_to_index["<PAD>"],
                                                       padding='post',  # or 'pre'
                                                       maxlen=maxlen)
print(x_train.shape)
(25000, 580)
## Using 'pre' padding for RNNs
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train,
                                                        value=word_to_index["<PAD>"],
                                                        padding='pre',  # 'pre' instead of 'post'
                                                        maxlen=maxlen)

x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test,
                                                       value=word_to_index["<PAD>"],
                                                       padding='pre',  # 'pre' instead of 'post'
                                                       maxlen=maxlen)
print(x_train.shape)
(25000, 580)
Judging from this output alone, it looks the same as before, doesn't it???
vocab_size = 10000     # size of the vocabulary (10,000 words)
word_vector_dim = 16   # dimensionality of the word vectors (a tunable hyperparameter)
# Model definition - write the deep learning model yourself.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(tf.keras.layers.GlobalMaxPooling1D())
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.summary()
Model: "sequential_6"
Layer (type) Output Shape Param #
=================================================================
embedding_6 (Embedding) (None, None, 16) 160000
global_max_pooling1d_2 (Glob (None, 16) 0
dense_8 (Dense) (None, 8) 136
dense_9 (Dense) (None, 1) 9
=================================================================
Total params: 160,145
Trainable params: 160,145
Non-trainable params: 0
So this is the model we end up with?
# Split off 10,000 samples as a validation set
x_val = x_train[:10000]
y_val = y_train[:10000]

# The remaining 15,000 samples, excluding the validation set
partial_x_train = x_train[10000:]
partial_y_train = y_train[10000:]
print(partial_x_train.shape)
print(partial_y_train.shape)
(15000, 580)
(15000,)
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
epochs=20  # Adjust how many epochs to train while watching the results.
history = model.fit(partial_x_train,
partial_y_train,
epochs=epochs,
batch_size=512,
validation_data=(x_val, y_val),
verbose=1)
Epoch 1/20
30/30 [==============================] - 8s 13ms/step - loss: 0.6919 - accuracy: 0.4991 - val_loss: 0.6889 - val_accuracy: 0.5412
Epoch 2/20
30/30 [==============================] - 0s 8ms/step - loss: 0.6846 - accuracy: 0.6340 - val_loss: 0.6796 - val_accuracy: 0.6835
Epoch 3/20
30/30 [==============================] - 0s 7ms/step - loss: 0.6701 - accuracy: 0.7469 - val_loss: 0.6618 - val_accuracy: 0.7336
Epoch 4/20
30/30 [==============================] - 0s 7ms/step - loss: 0.6429 - accuracy: 0.7901 - val_loss: 0.6296 - val_accuracy: 0.7706
Epoch 5/20
30/30 [==============================] - 0s 6ms/step - loss: 0.5986 - accuracy: 0.8169 - val_loss: 0.5863 - val_accuracy: 0.7821
Epoch 6/20
30/30 [==============================] - 0s 6ms/step - loss: 0.5440 - accuracy: 0.8301 - val_loss: 0.5386 - val_accuracy: 0.7953
Epoch 7/20
30/30 [==============================] - 0s 6ms/step - loss: 0.4867 - accuracy: 0.8430 - val_loss: 0.4944 - val_accuracy: 0.8038
Epoch 8/20
30/30 [==============================] - 0s 6ms/step - loss: 0.4342 - accuracy: 0.8543 - val_loss: 0.4580 - val_accuracy: 0.8120
Epoch 9/20
30/30 [==============================] - 0s 6ms/step - loss: 0.3889 - accuracy: 0.8669 - val_loss: 0.4301 - val_accuracy: 0.8173
Epoch 10/20
30/30 [==============================] - 0s 6ms/step - loss: 0.3508 - accuracy: 0.8789 - val_loss: 0.4081 - val_accuracy: 0.8233
Epoch 11/20
30/30 [==============================] - 0s 6ms/step - loss: 0.3185 - accuracy: 0.8877 - val_loss: 0.3916 - val_accuracy: 0.8276
Epoch 12/20
30/30 [==============================] - 0s 6ms/step - loss: 0.2912 - accuracy: 0.8984 - val_loss: 0.3796 - val_accuracy: 0.8309
Epoch 13/20
30/30 [==============================] - 0s 6ms/step - loss: 0.2678 - accuracy: 0.9060 - val_loss: 0.3706 - val_accuracy: 0.8361
Epoch 14/20
30/30 [==============================] - 0s 6ms/step - loss: 0.2473 - accuracy: 0.9137 - val_loss: 0.3644 - val_accuracy: 0.8371
Epoch 15/20
30/30 [==============================] - 0s 6ms/step - loss: 0.2296 - accuracy: 0.9202 - val_loss: 0.3603 - val_accuracy: 0.8391
Epoch 16/20
30/30 [==============================] - 0s 6ms/step - loss: 0.2137 - accuracy: 0.9257 - val_loss: 0.3578 - val_accuracy: 0.8413
Epoch 17/20
30/30 [==============================] - 0s 6ms/step - loss: 0.1996 - accuracy: 0.9314 - val_loss: 0.3570 - val_accuracy: 0.8418
Epoch 18/20
30/30 [==============================] - 0s 6ms/step - loss: 0.1867 - accuracy: 0.9367 - val_loss: 0.3573 - val_accuracy: 0.8428
Epoch 19/20
30/30 [==============================] - 0s 6ms/step - loss: 0.1750 - accuracy: 0.9422 - val_loss: 0.3583 - val_accuracy: 0.8430
Epoch 20/20
30/30 [==============================] - 0s 6ms/step - loss: 0.1639 - accuracy: 0.9479 - val_loss: 0.3604 - val_accuracy: 0.8423
results = model.evaluate(x_test, y_test, verbose=2)
print(results)
782/782 - 1s - loss: 0.3870 - accuracy: 0.8310
[0.3870142102241516, 0.8309599757194519]
During model.fit(), the train/validation loss and accuracy are stored in the history variable at every epoch.
Plotting this data is a good way to check whether training progressed well, whether the model overfit or underfit, and to get ideas for improving performance.
history_dict = history.history
print(history_dict.keys())  # items we can plot against the epochs
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])
import matplotlib.pyplot as plt
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(acc) + 1)
# "bo"๋ "ํ๋์ ์ "์
๋๋ค
plt.plot(epochs, loss, 'bo', label='Training loss')
# b๋ "ํ๋ ์ค์ "์
๋๋ค
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
- Plotting the training and validation loss shows up to which epoch training is appropriate, so the optimal point can be estimated.
- Once the validation loss curve starts to diverge from the training loss, further training is pointless (see the EarlyStopping sketch below).
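One way to act on this automatically (not used in the original run) is Keras's EarlyStopping callback. A minimal sketch, assuming the same model, partial_x_train/partial_y_train, and x_val/y_val as above:

```python
# Sketch only: stop when val_loss has not improved for 2 consecutive epochs
# and restore the weights from the best epoch seen so far.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                              patience=2,
                                              restore_best_weights=True)

history = model.fit(partial_x_train, partial_y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val),
                    callbacks=[early_stop],
                    verbose=1)
```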
plt.clf()  # clear the figure
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
Word embedding
- Word embedding is an NLP technique that represents the characteristics of each word as a low-dimensional vector. It can greatly improve accuracy while saving the labeling cost of machine-learning-based sentiment analysis.

The first layer of the model is the Embedding layer:
- It holds learnable parameters of size (vocabulary size) x (word vector dimension).
- The word vectors learned by the Embedding layer should end up arranged meaningfully in the semantic space.

Check the version of the gensim package, which is useful for handling word vectors.
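For example, a trivial version check (assuming gensim is already installed in your environment):

```python
import gensim
print(gensim.__version__)  # confirm gensim is available and note its version
```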
embedding_layer = model.layers[0]
weights = embedding_layer.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)
(10000, 300)
# Save the learned Embedding parameters to a file.
word2vec_file_path = os.getenv('HOME')+'/aiffel/sentiment_classification/data/word2vec.txt'
f = open(word2vec_file_path, 'w')
f.write('{} {}\n'.format(vocab_size-4, word_vector_dim))  # header: how many vectors, and of what size

# Write one word vector per word (excluding the 4 special tokens at the front).
vectors = model.get_weights()[0]
for i in range(4, vocab_size):
    f.write('{} {}\n'.format(index_to_word[i], ' '.join(map(str, list(vectors[i, :])))))
f.close()
Using the package provided by gensim, read back the embedding parameters saved above and use them as word vectors.
from gensim.models.keyedvectors import Word2VecKeyedVectors
word_vectors = Word2VecKeyedVectors.load_word2vec_format(word2vec_file_path, binary=False)
vector = word_vectors['computer']
vector
With word vectors obtained this way, we can run some fun experiments. One way to check whether the word vectors were learned meaningfully in the semantic vector space is to pick a word and look at its most similar words and their similarity scores. With gensim this can be done as follows.
word_vectors.similar_by_word("love")
$ ln -s ~/data/GoogleNews-vectors-negative300.bin.gz ~/aiffel/sentiment_classification/data
from gensim.models import KeyedVectors
word2vec_path = os.getenv('HOME')+'/aiffel/sentiment_classification/data/GoogleNews-vectors-negative300.bin.gz'
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True, limit=1000000)
vector = word2vec['computer']
vector  # a full 300-dimensional word vector
array([ 1.07421875e-01, -2.01171875e-01, 1.23046875e-01, 2.11914062e-01,
-9.13085938e-02, 2.16796875e-01, -1.31835938e-01, 8.30078125e-02,
2.02148438e-01, 4.78515625e-02, 3.66210938e-02, -2.45361328e-02,
2.39257812e-02, -1.60156250e-01, -2.61230469e-02, 9.71679688e-02,
-6.34765625e-02, 1.84570312e-01, 1.70898438e-01, -1.63085938e-01,
-1.09375000e-01, 1.49414062e-01, -4.65393066e-04, 9.61914062e-02,
1.68945312e-01, 2.60925293e-03, 8.93554688e-02, 6.49414062e-02,
3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
-2.27539062e-01, 2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
-2.20703125e-01, 1.30859375e-01, 3.66210938e-02, -3.63769531e-02,
-1.13281250e-01, 1.95312500e-01, 9.76562500e-02, 1.26953125e-01,
6.59179688e-02, 6.93359375e-02, 1.02539062e-02, 1.75781250e-01,
-1.68945312e-01, 1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
5.66406250e-02, -1.77734375e-01, -2.08984375e-01, 1.76757812e-01,
2.38037109e-02, -2.57812500e-01, -4.46777344e-02, 1.88476562e-01,
5.51757812e-02, 5.02929688e-02, -1.06933594e-01, 1.89453125e-01,
-1.16210938e-01, 8.49609375e-02, -1.71875000e-01, 2.45117188e-01,
-1.73828125e-01, -8.30078125e-03, 4.56542969e-02, -1.61132812e-02,
1.86523438e-01, -6.05468750e-02, -4.17480469e-02, 1.82617188e-01,
2.20703125e-01, -1.22558594e-01, -2.55126953e-02, -3.08593750e-01,
9.13085938e-02, 1.60156250e-01, 1.70898438e-01, 1.19628906e-01,
7.08007812e-02, -2.64892578e-02, -3.08837891e-02, 4.06250000e-01,
-1.01562500e-01, 5.71289062e-02, -7.26318359e-03, -9.17968750e-02,
-1.50390625e-01, -2.55859375e-01, 2.16796875e-01, -3.63769531e-02,
2.24609375e-01, 8.00781250e-02, 1.56250000e-01, 5.27343750e-02,
1.50390625e-01, -1.14746094e-01, -8.64257812e-02, 1.19140625e-01,
-7.17773438e-02, 2.73437500e-01, -1.64062500e-01, 7.29370117e-03,
4.21875000e-01, -1.12792969e-01, -1.35742188e-01, -1.31835938e-01,
-1.37695312e-01, -7.66601562e-02, 6.25000000e-02, 4.98046875e-02,
-1.91406250e-01, -6.03027344e-02, 2.27539062e-01, 5.88378906e-02,
-3.24218750e-01, 5.41992188e-02, -1.35742188e-01, 8.17871094e-03,
-5.24902344e-02, -1.74713135e-03, -9.81445312e-02, -2.86865234e-02,
3.61328125e-02, 2.15820312e-01, 5.98144531e-02, -3.08593750e-01,
-2.27539062e-01, 2.61718750e-01, 9.86328125e-02, -5.07812500e-02,
1.78222656e-02, 1.31835938e-01, -5.35156250e-01, -1.81640625e-01,
1.38671875e-01, -3.10546875e-01, -9.71679688e-02, 1.31835938e-01,
-1.16210938e-01, 7.03125000e-02, 2.85156250e-01, 3.51562500e-02,
-1.01562500e-01, -3.75976562e-02, 1.41601562e-01, 1.42578125e-01,
-5.68847656e-02, 2.65625000e-01, -2.09960938e-01, 9.64355469e-03,
-6.68945312e-02, -4.83398438e-02, -6.10351562e-02, 2.45117188e-01,
-9.66796875e-02, 1.78222656e-02, -1.27929688e-01, -4.78515625e-02,
-7.26318359e-03, 1.79687500e-01, 2.78320312e-02, -2.10937500e-01,
-1.43554688e-01, -1.27929688e-01, 1.73339844e-02, -3.60107422e-03,
-2.04101562e-01, 3.63159180e-03, -1.19628906e-01, -6.15234375e-02,
5.93261719e-02, -3.23486328e-03, -1.70898438e-01, -3.14941406e-02,
-8.88671875e-02, -2.89062500e-01, 3.44238281e-02, -1.87500000e-01,
2.94921875e-01, 1.58203125e-01, -1.19628906e-01, 7.61718750e-02,
6.39648438e-02, -4.68750000e-02, -6.83593750e-02, 1.21459961e-02,
-1.44531250e-01, 4.54101562e-02, 3.68652344e-02, 3.88671875e-01,
1.45507812e-01, -2.55859375e-01, -4.46777344e-02, -1.33789062e-01,
-1.38671875e-01, 6.59179688e-02, 1.37695312e-01, 1.14746094e-01,
2.03125000e-01, -4.78515625e-02, 1.80664062e-02, -8.54492188e-02,
-2.48046875e-01, -3.39843750e-01, -2.83203125e-02, 1.05468750e-01,
-2.14843750e-01, -8.74023438e-02, 7.12890625e-02, 1.87500000e-01,
-1.12304688e-01, 2.73437500e-01, -3.26171875e-01, -1.77734375e-01,
-4.24804688e-02, -2.69531250e-01, 6.64062500e-02, -6.88476562e-02,
-1.99218750e-01, -7.03125000e-02, -2.43164062e-01, -3.66210938e-02,
-7.37304688e-02, -1.77734375e-01, 9.17968750e-02, -1.25000000e-01,
-1.65039062e-01, -3.57421875e-01, -2.85156250e-01, -1.66992188e-01,
1.97265625e-01, -1.53320312e-01, 2.31933594e-02, 2.06054688e-01,
1.80664062e-01, -2.74658203e-02, -1.92382812e-01, -9.61914062e-02,
-1.06811523e-02, -4.73632812e-02, 6.54296875e-02, -1.25732422e-02,
1.78222656e-02, -8.00781250e-02, -2.59765625e-01, 9.37500000e-02,
-7.81250000e-02, 4.68750000e-02, -2.22167969e-02, 1.86767578e-02,
3.11279297e-02, 1.04980469e-02, -1.69921875e-01, 2.58789062e-02,
-3.41796875e-02, -1.44042969e-02, -5.46875000e-02, -8.78906250e-02,
1.96838379e-03, 2.23632812e-01, -1.36718750e-01, 1.75781250e-01,
-1.63085938e-01, 1.87500000e-01, 3.44238281e-02, -5.63964844e-02,
-2.27689743e-05, 4.27246094e-02, 5.81054688e-02, -1.07910156e-01,
-3.88183594e-02, -2.69531250e-01, 3.34472656e-02, 9.81445312e-02,
5.63964844e-02, 2.23632812e-01, -5.49316406e-02, 1.46484375e-01,
5.93261719e-02, -2.19726562e-01, 6.39648438e-02, 1.66015625e-02,
4.56542969e-02, 3.26171875e-01, -3.80859375e-01, 1.70898438e-01,
5.66406250e-02, -1.04492188e-01, 1.38671875e-01, -1.57226562e-01,
3.23486328e-03, -4.80957031e-02, -2.48046875e-01, -6.20117188e-02],
dtype=float32)
These are 3 million words, each represented by a 300-dimensional vector. Loading the whole dictionary into memory would very likely cause a memory error in your practice environment, so KeyedVectors.load_word2vec_format was called with limit=1000000 to load only the 1 million most frequently used words.
If you have enough memory, set limit=None to load all 3 million.

# Note: this operation consumes quite a lot of memory, so be careful.
word2vec.similar_by_word("love")
[('loved', 0.6907791495323181),
('adore', 0.6816873550415039),
('loves', 0.661863386631012),
('passion', 0.6100708842277527),
('hate', 0.600395679473877),
('loving', 0.5886635780334473),
('affection', 0.5664337873458862),
('undying_love', 0.5547304749488831),
('absolutely_adore', 0.5536840558052063),
('adores', 0.5440906882286072)]
We can confirm that the word embedding vectors provided by Word2Vec were trained so that semantically similar words end up close to each other. Next, we will replace the embedding layer of the model trained in the previous experiment with the Word2Vec embeddings and train it again.
vocab_size = 10000      # size of the vocabulary (10,000 words)
word_vector_dim = 300   # dimensionality of the word vectors

embedding_matrix = np.random.rand(vocab_size, word_vector_dim)

# Copy the Word2Vec word vectors into embedding_matrix, one word at a time.
for i in range(4, vocab_size):
    if index_to_word[i] in word2vec:
        embedding_matrix[i] = word2vec[index_to_word[i]]
from tensorflow.keras.initializers import Constant

vocab_size = 10000      # size of the vocabulary (10,000 words)
word_vector_dim = 300   # dimensionality of the word vectors

# Build the model
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size,
                                    word_vector_dim,
                                    embeddings_initializer=Constant(embedding_matrix),  # use the copied embeddings here
                                    input_length=maxlen,
                                    trainable=True))  # trainable=True enables fine-tuning
model.add(tf.keras.layers.Conv1D(16, 7, activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(5))
model.add(tf.keras.layers.Conv1D(16, 7, activation='relu'))
model.add(tf.keras.layers.GlobalMaxPooling1D())
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.summary()
Model: "sequential_11"
Layer (type) Output Shape Param #
=================================================================
embedding_11 (Embedding) (None, 580, 300) 3000000
conv1d_10 (Conv1D) (None, 574, 16) 33616
max_pooling1d_5 (MaxPooling1 (None, 114, 16) 0
conv1d_11 (Conv1D) (None, 108, 16) 1808
global_max_pooling1d_7 (Glob (None, 16) 0
dense_18 (Dense) (None, 8) 136
dense_19 (Dense) (None, 1) 9
=================================================================
Total params: 3,035,569
Trainable params: 3,035,569
Non-trainable params: 0
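As the comment on the Embedding layer notes, trainable=True fine-tunes the copied Word2Vec vectors together with the rest of the network. If you instead want to keep the pretrained vectors fixed, a sketch of the frozen variant could look like the following (this frozen_model is an assumption for illustration and is not used in the runs below):

```python
# Sketch: the same CNN architecture, but with the pretrained Word2Vec vectors frozen
# (trainable=False) instead of fine-tuned.
frozen_model = tf.keras.Sequential()
frozen_model.add(tf.keras.layers.Embedding(vocab_size,
                                           word_vector_dim,
                                           embeddings_initializer=Constant(embedding_matrix),
                                           input_length=maxlen,
                                           trainable=False))  # keep the copied embeddings fixed
frozen_model.add(tf.keras.layers.Conv1D(16, 7, activation='relu'))
frozen_model.add(tf.keras.layers.MaxPooling1D(5))
frozen_model.add(tf.keras.layers.Conv1D(16, 7, activation='relu'))
frozen_model.add(tf.keras.layers.GlobalMaxPooling1D())
frozen_model.add(tf.keras.layers.Dense(8, activation='relu'))
frozen_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
```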
# Train the model
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
epochs=20  # Adjust how many epochs to train while watching the results.
history = model.fit(partial_x_train,
partial_y_train,
epochs=epochs,
batch_size=512,
validation_data=(x_val, y_val),
verbose=1)
# Evaluate the model on the test set
results = model.evaluate(x_test, y_test, verbose=2)
print(results)
Epoch 1/20
30/30 [==============================] - 3s 78ms/step - loss: 0.7040 - accuracy: 0.4949 - val_loss: 0.6931 - val_accuracy: 0.5030
Epoch 2/20
30/30 [==============================] - 2s 73ms/step - loss: 0.6928 - accuracy: 0.5125 - val_loss: 0.6934 - val_accuracy: 0.5054
Epoch 3/20
30/30 [==============================] - 2s 72ms/step - loss: 0.6916 - accuracy: 0.5375 - val_loss: 0.6938 - val_accuracy: 0.4943
Epoch 4/20
30/30 [==============================] - 2s 72ms/step - loss: 0.6899 - accuracy: 0.5364 - val_loss: 0.6944 - val_accuracy: 0.4943
Epoch 5/20
30/30 [==============================] - 2s 73ms/step - loss: 0.6878 - accuracy: 0.5438 - val_loss: 0.6928 - val_accuracy: 0.5160
Epoch 6/20
30/30 [==============================] - 2s 74ms/step - loss: 0.6850 - accuracy: 0.6023 - val_loss: 0.6931 - val_accuracy: 0.5050
Epoch 7/20
30/30 [==============================] - 2s 74ms/step - loss: 0.6814 - accuracy: 0.6922 - val_loss: 0.6927 - val_accuracy: 0.5073
Epoch 8/20
30/30 [==============================] - 2s 74ms/step - loss: 0.6747 - accuracy: 0.7303 - val_loss: 0.6916 - val_accuracy: 0.5308
Epoch 9/20
30/30 [==============================] - 2s 73ms/step - loss: 0.6658 - accuracy: 0.7083 - val_loss: 0.6911 - val_accuracy: 0.5227
Epoch 10/20
30/30 [==============================] - 2s 73ms/step - loss: 0.6477 - accuracy: 0.7989 - val_loss: 0.6901 - val_accuracy: 0.5374
Epoch 11/20
30/30 [==============================] - 2s 73ms/step - loss: 0.6121 - accuracy: 0.8204 - val_loss: 0.6895 - val_accuracy: 0.5319
Epoch 12/20
30/30 [==============================] - 2s 72ms/step - loss: 0.5450 - accuracy: 0.8669 - val_loss: 0.6987 - val_accuracy: 0.5382
Epoch 13/20
30/30 [==============================] - 2s 72ms/step - loss: 0.4603 - accuracy: 0.8816 - val_loss: 0.7146 - val_accuracy: 0.5394
Epoch 14/20
30/30 [==============================] - 2s 72ms/step - loss: 0.3577 - accuracy: 0.9413 - val_loss: 0.7043 - val_accuracy: 0.5624
Epoch 15/20
30/30 [==============================] - 2s 72ms/step - loss: 0.2657 - accuracy: 0.9613 - val_loss: 0.7190 - val_accuracy: 0.5716
Epoch 16/20
30/30 [==============================] - 2s 72ms/step - loss: 0.1955 - accuracy: 0.9753 - val_loss: 0.7560 - val_accuracy: 0.5704
Epoch 17/20
30/30 [==============================] - 2s 72ms/step - loss: 0.1357 - accuracy: 0.9881 - val_loss: 0.8460 - val_accuracy: 0.5593
Epoch 18/20
30/30 [==============================] - 2s 72ms/step - loss: 0.0985 - accuracy: 0.9936 - val_loss: 0.8920 - val_accuracy: 0.5555
Epoch 19/20
30/30 [==============================] - 2s 72ms/step - loss: 0.0710 - accuracy: 0.9954 - val_loss: 0.8277 - val_accuracy: 0.5821
Epoch 20/20
30/30 [==============================] - 2s 72ms/step - loss: 0.0540 - accuracy: 0.9965 - val_loss: 0.8434 - val_accuracy: 0.5819