# 31. Sentence classification / sentiment classification (from 31.문장분류, 감정분류.ipynb)
import pandas as pd
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate
import os
# Load the chatbot Q&A dataset from Google Drive and take a first look at it.
base_path = r'/content/drive/MyDrive/dataset/chatbot'
csv_path = os.path.join(base_path, 'chatbot_data.csv')
data = pd.read_csv(csv_path)
data
data.shape
data.info()
# Inspect the label column: the distinct classes and how balanced they are.
data['label'].unique()
data['label'].value_counts()
# Questions ('Q') are the model inputs; 'label' holds the sentiment class ids.
features = data['Q'].tolist()
labels = data['label'].tolist()
features[0]
# Tokenize each question into a word list, then map the words to integer ids.
preprocessing.text.text_to_word_sequence(features[0])
corpus = [
    preprocessing.text.text_to_word_sequence(sentence)
    for sentence in features
]
corpus
# Fit the tokenizer on the corpus; this builds the word -> index vocabulary.
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(corpus)
corpus  # fit_on_texts does not mutate its argument
sequences = tokenizer.texts_to_sequences(corpus)
sequences
# Compare a few sentences against their integer-id encodings.
corpus[0], sequences[0]
corpus[8], sequences[8]
word_index = tokenizer.word_index
word_index
len(word_index)
# Words missing from the fitted vocabulary are silently dropped here.
tokenizer.texts_to_sequences([['여기', '어때'], ['정말', '좋아'], ['온달', '장군']])
# Length of the longest tokenized sentence in the corpus.
max(len(words) for words in corpus)
MAX_SEQ_LEN = 15  # word-sequence vector length (every sentence padded/truncated to this)
# Zero-pad each sequence at the end ('post') so all have length MAX_SEQ_LEN.
padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post')
padded_seqs
corpus[0], sequences[0], padded_seqs[0]
padded_seqs.shape
len(labels)
# Pair each padded sequence with its label in a tf.data pipeline.
ds = tf.data.Dataset.from_tensor_slices((padded_seqs, labels))
ds  # Dataset object
iter(ds).get_next()
# BUG FIX: shuffle() reshuffles on EVERY iteration by default, so the later
# take()/skip() train/val/test split would draw different (overlapping)
# examples each epoch, leaking training data into validation/test.
# Shuffle once and keep that order fixed across iterations.
ds = ds.shuffle(len(features), reshuffle_each_iteration=False)
ds
# BUG FIX: the original lines read `int(len(padded_seqs) 0.7)` — the `*`
# multiplication operator was missing, which is a SyntaxError.
train_size = int(len(padded_seqs) * 0.7)  # 70% training
val_size = int(len(padded_seqs) * 0.2)    # 20% validation
test_size = int(len(padded_seqs) * 0.1)   # 10% test
len(ds), train_size, val_size, test_size
# Carve the shuffled dataset into contiguous train/val/test partitions,
# batched 20 examples at a time.
train_ds = ds.take(train_size).batch(20)
val_ds = ds.skip(train_size).take(val_size).batch(20)
test_ds = ds.skip(train_size + val_size).batch(20)
len(train_ds), len(val_ds), len(test_ds)
# Hyperparameters for the multi-kernel text CNN classifier (Kim-2014 style).
dropout_prob = 0.5
EMB_SIZE = 128  # embedding vector size
EPOCH = 5
# FIX: renamed from the typo VACAB_SIZE. Total word count + 1 because
# index 0 is reserved for padding by the Keras Tokenizer.
VOCAB_SIZE = len(word_index) + 1

# Input: a padded word-id sequence; embed each id into a dense vector.
input_layer = Input(shape=(MAX_SEQ_LEN,))
embedding_layer = Embedding(VOCAB_SIZE, EMB_SIZE, input_length=MAX_SEQ_LEN)(input_layer)
dropout_emb = Dropout(rate=dropout_prob)(embedding_layer)

# Three parallel 1-D convolutions over windows of 3/4/5 words, each followed
# by global max pooling, then concatenated — the classic multi-kernel text CNN.
conv1 = Conv1D(filters=128, kernel_size=3, padding='valid', activation=tf.nn.relu)(dropout_emb)
pool1 = GlobalMaxPool1D()(conv1)
conv2 = Conv1D(filters=128, kernel_size=4, padding='valid', activation=tf.nn.relu)(dropout_emb)
pool2 = GlobalMaxPool1D()(conv2)
conv3 = Conv1D(filters=128, kernel_size=5, padding='valid', activation=tf.nn.relu)(dropout_emb)
pool3 = GlobalMaxPool1D()(conv3)
concat = concatenate([pool1, pool2, pool3])

hidden = Dense(128, activation=tf.nn.relu)(concat)
dropout_hidden = Dropout(rate=dropout_prob)(hidden)
# BUG FIX: the original stacked a SECOND Dense(3, softmax) on top of the layer
# named 'logits', so that layer was not actually the pre-softmax logits.
# Apply the softmax directly to the logits instead.
logits = Dense(3, name='logits')(dropout_hidden)
predictions = tf.keras.layers.Softmax()(logits)

model = Model(inputs=input_layer, outputs=predictions)
model.summary()
# Labels are integer class ids (0/1/2), hence sparse categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(train_ds, validation_data=val_ds, epochs=EPOCH)

loss, accuracy = model.evaluate(test_ds)
print(loss, accuracy)
# Persist the trained model, then reload it to verify it round-trips.
out_dir = os.path.join(base_path, 'out')
# BUG FIX: model.save() raises if the target directory does not exist yet.
os.makedirs(out_dir, exist_ok=True)
model.save(os.path.join(out_dir, 'cnn_model.h5'))
model = None  # drop the in-memory model so the reload below is meaningful
model = tf.keras.models.load_model(os.path.join(out_dir, 'cnn_model.h5'))
model.summary()

# NOTE(review): this rebuilds and RESHUFFLES the full dataset, so the 2000
# examples taken here overlap the training split — the accuracy reported below
# is not a clean held-out estimate. Kept as in the original demo.
ds = tf.data.Dataset.from_tensor_slices((padded_seqs, labels))
ds = ds.shuffle(len(features))
test_ds = ds.take(2000).batch(20)

# Spot-check one example: its tokens, padded ids, true label, and prediction.
print(corpus[10212])
print(padded_seqs[10212])
print(labels[10212])
picks = [10212]
predict = model.predict(padded_seqs[picks])
tf.math.argmax(predict, axis=1)  # predicted class id

loss, accuracy = model.evaluate(test_ds)
print(loss, accuracy)