--37.의도 분류 모델.ipynb--
문장을 의도 클래스별로 분류하기 위해 CNN 모델을 사용해 보겠습니다.
예제에서는 아래와 같이 '5가지' 의도로만 분류할 수 있도록 구현해 보겠습니다.
| 의도명 | 분류클래스 | 설명 |
|---|---|---|
| 인사 | 0 | 텍스트가 인사말인 경우 |
| 욕설 | 1 | 텍스트가 욕설인 경우 |
| 주문 | 2 | 텍스트가 주문관련 내용인 경우 |
| 예약 | 3 | 텍스트가 예약관련 내용인 경우 |
| 기타 | 4 | 어떤 의도에도 포함되지 않는 경우 |
!pip install Konlpy
# Root directory of the chatbot dataset; later cells join it with the training
# CSV, the user dictionary, and the 'out' folder holding the dictionary/model.
base_path = r'/content/drive/MyDrive/dataset/chatbot'
./config/GlobalParams.py
# Length that every word-index sequence is padded to before it is fed to the
# intent model (used as ``maxlen`` for ``pad_sequences``).
MAX_SEQ_LEN = 15


def GlobalParams():
    """Declare ``MAX_SEQ_LEN`` as a module-level global.

    The function performs no other work and returns ``None``.
    """
    global MAX_SEQ_LEN
학습 데이터셋 : total_train_data.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import tensorflow as tf
from tensorflow import keras
tf.keras.utils.set_random_seed(42)
tf.config.experimental.enable_op_determinism()
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate
# Read the labelled training set (columns used below: 'query' and 'intent').
train_file = os.path.join(base_path, 'total_train_data.csv')
data = pd.read_csv(train_file)

# Notebook previews: the full frame, then only the rows of intent 2 and intent 3.
data
data.loc[data['intent'].eq(2)]
data.loc[data['intent'].eq(3)]

# Split the frame into model inputs (queries) and targets (intent class ids).
queries, intents = data['query'].tolist(), data['intent'].tolist()
from utils.Preprocess import Preprocess
# Build the chatbot preprocessor from the saved word-index dictionary and the
# user dictionary.
p = Preprocess(
    word2index_dic=os.path.join(base_path, 'out', 'chatbot_dict.bin'),
    userdic=os.path.join(base_path, 'user_dic.tsv'),
)

# For every query: morphological analysis -> keyword extraction (tags dropped)
# -> conversion of the keywords into a word-index sequence.
sequences = [
    p.get_wordidx_sequence(p.get_keywords(p.pos(sentence), without_tag=True))
    for sentence in queries
]

# Notebook previews of the raw queries and their index sequences.
queries[:10]
sequences[:10]
from config.GlobalParams import MAX_SEQ_LEN
# Bring every index sequence to exactly MAX_SEQ_LEN entries, padding at the end.
padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post')
padded_seqs.shape
padded_seqs[:10]

# Pair each padded sequence with its intent label and shuffle the whole set.
# The shuffled dataset must be assigned back to `ds`, otherwise the splits below
# are taken from the unshuffled data. `reshuffle_each_iteration=False` keeps the
# order fixed across iterations so that take()/skip() always select the same,
# non-overlapping examples for the train / validation / test splits.
ds = tf.data.Dataset.from_tensor_slices((padded_seqs, intents))
ds = ds.shuffle(len(queries), reshuffle_each_iteration=False)

# 70% / 20% / 10% split sizes.
train_size = int(len(padded_seqs) * 0.7)
val_size = int(len(padded_seqs) * 0.2)
test_size = int(len(padded_seqs) * 0.1)

# Only the training split is reshuffled every epoch; this cannot leak examples
# between splits because it happens after take().
train_ds = ds.take(train_size).shuffle(train_size).batch(20)
val_ds = ds.skip(train_size).take(val_size).batch(20)
test_ds = ds.skip(train_size + val_size).take(test_size).batch(20)

# Number of examples, followed by the number of batches in each split.
len(padded_seqs), len(train_ds), len(val_ds), len(test_ds)
# Hyperparameters.
dropout_prob = 0.5
EMB_SIZE = 128
EPOCH = 5
# Total number of words in the dictionary, plus one.
# NOTE(review): the +1 presumably accounts for index 0 used as padding - confirm
# against how Preprocess builds word_index.
VOCAB_SIZE = len(p.word_index) + 1

# Input: one padded word-index sequence of length MAX_SEQ_LEN.
input_layer = Input(shape=(MAX_SEQ_LEN,))
# The sequence length is already fixed by the Input layer, so the deprecated
# `input_length` argument is not passed to Embedding.
embedding_layer = Embedding(VOCAB_SIZE, EMB_SIZE)(input_layer)
dropout_emb = Dropout(rate=dropout_prob)(embedding_layer)

# Three parallel convolution branches; kernel_size is the n-gram width each
# branch looks at (3, 4 and 5 tokens). Each branch is max-pooled over time.
pooled_branches = [
    GlobalMaxPool1D()(
        Conv1D(filters=128, kernel_size=kernel_size, padding='valid', activation=tf.nn.relu)(dropout_emb)
    )
    for kernel_size in (3, 4, 5)
]
concat = concatenate(pooled_branches)

# Fully connected head.
hidden = Dense(128, activation=tf.nn.relu)(concat)
dropout_hidden = Dropout(rate=dropout_prob)(hidden)
# One score per intent class (5 classes in total).
logits = Dense(5, name='logits')(dropout_hidden)
# Turn the logits directly into class probabilities. The previous version fed
# them through a second Dense(5) layer, which stacked a redundant linear layer
# on top of the one named 'logits'.
predictions = tf.keras.layers.Softmax()(logits)

model = Model(inputs=input_layer, outputs=predictions)
# Integer class labels + probability outputs -> sparse categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()
# Train on the training split, monitoring the validation split after each epoch.
model.fit(train_ds, epochs=EPOCH, validation_data=val_ds, verbose=1)

# Evaluate on the held-out test split; evaluate() returns the loss followed by
# the compiled metric (accuracy).
eval_results = model.evaluate(test_ds, verbose=1)
loss, accuracy = eval_results
for report_line in ('accuracy: %f' % (accuracy * 100), 'loss: %f' % (loss)):
    print(report_line)

# Persist the trained model next to the other generated artifacts.
model_path = os.path.join(base_path, 'out', 'intent_model.h5')
model.save(model_path)
/models/intent/IntentModel.py 생성
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import preprocessing
from config.GlobalParams import MAX_SEQ_LEN
class IntentModel:
    """Intent classifier wrapping a saved Keras model and a chatbot preprocessor."""

    def __init__(self, model_name, preprocess):
        """Load the trained intent model and keep the preprocessor.

        The constructor must be named ``__init__``; a method named ``init`` is
        never invoked on instantiation, so ``IntentModel(model_name=...,
        preprocess=...)`` would raise ``TypeError`` and no attribute would be set.

        Args:
            model_name: Path of the saved intent model, passed to ``load_model``.
            preprocess: Chatbot ``Preprocess`` object; its ``pos``,
                ``get_keywords`` and ``get_wordidx_sequence`` methods are used
                by ``predict_class``.
        """
        # Label for each intent class index.
        self.labels = {0: "인사", 1: "욕설", 2: "주문", 3: "예약", 4: "기타"}
        # Load the trained intent classification model.
        self.model = load_model(model_name)
        # Chatbot Preprocess object.
        self.p = preprocess

    def predict_class(self, query):
        """Return the predicted intent class index for a single query string.

        Args:
            query: Raw input sentence.

        Returns:
            The index of the highest-scoring class (a key of ``self.labels``),
            as the first element of the NumPy array produced by the argmax.
        """
        # Morphological analysis.
        pos = self.p.pos(query)
        # Keyword extraction (stop words removed).
        keywords = self.p.get_keywords(pos, without_tag=True)
        # Word-index sequence, wrapped in a list to form a batch of one.
        sequences = [self.p.get_wordidx_sequence(keywords)]
        # Pad to the same length the model was trained with.
        padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post')
        # Predict and pick the class with the highest score.
        predict = self.model.predict(padded_seqs)
        class_idx = tf.math.argmax(predict, axis=1)
        return class_idx.numpy()[0]
from utils.Preprocess import Preprocess
from models.intent.IntentModel import IntentModel
import os
# Rebuild the preprocessor from the saved word-index dictionary and the user
# dictionary, then load the trained intent model on top of it.
dict_path = os.path.join(base_path, 'out', 'chatbot_dict.bin')
userdic_path = os.path.join(base_path, 'user_dic.tsv')
p = Preprocess(word2index_dic=dict_path, userdic=userdic_path)

intent_model_path = os.path.join(base_path, 'out', 'intent_model.h5')
intent = IntentModel(model_name=intent_model_path, preprocess=p)
def testIntentModel(query):
    """Print a query together with its predicted intent class index and label.

    Uses the module-level ``intent`` model for both the prediction and the
    index-to-label lookup.
    """
    class_idx = intent.predict_class(query)
    label = intent.labels[class_idx]

    print(query)
    print('의도 예측 클래스: ', class_idx)
    print('의도 예측 레이블: ', label)
# Smoke-test the intent model on a handful of sample sentences, in this order.
for sample_query in (
    "오늘 탕수육 주문 가능한가요?",
    "오늘 날씨가 좋네요",
    "오후 2시에 10명 예약 가능한가요?",
    "야 이 개객끼야",
):
    testIntentModel(sample_query)