챗봇구축(AI학습 57)

이유진·2024년 7월 8일

chat-bot 실습

--38.개체명 인식 모델.ipynb--

사전 필요모듈, 데이터:

./utils/Preprocess.py 준비해두기 ★

Konlpy 설치

./out/chatbot_dict.bin 이전에 만든 사전파일

user_dic.tsv 사용자 사전

pip install seqeval 설치

입력데이터 :

/models/ner/ner_train.txt

!pip install Konlpy

base_path = r'/content/drive/MyDrive/dataset/chatbot'

개체명 인식 모델

NER: Named Entity Recognition

이번 예제 모델에서 인식 가능한 주요 개체명

개체명	설명
B_FOOD	음식
B_DT, B_TI	날짜,시간 (학습데이터의 영향으로 날짜와 시간은 혼용해서 사용합니다)
B_PS	사람
B_OG	조직,회사
B_LC	지역

개체명 인식 모델 학습

모듈 import

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

import tensorflow as tf
from tensorflow import keras

tf.keras.utils.set_random_seed(42)
tf.config.experimental.enable_op_determinism()

from tensorflow.keras import preprocessing
from sklearn.model_selection import train_test_split
from utils.Preprocess import Preprocess

전처리 모듈 불러오기

p = Preprocess(word2index_dic=os.path.join(base_path, 'out', 'chatbot_dict.bin'),
userdic=os.path.join(base_path, 'user_dic.tsv'))

데이터 읽어오기

ner_train.txt

"""
; 가락지빵 주문 하고싶어요
$<가락지빵:FOOD> 주문 하고싶어요
1 가락지빵 NNG B_FOOD
2 주문 NNP O
3 하 VV O
4 고 EC O
5 싶 VX O
6 어요 EC O
"""
None

def read_file(file_name):
    """Parse an NER training corpus into a list of tagged sentences.

    Each sentence in the file is expected to look like::

        ; raw sentence
        $annotated sentence
        1 token POS BIO_TAG
        ...
        <blank line>

    Args:
        file_name: Path of the UTF-8 encoded corpus file.

    Returns:
        A list with one entry per sentence. Each entry is a list of tuples,
        one tuple per token line, obtained by splitting the line on
        whitespace.
    """
    sents = []
    this_sent = None  # None while no sentence is currently open

    with open(file_name, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    for idx, line in enumerate(lines):
        # Look at the neighbours without running off either end of the file.
        next_line = lines[idx + 1] if idx + 1 < len(lines) else ''
        prev_line = lines[idx - 1] if idx > 0 else ''

        if line.startswith(';') and next_line.startswith('$'):
            # Header of a new sentence. Flush a sentence that was not
            # terminated by a blank line so it is not silently dropped.
            if this_sent is not None:
                sents.append(this_sent)
            this_sent = []
        elif line.startswith('$') and prev_line.startswith(';'):
            # Annotated form of the sentence; carries no token rows.
            continue
        elif not line.strip():
            # A blank line closes the current sentence (only once, even if
            # several blank lines follow each other).
            if this_sent is not None:
                sents.append(this_sent)
                this_sent = None
        elif this_sent is not None:
            this_sent.append(tuple(line.split()))

    # The last sentence may not be followed by a blank line.
    if this_sent is not None:
        sents.append(this_sent)

    return sents

corpus = read_file(os.path.join(base_path, 'ner_train.txt'))

len(corpus)

corpus[:20]

'단어', '태그' 학습용 데이터셋

# Split every corpus sentence into two parallel lists: the token strings
# (field 1 of each row) and their BIO tags (field 3 of each row).
sentences, tags = [], []

for sent in corpus:
    sentences.append([row[1] for row in sent])
    tags.append([row[3] for row in sent])

sentences[:10]

tags[:10]

print("샘플 크기 : \n", len(sentences))
print("0번 째 샘플 단어 시퀀스 : \n", sentences[0])
print("0번 째 샘플 bio 태그 : \n", tags[0])
print("샘플 단어 시퀀스 최대 길이 :", max(len(l) for l in sentences))
print("샘플 단어 시퀀스 평균 길이 :", (sum(map(len, sentences))/len(sentences)))

평균 길이는 나중에 시퀀스 벡터 크기(max_len)를 설정할 때 참고합니다.

BIO태그용 Tokenizer 객체 생성

tag_tokenizer = preprocessing.text.Tokenizer(lower=False) # 태그정보는 소문자로 변환하지 않는다.
tag_tokenizer.fit_on_texts(tags)

p.word_index

tag_tokenizer.word_index

단어사전 및 태그 사전 크기

vocab_size = len(p.word_index) + 1
tag_size = len(tag_tokenizer.word_index) + 1

print('단어 사전 크기: ', vocab_size)
print('BIO 태그 사전 크기: ', tag_size)

학습용 단어 시퀀스 생성

x_train = [p.get_wordidx_sequence(sent) for sent in sentences] # 시퀀스로 변환해주는 함수
y_train = tag_tokenizer.texts_to_sequences(tags)

index_to_ner = tag_tokenizer.index_word # 시퀀스 인덱스를 NER로 변환하기 위해 사용
index_to_ner[0] = 'PAD'

len(x_train)

x_train[:10]

len(y_train)

y_train[:10]

index_to_ner

패딩처리

max_len = 40 # 앞서 계산한 평균 길이보다 넉넉하게 큰 40으로 설정
x_train = preprocessing.sequence.pad_sequences(x_train, padding='post', maxlen=max_len)
y_train = preprocessing.sequence.pad_sequences(y_train, padding='post', maxlen=max_len)

print(x_train.shape)
x_train

print(y_train.shape)
y_train

학습용, 테스트용, 데이터셋 생성

학습용 : 테스트용 = 8 : 2 로 분리

한번만 실행!

x_train, x_test, y_train, y_test = \
train_test_split(x_train, y_train, test_size=0.2, random_state=1234)

x_train.shape, x_test.shape, y_train.shape, y_test.shape

y_train

출력 데이터를 One hot encoding

한번만 실행!

y_train = tf.keras.utils.to_categorical(y_train, num_classes=tag_size)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=tag_size)

y_train.shape, y_test.shape

print("학습 샘플 시퀀스 형상 : ", x_train.shape)
print("학습 샘플 레이블 형상 : ", y_train.shape)
print("테스트 샘플 시퀀스 형상 : ", x_test.shape)
print("테스트 샘플 레이블 형상 : ", y_test.shape)

모델 정의 & 컴파일

⑦ 모델 정의 (Bi-LSTM)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam

# Sequential Bi-LSTM tagger: emits one softmax over the BIO tag set per time step.
model = Sequential()
# Embed word indices into 30-dimensional vectors; mask_zero=True is set so that
# index 0 (the value used for post-padding above) is treated as a mask value.
model.add(Embedding(input_dim=vocab_size, output_dim=30, input_length=max_len, mask_zero=True))
# Bidirectional LSTM with 200 units; return_sequences=True keeps an output for every time step.
model.add(Bidirectional(LSTM(200, return_sequences=True, dropout=0.5, recurrent_dropout=0.25)))
# Dense softmax applied at each time step, producing tag_size class scores.
model.add(TimeDistributed(Dense(tag_size, activation='softmax')))

# Labels were one-hot encoded with to_categorical, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.01), metrics=['accuracy'])

model.summary()

모델 학습

model.fit(x_train, y_train, batch_size=128, epochs=10)

모델 저장

model_file = os.path.join(base_path, 'out', 'ner_model.h5')

model.save(model_file)

모델 평가

model = None
model = tf.keras.models.load_model(model_file)

model.evaluate(x_test, y_test)[1]

F1 score 계산

시퀀스를 NER 태그로 변환

def sequences_to_tag(sequences):
    """Convert per-token score vectors into NER tag names.

    For every position, the index of the highest score is looked up in the
    module-level ``index_to_ner`` mapping, and the text 'PAD' in the
    resulting label is replaced by 'O'.

    Args:
        sequences: Iterable of sequences; each element of a sequence is a
            vector of class scores (or a one-hot label vector).

    Returns:
        A list of lists of tag names, one inner list per input sequence.
    """
    return [
        [index_to_ner[np.argmax(scores)].replace('PAD', 'O') for scores in sequence]
        for sequence in sequences
    ]

!pip install seqeval

from seqeval.metrics import f1_score, classification_report

x_test

y_predicted = model.predict(x_test)

y_predicted.shape

pred_tags = sequences_to_tag(y_predicted) # 예측된 NER
test_tags = sequences_to_tag(y_test) # 실제 NER

print(pred_tags[0])
print(test_tags[0])

print(classification_report(test_tags, pred_tags))

print('F1-score: {:.1%}'.format(f1_score(test_tags, pred_tags)))

개체명 인식 모듈 만들기

"""
./utils/Preprocess.py
"""
None

query = "오늘 오전 13시 2분에 탕수육 주문 하고 싶어요"

개체명 클래스 예측

형태소 분석

pos = p.pos(query)
pos

문장내 키워드 추출(불용어 제거)

keywords = p.get_keywords(pos, without_tag=True)
keywords

sequence 인덱스 변환

준비된 단어 사전을 통해 sequence 인덱스로 변환

sequences = [p.get_wordidx_sequence(keywords)]
sequences

패딩 처리

max_len = 40
padded_seqs = preprocessing.sequence.pad_sequences(sequences, padding='post', maxlen=max_len, value=0)
padded_seqs

padded_seqs[0]

예측하기

model = tf.keras.models.load_model(os.path.join(base_path, 'out', 'ner_model.h5'))

predict = model.predict(np.array([padded_seqs[0]]))
predict

predict_class = tf.math.argmax(predict, axis=-1)
predict_class

predict_class.numpy()[0]

tags = [index_to_ner[i] for i in predict_class.numpy()[0]]
print(tags)

keywords

개체명 인식 클래스 예측 결과

predicts = list(zip(keywords, tags))

predicts

O tag를 제외하고 담아보기

# Collect the predicted labels, leaving out every position whose class index is 1 (the 'O' tag).
tags = [index_to_ner[idx] for idx in predict_class.numpy()[0] if idx != 1]

챗봇엔진 개체명 인식 모듈 만들기

/models/ner/NerModel.py 생성

import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import preprocessing

개체명 인식 모델 모듈

class NerModel:
    """Named-entity recognition wrapper around a saved Keras model.

    The wrapper turns a raw query into keywords with the supplied
    preprocessing object, feeds the padded index sequence to the model and
    maps the predicted class indices back to BIO tag names.
    """

    # BIO tag label for each class index. Index 0 is the padding class; every
    # index must appear exactly once so that no label is overwritten.
    INDEX_TO_NER = {
        0: "PAD",
        1: "O",
        2: "B_DT",
        3: "B_FOOD",
        4: "I",
        5: "B_OG",
        6: "B_PS",
        7: "B_LC",
        8: "NNP",
        9: "B_TI",
    }

    # Length every input sequence is padded to before prediction.
    MAX_LEN = 40

    # Class index of the "O" (outside) tag, which predict_tags() filters out.
    OUTSIDE_INDEX = 1

    def __init__(self, model_name, preprocess):
        """Load the model and keep the preprocessing object.

        Args:
            model_name: Path of the saved model file passed to ``load_model``.
            preprocess: Object providing ``pos``, ``get_keywords`` and
                ``get_wordidx_sequence``.
        """
        # Per-instance copy of the BIO tag labels.
        self.index_to_ner = dict(self.INDEX_TO_NER)

        # Load the trained model.
        self.model = load_model(model_name)

        # Preprocessing object.
        self.p = preprocess

    def _predict_classes(self, query):
        """Return ``(keywords, class_indices)`` for ``query``.

        ``class_indices`` holds one predicted class index per padded
        position, i.e. ``MAX_LEN`` entries.
        """
        # Morphological analysis.
        pos = self.p.pos(query)

        # Keyword extraction (stop-word removal).
        keywords = self.p.get_keywords(pos, without_tag=True)
        sequences = [self.p.get_wordidx_sequence(keywords)]

        # Padding.
        # NOTE(review): queries with more than MAX_LEN keywords are truncated
        # by pad_sequences; confirm keyword/tag alignment for that case.
        padded_seqs = preprocessing.sequence.pad_sequences(
            sequences, padding="post", value=0, maxlen=self.MAX_LEN)

        predict = self.model.predict(np.array([padded_seqs[0]]))
        predict_class = tf.math.argmax(predict, axis=-1)
        return keywords, predict_class.numpy()[0]

    def predict(self, query):
        """Predict the entity class of every keyword in ``query``.

        Returns:
            A list of ``(keyword, tag)`` pairs, one per keyword.
        """
        keywords, classes = self._predict_classes(query)
        tags = [self.index_to_ner[i] for i in classes]
        # zip() stops at the shorter input, so padded positions are dropped.
        return list(zip(keywords, tags))

    def predict_tags(self, query):
        """Predict the tags of ``query`` and drop the "O" tag.

        Returns:
            A list of tag names for the keywords that were not classified as
            "O", or ``None`` when no such tag was found.
        """
        keywords, classes = self._predict_classes(query)

        # Only the first len(keywords) positions belong to real tokens; the
        # remaining positions are padding and must not contribute tags.
        tags = [
            self.index_to_ner[tag_idx]
            for tag_idx in classes[:len(keywords)]
            if tag_idx != self.OUTSIDE_INDEX
        ]

        if not tags:
            return None
        return tags

모듈 테스트

from models.ner.NerModel import NerModel

ner = NerModel(model_name=os.path.join(base_path, 'out', 'ner_model.h5'),
preprocess=p)

def ner_test(query):
    """Run both prediction methods of ``ner`` on ``query`` and print the results.

    The (keyword, tag) pairs from ``ner.predict`` are printed first, followed
    by the filtered tag list from ``ner.predict_tags``.
    """
    outputs = (ner.predict(query), ner.predict_tags(query))
    for output in outputs:
        print(output)

ner_test("오늘 오전 13시 2분에 탕수육 주문 하고 싶어요")

ner_test("짜장면 2개 배달 부탁 합니다")

이유진

독해지자

이전 포스트

챗봇구축(AI학습 56)

다음 포스트

챗봇구축(AI학습 57)

사전 필요모듈, 데이터:

./utils/Preprocess.py 준비해두기 ★

Konlpy 설치

./out/chatbot_dict.bin 이전에 만든 사전파일

user_dic.tsv 사용자 사전

pip install seqeval 설치

입력데이터 :

/models/ner/ner_train.txt

개체명 인식 모델

개체명 인식 모델 학습

모듈 import

전처리 모듈 불러오기

데이터 읽어오기

'단어', '태그' 학습용 데이터셋

평균의 경우. 나중에 시퀀스 벡터 크기 설정시 참조됨.

BIO태그용 Tokenizer 객체 생성

단어사전 및 태그 사전 크기

학습용 단어 시퀀스 생성

패딩처리

학습용, 테스트용, 데이터셋 생성

학습용 : 테스트용 = 8 : 2 로 분리

한번만 실행!

출력 데이터를 One hot encoding

한번만 실행!

모델 정의 & 컴파일

⑦ 모델 정의 (Bi-LSTM)

모델 학습

model.fit(x_train, y_train, batch_size=128, epochs=10)

모델 저장

모델 평가

F1 score 계산

시퀀스를 NER 태그로 변환

개체명 인식 모듈 만들기

BIO 태그 클래스 별 레이블

개체명 클래스 예측

형태소 분석

문장내 키워드 추출(불용어 제거)

sequence 인덱스 변환

준비된 단어 사전을 통해 sequence 인덱스로 변환

패딩 처리

예측하기

개체명 인식 클래스 예측 결과

O tag를 제외하고 담아보기

챗봇엔진 개체명 인식 모듈 만들기

개체명 인식 모델 모듈

개체명 클래스 예측

모듈 테스트

챗봇구축(AI학습 56)

챗봇구축(AI학습 58)

0개의 댓글