--38.개체명 인식 모델.ipynb--
# Install KoNLPy into the notebook runtime (presumably needed by utils.Preprocess -- confirm).
!pip install Konlpy
# Root directory of the chatbot dataset; every input and output path below is joined onto it.
base_path = r'/content/drive/MyDrive/dataset/chatbot'
NER: Named Entity Recognition
이번 예제 모델에서 인식 가능한 주요 개체명
| 개체명 | 설명 |
|---|---|
| B_FOOD | 음식 |
| B_DT, B_TI | 날짜, 시간 (학습 데이터의 영향으로 날짜와 시간은 혼용해서 사용합니다) |
| B_PS | 사람 |
| B_OG | 조직, 회사 |
| B_LC | 지역 |
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import tensorflow as tf
from tensorflow import keras
# Fix the random seed and request deterministic TensorFlow ops so repeated runs are comparable.
tf.keras.utils.set_random_seed(42)
tf.config.experimental.enable_op_determinism()
from tensorflow.keras import preprocessing
from sklearn.model_selection import train_test_split
from utils.Preprocess import Preprocess
# Shared preprocessor: built from the saved word-index dictionary and a user dictionary file.
# It is used below for word_index, get_wordidx_sequence, pos and get_keywords.
p = Preprocess(word2index_dic=os.path.join(base_path, 'out', 'chatbot_dict.bin'),
userdic=os.path.join(base_path, 'user_dic.tsv'))
ner_train.txt
"""
; 가락지빵 주문 하고싶어요
$<가락지빵:FOOD> 주문 하고싶어요
1 가락지빵 NNG B_FOOD
2 주문 NNP O
3 하 VV O
4 고 EC O
5 싶 VX O
6 어요 EC O
"""
None
def read_file(file_name):
    """Parse a NER training corpus into a list of annotated sentences.

    Each sample in the file is expected to have this layout::

        ; <raw sentence>
        $<annotated sentence>
        1 <token> <POS> <BIO tag>
        2 <token> <POS> <BIO tag>
        <blank line>

    Args:
        file_name: Path of the UTF-8 encoded corpus file.

    Returns:
        A list with one entry per sample. Each entry is a list of tuples
        holding the whitespace-separated fields of one token line.
        Samples without any token line are skipped.
    """
    sents = []
    this_sent = None  # token tuples of the sample currently being read

    with open(file_name, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    last = len(lines) - 1
    for idx, l in enumerate(lines):
        if not l.strip():
            # A blank line terminates the current sample.
            if this_sent:
                sents.append(this_sent)
            this_sent = None
        elif l[0] == ';' and idx < last and lines[idx + 1][0] == '$':
            # ';' header followed by its '$' line: a new sample starts here.
            this_sent = []
        elif l[0] == '$' and idx > 0 and lines[idx - 1][0] == ';':
            # The '$' annotated-sentence line carries no token fields.
            continue
        elif this_sent is not None:
            this_sent.append(tuple(l.split()))

    # Flush the last sample when the file does not end with a blank line.
    if this_sent:
        sents.append(this_sent)
    return sents
# Load the annotated corpus: one list of token tuples per sentence.
corpus = read_file(os.path.join(base_path, 'ner_train.txt'))
len(corpus)
corpus[:20]

# Split every sample into parallel token / BIO-tag sequences.
# In a token tuple, field 1 is the surface form and field 3 is the BIO tag
# (e.g. "1 가락지빵 NNG B_FOOD").
sentences = [[w[1] for w in t] for t in corpus]
tags = [[w[3] for w in t] for t in corpus]
sentences[:10]
tags[:10]

# Basic corpus statistics, used further down to pick the padding length.
print("샘플 크기 : \n", len(sentences))
print("0번 째 샘플 단어 시퀀스 : \n", sentences[0])
print("0번 째 샘플 bio 태그 : \n", tags[0])
print("샘플 단어 시퀀스 최대 길이 :", max(len(l) for l in sentences))
print("샘플 단어 시퀀스 평균 길이 :", (sum(map(len, sentences))/len(sentences)))
# Build the tag vocabulary from the BIO tag sequences.
tag_tokenizer = preprocessing.text.Tokenizer(lower=False) # Tag names must not be lower-cased.
tag_tokenizer.fit_on_texts(tags)
p.word_index
tag_tokenizer.word_index
# +1 leaves room for index 0, which is used as the padding value below.
vocab_size = len(p.word_index) + 1
tag_size = len(tag_tokenizer.word_index) + 1
print('단어 사전 크기: ', vocab_size)
print('BIO 태그 사전 크기: ', tag_size)
# Encode tokens with the preprocessor's dictionary and tags with the tag tokenizer.
x_train = [p.get_wordidx_sequence(sent) for sent in sentences] # Converts a token list into an index sequence
y_train = tag_tokenizer.texts_to_sequences(tags)
index_to_ner = tag_tokenizer.index_word # Used to map predicted class indices back to NER tags
# Register a label for the padding index as well (this mutates tag_tokenizer.index_word in place).
index_to_ner[0] = 'PAD'
len(x_train)
x_train[:10]
len(y_train)
y_train[:10]
index_to_ner
max_len = 40 # Set to 40, comfortably above the average length computed earlier
# Pad (or truncate) every sequence to max_len; padding is appended after the tokens.
x_train = preprocessing.sequence.pad_sequences(x_train, padding='post', maxlen=max_len)
y_train = preprocessing.sequence.pad_sequences(y_train, padding='post', maxlen=max_len)
print(x_train.shape)
x_train
print(y_train.shape)
y_train
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = \
train_test_split(x_train, y_train, test_size=0.2, random_state=1234)
x_train.shape, x_test.shape, y_train.shape, y_test.shape
y_train
# One-hot encode the tag indices to match the categorical cross-entropy loss used by the model.
y_train = tf.keras.utils.to_categorical(y_train, num_classes=tag_size)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=tag_size)
y_train.shape, y_test.shape
print("학습 샘플 시퀀스 형상 : ", x_train.shape)
print("학습 샘플 레이블 형상 : ", y_train.shape)
print("테스트 샘플 시퀀스 형상 : ", x_test.shape)
print("테스트 샘플 레이블 형상 : ", y_test.shape)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
# Sequence tagger: embedding -> bidirectional LSTM -> per-timestep softmax over the tag set.
model = Sequential()
# mask_zero=True treats index 0 (the padding value) as a mask for the layers that follow.
model.add(Embedding(input_dim=vocab_size, output_dim=30, input_length=max_len, mask_zero=True))
model.add(Bidirectional(LSTM(200, return_sequences=True, dropout=0.5, recurrent_dropout=0.25)))
model.add(TimeDistributed(Dense(tag_size, activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.01), metrics=['accuracy'])
model.summary()

# Train before saving: without this step the file written below would contain
# randomly initialised weights and the evaluation would be meaningless.
# NOTE(review): batch size and epoch count are starting values -- tune as needed.
model.fit(x_train, y_train, batch_size=128, epochs=10)

model_file = os.path.join(base_path, 'out', 'ner_model.h5')
model.save(model_file)

# Drop the in-memory model and reload it from disk, so the evaluation below
# exercises the saved file rather than the object that was just trained.
model = None
model = tf.keras.models.load_model(model_file)
model.evaluate(x_test, y_test)[1]
def sequences_to_tag(sequences):
    """Convert sequences of per-token score vectors into tag names.

    Args:
        sequences: Iterable of sequences; every element of a sequence is a
            score vector (a one-hot label or a predicted distribution).

    Returns:
        A list of lists of tag names, one inner list per input sequence.
        The tag is looked up in the module-level ``index_to_ner`` by the
        arg-max index, and any 'PAD' in the name is replaced with 'O'.
    """
    return [
        [
            index_to_ner[np.argmax(scores)].replace('PAD', 'O')
            for scores in token_scores
        ]
        for token_scores in sequences
    ]
# Install seqeval, whose f1_score / classification_report are used for the evaluation below.
!pip install seqeval
from seqeval.metrics import f1_score, classification_report
x_test
# Predict on the held-out set and convert both predictions and labels to tag names.
y_predicted = model.predict(x_test)
y_predicted.shape
pred_tags = sequences_to_tag(y_predicted) # Predicted NER tags
test_tags = sequences_to_tag(y_test) # Ground-truth NER tags
print(pred_tags[0])
print(test_tags[0])
print(classification_report(test_tags, pred_tags))
print('F1-score: {:.1%}'.format(f1_score(test_tags, pred_tags)))
"""
./utils/Preprocess.py
"""
None
# Step-by-step inference walkthrough for a single query.
query = "오늘 오전 13시 2분에 탕수육 주문 하고 싶어요"

# Class index -> BIO tag. Index 0 is the padding class; it must have its own
# entry, otherwise looking up a padded position raises KeyError.
index_to_ner = {
    1: "O",
    2: "B_DT",
    3: "B_FOOD",
    4: "I",
    5: "B_OG",
    6: "B_PS",
    7: "B_LC",
    8: "NNP",
    9: "B_TI",
    0: "PAD",
}

# Morphological analysis, then keyword extraction without the POS tags.
pos = p.pos(query)
pos
keywords = p.get_keywords(pos, without_tag=True)
keywords

# Encode the keywords and pad to the length used when the training data was padded.
sequences = [p.get_wordidx_sequence(keywords)]
sequences
max_len = 40
padded_seqs = preprocessing.sequence.pad_sequences(sequences, padding='post', maxlen=max_len, value=0)
padded_seqs
padded_seqs[0]

# Load the saved model and predict a class distribution for every position.
model = tf.keras.models.load_model(os.path.join(base_path, 'out', 'ner_model.h5'))
predict = model.predict(np.array([padded_seqs[0]]))
predict
predict_class = tf.math.argmax(predict, axis=-1)
predict_class
predict_class.numpy()[0]

# Tag name for every position, then paired with the keywords
# (zip stops at the shorter of the two, i.e. at the last keyword).
tags = [index_to_ner[i] for i in predict_class.numpy()[0]]
print(tags)
keywords
predicts = list(zip(keywords, tags))
predicts

# Same tags with class 1 ("O") filtered out.
tags = [index_to_ner[tag_idx] for tag_idx in predict_class.numpy()[0] if tag_idx != 1]
tags
/models/ner/NerModel.py 생성
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import preprocessing
class NerModel:
    """Named-entity tagger wrapping a saved Keras model and a preprocessor."""

    # Class index -> BIO tag label. Index 0 is the padding class.
    INDEX_TO_NER = {
        1: "O",
        2: "B_DT",
        3: "B_FOOD",
        4: "I",
        5: "B_OG",
        6: "B_PS",
        7: "B_LC",
        8: "NNP",
        9: "B_TI",
        0: "PAD",
    }

    # Length every input sequence is padded (or truncated) to before prediction.
    MAX_LEN = 40

    def __init__(self, model_name, preprocess):
        """Load the model and keep a reference to the preprocessor.

        Args:
            model_name: Path of the saved model, passed to ``load_model``.
            preprocess: Preprocessing object; it must provide ``pos``,
                ``get_keywords`` and ``get_wordidx_sequence``.
        """
        # Per-instance copy of the BIO tag labels, keyed by class index.
        self.index_to_ner = dict(self.INDEX_TO_NER)
        # Load the trained model.
        self.model = load_model(model_name)
        # Preprocessing object.
        self.p = preprocess

    def _predict_classes(self, query):
        """Return ``(keywords, class_indices)`` for *query*.

        ``class_indices`` holds one predicted class index per padded position.
        """
        # Morphological analysis.
        pos = self.p.pos(query)
        # Keyword extraction (stop words removed).
        keywords = self.p.get_keywords(pos, without_tag=True)
        sequences = [self.p.get_wordidx_sequence(keywords)]
        # Padding.
        padded_seqs = preprocessing.sequence.pad_sequences(
            sequences, padding="post", value=0, maxlen=self.MAX_LEN)
        predict = self.model.predict(np.array([padded_seqs[0]]))
        predict_class = tf.math.argmax(predict, axis=-1)
        return keywords, predict_class.numpy()[0]

    def predict(self, query):
        """Return a list of ``(keyword, tag)`` pairs for *query*.

        The pairing stops at the last keyword, so tags predicted for padded
        positions are not included.
        """
        keywords, class_indices = self._predict_classes(query)
        tags = [self.index_to_ner[i] for i in class_indices]
        return list(zip(keywords, tags))

    def predict_tags(self, query):
        """Return the predicted tags for *query* without the "O" class.

        Returns:
            A list of tag names, or ``None`` when no tag remains.
        """
        _, class_indices = self._predict_classes(query)
        # Class 1 is "O" (outside any entity) and is skipped.
        # NOTE(review): padded positions are scanned as well -- confirm the
        # model never emits an entity class there.
        tags = [self.index_to_ner[i] for i in class_indices if i != 1]
        if not tags:
            return None
        return tags
from models.ner.NerModel import NerModel
# Build the tagger from the saved model file, reusing the preprocessor `p` created earlier.
ner = NerModel(model_name=os.path.join(base_path, 'out', 'ner_model.h5'),
preprocess=p)
def ner_test(query):
    """Run both prediction methods of the module-level ``ner`` on *query* and print the results.

    The output of ``ner.predict`` is printed first, then the output of
    ``ner.predict_tags``.
    """
    keyword_tag_pairs = ner.predict(query)
    entity_tags = ner.predict_tags(query)
    for output in (keyword_tag_pairs, entity_tags):
        print(output)
# Try the tagger on two sample queries.
ner_test("오늘 오전 13시 2분에 탕수육 주문 하고 싶어요")
ner_test("짜장면 2개 배달 부탁 합니다")