Using KeyBERT with the skt/kobert-base-v1 model, extract the important keywords from each sentence and then load them into Elasticsearch.
def get_keywords():
    model = BertModel.from_pretrained('skt/kobert-base-v1')
    kw_model = KeyBERT(model)
    # per sentence: the original sentence and its preprocessed (noun-only) form
    sentences, pre_sentences = split_into_sentences(file_open("input.txt"))
    for id_num in range(1, len(sentences) + 1):
        keywords = kw_model.extract_keywords(pre_sentences[id_num - 1], keyphrase_ngram_range=(1, 1), stop_words=None, top_n=10)
        keywords = [item[0] for item in keywords]  # keep only the keyword strings, drop the scores
        print("keyword : ", keywords)
        set_data(id_num - 1, sentences[id_num - 1], keywords)
def noun_extractor(text):
    results = []
    kiwi = Kiwi()
    result = kiwi.analyze(text)
    for token, pos, _, _ in result[0][0]:
        # keep multi-character noun tokens (N*) and foreign-alphabet tokens (SL)
        if (len(token) != 1 and pos.startswith('N')) or pos.startswith('SL'):
            results.append(token)
    return results
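For a quick check, the extractor can be called directly on a sample sentence; a rough sketch (the exact tokens returned depend on Kiwi's analysis):

sample = "키버트 모델로 문장별 키워드를 추출한다"
print(noun_extractor(sample))  # prints the multi-character nouns (and SL tokens) that Kiwi finds in the sample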
To keep the index general-purpose, the text field is left analyzable, while the keyword field is defined as its own keyword type rather than as a sub-field, so that matches against the extracted keywords are exact (see the query sketch after the mapping).
PUT /keyword_sentences
{
  "settings": {
    "index": {
      "analysis": {
        "tokenizer": {
          "my_nori_tokenizer": {
            "type": "nori_tokenizer",
            "decompound_mode": "mixed"
          }
        },
        "filter": {
          "stopwords": {
            "type": "stop",
            "stopwords": [" "]
          }
        },
        "analyzer": {
          "my_nori_analyzer": {
            "type": "custom",
            "tokenizer": "my_nori_tokenizer",
            "filter": ["lowercase", "stop", "trim", "stopwords", "nori_part_of_speech"],
            "char_filter": ["html_strip"]
          }
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "sequence": {
        "type": "integer"
      },
      "text": {
        "type": "text",
        "analyzer": "my_nori_analyzer",
        "search_analyzer": "my_nori_analyzer"
      },
      "keyword": {
        "type": "keyword"
      }
    }
  }
}
Mapping properties
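To see why the mapping is set up this way, here is a minimal query sketch. The search term "검색" is only a hypothetical example, and it assumes the index has already been populated by the code below: a term query against keyword must match an extracted keyword exactly, while a match query against text is first analyzed with my_nori_analyzer.

from elasticsearch import Elasticsearch

es = Elasticsearch("http://192.168.101.218:9200")
index_name = 'keyword_sentences'

# exact match against the stored keywords (keyword type, not analyzed)
exact_hits = es.search(index=index_name, body={"query": {"term": {"keyword": "검색"}}})

# full-text match against the original sentence, analyzed with my_nori_analyzer
text_hits = es.search(index=index_name, body={"query": {"match": {"text": "검색"}}})

print(exact_hits["hits"]["total"], text_hits["hits"]["total"])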
Code
from keybert import KeyBERT
from kiwipiepy import Kiwi
from transformers import BertModel
from elasticsearch import Elasticsearch

es = Elasticsearch("http://192.168.101.218:9200")
index_name = 'keyword_sentences'


def file_open(path):
    # read the whole input file (UTF-8) as a single string
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    return text


def noun_extractor(text):
    # keep multi-character noun tokens (N*) and foreign-alphabet tokens (SL)
    results = []
    kiwi = Kiwi()
    result = kiwi.analyze(text)
    for token, pos, _, _ in result[0][0]:
        if (len(token) != 1 and pos.startswith('N')) or pos.startswith('SL'):
            results.append(token)
    return results


def split_into_sentences(text):
    # split the raw text into sentences and build a noun-only version of each
    kiwi = Kiwi()
    str_list = kiwi.split_into_sents(text)
    keyword_list = []
    for id_num in range(1, len(str_list) + 1):
        str_list[id_num - 1] = str_list[id_num - 1].text
        keyword_list.append(preprocess(str_list[id_num - 1]))
    return str_list, keyword_list


def set_data(id_num, sentences, keyword):
    data = {"sequence": id_num, "text": sentences, "keyword": keyword}
    put(data, id_num)


def put(data, id_num):
    es.index(index=index_name, id=id_num, body=data)
    es.indices.refresh(index=index_name)


def preprocess(text):
    nouns = noun_extractor(text)
    return ' '.join(nouns)


def get_keywords(kw_model):
    # per sentence: the original sentence and its preprocessed (noun-only) form
    sentences, pre_sentences = split_into_sentences(file_open("input.txt"))
    for id_num in range(1, len(sentences) + 1):
        keywords = kw_model.extract_keywords(pre_sentences[id_num - 1], keyphrase_ngram_range=(1, 1), stop_words=None, top_n=10)
        keywords = [item[0] for item in keywords]  # keep only the keyword strings, drop the scores
        set_data(id_num - 1, sentences[id_num - 1], keywords)


if __name__ == '__main__':
    # create the embedding model
    model = BertModel.from_pretrained('skt/kobert-base-v1')
    kw_model = KeyBERT(model)
    # extract keywords from the input file and index them
    get_keywords(kw_model)
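After the script has run, one way to confirm what was indexed is to fetch a document back by its sequence id (0 is the first sentence here); a rough sketch:

doc = es.get(index=index_name, id=0)
print(doc["_source"])  # {"sequence": 0, "text": <first sentence>, "keyword": [<extracted keywords>]}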