!pip install konlpy
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting konlpy
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
Requirement already satisfied: lxml>=4.1.0 in /usr/local/lib/python3.10/dist-packages (from konlpy) (4.9.2)
Requirement already satisfied: numpy>=1.6 in /usr/local/lib/python3.10/dist-packages (from konlpy) (1.22.4)
Collecting JPype1>=0.7.0
Downloading JPype1-1.4.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from JPype1>=0.7.0->konlpy) (23.1)
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0
import os
import re
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from konlpy.tag import Okt
# Punctuation characters to strip from every sentence before tokenization
FILTERS = "([~.,!?\"':;)(])"

# Special tokens, reserved at the front of the vocabulary
PAD = "<PAD>"  # padding token
STD = "<SOS>"  # start-of-sequence token
END = "<END>"  # end-of-sequence token
UNK = "<UNK>"  # out-of-vocabulary token

PAD_INDEX = 0
STD_INDEX = 1
END_INDEX = 2
UNK_INDEX = 3
MARKER = [PAD, STD, END, UNK]

CHANGE_FILTER = re.compile(FILTERS)
MAX_SEQUENCE = 25  # fixed length every sequence is truncated/padded to
def load_data(path):
    # Load the question/answer columns from the chatbot CSV.
    data_df = pd.read_csv(path, header=0)
    question, answer = list(data_df['Q']), list(data_df['A'])
    return question, answer
def data_tokenizer(data):
    # Strip punctuation, then split every sentence on whitespace.
    words = []
    for sentence in data:
        sentence = re.sub(CHANGE_FILTER, "", sentence)
        for word in sentence.split():
            words.append(word)
    # Drop any empty strings that survived the filtering.
    return [word for word in words if word]
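A quick sanity check of the tokenizer on a toy sentence of my own (not from the dataset): punctuation is removed before the whitespace split.
print(data_tokenizer(["오늘 바빠!"]))  # -> ['오늘', '바빠']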
def prepro_like_morphlized(data):
    # Re-tokenize each sentence into morphemes with KoNLPy's Okt analyzer;
    # spaces are removed first so Okt segments the raw character stream.
    morph_analyzer = Okt()
    result_data = list()
    for seq in tqdm(data):
        morphlized_seq = " ".join(morph_analyzer.morphs(seq.replace(' ', '')))
        result_data.append(morphlized_seq)
    return result_data
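The morphological path can be spot-checked the same way; the exact morpheme split depends on the installed KoNLPy/Okt version, so no fixed output is shown here.
print(prepro_like_morphlized(["남자친구가 잘생겼어"]))  # segmentation varies by Okt version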
def load_vocabulary(path, vocab_path, tokenize_as_morph=False):
    # Build the vocabulary file on the first run, then load it from disk.
    vocabulary_list = []
    if not os.path.exists(vocab_path):
        if os.path.exists(path):
            data_df = pd.read_csv(path, encoding='utf-8')
            question, answer = list(data_df['Q']), list(data_df['A'])
            if tokenize_as_morph:
                question = prepro_like_morphlized(question)
                answer = prepro_like_morphlized(answer)
            data = []
            data.extend(question)
            data.extend(answer)
            words = data_tokenizer(data)
            words = list(set(words))  # deduplicate; note the resulting order is not deterministic
            words[:0] = MARKER  # reserve the four special tokens at indices 0-3
            with open(vocab_path, 'w', encoding='utf-8') as vocabulary_file:
                for word in words:
                    vocabulary_file.write(word + '\n')
    with open(vocab_path, 'r', encoding='utf-8') as vocabulary_file:
        for line in vocabulary_file:
            vocabulary_list.append(line.strip())
    char2idx, idx2char = make_vocabulary(vocabulary_list)
    return char2idx, idx2char, len(char2idx)
def make_vocabulary(vocabulary_list):
    # Build word -> index and index -> word lookup tables.
    char2idx = {char: idx for idx, char in enumerate(vocabulary_list)}
    idx2char = {idx: char for idx, char in enumerate(vocabulary_list)}
    return char2idx, idx2char
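A minimal sketch of the round trip through make_vocabulary, using a toy word list rather than the real vocabulary file:
c2i, i2c = make_vocabulary(MARKER + ['오늘', '바빠'])
assert c2i['오늘'] == 4 and i2c[4] == '오늘'  # the special tokens occupy indices 0-3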
def enc_processing(value, dictionary, tokenize_as_morph=False):
    # Convert encoder inputs to index sequences, truncated/padded to MAX_SEQUENCE.
    sequences_input_index = []
    sequences_length = []
    if tokenize_as_morph:
        value = prepro_like_morphlized(value)
    for sequence in value:
        sequence = re.sub(CHANGE_FILTER, "", sequence)
        sequence_index = []
        for word in sequence.split():
            if dictionary.get(word) is not None:
                sequence_index.append(dictionary[word])
            else:
                sequence_index.append(dictionary[UNK])  # out-of-vocabulary word
        if len(sequence_index) > MAX_SEQUENCE:
            sequence_index = sequence_index[:MAX_SEQUENCE]
        sequences_length.append(len(sequence_index))  # true length before padding
        sequence_index += (MAX_SEQUENCE - len(sequence_index)) * [dictionary[PAD]]
        sequences_input_index.append(sequence_index)
    return np.asarray(sequences_input_index), sequences_length
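A hedged sketch of enc_processing on a toy vocabulary (my own example, not the dataset): unknown words fall back to <UNK>, and every row is padded out to MAX_SEQUENCE.
toy_dict = {PAD: 0, STD: 1, END: 2, UNK: 3, '오늘': 4, '바빠': 5}
enc_idx, enc_len = enc_processing(['오늘 너무 바빠'], toy_dict)
print(enc_idx[0][:5], enc_len)  # -> [4 3 5 0 0] [3]  ('너무' maps to <UNK>)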
def dec_output_processing(value, dictionary, tokenize_as_morph=False):
    # Build decoder inputs for teacher forcing: prepend <SOS>, then truncate/pad.
    sequences_output_index = []
    sequences_length = []
    if tokenize_as_morph:
        value = prepro_like_morphlized(value)
    for sequence in value:
        sequence = re.sub(CHANGE_FILTER, "", sequence)
        sequence_index = [dictionary[STD]] + [dictionary[word] if word in dictionary
                                              else dictionary[UNK] for word in sequence.split()]
        if len(sequence_index) > MAX_SEQUENCE:
            sequence_index = sequence_index[:MAX_SEQUENCE]
        sequences_length.append(len(sequence_index))
        sequence_index += (MAX_SEQUENCE - len(sequence_index)) * [dictionary[PAD]]
        sequences_output_index.append(sequence_index)
    return np.asarray(sequences_output_index), sequences_length
def dec_target_processing(value, dictionary, tokenize_as_morph=False):
    # Build decoder targets: the same tokens shifted one step, always ending with <END>.
    sequences_target_index = []
    if tokenize_as_morph:
        value = prepro_like_morphlized(value)
    for sequence in value:
        sequence = re.sub(CHANGE_FILTER, "", sequence)
        sequence_index = [dictionary[word] if word in dictionary
                          else dictionary[UNK] for word in sequence.split()]
        if len(sequence_index) >= MAX_SEQUENCE:
            sequence_index = sequence_index[:MAX_SEQUENCE - 1] + [dictionary[END]]
        else:
            sequence_index += [dictionary[END]]
        sequence_index += (MAX_SEQUENCE - len(sequence_index)) * [dictionary[PAD]]
        sequences_target_index.append(sequence_index)
    return np.asarray(sequences_target_index)
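The decoder input and target line up one step apart, which is the teacher-forcing convention; the toy vocabulary from the encoder sketch above makes this visible.
dec_in, _ = dec_output_processing(['오늘 바빠'], toy_dict)
dec_tgt = dec_target_processing(['오늘 바빠'], toy_dict)
print(dec_in[0][:4])   # -> [1 4 5 0]  (<SOS> 오늘 바빠 <PAD>)
print(dec_tgt[0][:4])  # -> [4 5 2 0]  (오늘 바빠 <END> <PAD>)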
PATH = 'ChatBotData_short.csv'
VOCAB_PATH = 'vocabulary.txt'

# Load the raw question/answer pairs and build the vocabulary.
inputs, outputs = load_data(PATH)
char2idx, idx2char, vocab_size = load_vocabulary(PATH, VOCAB_PATH, tokenize_as_morph=False)

# Index the encoder inputs, decoder inputs, and decoder targets.
index_inputs, input_seq_len = enc_processing(inputs, char2idx, tokenize_as_morph=False)
index_outputs, output_seq_len = dec_output_processing(outputs, char2idx, tokenize_as_morph=False)
index_targets = dec_target_processing(outputs, char2idx, tokenize_as_morph=False)

# Collect everything the model side needs into a single config dict.
data_configs = {}
data_configs['char2idx'] = char2idx
data_configs['idx2char'] = idx2char
data_configs['vocab_size'] = vocab_size
data_configs['pad_symbol'] = PAD
data_configs['std_symbol'] = STD
data_configs['end_symbol'] = END
data_configs['unk_symbol'] = UNK

DATA_IN_PATH = './'
TRAIN_INPUTS = 'train_inputs.npy'
TRAIN_OUTPUTS = 'train_outputs.npy'
TRAIN_TARGETS = 'train_targets.npy'
DATA_CONFIGS = 'data_configs.json'

# Persist the index arrays and the config for the training notebook;
# np.save takes the path directly, so no file handles are left open.
np.save(DATA_IN_PATH + TRAIN_INPUTS, index_inputs)
np.save(DATA_IN_PATH + TRAIN_OUTPUTS, index_outputs)
np.save(DATA_IN_PATH + TRAIN_TARGETS, index_targets)
with open(DATA_IN_PATH + DATA_CONFIGS, 'w', encoding='utf-8') as f:
    json.dump(data_configs, f)
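A quick round-trip check that the saved artifacts reload cleanly, using the same paths as above:
loaded_inputs = np.load(DATA_IN_PATH + TRAIN_INPUTS)
with open(DATA_IN_PATH + DATA_CONFIGS, 'r', encoding='utf-8') as f:
    loaded_configs = json.load(f)
assert loaded_inputs.shape == index_inputs.shape
assert loaded_configs['vocab_size'] == vocab_size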
char2idx
{'<PAD>': 0,
'<SOS>': 1,
'<END>': 2,
'<UNK>': 3,
'구하셨나요': 4,
'궁금해': 5,
'뭘': 6,
'갔어': 7,
'설득해보세요': 8,
'보인다': 9,
'오늘': 10,
'승진': 11,
'좋을까': 12,
'나라를': 13,
'가끔은': 14,
'있어도': 15,
'감기': 16,
'빨리': 17,
'줄까': 18,
'하세요': 19,
'그': 20,
'거짓말': 21,
'돈은': 22,
'나왔다': 23,
'필요했던': 24,
'설움': 25,
'걸리겠어': 26,
'안': 27,
'가상화폐': 28,
'가난한': 29,
'바빠': 30,
'켜놓고': 31,
'잘생겼어': 32,
'많이': 33,
'질린다': 34,
'뭐하는지': 35,
'함께': 36,
'달에는': 37,
'필요한': 38,
'같아': 39,
'해': 40,
'혼자인게': 41,
'운동만': 42,
'게': 43,
'결단은': 44,
'남자친구가': 45,
'뭐가': 46,
'끄고': 47,
'싶어': 48,
'쫄딱': 49,
'들어올': 50,
'잊고': 51,
'가만': 52,
'마음을': 53,
'가스불': 54,
'돌아가서': 55,
'나': 56,
'비싼데': 57,
'좋아요': 58,
'때까지': 59,
'따라': 60,
'집에': 61,
'망함': 62,
'혼자를': 63,
'가스비': 64,
'나갔어': 65,
'좋을': 66,
'땀난다': 67,
'적당히': 68,
'따뜻하게': 69,
'믿어줘': 70,
'좋다': 71,
'데려가고': 72,
'땀을': 73,
'교회': 74,
'운동을': 75,
'해보세요': 76,
'너무': 77,
'절약해봐요': 78,
'운동': 79,
'선물로': 80,
'어서': 81,
'생일인데': 82,
'다시': 83,
'켜고': 84,
'나오세요': 85,
'전생에': 86,
'집착하지': 87,
'사람도': 88,
'남자친구': 89,
'자의': 90,
'사세요': 91,
'또': 92,
'좀': 93,
'빠를수록': 94,
'다음': 95,
'훈훈해': 96,
'것': 97,
'마세요': 98,
'생각해보세요': 99,
'가끔': 100,
'새출발': 101,
'즐기세요': 102,
'그럴': 103,
'평소에': 104,
'나온거': 105,
'식혀주세요': 106,
'열': 107,
'더': 108,
'거예요': 109,
'같아요': 110}
idx2char
{0: '<PAD>',
1: '<SOS>',
2: '<END>',
3: '<UNK>',
4: '구하셨나요',
5: '궁금해',
6: '뭘',
7: '갔어',
8: '설득해보세요',
9: '보인다',
10: '오늘',
11: '승진',
12: '좋을까',
13: '나라를',
14: '가끔은',
15: '있어도',
16: '감기',
17: '빨리',
18: '줄까',
19: '하세요',
20: '그',
21: '거짓말',
22: '돈은',
23: '나왔다',
24: '필요했던',
25: '설움',
26: '걸리겠어',
27: '안',
28: '가상화폐',
29: '가난한',
30: '바빠',
31: '켜놓고',
32: '잘생겼어',
33: '많이',
34: '질린다',
35: '뭐하는지',
36: '함께',
37: '달에는',
38: '필요한',
39: '같아',
40: '해',
41: '혼자인게',
42: '운동만',
43: '게',
44: '결단은',
45: '남자친구가',
46: '뭐가',
47: '끄고',
48: '싶어',
49: '쫄딱',
50: '들어올',
51: '잊고',
52: '가만',
53: '마음을',
54: '가스불',
55: '돌아가서',
56: '나',
57: '비싼데',
58: '좋아요',
59: '때까지',
60: '따라',
61: '집에',
62: '망함',
63: '혼자를',
64: '가스비',
65: '나갔어',
66: '좋을',
67: '땀난다',
68: '적당히',
69: '따뜻하게',
70: '믿어줘',
71: '좋다',
72: '데려가고',
73: '땀을',
74: '교회',
75: '운동을',
76: '해보세요',
77: '너무',
78: '절약해봐요',
79: '운동',
80: '선물로',
81: '어서',
82: '생일인데',
83: '다시',
84: '켜고',
85: '나오세요',
86: '전생에',
87: '집착하지',
88: '사람도',
89: '남자친구',
90: '자의',
91: '사세요',
92: '또',
93: '좀',
94: '빠를수록',
95: '다음',
96: '훈훈해',
97: '것',
98: '마세요',
99: '생각해보세요',
100: '가끔',
101: '새출발',
102: '즐기세요',
103: '그럴',
104: '평소에',
105: '나온거',
106: '식혀주세요',
107: '열',
108: '더',
109: '거예요',
110: '같아요'}