--27.Word2Vec.ipynb--
단어의 '의미' 나 '연관성' 을 벡터로 표현
base_path = r'/content/drive/MyDrive/dataset'
!pip install konlpy
import codecs
from bs4 import BeautifulSoup
from konlpy.tag import Okt
from gensim.models import word2vec
import os
# Load the markup-annotated corpus file (UTF-16 encoded) and extract the
# plain novel text from the <body><text> element.
# Fix: the original opened the file without ever closing it — use a context
# manager so the handle is released once BeautifulSoup has consumed it.
with codecs.open(os.path.join(base_path, 'BEXX0003.txt'), 'r', encoding='utf-16') as fp:
    soup = BeautifulSoup(fp, 'html.parser')
body = soup.select_one("body > text")
text = body.getText()
text
# Morphological analysis: split the corpus into lines, POS-tag each line with
# Okt (normalizing spelling and stemming), and keep only content words.
# Produces `results`, a list of space-separated token strings (one per line).
okt = Okt()
results = []
lines = text.split('\n')
for line in lines:
    malist = okt.pos(line, norm=True, stem=True)
    # Exclude particles (Josa), verb endings (Eomi) and punctuation.
    # Fix: the original tag was misspelled 'Pucntuation', so punctuation
    # tokens were never actually filtered out of the corpus.
    r = [surface for surface, tag in malist
         if tag not in ('Josa', 'Eomi', 'Punctuation')]
    rl = (" ".join(r)).strip()
    results.append(rl)
    print(rl)
# Persist the tokenized corpus ("wakati" format: space-separated tokens,
# one sentence per line) so gensim's LineSentence can stream it back later.
wakati_file = os.path.join(base_path, 'toji.wakati')
with open(wakati_file, 'w', encoding='utf-8') as out:
    out.write('\n'.join(results))
# Stream the tokenized corpus and train a skip-gram Word2Vec model on it.
data = word2vec.LineSentence(wakati_file)  # lazily yields one token list per line
data
model = word2vec.Word2Vec(
    data,
    vector_size=200,  # dimensionality of the word vectors
    window=10,        # max distance between target and context word in a sentence
    hs=1,             # train with hierarchical softmax
    min_count=2,      # ignore words that occur fewer than 2 times
    sg=1,             # 1 -> skip-gram (0 would be CBOW)
)
model
# Round-trip the model through disk to demonstrate save/load.
model_path = os.path.join(base_path, 'toji.model')
model.save(model_path)
model = None
model = word2vec.Word2Vec.load(model_path)
model.corpus_count        # number of sentences seen during training
model.corpus_total_words  # total number of tokens in the corpus
# Nearest-neighbour queries on the Toji model: words whose vectors are most
# similar (cosine similarity) to the query word.
model.wv.most_similar(positive=['땅'])  # neighbours of '땅' (land)
model.wv.most_similar(positive=['집'])  # neighbours of '집' (house)
# Switch to a larger pretrained model for analogy queries.
# NOTE(review): assumes 'wiki.model' (presumably trained on Korean Wikipedia
# elsewhere) already exists under base_path — confirm before running.
model = word2vec.Word2Vec.load(os.path.join(base_path, 'wiki.model'))
model.wv.most_similar(positive=['Python', '파이썬'])
# Vector-arithmetic analogies: positive vectors are summed, negative ones
# subtracted, e.g. '아빠' (dad) + '여성' (female) - '남성' (male) ≈ mom.
model.wv.most_similar(positive=['아빠', '여성'], negative=['남성'])
model.wv.most_similar(positive=['왕자', '여성'], negative=['남성'])  # prince -> princess
# '서울' (Seoul) + '일본' (Japan) - '한국' (Korea) ≈ Japan's capital, etc.
model.wv.most_similar(positive=['서울', '일본'], negative=['한국'])
model.wv.most_similar(positive=['서울', '중국'], negative=['한국'])
model.wv.most_similar(positive=['서울','맛집'])  # Seoul + 'good restaurants'
model.wv['고양이']  # raw embedding vector for '고양이' (cat)