[ML] NLP - 워드클라우드(이상한 나라의 앨리스, 스타워즈, 육아휴직 관련 법안)

박미영·2023년 5월 25일

DataSchool StudyNote - ML

목록 보기

19/19

📌워드클라우드

from wordcloud import WordCloud, STOPWORDS

import numpy as np
from PIL import Image

📍이상한 나라의 앨리스

- 데이터 가져오기

# Read the novel text. The context manager closes the file handle as soon as
# the read finishes instead of leaving it open until garbage collection.
with open('../datasets/06_alice.txt') as alice_file:
    text = alice_file.read()

# Load the mask image and convert it to a NumPy array.
alice_mask = np.array(Image.open('../datasets/06_alice_mask.png'))

# Start from wordcloud's built-in stop words and additionally drop 'said'.
stopwords = set(STOPWORDS)
stopwords.add('said')

소설, 이미지 읽기
본문에서 많이 등장하는 said 단어는 stopword 처리

import matplotlib.pyplot as plt
import platform
from matplotlib import font_manager, rc

# Font file whose family name is looked up on Windows.
path="C:/Windows/Fonts/malgun.ttf"

# Choose a matplotlib font family per operating system.
system_name = platform.system()
if system_name == "Darwin":
    rc("font", family="Arial Unicode MS")
elif system_name == "Windows":
    # Ask matplotlib for the family name stored in the font file itself.
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc("font", family=font_name)
else:
    # No font is configured on other systems; only a notice is printed.
    print("Unknown system. sorry~")

%matplotlib inline

그림

# Preview the mask image in greyscale on an 8x8-inch canvas, axes hidden.
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(alice_mask, cmap=plt.cm.gray, interpolation='bilinear')
ax.axis('off')
plt.show()

WordCloud 모듈
WordCloud 모듈은 자체적으로 단어를 추출하여 빈도수를 조사하고 정규화하는 기능을 가지고 있다.

# Word cloud settings: white background, at most 2000 words, the Alice mask
# image, and the customised stop-word set.
cloud_options = {
    "background_color": "white",
    "max_words": 2000,
    "mask": alice_mask,
    "stopwords": stopwords,
}

# Build the cloud from the raw text and keep the generated object in `wc`.
wc = WordCloud(**cloud_options).generate(text)

# Notebook display of the words the cloud extracted.
wc.words_

max_words: 표현할 최대 단어수

그림 위에 워드클라우드 표현

# Render the generated word cloud on a 12x12-inch canvas with the axes hidden.
fig, ax = plt.subplots(figsize=(12, 12))
ax.imshow(wc, interpolation="bilinear")
ax.axis('off')
plt.show()

📍스타워즈

데이터

# Read the text file. The context manager closes the file handle as soon as
# the read finishes instead of leaving it open until garbage collection.
with open('../datasets/06_a_new_hope.txt') as script_file:
    text = script_file.read()

# Rewrite two upper-case spellings so they are counted as 'Han' and 'Luke'.
text = text.replace('HAN', 'Han')
text = text.replace("LUKE'S", "Luke")

# Load the stormtrooper mask image and convert it to a NumPy array.
mask = np.array(Image.open('../datasets/06_stormtrooper_mask.png'))

STOPWORDS 지정

# wordcloud's built-in stop words plus 'int' and 'ext'
# (presumably the screenplay's INT./EXT. scene markers -- TODO confirm).
stopwords = set(STOPWORDS)
stopwords.update(('int', 'ext'))

word cloud 설정

# Configure the cloud: up to 1000 words inside the mask, a margin of 10, and
# a fixed random_state.
wc = WordCloud(
    max_words=1000,
    mask=mask,
    stopwords=stopwords,
    margin=10,
    random_state=1,
)
# generate() fills the cloud in place; the original chained call rebound `wc`
# to its return value, which is the same object.
wc.generate(text)

# Keep the default-coloured rendering as an image array.
default_colors = wc.to_array()

그레이톤으로 그리기 위한 색상함수 정의

import random


def grey_color_func(
        word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a random light-grey HSL color string for one word.

    Intended as a ``color_func`` for ``WordCloud.recolor``. The word, font
    size, position and orientation are accepted but do not affect the color.

    Args:
        word: Ignored.
        font_size: Ignored.
        position: Ignored.
        orientation: Ignored.
        random_state: Optional ``random.Random`` instance, or an int seed,
            used to draw the lightness. When ``None`` the module-level
            ``random`` generator is used.
        **kwargs: Ignored. Also absorbs the previously misspelled
            ``random_State`` keyword so old keyword calls keep working.

    Returns:
        A string of the form ``"hsl(0, 0%, N%)"`` with N between 60 and 100.
    """
    # The parameter used to be spelled `random_State`, so the `random_state`
    # keyword supplied by the caller was swallowed by **kwargs and the colors
    # were never reproducible. Honor it now.
    if random_state is None:
        rng = random
    elif isinstance(random_state, int):
        rng = random.Random(random_state)
    else:
        rng = random_state
    return "hsl(0, 0%%, %d%%)" % rng.randint(60, 100)

그리기

# Draw the grey-toned cloud on a 12x12-inch canvas with the axes hidden.
plt.figure(figsize=(12, 12))

# Re-color the existing cloud with the grey color function.
grey_cloud = wc.recolor(color_func=grey_color_func, random_state=3)
plt.imshow(grey_cloud, interpolation='bilinear')

plt.axis('off')
plt.show()

📍육아휴직관련법안

육아휴직관련 법안 대한민국 국회 제 1809890호 의안

KoNLPy는 대한민국 국회 의안 말뭉치(kobill)를 내장하고 있다.

- 법령 가져오기

import nltk
from konlpy.corpus import kobill

# Read bill no. 1809890 from the kobill corpus. The context manager closes
# the file handle right after reading instead of leaving it open.
with kobill.open("1809890.txt") as bill_file:
    doc_ko = bill_file.read()

- Twitter 엔진 (최신 KoNLPy에서는 Okt로 이름이 변경됨)

명사 분석

# `Twitter` was renamed to `Okt` in newer KoNLPy releases; the old name only
# remains as a deprecated alias that emits a warning. Prefer `Okt` and fall
# back to the old class on installations that do not provide it yet.
try:
    from konlpy.tag import Okt
except ImportError:
    from konlpy.tag import Twitter as Okt

t = Okt()
# Extract the nouns from the bill text.
tokens_ko = t.nouns(doc_ko)
tokens_ko

토큰(빈도수 포함) 분석

# Wrap the noun tokens in an nltk.Text so its counting and plotting helpers
# can be used.
ko = nltk.Text(tokens_ko, name='대한민국 국회 의안 제 1809890호')

# Total number of tokens, then the number of distinct tokens.
total_token_count = len(ko.tokens)
unique_token_count = len(set(ko.tokens))
print(total_token_count)
print(unique_token_count)

# Notebook display of the token frequency distribution.
ko.vocab()

- 시각화

# Chart the 50 most frequent tokens of the bill on a 12x6-inch figure.
plt.figure(figsize=(12, 6))
ko.plot(50)
plt.show()

- stopword

한글 stopword는 상황에 따라 복잡해서 일단 그냥 작성

# Hand-written stop-word list: ASCII tokens first, then single-syllable
# Korean tokens. Order and contents are unchanged.
ascii_tokens = [".", "(", ")", ",", "'", "%", "-", "X", ").", "x"]
hangul_tokens = ["의", "자", "에", "안", "번", "호", "을", "이", "다", "만", "로", "가", "를"]
stop_words = ascii_tokens + hangul_tokens

# Keep every token that is not a stop word.
ko = [token for token in ko if token not in stop_words]
ko

# Re-wrap the filtered tokens so the nltk.Text helpers are available again.
ko = nltk.Text(ko, name="대한민국 국회 의안 제 1809890호")

# Chart the 50 most frequent tokens after filtering.
plt.figure(figsize=(12, 6))
ko.plot(50)
plt.show()

- 특정 단어 빈도수 조사

# Number of times "초등학교" occurs in `ko`.
ko.count("초등학교")

# Show where each of the three words occurs along the token sequence.
plt.figure(figsize=(12, 6))
ko.dispersion_plot(["육아휴직", "초등학교", "공무원"])

concordance

# Print each occurrence of "초등학교" together with its surrounding context.
ko.concordance("초등학교")

- collocations

collocation은 문장 안에서 함께 위치하는 단어들, 즉
어휘의 조합 또는 짝을 이루는 말을 의미한다.

# The 150 most frequent tokens as (word, count) pairs.
data = ko.vocab().most_common(150)

# Resolve the font file of the family matplotlib is currently configured with,
# instead of hard-coding "C:/Windows/Fonts/malgun.ttf", which exists only on
# Windows and makes this cell fail on macOS/Linux.
# NOTE: if no Korean-capable font has been configured, matplotlib falls back
# to its default font and Hangul glyphs may not render.
korean_font_path = font_manager.findfont(font_manager.FontProperties())

# Build the cloud directly from the word -> count mapping.
wordcloud = WordCloud(
    font_path=korean_font_path,
    relative_scaling=0.2,
    background_color='white',
).generate_from_frequencies(dict(data))

# Show the cloud on a 12x8-inch figure with the axes hidden.
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()