◾Natural Language Processing - Analyzing a Parental Leave Bill
Parental leave bill: Bill No. 1809890 of the National Assembly of the Republic of Korea
import nltk
from konlpy.corpus import kobill
files_ko = kobill.fileids()
doc_ko = kobill.open("1809890.txt").read()
doc_ko
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2Fe04465e1-9828-4ed4-a913-111fb03f9bb3%2Fimage.png)
from konlpy.tag import Okt
t = Okt()
tokens_ko = t.nouns(doc_ko)
tokens_ko[:10]
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F6382e5c0-850c-48c9-ae73-6e63e82452d0%2Fimage.png)
ko = nltk.Text(tokens_ko, name="대한민국 국회 의안 제 1809890호")
ko
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2Fd45ec27b-23b3-4351-8804-759823437005%2Fimage.png)
print(len(ko.tokens))
print(len(set(ko.tokens)))
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F6a43faf9-04c2-4580-8667-721c6a3c82f3%2Fimage.png)
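The ratio of the two counts above is the lexical diversity of the document (unique tokens over total tokens); as a one-line check:

print(len(set(ko.tokens)) / len(ko.tokens))  # lexical diversity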
import matplotlib.pyplot as plt
import set_matplotlib_korean
plt.figure(figsize=(12, 6))
ko.plot(50)
plt.show()
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2Fd642cfdf-d380-46d4-b59f-7e827c193260%2Fimage.png)
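The plot is backed by an NLTK FreqDist, so the same frequencies can also be read off numerically:

fd = ko.vocab()            # FreqDist over the tokens
print(fd.most_common(10))  # ten most frequent tokens with their counts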
stop_words = [
    '.', '(', ')', ',', "'", '%', '-', 'X', 'x',
    '의', '자', '에', '안', '번', '호', '을', '이', '다', '만', '로', '가', '를',
]
ko = [each_word for each_word in ko if each_word not in stop_words]
ko[:10]
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F0c6c310d-8b81-40b8-81f7-9abd296467a4%2Fimage.png)
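A hypothetical variant of this filter: most of the particles and punctuation above are single characters, so dropping one-character tokens achieves much of the same effect without enumerating them (`ko_filtered` is just an illustrative name):

ko_filtered = [w for w in ko if len(w) > 1]  # keep multi-character tokens only
ko_filtered[:10]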
ko = nltk.Text(ko, name='대한민국 국회 의안 제 1809890호')
plt.figure(figsize=(12, 6))
ko.plot(50)
plt.show()
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2Fb8be6533-ae70-4830-b4eb-015584aad461%2Fimage.png)
ko.count("초등학교")
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2Ffd4d68a2-778a-4dc4-b96a-98f7c859d045%2Fimage.png)
plt.figure(figsize=(12, 6))
ko.dispersion_plot(["육아휴직", "초등학교", "공무원"])
plt.show()
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F00e8a896-e684-421c-8d97-e5a559ee119b%2Fimage.png)
Collocation
: words that occur together, i.e., word combinations or habitual pairings of words.
(`concordance()` below is related but distinct: it prints each occurrence of a word along with its surrounding context.)
ko.concordance("초등학교")
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F05eb830b-6e1f-4a20-8e8a-e7b873e42168%2Fimage.png)
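NLTK can also surface collocations directly, matching the definition above; note that `collocations()` prints its result instead of returning it:

ko.collocations()  # prints frequent bigram collocations found in the text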
ko.vocab()
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F841606b6-7779-4a10-a38f-77645ba2d907%2Fimage.png)
from wordcloud import WordCloud
data = ko.vocab().most_common(150)
wordcloud = WordCloud(
    font_path="C:/Windows/Fonts/malgun.ttf",
    relative_scaling=0.2,
    background_color="white"
).generate_from_frequencies(dict(data))
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2Ff043eef7-a834-40e1-b1ac-8efe9b43a888%2Fimage.png)
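If the cloud needs to be reused outside the notebook, WordCloud can also write the rendered image straight to a file (the filename here is just an example):

wordcloud.to_file("bill_wordcloud.png")  # save the rendered cloud as a PNG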
◾Naive Bayes Classifier
Naive Bayes classification
- A probabilistic classifier used in machine learning, based on Bayes' theorem with an assumption of independence between the features (see the factorized posterior below).
- With appropriate preprocessing, it remains competitive with more advanced methods.
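Under that independence assumption, the posterior over a class $c$ factorizes over the words $w_1, \dots, w_n$:

$$P(c \mid w_1, \dots, w_n) \;\propto\; P(c)\prod_{i=1}^{n} P(w_i \mid c)$$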
1. Sentiment analysis: English
from nltk.tokenize import word_tokenize
import nltk
train = [
    ('i like you', 'pos'),
    ('i hate you', 'neg'),
    ('you like me', 'neg'),
    ('i like her', 'pos'),
]
all_words = set(
    word.lower() for sentence in train for word in word_tokenize(sentence[0])
)
all_words
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2Ffd83dd14-843c-436f-b7ee-8429b5486a95%2Fimage.png)
t = [({word : (word in word_tokenize(x[0])) for word in all_words}, x[1]) for x in train]
t
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F257a59f1-4d79-4a7c-963b-5b93940cdac7%2Fimage.png)
- Train the Naive Bayes classifier.
- It is called "naive" because the probability of each word is computed independently.
classifier = nltk.NaiveBayesClassifier.train(t)
classifier.show_most_informative_features()
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F4068f1d6-5eae-4ef8-bf7b-c039df6de0c3%2Fimage.png)
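As a quick sanity check, NLTK can score the classifier; evaluated on the training set itself this only measures fit, not generalization:

print(nltk.classify.accuracy(classifier, t))  # accuracy on the training data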
test_sentence = "i like Merui"
test_sent_features = {
    word.lower(): (word in word_tokenize(test_sentence.lower())) for word in all_words
}
test_sent_features
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F738be7ca-c70b-48ff-8528-190ff169d5e2%2Fimage.png)
classifier.classify(test_sent_features)
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2Ffe3b57d9-61a6-40b9-b3fb-0560be9275b9%2Fimage.png)
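Beyond the hard label, `prob_classify()` exposes the per-class probabilities behind the decision:

dist = classifier.prob_classify(test_sent_features)
for label in dist.samples():
    print(label, round(dist.prob(label), 3))  # each label with its probability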
2. Sentiment analysis: Korean
from konlpy.tag import Okt
pos_tagger = Okt()
train = [
    ('메리가 좋아', 'pos'),
    ('고양이도 좋아', 'pos'),
    ('난 수업이 지루해', 'neg'),
    ('메리는 이쁜 고양이야', 'pos'),
    ('난 마치고 메리랑 놀거야', 'pos'),
]
- Without morphological analysis
- 메리가, 메리는, and 메리랑 are all recognized as different words.
all_words = set(
    word.lower() for sentence in train for word in word_tokenize(sentence[0])
)
all_words
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F9f78a899-084d-4fb7-9db6-df87f5ae3cbe%2Fimage.png)
t = [({word : (word in word_tokenize(x[0])) for word in all_words}, x[1]) for x in train]
t
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F62f7d153-e566-431f-81fe-a599a1eebbe9%2Fimage.png)
classifier = nltk.NaiveBayesClassifier.train(t)
classifier.show_most_informative_features()
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F9ee862eb-55ef-4361-a3a7-15a432f7087c%2Fimage.png)
test_sentence = "난 수업이 마치면 메리랑 놀거야"
test_sent_features = {
    word.lower(): (word in word_tokenize(test_sentence.lower())) for word in all_words
}
test_sent_features
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F3175b7ca-afc5-4a87-8882-420bc350180f%2Fimage.png)
classifier.classify(test_sent_features)
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F783a1c8d-8053-4dcc-9e19-ca32f7add825%2Fimage.png)
- With morphological analysis
- After morphological analysis, the part-of-speech tag is appended to each word.
def tokenize(doc):
    # Morphological analysis: normalize and stem, then join each (morpheme, POS) pair as "morpheme/POS"
    return ["/".join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]
train_docs = [(tokenize(row[0]), row[1]) for row in train]
train_docs
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F24bd6347-3670-4c41-825c-e180d2b83002%2Fimage.png)
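For a single sentence, `tokenize()` yields morpheme/POS pairs; the exact tags may vary with the KoNLPy/Okt version, but roughly:

print(tokenize('메리가 좋아'))
# e.g. ['메리/Noun', '가/Josa', '좋다/Adjective']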
tokens = [t for d in train_docs for t in d[0]]
tokens
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2Fe9e2a785-b4ba-4fe4-8f8a-3764d53abc9f%2Fimage.png)
def term_exists(doc):
    # Boolean bag-of-words features: is each known token present in this document?
    return {word: (word in set(doc)) for word in tokens}
train_xy = [(term_exists(d), c) for d, c in train_docs]
train_xy
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F7560d3d8-b316-4596-b944-695bacf4082e%2Fimage.png)
classifier = nltk.NaiveBayesClassifier.train(train_xy)
classifier.show_most_informative_features()
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F520bd401-df2d-40a6-998e-d1e7e6e000f6%2Fimage.png)
test_sentence = "난 수업이 마치면 메리랑 놀거야"
test_docs = tokenize(test_sentence)
test_docs
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F3c7493ac-6000-47ac-97eb-4af3e9db6f70%2Fimage.png)
test_sent_features = {word : (word in test_docs) for word in tokens}
test_sent_features
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2Fb59d4c04-bc50-4e77-b495-f7cb6e8e357d%2Fimage.png)
- The result looks wrong, but I'm not sure where the problem is; I'll fix it once I find the cause.
- One plausible factor: with only five training sentences, the test sentence shares tokens such as '수업/Noun' with the lone negative example, which can pull the prediction toward neg (see the debugging sketch below).
classifier.classify(test_sent_features)
![](https://velog.velcdn.com/images%2Fskarb4788%2Fpost%2F1ddc1393-e367-44df-8bc7-24070b4485db%2Fimage.png)
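A small debugging sketch, assuming the objects above are still in scope: listing which vocabulary features actually fire for the test sentence shows whether tokens shared with the lone negative training example are dominating the decision.

fired = [w for w, present in test_sent_features.items() if present]
print(fired)  # features present in the test sentence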