머신러닝 15일차

ParkJinYoung·2022년 11월 7일

텍스트 마이닝

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

# Chain the steps in order: raw text -> bag-of-words counts -> linear SVM.
# CountVectorizer and LinearSVC are imported here because this cell uses them
# and would otherwise raise NameError when the notebook is run top to bottom.
# NOTE(review): text_train / y_train are not defined above this cell;
# presumably they were loaded in an earlier notebook session -- confirm.
pipe_model = make_pipeline(CountVectorizer(), LinearSVC())
pipe_model.fit(text_train, y_train)

# Predict the label of a single unseen review.
reviews = ['This movie so good']
pipe_model.predict(reviews)

# Run a grid search over the pipeline's hyperparameters.
from sklearn.model_selection import GridSearchCV

# Each key is "<pipeline step name>__<parameter of that step>".
param_grid = {
    'countvectorizer__max_df': [300, 500, 700],
    'countvectorizer__min_df': [10, 20, 30],
    'countvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'linearsvc__C': [0.01, 0.1, 1, 10, 100],
}
grid = GridSearchCV(
    estimator=pipe_model,
    param_grid=param_grid,
    cv=3,
    verbose=2,
)

grid.fit(text_train, y_train)

# Best parameter combination found by the search.
grid.best_params_

# Best score reached during the search.
grid.best_score_

# Score on the training data, then on the held-out test data.
grid.score(text_train, y_train)

grid.score(text_test, y_test)

# Build the final pipeline with fixed hyperparameters
# (presumably the best combination reported by the grid search -- confirm).
bow_step = CountVectorizer(max_df=300, min_df=10, ngram_range=(1, 3))
svm_step = LinearSVC(C=0.01)
final_model = make_pipeline(bow_step, svm_step)

final_model.steps

final_model.fit(text_train, y_train)

# Pull the vectorizer (step 0) and the classifier (step 1) out of the pipeline.
final_cv, final_svm = [estimator for _, estimator in final_model.steps]

# Number of entries in the learned vocabulary.
len(final_cv.vocabulary_)

# Number of per-word weights held by the classifier.
len(final_svm.coef_[0])

import pandas as pd

# Build a (word, feature index) table from the fitted vocabulary:
# one ROW per word, column 0 = the word, column 1 = its index number.
# The frame must be N x 2 (not 2 x N) so that sorting by column 1 orders the
# words and so that the N coefficients below fit as a new column.
df = pd.DataFrame(list(final_cv.vocabulary_.items()), columns=[0, 1])
df.head()

# Order the words by their index number so that row i matches coef_[0][i].
df_sorted = df.sort_values(by=1)
df_sorted

# Attach each word's weight; the array is assigned positionally, which is why
# the rows had to be put in index order first.
df_sorted['coef'] = final_svm.coef_[0]
df_sorted

# Re-sort by weight: smallest weights first, largest last.
df_sorted.sort_values(by='coef', inplace=True)
df_sorted

import matplotlib.pyplot as plt

# Visualisation: df_sorted is ordered by ascending weight, so the first 30
# rows are the negative words and the last 30 rows are the positive words.
negative_words = df_sorted.head(30)
positive_words = df_sorted.tail(30)
top30_df = pd.concat([negative_words, positive_words])

plt.figure(figsize=(15, 5))  # width, height
plt.bar(top30_df[0], top30_df['coef'])  # x axis: word, y axis: weight
plt.xticks(rotation=90)  # rotate the x tick labels
plt.show()

네이버 영화평점 리뷰

# JPype: bridge between Java and Python
# (presumably a prerequisite of konlpy, which is installed last -- confirm).
# Relies on the system "Path" environment variable
# (presumably to locate the Java installation -- confirm).
!pip install --upgrade pip

# Installs JPype from a local wheel file; per its filename the wheel targets
# CPython 3.8 on 64-bit Windows, and it must be in the working directory.
!pip install JPype1-1.1.2-cp38-cp38-win_amd64.whl

!pip install konlpy

from konlpy.tag import Kkma, Okt

# Using CountVectorizer together with KoNLPy.
# Author's notes: KoNLPy provides four morphological analyzers,
#   Komoran, Hannanum, Okt, Kkma
# - the further left in that list, the shorter the run time
# - the further right, the more accurate (finer-grained) the analysis

sentence = '아버지가방에들어가신다'

okt = Okt()
kkma = Kkma()

# Split the sentence into morphemes.
okt.morphs(sentence)

# Tag sets (kinds of morphemes) used by each analyzer.
okt.tagset

kkma.tagset

# Split into morphemes and list the tag assigned to each one.
okt.pos(sentence)

kkma.pos(sentence)

# Extract nouns only.
okt.nouns(sentence)

!pip install sklearn

from sklearn.feature_extraction.text import CountVectorizer

text = [
    '이제 목소리 괜찮네',
    '솔직히 오전에는 별로였음',
    '지금 목소리가 더 좋은 목소리네요',
]

# With no custom tokenizer the text is tokenized on whitespace units
# (author's note).
cv = CountVectorizer()
cv.fit(text)  # build the vocabulary
cv.vocabulary_  # inspect the vocabulary

def mytokenizer(text):
    """Return only the nouns of *text*, as extracted by the module-level `okt` tagger."""
    nouns = okt.nouns(text)
    return nouns
    
# Build a vocabulary from nouns only by plugging in the noun tokenizer.
cv_okt = CountVectorizer(tokenizer=mytokenizer).fit(text)
cv_okt.vocabulary_

# Sentiment analysis on the Naver movie review dataset

# Install pandas, which the following cell imports.
!pip install pandas
import numpy as np
import pandas as pd

# Load the tab-separated train / test files.
text_train = pd.read_csv('./data/ratings_train.txt', delimiter='\t')
text_test = pd.read_csv('./data/ratings_test.txt', delimiter='\t')

text_train.info()
text_test.info()

# Drop rows that contain missing values.
text_train.dropna(inplace=True)
text_test.dropna(inplace=True)

# Work on a small subset: the first 1500 training rows and first 500 test rows.
X_train = text_train['document'][:1500]
y_train = text_train['label'][:1500]
X_test = text_test['document'][:500]
# Test labels must come from the test file so that they pair with X_test;
# taking them from text_train would score predictions against unrelated labels.
y_test = text_test['label'][:500]

# ex10: pipeline(BOW, SVM)    -> fit -> inspect the weights
# ex11: pipeline(TF-IDF, SVM) -> fit -> inspect the weights
# BOW: tokenize, then vectorize
# - treats words that appear often as the important words
# TF-IDF: tokenize, then vectorize
# - appearing often does not by itself make a word important
# - word importance is computed in a different way
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

# TF-IDF features over nouns only, fed into a linear SVM; fit in one step.
pipe_model = make_pipeline(
    TfidfVectorizer(tokenizer=mytokenizer),
    LinearSVC(),
).fit(X_train, y_train)

# Score on the training subset, then on the test subset.
pipe_model.score(X_train, y_train)

pipe_model.score(X_test, y_test)

# Predict the label of a single short review.
pipe_model.predict(['재미있다'])

# Pull the vectorizer (step 0) and the classifier (step 1) out of the pipeline.
tfidf, svm = [estimator for _, estimator in pipe_model.steps]

# First row: the words; second row: their index numbers.
vocab = tfidf.vocabulary_
df = pd.DataFrame([vocab.keys(), vocab.values()])
df.head()

# Transpose so each word becomes a row: column 0 = word, column 1 = index.
df = df.T
df.head()

# Order the words by index number so that row i matches coef_[0][i].
df = df.sort_values(by=1)
df

# Attach each word's weight (assigned positionally).
df['coef'] = svm.coef_[0]
df

# Re-sort by weight: smallest weights first, largest last.
df = df.sort_values(by='coef')
df

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc

# df is ordered by ascending weight, so the first 30 rows are the negative
# words and the last 30 rows are the positive words.
top30_df = pd.concat([
    df.head(30),
    df.tail(30),
])

# Keep the '-' (minus) sign rendering correctly once the Korean font is active.
matplotlib.rcParams['axes.unicode_minus'] = False

# Switch the plot font to Malgun Gothic so the Korean labels render.
# Raw string: the Windows path contains backslashes, and sequences such as
# "\W", "\F" and "\m" are invalid escapes in a normal string literal.
font_name = font_manager.FontProperties(fname=r"C:\Windows\Fonts\malgun.ttf").get_name()
rc('font', family=font_name)

plt.figure(figsize=(15, 5))  # width, height
plt.bar(top30_df[0], top30_df['coef'])  # x axis: word, y axis: weight
plt.xticks(rotation=90)  # rotate the x tick labels
plt.show()

ParkJinYoung

꾸준히

이전 포스트

안드로이드 19일차

다음 포스트

머신러닝 15일차

텍스트 마이닝

네이버 영화평점 리뷰

안드로이드 19일차

면접관련

0개의 댓글