텍스트 마이닝
# Baseline sentiment classifier: bag-of-words counts fed into a linear SVM.
# CountVectorizer and LinearSVC are imported here because this section uses
# them before the file's later import lines for those names are reached.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

pipe_model = make_pipeline(CountVectorizer(), LinearSVC())
# NOTE(review): text_train / y_train are not defined above this point in the
# visible file; presumably raw review strings and their labels loaded
# earlier -- confirm.
pipe_model.fit(text_train, y_train)

# Sanity check on a single hand-written review.
reviews = ['This movie so good']
pipe_model.predict(reviews)
from sklearn.model_selection import GridSearchCV
# Search space for the grid search. Each key is "<pipeline step>__<parameter>";
# the step prefixes match the two estimators in the pipeline (CountVectorizer
# and LinearSVC).
pram_gird = dict(
    countvectorizer__max_df=[300, 500, 700],
    countvectorizer__min_df=[10, 20, 30],
    countvectorizer__ngram_range=[(1, 1), (1, 2), (1, 3)],
    linearsvc__C=[0.01, 0.1, 1, 10, 100],
)
# Cross-validated (3 folds) search over every combination in pram_gird;
# verbose=2 makes the search print progress while it runs.
grid = GridSearchCV(
    estimator=pipe_model,
    param_grid=pram_gird,
    cv=3,
    verbose=2,
)
grid.fit(text_train, y_train)

# Best parameter combination found and the score it achieved.
grid.best_params_
grid.best_score_

# Score of the fitted search object on the training data, then the test data.
grid.score(text_train, y_train)
grid.score(text_test, y_test)
# Rebuild the pipeline with one fixed parameter set and inspect its fitted parts.
# NOTE(review): presumably these values were copied from grid.best_params_ -- confirm.
final_model = make_pipeline(
    CountVectorizer(max_df=300, min_df=10, ngram_range=(1, 3)),
    LinearSVC(C=0.01),
)
final_model.steps
final_model.fit(text_train, y_train)

# steps is a list of (name, estimator) pairs: vectorizer first, SVM second.
final_cv, final_svm = [estimator for _step_name, estimator in final_model.steps]
len(final_cv.vocabulary_)  # number of terms in the learned vocabulary
len(final_svm.coef_[0])    # number of learned weights; compare with the line above
import pandas as pd
# Build a term/weight table from the fitted vectorizer and SVM.
# The constructor receives two sequences (all terms, all feature indices) and
# therefore produces 2 rows; transpose so that each term is one ROW with
# column 0 = term and column 1 = feature index. Without the transpose the
# sort on column 1 and the per-term coefficient assignment below cannot work.
df = pd.DataFrame([final_cv.vocabulary_.keys(),
                   final_cv.vocabulary_.values()]).T
df.head()

# Order rows by feature index so that row i lines up with final_svm.coef_[0][i];
# the coefficient array is attached by position, so this sort must come first.
df_sorted = df.sort_values(by=1)
df_sorted
df_sorted['coef'] = final_svm.coef_[0]
df_sorted

# Re-sort by weight: smallest (most negative) first, largest last.
df_sorted.sort_values(by='coef', inplace=True)
df_sorted

# Keep the 30 lowest- and 30 highest-weighted terms for plotting.
top30_df = pd.concat([
    df_sorted.head(30),
    df_sorted.tail(30),
])
import matplotlib.pyplot as plt
# Bar chart of the terms in top30_df (column 0) against their 'coef' values.
plt.figure(figsize=(15, 5))
plt.bar(x=top30_df[0], height=top30_df['coef'])
plt.xticks(rotation=90)  # draw the term labels vertically
plt.show()
네이버 영화평점 리뷰
# Environment setup for KoNLPy, run through IPython shell escapes.
# NOTE(review): the JPype1 wheel is a local file whose name is tagged
# cp38 / win_amd64, i.e. CPython 3.8 on 64-bit Windows; it must be present in
# the working directory and will not install on other interpreters or
# platforms -- confirm it is still needed before reusing this cell.
!pip install --upgrade pip
!pip install JPype1-1.1.2-cp38-cp38-win_amd64.whl
!pip install konlpy
# Compare two KoNLPy analysers (Okt and Kkma) on the same unspaced sentence.
from konlpy.tag import Kkma, Okt

# Okt: split into morphemes, then look at the tag set it uses.
okt = Okt()
okt.morphs('아버지가방에들어가신다')
okt.tagset

# Kkma: a second analyser with its own tag set.
kkma = Kkma()
kkma.tagset

# Part-of-speech tagging with each analyser, then noun extraction with Okt.
okt.pos('아버지가방에들어가신다')
kkma.pos('아버지가방에들어가신다')
okt.nouns('아버지가방에들어가신다')
!pip install sklearn
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
text=['이제 목소리 괜찮네','솔직히 오전에는 별로였음','지금 목소리가 더 좋은 목소리네요']
cv.fit(text)
cv.vocabulary_
def mytokenizer(text):
    """Return the nouns that the Okt analyser extracts from *text*.

    Used as the ``tokenizer`` callable of the vectorizers in this file, so it
    receives one document and returns that document's tokens. Relies on the
    module-level ``okt`` instance being created before the first call.
    """
    return okt.nouns(text)
# Same toy corpus as above, but tokenised into Okt nouns via mytokenizer.
cv_okt = CountVectorizer(tokenizer=mytokenizer).fit(text)
cv_okt.vocabulary_
!pip install pandas
import numpy as np
import pandas as pd
text_train = pd.read_csv('./data/ratings_train.txt', delimiter = '\t')
text_test = pd.read_csv('./data/ratings_test.txt', delimiter = '\t')
text_train.info()
text_test.info()
text_train.dropna(inplace=True)
text_test.dropna(inplace=True)
X_train = text_train['document'][:1500]
X_test = text_test['document'][:500]
y_train = text_train['label'][:1500]
y_test= text_train['label'][:500]
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
# TF-IDF features built from Okt nouns (via mytokenizer), classified by a linear SVM.
pipe_model = make_pipeline(
    TfidfVectorizer(tokenizer=mytokenizer),
    LinearSVC(),
)
pipe_model.fit(X_train, y_train)

# Score on the training subset, then on the test subset.
pipe_model.score(X_train, y_train)
pipe_model.score(X_test, y_test)

# Spot-check one short review.
pipe_model.predict(['재미있다'])
# Pull the two pipeline steps back out: vectorizer first, classifier second.
tfidf = pipe_model.steps[0][1]
svm = pipe_model.steps[1][1]
# Two input sequences (terms, feature indices) give a 2-row frame here ...
df = pd.DataFrame([tfidf.vocabulary_.keys(),
tfidf.vocabulary_.values()])
df.head()
# ... so transpose: one row per term, column 0 = term, column 1 = feature index.
df = df.T
df.head()
# Sort by feature index BEFORE attaching the weights: the coefficient array is
# assigned by position, so row i must correspond to svm.coef_[0][i].
df.sort_values(by=1,inplace=True)
df
df['coef'] = svm.coef_[0]
df
# Re-order by weight, smallest first.
df.sort_values(by='coef',inplace=True)
df
# Keep the 30 lowest- and 30 highest-weighted terms for plotting.
top30_df = pd.concat([
df.head(30),
df.tail(30)
])
import matplotlib
# Use the ASCII hyphen for negative tick labels instead of the Unicode minus
# sign. NOTE(review): presumably needed because the font selected below lacks
# that glyph -- confirm.
matplotlib.rcParams['axes.unicode_minus'] = False
from matplotlib import font_manager, rc
# Raw string: the backslashes in the Windows path are literal characters. The
# non-raw form relied on invalid escape sequences (\W, \F, \m), which Python
# warns about. NOTE(review): this path exists only on Windows.
font_name = font_manager.FontProperties(fname=r"C:\Windows\Fonts\malgun.ttf").get_name()
rc('font', family=font_name)

# Bar chart of the terms in top30_df (column 0) against their 'coef' values.
plt.figure(figsize=(15, 5))
plt.bar(top30_df[0], top30_df['coef'])
plt.xticks(rotation=90)  # draw the term labels vertically
plt.show()