▶[코드스니펫] - 2주차 숙제 정답 코드
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the tab-separated review file from the URL; the two columns are
# named 'ratings' and 'reviews' here via `names`.
df = pd.read_table(
    'https://raw.githubusercontent.com/bab2min/corpus/master/sentiment/naver_shopping.txt',
    names=['ratings', 'reviews'],
)

# Report how many distinct review texts the raw data contains.
unique_review_count = df['reviews'].nunique()
print(unique_review_count)

# Remove duplicate samples: rows whose review text repeats an earlier row
# are dropped from `df` in place.
df.drop_duplicates(subset=['reviews'], inplace=True)

# Bar chart of the number of remaining reviews per rating value.
rating_counts = df['ratings'].value_counts()
rating_counts.plot.bar()

# The same per-rating counts printed as a table with a 'count' column.
rating_table = df.groupby('ratings').size().reset_index(name='count')
print(rating_table)
###########################
!pip install konlpy
from konlpy.tag import Okt
tokenizer = Okt()
df['tokenized'] = df['reviews'].apply(tokenizer.nouns)
# 리뷰 점수가 4~5점이면 1, 리뷰 점수가 1~2면 0의 값을 줍니다.
df['label'] = np.select([df.ratings > 3], [1], default=0)
df.head()
positive_reviews = np.hstack(df[df['label']==1]['tokenized'].values)
negative_reviews = np.hstack(df[df['label']==0]['tokenized'].values)
############################
https://colab.research.google.com/drive/1nLJc_QJiRl7DWB8ALQojxTIgzh06s-dW?usp=sharing