NLTK, torchtext
tokenize: corpus를 token 단위로 나누는 작업
corpus(복수형: corpora): 자연언어 연구를 위해 특정한 목적을 가지고 언어의 표본을 추출한 집합
# NLTK - Natural Language Toolkit
import pandas as pd
import nltk
import re

df = pd.read_csv("/Users/사용자이름/netflix_reviews.csv")
The only reason I didn't give it four stars is that In my opinion, there are too many foreign films. When I'm done working hard all day. I don't want to have to read subtitles. UPDATE: 10/19/24
cleaned = re.sub('[^a-zA-Z]', ' ', sample) : 알파벳이 아닌 경우 공백 처리
cleaned.lower() : 소문자화
출력: the only reason i didn t give it four stars is that in my opinion there are too many foreign films when i m done working hard all day i don t want to have to read subtitles update
from nltk.corpus import stopwords : 불용어 목록 불러오기
nltk.download('stopwords') : NLTK는 패키지 설치 시 모든 데이터를 다운로드하지 않음, 모듈마다 필요한 데이터를 따로 다운로드
eng_stopwords = stopwords.words('english') : 영어 불용어 목록 불러오기
['i',
'me',
'my',
...
"won't",
'wouldn',
"wouldn't"]
sample.split() : 공백을 기준으로 단어를 끊어서 나열, 불용어 목록과 대조하기 위해 사용
['The',
'only',
'reason',
...
'BEGINNING!!!!!!',
"I'M",
'DONE']
# isinstance: check an object's type; ' '.join(): merge a token list back into one string
def preprocessing(sentence, stopwords=None):
    """Clean one review: keep letters only, lowercase, and drop stopwords.

    Args:
        sentence: raw review text; missing rows arrive from pandas as NaN (float).
        stopwords: optional stopword collection; defaults to the module-level
            eng_stopwords list (NLTK English stopwords) when None.

    Returns:
        The cleaned review as a single space-joined lowercase string.
    """
    if stopwords is None:
        stopwords = eng_stopwords  # NLTK English stopword list loaded above
    # Missing reviews come through pandas as NaN (a float) -> empty string
    if isinstance(sentence, float):
        return ''
    cleaned = re.sub('[^a-zA-Z]', ' ', sentence)  # non-letters -> spaces
    cleaned = cleaned.lower()
    # keep only the words absent from the stopword list
    kept = [word for word in cleaned.split() if word not in stopwords]
    return ' '.join(kept)  # re-join the tokens produced by .split()
# Series.apply(): run a function over every element of the column
reviews = df['content']
cleaned_reviews = reviews.apply(preprocessing)
cleaned_reviews.head(10)
0 open
1 best app
2 famous korean drama dubbed hindi sense paying ...
3 superb please add comments section us like you...
4 reason give four stars opinion many foreign fi...
5 amazing
6 pure greatness
7 good
8 experiencing error
9 anti indian propoganda filler fool
Name: content, dtype: object
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')  # the lemmatizer needs the WordNet data files

# Unify word forms (e.g., reduce verbs to their base form)
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('runs')       # default pos is noun
lemmatizer.lemmatize('runs', 'v')  # lemmatize as a verb
# -> 'run'
def process_lemma(sentence):
    """Lemmatize every token in the given token list as a verb."""
    return [lemmatizer.lemmatize(token, pos='v') for token in sentence]
def preprocessing(sentence):
    """Clean one review: letters only, lowercase, stopwords removed, verbs lemmatized."""
    # Missing reviews arrive from pandas as NaN (a float) -> empty string
    if isinstance(sentence, float):
        return ''
    cleaned = re.sub('[^a-zA-Z]', ' ', sentence)  # non-letters become spaces
    cleaned = cleaned.lower()
    cleaned = cleaned.strip()  # trim leading/trailing whitespace
    tokens = [w for w in cleaned.split() if w not in eng_stopwords]
    tokens = process_lemma(tokens)  # reduce verbs to their base form
    return ' '.join(tokens)
0 open
1 best app
2 famous korean drama dub hindi sense pay subscr...
3 superb please add comment section us like youtube
4 reason give four star opinion many foreign fil...
5 amaze
6 pure greatness
7 good
8 experience error
9 anti indian propoganda filler fool
Name: content, dtype: object
import torchtext

# Suppress the "torchtext final release" deprecation warning
torchtext.disable_torchtext_deprecation_warning()

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

words = "Betty Botter bought bit of Bitter Butter"
tokenizer = get_tokenizer('basic_english')
tokenizer(words)
# -> ['betty', 'botter', 'bought', 'bit', 'of', 'bitter', 'butter']

# NOTE: build_vocab_from_iterator expects an *iterator of token lists*.
# Passing one flat token list makes it iterate over the characters of each
# token, so the resulting vocabulary is character-level (shown below);
# yield_tokens() further down does it correctly.
vocab = build_vocab_from_iterator(tokenizer(words), specials=['<unk>'])
vocab.get_stoi()
# -> {'o': 4, 'h': 10, 'i': 6, 'f': 8, 'b': 2, 'u': 7,
#     'e': 3, 'y': 11, 'g': 9, 't': 1, 'r': 5, '<unk>': 0}
def yield_tokens(sentences):
    """Yield one token list per sentence — the shape build_vocab_from_iterator expects."""
    for sentence in sentences:
        yield tokenizer(sentence)
# Build the vocabulary from the preprocessed reviews.
# FIX: the DataFrame has no 'reviews' column — it is 'content' (see
# reviews = df['content'] above), so df['reviews'] raises KeyError.
# Use the already-cleaned text so the vocab matches the pipeline.
vocab = build_vocab_from_iterator(
    yield_tokens(cleaned_reviews.tolist()),
    specials=['<UNK>'],  # special token for out-of-vocabulary words
    min_freq=2,          # keep only tokens seen at least twice
    max_tokens=1000,     # cap the vocabulary size
)
# string -> index
stoi = vocab.get_stoi()
# index -> string
itos = vocab.get_itos()
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(oov_token='<OOV>')  # stand-in token for unseen words
tokenizer.fit_on_texts(cleaned_reviews)   # build the word index (vocabulary)
len(tokenizer.word_index)                 # number of distinct words: 33014
netflix 2
app 3
watch 4
show 5
movies 6
good 7
like 8
get 9
use 10
love 11
work 12
please 13
time 14
update 15
great 16
phone 17
download 18
even 19
try 20
go 21