--26.텍스트 전처리.ipynb--
Natural language refers to the language we use in everyday life. Natural language processing (NLP) is the task of analyzing the meaning of natural language so that a computer can work with it.
NLP is used in areas such as speech recognition, text summarization, machine translation, sentiment analysis, text classification (spam filtering, news category classification), question answering systems, and chatbots.
Word tokenization
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
sentence = "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
word_tokenize(sentence)
from nltk.tokenize import WordPunctTokenizer
WordPunctTokenizer().tokenize(sentence)
from tensorflow.keras.preprocessing.text import text_to_word_sequence
sentence
text_to_word_sequence(sentence)
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
sentence = "Starting a home-based restaurant may be an ideal. it doesn't have a food chain or restaurant of their own."
print(tokenizer.tokenize(sentence))
Sentence tokenization
from nltk.tokenize import sent_tokenize
text = "His barber kept his word. But keeping such a huge secret to himself was driving him crazy. Finally, the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. He looked about, to make sure no one was near."
text
sent_tokenize(text)
text = "I am actively looking for Ph.D. students. and you are a Ph.D student."
sent_tokenize(text)
!pip install kss
import kss
text = '딥 러닝 자연어 처리가 재미있기는 합니다. 그런데 문제는 영어보다 한국어로 할 때 너무 어렵습니다. 이제 해보면 알걸요?'
kss.split_sentences(text)
Part-of-speech (POS) tagging
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
nltk.download('averaged_perceptron_tagger')
text = "I am actively looking for Ph.D. students. and you are a Ph.D student."
text
tokenized_sentence = word_tokenize(text)
tokenized_sentence
pos_tag(tokenized_sentence) # POS tagging
KoNLPy
!pip install konlpy
KoNLPy bundles the following morphological analyzers and POS taggers so they can be used easily from Python (a short sketch with two of them follows this list):
Hannanum: developed by the KAIST Semantic Web Research Center.
Kkma: developed by the Intelligent Data Systems (IDS) Lab at Seoul National University.
Komoran: developed by Shineware.
Mecab: a Japanese morphological analyzer adapted for Korean.
Open Korean Text: an open-source Korean analyzer, formerly the Twitter morphological analyzer.
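Besides Okt and Kkma used below, the other analyzers in the list expose the same morphs / pos / nouns interface. A minimal sketch with Komoran and Hannanum (assuming konlpy is installed; Mecab additionally requires a separate system-level install):
from konlpy.tag import Komoran, Hannanum
komoran = Komoran()
hannanum = Hannanum()
sample = "열심히 코딩한 당신, 연휴에는 여행을 가봐요"
print(komoran.morphs(sample))   # morpheme extraction
print(komoran.pos(sample))      # POS tagging
print(hannanum.nouns(sample))   # nouns only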
from konlpy.tag import Okt
from konlpy.tag import Kkma
okt = Okt()
kkma = Kkma()
sentence = "열심히 코딩한 당신, 연휴에는 여행을 가봐요"
okt.morphs(sentence) # morpheme extraction
okt.pos(sentence) # POS tagging
okt.nouns(sentence) # nouns only
print(kkma.morphs(sentence))
print(kkma.pos(sentence))
print(kkma.nouns(sentence))
sentence = "아버지가방에들어가신다"
print(okt.pos(sentence))
print(kkma.pos(sentence))
sentence = '그래욬ㅋㅋ'
okt.pos(sentence)
kkma.pos(sentence)
okt.pos(sentence, norm=True, stem=True) # stem=True additionally reduces each word to its base (dictionary) form
okt.pos(sentence, norm=True) # norm=True normalizes the input (e.g. '그래욬ㅋㅋ' → '그래요' + 'ㅋㅋ')
Removing noise data
import re
text = "I was wondering if anyone out there could enlighten me on this car."
text
shortword = re.compile(r'\W*\b\w{1,2}\b')  # pattern matching words of one or two characters
shortword.sub('', text)
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
words = ['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']
words
nltk.download('wordnet')
[lemmatizer.lemmatize(word) for word in words]  # the default POS is noun, so 'dies', 'watched', 'has' come out wrong
lemmatizer.lemmatize("dies")
lemmatizer.lemmatize("dies", "v")     # with POS 'v' (verb) → 'die'
lemmatizer.lemmatize("watched")
lemmatizer.lemmatize("watched", "v")  # → 'watch'
lemmatizer.lemmatize("has")
lemmatizer.lemmatize("has", "v")      # → 'have'
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
sentence = "This was not the map we found in Billy Bones's chest, but an accurate copy, complete in all things--names and heights and soundings--with the single exception of the red crosses and the written notes."
tokenized_sentence = word_tokenize(sentence)
print(tokenized_sentence)
print([stemmer.stem(word) for word in tokenized_sentence])
words = ['formalize', 'allowance', 'electricical']
print('before stemming:', words)
print('after stemming :', [stemmer.stem(word) for word in words])
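NLTK also provides the more aggressive LancasterStemmer; comparing it with Porter on the same words shows that different stemmers can give quite different results (a small sketch, not part of the original notebook):
from nltk.stem import LancasterStemmer
lancaster = LancasterStemmer()
print('Porter   :', [stemmer.stem(word) for word in words])
print('Lancaster:', [lancaster.stem(word) for word in words])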
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_word_list = stopwords.words('english')
print(len(stop_word_list))
print(stop_word_list)
example = "Family is not an important thing. It's everything."
example
stop_words = set(stop_word_list)
word_tokens = word_tokenize(example)
result = []
for word in word_tokens:
    if word not in stop_words:
        result.append(word)
print('before stopword removal: ', word_tokens)
print('after stopword removal : ', result)
example = "고기를 아무렇게나 구우려고 하면 안 돼. 고기라고 다 같은 게 아니거든. 예컨대 삼겹살을 구울 때는 중요한 게 있지."
stop_words = "를 아무렇게나 구 우려 고 안 돼 같은 게 구울 때 는"
stop_words = set(stop_words.split(' '))
word_tokens = okt.morphs(example)
result = [word for word in word_tokens if word not in stop_words]
print('before stopword removal: ', word_tokens)
print('after stopword removal : ', result)
Reference list of Korean stopwords: https://www.ranks.nl/stopwords/korean
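A list like the one at the URL above is normally saved to a file and loaded at runtime instead of being hard-coded. A minimal sketch, assuming a hypothetical korean_stopwords.txt with one stopword per line:
with open('korean_stopwords.txt', encoding='utf-8') as f:
    stop_words = set(line.strip() for line in f if line.strip())
word_tokens = okt.morphs(example)
result = [word for word in word_tokens if word not in stop_words]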
raw_text = "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain."
raw_text
sentences = sent_tokenize(raw_text)
print(sentences)
vocab = {}
preprocessed_sentences = []
stop_words = set(stopwords.words('english'))
for sentence in sentences:
    tokenized_sentence = word_tokenize(sentence)
    result = []
    for word in tokenized_sentence:
        word = word.lower()  # lowercase every word to reduce the vocabulary size
        if word not in stop_words:  # remove stopwords
            if len(word) > 2:  # drop words of 2 characters or fewer
                result.append(word)
                if word not in vocab:
                    vocab[word] = 0
                vocab[word] += 1
    preprocessed_sentences.append(result)
preprocessed_sentences
vocab
vocab_sorted = sorted(vocab.items(), key = lambda x: x[1], reverse = True)
vocab_sorted
word_to_index = {}
i = 0
for (word, frequency) in vocab_sorted:
    if frequency > 1:  # drop words that appear only once
        i = i + 1
        word_to_index[word] = i
word_to_index
vocab_size = 5
words_frequency = [word for word, index in word_to_index.items() if index >= vocab_size + 1]  # words ranked outside the top 5
for w in words_frequency:
    del word_to_index[w]
print(word_to_index)
word_to_index['OOV'] = len(word_to_index) + 1  # index for out-of-vocabulary words
print(word_to_index)
encoded_sentences = []
for sentence in preprocessed_sentences:
    encoded_sentence = []
    for word in sentence:
        try:
            encoded_sentence.append(word_to_index[word])    # words in the vocabulary get their index
        except KeyError:
            encoded_sentence.append(word_to_index['OOV'])   # words not in the vocabulary get the OOV index
    encoded_sentences.append(encoded_sentence)
print(preprocessed_sentences)
print(encoded_sentences)
from tensorflow.keras.preprocessing.text import Tokenizer
preprocessed_sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_sentences) # build the vocabulary based on word frequency
tokenizer.word_index
tokenizer.word_counts
tokenizer.texts_to_sequences(preprocessed_sentences)
vocab_size = 5
tokenizer = Tokenizer(num_words = vocab_size + 1) # use only the top 5 words (index 0 is reserved for padding)
tokenizer.fit_on_texts(preprocessed_sentences)
tokenizer.word_index   # note: word_index still lists every word; num_words is applied only when converting texts
tokenizer.word_counts  # word_counts is likewise unaffected by num_words
encoded = tokenizer.texts_to_sequences(preprocessed_sentences)
encoded
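The Keras Tokenizer can also reserve an index for out-of-vocabulary words, mirroring the manual 'OOV' entry built earlier; a short sketch using the oov_token argument:
tokenizer = Tokenizer(num_words = vocab_size + 2, oov_token = 'OOV')  # +2: index 0 is for padding, index 1 for OOV
tokenizer.fit_on_texts(preprocessed_sentences)
print(tokenizer.word_index['OOV'])                           # OOV is assigned index 1
print(tokenizer.texts_to_sequences(preprocessed_sentences))  # words outside the top 5 map to 1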
preprocessed_sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_sentences)
encoded = tokenizer.texts_to_sequences(preprocessed_sentences)
encoded
max_len = max([len(item) for item in encoded])
max_len
import numpy as np
for sentence in encoded:
    while len(sentence) < max_len:
        sentence.append(0)  # pad shorter sentences with 0 up to max_len
padded_up = np.array(encoded)
padded_up
Keras padding function
from tensorflow.keras.preprocessing.sequence import pad_sequences
encoded = tokenizer.texts_to_sequences(preprocessed_sentences)
encoded
padded = pad_sequences(encoded)  # by default pads with 0 at the front (padding='pre')
padded
padded = pad_sequences(encoded, padding='post')
padded
(padded == padded_up).all()
padded = pad_sequences(encoded, padding='post', maxlen=5)
padded
padded = pad_sequences(encoded, padding='post', maxlen=5, truncating='post')  # truncating='post' drops values from the end; the default ('pre') drops from the front
padded
last_value = len(tokenizer.word_index) + 1  # the first integer not assigned to any word
last_value
padded = pad_sequences(encoded, padding='post', value=last_value)  # pad with this value instead of 0
padded
tokens = okt.morphs('나는 자연어 처리를 배운다')
print(tokens)
word_to_index = {word : index for index, word in enumerate(tokens)}
print('vocabulary: ', word_to_index)
def one_hot_encoding(word, word_to_index):
    one_hot_vector = [0] * (len(word_to_index))  # zero vector the size of the vocabulary
    index = word_to_index[word]
    one_hot_vector[index] = 1                    # set 1 at the word's index
    return one_hot_vector
one_hot_encoding("자연어", word_to_index)
Keras one-hot encoding
text = "나랑 점심 먹으러 갈래 점심 메뉴는 햄버거 갈래 갈래 햄버거 최고야"
from tensorflow.keras.utils import to_categorical
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
print('vocabulary:', tokenizer.word_index)
sub_text = "점심 먹으러 갈래 메뉴는 햄버거 최고야"
encoded = tokenizer.texts_to_sequences([sub_text])[0]
encoded
one_hot = to_categorical(encoded)
print(one_hot)
!pip install customized_konlpy
from ckonlpy.tag import Twitter
twitter = Twitter()
twitter.morphs('은경이는 사무실로 갔습니다.')
okt.morphs('은경이는 사무실로 갔습니다.')
twitter.add_dictionary('은경이', 'Noun') # add the word to the user dictionary
twitter.morphs('은경이는 사무실로 갔습니다.')