Codecademy [Learn Natural Language Processing]
BAG-OF-WORDS LANGUAGE MODEL
A statistical language model based on word counts (i.e., on word probabilities).
An n-gram model with n set to 1 (a unigram model).
A way of representing text data numerically that looks only at word frequency, ignoring word order.
Building a BoW representation can be thought of as a two-step process:
(1) First, assign a unique integer index to each word.
(2) Then build a vector that records, at each word's index, how many times that word token appears.
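For example, on a toy token list the two steps look like this (a minimal sketch; the course implements both steps as reusable functions further down):

tokens = ["fish", "fly", "fish"]

# Step 1: assign each unique word an integer index.
index = {}
for token in tokens:
  if token not in index:
    index[token] = len(index)
# index is now {'fish': 0, 'fly': 1}

# Step 2: record each token's count at its index position.
vector = [0] * len(index)
for token in tokens:
  vector[index[token]] += 1
# vector is now [2, 1]: 'fish' appears twice, 'fly' once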
from preprocessing import preprocess_text

# Define text_to_bow() below:
def text_to_bow(some_text):
  bow_dictionary = {}
  tokens = preprocess_text(some_text)
  # Count every token, inserting unseen tokens with a count of 1.
  for token in tokens:
    if token in bow_dictionary:
      bow_dictionary[token] += 1
    else:
      bow_dictionary[token] = 1
  return bow_dictionary
print(text_to_bow("I love fantastic flying fish. These flying fish are just ok, so maybe I will find another few fantastic fish..."))
#{'i': 2, 'love': 1, 'fantastic': 2, 'fly': 2, 'fish': 3, 'these': 1, 'be': 1, 'just': 1, 'ok': 1, 'so': 1, 'maybe': 1, 'will': 1, 'find': 1, 'another': 1, 'few': 1}
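Note that the output shows preprocess_text also lemmatizes ('flying' -> 'fly', 'are' -> 'be'). The helper lives in Codecademy's preprocessing module and isn't shown in the lesson; a minimal hypothetical stand-in using NLTK (requires the punkt, wordnet, and averaged_perceptron_tagger data packages) might look like:

import re
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def get_wordnet_pos(word):
  # Hypothetical helper: map an NLTK POS tag to a WordNet POS constant.
  tag = pos_tag([word])[0][1]
  if tag.startswith("V"):
    return wordnet.VERB
  if tag.startswith("J"):
    return wordnet.ADJ
  if tag.startswith("R"):
    return wordnet.ADV
  return wordnet.NOUN

def preprocess_text(text):
  # Lowercase, strip punctuation, tokenize, then lemmatize each token.
  cleaned = re.sub(r"[^a-zA-Z\s]", "", text).lower()
  lemmatizer = WordNetLemmatizer()
  return [lemmatizer.lemmatize(token, get_wordnet_pos(token))
          for token in word_tokenize(cleaned)]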
Feature extraction (or vectorization): turning text into a BoW vector.
A feature vector is a numeric representation of an item's important features, with one column per feature, e.g.:
[1, 0, 2, 0, 0, 0, 1, 1, 0, 0, 2]
from preprocessing import preprocess_text

# Define create_features_dictionary() below:
def create_features_dictionary(documents):
  features_dictionary = {}
  merged = " ".join(documents)
  tokens = preprocess_text(merged)
  index = 0
  # Assign the next free index to each token seen for the first time.
  for token in tokens:
    if token not in features_dictionary:
      features_dictionary[token] = index
      index += 1
  return features_dictionary, tokens
training_documents = ["Five fantastic fish flew off to find faraway functions.", "Maybe find another five fantastic fish?", "Find my fish with a function please!"]
print(create_features_dictionary(training_documents)[0])
# Output:
#{'five': 0, 'fantastic': 1, 'fish': 2, 'fly': 3, 'off': 4, 'to': 5, 'find': 6, 'faraway': 7, 'function': 8, 'maybe': 9, 'another': 10, 'my': 11, 'with': 12, 'a': 13, 'please': 14}
from preprocessing import preprocess_text

# Define text_to_bow_vector() below:
def text_to_bow_vector(some_text, features_dictionary):
  # Start with a zero count for every feature in the dictionary.
  bow_vector = len(features_dictionary) * [0]
  tokens = preprocess_text(some_text)
  for token in tokens:
    # Note: a token missing from features_dictionary raises a KeyError here.
    feature_index = features_dictionary[token]
    bow_vector[feature_index] += 1
  return bow_vector, tokens
features_dictionary = {'function': 8, 'please': 14, 'find': 6, 'five': 0, 'with': 12, 'fantastic': 1, 'my': 11, 'another': 10, 'a': 13, 'maybe': 9, 'to': 5, 'off': 4, 'faraway': 7, 'fish': 2, 'fly': 3}
text = "Another five fish find another faraway fish."
print(text_to_bow_vector(text, features_dictionary)[0])
# Output:
# [1, 0, 2, 0, 0, 0, 1, 1, 0, 0, 2, 0, 0, 0, 0]
from spam_data import training_spam_docs, training_doc_tokens, training_labels, test_labels, test_spam_docs, training_docs, test_docs
from sklearn.naive_bayes import MultinomialNB
def create_features_dictionary(document_tokens):
  features_dictionary = {}
  index = 0
  for token in document_tokens:
    if token not in features_dictionary:
      features_dictionary[token] = index
      index += 1
  return features_dictionary

def tokens_to_bow_vector(document_tokens, features_dictionary):
  bow_vector = [0] * len(features_dictionary)
  for token in document_tokens:
    # Unlike text_to_bow_vector() above, unknown tokens are skipped safely.
    if token in features_dictionary:
      feature_index = features_dictionary[token]
      bow_vector[feature_index] += 1
  return bow_vector
# Define bow_sms_dictionary:
bow_sms_dictionary = create_features_dictionary(training_doc_tokens)
# Define training_vectors:
training_vectors = [tokens_to_bow_vector(training_doc, bow_sms_dictionary) for training_doc in training_spam_docs]
# Define test_vectors:
test_vectors = [tokens_to_bow_vector(test_doc, bow_sms_dictionary) for test_doc in test_spam_docs]
spam_classifier = MultinomialNB()
def spam_or_not(label):
  return "spam" if label else "not spam"
# Uncomment the code below when you're done:
spam_classifier.fit(training_vectors, training_labels)
predictions = spam_classifier.score(test_vectors, test_labels)
print("The predictions for the test data were {0}% accurate.\n\nFor example, '{1}' was classified as {2}.\n\nMeanwhile, '{3}' was classified as {4}.".format(predictions * 100, test_docs[0], spam_or_not(test_labels[0]), test_docs[10], spam_or_not(test_labels[10])))
# Output:
"""
The predictions for the test data were 99.0% accurate.
For example, 'well obviously not because all the people in my cool college life go home _' was classified as not spam.
Meanwhile, 'urgent we be try to contact you last weekend draw show u have win a 1000 prize guarantee call 09064017295 claim code k52 valid 12hrs 150p pm' was classified as spam.
"""
Python's collections module offers the Counter() function, which replaces the counting loop in text_to_bow() with a single call.
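A quick sketch of the equivalence:

from collections import Counter

tokens = ["fish", "fly", "fish"]
print(Counter(tokens))
# Counter({'fish': 2, 'fly': 1})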
CountVectorizer, from the machine learning library scikit-learn, handles vectorization end to end: call fit() to train the features dictionary, then transform() to turn text into a vector.
from sklearn.feature_extraction.text import CountVectorizer
training_documents = ["Five fantastic fish flew off to find faraway functions.", "Maybe find another five fantastic fish?", "Find my fish with a function please!"]
test_text = ["Another five fish find another faraway fish."]
bow_vectorizer = CountVectorizer()
bow_vectorizer.fit(training_documents)
bow_vector = bow_vectorizer.transform(test_text)
print(bow_vector.toarray())
# [[2 0 1 1 2 1 0 0 0 0 0 0 0 0 0]]
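By default, CountVectorizer assigns column indices alphabetically rather than by first appearance, which is why this vector is ordered differently from the hand-built one above. The fitted vectorizer's vocabulary_ attribute (standard scikit-learn API) exposes the token-to-column mapping:

print(bow_vectorizer.vocabulary_)
# e.g. {'five': 5, 'fantastic': 1, 'fish': 4, ...} -- values are the column indices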
from spam_data import training_spam_docs, training_doc_tokens, training_labels, test_labels, test_spam_docs, training_docs, test_docs
from sklearn.naive_bayes import MultinomialNB
# Import CountVectorizer from sklearn:
from sklearn.feature_extraction.text import CountVectorizer
# Define bow_vectorizer:
bow_vectorizer = CountVectorizer()
# Define training_vectors:
training_vectors = bow_vectorizer.fit_transform(training_docs)
# Define test_vectors:
test_vectors = bow_vectorizer.transform(test_docs)
spam_classifier = MultinomialNB()
def spam_or_not(label):
  return "spam" if label else "not spam"
# Uncomment the code below when you're done:
spam_classifier.fit(training_vectors, training_labels)
predictions = spam_classifier.score(test_vectors, test_labels)
print("The predictions for the test data were {0}% accurate.\n\nFor example, '{1}' was classified as {2}.\n\nMeanwhile, '{3}' was classified as {4}.".format(predictions * 100, test_docs[7], spam_or_not(test_labels[7]), test_docs[15], spam_or_not(test_labels[15])))
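Despite the variable name, score() returns the mean accuracy over the test set, not per-message predictions. To label individual messages, scikit-learn's standard predict() method works on the same vectors, e.g.:

# One predicted label per message (truthy labels mean spam, per spam_or_not() above).
print(spam_classifier.predict(test_vectors[:5]))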
from preprocessing import preprocess_text
from nltk.util import ngrams
from collections import Counter
text = "It's exciting to watch flying fish after a hard day's work. I don't know why some fish prefer flying and other fish would rather swim. It seems like the fish just woke up one day and decided, 'hey, today is the day to fly away.'"
tokens = preprocess_text(text)
# Bigram approach:
bigrams_prepped = ngrams(tokens, 2)
bigrams = Counter(bigrams_prepped)
print("Three most frequent word sequences and the number of occurrences according to Bigrams:")
print(bigrams.most_common(3))
# Bag-of-Words approach:
# Define bag_of_words here:
bag_of_words = Counter(tokens)
print("\nThree most frequent words and number of occurrences according to Bag-of-Words:")
most_common_three = bag_of_words.most_common(3)
print(most_common_three)
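The same Counter-plus-ngrams pattern works for any sequence length; for instance, a trigram count (n set to 3) reusing the tokens above:

# Trigram approach: count three-token sequences instead of pairs.
trigrams = Counter(ngrams(tokens, 3))
print(trigrams.most_common(3))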