1-1.Tokenizer
1-2. Build Vocabulary
1-3. Skip-Gram
1-4. Skip-Gram w/ Negative Sampling
2-1. SentencePiece
2-2. Word2Vec and Visualization
3-1. MosesTokenizer & BPE
If you have any questions, feel free to ask
from abc import ABC
from typing import List, Dict, Tuple, Set
import random
import torch.nn.functional as F
from typing import List, Dict, Tuple
from random import randint
import re
import math
from collections import Counter
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
from itertools import chain
In this section, you will implement the simplest possible tokenizer.
The tokenizer has two functionalities:
def tokenize(sentence):
    """Split a sentence into whitespace-delimited tokens.

    The punctuation marks ``.``, ``,``, ``!`` and ``?`` are always emitted as
    separate one-character tokens; every other character (apostrophes,
    hyphens, digits, ...) stays attached to its neighbours.

    Arguments:
    sentence -- The raw text to split. (str)

    Return:
    tokens -- The tokens in their original order. (List[str])
    """
    # Pad every punctuation mark with spaces in one pass so that the
    # whitespace split below isolates it as a token of its own.
    # Regex reference: https://cheatography.com/davechild/cheat-sheets/regular-expressions/
    padded = re.sub(r"([.,!?])", r" \1 ", sentence)
    tokens = padded.split()

    # Scaffold guard kept from the exercise template.
    if not isinstance(tokens, list):
        raise TypeError("Invalid type for tokens.")
    return tokens
def test_tokenize(sentence, hypothesis):
    """Tokenize ``sentence`` and check the result against ``hypothesis``.

    Prints the tokenizer output, then either raises ``AssertionError`` when
    the tokens differ from ``hypothesis`` or prints a success message.

    Arguments:
    sentence -- The raw text handed to ``tokenize``. (str)
    hypothesis -- The expected token list. (List[str])
    """
    produced = tokenize(sentence)
    print("Tokenizer ouput : ", produced)
    assert produced == hypothesis, "Something is wrong. Please try to fix tokenizer."
    print("Test passed.")
# Case 1: a plain sentence whose only punctuation is the trailing period.
sentence1 = "This sentence should be tokenized properly."
answer1 = ['This', 'sentence', 'should', 'be', 'tokenized', 'properly', '.']
# Case 2: the apostrophe must stay inside "Jhon's" while ',' and '.' are split off.
sentence2 = "Jhon's book is not popular, but he loves his book."
answer2 = ["Jhon's", "book", "is", "not", "popular", ",", "but", "he", "loves", "his", "book", "."]
# Case 3: consecutive punctuation and stray whitespace; "'-4" is not split
# because neither the apostrophe nor the hyphen is a separator.
sentence3 = " .,! ?,,'-4. ! "
answer3 = ['.', ',', '!', '?', ',', ',', "'-4", '.', '!']
# Run the three checks; each prints the tokenizer output and a pass message.
test_tokenize(sentence1, answer1)
test_tokenize(sentence2, answer2)
test_tokenize(sentence3, answer3)
Tokenizer ouput : ['This', 'sentence', 'should', 'be', 'tokenized', 'properly', '.']
Test passed.
Tokenizer ouput : ["Jhon's", 'book', 'is', 'not', 'popular', ',', 'but', 'he', 'loves', 'his', 'book', '.']
Test passed.
Tokenizer ouput : ['.', ',', '!', '?', ',', ',', "'-4", '.', '!']
Test passed.
In this section, you will build the vocabulary.
def build_vocab(sentences, min_freq):
    '''
    Build the index/word mappings and the frequency table of a tokenized corpus.

    Indices 0 and 1 are reserved for '<PAD>' and '<UNK>'. Every other word
    that occurs at least min_freq times receives the next free index, in
    order of first appearance. Occurrences of rarer words are pooled into the
    '<UNK>' frequency.

    Arguments:
    sentences -- The list of sentences to build the vocab from. Each sentence is a list of words.
    min_freq -- The minimum frequency of a word. Words that have a frequency less than this threshold are excluded from the vocabulary.

    Return:
    idx2word -- A dictionary which maps an index to the word, key : index (int), value : word (str) ex) {0:'<PAD>', 1:'<UNK>', 2:'This', 3:'sentence', ...}
    word2idx -- A dictionary which maps the word to an index, key : word (str), value : index (int) ex) {'<PAD>': 0, '<UNK>': 1, 'This': 2, 'sentence': 3, ...}
    word_freq -- A dictionary which maps the word to its frequency, key : word (str), value : frequency (int). The '<UNK>' entry is always present (possibly 0) and is the last key.
    '''
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    idx2word = {0: '<PAD>', 1: '<UNK>'}

    # Count word frequency without materialising the flattened corpus.
    counts = Counter(chain.from_iterable(sentences))

    word_freq = {}
    rare_word_freq = 0
    for word, freq in counts.items():
        if freq < min_freq:
            # Rare word: its occurrences are accounted for under <UNK>.
            rare_word_freq += freq
            continue
        word_freq[word] = freq
        # The reserved tokens already own an index if they occur in the text.
        if word not in word2idx:
            index = len(word2idx)
            word2idx[word] = index
            idx2word[index] = word

    # Add the pooled rare-word count to <UNK>. A literal '<UNK>' token in the
    # corpus keeps its own occurrences instead of having them overwritten.
    word_freq['<UNK>'] = word_freq.pop('<UNK>', 0) + rare_word_freq
    return idx2word, word2idx, word_freq
print("*"*30, "Test 1","*"*30)
sentences = [["This", "is", "a", "sentence", "."],
["Jhon", "'s", "book", "is", "not", "popular", ",", "but", "he", "loves", "his", "book", "."]]
idx2word, word2idx, word_freq = build_vocab(sentences, min_freq=1)
print("idx2word : ", idx2word)
print("word2idx : ", word2idx)
print("word_freq", word_freq)
print()
print("*"*30, "Test 2","*"*30)
sentences = [["a", "b", "c", "d", "e"],
["c", "d", "f", "g"],
["d", "e", "g", "h"]]
idx2word, word2idx, word_freq = build_vocab(sentences, min_freq=2)
print("idx2word : ", idx2word)
print("word2idx : ", word2idx)
print("word_freq", word_freq)
****************************** Test 1 ******************************
idx2word : {0: '<PAD>', 1: '<UNK>', 2: 'This', 3: 'is', 4: 'a', 5: 'sentence', 6: '.', 7: 'Jhon', 8: "'s", 9: 'book', 10: 'not', 11: 'popular', 12: ',', 13: 'but', 14: 'he', 15: 'loves', 16: 'his'}
word2idx : {'<PAD>': 0, '<UNK>': 1, 'This': 2, 'is': 3, 'a': 4, 'sentence': 5, '.': 6, 'Jhon': 7, "'s": 8, 'book': 9, 'not': 10, 'popular': 11, ',': 12, 'but': 13, 'he': 14, 'loves': 15, 'his': 16}
word_freq {'This': 1, 'is': 2, 'a': 1, 'sentence': 1, '.': 2, 'Jhon': 1, "'s": 1, 'book': 2, 'not': 1, 'popular': 1, ',': 1, 'but': 1, 'he': 1, 'loves': 1, 'his': 1, '<UNK>': 0}
****************************** Test 2 ******************************
idx2word : {0: '<PAD>', 1: '<UNK>', 2: 'c', 3: 'd', 4: 'e', 5: 'g'}
word2idx : {'<PAD>': 0, '<UNK>': 1, 'c': 2, 'd': 3, 'e': 4, 'g': 5}
word_freq {'c': 2, 'd': 3, 'e': 2, 'g': 2, '<UNK>': 4}
In this section, we will implement Skip-Gram. We also prepare the inputs for training Word2Vec.
In this code block, we will implement the function which returns a (center_word, surrounding_words) pair for Skip-Gram.
"""
Argument:
tokens -- A sentence where the center word and the surrounding words come from (List[str])
win_size -- context window size (int)
Return:
window_pairs -- List of (center word, surrounding_word) pairs. (List[Tuple])
For example, # ["Jhon's", "book", "is", "not", "popular"] w/ win_size 2, -> [("Jhon's", 'book'), ("Jhon's", 'is'), ('book', "Jhon's"), ('book', 'is'), ('book', 'not'), ('is', "Jhon's"), ('is', 'book'), ...]
"""
def get_window_pairs(tokens, win_size=4):
window_pairs = []
################# Implement here. #############
for idx, token in enumerate(tokens):
start = max(0, idx - win_size)
end = min(len(tokens), idx+win_size+1)
for win_idx in range(start, end):
if not idx == win_idx:
pair = (token, tokens[win_idx])
window_pairs.append(pair)
###############################################
return window_pairs
# Demo: list the first skip-gram pairs of a short sentence.
sentence = ["Jhon's", "book", "is", "not", "popular", ","]
window_pairs = get_window_pairs(sentence, win_size=2)
# Slice instead of indexing range(10) so that a sentence yielding fewer than
# ten pairs does not raise IndexError.
for center, surrounding in window_pairs[:10]:
    print("center word : ", center)
    print("surrounding_words : ", surrounding)
    # NOTE(review): the blank separator is assumed to belong to each pair;
    # the original indentation was lost -- confirm.
    print()
center word : Jhon's
surrounding_words : book
center word : Jhon's
surrounding_words : is
center word : book
surrounding_words : Jhon's
center word : book
surrounding_words : is
center word : book
surrounding_words : not
center word : is
surrounding_words : Jhon's
center word : is
surrounding_words : book
center word : is
surrounding_words : not
center word : is
surrounding_words : popular
center word : not
surrounding_words : book
In this section, we will implement the negative sampling function to improve Skip-Gram.
class SkipgramDataset(Dataset):
    """Dataset of skip-gram training examples built from raw text lines.

    Each line of ``document`` is stripped, lower-cased and tokenized, a
    vocabulary is built from the result, and every (center, surrounding) pair
    inside the context window becomes one example:

    * without negative sampling: ``(center_idx, surrounding_idx)``
    * with negative sampling: ``(center_idx, [surrounding_idx, neg_1, ..., neg_5])``;
      the true surrounding word is always at position 0 of the list.
    """

    PAD_TOKEN = '<PAD>'
    PAD_TOKEN_IDX = 0
    UNK_TOKEN = '<UNK>'
    UNK_TOKEN_IDX = 1

    def __init__(self, document, window_size, min_freq, negative_sampling=False):
        """
        Arguments:
        document -- Iterable of raw text lines (str).
        window_size -- Context window size handed to get_window_pairs (int).
        min_freq -- Minimum word frequency handed to build_vocab (int).
        negative_sampling -- Whether each example also carries negative samples (bool).
        """
        self._window_size = window_size
        tokens = [tokenize(line.strip().lower()) for line in document]
        self._idx2word, self._word2idx, self._word_freq = build_vocab(tokens, min_freq=min_freq)

        # Stays None unless negative sampling is requested.
        self.negative_sample_table = None
        if negative_sampling:
            self.prepare_negative_samples(power=0.75)
        self.prepare_data(tokens, negative_sampling)

    @property
    def n_tokens(self):
        """Number of entries in the vocabulary, reserved tokens included."""
        return len(self._idx2word)

    def prepare_negative_samples(self, power, table_size=int(1e6)):
        """
        Build the unigram table used to draw negative samples.

        Word w occupies a share of the table proportional to
        freq(w) ** power, so that picking a uniformly random slot is
        equivalent to sampling from that smoothed unigram distribution.

        Arguments:
        power -- Exponent applied to each word frequency (float).
        table_size -- Number of slots in the table (int).

        Result:
        self.negative_sample_table -- 1D numpy int32 array of length table_size filled with vocabulary indices.

        Raises:
        ValueError -- if no word has a positive weight (e.g. an empty corpus).
        """
        counts = np.array(list(self._word_freq.values()), dtype=np.float64)
        weights = np.power(counts, power)
        if weights.size == 0 or weights.sum() <= 0:
            raise ValueError("Cannot build a negative-sampling table from an empty vocabulary.")

        # Cumulative distribution. Dividing by its own last element makes the
        # final value exactly 1.0, so rounding can never leave trailing slots
        # unfilled (they would otherwise default to index 0, i.e. <PAD>).
        cumulative = np.cumsum(weights)
        cumulative /= cumulative[-1]

        word_indices = np.array([self.word2idx(word) for word in self._word_freq], dtype=np.int32)
        # Slot i belongs to the first word whose cumulative probability
        # exceeds i / table_size.
        slots = np.arange(table_size) / table_size
        owners = np.searchsorted(cumulative, slots, side='right')
        self.negative_sample_table = word_indices[owners]

    def negative_sampler(self, K=5):
        """Draw K vocabulary indices uniformly at random from the unigram table.

        Returns a list of K numpy int32 scalars. Raises RuntimeError when the
        table has not been prepared.
        """
        if self.negative_sample_table is None:
            raise RuntimeError("Negative-sample table is not prepared; call prepare_negative_samples() first.")
        indices = np.random.randint(low=0, high=len(self.negative_sample_table), size=K)
        return list(self.negative_sample_table[indices])

    def prepare_data(self, tokens, negative_sampling=False):
        """Convert tokenized sentences into the flat list ``self.features``.

        Arguments:
        tokens -- List of tokenized sentences (List[List[str]]).
        negative_sampling -- When True, each surrounding index is followed by 5 negative samples.
        """
        self.features = []
        for tokenized_sentence in tokens:
            # Compute the window pairs once per sentence, whichever format is built.
            for center, surrounding in get_window_pairs(tokenized_sentence, self._window_size):
                center_idx = self.word2idx(center)
                surrounding_idx = self.word2idx(surrounding)
                if negative_sampling:
                    candidates = [surrounding_idx] + self.negative_sampler(K=5)
                    self.features.append((center_idx, candidates))
                else:
                    self.features.append((center_idx, surrounding_idx))

    def idx2word(self, index: int) -> str:
        """Return the word stored at ``index``; raises KeyError for unknown indices."""
        return self._idx2word[index]

    def word2idx(self, word: str) -> int:
        """Return the index of ``word``, falling back to the <UNK> index."""
        try:
            return self._word2idx[word]
        except KeyError:
            return self._word2idx[self.UNK_TOKEN]

    def __len__(self):
        return len(self.features)

    def __getitem__(self, i):
        return self.features[i]
# Tiny three-sentence corpus used to exercise SkipgramDataset and the models.
document = [
    "Word2vec is a technique for natural language processing published in 2013.",
    "The word2vec algorithm uses a neural network model to learn word associations from a large corpus of text.",
    "Once trained, such a model can detect synonymous words or suggest additional words for a partial sentence."
]
# Plain skip-gram dataset (negative sampling left at its default, off);
# min_freq=1 keeps every word in the vocabulary.
db = SkipgramDataset(document, window_size=3, min_freq=1)
In this section, you will implement the objective function of Skip-Gram.
The objective function is the softmax over the entire vocabulary.
class Word2Vec(torch.nn.Module, ABC):
    """Skip-gram model whose objective is a softmax over the entire vocabulary."""

    def __init__(self, n_tokens, embedding_dimension):
        """
        Arguments:
        n_tokens -- Number of rows of each embedding matrix (int).
        embedding_dimension -- Size of each embedding vector (int).
        """
        super().__init__()
        shape = [n_tokens, embedding_dimension]
        # Looked up with the center-word indices.
        self.input_embedding = torch.nn.Parameter(torch.empty(shape), requires_grad=True)
        # Scored against every center vector to produce vocabulary logits.
        self.output_embedding = torch.nn.Parameter(torch.empty(shape), requires_grad=True)
        self.init_weights()

    def init_weights(self):
        """Fill both embedding matrices with samples from the default normal_ initializer."""
        for weight in (self.input_embedding, self.output_embedding):
            torch.nn.init.normal_(weight.data)

    def forward(self, input_feature):
        """
        Compute the cross-entropy loss of predicting each surrounding word.

        Arguments:
        input_feature -- Pair;
        (
        center word indices, shape: [batch_size]
        surrounding word indices, shape: [batch_size]
        )

        Return:
        loss -- Cross-entropy between the vocabulary logits and the surrounding word indices (default mean reduction).
        """
        center_idx, surrounding_idx = input_feature
        center_vectors = self.input_embedding[center_idx]
        # logits[i, j] = <center vector i, output vector j>
        logits = torch.einsum('ik,jk->ij', center_vectors, self.output_embedding)
        return F.cross_entropy(logits, surrounding_idx)
# Dataset of plain (center, surrounding) index pairs, served in batches of 4.
db = SkipgramDataset(document, window_size=3, min_freq=1)
data_loader = DataLoader(db, batch_size=4)

# Full-softmax skip-gram model trained with vanilla SGD.
model = Word2Vec(n_tokens=len(db._idx2word), embedding_dimension=64)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

for epoch in range(10):
    # Sum of the per-batch losses over this epoch.
    epoch_loss = 0
    for batch in data_loader:
        optimizer.zero_grad()
        loss = model(batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"epoch: {epoch}, loss: {epoch_loss}")
epoch: 0, loss: 1194.5214595794678
epoch: 1, loss: 1176.5549955368042
epoch: 2, loss: 1159.8491640090942
epoch: 3, loss: 1144.318012714386
epoch: 4, loss: 1129.6706113815308
epoch: 5, loss: 1115.6645188331604
epoch: 6, loss: 1102.1642055511475
epoch: 7, loss: 1089.0992283821106
epoch: 8, loss: 1076.4286317825317
epoch: 9, loss: 1064.1249046325684
In this section, you will implement the objective function of Skip-Gram with negative sampling.
The objective function is the softmax over the positive sample and negative samples.
class NCEWord2Vec(torch.nn.Module, ABC):
    """Skip-gram model whose softmax runs over one positive and K-1 negative words."""

    def __init__(self, n_tokens, embedding_dimension):
        """
        Arguments:
        n_tokens -- Number of rows of each embedding table (int).
        embedding_dimension -- Size of each embedding vector (int).
        """
        super().__init__()
        # Looked up with the center-word indices.
        self.input_embedding = nn.Embedding(n_tokens, embedding_dimension)
        # Looked up with the candidate (surrounding + negative) indices.
        self.output_embedding = nn.Embedding(n_tokens, embedding_dimension)
        self.init_weights()

    def init_weights(self):
        """Fill both embedding tables with samples from the default normal_ initializer."""
        torch.nn.init.normal_(self.input_embedding.weight.data)
        torch.nn.init.normal_(self.output_embedding.weight.data)

    def forward(self, input_feature):
        """
        Compute the cross-entropy loss of picking the true surrounding word
        among the candidates of each example.

        Arguments:
        input_feature -- Pair;
        (
        center word indices, shape: [batch_size]
        surrounding word & negative sample indices, shape: [batch_size, K]
        )
        The true surrounding word must be at position 0 of every candidate row.

        Return:
        loss -- Cross-entropy between the candidate logits and label 0 (default mean reduction).
        """
        center_idx, candidate_idx = input_feature
        center_vectors = self.input_embedding(center_idx)
        candidate_vectors = self.output_embedding(candidate_idx)
        # logits[i, j] = <center vector i, j-th candidate vector of example i>
        logits = torch.einsum('ik,ijk->ij', center_vectors, candidate_vectors)
        # The positive word is always candidate 0. The labels are created on
        # the logits' device so the model also works when moved off the CPU.
        labels = torch.zeros(logits.size(0), dtype=torch.long, device=logits.device)
        return F.cross_entropy(logits, labels)
# Dataset whose examples carry the surrounding word followed by negative samples.
db = SkipgramDataset(document, window_size=3, min_freq=1, negative_sampling=True)
data_loader = DataLoader(db, batch_size=4)

# Negative-sampling skip-gram model trained with vanilla SGD.
model = NCEWord2Vec(n_tokens=len(db._idx2word), embedding_dimension=64)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

for epoch in range(10):
    # Sum of the per-batch losses over this epoch.
    epoch_loss = 0
    for batch in data_loader:
        # The loader yields the candidates as a list with one tensor per
        # candidate position; stack them along dim 1 into one 2-D tensor
        # with one row per example.
        batch[-1] = torch.stack(batch[-1], dim=1)
        optimizer.zero_grad()
        loss = model(batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"epoch: {epoch}, loss: {epoch_loss}")
epoch: 0, loss: 672.7852734327316
epoch: 1, loss: 668.479917883873
epoch: 2, loss: 664.1993666887283
epoch: 3, loss: 659.9437568187714
epoch: 4, loss: 655.7134865522385
epoch: 5, loss: 651.5089465379715
epoch: 6, loss: 647.3306492567062
epoch: 7, loss: 643.1791568994522
epoch: 8, loss: 639.0549969673157
epoch: 9, loss: 634.9588532447815
We highly recommend the Hugging Face tokenizers library, which also provides some excellent tutorials.
!pip install sentencepiece
import nltk
nltk.download('movie_reviews')
!wget https://www.dropbox.com/s/8w9n3cim0b32k2y/train.txt
!wget https://www.dropbox.com/s/pwhn9gyjgvg39v5/pos_train.txt
!wget https://www.dropbox.com/s/7h8aa1xe270fnfy/neg_train.txt
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data] Unzipping corpora/movie_reviews.zip.
--2023-03-20 14:03:43-- https://www.dropbox.com/s/8w9n3cim0b32k2y/train.txt
Resolving www.dropbox.com (www.dropbox.com)... 162.125.81.18, 2620:100:6031:18::a27d:5112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.81.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/8w9n3cim0b32k2y/train.txt [following]
--2023-03-20 14:03:43-- https://www.dropbox.com/s/raw/8w9n3cim0b32k2y/train.txt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucd0a4dc3833a6946e7cef45dce7.dl.dropboxusercontent.com/cd/0/inline/B4lHrcp6w0LG5VyhDBg2dcZW52QirE_EanNBU1MLtcXbqFdEyp3NeVfzlL91OOJ0Wxz7jmcSgHnANQg1K_mYYf8xSWYt0cNwLXZ3iRVC9j1M99WPtgfEtdREomoDv5KhDyrBbHLABX6ALGyDNG31ygHXkg1tbTl35jBd_B7K5ir43g/file# [following]
--2023-03-20 14:03:44-- https://ucd0a4dc3833a6946e7cef45dce7.dl.dropboxusercontent.com/cd/0/inline/B4lHrcp6w0LG5VyhDBg2dcZW52QirE_EanNBU1MLtcXbqFdEyp3NeVfzlL91OOJ0Wxz7jmcSgHnANQg1K_mYYf8xSWYt0cNwLXZ3iRVC9j1M99WPtgfEtdREomoDv5KhDyrBbHLABX6ALGyDNG31ygHXkg1tbTl35jBd_B7K5ir43g/file
Resolving ucd0a4dc3833a6946e7cef45dce7.dl.dropboxusercontent.com (ucd0a4dc3833a6946e7cef45dce7.dl.dropboxusercontent.com)... 162.125.64.15, 2620:100:6016:15::a27d:10f
Connecting to ucd0a4dc3833a6946e7cef45dce7.dl.dropboxusercontent.com (ucd0a4dc3833a6946e7cef45dce7.dl.dropboxusercontent.com)|162.125.64.15|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7584861 (7.2M) [text/plain]
Saving to: ‘train.txt’
train.txt 100%[===================>] 7.23M 3.29MB/s in 2.2s
2023-03-20 14:03:48 (3.29 MB/s) - ‘train.txt’ saved [7584861/7584861]
--2023-03-20 14:03:48-- https://www.dropbox.com/s/pwhn9gyjgvg39v5/pos_train.txt
Resolving www.dropbox.com (www.dropbox.com)... 162.125.81.18, 2620:100:6031:18::a27d:5112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.81.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/pwhn9gyjgvg39v5/pos_train.txt [following]
--2023-03-20 14:03:48-- https://www.dropbox.com/s/raw/pwhn9gyjgvg39v5/pos_train.txt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc733f62e6186c791ab25e617687.dl.dropboxusercontent.com/cd/0/inline/B4kGsp6XbqCvMpzL-rBDRrHkScZuIdWZRtNTP96zsEu11QWoycaYU1R8U1uD8tyiHQdzVMI9UG4aZlUu9p_Pk1c-gnBiJaDlyd6u-1FT9O95Vt3T9GKFd2ONmO2AECgXQYQOpj4m8ebBl9Ft5ZywEIuLcGlj6MMZ_pan1TNekYUO1g/file# [following]
--2023-03-20 14:03:49-- https://uc733f62e6186c791ab25e617687.dl.dropboxusercontent.com/cd/0/inline/B4kGsp6XbqCvMpzL-rBDRrHkScZuIdWZRtNTP96zsEu11QWoycaYU1R8U1uD8tyiHQdzVMI9UG4aZlUu9p_Pk1c-gnBiJaDlyd6u-1FT9O95Vt3T9GKFd2ONmO2AECgXQYQOpj4m8ebBl9Ft5ZywEIuLcGlj6MMZ_pan1TNekYUO1g/file
Resolving uc733f62e6186c791ab25e617687.dl.dropboxusercontent.com (uc733f62e6186c791ab25e617687.dl.dropboxusercontent.com)... 162.125.81.15, 2620:100:6016:15::a27d:10f
Connecting to uc733f62e6186c791ab25e617687.dl.dropboxusercontent.com (uc733f62e6186c791ab25e617687.dl.dropboxusercontent.com)|162.125.81.15|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3958800 (3.8M) [text/plain]
Saving to: ‘pos_train.txt’
pos_train.txt 100%[===================>] 3.78M 12.3MB/s in 0.3s
2023-03-20 14:03:50 (12.3 MB/s) - ‘pos_train.txt’ saved [3958800/3958800]
--2023-03-20 14:03:50-- https://www.dropbox.com/s/7h8aa1xe270fnfy/neg_train.txt
Resolving www.dropbox.com (www.dropbox.com)... 162.125.81.18, 2620:100:6031:18::a27d:5112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.81.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/7h8aa1xe270fnfy/neg_train.txt [following]
--2023-03-20 14:03:51-- https://www.dropbox.com/s/raw/7h8aa1xe270fnfy/neg_train.txt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc6c6227e6fef06615cf8a50b987.dl.dropboxusercontent.com/cd/0/inline/B4nyvHM56RAg3De3bo9qVnoyVaklYafLFbwLCxsFxWzRA6zdop9siYGf-9S_9zOrmF1FA7eS2QBNINlCC1Jh94PugGtpOsNlAkMh55daXx4fovB4-R_5aGpXA9wYp5e79vpovBtEDC-vQvlcUuXijncfnJOoTtIF4itkf2OeTQeX1g/file# [following]
--2023-03-20 14:03:51-- https://uc6c6227e6fef06615cf8a50b987.dl.dropboxusercontent.com/cd/0/inline/B4nyvHM56RAg3De3bo9qVnoyVaklYafLFbwLCxsFxWzRA6zdop9siYGf-9S_9zOrmF1FA7eS2QBNINlCC1Jh94PugGtpOsNlAkMh55daXx4fovB4-R_5aGpXA9wYp5e79vpovBtEDC-vQvlcUuXijncfnJOoTtIF4itkf2OeTQeX1g/file
Resolving uc6c6227e6fef06615cf8a50b987.dl.dropboxusercontent.com (uc6c6227e6fef06615cf8a50b987.dl.dropboxusercontent.com)... 162.125.69.15, 2620:100:6035:15::a27d:550f
Connecting to uc6c6227e6fef06615cf8a50b987.dl.dropboxusercontent.com (uc6c6227e6fef06615cf8a50b987.dl.dropboxusercontent.com)|162.125.69.15|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3626061 (3.5M) [text/plain]
Saving to: ‘neg_train.txt’
neg_train.txt 100%[===================>] 3.46M 1.86MB/s in 1.9s
2023-03-20 14:03:55 (1.86 MB/s) - ‘neg_train.txt’ saved [3626061/3626061]
import os
import random
import numpy as np
# recommend use word2vec in gensim
from gensim.models import word2vec # word2vec library
import nltk
from nltk.corpus import movie_reviews # For Corpus
from sklearn.manifold import TSNE # For TSNE
from sklearn.metrics import accuracy_score # for calculating accuracy score
import matplotlib as mpl
import matplotlib.pyplot as plt
import sentencepiece as spm # For sentencepiece model
# Hyper-parameters for the gensim skip-gram models trained further below.
num_features = 300  # embedding vector size
negative = 10  # number of negative samples per positive pair
# Passed as gensim `min_count`.
# NOTE(review): per the gensim docs this is a minimum total word frequency,
# not a minimum sentence length -- confirm the intent.
min_word_count = 10
window = 5  # context window size
# Passed as gensim `sample` (sub-sampling threshold for frequent words).
# NOTE(review): the gensim docs give (0, 1e-5) as the useful range, so 0.75
# looks unusually large -- confirm.
downsampling = 0.75
epoch = 5  # number of training passes over the corpus

# Tokenized corpora: one list of tokens per review line.
sentences_for_SP = []  # filled later with SentencePiece pieces
sentences_naive = []  # filled below by splitting on single spaces

# Read positive and negative reviews; the context managers close the file
# handles instead of leaving that to the garbage collector.
with open("./pos_train.txt") as pos_file:
    pos_data = pos_file.readlines()
with open("./neg_train.txt") as neg_file:
    neg_data = neg_file.readlines()
data_ = pos_data + neg_data

# Naive tokenization: strip the line, then split on single spaces.
for line in data_:
    sentences_naive.append(line.strip().split(' '))
%%time
# ./train.txt is a text file that contains the entire corpus.
spm.SentencePieceTrainer.train(
    input="./train.txt",
    model_prefix='m',
    vocab_size=10000,
    user_defined_symbols=['[CLS]', '[SEP]'],
)
# Load the model back from ./m.model (the file name matches model_prefix='m').
sp = spm.SentencePieceProcessor(model_file='./m.model')
# Encode every review line into a list of string pieces.
for line in data_:
    pieces = sp.encode(line, out_type=str)
    sentences_for_SP.append(pieces)
CPU times: user 13.1 s, sys: 200 ms, total: 13.3 s
Wall time: 10.6 s
Nowadays, almost everyone uses the Hugging Face tokenizers, so we highly recommend using that library.
%%time
# Train two skip-gram models with identical hyper-parameters so that only the
# tokenization differs between them.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs` -- confirm the installed version.

# Skip-gram model trained on the naively (space-) split data.
naive_model = word2vec.Word2Vec(sentences_naive,
                                sg = 1, # CBOW = 0, skip-gram = 1
                                negative=negative,
                                size=num_features,
                                min_count=min_word_count,
                                window=window,
                                sample=downsampling,
                                iter=epoch)
# Skip-gram model trained on the SentencePiece-tokenized data.
model_with_SP = word2vec.Word2Vec(sentences_for_SP,
                                  sg = 1, # CBOW = 0, skip-gram = 1
                                  negative=negative,
                                  size=num_features,
                                  min_count=min_word_count,
                                  window=window,
                                  sample=downsampling,
                                  iter=epoch)
CPU times: user 10min 22s, sys: 1.05 s, total: 10min 23s
Wall time: 6min 14s
# Words most similar to 'man' according to the naively-tokenized skip-gram model.
naive_model.wv.most_similar("man")
[('woman', 0.7349563241004944),
('boy', 0.7184092402458191),
('person', 0.6970902681350708),
('teenager', 0.6965015530586243),
('guy', 0.6851203441619873),
('doctor', 0.6631721258163452),
('girl', 0.6622653007507324),
('lady', 0.6459221839904785),
('salesman', 0.6418094038963318),
('murderer', 0.6395484209060669)]
# Inspect the vocabulary of the naive model (maps each word to a gensim Vocab entry).
# NOTE(review): `wv.vocab` is the gensim 3.x attribute; gensim 4 replaced it
# with `wv.key_to_index` -- confirm the installed version.
naive_model.wv.vocab
{'films': <gensim.models.keyedvectors.Vocab at 0x7fe0757e95e0>,
'adapted': <gensim.models.keyedvectors.Vocab at 0x7fdfb1a44400>,
'from': <gensim.models.keyedvectors.Vocab at 0x7fdfb1a446d0>,
'comic': <gensim.models.keyedvectors.Vocab at 0x7fdfb1a44730>,
'books': <gensim.models.keyedvectors.Vocab at 0x7fdfb1a44d90>,
'have': <gensim.models.keyedvectors.Vocab at 0x7fdfb1a44df0>,
'had': <gensim.models.keyedvectors.Vocab at 0x7fdfb1a44e50>,
'plenty': <gensim.models.keyedvectors.Vocab at 0x7fdfb1a44eb0>,
'of': <gensim.models.keyedvectors.Vocab at 0x7fdfb1a44f10>,
'success': <gensim.models.keyedvectors.Vocab at 0x7fdfb1a44f70>,
',': <gensim.models.keyedvectors.Vocab at 0x7fdfb1a44fd0>,
'whether': <gensim.models.keyedvectors.Vocab at 0x7fdfb1a442e0>,
"they're": <gensim.models.keyedvectors.Vocab at 0x7fdfb1a44310>,
'about': <gensim.models.keyedvectors.Vocab at 0x7fdfb1a44370>,
'superheroes': <gensim.models.keyedvectors.Vocab at 0x7fdfbff44a90>,
'(': <gensim.models.keyedvectors.Vocab at 0x7fdfbff448b0>,
'batman': <gensim.models.keyedvectors.Vocab at 0x7fdfbff449a0>,
'superman': <gensim.models.keyedvectors.Vocab at 0x7fdfbff44910>,
'spawn': <gensim.models.keyedvectors.Vocab at 0x7fdfbff44790>,
')': <gensim.models.keyedvectors.Vocab at 0x7fdfbff446d0>,
'or': <gensim.models.keyedvectors.Vocab at 0x7fdfbff446a0>,
'geared': <gensim.models.keyedvectors.Vocab at 0x7fdfbff44a00>,
'toward': <gensim.models.keyedvectors.Vocab at 0x7fdfbff44b50>,
'kids': <gensim.models.keyedvectors.Vocab at 0x7fdfbff44e50>,
'casper': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c040>,
'the': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c0a0>,
'crowd': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c100>,
'ghost': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c160>,
'world': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c1c0>,
'but': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c220>,
"there's": <gensim.models.keyedvectors.Vocab at 0x7fdfb192c280>,
'never': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c2e0>,
'really': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c340>,
'been': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c3a0>,
'a': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c400>,
'book': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c460>,
'like': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c4c0>,
'hell': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c520>,
'before': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c580>,
'.': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c5e0>,
'for': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c640>,
'it': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c6a0>,
'was': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c700>,
'created': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c760>,
'by': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c7c0>,
'alan': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c820>,
'moore': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c880>,
'and': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c8e0>,
'eddie': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c940>,
'campbell': <gensim.models.keyedvectors.Vocab at 0x7fdfb192c9a0>,
'who': <gensim.models.keyedvectors.Vocab at 0x7fdfb192ca00>,
'brought': <gensim.models.keyedvectors.Vocab at 0x7fdfb192ca60>,
'medium': <gensim.models.keyedvectors.Vocab at 0x7fdfb192cac0>,
'to': <gensim.models.keyedvectors.Vocab at 0x7fdfb192cb20>,
'whole': <gensim.models.keyedvectors.Vocab at 0x7fdfb192cb80>,
'new': <gensim.models.keyedvectors.Vocab at 0x7fdfb192cbe0>,
'level': <gensim.models.keyedvectors.Vocab at 0x7fdfb192cc40>,
'in': <gensim.models.keyedvectors.Vocab at 0x7fdfb192cca0>,
'mid': <gensim.models.keyedvectors.Vocab at 0x7fdfb192cd00>,
"'80s": <gensim.models.keyedvectors.Vocab at 0x7fdfb192cd60>,
'with': <gensim.models.keyedvectors.Vocab at 0x7fdfb192cdc0>,
'series': <gensim.models.keyedvectors.Vocab at 0x7fdfb192ce20>,
'called': <gensim.models.keyedvectors.Vocab at 0x7fdfb192ce80>,
'say': <gensim.models.keyedvectors.Vocab at 0x7fdfb192cee0>,
'thoroughly': <gensim.models.keyedvectors.Vocab at 0x7fdfb192cf40>,
'subject': <gensim.models.keyedvectors.Vocab at 0x7fdfb192cfa0>,
'jack': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e040>,
'would': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e0a0>,
'be': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e100>,
'saying': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e160>,
'michael': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e1c0>,
'jackson': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e220>,
'is': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e280>,
'starting': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e2e0>,
'look': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e340>,
'little': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e3a0>,
'odd': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e400>,
'"': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e460>,
'graphic': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e4c0>,
'novel': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e520>,
'if': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e580>,
'you': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e5e0>,
'will': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e640>,
'over': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e6a0>,
'pages': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e700>,
'long': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e760>,
'includes': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e7c0>,
'nearly': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e820>,
'30': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e880>,
'more': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e8e0>,
'that': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e940>,
'nothing': <gensim.models.keyedvectors.Vocab at 0x7fdfb192e9a0>,
'other': <gensim.models.keyedvectors.Vocab at 0x7fdfb192ea00>,
'words': <gensim.models.keyedvectors.Vocab at 0x7fdfb192ea60>,
"don't": <gensim.models.keyedvectors.Vocab at 0x7fdfb192eac0>,
'dismiss': <gensim.models.keyedvectors.Vocab at 0x7fdfb192eb20>,
'this': <gensim.models.keyedvectors.Vocab at 0x7fdfb192eb80>,
'film': <gensim.models.keyedvectors.Vocab at 0x7fdfb192ebe0>,
'because': <gensim.models.keyedvectors.Vocab at 0x7fdfb192ec40>,
'its': <gensim.models.keyedvectors.Vocab at 0x7fdfb192eca0>,
'source': <gensim.models.keyedvectors.Vocab at 0x7fdfb192ed00>,
'can': <gensim.models.keyedvectors.Vocab at 0x7fdfb192ed60>,
'get': <gensim.models.keyedvectors.Vocab at 0x7fdfb192edc0>,
'past': <gensim.models.keyedvectors.Vocab at 0x7fdfb192ee20>,
'thing': <gensim.models.keyedvectors.Vocab at 0x7fdfb192ee80>,
'might': <gensim.models.keyedvectors.Vocab at 0x7fdfb192eee0>,
'find': <gensim.models.keyedvectors.Vocab at 0x7fdfb192ef40>,
'another': <gensim.models.keyedvectors.Vocab at 0x7fdfb192efa0>,
'stumbling': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f040>,
'block': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f0a0>,
'directors': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f100>,
'albert': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f160>,
'allen': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f1c0>,
'hughes': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f220>,
'getting': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f280>,
'brothers': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f2e0>,
'direct': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f340>,
'seems': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f3a0>,
'almost': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f400>,
'as': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f460>,
'ludicrous': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f4c0>,
'casting': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f520>,
'carrot': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f580>,
'top': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f5e0>,
'well': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f640>,
'anything': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f6a0>,
'me': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f700>,
':': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f760>,
'better': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f7c0>,
"that's": <gensim.models.keyedvectors.Vocab at 0x7fdfb192f820>,
'set': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f880>,
'ghetto': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f8e0>,
'features': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f940>,
'violent': <gensim.models.keyedvectors.Vocab at 0x7fdfb192f9a0>,
'street': <gensim.models.keyedvectors.Vocab at 0x7fdfb192fa00>,
'crime': <gensim.models.keyedvectors.Vocab at 0x7fdfb192fa60>,
'than': <gensim.models.keyedvectors.Vocab at 0x7fdfb192fac0>,
'mad': <gensim.models.keyedvectors.Vocab at 0x7fdfb192fb20>,
'geniuses': <gensim.models.keyedvectors.Vocab at 0x7fdfb192fb80>,
'behind': <gensim.models.keyedvectors.Vocab at 0x7fdfb192fbe0>,
'menace': <gensim.models.keyedvectors.Vocab at 0x7fdfb192fc40>,
'ii': <gensim.models.keyedvectors.Vocab at 0x7fdfb192fca0>,
'society': <gensim.models.keyedvectors.Vocab at 0x7fdfb192fd00>,
'?': <gensim.models.keyedvectors.Vocab at 0x7fdfb192fd60>,
'question': <gensim.models.keyedvectors.Vocab at 0x7fdfb192fdc0>,
'course': <gensim.models.keyedvectors.Vocab at 0x7fdfb192fe20>,
'east': <gensim.models.keyedvectors.Vocab at 0x7fdfb192fe80>,
'end': <gensim.models.keyedvectors.Vocab at 0x7fdfb192fee0>,
"it's": <gensim.models.keyedvectors.Vocab at 0x7fdfb192ff40>,
'place': <gensim.models.keyedvectors.Vocab at 0x7fdfb192ffa0>,
'where': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931040>,
'are': <gensim.models.keyedvectors.Vocab at 0x7fdfb19310a0>,
'nervous': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931100>,
'mysterious': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931160>,
'psychopath': <gensim.models.keyedvectors.Vocab at 0x7fdfb19311c0>,
'has': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931220>,
'through': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931280>,
'their': <gensim.models.keyedvectors.Vocab at 0x7fdfb19312e0>,
'profession': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931340>,
'precision': <gensim.models.keyedvectors.Vocab at 0x7fdfb19313a0>,
'when': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931400>,
'first': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931460>,
'stiff': <gensim.models.keyedvectors.Vocab at 0x7fdfb19314c0>,
'turns': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931520>,
'up': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931580>,
'peter': <gensim.models.keyedvectors.Vocab at 0x7fdfb19315e0>,
'robbie': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931640>,
'not': <gensim.models.keyedvectors.Vocab at 0x7fdfb19316a0>,
'enough': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931700>,
'calls': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931760>,
'inspector': <gensim.models.keyedvectors.Vocab at 0x7fdfb19317c0>,
'johnny': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931820>,
'depp': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931880>,
'blow': <gensim.models.keyedvectors.Vocab at 0x7fdfb19318e0>,
'crack': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931940>,
'case': <gensim.models.keyedvectors.Vocab at 0x7fdfb19319a0>,
'dreams': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931a00>,
'he': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931a60>,
'tries': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931ac0>,
'amounts': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931b20>,
'upon': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931b80>,
'arriving': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931be0>,
'befriends': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931c40>,
'an': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931ca0>,
'unfortunate': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931d00>,
'named': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931d60>,
'mary': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931dc0>,
'kelly': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931e20>,
'heather': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931e80>,
'graham': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931ee0>,
"isn't": <gensim.models.keyedvectors.Vocab at 0x7fdfb1931f40>,
'so': <gensim.models.keyedvectors.Vocab at 0x7fdfb1931fa0>,
'proceeds': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933040>,
'investigate': <gensim.models.keyedvectors.Vocab at 0x7fdfb19330a0>,
'horribly': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933100>,
'gruesome': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933160>,
'crimes': <gensim.models.keyedvectors.Vocab at 0x7fdfb19331c0>,
'even': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933220>,
'police': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933280>,
'surgeon': <gensim.models.keyedvectors.Vocab at 0x7fdfb19332e0>,
"can't": <gensim.models.keyedvectors.Vocab at 0x7fdfb1933340>,
'stomach': <gensim.models.keyedvectors.Vocab at 0x7fdfb19333a0>,
'i': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933400>,
'think': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933460>,
'anyone': <gensim.models.keyedvectors.Vocab at 0x7fdfb19334c0>,
'needs': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933520>,
'on': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933580>,
"won't": <gensim.models.keyedvectors.Vocab at 0x7fdfb19335e0>,
'go': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933640>,
'into': <gensim.models.keyedvectors.Vocab at 0x7fdfb19336a0>,
'here': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933700>,
'unique': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933760>,
'interesting': <gensim.models.keyedvectors.Vocab at 0x7fdfb19337c0>,
'theory': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933820>,
'both': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933880>,
'identity': <gensim.models.keyedvectors.Vocab at 0x7fdfb19338e0>,
'killer': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933940>,
'reasons': <gensim.models.keyedvectors.Vocab at 0x7fdfb19339a0>,
'chooses': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933a00>,
'they': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933a60>,
'bother': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933ac0>,
'screenwriters': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933b20>,
'terry': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933b80>,
'limit': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933be0>,
'les': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933c40>,
'do': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933ca0>,
'good': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933d00>,
'job': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933d60>,
'keeping': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933dc0>,
'him': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933e20>,
'hidden': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933e80>,
'viewers': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933ee0>,
'until': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933f40>,
'very': <gensim.models.keyedvectors.Vocab at 0x7fdfb1933fa0>,
'funny': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934040>,
'watch': <gensim.models.keyedvectors.Vocab at 0x7fdfb19340a0>,
'locals': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934100>,
'point': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934160>,
'finger': <gensim.models.keyedvectors.Vocab at 0x7fdfb19341c0>,
'blame': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934220>,
'at': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934280>,
'jews': <gensim.models.keyedvectors.Vocab at 0x7fdfb19342e0>,
'indians': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934340>,
'after': <gensim.models.keyedvectors.Vocab at 0x7fdfb19343a0>,
'all': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934400>,
'could': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934460>,
'capable': <gensim.models.keyedvectors.Vocab at 0x7fdfb19344c0>,
'committing': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934520>,
'such': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934580>,
'acts': <gensim.models.keyedvectors.Vocab at 0x7fdfb19345e0>,
'ending': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934640>,
'song': <gensim.models.keyedvectors.Vocab at 0x7fdfb19346a0>,
'days': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934700>,
'holds': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934760>,
'back': <gensim.models.keyedvectors.Vocab at 0x7fdfb19347c0>,
'electric': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934820>,
'made': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934880>,
'steve': <gensim.models.keyedvectors.Vocab at 0x7fdfb19348e0>,
'star': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934940>,
'worry': <gensim.models.keyedvectors.Vocab at 0x7fdfb19349a0>,
'-': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934a00>,
"it'll": <gensim.models.keyedvectors.Vocab at 0x7fdfb1934a60>,
'make': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934ac0>,
'sense': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934b20>,
'see': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934b80>,
'now': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934be0>,
'onto': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934c40>,
'appearance': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934ca0>,
'certainly': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934d00>,
'dark': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934d60>,
'bleak': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934dc0>,
'surprising': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934e20>,
'how': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934e80>,
'much': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934ee0>,
'looks': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934f40>,
'tim': <gensim.models.keyedvectors.Vocab at 0x7fdfb1934fa0>,
'burton': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936040>,
'planet': <gensim.models.keyedvectors.Vocab at 0x7fdfb19360a0>,
'apes': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936100>,
'did': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936160>,
'times': <gensim.models.keyedvectors.Vocab at 0x7fdfb19361c0>,
'sleepy': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936220>,
'hollow': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936280>,
'2': <gensim.models.keyedvectors.Vocab at 0x7fdfb19362e0>,
'print': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936340>,
'saw': <gensim.models.keyedvectors.Vocab at 0x7fdfb19363a0>,
"wasn't": <gensim.models.keyedvectors.Vocab at 0x7fdfb1936400>,
'completely': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936460>,
'finished': <gensim.models.keyedvectors.Vocab at 0x7fdfb19364c0>,
'color': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936520>,
'music': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936580>,
'no': <gensim.models.keyedvectors.Vocab at 0x7fdfb19365e0>,
'comments': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936640>,
'marilyn': <gensim.models.keyedvectors.Vocab at 0x7fdfb19366a0>,
'cinematographer': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936700>,
'word': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936760>,
'ably': <gensim.models.keyedvectors.Vocab at 0x7fdfb19367c0>,
'captures': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936820>,
'london': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936880>,
'helped': <gensim.models.keyedvectors.Vocab at 0x7fdfb19368e0>,
'flashy': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936940>,
'killing': <gensim.models.keyedvectors.Vocab at 0x7fdfb19369a0>,
'scenes': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936a00>,
'remind': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936a60>,
'crazy': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936ac0>,
'flashbacks': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936b20>,
'twin': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936b80>,
'peaks': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936be0>,
'though': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936c40>,
'violence': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936ca0>,
'comparison': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936d00>,
'black-and-white': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936d60>,
'oscar': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936dc0>,
'winner': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936e20>,
'martin': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936e80>,
'shakespeare': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936ee0>,
'love': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936f40>,
'production': <gensim.models.keyedvectors.Vocab at 0x7fdfb1936fa0>,
'design': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938040>,
'original': <gensim.models.keyedvectors.Vocab at 0x7fdfb19380a0>,
'surroundings': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938100>,
'one': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938160>,
'creepy': <gensim.models.keyedvectors.Vocab at 0x7fdfb19381c0>,
'acting': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938220>,
'solid': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938280>,
'turning': <gensim.models.keyedvectors.Vocab at 0x7fdfb19382e0>,
'typically': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938340>,
'strong': <gensim.models.keyedvectors.Vocab at 0x7fdfb19383a0>,
'performance': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938400>,
'handling': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938460>,
'british': <gensim.models.keyedvectors.Vocab at 0x7fdfb19384c0>,
'accent': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938520>,
'holm': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938580>,
'joe': <gensim.models.keyedvectors.Vocab at 0x7fdfb19385e0>,
'secret': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938640>,
'richardson': <gensim.models.keyedvectors.Vocab at 0x7fdfb19386a0>,
'102': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938700>,
'dalmatians': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938760>,
'great': <gensim.models.keyedvectors.Vocab at 0x7fdfb19387c0>,
'supporting': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938820>,
'roles': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938880>,
'big': <gensim.models.keyedvectors.Vocab at 0x7fdfb19388e0>,
'surprise': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938940>,
'time': <gensim.models.keyedvectors.Vocab at 0x7fdfb19389a0>,
'she': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938a00>,
'opened': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938a60>,
'her': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938ac0>,
'mouth': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938b20>,
'attempt': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938b80>,
'irish': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938be0>,
'actually': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938c40>,
'half': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938ca0>,
'bad': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938d00>,
'however': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938d60>,
'r': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938dc0>,
'sexuality': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938e20>,
'language': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938e80>,
'drug': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938ee0>,
'content': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938f40>,
'every': <gensim.models.keyedvectors.Vocab at 0x7fdfb1938fa0>,
'then': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939040>,
'movie': <gensim.models.keyedvectors.Vocab at 0x7fdfb19390a0>,
'comes': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939100>,
'along': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939160>,
'suspect': <gensim.models.keyedvectors.Vocab at 0x7fdfb19391c0>,
'studio': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939220>,
'indication': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939280>,
'perhaps': <gensim.models.keyedvectors.Vocab at 0x7fdfb19392e0>,
'becomes': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939340>,
'critical': <gensim.models.keyedvectors.Vocab at 0x7fdfb19393a0>,
'mtv': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939400>,
'high': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939460>,
'school': <gensim.models.keyedvectors.Vocab at 0x7fdfb19394c0>,
'comedy': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939520>,
'starring': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939580>,
'matthew': <gensim.models.keyedvectors.Vocab at 0x7fdfb19395e0>,
'broderick': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939640>,
'reese': <gensim.models.keyedvectors.Vocab at 0x7fdfb19396a0>,
'witherspoon': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939700>,
'current': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939760>,
'example': <gensim.models.keyedvectors.Vocab at 0x7fdfb19397c0>,
'anybody': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939820>,
'know': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939880>,
'week': <gensim.models.keyedvectors.Vocab at 0x7fdfb19398e0>,
'plot': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939940>,
'simple': <gensim.models.keyedvectors.Vocab at 0x7fdfb19399a0>,
'george': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939a00>,
'washington': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939a60>,
'carver': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939ac0>,
'having': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939b20>,
'student': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939b80>,
'tracy': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939be0>,
'flick': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939c40>,
'hand': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939ca0>,
'raised': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939d00>,
'way': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939d60>,
'mr': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939dc0>,
'm': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939e20>,
'sick': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939e80>,
'paul': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939ee0>,
'jock': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939f40>,
'run': <gensim.models.keyedvectors.Vocab at 0x7fdfb1939fa0>,
"paul's": <gensim.models.keyedvectors.Vocab at 0x7fdfb193c040>,
'sister': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c0a0>,
'jumps': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c100>,
'race': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c160>,
'personal': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c1c0>,
'side': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c220>,
'sleeper': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c280>,
'expectations': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c2e0>,
'were': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c340>,
'low': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c3a0>,
'going': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c400>,
'fact': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c460>,
'quality': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c4c0>,
'stuff': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c520>,
'reviews': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c580>,
'enthusiastic': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c5e0>,
'any': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c640>,
'right': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c6a0>,
'help': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c700>,
'baggage': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c760>,
'glowing': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c7c0>,
'which': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c820>,
'contrast': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c880>,
'negative': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c8e0>,
'reviewers': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c940>,
'likely': <gensim.models.keyedvectors.Vocab at 0x7fdfb193c9a0>,
'does': <gensim.models.keyedvectors.Vocab at 0x7fdfb193ca00>,
'live': <gensim.models.keyedvectors.Vocab at 0x7fdfb193ca60>,
'hype': <gensim.models.keyedvectors.Vocab at 0x7fdfb193cac0>,
'what': <gensim.models.keyedvectors.Vocab at 0x7fdfb193cb20>,
'makes': <gensim.models.keyedvectors.Vocab at 0x7fdfb193cb80>,
'disappointing': <gensim.models.keyedvectors.Vocab at 0x7fdfb193cbe0>,
'contains': <gensim.models.keyedvectors.Vocab at 0x7fdfb193cc40>,
'significant': <gensim.models.keyedvectors.Vocab at 0x7fdfb193cca0>,
'details': <gensim.models.keyedvectors.Vocab at 0x7fdfb193cd00>,
'lifted': <gensim.models.keyedvectors.Vocab at 0x7fdfb193cd60>,
'directly': <gensim.models.keyedvectors.Vocab at 0x7fdfb193cdc0>,
'released': <gensim.models.keyedvectors.Vocab at 0x7fdfb193ce20>,
'few': <gensim.models.keyedvectors.Vocab at 0x7fdfb193ce80>,
'months': <gensim.models.keyedvectors.Vocab at 0x7fdfb193cee0>,
'earlier': <gensim.models.keyedvectors.Vocab at 0x7fdfb193cf40>,
'similarities': <gensim.models.keyedvectors.Vocab at 0x7fdfb193cfa0>,
'staggering': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e040>,
'president': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e0a0>,
'extraordinary': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e100>,
'number': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e160>,
'clubs': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e1c0>,
'involved': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e220>,
'play': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e280>,
'max': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e2e0>,
'most': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e340>,
'tension': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e3a0>,
'potential': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e400>,
'relationship': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e460>,
'between': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e4c0>,
'teacher': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e520>,
'his': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e580>,
'single': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e5e0>,
'parent': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e640>,
'home': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e6a0>,
'contributed': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e700>,
'drive': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e760>,
'male': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e7c0>,
'bumbling': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e820>,
'adult': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e880>,
'pursues': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e8e0>,
'affair': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e940>,
'gets': <gensim.models.keyedvectors.Vocab at 0x7fdfb193e9a0>,
'caught': <gensim.models.keyedvectors.Vocab at 0x7fdfb193ea00>,
'life': <gensim.models.keyedvectors.Vocab at 0x7fdfb193ea60>,
'ruined': <gensim.models.keyedvectors.Vocab at 0x7fdfb193eac0>,
'bill': <gensim.models.keyedvectors.Vocab at 0x7fdfb193eb20>,
'murray': <gensim.models.keyedvectors.Vocab at 0x7fdfb193eb80>,
'several': <gensim.models.keyedvectors.Vocab at 0x7fdfb193ebe0>,
'happened': <gensim.models.keyedvectors.Vocab at 0x7fdfb193ec40>,
'individual': <gensim.models.keyedvectors.Vocab at 0x7fdfb193eca0>,
'screenplay': <gensim.models.keyedvectors.Vocab at 0x7fdfb193ed00>,
'contain': <gensim.models.keyedvectors.Vocab at 0x7fdfb193ed60>,
'many': <gensim.models.keyedvectors.Vocab at 0x7fdfb193edc0>,
'points': <gensim.models.keyedvectors.Vocab at 0x7fdfb193ee20>,
'yet': <gensim.models.keyedvectors.Vocab at 0x7fdfb193ee80>,
'probably': <gensim.models.keyedvectors.Vocab at 0x7fdfb193eee0>,
'aware': <gensim.models.keyedvectors.Vocab at 0x7fdfb193ef40>,
'each': <gensim.models.keyedvectors.Vocab at 0x7fdfb193efa0>,
'two': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf040>,
'different': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf0a0>,
'studios': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf100>,
'genre': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf160>,
'revenge': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf1c0>,
"hadn't": <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf220>,
'fully': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf280>,
'formed': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf2e0>,
'strengths': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf340>,
'rely': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf3a0>,
'fantastic': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf400>,
'performances': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf460>,
'newcomer': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf4c0>,
'jessica': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf520>,
'playing': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf580>,
'role': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf5e0>,
'fun': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf640>,
"he's": <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf6a0>,
'since': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf700>,
'revelation': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf760>,
'early': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf7c0>,
'year': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf820>,
'teenagers': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf880>,
'my': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf8e0>,
'money': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf940>,
'deserves': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bf9a0>,
'nomination': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bfa00>,
'once': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bfa60>,
'character': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bfac0>,
'speech': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bfb20>,
"you're": <gensim.models.keyedvectors.Vocab at 0x7fdfb18bfb80>,
'won': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bfbe0>,
"i've": <gensim.models.keyedvectors.Vocab at 0x7fdfb18bfc40>,
'seen': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bfca0>,
'there': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bfd00>,
'amount': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bfd60>,
'suppose': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bfdc0>,
'coming': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bfe20>,
'should': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bfe80>,
'expect': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bfee0>,
'less': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bff40>,
'starts': <gensim.models.keyedvectors.Vocab at 0x7fdfb18bffa0>,
'off': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2040>,
'light': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c20a0>,
'sitcom': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2100>,
'screws': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2160>,
'alexander': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c21c0>,
'decides': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2220>,
'add': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2280>,
'elements': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c22e0>,
'frankly': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2340>,
'distract': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c23a0>,
'story': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2400>,
"doesn't": <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2460>,
'determination': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c24c0>,
'win': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2520>,
'costs': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2580>,
'throw': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c25e0>,
'logical': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2640>,
'reason': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c26a0>,
'why': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2700>,
'lot': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2760>,
'takes': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c27c0>,
'explicitly': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2820>,
'mark': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2880>,
'disappointment': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c28e0>,
"you've": <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2940>,
'got': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c29a0>,
'mail': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2a00>,
'works': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2a60>,
'order': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2ac0>,
'cast': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2b20>,
'extremely': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2b80>,
'popular': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2be0>,
'attractive': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2c40>,
'stars': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2ca0>,
'them': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2d00>,
'share': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2d60>,
'screen': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2dc0>,
'hours': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2e20>,
'collect': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2e80>,
'real': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2ee0>,
'inventive': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2f40>,
'bone': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c2fa0>,
'body': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4040>,
'basically': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c40a0>,
'complete': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4100>,
'shop': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4160>,
'around': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c41c0>,
'corner': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4220>,
'only': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4280>,
'adding': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c42e0>,
'modern': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4340>,
'twists': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c43a0>,
'essentially': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4400>,
'goes': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4460>,
'against': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c44c0>,
'defies': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4520>,
'concepts': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4580>,
'contemporary': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c45e0>,
'filmmaking': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4640>,
'overly': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c46a0>,
'sentimental': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4700>,
'terribly': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4760>,
'mention': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c47c0>,
'manipulative': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4820>,
'oh': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4880>,
'enjoyable': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c48e0>,
'manipulation': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4940>,
'must': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c49a0>,
'something': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4a00>,
'work': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4a60>,
'absolutely': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4ac0>,
'hated': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4b20>,
'previous': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4b80>,
'sleepless': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4be0>,
'seattle': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4c40>,
"couldn't": <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4ca0>,
'directing': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4d00>,
'helmed': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4d60>,
'same': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4dc0>,
'woman': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4e20>,
"haven't": <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4e80>,
'quite': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4ee0>,
'figured': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4f40>,
'out': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c4fa0>,
'liked': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5040>,
'again': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c50a0>,
'important': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5100>,
'storyline': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5160>,
'cliched': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c51c0>,
'come': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5220>,
'tom': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5280>,
'hanks': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c52e0>,
'plays': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5340>,
'fox': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c53a0>,
'insanely': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5400>,
'likeable': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5460>,
'owner': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c54c0>,
'chain': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5520>,
'meg': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5580>,
'ryan': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c55e0>,
'kathleen': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5640>,
'kelley': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c56a0>,
"children's": <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5700>,
'nice': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5760>,
'homage': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c57c0>,
'soon': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5820>,
'become': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5880>,
'bitter': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c58e0>,
'rivals': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5940>,
'store': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c59a0>,
'opening': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5a00>,
'across': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5a60>,
'small': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5ac0>,
'business': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5b20>,
'already': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5b80>,
'internet': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5be0>,
'neither': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5c40>,
'party': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5ca0>,
'knows': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5d00>,
"person's": <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5d60>,
'true': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5dc0>,
'rest': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5e20>,
'serve': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5e80>,
'mere': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5ee0>,
'backdrop': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5f40>,
'sure': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c5fa0>,
'some': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8040>,
'mildly': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c80a0>,
'subplots': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8100>,
'fail': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8160>,
'utter': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c81c0>,
'cuteness': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8220>,
'main': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8280>,
'leads': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c82e0>,
'predictable': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8340>,
'climax': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c83a0>,
'damn': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8400>,
'cute': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8460>,
'well-done': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c84c0>,
'doubt': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8520>,
'entire': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8580>,
'scene': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c85e0>,
'evokes': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8640>,
'pure': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c86a0>,
'joy': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8700>,
'part': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8760>,
'discovers': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c87c0>,
'online': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8820>,
'filled': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8880>,
'lack': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c88e0>,
'happiness': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8940>,
'left': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c89a0>,
'theater': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8a00>,
'smiling': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8a60>,
'jaws': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8ac0>,
'rare': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8b20>,
'grabs': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8b80>,
'your': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8be0>,
'attention': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8c40>,
'shows': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8ca0>,
'image': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8d00>,
'opens': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8d60>,
'distant': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8dc0>,
'underwater': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8e20>,
'sounds': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8e80>,
'ominous': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8ee0>,
'bars': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8f40>,
'composer': <gensim.models.keyedvectors.Vocab at 0x7fdfb18c8fa0>,
'john': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca040>,
"williams'": <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca0a0>,
'infamous': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca100>,
'score': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca160>,
'director': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca1c0>,
'steven': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca220>,
'spielberg': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca280>,
'wastes': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca2e0>,
'taking': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca340>,
'us': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca3a0>,
'water': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca400>,
'midnight': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca460>,
'swim': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca4c0>,
'beautiful': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca520>,
'girl': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca580>,
'deadly': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca5e0>,
'away': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca640>,
'lets': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca6a0>,
'vulnerable': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca700>,
'we': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca760>,
'floating': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca7c0>,
'ocean': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca820>,
'attacked': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca880>,
'grip': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca8e0>,
'outstanding': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca940>,
'builds': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ca9a0>,
'theatrical': <gensim.models.keyedvectors.Vocab at 0x7fdfb18caa00>,
'act': <gensim.models.keyedvectors.Vocab at 0x7fdfb18caa60>,
'second': <gensim.models.keyedvectors.Vocab at 0x7fdfb18caac0>,
'unlike': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cab20>,
'filmmakers': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cab80>,
'deal': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cabe0>,
'restraint': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cac40>,
'refuses': <gensim.models.keyedvectors.Vocab at 0x7fdfb18caca0>,
'show': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cad00>,
'shark': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cad60>,
'middle': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cadc0>,
'merely': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cae20>,
'suggests': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cae80>,
'presence': <gensim.models.keyedvectors.Vocab at 0x7fdfb18caee0>,
'subjective': <gensim.models.keyedvectors.Vocab at 0x7fdfb18caf40>,
'shots': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cafa0>,
'building': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb040>,
'bit': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb0a0>,
'arrival': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb100>,
'truly': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb160>,
'terrifying': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb1c0>,
'let': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb220>,
'bored': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb280>,
'imagery': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb2e0>,
'chief': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb340>,
'brody': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb3a0>,
'roy': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb400>,
'york': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb460>,
'cop': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb4c0>,
'taken': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb520>,
'easy': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb580>,
'peaceful': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb5e0>,
'running': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb640>,
'station': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb6a0>,
'island': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb700>,
'england': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb760>,
'resort': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb7c0>,
'town': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb820>,
"hasn't": <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb880>,
'murder': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb8e0>,
'gun': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb940>,
'fired': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cb9a0>,
'25': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cba00>,
'years': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cba60>,
'vicious': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cbac0>,
'white': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cbb20>,
'attacks': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cbb80>,
'fourth': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cbbe0>,
'july': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cbc40>,
'mayor': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cbca0>,
'larry': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cbd00>,
'vaughn': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cbd60>,
'want': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cbdc0>,
'shut': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cbe20>,
'down': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cbe80>,
'summer': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cbee0>,
'tourist': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cbf40>,
'joined': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cbfa0>,
'matt': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce040>,
'richard': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce0a0>,
'dreyfuss': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce100>,
'young': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce160>,
'ambitious': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce1c0>,
'expert': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce220>,
'marine': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce280>,
'fascinated': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce2e0>,
'determined': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce340>,
'stop': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce3a0>,
'--': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce400>,
'knowledge': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce460>,
'exact': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce4c0>,
'perfect': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce520>,
'engine': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce580>,
'eating': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce5e0>,
'machine': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce640>,
'finally': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce6a0>,
'join': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce700>,
'old': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce760>,
'robert': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce7c0>,
'shaw': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce820>,
'boat': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce880>,
'search': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce8e0>,
'three': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce940>,
'men': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ce9a0>,
'hunt': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cea00>,
'inevitably': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cea60>,
'hunted': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ceac0>,
'thriller': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ceb20>,
'keen': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ceb80>,
'humor': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cebe0>,
'incredible': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cec40>,
'pacing': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ceca0>,
'horror': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ced00>,
'ten': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ced60>,
'movies': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cedc0>,
'rolled': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cee20>,
'wonder': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cee80>,
'took': <gensim.models.keyedvectors.Vocab at 0x7fdfb18ceee0>,
'america': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cef40>,
'storm': <gensim.models.keyedvectors.Vocab at 0x7fdfb18cefa0>,
'crown': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0040>,
'box': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d00a0>,
'office': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0100>,
'1977': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0160>,
'wars': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d01c0>,
'today': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0220>,
'fascination': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0280>,
'par': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d02e0>,
"hitchcock's": <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0340>,
'psycho': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d03a0>,
'age': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0400>,
'although': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0460>,
'grand': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d04c0>,
'technology': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0520>,
'exists': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0580>,
'technical': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d05e0>,
'sequences': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0640>,
'including': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d06a0>,
'mechanical': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0700>,
'sharks': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0760>,
'none': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d07c0>,
'improve': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0820>,
'lead': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0880>,
'overkill': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d08e0>,
'faced': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0940>,
'may': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d09a0>,
'produced': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0a00>,
'forced': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0a60>,
'traditional': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0ac0>,
'cinematic': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0b20>,
'characterization': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0b80>,
'sharp': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0be0>,
'editing': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0c40>,
'creative': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0ca0>,
'photography': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0d00>,
'instead': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0d60>,
'simply': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0dc0>,
'audience': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0e20>,
'digital': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0e80>,
'effects': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0ee0>,
'known': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0f40>,
'actors': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d0fa0>,
'draw': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1040>,
'redford': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d10a0>,
'newman': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1100>,
'nevertheless': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1160>,
'guaranteed': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d11c0>,
'successful': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1220>,
'careers': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1280>,
'gave': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d12e0>,
'refused': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1340>,
'overshadowed': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d13a0>,
'hits': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1400>,
'just': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1460>,
'notes': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d14c0>,
'sympathetic': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1520>,
'husband': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1580>,
'father': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d15e0>,
'political': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1640>,
'doing': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d16a0>,
"what's": <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1700>,
'warns': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1760>,
'previously': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d17c0>,
'american': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1820>,
'1973': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1880>,
'1974': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d18e0>,
'gives': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1940>,
'surprisingly': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d19a0>,
'mature': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1a00>,
'complex': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1a60>,
'someone': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1ac0>,
'literally': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1b20>,
'played': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1b80>,
"movie's": <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1be0>,
'captain': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1c40>,
'sorely': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1ca0>,
'overlooked': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1d00>,
'academy': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1d60>,
'awards': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1dc0>,
'parody': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1e20>,
'whose': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1e80>,
'borders': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1ee0>,
'slightly': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1f40>,
'deranged': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d1fa0>,
'caricature': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4040>,
'late': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d40a0>,
'below': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4100>,
'deck': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4160>,
'comparing': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d41c0>,
'drawn': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4220>,
'telling': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4280>,
'experiences': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d42e0>,
'aboard': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4340>,
'u': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d43a0>,
's': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4400>,
'navy': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4460>,
'ship': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d44c0>,
'war': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4520>,
'sunk': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4580>,
'japanese': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d45e0>,
'tale': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4640>,
'1': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d46a0>,
'000': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4700>,
'while': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4760>,
'slowly': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d47c0>,
'put': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4820>,
'delivers': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4880>,
'take': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d48e0>,
'best': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4940>,
'leave': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d49a0>,
'itself': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4a00>,
';': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4a60>,
'black': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4ac0>,
'eyes': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4b20>,
'endless': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4b80>,
'teeth': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4be0>,
'urge': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4c40>,
'eat': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4ca0>,
'epitome': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4d00>,
'fears': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4d60>,
'unknown': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4dc0>,
'threatening': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4e20>,
'nature': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4e80>,
'nemesis': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4ee0>,
'survived': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4f40>,
'dinosaurs': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d4fa0>,
'exist': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6040>,
'large': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d60a0>,
'threat': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6100>,
"spielberg's": <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6160>,
'feel': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d61c0>,
'bunch': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6220>,
'dangling': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6280>,
'legs': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d62e0>,
'ready': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6340>,
'combination': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d63a0>,
'actual': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6400>,
'footage': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6460>,
'five': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d64c0>,
'nicknamed': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6520>,
'bruce': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6580>,
'crew': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d65e0>,
'built': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6640>,
'shot': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d66a0>,
'angles': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6700>,
'forgotten': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6760>,
'sort': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d67c0>,
'waterworld': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6820>,
'1995': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6880>,
'cost': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d68e0>,
'universal': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6940>,
'worried': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d69a0>,
'bomb': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6a00>,
'obstacles': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6a60>,
'delivered': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6ac0>,
'finest': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6b20>,
'primal': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6b80>,
'ever': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6be0>,
'hollywood': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6c40>,
'moviemaking': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6ca0>,
'being': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6d00>,
'general': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6d60>,
'manager': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6dc0>,
'team': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6e20>,
'cap': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6e80>,
'era': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6ee0>,
'resources': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6f40>,
'dollar': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d6fa0>,
'spent': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d7040>,
'tackle': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d70a0>,
'spend': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d7100>,
'centers': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d7160>,
'teams': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d71c0>,
'detroit': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d7220>,
'lions': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d7280>,
'boast': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d72e0>,
'superstar': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d7340>,
'huge': <gensim.models.keyedvectors.Vocab at 0x7fdfb18d73a0>,
...}
# Word-vector arithmetic on the trained model, then list the 10 entries
# most similar to the resulting vector.
# Vectors are read through `naive_model.wv[...]`: indexing the model object
# directly is deprecated and scheduled for removal in gensim 4.0.0.
# NOTE(review): the classic analogy is king - man + woman ~ queen; this adds
# "queen" instead of "woman" -- confirm that is intended.
vector = naive_model.wv["king"] - naive_model.wv["man"] + naive_model.wv["queen"]
print(naive_model.wv.similar_by_vector(vector, topn=10, restrict_vocab=None))
[('king', 0.7491089105606079), ('queen', 0.6229594349861145), ('sir', 0.5982416868209839), ('nichols', 0.5822482109069824), ('oz', 0.5821514129638672), ('nathan', 0.5698883533477783), ('norman', 0.567724347114563), ('hamill', 0.5669344663619995), ('elijah', 0.5665290951728821), ('fisher', 0.5638027191162109)]
<ipython-input-31-0b2c5f22b770>:1: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
vector = naive_model["king"] - naive_model["man"] + naive_model["queen"]
# Print the model's similarity score for a few related word pairs,
# one score per line.
for first, second in (("man", "woman"), ("uncle", "aunt"), ("king", "queen")):
    print(naive_model.wv.similarity(w1=first, w2=second))
0.7349564
0.78206044
0.6388496
def render_TSNE(vocab, word_emb):
    """
    Project word embeddings to 2-D with t-SNE and show a labelled scatter plot.

    args:
        vocab - list of words; vocab[i] is used as the label of the i-th point
        word_emb - word embeddings, one row per word, in the same order as vocab
    """
    # Fixed random_state so repeated runs use the same seed.
    tsne = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
    projected = tsne.fit_transform(word_emb)
    x_coordinate = projected[:, 0]  # x
    y_coordinate = projected[:, 1]  # y

    # scatter plot initialization
    fig, ax = plt.subplots()
    fig.set_size_inches(40, 20)
    ax.scatter(x_coordinate, y_coordinate)

    # Label each point from the `vocab` argument. The previous version read the
    # notebook-global `random_vocab` here, so the parameter was silently
    # ignored and the function only worked when that global existed and
    # matched the embeddings passed in.
    for i, word in enumerate(vocab):
        ax.annotate(word, (x_coordinate[i], y_coordinate[i]), fontsize=30)

    plt.show()
# Visualise 50 randomly chosen words from the naive model's vocabulary.
vocab = list(naive_model.wv.vocab)  # Load vocab list
random_vocab = random.sample(vocab, k=50)  # Random sampling of 50 words
# Look the vectors up through `.wv`: indexing the model object directly is
# deprecated and scheduled for removal in gensim 4.0.0.
word_emb = naive_model.wv[random_vocab]  # Load embedding vectors of the sampled words
render_TSNE(random_vocab, word_emb)
<ipython-input-34-3d1b6738c650>:3: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
word_emb = naive_model[random_vocab] # Load embedding vector about sampled words
# Visualise 50 randomly chosen entries from the vocabulary of model_with_SP.
vocab = list(model_with_SP.wv.vocab)  # Load vocab list
random_vocab = random.sample(vocab, k=50)  # Random sampling of 50 words
# Look the vectors up through `.wv`: indexing the model object directly is
# deprecated and scheduled for removal in gensim 4.0.0.
word_emb = model_with_SP.wv[random_vocab]  # Load embedding vectors of the sampled words
render_TSNE(random_vocab, word_emb)
<ipython-input-35-e3fda3362c12>:3: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
word_emb = model_with_SP[random_vocab] # Load embedding vector about sampled words
Reference
- AI504: Programming for AI Lecture at KAIST AI