# 교재: 텐서플로 2와 머신러닝으로 시작하는 자연어처리, 위키북스
# (Textbook: NLP with TensorFlow 2 and Machine Learning, Wikibooks)
# Cosine similarity
def cos_sim(A, B):
    """Return the cosine similarity cos(theta) between 1-D vectors A and B.

    Assumes neither vector is all-zero (a zero vector divides by zero).
    """
    # Bug fix: the original called bare `dot`/`norm`, which are never
    # imported in this file — use the np.* qualified names instead.
    return np.dot(A, B) / (np.linalg.norm(A) * np.linalg.norm(B))
# Four-term count vectors for three toy documents; doc3 is doc2 scaled
# by two, so its cosine similarity with doc2 should be ~1.0.
doc1 = np.array([1, 1, 0, 1])
doc2 = np.array([1, 0, 1, 1])
doc3 = np.array([2, 0, 2, 2])

# Pairwise cosine similarities between the three documents.
for left, right in ((doc1, doc2), (doc1, doc3), (doc2, doc3)):
    print(cos_sim(left, right))
# Output:
# 0.6666666666666667
# 0.6666666666666667
# 1.0000000000000002
# Euclidean distance (smaller distance = more similar documents)
def eu_dist(A, B):
    """Return the L2 (Euclidean) distance between vectors A and B."""
    diff = A - B
    return np.sqrt((diff ** 2).sum())
# Same distances via scikit-learn (expects 2-D inputs, hence the [doc] wrap).
from sklearn.metrics.pairwise import euclidean_distances

for left, right in ((doc1, doc2), (doc1, doc3), (doc2, doc3)):
    print(euclidean_distances([left], [right]))
# Output:
# [[1.41421356]]
# [[2.64575131]]
# [[1.73205081]]
# Manhattan (L1) distance between the same document pairs.
from sklearn.metrics.pairwise import manhattan_distances

for left, right in ((doc1, doc2), (doc1, doc3), (doc2, doc3)):
    print(manhattan_distances([left], [right]))
# Output:
# [[2.]]
# [[5.]]
# [[3.]]
# Bug fix: the Jaccard example tokenizes documents with .split(), but
# doc1/doc2/doc3 above are numpy count vectors, which have no .split()
# (the original raised AttributeError). Define sentence-string versions
# of the documents here. The sentences are chosen to match the count
# vectors over the vocabulary (you, know, I, like), which reproduces the
# printed results 0.5 / 0.5 / 1.0.
sent_doc1 = "you know like"          # -> counts [1, 1, 0, 1]
sent_doc2 = "you I like"             # -> counts [1, 0, 1, 1]
sent_doc3 = "you you I I like like"  # -> counts [2, 0, 2, 2]

token_doc1 = sent_doc1.split()
token_doc2 = sent_doc2.split()
token_doc3 = sent_doc3.split()

# Union and intersection of the token sets of doc1 and doc2.
union = set(token_doc1).union(set(token_doc2))
inter = set(token_doc1).intersection(set(token_doc2))
len(inter) / len(union)  # Jaccard similarity = 0.5
# Output: 0.5
# Jaccard similarity = |A ∩ B| / |A ∪ B| computed over the token sets.
def _jaccard(tokens_a, tokens_b):
    """Return the Jaccard similarity of two token lists."""
    a, b = set(tokens_a), set(tokens_b)
    return len(a & b) / len(a | b)

print(_jaccard(token_doc1, token_doc2))
print(_jaccard(token_doc1, token_doc3))
print(_jaccard(token_doc2, token_doc3))
# Output:
# 0.5
# 0.5
# 1.0