1. 데이터 및 라이브러리 불러오기
import pandas as pd
import numpy as np
# Load the MovieLens movie metadata and the user rating records.
movies = pd.read_csv('./data_movie_lens/movies.csv')
ratings = pd.read_csv('./data_movie_lens/ratings.csv')
# Report the (rows, columns) shape of each table, one per line.
print(movies.shape, ratings.shape, sep='\n')
(9742, 3)
(100836, 4)
# Preview the first row of the movie metadata table.
movies.head(1)
|
movieId |
title |
genres |
0 |
1 |
Toy Story (1995) |
Adventure|Animation|Children|Comedy|Fantasy |
# Preview the first row of the raw ratings table.
ratings.head(1)
|
userId |
movieId |
rating |
timestamp |
0 |
1 |
1 |
4.0 |
964982703 |
2. 데이터 전처리
# The timestamp column is not used below, so drop it.
ratings = ratings.drop(columns=['timestamp'])
ratings.head(1)
|
userId |
movieId |
rating |
0 |
1 |
1 |
4.0 |
# User x movie matrix of ratings keyed by movieId; cells without a rating are NaN.
ratings_matrix = ratings.pivot_table(
    values='rating',
    index='userId',
    columns='movieId',
)
print(ratings_matrix.shape)
ratings_matrix.head(2)
(610, 9724)
movieId |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
... |
193565 |
193567 |
193571 |
193573 |
193579 |
193581 |
193583 |
193585 |
193587 |
193609 |
userId |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
4.0 |
NaN |
4.0 |
NaN |
NaN |
4.0 |
NaN |
NaN |
NaN |
NaN |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
2 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
2 rows × 9724 columns
# Attach title and genres to every rating row by joining on movieId.
rating_movies = ratings.merge(movies, on='movieId')
print(rating_movies.shape)
rating_movies.head(1)
(100836, 5)
|
userId |
movieId |
rating |
title |
genres |
0 |
1 |
1 |
4.0 |
Toy Story (1995) |
Adventure|Animation|Children|Comedy|Fantasy |
# Rebuild the user x movie matrix with titles as column labels.
# NOTE(review): this yields fewer columns than the movieId pivot (9719 vs 9724),
# so movies that share a title appear to be collapsed into one column.
ratings_matrix = rating_movies.pivot_table(
    values='rating',
    index='userId',
    columns='title',
)
ratings_matrix.head(2)
title |
'71 (2014) |
'Hellboy': The Seeds of Creation (2004) |
'Round Midnight (1986) |
'Salem's Lot (2004) |
'Til There Was You (1997) |
'Tis the Season for Love (2015) |
'burbs, The (1989) |
'night Mother (1986) |
(500) Days of Summer (2009) |
*batteries not included (1987) |
... |
Zulu (2013) |
[REC] (2007) |
[REC]² (2009) |
[REC]³ 3 Génesis (2012) |
anohana: The Flower We Saw That Day - The Movie (2013) |
eXistenZ (1999) |
xXx (2002) |
xXx: State of the Union (2005) |
¡Three Amigos! (1986) |
À nous la liberté (Freedom for Us) (1931) |
userId |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
4.0 |
NaN |
2 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
2 rows × 9719 columns
# Replace missing ratings (NaN) with 0 so the matrix is fully numeric.
ratings_matrix = ratings_matrix.fillna(0)
ratings_matrix.head(2)
title |
'71 (2014) |
'Hellboy': The Seeds of Creation (2004) |
'Round Midnight (1986) |
'Salem's Lot (2004) |
'Til There Was You (1997) |
'Tis the Season for Love (2015) |
'burbs, The (1989) |
'night Mother (1986) |
(500) Days of Summer (2009) |
*batteries not included (1987) |
... |
Zulu (2013) |
[REC] (2007) |
[REC]² (2009) |
[REC]³ 3 Génesis (2012) |
anohana: The Flower We Saw That Day - The Movie (2013) |
eXistenZ (1999) |
xXx (2002) |
xXx: State of the Union (2005) |
¡Three Amigos! (1986) |
À nous la liberté (Freedom for Us) (1931) |
userId |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
... |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
4.0 |
0.0 |
2 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
... |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
2 rows × 9719 columns
3. 영화들 간 유사도 산출
# Put movies on the rows (movie x user) so similarity is computed between movies.
ratings_matrix_T = ratings_matrix.T
print(ratings_matrix_T.shape)
ratings_matrix_T.head(2)
(9719, 610)
userId |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
... |
601 |
602 |
603 |
604 |
605 |
606 |
607 |
608 |
609 |
610 |
title |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
'71 (2014) |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
... |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
4.0 |
'Hellboy': The Seeds of Creation (2004) |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
... |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
2 rows × 610 columns
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the rows (movies) of the transposed matrix.
item_sim = cosine_similarity(X=ratings_matrix_T, Y=ratings_matrix_T)
print(item_sim.shape)
item_sim[:2]
(9719, 9719)
array([[1. , 0. , 0. , ..., 0.32732684, 0. ,
0. ],
[0. , 1. , 0.70710678, ..., 0. , 0. ,
0. ]])
# Label both axes of the similarity matrix with the movie titles.
item_sim_df = pd.DataFrame(
    item_sim,
    index=ratings_matrix.columns,
    columns=ratings_matrix.columns,
)
print(item_sim_df.shape)
item_sim_df.head(2)
(9719, 9719)
title |
'71 (2014) |
'Hellboy': The Seeds of Creation (2004) |
'Round Midnight (1986) |
'Salem's Lot (2004) |
'Til There Was You (1997) |
'Tis the Season for Love (2015) |
'burbs, The (1989) |
'night Mother (1986) |
(500) Days of Summer (2009) |
*batteries not included (1987) |
... |
Zulu (2013) |
[REC] (2007) |
[REC]² (2009) |
[REC]³ 3 Génesis (2012) |
anohana: The Flower We Saw That Day - The Movie (2013) |
eXistenZ (1999) |
xXx (2002) |
xXx: State of the Union (2005) |
¡Three Amigos! (1986) |
À nous la liberté (Freedom for Us) (1931) |
title |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
'71 (2014) |
1.0 |
0.0 |
0.000000 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.141653 |
0.0 |
... |
0.0 |
0.342055 |
0.543305 |
0.707107 |
0.0 |
0.0 |
0.139431 |
0.327327 |
0.0 |
0.0 |
'Hellboy': The Seeds of Creation (2004) |
0.0 |
1.0 |
0.707107 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.000000 |
0.0 |
... |
0.0 |
0.000000 |
0.000000 |
0.000000 |
0.0 |
0.0 |
0.000000 |
0.000000 |
0.0 |
0.0 |
2 rows × 9719 columns
# Five titles most similar to The Godfather. Position 0 of the sorted result is
# skipped; it is expected to be the movie itself (self-similarity 1.0).
item_sim_df["Godfather, The (1972)"].sort_values(ascending=False).iloc[1:6]
title
Godfather: Part II, The (1974) 0.821773
Goodfellas (1990) 0.664841
One Flew Over the Cuckoo's Nest (1975) 0.620536
Star Wars: Episode IV - A New Hope (1977) 0.595317
Fargo (1996) 0.588614
Name: Godfather, The (1972), dtype: float64
4. 협업 필터링 적용한 추천
아이템 기반 인접 이웃 데이터를 이용한 예측 평점
- 아이템 기반 : 이 상품을 선택한 다른 고객이 구매한 상품 추천
cf. 사용자 기반 : 유저와 비슷한 상품을 구매해 온 다른 고객이 구매한 상품 추천
def predict_rating(ratings_arr, item_sim_arr):
    """Predict ratings as similarity-weighted sums over all items.

    Args:
        ratings_arr: 2-D array of ratings (rows are users, columns are items).
        item_sim_arr: 2-D square array of item-to-item similarities.

    Returns:
        Array with the same shape as ``ratings_arr``. Column ``i`` of
        ``ratings_arr.dot(item_sim_arr)`` is divided by the sum of absolute
        similarities in row ``i`` of ``item_sim_arr``.
    """
    weighted_sum = ratings_arr.dot(item_sim_arr)
    # Row-wise normaliser, reshaped to (1, n_items) so it broadcasts across users.
    normaliser = np.abs(item_sim_arr).sum(axis=1)[np.newaxis, :]
    return weighted_sum / normaliser
# Predict every user's rating for every movie using all item similarities.
ratings_pred = predict_rating(ratings_matrix.to_numpy(), item_sim_df.to_numpy())
ratings_pred[0]
array([0.07034471, 0.5778545 , 0.32169559, ..., 0.13602448, 0.29295452,
0.72034722])
# Wrap the predictions in a DataFrame labelled like the ratings matrix.
ratings_pred_matrix = pd.DataFrame(
    ratings_pred,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)
print(ratings_pred_matrix.shape)
ratings_pred_matrix.head(2)
(610, 9719)
title |
'71 (2014) |
'Hellboy': The Seeds of Creation (2004) |
'Round Midnight (1986) |
'Salem's Lot (2004) |
'Til There Was You (1997) |
'Tis the Season for Love (2015) |
'burbs, The (1989) |
'night Mother (1986) |
(500) Days of Summer (2009) |
*batteries not included (1987) |
... |
Zulu (2013) |
[REC] (2007) |
[REC]² (2009) |
[REC]³ 3 Génesis (2012) |
anohana: The Flower We Saw That Day - The Movie (2013) |
eXistenZ (1999) |
xXx (2002) |
xXx: State of the Union (2005) |
¡Three Amigos! (1986) |
À nous la liberté (Freedom for Us) (1931) |
userId |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
0.070345 |
0.577855 |
0.321696 |
0.227055 |
0.206958 |
0.194615 |
0.249883 |
0.102542 |
0.157084 |
0.178197 |
... |
0.113608 |
0.181738 |
0.133962 |
0.128574 |
0.006179 |
0.212070 |
0.192921 |
0.136024 |
0.292955 |
0.720347 |
2 |
0.018260 |
0.042744 |
0.018861 |
0.000000 |
0.000000 |
0.035995 |
0.013413 |
0.002314 |
0.032213 |
0.014863 |
... |
0.015640 |
0.020855 |
0.020119 |
0.015745 |
0.049983 |
0.014876 |
0.021616 |
0.024528 |
0.017563 |
0.000000 |
2 rows × 9719 columns
5. 예측평점 정확도 판단 (오차함수 MSE 적용)
유저가 실제 평점을 부여한 영화와 예측점수 비교
from sklearn.metrics import mean_squared_error
def get_mse(pred, actual):
    """Return the mean squared error restricted to nonzero entries of ``actual``.

    Args:
        pred: Array of predicted values, indexable with the positions of
            nonzero entries in ``actual``.
        actual: Array of observed values; entries equal to 0 are excluded
            from the comparison.

    Returns:
        The value of ``mean_squared_error`` over the selected entries.
    """
    # Locate the nonzero positions once and reuse them for both arrays.
    rated = actual.nonzero()
    pred_rated = pred[rated].flatten()
    actual_rated = actual[rated].flatten()
    return mean_squared_error(pred_rated, actual_rated)
# Report the MSE of the all-neighbor item-based predictions against the ratings matrix.
print('아이템 기반 모든 인접 이웃 MSE : ', get_mse(ratings_pred, ratings_matrix.values))
아이템 기반 모든 인접 이웃 MSE : 9.895354759094706
유사도 상위 n개의 예측평점 계산
def predict_rating_topsim(ratings_arr, item_sim_arr, n=20):
    """Predict ratings using only each item's ``n`` most similar items.

    For every item column, the ``n`` items with the highest similarity are
    selected, and each user's prediction is the similarity-weighted sum of
    that user's ratings for those items, divided by the sum of the absolute
    similarities.

    Args:
        ratings_arr: 2-D array of ratings (rows are users, columns are items).
        item_sim_arr: 2-D square array of item-to-item similarities.
        n: Number of most-similar items to use per target item.

    Returns:
        Array of predictions with the same shape as ``ratings_arr``.
    """
    pred = np.zeros(ratings_arr.shape)
    for col in range(ratings_arr.shape[1]):
        # Indices of the n largest similarities, most similar first. A plain
        # 1-D index array is used; wrapping it in a list relied on NumPy's
        # deprecated list-of-arrays indexing.
        top_n_items = np.argsort(item_sim_arr[:, col])[:-n - 1:-1]
        top_n_sims = item_sim_arr[col, top_n_items]
        # One matrix-vector product fills the whole column for all users,
        # replacing the per-user Python loop.
        weighted_sum = ratings_arr[:, top_n_items].dot(top_n_sims)
        pred[:, col] = weighted_sum / np.sum(np.abs(top_n_sims))
    return pred
import warnings

# Silence every warning from this point on.
warnings.filterwarnings('ignore')

# Predict with the 20 most similar items per movie and report the MSE.
ratings_pred = predict_rating_topsim(
    ratings_matrix.values,
    item_sim_df.values,
    n=20,
)
print('아이템 기반 인접 TOP-20 이웃 MSE: ', get_mse(ratings_pred, ratings_matrix.values))

# Wrap the predictions in a DataFrame labelled like the ratings matrix.
ratings_pred_matrix = pd.DataFrame(
    ratings_pred,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)
아이템 기반 인접 TOP-20 이웃 MSE: 3.694999233129397
6. 사용자에게 영화 추천
9번 유저가 높은 평점을 준 영화 확인
# Ratings given by user 9; keep only rated movies (> 0) and show the ten highest.
user_rating_id = ratings_matrix.loc[9]
user_rating_id[user_rating_id > 0].sort_values(ascending=False).head(10)
title
Adaptation (2002) 5.0
Citizen Kane (1941) 5.0
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) 5.0
Producers, The (1968) 5.0
Lord of the Rings: The Two Towers, The (2002) 5.0
Lord of the Rings: The Fellowship of the Ring, The (2001) 5.0
Back to the Future (1985) 5.0
Austin Powers in Goldmember (2002) 5.0
Minority Report (2002) 4.0
Witness (1985) 4.0
Name: 9, dtype: float64
평점 5점을 준 영화는 어댑테이션, 오스틴 파워, 반지의 제왕 등이다.
흥행성이 높은 어드벤처 영화를 좋아한다고 예상할 수 있다.
관람하지 않은 영화 추천
def get_unseen_movies(ratings_matrix, userId):
    """Return the movies that the given user has not rated.

    Args:
        ratings_matrix: DataFrame indexed by user id with one column per
            movie; a value greater than 0 marks a movie as rated.
        userId: Row label of the user in ``ratings_matrix``.

    Returns:
        List of column labels whose rating for ``userId`` is not greater
        than 0, in the column order of ``ratings_matrix``.
    """
    user_rating = ratings_matrix.loc[userId, :]
    # A set gives O(1) membership tests; the list used before made the
    # filter below quadratic in the number of movies.
    already_seen = set(user_rating[user_rating > 0].index)
    return [movie for movie in ratings_matrix.columns.tolist()
            if movie not in already_seen]
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    """Return the highest predicted scores among a user's unseen movies.

    Args:
        pred_df: DataFrame of predicted scores, indexed by user id with one
            column per movie.
        userId: Row label of the user in ``pred_df``.
        unseen_list: Column labels to consider as candidates.
        top_n: Maximum number of entries to return.

    Returns:
        Series of the candidates' predicted scores sorted in descending
        order, cut to the first ``top_n`` entries.
    """
    candidate_scores = pred_df.loc[userId, unseen_list]
    ranked = candidate_scores.sort_values(ascending=False)
    return ranked[:top_n]
# Movies user 9 has not rated yet.
unseen_list = get_unseen_movies(ratings_matrix, 9)
# Ten unseen movies with the highest predicted score for user 9.
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 9, unseen_list, top_n=10)
# Present the scores as a one-column table named 'pred_score'.
recomm_movies = recomm_movies.to_frame(name='pred_score')
recomm_movies
|
pred_score |
title |
|
Shrek (2001) |
0.866202 |
Spider-Man (2002) |
0.857854 |
Last Samurai, The (2003) |
0.817473 |
Indiana Jones and the Temple of Doom (1984) |
0.816626 |
Matrix Reloaded, The (2003) |
0.800990 |
Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001) |
0.765159 |
Gladiator (2000) |
0.740956 |
Matrix, The (1999) |
0.732693 |
Pirates of the Caribbean: The Curse of the Black Pearl (2003) |
0.689591 |
Lord of the Rings: The Return of the King, The (2003) |
0.676711 |
아이템 기반