유저 맞춤 영화추천 (CF 기반)

IngCoding·2022년 2월 24일

파이썬 #4 데이터 분석

목록 보기

13/16

1. 데이터 및 라이브러리 불러오기

import pandas as pd
import numpy as np

# Load the movie metadata and the user ratings from the local data_movie_lens CSV files.
movies = pd.read_csv('./data_movie_lens/movies.csv')
ratings = pd.read_csv('./data_movie_lens/ratings.csv')

print(movies.shape)
print(ratings.shape)

# Roughly 100k ratings given by about 600 users to about 9,000 movies

(9742, 3)
(100836, 4)

# Preview the first row of the movies table (columns: movieId, title, genres).
movies.head(1)

	movieId	title	genres
0	1	Toy Story (1995)	Adventure|Animation|Children|Comedy|Fantasy

# Preview the first row of the ratings table (columns: userId, movieId, rating, timestamp).
ratings.head(1)

	userId	movieId	rating	timestamp
0	1	1	4.0	964982703

2. 데이터 전처리

# Drop the 'timestamp' column, which is not used anywhere in this analysis.
ratings = ratings.drop(['timestamp'],axis=1)
ratings.head(1)

	userId	movieId	rating
0	1	1	4.0

# Build a user x movie matrix with pivot_table: one row per userId,
# one column per movieId, cell value = rating (NaN where the user gave no rating).
ratings_matrix = ratings.pivot_table('rating', index='userId', columns='movieId')

# Show the ratings each user gave to each movie in matrix form.
print(ratings_matrix.shape)
ratings_matrix.head(2)

(610, 9724)

movieId	1	2	3	4	5	6	7	8	9	10	...	193565	193567	193571	193573	193579	193581	193583	193585	193587	193609
userId
1	4.0	NaN	4.0	NaN	NaN	4.0	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

2 rows × 9724 columns

# Join ratings with movie metadata on movieId so every rating row carries its title.

rating_movies = pd.merge(ratings, movies, on='movieId')
print(rating_movies.shape)
    # The row count equals the total number of ratings given by all users.
rating_movies.head(1)

(100836, 5)

	userId	movieId	rating	title	genres
0	1	1	4.0	Toy Story (1995)	Adventure|Animation|Children|Comedy|Fantasy

# Re-pivot using the 'title' column so the matrix columns are readable movie names.
# NOTE(review): pivot_table aggregates (mean by default) when several movieIds
# share the same title, so this matrix can have fewer columns than the
# movieId-based one above.

ratings_matrix = rating_movies.pivot_table('rating', index='userId', columns='title')
ratings_matrix.head(2)

title	'71 (2014)	'Hellboy': The Seeds of Creation (2004)	'Round Midnight (1986)	'Salem's Lot (2004)	'Til There Was You (1997)	'Tis the Season for Love (2015)	'burbs, The (1989)	'night Mother (1986)	(500) Days of Summer (2009)	*batteries not included (1987)	...	Zulu (2013)	[REC] (2007)	[REC]² (2009)	[REC]³ 3 Génesis (2012)	anohana: The Flower We Saw That Day - The Movie (2013)	eXistenZ (1999)	xXx (2002)	xXx: State of the Union (2005)	¡Three Amigos! (1986)	À nous la liberté (Freedom for Us) (1931)
userId
1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	4.0	NaN
2	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

2 rows × 9719 columns

# Replace every NaN with 0, so that 0 means "this user did not rate this movie".

# This yields the final user-item matrix.
ratings_matrix = ratings_matrix.fillna(0)
ratings_matrix.head(2)

title	'71 (2014)	'Hellboy': The Seeds of Creation (2004)	'Round Midnight (1986)	'Salem's Lot (2004)	'Til There Was You (1997)	'Tis the Season for Love (2015)	'burbs, The (1989)	'night Mother (1986)	(500) Days of Summer (2009)	*batteries not included (1987)	...	Zulu (2013)	[REC] (2007)	[REC]² (2009)	[REC]³ 3 Génesis (2012)	anohana: The Flower We Saw That Day - The Movie (2013)	eXistenZ (1999)	xXx (2002)	xXx: State of the Union (2005)	¡Three Amigos! (1986)	À nous la liberté (Freedom for Us) (1931)
userId
1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	4.0	0.0
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

2 rows × 9719 columns

3. 영화들 간 유사도 산출

# Transpose into an item-user matrix so that each row is one movie's rating vector.
ratings_matrix_T = ratings_matrix.transpose() # transposed matrix

print(ratings_matrix_T.shape)
ratings_matrix_T.head(2)

(9719, 610)

userId	1	2	3	4	5	6	7	8	9	10	...	601	602	603	604	605	606	607	608	609	610
title
'71 (2014)	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	4.0
'Hellboy': The Seeds of Creation (2004)	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

2 rows × 610 columns

# Compute the pairwise cosine similarity between movies (rows of the item-user matrix).
from sklearn.metrics.pairwise import cosine_similarity

item_sim = cosine_similarity(ratings_matrix_T, ratings_matrix_T)
print(item_sim.shape)
item_sim[0:2]

(9719, 9719)





array([[1.        , 0.        , 0.        , ..., 0.32732684, 0.        ,
        0.        ],
       [0.        , 1.        , 0.70710678, ..., 0.        , 0.        ,
        0.        ]])

# Wrap the NumPy similarity matrix in a DataFrame labelled with movie titles
# on both the rows and the columns.

item_sim_df = pd.DataFrame(data=item_sim, index=ratings_matrix.columns,
                           columns=ratings_matrix.columns)
print(item_sim_df.shape)
item_sim_df.head(2)

(9719, 9719)

title	'71 (2014)	'Hellboy': The Seeds of Creation (2004)	'Round Midnight (1986)	'Salem's Lot (2004)	'Til There Was You (1997)	'Tis the Season for Love (2015)	'burbs, The (1989)	'night Mother (1986)	(500) Days of Summer (2009)	*batteries not included (1987)	...	Zulu (2013)	[REC] (2007)	[REC]² (2009)	[REC]³ 3 Génesis (2012)	anohana: The Flower We Saw That Day - The Movie (2013)	eXistenZ (1999)	xXx (2002)	xXx: State of the Union (2005)	¡Three Amigos! (1986)	À nous la liberté (Freedom for Us) (1931)
title
'71 (2014)	1.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.141653	0.0	...	0.0	0.342055	0.543305	0.707107	0.0	0.0	0.139431	0.327327	0.0	0.0
'Hellboy': The Seeds of Creation (2004)	0.0	1.0	0.707107	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	...	0.0	0.000000	0.000000	0.000000	0.0	0.0	0.000000	0.000000	0.0	0.0

2 rows × 9719 columns

# List the 5 movies most similar to The Godfather. Position 0 of the sorted
# result is skipped on the assumption that it is The Godfather itself.
item_sim_df["Godfather, The (1972)"].sort_values(ascending=False)[1:6]

title
Godfather: Part II, The (1974)               0.821773
Goodfellas (1990)                            0.664841
One Flew Over the Cuckoo's Nest (1975)       0.620536
Star Wars: Episode IV - A New Hope (1977)    0.595317
Fargo (1996)                                 0.588614
Name: Godfather, The (1972), dtype: float64

4. 협업 필터링 적용한 추천

아이템 기반 최근접 이웃 협업 필터링으로 예측 평점 계산

아이템 기반 : 이 상품을 선택한 다른 고객이 구매한 상품 추천

cf. 사용자 기반 : 유저와 비슷한 상품을 구매해 온 다른 고객이 구매한 상품 추천

# 평점 벡터(행), 유사도 벡터(열) 내적하여 예측평점 계산 함수
def predict_rating(ratings_arr, item_sim_arr):
    """Predict ratings as a similarity-weighted average over all items.

    For user ``u`` and item ``i`` the prediction is::

        sum_j ratings[u, j] * sim[j, i] / sum_j |sim[j, i]|

    Args:
        ratings_arr: user-item rating array of shape (n_users, n_items);
            0 is treated as "not rated" and therefore contributes nothing.
        item_sim_arr: item-item similarity array of shape (n_items, n_items).

    Returns:
        Array of predicted ratings. For a 2-D ``ratings_arr`` the shape is
        (n_users, n_items). Items whose similarity column is entirely zero
        get a prediction of 0 instead of NaN.
    """
    sim = np.asarray(item_sim_arr)
    weighted_sum = ratings_arr.dot(item_sim_arr)

    # Normalise each item column by the total absolute weight that entered its
    # dot product. The dot product contracts over sim[:, i], so the sum must run
    # along axis 0; for a symmetric matrix this equals the axis-1 sum.
    norm = np.abs(sim).sum(axis=0)[np.newaxis, :]

    # An all-zero similarity column yields a zero weighted sum as well, so
    # dividing by 1 there returns 0 and avoids the 0/0 -> NaN case.
    safe_norm = np.where(norm == 0, 1.0, norm)
    return weighted_sum / safe_norm

# Predict ratings for every (user, movie) pair using all items as neighbours,
# then show the predictions for the first user row.
ratings_pred = predict_rating(ratings_matrix.values, item_sim_df.values)
ratings_pred[0]

array([0.07034471, 0.5778545 , 0.32169559, ..., 0.13602448, 0.29295452,
       0.72034722])

# Convert the prediction array into a DataFrame (users x movie titles)
# holding the predicted rating per movie.

ratings_pred_matrix = pd.DataFrame(data = ratings_pred, index = ratings_matrix.index,
                                    columns = ratings_matrix.columns)
print(ratings_pred_matrix.shape)
ratings_pred_matrix.head(2)

(610, 9719)

title	'71 (2014)	'Hellboy': The Seeds of Creation (2004)	'Round Midnight (1986)	'Salem's Lot (2004)	'Til There Was You (1997)	'Tis the Season for Love (2015)	'burbs, The (1989)	'night Mother (1986)	(500) Days of Summer (2009)	*batteries not included (1987)	...	Zulu (2013)	[REC] (2007)	[REC]² (2009)	[REC]³ 3 Génesis (2012)	anohana: The Flower We Saw That Day - The Movie (2013)	eXistenZ (1999)	xXx (2002)	xXx: State of the Union (2005)	¡Three Amigos! (1986)	À nous la liberté (Freedom for Us) (1931)
userId
1	0.070345	0.577855	0.321696	0.227055	0.206958	0.194615	0.249883	0.102542	0.157084	0.178197	...	0.113608	0.181738	0.133962	0.128574	0.006179	0.212070	0.192921	0.136024	0.292955	0.720347
2	0.018260	0.042744	0.018861	0.000000	0.000000	0.035995	0.013413	0.002314	0.032213	0.014863	...	0.015640	0.020855	0.020119	0.015745	0.049983	0.014876	0.021616	0.024528	0.017563	0.000000

2 rows × 9719 columns

5. 예측평점 정확도 판단 (오차함수 MSE 적용)

유저가 실제 평점을 부여한 영화와 예측점수 비교

# 예측성능평가(MSE) 적용 함수 
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    """Return the mean squared error restricted to entries that were actually rated.

    Args:
        pred: array of predicted ratings, same shape as ``actual``.
        actual: array of observed ratings in which 0 marks "not rated".

    Returns:
        The value of ``mean_squared_error`` computed over the positions where
        ``actual`` is non-zero.
    """
    # nonzero() gives the index tuple of every rated position; indexing with it
    # already produces 1-D data, and flatten() returns a 1-D copy.
    rated_positions = actual.nonzero()
    predicted_scores = pred[rated_positions].flatten()
    observed_scores = actual[rated_positions].flatten()
    return mean_squared_error(predicted_scores, observed_scores)

# Report the MSE of the all-neighbour item-based predictions against the observed ratings.
print('아이템 기반 모든 인접 이웃 MSE : ', get_mse(ratings_pred, ratings_matrix.values))

아이템 기반 모든 인접 이웃 MSE :  9.895354759094706

유사도 상위 n개의 예측평점 계산

def predict_rating_topsim(ratings_arr, item_sim_arr, n=20):
    """Predict ratings using only the ``n`` most similar items per target item.

    For every item column the ``n`` largest entries of its similarity vector
    are selected, and each user's prediction is the similarity-weighted average
    of that user's ratings on the selected items.

    Args:
        ratings_arr: user-item rating array of shape (n_users, n_items);
            0 is treated as "not rated".
        item_sim_arr: item-item similarity array of shape (n_items, n_items).
            It is expected to be symmetric (e.g. cosine similarity).
        n: number of most-similar items to use per target item.

    Returns:
        Array of shape ``ratings_arr.shape`` with the predicted ratings.
        Columns whose selected similarities are all zero (or ``n == 0``)
        are left at 0 instead of producing NaN.

    Note:
        The target item itself is NOT excluded from the neighbours: its
        self-similarity is normally the largest value, so it is usually among
        the selected items.
    """
    # Start from an all-zero prediction matrix of the same shape as the ratings.
    pred = np.zeros(ratings_arr.shape)

    for col in range(ratings_arr.shape[1]):
        sim_to_col = item_sim_arr[:, col]
        # Indices of the n largest similarities, in descending order.
        # Index with the array directly; wrapping it in a list would add an
        # extra axis under current NumPy indexing rules.
        top_n_items = np.argsort(sim_to_col)[:-n-1:-1]
        weights = sim_to_col[top_n_items]

        norm = np.abs(weights).sum()
        if norm == 0:
            # No usable neighbours: keep the 0 prediction rather than divide by zero.
            continue

        # One matrix-vector product computes this column for all users at once.
        pred[:, col] = ratings_arr[:, top_n_items].dot(weights) / norm
    return pred

# NOTE(review): this silences every warning for the rest of the session,
# not only those raised by the prediction loop below.
import warnings
warnings.filterwarnings('ignore')

# Run the top-n prediction (took about two minutes in the author's run).
ratings_pred = predict_rating_topsim(ratings_matrix.values, item_sim_df.values, n=20)
print('아이템 기반 인접 TOP-20 이웃 MSE: ', get_mse(ratings_pred, ratings_matrix.values))

# Rebuild the DataFrame from the new predictions -> final per-movie predicted ratings.
ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index=ratings_matrix.index,
                                   columns = ratings_matrix.columns)

아이템 기반 인접 TOP-20 이웃 MSE:  3.694999233129397

6. 사용자에게 영화 추천

9번 유저가 높은 평점을 준 영화 확인

# Take user 9's rating row and list the 10 highest-rated movies among those
# the user actually rated (rating > 0).
user_rating_id = ratings_matrix.loc[9,:]
user_rating_id[user_rating_id > 0].sort_values(ascending=False)[:10]

title
Adaptation (2002)                                                                 5.0
Citizen Kane (1941)                                                               5.0
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    5.0
Producers, The (1968)                                                             5.0
Lord of the Rings: The Two Towers, The (2002)                                     5.0
Lord of the Rings: The Fellowship of the Ring, The (2001)                         5.0
Back to the Future (1985)                                                         5.0
Austin Powers in Goldmember (2002)                                                5.0
Minority Report (2002)                                                            4.0
Witness (1985)                                                                    4.0
Name: 9, dtype: float64

평점 5점을 준 영화는 어댑테이션, 오스틴 파워, 반지의 제왕 등이다.
흥행성이 높은 어드벤처 영화를 좋아한다고 예상할 수 있다.

관람하지 않은 영화 추천

# user_rating이 0보다 크면 관람한 영화

def get_unseen_movies(ratings_matrix, userId):
    """Return the movies that the given user has not rated yet.

    Args:
        ratings_matrix: user-item DataFrame (index = user ids, columns = movie
            titles) in which a value greater than 0 means "rated / seen".
        userId: index label of the user to look up.

    Returns:
        List of column labels whose rating for ``userId`` is not greater than 0,
        in the same order as ``ratings_matrix.columns``.

    Raises:
        KeyError: if ``userId`` is not present in the index.
    """
    # Row of this user's ratings; the resulting Series is indexed by movie title.
    user_rating = ratings_matrix.loc[userId, :]

    # A set makes each membership test O(1) instead of scanning a list per movie.
    already_seen = set(user_rating[user_rating > 0].index.tolist())

    # Keep column order while dropping everything the user has already rated.
    return [movie for movie in ratings_matrix.columns.tolist()
            if movie not in already_seen]

# 개인별 영화 추천 함수 

def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    """Return the ``top_n`` highest predicted ratings among a user's unseen movies.

    Args:
        pred_df: DataFrame of predicted ratings (index = user ids,
            columns = movie titles).
        userId: index label of the user to recommend for.
        unseen_list: column labels to choose the recommendations from.
        top_n: number of movies to return.

    Returns:
        Series of predicted ratings indexed by movie title, sorted from the
        highest prediction downwards and cut to the first ``top_n`` entries.
    """
    # Restrict the user's prediction row to the candidate movies only.
    candidate_scores = pred_df.loc[userId, unseen_list]
    ranked_scores = candidate_scores.sort_values(ascending=False)
    return ranked_scores.iloc[:top_n]

# Collect the movies user 9 has not rated yet.
unseen_list = get_unseen_movies(ratings_matrix, 9)

# Pick the 10 unseen movies with the highest predicted rating for user 9.
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 9, unseen_list, top_n=10)

# Turn the resulting Series into a one-column DataFrame named 'pred_score' for display.
recomm_movies = pd.DataFrame(data=recomm_movies.values, index=recomm_movies.index, columns=['pred_score'])
recomm_movies

	pred_score
title
Shrek (2001)	0.866202
Spider-Man (2002)	0.857854
Last Samurai, The (2003)	0.817473
Indiana Jones and the Temple of Doom (1984)	0.816626
Matrix Reloaded, The (2003)	0.800990
Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)	0.765159
Gladiator (2000)	0.740956
Matrix, The (1999)	0.732693
Pirates of the Caribbean: The Curse of the Black Pearl (2003)	0.689591
Lord of the Rings: The Return of the King, The (2003)	0.676711