1. 라이브러리 및 자료 불러오기
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# Load the movie metadata CSV from the current working directory and
# print its (rows, columns) shape as a quick sanity check.
movies = pd.read_csv('tmdb_5000_movies.csv')
print(movies.shape)
(4803, 20)
# Keep only the columns used in the rest of the notebook.
# .copy() makes movies_df an independent frame: later cells assign new and
# replaced columns on it, and without the copy those writes target a slice of
# `movies` (pandas' SettingWithCopyWarning / chained-assignment hazard).
movies_df = movies[['id','title','genres','vote_average','vote_count','popularity','keywords','overview']].copy()
2. 데이터 전처리
movies_df['genres'][0]
'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'
type(movies_df['genres'][0])
str
genres, keywords 컬럼 자료형 변환 (str -> list)
from ast import literal_eval
# Parse the stringified list-of-dict columns into real Python lists.
movies_df['genres'] = movies_df['genres'].map(literal_eval)
movies_df['keywords'] = movies_df['keywords'].map(literal_eval)
movies_df['genres'][0]
[{'id': 28, 'name': 'Action'},
{'id': 12, 'name': 'Adventure'},
{'id': 14, 'name': 'Fantasy'},
{'id': 878, 'name': 'Science Fiction'}]
type(movies_df['genres'][0])
list
list 내 딕셔너리, name키의 값을 리스트로 변환
movies_df['genres']
0 [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...
1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
2 [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...
3 [{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...
4 [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...
...
4798 [{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...
4799 [{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...
4800 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
4801 []
4802 [{'id': 99, 'name': 'Documentary'}]
Name: genres, Length: 4803, dtype: object
movies_df['genres'][0][2]['name']
'Fantasy'
# Replace each list of {'id': ..., 'name': ...} dicts with the list of its 'name' values.
movies_df['genres'] = movies_df['genres'].apply(
    lambda entries: [entry['name'] for entry in entries]
)
movies_df['keywords'] = movies_df['keywords'].apply(
    lambda entries: [entry['name'] for entry in entries]
)
movies_df[['genres']][0:3]
|
genres |
0 |
[Action, Adventure, Fantasy, Science Fiction] |
1 |
[Adventure, Fantasy, Action] |
2 |
[Action, Adventure, Crime] |
3. 장르 유사도 측정 -> 추천시스템 구축
from sklearn.feature_extraction.text import CountVectorizer
# Join each movie's genre names into one space-separated string so it can be
# fed to a text vectorizer.
movies_df['genres_literal'] = movies_df['genres'].apply(' '.join)
movies_df['genres_literal'][0:3]
0 Action Adventure Fantasy Science Fiction
1 Adventure Fantasy Action
2 Action Adventure Crime
Name: genres_literal, dtype: object
type(movies_df['genres_literal'][0])
str
# Bag-of-words over the space-joined genre strings, using unigrams and bigrams.
# min_df=1 keeps every term, exactly as min_df=0 did (every vocabulary term
# occurs in at least one document), but recent scikit-learn releases validate
# an integer min_df as >= 1 and reject 0.
count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])
print(genre_mat.shape)
(4803, 276)
# Unigram-only vectorizer over the same genre strings; printing the shape
# shows the vocabulary size without bigrams.
# NOTE(review): genre_mat2 is not referenced again in the visible part of
# this notebook — the similarity step below uses genre_mat.
count_vect2 = CountVectorizer(min_df=1, ngram_range=(1, 1))
genre_mat2 = count_vect2.fit_transform(movies_df['genres_literal'])
print(genre_mat2.shape)
(4803, 22)
코사인 유사도(cosine_similarity) 계산
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the genre vectors of all movies
# (omitting Y compares the matrix with itself).
genre_sim = cosine_similarity(genre_mat)
print(genre_sim.shape)
# The rows are printed in reverse order (last movie first).
print(genre_sim[::-1])
(4803, 4803)
[[0. 0. 0. ... 0. 0. 1. ]
[0. 0. 0. ... 0. 0. 0. ]
[0. 0. 0. ... 1. 0. 0. ]
...
[0.4472136 0.4 1. ... 0. 0. 0. ]
[0.59628479 1. 0.4 ... 0. 0. 0. ]
[1. 0.59628479 0.4472136 ... 0. 0. 0. ]]
# For every movie (row), column positions ordered from highest to lowest
# similarity: ascending argsort along each row, then reversed.
genre_sim_sorted_ind = np.argsort(genre_sim, axis=1)[:, ::-1]
genre_sim_sorted_ind[::-1]
array([[4802, 4710, 4521, ..., 3140, 3141, 0],
[4802, 1594, 1596, ..., 3204, 3205, 0],
[4800, 3809, 1895, ..., 2229, 2230, 0],
...,
[ 2, 1740, 1542, ..., 3000, 2999, 2401],
[ 262, 1, 129, ..., 3069, 3067, 2401],
[ 0, 3494, 813, ..., 3038, 3037, 2401]], dtype=int64)
장르 코사인 유사도 활용한 영화 추천
title_movie = movies_df[movies_df['title'] == 'The Godfather']
title_movie
|
id |
title |
genres |
vote_average |
vote_count |
popularity |
keywords |
overview |
genres_literal |
3337 |
238 |
The Godfather |
[Drama, Crime] |
8.4 |
5893 |
143.659698 |
[italy, love at first sight, loss of father, p... |
Spanning the years 1945 to 1955, a chronicle o... |
Drama Crime |
# Index labels of the row(s) whose title matched.
# NOTE(review): these labels are used below as positional row numbers, which
# assumes movies_df still carries its default 0..n-1 index — confirm.
title_index = title_movie.index.values
# First 10 columns of the similarity ranking for the matched row(s).
similar_indexes = genre_sim_sorted_ind[title_index, :10]
# Flatten to 1-D so the positions can be passed to iloc.
similar_indexes = similar_indexes.reshape(-1)
movies_df.iloc[similar_indexes].head(1)
|
id |
title |
genres |
vote_average |
vote_count |
popularity |
keywords |
overview |
genres_literal |
2731 |
240 |
The Godfather: Part II |
[Drama, Crime] |
8.3 |
3338 |
105.792936 |
[italo-american, cuba, vororte, melancholy, pr... |
In the continuing saga of the Corleone crime f... |
Drama Crime |
코사인 유사도 산출 함수
def find_sim_movie_ver1(df, sorted_ind, title_name, top_n=10):
    """Return the rows of ``df`` ranked first in the similarity ordering of a title.

    Args:
        df: DataFrame with a ``title`` column. Its row order must match the
            row order of ``sorted_ind``.
        sorted_ind: 2-D integer array in which row ``i`` lists positional row
            numbers of ``df``, expected to be ordered from most to least
            similar to row ``i``.
        title_name: Exact title to look up in ``df['title']``.
        top_n: Number of leading entries taken from each matching row of
            ``sorted_ind``.

    Returns:
        The rows of ``df`` selected by position. The looked-up title itself is
        not filtered out. If several rows share the title their results are
        concatenated; if no row matches, an empty DataFrame is returned.
    """
    # Work with row *positions*, not index labels: both ``sorted_ind`` and
    # ``iloc`` are positional, so labels are only correct for a default
    # RangeIndex.
    title_positions = np.flatnonzero((df['title'] == title_name).to_numpy())
    similar_indexes = sorted_ind[title_positions, :top_n].reshape(-1)
    return df.iloc[similar_indexes]
The Godfather 와 유사한 영화 20개 추천 (top_n=20, 아래에서는 첫 행만 출력)
# Fetch the 20 top-ranked genre neighbours of 'The Godfather'.
similar_movies = find_sim_movie_ver1(
    movies_df,
    genre_sim_sorted_ind,
    title_name='The Godfather',
    top_n=20,
)
similar_movies[['title', 'vote_average', 'genres', 'vote_count']].head(1)
|
title |
vote_average |
genres |
vote_count |
2731 |
The Godfather: Part II |
8.3 |
[Drama, Crime] |
3338 |
# Top 10 movies by raw vote_average; vote_count is shown alongside so that
# ratings backed by only a few votes are visible.
movies_df[['title','vote_average','vote_count']].sort_values('vote_average', ascending=False)[:10]
|
title |
vote_average |
vote_count |
3519 |
Stiff Upper Lips |
10.0 |
1 |
4247 |
Me You and Five Bucks |
10.0 |
2 |
4045 |
Dancer, Texas Pop. 81 |
10.0 |
1 |
4662 |
Little Big Top |
10.0 |
1 |
3992 |
Sardaarji |
9.5 |
2 |
2386 |
One Man's Hero |
9.3 |
2 |
2970 |
There Goes My Baby |
8.5 |
2 |
1881 |
The Shawshank Redemption |
8.5 |
8205 |
2796 |
The Prisoner of Zenda |
8.4 |
11 |
3337 |
The Godfather |
8.4 |
5893 |
4. 추천시스템에 가중평점 반영
@ 가중평점(Weighted Rating, 평점 & 평가횟수):
(v/(v+m))*R + (m/(v+m))*C
- v : 영화별 평점을 투표한 횟수(vote_count) # 변동값
- m : 평점 부여되는 기준(최소 투표횟수) -> 여기선 vote_count 의 60번째 백분위수 (quantile(0.6), 즉 투표수 상위 40% 경계값)
- R : 개별 영화의 평점 # 변동값
- C : 전체 영화의 평균 평점 (vote_average 의 평균)
# m: 60th-percentile vote count; C: mean rating over all movies.
m = movies_df['vote_count'].quantile(q=0.6)
C = movies_df['vote_average'].mean()
print('C:', round(C, 3), 'm:', round(m, 3))
C: 6.092 m: 370.2
가중평균 계산 함수
def weighted_vote_average(record):
    """Return the weighted rating of one movie row.

    Computes ``(v / (v + m)) * R + (m / (v + m)) * C`` where ``v`` is the
    row's ``vote_count``, ``R`` its ``vote_average``, and ``m`` / ``C`` are
    the module-level vote-count threshold and mean rating.

    Args:
        record: Mapping-like row providing ``vote_count`` and ``vote_average``.

    Returns:
        The weighted rating as a number.
    """
    votes = record['vote_count']
    rating = record['vote_average']
    denominator = votes + m
    # Share contributed by the movie's own rating vs. the global mean.
    own_part = (votes / denominator) * rating
    prior_part = (m / denominator) * C
    return own_part + prior_part
# Apply the weighted-rating formula row by row and store it as a new column.
movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis='columns')
movies_df.head(1)
|
id |
title |
genres |
vote_average |
vote_count |
popularity |
keywords |
overview |
genres_literal |
weighted_vote |
0 |
19995 |
Avatar |
[Action, Adventure, Fantasy, Science Fiction] |
7.2 |
11800 |
150.437577 |
[culture clash, future, space war, space colon... |
In the 22nd century, a paraplegic Marine is di... |
Action Adventure Fantasy Science Fiction |
7.166301 |
5. 결과 -> 가중평점(weighted_vote) 기준 상위 10개 영화 (아래 코드는 전체 영화를 가중평점으로 정렬하며, 대부 유사영화로 한정하지는 않음)
# Ten highest weighted ratings across all movies.
(
    movies_df[['weighted_vote', 'title', 'vote_average', 'vote_count', 'genres']]
    .sort_values(by='weighted_vote', ascending=False)
    .head(10)
)
|
weighted_vote |
title |
vote_average |
vote_count |
genres |
1881 |
8.396052 |
The Shawshank Redemption |
8.5 |
8205 |
[Drama, Crime] |
3337 |
8.263591 |
The Godfather |
8.4 |
5893 |
[Drama, Crime] |
662 |
8.216455 |
Fight Club |
8.3 |
9413 |
[Drama] |
3232 |
8.207102 |
Pulp Fiction |
8.3 |
8428 |
[Thriller, Crime] |
65 |
8.136930 |
The Dark Knight |
8.2 |
12002 |
[Drama, Action, Crime, Thriller] |
1818 |
8.126069 |
Schindler's List |
8.3 |
4329 |
[Drama, History, War] |
3865 |
8.123248 |
Whiplash |
8.3 |
4254 |
[Drama] |
809 |
8.105954 |
Forrest Gump |
8.2 |
7927 |
[Comedy, Drama, Romance] |
2294 |
8.105867 |
Spirited Away |
8.3 |
3840 |
[Fantasy, Adventure, Animation, Family] |
2731 |
8.079586 |
The Godfather: Part II |
8.3 |
3338 |
[Drama, Crime] |