해당 글은 제로베이스데이터스쿨 학습자료를 참고하여 작성되었습니다.
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
movies = pd.read_csv('https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/tmdb_5000_movies.csv')
print(movies.shape)
--------------------
(4803, 20)
# Preview the raw TMDB table.
movies.head(3)
# Keep only the columns used below. Take an explicit copy so the later column
# assignments modify an independent frame instead of a slice of `movies`
# (assigning into a slice triggers pandas' SettingWithCopyWarning).
movies_df = movies[['id', 'title', 'genres', 'vote_average', 'vote_count', 'popularity', 'keywords', 'overview']].copy()
movies_df.head(3)
# Inspect the raw `genres` value of the first row; it is stored as a string.
movies_df[['genres']][:1].values
----------------------------------
array([['[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]']],
dtype=object)
from ast import literal_eval
# Parse the string-encoded `genres` and `keywords` columns into Python objects.
movies_df['genres'] = movies_df['genres'].apply(literal_eval)
movies_df['keywords'] = movies_df['keywords'].apply(literal_eval)
movies_df.head()
# Show the parsed genres of the first row.
movies_df['genres'][0]
-----------------------------------------
[{'id': 28, 'name': 'Action'},
{'id': 12, 'name': 'Adventure'},
{'id': 14, 'name': 'Fantasy'},
{'id': 878, 'name': 'Science Fiction'}]
# Reduce each list of dicts to a plain list of its 'name' values.
movies_df['genres'] = movies_df['genres'].apply(lambda x: [y['name'] for y in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x: [y['name'] for y in x])
movies_df[['genres', 'keywords']]
# Join the genre names with spaces into one string per movie, ready for text vectorization.
movies_df['genres_literal'] = movies_df['genres'].apply(lambda x: (' ').join(x))
movies_df.head(3)
from sklearn.feature_extraction.text import CountVectorizer
# Build a count matrix over the space-joined genre strings using unigrams and
# bigrams. min_df=1 keeps every term that occurs in at least one document,
# which is the same vocabulary an integer min_df of 0 would give; recent
# scikit-learn releases reject an integer min_df of 0 during parameter validation.
count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])
print(genre_mat.shape)
------------------------
(4803, 276)
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between every pair of rows of the genre matrix.
genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)
# Show the similarity rows of the first two movies.
print(genre_sim[:2])
---------------------------------------------------------------------------
(4803, 4803)
[[1. 0.59628479 0.4472136 ... 0. 0. 0. ]
[0.59628479 1. 0.4 ... 0. 0. 0. ]]
# Display the full similarity matrix.
genre_sim
-------------------------------------------------------------------------
array([[1. , 0.59628479, 0.4472136 , ..., 0. , 0. ,
0. ],
[0.59628479, 1. , 0.4 , ..., 0. , 0. ,
0. ],
[0.4472136 , 0.4 , 1. , ..., 0. , 0. ,
0. ],
...,
[0. , 0. , 0. , ..., 1. , 0. ,
0. ],
[0. , 0. , 0. , ..., 0. , 0. ,
0. ],
[0. , 0. , 0. , ..., 0. , 0. ,
1. ]])
# argsort orders each row ascending; reversing the columns yields, per row,
# the column indices ordered from highest to lowest similarity.
genre_sim_sorted_ind = genre_sim.argsort()[:, ::-1]
print(genre_sim_sorted_ind[:1])
--------------------------------------
[[ 0 3494 813 ... 3038 3037 2401]]
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Return the ``top_n`` most similar rows of ``df`` for a given title.

    Args:
        df: DataFrame with a ``title`` column. Assumed to have its rows in
            the same order as the rows of ``sorted_ind``.
        sorted_ind: 2-D integer array; row ``i`` holds row positions of
            ``df`` ordered from most to least similar to row ``i``.
        title_name: Exact title to look up in ``df['title']``.
        top_n: Number of leading entries to take from each matching row.

    Returns:
        The rows of ``df`` at the selected positions, in similarity order.
        The queried movie itself is not filtered out. An unknown title
        yields an empty DataFrame.
    """
    # Use row positions rather than index labels: both ``sorted_ind`` and
    # ``df.iloc`` are positional, so labels only work with a default index.
    title_index = (df['title'] == title_name).to_numpy().nonzero()[0]
    similar_indexes = sorted_ind[title_index, :top_n]
    print(similar_indexes)
    # Flatten the (matches, top_n) selection into a 1-D list of positions.
    similar_indexes = similar_indexes.reshape(-1)
    return df.iloc[similar_indexes]
# Look up the 10 entries most genre-similar to 'The Godfather' and show their ratings.
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather', 10)
similar_movies[['title', 'vote_average']]
------------------------------------------------------
[[2731 1243 3636 1946 2640 4065 1847 4217 883 3866]]
# Top 10 movies by raw average rating, shown alongside their vote counts.
movies_df[['title', 'vote_average', 'vote_count']].sort_values('vote_average', ascending=False)[:10]
# C: mean of vote_average over all movies.
C = movies_df['vote_average'].mean()
# m: 60th percentile of vote_count.
m = movies_df['vote_count'].quantile(0.6)
print('C:', round(C,3), 'm:',round(m,3))
------------------------------------------
C: 6.092 m: 370.2
def weighted_vote_average(record):
    """Return the vote-count-weighted rating for one movie record.

    Blends the record's own ``vote_average`` (R) with the module-level value
    ``C``: R is weighted by ``v / (v + m)`` and ``C`` by ``m / (v + m)``,
    where ``v`` is the record's ``vote_count`` and ``m`` is read from module
    scope.

    Args:
        record: Mapping-like row exposing ``vote_count`` and ``vote_average``.

    Returns:
        The weighted rating ``(v / (v + m)) * R + (m / (v + m)) * C``.
    """
    v = record['vote_count']
    R = record['vote_average']
    # Shared denominator of both weights; the two weights sum to 1.
    total = v + m
    return (v / total) * R + (m / total) * C
# Compute the weighted rating row by row and store it as a new column.
movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis=1)
movies_df.head()
# Inspect movies with fewer than 10 votes.
movies_df[movies_df['vote_count'] < 10]
# Top 10 movies by weighted rating.
movies_df[['title', 'vote_average', 'weighted_vote', 'vote_count']].sort_values(by='weighted_vote', ascending=False)[:10]
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Return the ``top_n`` similar movies with the highest ``weighted_vote``.

    Takes the ``top_n * 2`` most similar entries for the queried title,
    drops the queried movie itself, and keeps the ``top_n`` best by
    ``weighted_vote``.

    Args:
        df: DataFrame with ``title`` and ``weighted_vote`` columns. Assumed
            to have its rows in the same order as the rows of ``sorted_ind``.
        sorted_ind: 2-D integer array; row ``i`` holds row positions of
            ``df`` ordered from most to least similar to row ``i``.
        title_name: Exact title to look up in ``df['title']``.
        top_n: Number of rows to return.

    Returns:
        Up to ``top_n`` rows of ``df`` sorted by ``weighted_vote`` descending.
        An unknown title yields an empty DataFrame.
    """
    # Use row positions rather than index labels: both ``sorted_ind`` and
    # ``df.iloc`` are positional, so labels only work with a default index.
    title_index = (df['title'] == title_name).to_numpy().nonzero()[0]
    similar_indexes = sorted_ind[title_index, :(top_n * 2)]
    print(similar_indexes)
    # Exclude every row that matched the title. A set-membership test works
    # for any number of matches, whereas an elementwise ``!=`` against the
    # match array fails to broadcast when several rows share the title.
    queried = set(title_index.tolist())
    # dict.fromkeys de-duplicates while preserving similarity order, so a
    # candidate reached from several matching rows is listed only once.
    candidates = [
        idx
        for idx in dict.fromkeys(similar_indexes.reshape(-1).tolist())
        if idx not in queried
    ]
    return df.iloc[candidates].sort_values(by='weighted_vote', ascending=False)[:top_n]
# Re-run the lookup for 'The Godfather' with the redefined find_sim_movie and show the weighted ratings.
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather', 10)
similar_movies[['title', 'vote_average', 'weighted_vote']]
------------------------------------------------------------------------
[[2731 1243 3636 1946 2640 4065 1847 4217 883 3866 3112 4041 588 3337
3378 281 1663 1464 1149 2839]]
import pandas as pd
import numpy as np
# Load the movies and ratings tables from the local ml-latest-small directory.
# NOTE: this rebinds `movies`, replacing the frame loaded earlier in the file.
movies = pd.read_csv('./data/ml-latest-small/movies.csv')
ratings = pd.read_csv('./data/ml-latest-small/ratings.csv')
print(movies.shape)
print(ratings.shape)
---------------------
(9742, 3)
(100836, 4)
movies.head()
ratings.head()
# The timestamp column is not used below.
ratings = ratings.drop('timestamp', axis=1)
# User x movieId rating matrix; user/movie pairs with no rating become NaN.
ratings_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating')
ratings_matrix.head()
# Attach the movie columns to each rating so the matrix can be keyed by title.
rating_movies = pd.merge(ratings, movies, on='movieId')
rating_movies.head()
# Rebuild the matrix with titles as columns. pivot_table aggregates duplicate
# (userId, title) pairs with its default aggfunc (mean).
ratings_matrix = rating_movies.pivot_table(index='userId', columns='title', values='rating')
ratings_matrix
# Treat missing ratings as 0.
ratings_matrix = ratings_matrix.fillna(0)
ratings_matrix.head()
# Transpose so each row is a movie (title) and each column a user.
ratings_matrix_T = ratings_matrix.transpose()
ratings_matrix_T.head()
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the movie rows of the transposed rating matrix.
item_sim = cosine_similarity(ratings_matrix_T, ratings_matrix_T)
# Label both axes with the movie titles so similarities can be looked up by title.
item_sim_df = pd.DataFrame(data=item_sim, index=ratings_matrix.columns, columns=ratings_matrix.columns)
print(item_sim_df.shape)
item_sim_df
--------------
(9719, 9719)
# The six highest similarities for this title; the first entry is the movie itself.
item_sim_df['Godfather, The (1972)'].sort_values(ascending=False)[:6]
---------------------------------------------------------
title
Godfather, The (1972) 1.000000
Godfather: Part II, The (1974) 0.821773
Goodfellas (1990) 0.664841
One Flew Over the Cuckoo's Nest (1975) 0.620536
Star Wars: Episode IV - A New Hope (1977) 0.595317
Fargo (1996) 0.588614
Name: Godfather, The (1972), dtype: float64
# The six highest similarities for this title; the first entry is the movie itself.
item_sim_df['Inception (2010)'].sort_values(ascending=False)[:6]
------------------------------------------
title
Inception (2010) 1.000000
Dark Knight, The (2008) 0.727263
Inglourious Basterds (2009) 0.646103
Shutter Island (2010) 0.617736
Dark Knight Rises, The (2012) 0.617504
Fight Club (1999) 0.615417
Name: Inception (2010), dtype: float64