오늘도 사이드프로젝트 진행!
import os
import pickle
import pandas as pd
import numpy as np
import torch
from sklearn.neighbors import NearestNeighbors
from transformers import BertTokenizer, BertModel
import tqdm
class MovieRecommenderTrainer:
    """Builds the recommender's training artifacts: BERT movie embeddings,
    per-user mean-embedding profiles, and a cosine-distance KNN index.

    Expected call order:
        load_data() -> prepare_embeddings() -> create_user_profiles() -> train_knn()

    All artifacts are written under ``model_path``.
    """

    def __init__(self, model_path, data_path):
        """
        Args:
            model_path: directory where artifacts are saved (created if missing).
            data_path: directory containing 'movie_info_data.csv' and 'ratings.csv'.
        """
        self.model_path = model_path
        self.data_path = data_path
        self.movies_file = os.path.join(data_path, "movie_info_data.csv")
        self.ratings_file = os.path.join(data_path, "ratings.csv")
        self.embeddings = None
        self.user_profiles = {}
        self.knn_model = None
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.bert_model = BertModel.from_pretrained("bert-base-uncased")
        self.bert_model.eval()  # inference only: disable dropout etc.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.bert_model.to(self.device)
        os.makedirs(self.model_path, exist_ok=True)

    def load_data(self):
        """Load the movie/rating CSVs and build the free-text 'content' column.

        'content' concatenates title, cast, crew, and genres (missing values
        become empty strings); it is the text later embedded with BERT.
        """
        self.movies = pd.read_csv(self.movies_file)
        self.ratings = pd.read_csv(self.ratings_file)
        self.movies["content"] = (
            self.movies["title"]
            + " "
            + self.movies["cast"].fillna("")
            + " "
            + self.movies["crew"].fillna("")
            + " "
            + self.movies["genres"].fillna("")
        )

    def get_bert_embeddings(self, texts, batch_size=16):
        """Embed each text with BERT and return its [CLS] vector.

        Args:
            texts: list of strings to embed.
            batch_size: number of texts per forward pass.

        Returns:
            list of 1-D numpy arrays (BERT hidden size), one per input text.
        """
        embeddings = []
        for i in tqdm.tqdm(
            range(0, len(texts), batch_size), desc="Generating BERT embeddings"
        ):
            batch_texts = texts[i : i + batch_size]
            inputs = self.tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,  # BERT's maximum sequence length
            )
            inputs = {key: val.to(self.device) for key, val in inputs.items()}
            with torch.no_grad():  # inference only; skip autograd bookkeeping
                outputs = self.bert_model(**inputs)
            # Use the [CLS] token (position 0) as the whole-text representation.
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.extend(cls_embeddings)
        return embeddings

    def prepare_embeddings(self):
        """Regenerate movie embeddings from scratch and save them to disk.

        Any previously saved embeddings file is deleted first, so embeddings
        are always recomputed (deliberately no caching).
        """
        embeddings_file = os.path.join(self.model_path, "movie_embeddings.pth")
        if os.path.exists(embeddings_file):
            os.remove(embeddings_file)
            print(f"Deleted existing file: {embeddings_file}")
        self.embeddings = self.get_bert_embeddings(
            self.movies["content"].tolist(), batch_size=16
        )
        self.movies["embedding"] = self.embeddings
        torch.save(
            {
                "embeddings": self.embeddings,
                "movie_ids": self.movies["movieId"].tolist(),
                "titles": self.movies["title"].tolist(),
            },
            embeddings_file,
        )
        print(f"Embeddings saved to {embeddings_file}")

    def create_user_profiles(self):
        """Build one profile per user: the mean embedding of the movies they rated.

        Users whose rated movies are all absent from the movie table get no
        profile. Requires prepare_embeddings() to have run first.
        """
        user_profiles_file = os.path.join(self.model_path, "user_profiles.pth")
        if os.path.exists(user_profiles_file):
            os.remove(user_profiles_file)
            print(f"Deleted existing file: {user_profiles_file}")
        for user_id in tqdm.tqdm(
            self.ratings["userId"].unique(), desc="Creating user profiles"
        ):
            user_movies = self.ratings[self.ratings["userId"] == user_id]["movieId"]
            movie_embeddings = self.movies[
                self.movies["movieId"].isin(user_movies)
            ]["embedding"].tolist()
            if len(movie_embeddings) > 0:
                self.user_profiles[user_id] = np.mean(
                    np.stack(movie_embeddings), axis=0
                )
        torch.save(self.user_profiles, user_profiles_file)
        print(f"User profiles saved to {user_profiles_file}")

    def train_knn(self):
        """Fit a cosine-distance NearestNeighbors index over all movie embeddings.

        Requires prepare_embeddings() to have populated the 'embedding' column.
        The fitted sklearn model is pickled via torch.save.
        """
        knn_model_file = os.path.join(self.model_path, "knn_model.pth")
        if os.path.exists(knn_model_file):
            os.remove(knn_model_file)
            # Fix: message previously misspelled "exisiting".
            print(f"Deleted existing file: {knn_model_file}")
        embedding_matrix = np.stack(self.movies["embedding"].values)
        self.knn_model = NearestNeighbors(metric="cosine", algorithm="auto")
        self.knn_model.fit(embedding_matrix)
        torch.save(self.knn_model, knn_model_file)
        print(f"KNN model saved to {knn_model_file}")
import os
import pickle
import numpy as np
import pandas as pd
import torch
class NewUserRecommender:
    """Serves recommendations from artifacts produced by MovieRecommenderTrainer.

    Loads the movie embeddings, stored user profiles, and the fitted KNN
    index from ``model_path``, and supports adding cold-start users whose
    profile is the mean embedding of the movies they pick.
    """

    def __init__(self, model_path):
        """
        Args:
            model_path: directory containing 'movie_embeddings.pth',
                'user_profiles.pth', and 'knn_model.pth'.
        """
        self.model_path = model_path
        self.movies = None
        self.knn_model = None
        self.user_profiles = {}
        self.load_models()

    def load_models(self):
        """Load embeddings, user profiles, and the KNN model from disk.

        NOTE(review): torch.load unpickles arbitrary objects (the KNN file is
        a pickled sklearn model) — only load artifact files from a trusted
        source.
        """
        embeddings_file = os.path.join(self.model_path, "movie_embeddings.pth")
        user_profiles_file = os.path.join(self.model_path, "user_profiles.pth")
        knn_model_file = os.path.join(self.model_path, "knn_model.pth")
        movie_data = torch.load(embeddings_file)
        self.movies = pd.DataFrame(
            {
                "movieId": movie_data["movie_ids"],
                "embedding": movie_data["embeddings"],
                # "title": movie_data["titles"],
            }
        )
        self.user_profiles = torch.load(user_profiles_file)
        self.knn_model = torch.load(knn_model_file)

    def add_new_user(self, user_id, movie_ids):
        """Create (or overwrite) a profile for a user from their picked movies.

        The profile is the mean embedding of the picks. Movie ids not present
        in the movie table are ignored; if none remain, no profile is created.
        """
        movie_embeddings = self.movies[self.movies["movieId"].isin(movie_ids)][
            "embedding"
        ].tolist()
        if len(movie_embeddings) > 0:
            self.user_profiles[user_id] = np.mean(np.stack(movie_embeddings), axis=0)
            print(f"User profile created for user ID {user_id}")
        else:
            print(
                f"Failed to create profile for user ID {user_id}. No valid movies found."
            )

    def recommend_movies(self, user_id, movie_ids, top_k=10):
        """Return up to ``top_k`` movies nearest to the user's profile.

        Movies the user already picked (``movie_ids``) are excluded from the
        results.

        Args:
            user_id: id of a user present in ``user_profiles``.
            movie_ids: movies the user has already picked (to exclude).
            top_k: maximum number of recommendations.

        Returns:
            DataFrame with 'movieId' and 'similarity' (1 - cosine distance)
            columns; an empty DataFrame if the user has no profile.
        """
        if user_id not in self.user_profiles:
            print(f"User ID {user_id} not found in profiles.")
            return pd.DataFrame()
        user_embedding = self.user_profiles[user_id].reshape(1, -1)
        seen_movies = set(movie_ids)
        # Bug fix: requesting only top_k + 1 neighbors could yield fewer than
        # top_k results when several of the user's own picks rank among the
        # nearest neighbors (and could exceed the catalog size). Request
        # enough neighbors to survive filtering, capped by the catalog size.
        n_neighbors = min(top_k + len(seen_movies), len(self.movies))
        distances, indices = self.knn_model.kneighbors(
            user_embedding, n_neighbors=n_neighbors
        )
        recommended = []
        for dist, idx in zip(distances[0], indices[0]):
            movie_id = self.movies.iloc[idx]["movieId"]
            if movie_id not in seen_movies:
                recommended.append(
                    {
                        "movieId": movie_id,
                        # "title": self.movies.iloc[idx]["title"],
                        "similarity": 1 - dist,  # cosine similarity
                    }
                )
                if len(recommended) >= top_k:
                    break
        return pd.DataFrame(recommended)
import random
from fastapi import APIRouter
from sqlmodel import Session
from src.config import config
from src.db import engine
from src.dependency import get_model
from src.domain.model.dto.predictions import PredictionRequest, PredictionResponse
from src.schemas.prediction import PredictionResult
from src.dependency import get_model
router = APIRouter(prefix="/api/model", tags=["Model"])
model_path = config.model_path


@router.post("/predict", response_model=PredictionResponse)
def predict(request: PredictionRequest) -> PredictionResponse:
    """Generate top-10 movie recommendations for a (possibly new) user.

    Builds/overwrites the user's profile from their picked movies, runs the
    KNN recommender, persists the result, and returns the inferred movie ids.

    NOTE(review): the shared model instance is mutated per request via
    add_new_user — confirm this is safe under concurrent requests.
    """
    model = get_model()
    # Cold-start handling: (re)build the user's profile from their picks.
    model.add_new_user(user_id=request.user_id, movie_ids=request.movie_list)
    prediction = model.recommend_movies(
        user_id=request.user_id, movie_ids=request.movie_list, top_k=10
    )
    # Bug fix: recommend_movies may return an empty DataFrame (no profile /
    # no recommendations), which has no 'movieId' column and would raise
    # KeyError. Fall back to an empty list.
    movie_ids = (
        prediction["movieId"].tolist() if "movieId" in prediction.columns else []
    )

    # Persist both the user's picks and the inference for later analysis.
    prediction_result = PredictionResult(
        user_id=request.user_id,
        movie_inference_list=movie_ids,
        movie_pick_list=request.movie_list,
    )
    with Session(engine) as session:
        session.add(prediction_result)
        session.commit()
        session.refresh(prediction_result)

    return PredictionResponse(user_id=request.user_id, movie_list=movie_ids)
아직 초기 모델이라 그런 것도 있고, cold start 문제를 해결하기 위한 모델이라 input 데이터 자체가 10개 내외 정도이기 때문에 정확도가 매우 낮았다.
오히려 장르만 임베딩하는 것이 정확도가 더 높게 나왔다.
차차 고쳐나갈 예정