250102 TIL #579 AI Tech #111 Movie Rec++ 진행 - 2

김춘복·2025년 1월 2일
0

TIL : Today I Learned

목록 보기
581/604

Today I Learned

오늘도 사이드프로젝트 진행!


Movie Rec++

CBF 모델 구현

  • CBF_train.py
import os
import pickle
import pandas as pd
import numpy as np
import torch
from sklearn.neighbors import NearestNeighbors
from transformers import BertTokenizer, BertModel
import tqdm


class MovieRecommenderTrainer:
    """Content-based filtering (CBF) trainer for the movie recommender.

    Pipeline:
      1. ``load_data``            - read movie/rating CSVs, build text ``content``
      2. ``prepare_embeddings``   - BERT [CLS] embedding per movie, saved to disk
      3. ``create_user_profiles`` - mean embedding of each user's rated movies
      4. ``train_knn``            - cosine-distance KNN index over movie embeddings

    All artifacts are written under ``model_path`` via ``torch.save``.
    """

    def __init__(self, model_path, data_path):
        """Set up file paths and load the frozen BERT feature extractor.

        Args:
            model_path: directory where trained artifacts are written.
            data_path: directory containing ``movie_info_data.csv`` and
                ``ratings.csv``.
        """
        self.model_path = model_path
        self.data_path = data_path
        self.movies_file = os.path.join(data_path, "movie_info_data.csv")
        self.ratings_file = os.path.join(data_path, "ratings.csv")
        self.embeddings = None  # list of per-movie [CLS] vectors (np.ndarray)
        self.user_profiles = {}  # userId -> mean movie embedding
        self.knn_model = None

        # Pretrained BERT is used purely as a frozen feature extractor.
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.bert_model = BertModel.from_pretrained("bert-base-uncased")
        self.bert_model.eval()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.bert_model.to(self.device)

        os.makedirs(self.model_path, exist_ok=True)

    @staticmethod
    def _delete_if_exists(path):
        """Remove a stale artifact so it is always rebuilt from scratch."""
        if os.path.exists(path):
            os.remove(path)
            print(f"Deleted existing file: {path}")

    def load_data(self):
        """Load the movie/rating CSVs and build each movie's text ``content``."""
        self.movies = pd.read_csv(self.movies_file)
        self.ratings = pd.read_csv(self.ratings_file)

        # Concatenate textual metadata; missing fields become empty strings.
        self.movies["content"] = (
            self.movies["title"]
            + " "
            + self.movies["cast"].fillna("")
            + " "
            + self.movies["crew"].fillna("")
            + " "
            + self.movies["genres"].fillna("")
        )

    def get_bert_embeddings(self, texts, batch_size=16):
        """Return a list of BERT [CLS] embeddings, one ``np.ndarray`` per text.

        Args:
            texts: list of strings to embed.
            batch_size: number of texts tokenized and forwarded per batch.
        """
        embeddings = []
        for i in tqdm.tqdm(
            range(0, len(texts), batch_size), desc="Generating BERT embeddings"
        ):
            batch_texts = texts[i : i + batch_size]
            inputs = self.tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,  # BERT's maximum sequence length
            )
            inputs = {key: val.to(self.device) for key, val in inputs.items()}
            with torch.no_grad():  # inference only; gradients not needed
                outputs = self.bert_model(**inputs)
            # The [CLS] token (position 0) summarizes the whole sequence.
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.extend(cls_embeddings)
        return embeddings

    def prepare_embeddings(self):
        """Embed every movie's ``content`` and save the embeddings to disk."""
        embeddings_file = os.path.join(self.model_path, "movie_embeddings.pth")
        self._delete_if_exists(embeddings_file)

        self.embeddings = self.get_bert_embeddings(
            self.movies["content"].tolist(), batch_size=16
        )
        # Keep embeddings alongside the rows for profile/KNN building below.
        self.movies["embedding"] = self.embeddings
        torch.save(
            {
                "embeddings": self.embeddings,
                "movie_ids": self.movies["movieId"].tolist(),
                "titles": self.movies["title"].tolist(),
            },
            embeddings_file,
        )
        print(f"Embeddings saved to {embeddings_file}")

    def create_user_profiles(self):
        """Build each user's profile as the mean embedding of their rated movies."""
        user_profiles_file = os.path.join(self.model_path, "user_profiles.pth")
        self._delete_if_exists(user_profiles_file)

        for user_id in tqdm.tqdm(
            self.ratings["userId"].unique(), desc="Creating user profiles"
        ):
            user_movies = self.ratings[self.ratings["userId"] == user_id]["movieId"]
            movie_embeddings = self.movies[
                self.movies["movieId"].isin(user_movies)
            ]["embedding"].tolist()
            # Users whose rated movies are all absent from the catalog get no profile.
            if len(movie_embeddings) > 0:
                self.user_profiles[user_id] = np.mean(
                    np.stack(movie_embeddings), axis=0
                )

        torch.save(self.user_profiles, user_profiles_file)
        print(f"User profiles saved to {user_profiles_file}")

    def train_knn(self):
        """Fit a cosine-distance KNN index over all movie embeddings."""
        knn_model_file = os.path.join(self.model_path, "knn_model.pth")
        self._delete_if_exists(knn_model_file)

        embedding_matrix = np.stack(self.movies["embedding"].values)
        self.knn_model = NearestNeighbors(metric="cosine", algorithm="auto")
        self.knn_model.fit(embedding_matrix)

        # torch.save pickles the sklearn estimator; acceptable for local artifacts.
        torch.save(self.knn_model, knn_model_file)
        print(f"KNN model saved to {knn_model_file}")
  • CBF_test.py
import os
import pickle
import numpy as np
import pandas as pd
import torch


class NewUserRecommender:
    """Serves CBF recommendations for new (cold-start) users.

    Loads the artifacts produced by the trainer (movie embeddings, user
    profiles, fitted KNN index), builds a profile for a new user from a
    handful of picked movies, and recommends the nearest unseen movies.
    """

    def __init__(self, model_path):
        """Load all trained artifacts from ``model_path``."""
        self.model_path = model_path
        self.movies = None  # DataFrame with movieId + embedding columns
        self.knn_model = None  # fitted sklearn NearestNeighbors (cosine)
        self.user_profiles = {}  # userId -> mean movie embedding

        self.load_models()

    def load_models(self):
        """Load movie embeddings, user profiles and the fitted KNN index."""
        embeddings_file = os.path.join(self.model_path, "movie_embeddings.pth")
        user_profiles_file = os.path.join(self.model_path, "user_profiles.pth")
        knn_model_file = os.path.join(self.model_path, "knn_model.pth")

        # NOTE(review): torch.load unpickles arbitrary objects — only load
        # artifact files produced by our own trainer (never untrusted input).
        movie_data = torch.load(embeddings_file)
        self.movies = pd.DataFrame(
            {
                "movieId": movie_data["movie_ids"],
                "embedding": movie_data["embeddings"],
            }
        )

        self.user_profiles = torch.load(user_profiles_file)
        self.knn_model = torch.load(knn_model_file)

    def add_new_user(self, user_id, movie_ids):
        """Create a profile for ``user_id`` as the mean embedding of ``movie_ids``.

        Movie ids absent from the catalog are ignored; if none match, no
        profile is created.
        """
        movie_embeddings = self.movies[self.movies["movieId"].isin(movie_ids)][
            "embedding"
        ].tolist()
        if len(movie_embeddings) > 0:
            self.user_profiles[user_id] = np.mean(np.stack(movie_embeddings), axis=0)
            print(f"User profile created for user ID {user_id}")
        else:
            print(
                f"Failed to create profile for user ID {user_id}. No valid movies found."
            )

    def recommend_movies(self, user_id, movie_ids, top_k=10):
        """Return up to ``top_k`` unseen movies closest to the user's profile.

        Args:
            user_id: key into ``self.user_profiles``.
            movie_ids: movies the user already picked (excluded from results).
            top_k: maximum number of recommendations.

        Returns:
            DataFrame with ``movieId`` and ``similarity`` columns (cosine
            similarity), or an empty DataFrame if the user has no profile.
        """
        if user_id not in self.user_profiles:
            print(f"User ID {user_id} not found in profiles.")
            return pd.DataFrame()

        user_embedding = self.user_profiles[user_id].reshape(1, -1)
        seen_movies = set(movie_ids)

        # Request enough neighbors to survive filtering out every seen movie
        # (the previous `top_k + 1` could return fewer than top_k results),
        # capped at the number of indexed movies.
        n_neighbors = min(top_k + len(seen_movies), len(self.movies))
        distances, indices = self.knn_model.kneighbors(
            user_embedding, n_neighbors=n_neighbors
        )

        recommended = []
        for dist, idx in zip(distances[0], indices[0]):
            movie_id = self.movies.iloc[idx]["movieId"]
            if movie_id in seen_movies:
                continue
            recommended.append(
                {
                    "movieId": movie_id,
                    # cosine distance -> cosine similarity
                    "similarity": 1 - dist,
                }
            )
            if len(recommended) >= top_k:
                break

        return pd.DataFrame(recommended)

inference 구현

  • model.py
import random

from fastapi import APIRouter
from sqlmodel import Session

from src.config import config
from src.db import engine
from src.dependency import get_model
from src.domain.model.dto.predictions import PredictionRequest, PredictionResponse
from src.schemas.prediction import PredictionResult
from src.dependency import get_model

# Router exposing the model-inference endpoints under /api/model.
router = APIRouter(prefix="/api/model", tags=["Model"])

# Path to the trained model artifacts, taken from app config.
# NOTE(review): unused in this module — presumably consumed by get_model();
# verify before removing.
model_path = config.model_path


@router.post("/predict", response_model=PredictionResponse)
def predict(request: PredictionRequest) -> PredictionResponse:
    """Return the model's top-10 movie recommendations for a user.

    Builds (or refreshes) the user's CBF profile from the picked movies,
    runs the KNN-based inference, persists the result, and returns the
    recommended movie ids.
    """
    model = get_model()

    # Register the (possibly new / cold-start) user's profile first.
    model.add_new_user(user_id=request.user_id, movie_ids=request.movie_list)

    prediction = model.recommend_movies(
        user_id=request.user_id, movie_ids=request.movie_list, top_k=10
    )

    # An empty result DataFrame has no "movieId" column — guard the lookup
    # so an unknown profile / no candidates does not raise KeyError.
    movie_ids = [] if prediction.empty else prediction["movieId"].tolist()

    # Persist the inference alongside the user's picks for later analysis.
    prediction_result = PredictionResult(
        user_id=request.user_id,
        movie_inference_list=movie_ids,
        movie_pick_list=request.movie_list,
    )
    with Session(engine) as session:
        session.add(prediction_result)
        session.commit()
        session.refresh(prediction_result)

    return PredictionResponse(user_id=request.user_id, movie_list=movie_ids)
  • 아직 초기 모델이라 그런 것도 있고, cold start 문제를 해결하기 위한 모델이라 input 데이터 자체가 10개 내외 정도이기 때문에 정확도가 매우 낮았다.

  • 오히려 장르만 임베딩하는 게 정확도가 더 높게 나왔다.

  • 차차 고쳐나갈 예정

profile
Backend Dev / Data Engineer

0개의 댓글