Python environment setup:
conda create -n video-r1 python=3.11
conda activate video-r1
Download the required code and resources:
git clone https://github.com/tulerfeng/Video-R1
cd Video-R1
bash setup.sh  # install dependencies
Additional setup:
Install qwen-vl-utils:
cd src/qwen-vl-utils
pip install -e .[decord]
cd ../..
Place the video file (urgent2.mp4) in the example/video/ directory.
Install transformers-main.zip:
unzip transformers-main.zip
cd transformers-main
pip install .
If the following error occurs:
ImportError: flash_attn_2_cuda.so: undefined symbol: _ZNK3c1011StorageImpl27throw_data_ptr_access_errorEv
install a compatible flash-attn build:
pip install flash_attn==2.7.0.post2
Write and run run_single_inference.py:
from transformers import AutoProcessor, AutoTokenizer
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
import torch
MODEL_PATH = "./example"
VIDEO_PATH = "./example/video/urgent2.mp4"
# Initialize the vLLM engine; tensor parallelism spans all visible GPUs
llm = LLM(
    model=MODEL_PATH,
    tensor_parallel_size=torch.cuda.device_count(),
    max_model_len=8192,
    gpu_memory_utilization=0.8,
    limit_mm_per_prompt={"image": 1, "video": 1},
)
sampling_params = SamplingParams(
    temperature=0.1,
    top_p=0.001,
    max_tokens=512,
)
processor = AutoProcessor.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.padding_side = "left"
processor.tokenizer = tokenizer
user_message = {
    "role": "user",
    "content": [
        {"type": "video", "video": VIDEO_PATH},
        {"type": "text", "text": "Describe what is happening in this video. <think>reason step by step</think> <answer>Give your final answer here</answer>"},
    ],
}
messages = [user_message]
prompts = [processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)]
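# process_vision_info decodes the video and, with return_video_kwargs=True, also returns per-video kwargs such as fps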
image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
llm_inputs = [{
    "prompt": prompts[0],
    "multi_modal_data": {"video": video_inputs[0]},
    "mm_processor_kwargs": {k: v[0] for k, v in video_kwargs.items()},
}]
outputs = llm.generate(llm_inputs, sampling_params=sampling_params)
print("🧠 Model Output:")
print(outputs[0].outputs[0].text)
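Run the script inside the video-r1 environment with a plain invocation:
python run_single_inference.py
The prompt above asks the model to wrap its reasoning in <think> tags and its final answer in <answer> tags, so the printed output can be post-processed to keep only the answer. The snippet below is a minimal sketch; the extract_answer helper and the sample string are illustrative and not part of the repository:
import re

def extract_answer(text: str) -> str:
    # Return the content of the <answer>...</answer> block, or the full text if the tags are missing
    match = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
    return match.group(1).strip() if match else text.strip()

# Example with a placeholder string shaped like the expected model output
raw_output = "<think>step-by-step reasoning</think> <answer>final answer text</answer>"
print(extract_answer(raw_output))  # -> "final answer text"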