[CV 16] DETR & RT-DETR - Video Object Detection

방선생·2025년 3월 13일

Computer Vision

목록 보기

16/16

Video Object Detection

비디오 객체 검출은 연속된 프레임에서 객체를 식별하고 추적하는 기술로, 실시간 감시, 자율주행, 스포츠 분석 등 다양한 분야에서 활용 가능

일반적인 이미지 객체 검출과 달리, 비디오 객체 검출은 시간적 연속성을 활용하여 더 정확한 예측 가능

(이 시리즈의 모든 코드는 코랩 환경에서 Python으로 작성하였습니다)

Video Object Detection Code 1 (필요한 라이브러리 임포트)

# Import the required libraries and functions
import torch
import cv2
from PIL import Image
from transformers import DetrForObjectDetection, DetrImageProcessor

DETR - Video Object Detection Code 1 (모델 및 GPU 설정)

# Download the pretrained DETR checkpoint and its matching image processor.
# NOTE(review): revision='no_timm' presumably selects the checkpoint variant
# that does not need the timm backbone package - confirm on the model hub.
model_name="facebook/detr-resnet-101"
processor_detr = DetrImageProcessor.from_pretrained(model_name, revision='no_timm')
model_detr = DetrForObjectDetection.from_pretrained(model_name, revision='no_timm')

# Check the GPU status (IPython/Colab shell escape; not valid in a plain .py script)
!nvidia-smi

# Check the PyTorch version (bare expression: it has no effect in a plain
# script and is only echoed when it is the last expression of a notebook cell)
torch.__version__

# PyTorch: use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Uncomment the next line to see which device was selected:
# print(device)
model_detr.to(device)

DETR - Video Object Detection Code 2 (데이터 전처리 및 모델 실행)

# Video processing: run DETR on the frames of a video and write an annotated copy.


# File paths
video_path = '/content/drive/MyDrive/CV/messi_video.mp4'
output_path = '/content/drive/MyDrive/CV/output_DETR.mp4'

# Run the detector on every N-th frame; the frames in between reuse the most
# recent detections. 1 means "detect on every frame".
DETECT_EVERY_N_FRAMES = 1
# Minimum confidence score for a detection to be kept.
SCORE_THRESHOLD = 0.6
# Smallest y used for the label baseline, so that labels of boxes touching the
# top edge are drawn inside the frame instead of above it.
MIN_LABEL_Y = 25


# Open the input video and fail loudly if it cannot be read; otherwise the
# loop below would never run and an empty output file would be produced.
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    cap.release()
    raise RuntimeError(f"Could not open input video: {video_path}")

out = None
frame_count = 0
last_detections = []  # (box, label, score) tuples from the latest detector run

try:
    # Mirror the source video's frame rate and size in the output file
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
    if not out.isOpened():
        raise RuntimeError(f"Could not open output video for writing: {output_path}")

    # Source video --> annotated output
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if frame_count % DETECT_EVERY_N_FRAMES == 0:
            # Convert OpenCV's BGR frame into an RGB PIL image
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Run DETR on the frame; no gradients are needed for inference
            inputs = processor_detr(images=image, return_tensors="pt").to(device)
            with torch.no_grad():
                outputs = model_detr(**inputs)

            # PIL reports (width, height); target_sizes takes (height, width)
            results = processor_detr.post_process_object_detection(
                outputs,
                target_sizes=torch.tensor([image.size[::-1]]),
                threshold=SCORE_THRESHOLD,
            )

            # Store the detections (bounding box, label, score) of this frame
            last_detections = []
            for result in results:
                for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):
                    box = [int(coord) for coord in box.tolist()]
                    label = model_detr.config.id2label[label_id.item()]
                    last_detections.append((box, label, score.item()))

        # Draw the most recent detections onto the current frame
        for box, label, score in last_detections:
            x1, y1, x2, y2 = box
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            label_y = max(y1 - 10, MIN_LABEL_Y)
            cv2.putText(frame, f"{label}: {score:.2f}", (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

        # Append the annotated frame to the output file
        out.write(frame)
        frame_count += 1
finally:
    # Always release the reader and writer, even when inference fails midway,
    # so the output file is finalized and the handles are not leaked.
    cap.release()
    if out is not None:
        out.release()

print(f"Processed {frame_count} frames and saved to {output_path}")

RT-DETR - Video Object Detection Code 1 (모델 및 GPU 설정)

# Import the RT-DETR model and image-processor classes
from transformers import RTDetrForObjectDetection, RTDetrImageProcessor

# Download the pretrained RT-DETR checkpoint and its matching image processor.
# The checkpoint is an RT-DETR model, so it has to be loaded with the RT-DETR
# classes; the plain DETR classes describe a different architecture and
# post-processing and do not match this checkpoint.
model_name = "PekingU/rtdetr_r101vd_coco_o365"
processor_RT = RTDetrImageProcessor.from_pretrained(model_name)
model_RT = RTDetrForObjectDetection.from_pretrained(model_name)

# PyTorch: use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_RT.to(device)

RT-DETR - Video Object Detection Code 2 (데이터 전처리 및 모델 실행)

# Video processing: run RT-DETR on the frames of a video and write an annotated copy.


# File paths
video_path = '/content/drive/MyDrive/CV/messi_video.mp4'
output_path = '/content/drive/MyDrive/CV/output_RT.mp4'

# Run the detector on every N-th frame; the frames in between reuse the most
# recent detections. 1 means "detect on every frame".
DETECT_EVERY_N_FRAMES = 1
# Minimum confidence score for a detection to be kept.
SCORE_THRESHOLD = 0.6
# Smallest y used for the label baseline, so that labels of boxes touching the
# top edge are drawn inside the frame instead of above it.
MIN_LABEL_Y = 25


# Open the input video and fail loudly if it cannot be read; otherwise the
# loop below would never run and an empty output file would be produced.
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    cap.release()
    raise RuntimeError(f"Could not open input video: {video_path}")

out = None
frame_count = 0
last_detections = []  # (box, label, score) tuples from the latest detector run

try:
    # Mirror the source video's frame rate and size in the output file
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
    if not out.isOpened():
        raise RuntimeError(f"Could not open output video for writing: {output_path}")

    # Source video --> annotated output
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if frame_count % DETECT_EVERY_N_FRAMES == 0:
            # Convert OpenCV's BGR frame into an RGB PIL image
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Run RT-DETR on the frame; no gradients are needed for inference
            inputs = processor_RT(images=image, return_tensors="pt").to(device)
            with torch.no_grad():
                outputs = model_RT(**inputs)

            # PIL reports (width, height); target_sizes takes (height, width)
            results = processor_RT.post_process_object_detection(
                outputs,
                target_sizes=torch.tensor([image.size[::-1]]),
                threshold=SCORE_THRESHOLD,
            )

            # Store the detections (bounding box, label, score) of this frame
            last_detections = []
            for result in results:
                for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):
                    box = [int(coord) for coord in box.tolist()]
                    label = model_RT.config.id2label[label_id.item()]
                    last_detections.append((box, label, score.item()))

        # Draw the most recent detections onto the current frame
        for box, label, score in last_detections:
            x1, y1, x2, y2 = box
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            label_y = max(y1 - 10, MIN_LABEL_Y)
            cv2.putText(frame, f"{label}: {score:.2f}", (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

        # Append the annotated frame to the output file
        out.write(frame)
        frame_count += 1
finally:
    # Always release the reader and writer, even when inference fails midway,
    # so the output file is finalized and the handles are not leaked.
    cap.release()
    if out is not None:
        out.release()

print(f"Processed {frame_count} frames and saved to {output_path}")