Understand Yolo v3 and be able to implement the model and loss function in PyTorch.
The final Conv layer has 75 channels because there are 20 classes to predict (num_classes = 20) and 3 anchors are assigned to each output tensor: (num_classes + 5) * 3 = 75. The raw Batch_size x (num_classes + 5) * 3 x H x W output is then reshaped to Batch_size x 3 x H x W x (num_classes + 5).
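As a sanity check, this is the same view + permute that the DetectionLayer implemented below applies (a minimal sketch using the 13 x 13 scale):
import torch
raw = torch.randn(1, 75, 13, 13)                        # (B, (20 + 5) * 3, H, W)
out = raw.view(1, 3, 25, 13, 13).permute(0, 1, 3, 4, 2)
print(out.shape)                                        # torch.Size([1, 3, 13, 13, 25])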
Let's look at the characteristics of Yolo v3 by comparing them with Yolo v2. Most of the differences are baked into the architecture, so they should be easy to follow. The bounding box prediction scheme is identical to Yolo v2, so it is omitted here.
Feature Extractor
Class Prediction
Predictions Across Scales
After 2x upsampling (Upsampling(2)), a feature map taken from the middle of the feature extractor is concatenated along the channel axis (concat(dim = 1)) and used for detection.
Result
import torch
from torch import nn
from torchinfo import summary
# Build Model
class BasicConv(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, stride = 1):
super().__init__()
self.conv = nn.Sequential(
nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding = (kernel_size - 1) // 2, bias = False),
nn.BatchNorm2d(out_channels),
nn.LeakyReLU(0.1, inplace = True)
)
def forward(self, x):
return self.conv(x)
class ResidualBlock(nn.Module):
def __init__(self, channels):
super().__init__()
self.residual = nn.Sequential(
BasicConv(channels, channels // 2, 1),
BasicConv(channels // 2, channels, 3),
)
def forward(self, x):
return self.residual(x) + x
class DarkNet53(nn.Module):
def __init__(self):
super().__init__()
self.first_conv_block = BasicConv(3, 32, 3)
self.residual_block_01 = nn.Sequential(
BasicConv(32, 64, 3, stride = 2),
ResidualBlock(64),
)
self.residual_block_02 = nn.Sequential(
BasicConv(64, 128, 3, stride = 2),
nn.Sequential(*[ResidualBlock(128) for _ in range(2)]),
)
self.residual_block_03 = nn.Sequential(
BasicConv(128, 256, 3, stride = 2),
nn.Sequential(*[ResidualBlock(256) for _ in range(8)]),
)
self.residual_block_04 = nn.Sequential(
BasicConv(256, 512, 3, stride = 2),
nn.Sequential(*[ResidualBlock(512) for _ in range(8)]),
)
self.residual_block_05 = nn.Sequential(
BasicConv(512, 1024, 3, stride = 2),
nn.Sequential(*[ResidualBlock(1024) for _ in range(4)]),
)
def forward(self, x):
x = self.first_conv_block(x)
x = self.residual_block_01(x)
x = self.residual_block_02(x)
feature_map_01 = self.residual_block_03(x)
feature_map_02 = self.residual_block_04(feature_map_01)
feature_map_03 = self.residual_block_05(feature_map_02)
return feature_map_01, feature_map_02, feature_map_03
class YoloBlock(nn.Module):
def __init__(self, in_channels, out_channels):
super().__init__()
self.route_conv = nn.Sequential(
BasicConv(in_channels, out_channels, 1),
BasicConv(out_channels, out_channels * 2, 3),
BasicConv(out_channels * 2, out_channels, 1),
BasicConv(out_channels, out_channels * 2, 3),
BasicConv(out_channels * 2, out_channels, 1),
)
self.output_conv = BasicConv(out_channels, out_channels * 2, 3)
def forward(self, x):
route = self.route_conv(x)
output = self.output_conv(route)
return route, output
class DetectionLayer(nn.Module):
def __init__(self, in_channels, num_classes):
super().__init__()
self.num_classes = num_classes
self.pred = nn.Conv2d(2 * in_channels, (num_classes + 5) * 3, 1)
def forward(self, x):
output = self.pred(x)
output = output.view(x.size(0), 3, self.num_classes + 5, x.size(2), x.size(3))
output = output.permute(0, 1, 3, 4, 2)
return output
class Upsampling(nn.Module):
def __init__(self, in_channels, out_channels):
super().__init__()
self.upsample = nn.Sequential(
BasicConv(in_channels, out_channels, 1),
nn.Upsample(scale_factor = 2)
)
def forward(self, x):
return self.upsample(x)
class Yolov3(nn.Module):
def __init__(self, num_classes = 20):
super().__init__()
self.num_classes = num_classes
self.darknet53 = DarkNet53()
self.yolo_block_01 = YoloBlock(1024, 512)
self.detectlayer_01 = DetectionLayer(512, num_classes)
self.upsample_01 = Upsampling(512, 256)
self.yolo_block_02 = YoloBlock(512 + 256, 256)
self.detectlayer_02 = DetectionLayer(256, num_classes)
self.upsample_02 = Upsampling(256, 128)
self.yolo_block_03 = YoloBlock(256 + 128, 128)
self.detectlayer_03 = DetectionLayer(128, num_classes)
def forward(self, x):
self.feature_map_01, self.feature_map_02, self.feature_map_03 = self.darknet53(x)
x, output_01 = self.yolo_block_01(self.feature_map_03)
output_01 = self.detectlayer_01(output_01)
x = self.upsample_01(x)
x, output_02 = self.yolo_block_02(torch.cat([x, self.feature_map_02], dim = 1))
output_02 = self.detectlayer_02(output_02)
x = self.upsample_02(x)
x, output_03 = self.yolo_block_03(torch.cat([x, self.feature_map_01], dim = 1))
output_03 = self.detectlayer_03(output_03)
return output_01, output_02, output_03
x = torch.randn((1, 3, 416, 416))
model = Yolov3(num_classes = 20)
out = model(x)
print(out[0].shape) # torch.Size([1, 3, 13, 13, 25])
print(out[1].shape) # torch.Size([1, 3, 26, 26, 25])
print(out[2].shape) # torch.Size([1, 3, 52, 52, 25])
summary(model, input_size = (2, 3, 416, 416), device = "cpu")
#### OUTPUT ####
==============================================================================================================
Layer (type:depth-idx) Output Shape Param #
==============================================================================================================
Yolov3 [2, 3, 13, 13, 25] --
├─DarkNet53: 1-1 [2, 256, 52, 52] --
│ └─BasicConv: 2-1 [2, 32, 416, 416] --
│ │ └─Sequential: 3-1 [2, 32, 416, 416] 928
│ └─Sequential: 2-2 [2, 64, 208, 208] --
│ │ └─BasicConv: 3-2 [2, 64, 208, 208] 18,560
│ │ └─ResidualBlock: 3-3 [2, 64, 208, 208] 20,672
│ └─Sequential: 2-3 [2, 128, 104, 104] --
│ │ └─BasicConv: 3-4 [2, 128, 104, 104] 73,984
│ │ └─Sequential: 3-5 [2, 128, 104, 104] 164,608
│ └─Sequential: 2-4 [2, 256, 52, 52] --
│ │ └─BasicConv: 3-6 [2, 256, 52, 52] 295,424
│ │ └─Sequential: 3-7 [2, 256, 52, 52] 2,627,584
│ └─Sequential: 2-5 [2, 512, 26, 26] --
│ │ └─BasicConv: 3-8 [2, 512, 26, 26] 1,180,672
│ │ └─Sequential: 3-9 [2, 512, 26, 26] 10,498,048
│ └─Sequential: 2-6 [2, 1024, 13, 13] --
│ │ └─BasicConv: 3-10 [2, 1024, 13, 13] 4,720,640
│ │ └─Sequential: 3-11 [2, 1024, 13, 13] 20,983,808
├─YoloBlock: 1-2 [2, 512, 13, 13] --
│ └─Sequential: 2-7 [2, 512, 13, 13] --
│ │ └─BasicConv: 3-12 [2, 512, 13, 13] 525,312
│ │ └─BasicConv: 3-13 [2, 1024, 13, 13] 4,720,640
│ │ └─BasicConv: 3-14 [2, 512, 13, 13] 525,312
│ │ └─BasicConv: 3-15 [2, 1024, 13, 13] 4,720,640
│ │ └─BasicConv: 3-16 [2, 512, 13, 13] 525,312
│ └─BasicConv: 2-8 [2, 1024, 13, 13] --
│ │ └─Sequential: 3-17 [2, 1024, 13, 13] 4,720,640
├─DetectionLayer: 1-3 [2, 3, 13, 13, 25] --
│ └─Conv2d: 2-9 [2, 75, 13, 13] 76,875
├─Upsampling: 1-4 [2, 256, 26, 26] --
│ └─Sequential: 2-10 [2, 256, 26, 26] --
│ │ └─BasicConv: 3-18 [2, 256, 13, 13] 131,584
│ │ └─Upsample: 3-19 [2, 256, 26, 26] --
├─YoloBlock: 1-5 [2, 256, 26, 26] --
│ └─Sequential: 2-11 [2, 256, 26, 26] --
│ │ └─BasicConv: 3-20 [2, 256, 26, 26] 197,120
│ │ └─BasicConv: 3-21 [2, 512, 26, 26] 1,180,672
│ │ └─BasicConv: 3-22 [2, 256, 26, 26] 131,584
│ │ └─BasicConv: 3-23 [2, 512, 26, 26] 1,180,672
│ │ └─BasicConv: 3-24 [2, 256, 26, 26] 131,584
│ └─BasicConv: 2-12 [2, 512, 26, 26] --
│ │ └─Sequential: 3-25 [2, 512, 26, 26] 1,180,672
├─DetectionLayer: 1-6 [2, 3, 26, 26, 25] --
│ └─Conv2d: 2-13 [2, 75, 26, 26] 38,475
├─Upsampling: 1-7 [2, 128, 52, 52] --
│ └─Sequential: 2-14 [2, 128, 52, 52] --
│ │ └─BasicConv: 3-26 [2, 128, 26, 26] 33,024
│ │ └─Upsample: 3-27 [2, 128, 52, 52] --
├─YoloBlock: 1-8 [2, 128, 52, 52] --
│ └─Sequential: 2-15 [2, 128, 52, 52] --
│ │ └─BasicConv: 3-28 [2, 128, 52, 52] 49,408
│ │ └─BasicConv: 3-29 [2, 256, 52, 52] 295,424
│ │ └─BasicConv: 3-30 [2, 128, 52, 52] 33,024
│ │ └─BasicConv: 3-31 [2, 256, 52, 52] 295,424
│ │ └─BasicConv: 3-32 [2, 128, 52, 52] 33,024
│ └─BasicConv: 2-16 [2, 256, 52, 52] --
│ │ └─Sequential: 3-33 [2, 256, 52, 52] 295,424
├─DetectionLayer: 1-9 [2, 3, 52, 52, 25] --
│ └─Conv2d: 2-17 [2, 75, 52, 52] 19,275
==============================================================================================================
Total params: 61,626,049
Trainable params: 61,626,049
Non-trainable params: 0
Total mult-adds (G): 65.43
==============================================================================================================
Input size (MB): 4.15
Forward/backward pass size (MB): 1229.50
Params size (MB): 246.50
Estimated Total Size (MB): 1480.15
==============================================================================================================
# Anchors
ANCHORS = [
[(0.28, 0.22), (0.38, 0.48), (0.9, 0.78)],
[(0.07, 0.15), (0.15, 0.11), (0.14, 0.29)],
[(0.02, 0.03), (0.04, 0.07), (0.08, 0.06)],
]
GRID_SIZE = [13, 26, 52]
scaled_anchors = torch.tensor(ANCHORS) / (
1 / torch.tensor(GRID_SIZE).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2)
)
print(scaled_anchors, scaled_anchors.shape)
"""
tensor([[[ 3.6400, 2.8600],
[ 4.9400, 6.2400],
[11.7000, 10.1400]],
[[ 1.8200, 3.9000],
[ 3.9000, 2.8600],
[ 3.6400, 7.5400]],
[[ 1.0400, 1.5600],
[ 2.0800, 3.6400],
[ 4.1600, 3.1200]]]) torch.Size([3, 3, 2])
"""
# Loss function
# Source: https://www.geeksforgeeks.org/yolov3-from-scratch-using-pytorch/
def iou(box1, box2, is_pred = True):
if is_pred:
# IoU score for prediction and label
# box1 (prediction) and box2 (label) are both in [x, y, width, height] format
box1_x_center = box1[..., 0:1]; box2_x_center = box2[..., 0:1]
box1_y_center = box1[..., 1:2]; box2_y_center = box2[..., 1:2]
box1_width = box1[..., 2:3]; box2_width = box2[..., 2:3]
box1_height = box1[..., 3:4]; box2_height = box2[..., 3:4]
# Box coordinates for prediction
box1_xmin = box1_x_center - box1_width / 2
box1_ymin = box1_y_center - box1_height / 2
box1_xmax = box1_x_center + box1_width / 2
box1_ymax = box1_y_center + box1_height / 2
# Box coordinates for ground truth
box2_xmin = box2_x_center - box2_width / 2
box2_ymin = box2_y_center - box2_height / 2
box2_xmax = box2_x_center + box2_width / 2
box2_ymax = box2_y_center + box2_height / 2
# Get the coordinates of the intersection rectangle
its_xmin = torch.max(box1_xmin, box2_xmin)
its_ymin = torch.max(box1_ymin, box2_ymin)
its_xmax = torch.min(box1_xmax, box2_xmax)
its_ymax = torch.min(box1_ymax, box2_ymax)
# Calculate Intersection area (min: 0)
intersection_area = (its_xmax - its_xmin).clamp(min = 0) * (its_ymax - its_ymin).clamp(min = 0)
# Calculate the union area
box1_area = abs(box1_width * box1_height)
box2_area = abs(box2_width * box2_height)
union = box1_area + box2_area - intersection_area
# Calculate the IoU score
epsilon = 1e-6
iou_score = intersection_area / (union + epsilon)
return iou_score
else:
        # IoU based only on width and height (assumes the two boxes share the same center),
        # used for matching ground-truth boxes against anchors
        box1_width = box1[..., 0]; box2_width = box2[..., 0]
        box1_height = box1[..., 1]; box2_height = box2[..., 1]
        # Calculate intersection area
        intersection_area = torch.min(box1_width, box2_width) * torch.min(box1_height, box2_height)
# Calculate union area
box1_area = box1_width * box1_height
box2_area = box2_width * box2_height
union_area = box1_area + box2_area - intersection_area
# Calculate the IoU score
iou_score = intersection_area / union_area
# Return IoU score
return iou_score
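# Quick sanity check of both modes with arbitrary illustrative boxes:
# two (x_center, y_center, w, h) boxes overlapping by half their width.
box_a = torch.tensor([[0.5, 0.5, 0.2, 0.2]])
box_b = torch.tensor([[0.6, 0.5, 0.2, 0.2]])
print(iou(box_a, box_b))                   # tensor([[0.3333]]) = 0.02 / 0.06
# Width/height-only mode, used when matching ground-truth boxes to anchors
wh_a = torch.tensor([0.2, 0.2])
wh_b = torch.tensor([0.1, 0.4])
print(iou(wh_a, wh_b, is_pred = False))    # tensor(0.3333)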
def convert_cells_to_bboxes(predictions, anchors, s, is_predictions = True):
batch_size = predictions.shape[0]
num_anchors = len(anchors)
box_predictions = predictions[..., 1:5]
    # If the input is raw predictions, apply sigmoid to the x and y offsets,
    # exponentiate the width and height (scaled by the anchors), and compute
    # the objectness score and best class.
if is_predictions:
anchors = anchors.reshape(1, len(anchors), 1, 1, 2)
box_predictions[..., 0:2] = torch.sigmoid(box_predictions[..., 0:2])
        box_predictions[..., 2:] = torch.exp(box_predictions[..., 2:]) * anchors
scores = torch.sigmoid(predictions[..., 0:1])
        best_class = torch.argmax(predictions[..., 5:], dim = -1).unsqueeze(-1)
# Else we will just calculate scores and best class.
else:
scores = predictions[..., 0:1]
best_class = predictions[..., 5:6]
# Calculate cell indices
cell_indices = (
torch.arange(s)
.repeat(predictions.shape[0], 3, s, 1)
.unsqueeze(-1)
.to(predictions.device)
)
# Calculate x, y, width and height with proper scaling
x = 1 / s * (box_predictions[..., 0:1] + cell_indices)
y = 1 / s * (box_predictions[..., 1:2] + cell_indices.permute(0, 1, 3, 2, 4))
width_height = 1 / s * box_predictions[..., 2:4]
    # Concatenating the values and reshaping them into
# (BATCH_SIZE, num_anchors * S * S, 6) shape
converted_bboxes = torch.cat(
(best_class, scores, x, y, width_height), dim = -1
).reshape(batch_size, num_anchors * s * s, 6)
# Returning the reshaped and converted bounding box list
return converted_bboxes.tolist()
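# Example: decoding the 13 x 13 head of the model above.
# Random values, so only the shapes are meaningful here.
preds = torch.randn(1, 3, 13, 13, 25)
boxes = convert_cells_to_bboxes(preds, scaled_anchors[0], s = 13)
print(len(boxes[0]), len(boxes[0][0]))     # 507 (= 3 * 13 * 13), 6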
class YoloLoss(nn.Module):
def __init__(self):
super().__init__()
self.mse = nn.MSELoss()
self.bce = nn.BCEWithLogitsLoss()
self.ce = nn.CrossEntropyLoss()
self.sigmoid = nn.Sigmoid()
def forward(self, pred, target, anchors):
# Identifying which cells in target have objects and which have no objects
obj = target[..., 0] == 1
no_obj = target[..., 0] == 0
# Calculating No object loss
no_object_loss = self.bce(
(pred[..., 0:1][no_obj]), (target[..., 0:1][no_obj]),
)
# Reshaping anchors to match predictions
anchors = anchors.reshape(1, 3, 1, 1, 2)
        # Decode predicted boxes: sigmoid on the x, y offsets, exp on w, h scaled by the anchors
box_preds = torch.cat([self.sigmoid(pred[..., 1:3]),
torch.exp(pred[..., 3:5]) * anchors], dim = -1)
# Calculating IoU for prediction and target
ious = iou(box_preds[obj], target[..., 1:5][obj]).detach()
# Calculating Object loss
object_loss = self.mse(self.sigmoid(pred[..., 0:1][obj]),
ious * target[..., 0:1][obj])
# Predicted box coordinates
pred[..., 1:3] = self.sigmoid(pred[..., 1:3])
        # Convert target w, h into the log-space offsets the network predicts
target[..., 3:5] = torch.log(1e-6 + target[..., 3:5] / anchors)
# Calculating box coordinates
box_loss = self.mse(pred[..., 1:5][obj], target[..., 1:5][obj])
# Calculating class loss
        class_loss = self.ce(pred[..., 5:][obj], target[..., 5][obj].long())
# Total loss
return (
box_loss
+ object_loss
+ no_object_loss
+ class_loss
)
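# Minimal smoke test, assuming a target layout of (objectness, x, y, w, h, class)
# with box coordinates in grid-cell units; one object is placed in cell (6, 6)
# with the size of the first 13 x 13 anchor.
criterion = YoloLoss()
pred = torch.randn(1, 3, 13, 13, 25)
target = torch.zeros(1, 3, 13, 13, 6)
target[0, 0, 6, 6] = torch.tensor([1.0, 0.5, 0.5, 3.64, 2.86, 7.0])
print(criterion(pred, target, scaled_anchors[0]))   # scalar loss tensor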