Ultra fast lane detection 코드 리뷰

박민서·2023년 8월 25일

딥러닝 공부

목록 보기

1/9

backbone

class vgg16bn(torch.nn.Module):
    """Feature extractor built from torchvision's ``vgg16_bn``.

    Only the ``features`` sub-network is used.  Its child modules are kept
    in their original order, except that the child at index 33 and every
    child from index 43 onwards are left out.

    Args:
        pretrained: forwarded unchanged to ``torchvision.models.vgg16_bn``.
    """

    def __init__(self,pretrained = False):
        super(vgg16bn,self).__init__()
        features = torchvision.models.vgg16_bn(pretrained=pretrained).features
        # Retain children 0..32 and 34..42; everything else is skipped.
        kept = [
            layer
            for idx, layer in enumerate(features.children())
            if idx < 33 or 34 <= idx < 43
        ]
        self.model = torch.nn.Sequential(*kept)

    def forward(self,x):
        """Return the output of the retained layers applied to ``x``."""
        return self.model(x)
class resnet(torch.nn.Module):
    """ResNet-family backbone that exposes three intermediate feature maps.

    The stem (``conv1``/``bn1``/``relu``/``maxpool``) and the four residual
    stages of the selected torchvision model are reused; the classifier part
    of that model is not kept.

    Args:
        layers: variant key, one of ``'18'``, ``'34'``, ``'50'``, ``'101'``,
            ``'152'``, ``'50next'``, ``'101next'``, ``'50wide'``,
            ``'101wide'``.  Non-string values are converted with ``str()``,
            so ``18`` is accepted as well as ``'18'``.
        pretrained: forwarded unchanged to the torchvision constructor.

    Raises:
        NotImplementedError: if ``layers`` is not one of the supported keys.
    """

    # Variant key -> constructor name in ``torchvision.models``.  Names (not
    # function objects) are stored so that only the requested constructor is
    # looked up, exactly like the branch-per-variant form this replaces.
    _BUILDERS = {
        '18': 'resnet18',
        '34': 'resnet34',
        '50': 'resnet50',
        '101': 'resnet101',
        '152': 'resnet152',
        '50next': 'resnext50_32x4d',
        '101next': 'resnext101_32x8d',
        '50wide': 'wide_resnet50_2',
        '101wide': 'wide_resnet101_2',
    }

    def __init__(self,layers,pretrained = False):
        super(resnet,self).__init__()
        builder_name = self._BUILDERS.get(str(layers))
        if builder_name is None:
            raise NotImplementedError(
                'unsupported resnet variant: %r' % (layers,))
        model = getattr(torchvision.models, builder_name)(pretrained=pretrained)

        self.conv1 = model.conv1
        self.bn1 = model.bn1
        self.relu = model.relu
        self.maxpool = model.maxpool
        self.layer1 = model.layer1
        self.layer2 = model.layer2
        self.layer3 = model.layer3
        self.layer4 = model.layer4

    def forward(self,x):
        """Return the outputs of ``layer2``, ``layer3`` and ``layer4``."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x2 = self.layer2(x)
        x3 = self.layer3(x2)
        x4 = self.layer4(x3)
        return x2,x3,x4

기본적으로 16개 계층으로 구성된 CNN인 VGG-16과 ResNet을 backbone으로 사용한다.
ResNet은 layers의 수를 지정할 수 있으며 지정된 layers에 따라 pretrained된 모델을 선택할 수 있다.

model

class conv_bn_relu(torch.nn.Module):
    """2-D convolution followed by batch normalisation and ReLU.

    Args:
        in_channels: number of input channels of the convolution.
        out_channels: number of output channels; also the number of
            features of the batch-norm layer.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        padding: convolution padding.
        dilation: convolution dilation.
        bias: whether the convolution has a bias term (off by default).
    """

    def __init__(self,in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1,bias=False):
        super(conv_bn_relu,self).__init__()
        self.conv = torch.nn.Conv2d(in_channels,out_channels, kernel_size,
            stride = stride, padding = padding, dilation = dilation,bias = bias)
        self.bn = torch.nn.BatchNorm2d(out_channels)
        self.relu = torch.nn.ReLU()

    def forward(self,x):
        """Apply conv -> batch norm -> ReLU to ``x`` and return the result."""
        return self.relu(self.bn(self.conv(x)))

conv_bn_relu, 즉 컨볼루션 연산과 Batch Normalization, relu연산이 이루어지는 클래스에는 두 가지 함수가 존재한다.
첫 번째로는 입력 채널의 수, 출력 채널의 수, 커널(필터)의 크기, 스트라이드 값, 패딩 값, 확장 값, 편향 값의 인자들을 받아 초기화하는 함수이고, 두 번째로는 실제 순전파 연산을 수행하는 함수로써 합성곱, 배치정규화, ReLU를 순차적으로 적용하여 출력을 생성한다.

class parsingNet(torch.nn.Module):
    """Lane-parsing network: ResNet backbone + row-wise classification head.

    Args:
        size: input image size; the default is ``(288, 800)``.  The width of
            the classification head is derived from this value, so it has to
            match the images that are actually fed to ``forward``.
        pretrained: forwarded to the ``resnet`` backbone.
        backbone: backbone variant key passed to ``resnet``.  ``'18'`` and
            ``'34'`` use the narrow channel layout (128/256/512), every other
            key the wide one (512/1024/2048).
        cls_dim: ``(num_gridding, num_cls_per_lane, num_of_lanes)`` where
            ``num_cls_per_lane`` is the number of row anchors.
        use_aux: when true, an auxiliary segmentation branch is built and
            ``forward`` returns its output as well.
    """

    def __init__(self, size=(288, 800), pretrained=True, backbone='50', cls_dim=(37, 10, 4), use_aux=False):
        super(parsingNet, self).__init__()

        self.size = size
        # NOTE(review): with size=(288, 800) and n-c-h-w inputs, size[0] looks
        # like the height and size[1] like the width, i.e. these two names
        # seem swapped.  They are kept as-is for backward compatibility.
        self.w = size[0]
        self.h = size[1]
        self.cls_dim = cls_dim # (num_gridding, num_cls_per_lane, num_of_lanes)
        # num_cls_per_lane is the number of row anchors
        self.use_aux = use_aux
        self.total_dim = np.prod(cls_dim)

        # Channel counts of the three backbone outputs for this variant.
        narrow = backbone in ['34','18']
        c2, c3, c4 = (128, 256, 512) if narrow else (512, 1024, 2048)

        # input : nchw,
        # output: (w+1) * sample_rows * 4
        self.model = resnet(backbone, pretrained=pretrained)

        if self.use_aux:
            self.aux_header2 = torch.nn.Sequential(
                conv_bn_relu(c2, 128, kernel_size=3, stride=1, padding=1),
                conv_bn_relu(128,128,3,padding=1),
                conv_bn_relu(128,128,3,padding=1),
                conv_bn_relu(128,128,3,padding=1),
            )
            self.aux_header3 = torch.nn.Sequential(
                conv_bn_relu(c3, 128, kernel_size=3, stride=1, padding=1),
                conv_bn_relu(128,128,3,padding=1),
                conv_bn_relu(128,128,3,padding=1),
            )
            self.aux_header4 = torch.nn.Sequential(
                conv_bn_relu(c4, 128, kernel_size=3, stride=1, padding=1),
                conv_bn_relu(128,128,3,padding=1),
            )
            self.aux_combine = torch.nn.Sequential(
                conv_bn_relu(384, 256, 3,padding=2,dilation=2),
                conv_bn_relu(256, 128, 3,padding=2,dilation=2),
                conv_bn_relu(128, 128, 3,padding=2,dilation=2),
                conv_bn_relu(128, 128, 3,padding=4,dilation=4),
                torch.nn.Conv2d(128, cls_dim[-1] + 1,1)
                # output : n, num_of_lanes+1, h, w
            )
            initialize_weights(self.aux_header2,self.aux_header3,self.aux_header4,self.aux_combine)

        # Flattened size of the 8-channel map produced by ``self.pool``.  The
        # last backbone stage is at 1/32 resolution, so the default
        # (288, 800) input gives 8 * 9 * 25 = 1800, the value that used to be
        # hard-coded here.  Sizes that are not multiples of 32 are rounded
        # up -- assumes every stride-2 stage of the backbone rounds up;
        # TODO confirm for non-default sizes.
        self.input_dim = 8 * ((size[0] + 31) // 32) * ((size[1] + 31) // 32)

        self.cls = torch.nn.Sequential(
            torch.nn.Linear(self.input_dim, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, self.total_dim),
        )

        # 1x1 convolution that squeezes the last feature map to 8 channels.
        self.pool = torch.nn.Conv2d(c4,8,1)
        # 1/32, 512 or 2048 channels
        # 288,800 -> 9,25 spatial
        # (w+1) * sample_rows * 4
        # 37 * 10 * 4
        initialize_weights(self.cls)

    def forward(self, x):
        """Predict the grouped lane classification for a batch of images.

        Returns:
            ``group_cls`` reshaped to ``(-1, *cls_dim)``; when ``use_aux`` is
            set, the tuple ``(group_cls, aux_seg)`` instead.
        """
        # n c h w -> three backbone feature maps; ``fea`` is the last one
        x2,x3,fea = self.model(x)
        if self.use_aux:
            x2 = self.aux_header2(x2)
            x3 = self.aux_header3(x3)
            # Upsample x3 by 2 and x4 by 4 so they can be concatenated with x2.
            x3 = torch.nn.functional.interpolate(x3,scale_factor = 2,mode='bilinear')
            x4 = self.aux_header4(fea)
            x4 = torch.nn.functional.interpolate(x4,scale_factor = 4,mode='bilinear')
            aux_seg = torch.cat([x2,x3,x4],dim=1)
            aux_seg = self.aux_combine(aux_seg)
        else:
            aux_seg = None

        # Keep the batch axis and flatten the rest.  A mismatch between
        # ``size`` and the real input then fails in the linear layer instead
        # of silently reshuffling samples across the batch dimension.
        fea = torch.flatten(self.pool(fea), 1)

        group_cls = self.cls(fea).view(-1, *self.cls_dim)

        if self.use_aux:
            return group_cls, aux_seg

        return group_cls

'ParsingNet'클래스에서는 입력 이미지에서 차선을 감지하고 예측하는 기능을 구현한다.

init 메서드에서는 입력 이미지의 크기, pretrained된 가중치를 사용할지에 대한 여부, 백본에서 사용할 ResNet의 버전-50으로 선택-, 차선클래스의 차원을 나타내는 값(그리드의 수, 차선 당 클래스의 수, 차선의 수), aux(보조 분류기)를 사용할지에 대한 여부를 정하고 초기화한다.

차례대로 살펴보면 입력 이미지의 크기에 대해 입력받고 보조 분류기를 사용한다. 보조분류기는 네트워크 아키텍처 중간에 추가되며, 입력 이미지의 특징을 사용하여 다양한 크기의 정보를 확보하는 기능을 한다. 각 보조 분류기는 네트워크 중간에 들어가 처리하기 때문에 입력받는 이미지의 크기와 필터 사이즈가 다르다. 그 과정은 입력 데이터를 합성곱, 배치정규화, ReLU활성화함수를 순차적으로 적용하는 것으로 이루어진다.

메인 분류를 하는 self.cls와 self.pool은 차선 분류를 하는데 수행되며, 주로 네트워크의 마지막 부분에 위치한다. 이 과정을 통해 차선 클래스의 점수를 예측할 수 있다.
cls 레이어는 두 개의 선형 레이어와 그 사이의 ReLU 활성화 함수로 구성되며, 1800 크기의 입력을 받아 self.total_dim 크기의 출력을 반환한다.
pool 레이어는 이름과 달리 풀링 연산이 아니라 1×1 합성곱으로 채널 수를 8로 줄이는 역할을 한다. backbone 네트워크인 ResNet의 버전이 34, 18일 때는 입력 채널 512, 출력 채널 8, 커널 크기 1인 합성곱 연산을 하고, 그 외에는 입력 채널 2048, 출력 채널 8, 커널 크기 1인 합성곱 연산을 한다.

forward메서드에서는 백본 네트워크의 출력으로부터 x2, x3, fea(feature)라는 3개의 feature map을 얻은 후 보조 분류기 레이어에 통과시켜 보조 분류 feature map을 생성한다. 메인 분류에서는 fea라는 feature map을 pooling 레이어에 통과시켜 feature map의 차원을 줄이고, 1차원 텐서로 변환한다. 그 다음 줄어든 feature map을 cls레이어에 통과시켜 차선 클래스의 점수를 예측하고 cls_dim차원에 맞게 형태를 재조정한다.

마지막으로 예측값을 반환한다.

initialize_weights함수를 통해 가중치를 초기화하는 모델을 입력받고 real_init_weights함수를 통해 실제로 가중치를 초기화한다.

손실함수

class OhemCELoss(nn.Module):
    """Cross-entropy loss with online hard example mining (OHEM).

    Per-element cross-entropy losses are sorted in descending order.  If the
    element at position ``n_min`` is still above the threshold, every loss
    above the threshold is averaged; otherwise the ``n_min`` largest losses
    are averaged.

    Args:
        thresh: probability threshold; it is stored as ``-log(thresh)`` so
            that it can be compared with loss values directly.
        n_min: number of losses that are kept when too few exceed the
            threshold.
        ignore_lb: label value passed to ``CrossEntropyLoss`` as
            ``ignore_index``.
    """

    def __init__(self, thresh, n_min, ignore_lb=255, *args, **kwargs):
        super(OhemCELoss, self).__init__()
        # Kept on the CPU here and moved to the loss device in ``forward`` so
        # that the module can be constructed on hosts without CUDA.
        self.thresh = -torch.log(torch.tensor(thresh, dtype=torch.float))
        self.n_min = n_min
        self.ignore_lb = ignore_lb
        self.criteria = nn.CrossEntropyLoss(ignore_index=ignore_lb, reduction='none')

    def forward(self, logits, labels):
        """Return the mean of the mined per-element losses.

        When there are ``n_min`` or fewer loss values in total, all of them
        are averaged.
        """
        loss = self.criteria(logits, labels).view(-1)
        thresh = self.thresh.to(loss.device)
        loss, _ = torch.sort(loss, descending=True)
        # The numel() guard keeps the index below in range.
        if loss.numel() > self.n_min and loss[self.n_min] > thresh:
            loss = loss[loss > thresh]
        else:
            loss = loss[:self.n_min]
        return torch.mean(loss)

classification을 하기 위한 손실함수이다.
우선 __init__ 메서드에서 임계값과 최소 샘플 개수, 무시할 레이블의 인덱스를 입력받고 초기화한다.
그리고 nn.CrossEntropyLoss의 인스턴스인 criteria를 생성한다.

forward 메서드에서는 네트워크의 출력으로 받은 로짓 텐서를 통해 그 사이즈에 따라 배치크기, 채널 수, 높이, 너비를 입력받는다.
__init__ 메서드에서 생성한 criteria를 통해 각 샘플에 대한 손실값을 구하고, 이를 내림차순으로 정렬한다. 이 값 중에서 n_min번째 값이 임계값보다 크다면 임계값보다 큰 손실값들만 선택하여 사용한다. 그렇지 않으면 처음부터 n_min개의 손실값만 사용한다.
마지막으로 손실 값들의 평균을 계산하여 최종 손실 값을 반환한다.

논문에서는 classification을 위한 손실함수 외에도 차선위치에 대한 포인트학습을 목표로 하는 두 가지 손실함수를 제안한다.

class ParsingRelationLoss(nn.Module):
    """Smooth-L1 penalty on the difference between adjacent rows of logits.

    For a 4-D input the slices at index ``i`` and ``i + 1`` along dimension 2
    are subtracted for every ``i``, and the smooth L1 loss between those
    differences and zero is returned.
    """

    def __init__(self):
        super(ParsingRelationLoss, self).__init__()

    def forward(self,logits):
        """Return the mean smooth-L1 loss over all adjacent-row differences.

        With fewer than two rows there is no adjacent pair, so a zero scalar
        is returned.
        """
        _, _, h, _ = logits.shape
        if h < 2:
            return logits.new_zeros(())
        # Rows 0..h-2 minus rows 1..h-1, all pairs in one slice operation.
        diff = logits[:, :, :-1, :] - logits[:, :, 1:, :]
        return torch.nn.functional.smooth_l1_loss(diff, torch.zeros_like(diff))

logits은 네트워크의 출력으로부터 얻은 값으로, 각 픽셀이 특정 클래스에 속할 확률을 나타내는 값이다.
n, c, h, w는 로짓 텐서의 차원을 나타내는 값으로, 각각 배치크기, 클래스 수, 로짓의 높이, 로짓의 너비를 의미한다.
이후 for문을 통해 각 위치에서의 로짓 값 간의 차이를 기록한다.
마지막으로 기록된 로짓 값 차이 텐서와 모두 0으로 이루어진 텐서 간의 smooth L1 Loss를 계산한다.

class ParsingRelationDis(nn.Module):
    """L1 penalty on the change between consecutive row-to-row position steps.

    The last channel of the input is dropped, a softmax is taken over the
    remaining ``dim - 1`` channels, and the expected channel index is used as
    a differentiable position per ``(row, column)``.  For the first
    ``num_rows // 2`` rows the step between neighbouring rows is computed,
    and consecutive steps are compared with an L1 loss.
    """

    def __init__(self):
        super(ParsingRelationDis, self).__init__()
        self.l1 = torch.nn.L1Loss()

    def forward(self, x):
        """Return the mean absolute difference between consecutive steps.

        With ``num_rows < 4`` there are fewer than two steps to compare, so a
        zero scalar is returned.
        """
        n,dim,num_rows,num_cols = x.shape
        # presumably the dropped last channel is the "no lane" class -- TODO confirm
        x = torch.nn.functional.softmax(x[:,:dim-1,:,:],dim=1)
        embedding = torch.arange(
            dim - 1, dtype=torch.float32, device=x.device).view(1,-1,1,1)
        # Expected channel index per (row, column).
        pos = torch.sum(x*embedding,dim = 1)

        half = num_rows // 2
        # step[:, i] = pos[:, i] - pos[:, i + 1] for i in range(half)
        step = pos[:, :half, :] - pos[:, 1:half + 1, :]
        if step.shape[1] < 2:
            return pos.new_zeros(())
        # Every consecutive pair covers the same number of elements, so one
        # mean over all pairs equals the average of the per-pair L1 means.
        return self.l1(step[:, :-1, :], step[:, 1:, :])

ParsingRelationDis함수는 차선의 연속성을 평가하는 기능을 한다.
우선 x는 네트워크의 텐서인 로짓 텐서로, 특정 픽셀에서 어떤 클래스에 속할 확률을 나타내는 값이다.
n, dim, num_rows, num_cols는 로짓 텐서의 차원을 나타내는 값으로 각각 배치 크기, 클래스 수, 높이, 너비를 의미한다.
이후 torch.nn.functional.softmax함수를 사용하여 로짓을 확률값으로 변환한다. 이 작업을 통해 픽셀이 특정 클래스에 속하는 확률을 얻을 수 있다.
embedding은 클래스의 인덱스를 나타내는 텐서이고, pos는 embedding의 값을 통해 각 위치에서 추정된 차선 위치를 얻을 수 있다.
반복문은 로짓의 높이의 절반에 대해 반복하며, 이를 통해 추정된 차선 위치로부터 얻은 행 간의 차이를 얻는다.
다음으로 계산된 차이들의 평균 loss를 계산하고, 이를 통해 추정된 차선 위치간의 평균적인 차이를 계산한다.
마지막으로 계산된 손실값을 반환한다.
위 과정을 통해 얻은 값을 이용하여 차선 위치간의 연속성을 학습하는데 사용할 수 있다.

class SoftmaxFocalLoss(nn.Module):
    """Focal loss computed from softmax probabilities.

    Each log-probability is scaled by ``(1 - p) ** gamma`` before being fed
    to a negative log-likelihood loss.

    Args:
        gamma: exponent of the ``(1 - p)`` modulating factor.
        ignore_lb: label value passed to ``NLLLoss`` as ``ignore_index``.
    """

    def __init__(self, gamma, ignore_lb=255, *args, **kwargs):
        super(SoftmaxFocalLoss, self).__init__()
        self.gamma = gamma
        self.nll = nn.NLLLoss(ignore_index=ignore_lb)

    def forward(self, logits, labels):
        """Return the focal loss of ``logits`` (classes on dim 1) against ``labels``."""
        probs = F.softmax(logits, dim=1)
        log_probs = F.log_softmax(logits, dim=1)
        # Down-weight log-probabilities by (1 - p) ** gamma.
        modulated = torch.pow(1. - probs, self.gamma) * log_probs
        return self.nll(modulated, labels)

위 softmax함수는 차선 위치에 대한 손실함수를 계산할 때 미분이 불가능한 부분을 해결하기 위해 사용된다. 다시말해 차선의 모양을 제한할 때 이차 미분 방정식을 사용하는데, 그 과정에서 사용되는 argmax함수가 미분이 불가능하다는 점을 해결하기 위해 사용한다.

forward 메서드는 두 개의 인자 logits와 labels를 받습니다. logits는 네트워크의 출력인 로짓 텐서이며, labels는 정답 레이블 텐서입니다.
F.softmax 함수를 사용하여 logits를 확률 분포로 변환합니다. 이로부터 각 클래스에 속할 확률 값을 얻습니다.
torch.pow 함수를 사용하여 1 - scores를 gamma만큼 거듭제곱하여 factor를 계산합니다. 이 값은 Focal Loss의 가중치 역할을 수행합니다.
F.log_softmax 함수를 사용하여 logits를 로그 소프트맥스 값으로 변환합니다.
factor와 log_score를 곱하여 Focal Loss에서 클래스의 가중치를 적용한 값을 얻습니다.

이 클래스는 특히 클래스 불균형 문제를 해결하고자 할 때 사용되며, 더욱 어려운 클래스에 대한 학습을 강화하는 데 도움이 될 수 있습니다.

박민서

다음 포스트

Ultra fast lane detection 코드 리뷰

딥러닝 공부

backbone

model

손실함수

아키텍처란?

0개의 댓글