Paper Review / Translation / Implementation
This paper was written by Abien Fred M. Agarap, who says he was inspired to pursue this research after reading Yichuan Tang's "Deep Learning using Linear Support Vector Machines." Links to both the referenced paper and the paper under review are attached in the reference sources below.
(Reference) Paper sources
A CNN (convolutional neural network) is composed of hidden layers with learnable parameters; each neuron takes an input, computes a dot product with its weights, and applies a non-linearity. The network serves as the mapping from the raw image to the corresponding class scores. (A softmax function is typically used at the final stage of a CNN.)
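As a minimal sketch of that final softmax stage (illustrative values only, not from the paper):

import torch

# Hypothetical raw class scores (logits) for one image over 3 classes
scores = torch.tensor([2.0, 1.0, 0.1])

# Softmax maps the scores to a probability distribution over the classes
probs = torch.softmax(scores, dim=0)
print(probs)         # tensor([0.6590, 0.2424, 0.0986])
print(probs.sum())   # tensor(1.) -- probabilities sum to one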
However, several papers have raised issues with this approach:
The papers above commonly propose using a linear SVM instead; the earlier author (Tang) replaced the softmax at the final CNN stage with an SVM and carried out the analysis. The experiments use two datasets:
- MNIST
- Fashion-MNIST
The author argues that even though the reported accuracy is slightly lower, performance could be improved further by using a more sophisticated CNN.
💡 Why this paper was chosen for review
Although this paper does not set a state-of-the-art (SOTA) record, it was chosen because it became a basis for later work across various vision tasks that uses an SVM classifier as the final stage. It was also an occasion to explore and organize whether a simple change to a model can improve its performance in recent research.
# Load libraries
import numpy as np                       # needed by the NumPy SVM implementation below
import torch
import torch.nn as nn
import torch.nn.init
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
# GPU setup
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fix the random seed for reproducibility
torch.manual_seed(123)

# Also fix the CUDA seed when a GPU is available
if device == 'cuda':
    torch.cuda.manual_seed_all(123)
# Define a transform to normalize the data
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean=(0.5,), std=(0.5,))])
Table 1: Dataset distribution for both MNIST and Fashion-MNIST
# Download and load the training data
fashion_trainset = datasets.FashionMNIST('~/.pytorch/F_MNIST_data/', download=True, train=True, transform=transform)
fashion_trainloader = torch.utils.data.DataLoader(fashion_trainset, batch_size=128, shuffle=True)
# Download and load the test data
fashion_testset = datasets.FashionMNIST('~/.pytorch/F_MNIST_data/', download=True, train=False, transform=transform)
fashion_testloader = torch.utils.data.DataLoader(fashion_testset, batch_size=128, shuffle=False)  # no need to shuffle the test set
# Download and load the training data
mnist_trainset = datasets.MNIST('~/.pytorch/MNIST_data/', download=True, train=True, transform=transform)
mnist_trainloader = torch.utils.data.DataLoader(mnist_trainset, batch_size=128, shuffle=True)
# Download and load the test data
mnist_testset = datasets.MNIST('~/.pytorch/MNIST_data/', download=True, train=False, transform=transform)
mnist_testloader = torch.utils.data.DataLoader(mnist_testset, batch_size=128, shuffle=False)  # no need to shuffle the test set
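As a quick sanity check (not part of the original paper's code), one batch can be pulled from the loader and displayed with matplotlib, which is already imported above:

# Illustrative check: visualize one Fashion-MNIST training image
images, labels = next(iter(fashion_trainloader))
print(images.shape)   # torch.Size([128, 1, 28, 28])

# Undo the (0.5, 0.5) normalization for display
img = images[0].squeeze() * 0.5 + 0.5
plt.imshow(img, cmap='gray')
plt.title('label: {}'.format(labels[0].item()))
plt.show()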
class SVM:
    # Set learning rate, regularization strength (lambda), and number of iterations
    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000):
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None

    # Fit a linear SVM by sub-gradient descent on the hinge loss
    def fit(self, X, y):
        n_samples, n_features = X.shape
        y_ = np.where(y <= 0, -1, 1)   # map labels to {-1, +1}
        self.w = np.zeros(n_features)
        self.b = 0
        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1
                if condition:
                    # Correctly classified with margin: only the regularizer contributes
                    self.w -= self.lr * (2 * self.lambda_param * self.w)
                else:
                    # Margin violated: include the hinge-loss sub-gradient
                    self.w -= self.lr * (2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]))
                    self.b -= self.lr * y_[idx]

    # Predict the sign of the decision function
    def predict(self, X):
        approx = np.dot(X, self.w) - self.b
        return np.sign(approx)
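A minimal usage sketch for this class, on toy data made up here for illustration:

# Toy 2D data: class 0 clustered near (0, 0), class 1 near (3, 3)
X = np.array([[0.0, 0.2], [0.3, 0.1], [3.0, 2.8], [2.7, 3.1]])
y = np.array([0, 0, 1, 1])

clf = SVM(learning_rate=0.01, n_iters=1000)
clf.fit(X, y)
print(clf.predict(X))   # expected: [-1. -1.  1.  1.] (labels mapped to {-1, +1})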
Model architecture used by the author (own diagram)
Model architecture used by the author (from the paper)
Model architecture used by the author (own implementation)
class CNN(torch.nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.drop_prob = 0.5
        # Layer 1: Conv(1 -> 32, 5x5) -> ReLU -> MaxPool(2x2, stride 1); 28x28 -> 24x24 -> 23x23
        self.layer1 = torch.nn.Sequential(
            torch.nn.Conv2d(1, 32, kernel_size=5, stride=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2, stride=1))
        # Layer 2: Conv(32 -> 64, 5x5) -> ReLU -> MaxPool(2x2, stride 1); 23x23 -> 19x19 -> 18x18
        self.layer2 = torch.nn.Sequential(
            torch.nn.Conv2d(32, 64, kernel_size=5, stride=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2, stride=1))
        # Fully connected layer (18*18*64 -> 1024) with dropout
        self.fc1 = torch.nn.Linear(18 * 18 * 64, 1024, bias=True)
        torch.nn.init.xavier_uniform_(self.fc1.weight)
        self.layer3 = torch.nn.Sequential(
            self.fc1,
            torch.nn.Dropout(p=self.drop_prob))
        # Output layer (1024 -> 10 classes)
        self.fc2 = torch.nn.Linear(1024, 10, bias=True)
        torch.nn.init.xavier_uniform_(self.fc2.weight)

    # Feed-forward pass
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.view(out.size(0), -1)   # flatten for the fully connected layers
        out = self.layer3(out)
        out = self.fc2(out)
        return out
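A quick shape check (illustrative, not from the paper) confirms the 18 * 18 * 64 flattened size used in fc1:

# One dummy 28x28 grayscale image through the network
dummy = torch.randn(1, 1, 28, 28)
print(CNN()(dummy).shape)   # torch.Size([1, 10]) -- one score per class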
class multiClassHingeLoss(nn.Module):
    def __init__(self, p=1, margin=1, weight=None, size_average=True):
        super(multiClassHingeLoss, self).__init__()
        self.p = p
        self.margin = margin
        self.weight = weight
        self.size_average = size_average

    # Feed-forward: multi-class hinge loss
    def forward(self, output, y):
        # Score of the correct class for each sample, shaped (batch, 1)
        idx = torch.arange(0, y.size(0), device=output.device).long()
        output_y = output[idx, y].view(-1, 1)
        # margin + output(i) - output(y) for every class i
        loss = output - output_y + self.margin
        # Remove the i == y terms
        loss[idx, y] = 0
        # Apply max(0, .)
        loss[loss < 0] = 0
        # Apply the power p
        if self.p != 1:
            loss = torch.pow(loss, self.p)
        # Optional per-class weights
        if self.weight is not None:
            loss = loss * self.weight
        # Sum over classes and the batch
        loss = torch.sum(loss)
        if self.size_average:
            loss /= output.size(0)
        return loss
💡 Wait!! What is hinge loss?
- A loss function devised for maximum-margin classification: it looks for the decision boundary that is farthest from the training data of each class, thereby maximizing the margin between the data and the boundary.
- In a binary classification problem, the hinge loss between the model's prediction y′ (a scalar) and the ground-truth label y (-1 or 1) is defined as follows (a numeric sketch follows this list):

loss(y, y′) = max(0, 1 − y · y′)
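A small numeric sketch of both the binary formula and the multiClassHingeLoss class above (toy values, made up for illustration):

# Binary hinge loss: max(0, 1 - y * y')
y_true, y_pred = 1, 0.3
print(max(0, 1 - y_true * y_pred))   # 0.7 -> prediction is inside the margin, so it is penalized

# The multi-class hinge loss above, on a toy batch of 2 samples and 3 classes
scores = torch.tensor([[2.0, 1.6, -1.0], [1.2, 1.5, 0.9]])
labels = torch.tensor([0, 1])
hinge = multiClassHingeLoss()
print(hinge(scores, labels))   # tensor(0.8500) = ((1.6-2.0+1) + (1.2-1.5+1) + (0.9-1.5+1)) / 2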
Table 2: Hyper-parameters used for the CNN-Softmax and CNN-SVM models.
learning_rate = 0.001
training_epochs = 50
# training_epochs = 10000
# The paper trains for 10,000 epochs, but due to limited compute we run only 50 epochs here
batch_size = 128
# Each model gets its own criterion and optimizer, so the four training loops below do not overwrite one another

# Define the MNIST CNN (softmax) model
mnist_model = CNN().to(device)
mnist_criterion = torch.nn.CrossEntropyLoss().to(device)  # CrossEntropyLoss already applies softmax internally
mnist_optimizer = torch.optim.Adam(mnist_model.parameters(), lr=learning_rate)
total_batch = len(mnist_trainloader)
print('Total number of batches: {}'.format(total_batch))

# Define the MNIST CNN+SVM model
mnist_SVM_model = CNN().to(device)
mnist_SVM_criterion = multiClassHingeLoss().to(device)
mnist_SVM_optimizer = torch.optim.Adam(mnist_SVM_model.parameters(), lr=learning_rate)
total_batch = len(mnist_trainloader)
print('Total number of batches: {}'.format(total_batch))

# Define the Fashion-MNIST CNN (softmax) model
fashion_model = CNN().to(device)
fashion_criterion = torch.nn.CrossEntropyLoss().to(device)
fashion_optimizer = torch.optim.Adam(fashion_model.parameters(), lr=learning_rate)
total_batch = len(fashion_trainloader)
print('Total number of batches: {}'.format(total_batch))

# Define the Fashion-MNIST CNN+SVM model
fashion_SVM_model = CNN().to(device)
fashion_SVM_criterion = multiClassHingeLoss().to(device)
fashion_SVM_optimizer = torch.optim.Adam(fashion_SVM_model.parameters(), lr=learning_rate)
total_batch = len(fashion_trainloader)
print('Total number of batches: {}'.format(total_batch))
# Train mnist_model (CNN-Softmax)
total_batch = len(mnist_trainloader)
for epoch in range(training_epochs):
    avg_cost = 0
    for X, Y in mnist_trainloader:
        X = X.to(device)
        Y = Y.to(device)
        mnist_optimizer.zero_grad()
        hypothesis = mnist_model(X)
        cost = mnist_criterion(hypothesis, Y)
        cost.backward()
        mnist_optimizer.step()
        avg_cost += cost.item() / total_batch
    print('[Epoch: {:>4}] cost = {:>.9}'.format(epoch + 1, avg_cost))

# Train mnist_SVM_model (CNN-SVM)
total_batch = len(mnist_trainloader)
for epoch in range(training_epochs):
    avg_cost = 0
    for X, Y in mnist_trainloader:
        X = X.to(device)
        Y = Y.to(device)
        mnist_SVM_optimizer.zero_grad()
        hypothesis = mnist_SVM_model(X)
        cost = mnist_SVM_criterion(hypothesis, Y)
        cost.backward()
        mnist_SVM_optimizer.step()
        avg_cost += cost.item() / total_batch
    print('[Epoch: {:>4}] cost = {:>.9}'.format(epoch + 1, avg_cost))

# Train fashion_model (CNN-Softmax)
total_batch = len(fashion_trainloader)
for epoch in range(training_epochs):
    avg_cost = 0
    for X, Y in fashion_trainloader:
        X = X.to(device)
        Y = Y.to(device)
        fashion_optimizer.zero_grad()
        hypothesis = fashion_model(X)
        cost = fashion_criterion(hypothesis, Y)
        cost.backward()
        fashion_optimizer.step()
        avg_cost += cost.item() / total_batch
    print('[Epoch: {:>4}] cost = {:>.9}'.format(epoch + 1, avg_cost))

# Train fashion_SVM_model (CNN-SVM)
total_batch = len(fashion_trainloader)
for epoch in range(training_epochs):
    avg_cost = 0
    for X, Y in fashion_trainloader:
        X = X.to(device)
        Y = Y.to(device)
        fashion_SVM_optimizer.zero_grad()
        hypothesis = fashion_SVM_model(X)
        cost = fashion_SVM_criterion(hypothesis, Y)
        cost.backward()
        fashion_SVM_optimizer.step()
        avg_cost += cost.item() / total_batch
    print('[Epoch: {:>4}] cost = {:>.9}'.format(epoch + 1, avg_cost))
# Evaluate mnist_model (CNN-Softmax)
mnist_model.eval()   # switch off dropout for evaluation
with torch.no_grad():
    correct = 0
    total = 0
    for X_test, Y_test in mnist_testloader:
        X_test = X_test.to(device)
        Y_test = Y_test.to(device)
        prediction = mnist_model(X_test)
        predicted = torch.argmax(prediction, 1)
        total += Y_test.size(0)
        correct += (predicted == Y_test).sum().item()
print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))

# Evaluate mnist_SVM_model (CNN-SVM)
mnist_SVM_model.eval()   # switch off dropout for evaluation
with torch.no_grad():
    correct = 0
    total = 0
    for X_test, Y_test in mnist_testloader:
        X_test = X_test.to(device)
        Y_test = Y_test.to(device)
        prediction = mnist_SVM_model(X_test)
        predicted = torch.argmax(prediction, 1)
        total += Y_test.size(0)
        correct += (predicted == Y_test).sum().item()
print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))

# Evaluate fashion_model (CNN-Softmax)
fashion_model.eval()   # switch off dropout for evaluation
with torch.no_grad():
    correct = 0
    total = 0
    for X_test, Y_test in fashion_testloader:
        X_test = X_test.to(device)
        Y_test = Y_test.to(device)
        prediction = fashion_model(X_test)
        predicted = torch.argmax(prediction, 1)
        total += Y_test.size(0)
        correct += (predicted == Y_test).sum().item()
print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))

# Evaluate fashion_SVM_model (CNN-SVM)
fashion_SVM_model.eval()   # switch off dropout for evaluation
with torch.no_grad():
    correct = 0
    total = 0
    for X_test, Y_test in fashion_testloader:
        X_test = X_test.to(device)
        Y_test = Y_test.to(device)
        prediction = fashion_SVM_model(X_test)
        predicted = torch.argmax(prediction, 1)
        total += Y_test.size(0)
        correct += (predicted == Y_test).sum().item()
print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))
Table 3: Test accuracy of CNN-Softmax and CNN-SVM on image classification using MNIST and Fashion-MNIST

Dataset | CNN-Softmax | CNN-SVM |
---|---|---|
MNIST | 98.47% | 98.77% |
Fashion-MNIST | 88.13% | 87.84% |
These results add weight to the case for further methodological refinements to validate the CNN-SVM findings presented in "Deep Learning using Linear Support Vector Machines."
Although they contradict the findings of "Deep Learning using Linear Support Vector Machines," quantitatively speaking, the test accuracies of CNN-Softmax and CNN-SVM here are nearly identical to those of the related work.
Therefore, with additional data preprocessing and a comparatively more refined base CNN model, those results should be reproducible.