[PyTorch] 파이토치 전체 구조

지윤곽·2022년 3월 16일

PyTorch 네이버 부스트캠프

Reference

torch.nn.Module

딥러닝을 구성하는 layer의 base class
총 4가지 정의 : Input, Output, Forward, Backward(Autograd:자동미분) 정의
parameter(tensor:weights) 정의 - 학습 대상

Untitled

nn.Parameter

Tensor 객체의 상속 객체
nn.Module 내에 requires_grad = True 로 지정되어 학습의 대상이 되는 Tensor
수작업으로 지정하는 일이 없음 - layer안에 weight값들이 지정되어 있음 (Linear .... )

nn.Parameter는 low-level의 api 이기 때문에 실제 학습에서는 잘 사용되지 않는다

import torch
from torch import nn
from torch import Tensor

class MyLinear(nn.Module):
    """Minimal fully connected layer computing ``x @ weights + bias``.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        bias: When False, no bias parameter is created and none is added in
            ``forward``.
    """

    def __init__(self, in_features, out_features, bias = True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Wrapping a tensor in nn.Parameter registers it with the module, so
        # it shows up in parameters() and is tracked by autograd.
        self.weights = nn.Parameter(
            torch.randn(in_features, out_features)
        )
        if bias:
            self.bias = nn.Parameter(torch.randn(out_features))
        else:
            # Keep the attribute defined (as None) so forward() can test it.
            self.register_parameter('bias', None)

    def forward(self, x:Tensor):
        """Return ``x @ weights`` plus the bias when one is configured."""
        out = x@self.weights
        if self.bias is not None:
            out = out + self.bias
        return out

x = torch.randn(5,7) # 5 samples with 7 features each
# torch.randn returns a 5x7 tensor of standard-normal random numbers (the input x)
layer = MyLinear(7,12) 
# the layer holds a 7x12 weight tensor
print(layer(x).shape) 
# Q. Why does calling the object with an input run forward()?
# A. nn.Module implements __call__, which dispatches to the forward() method
#    defined by the subclass, so layer(x) ends up executing MyLinear.forward(x).

# The registered parameters are what gets differentiated during backpropagation
for value in layer.parameters():
    print(value)

# Parameter containing: --> weight
# tensor([[-1.6870e+00, -1.6341e+00, -1.1871e-01, -1.3332e-01,  4.0530e-01,
#           1.4942e+00,  1.2132e-01,  3.0088e-01,  8.7858e-01,  1.2331e+00,
#          -6.3245e-01,  1.3270e+00],
#         [-1.6745e+00, -7.8123e-01,  3.2348e-01, -4.5228e-01, -4.1481e-01,
#           2.0434e-01, -1.9098e+00, -3.1609e-01, -1.8538e+00,  5.9098e-02,
#           8.3901e-03, -1.6366e+00],
#         [-3.3777e-01,  1.7517e-01, -5.5396e-01, -2.0859e+00,  1.6113e-03,
#           9.9615e-01, -9.9338e-01, -2.5963e-01,  7.4590e-01,  8.2320e-02,
#           4.4123e-01,  3.2984e-01],
#         [-1.0102e+00, -8.6517e-01,  5.9489e-01,  2.8615e+00,  3.1414e+00,
#          -4.0385e-01, -3.4812e-01, -2.2696e-01,  1.0443e-01,  4.4085e-01,
#          -1.7535e+00,  8.2616e-01],
#         [ 7.5014e-01, -6.0464e-01, -3.4773e-01, -5.2418e-01, -9.8451e-01,
#          -8.0645e-01, -1.1786e+00,  9.0820e-02, -4.9526e-01,  5.9909e-01,
#           9.5106e-01, -1.7665e+00],
#         [ 9.5683e-01, -1.2164e+00, -1.8714e+00, -1.5876e+00, -1.0724e+00,
#          -2.6991e-02,  1.0431e+00,  1.1481e+00, -1.0769e+00,  2.5158e+00,
#          -6.1803e-01,  5.1032e-01],
#         [-1.1782e+00, -2.8835e+00,  2.3490e-02, -1.6397e+00, -1.6549e+00,
#          -5.8763e-01, -2.0034e+00, -1.0103e+00,  1.1630e+00, -1.9497e+00,
#           1.1685e-01, -3.7031e-01]], requires_grad=True)
# Parameter containing: --> bias
# tensor([ 1.2938, -0.9338, -0.3758,  0.2563, -0.1678, -0.0947,  1.4653, -2.4409,
#          0.9019, -0.1023, -0.9561, -0.0532], requires_grad=True)

💌 왜 굳이 Parameter라는 클래스를 사용할까?

Q. 
생각해보면 W, b도 tensor를 이용하면 되는 것 아닌가요? 
왜 굳이 Parameter라는 별개의 클래스를 사용하는 거죠?

A.
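Parameter 클래스로 감싸야 requires_grad=True 가 자동으로 지정되고 nn.Module 에 학습 파라미터로 등록되어(layer.parameters() 에 포함), autograd 가 gradient 를 계산하고 optimizer 가 값을 업데이트할 수 있기 때문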

class MyLinear(nn.Module):
    """Linear layer whose weight and bias are plain Tensors, not nn.Parameter.

    Because the tensors are not wrapped in nn.Parameter, they are not
    registered with the module and ``parameters()`` yields nothing.
    The ``bias`` flag is accepted but not consulted.
    """

    def __init__(self, in_features, out_features, bias = True):
        super().__init__()
        self.in_features, self.out_features = in_features, out_features

        # Weight and bias are deliberately stored as plain Tensors.
        initial_weights = torch.randn(in_features, out_features)
        self.weights = Tensor(initial_weights)
        initial_bias = torch.randn(out_features)
        self.bias = Tensor(initial_bias)

    def forward(self, x:Tensor):
        """Return ``x @ weights + bias``."""
        projected = torch.matmul(x, self.weights)
        return projected + self.bias

layer = MyLinear(7,12)
x = torch.randn(5,7)
layer(x).shape
# Nothing is printed here: parameters() only yields tensors registered as
# nn.Parameter, and this layer stores plain Tensors
for value in layer.parameters(): 
    print(value, '2')

⇒ 이미 layer안에 parameter가 내장되어 있기 때문에 nn.Parameter로 지정할 일이 없음

Backward

layer에 있는 parameter의 미분 수행
forward의 결과값 (model의 output(예측치))와 실제값간의 차이 (loss)에 대해 미분 수행
해당 값으로 parameter 업데이트

★★★ 기본 training code - 중요!

for epoch in range(epochs):
    optimizer.zero_grad() # reset gradients left over from the previous iteration
    outputs = model(inputs)
    loss = criterion(outputs, label) # criterion is the loss function
    loss.backward() # compute the gradient of the loss w.r.t. each parameter
    optimizer.step() # update the parameters using those gradients

backward는 module 단계에서 직접 지정 가능하나 직접 미분 수식을 구현해야 하기 때문에 대부분 쓰이지 않는다. (직접 코딩할 일은 없으나 순서는 이해)

linear regression 전체 코드 구현

1) 데이터 생성 - y = 2x+1

import numpy as np
# Inputs 0..10, stored as a float32 column vector (one sample per row).
x_values = list(range(11))
x_train = np.array(x_values, dtype=np.float32).reshape(-1, 1)

# Targets follow y = 2x + 1 and use the same column layout.
y_values = [2 * value + 1 for value in x_values]
y_train = np.array(y_values, dtype=np.float32).reshape(-1, 1)

2) 모델 선언

import torch
from torch.autograd import Variable

class LinearRegression(torch.nn.Module):
    """Single linear layer mapping ``inputSize`` features to ``outputSize`` outputs."""

    def __init__(self, inputSize, outputSize):
        super().__init__()
        # torch.nn.Linear creates the weight and bias for this layer.
        self.linear = torch.nn.Linear(in_features=inputSize, out_features=outputSize)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.linear(x)

torch.nn.Linear 코드 레벨

# Excerpt from torch.nn.Linear.__init__: weight and bias are created as Parameters.
self.weight = Parameter(torch.empty((out_features, in_features), **factory_kwargs))
if bias:
    self.bias = Parameter(torch.empty(out_features, **factory_kwargs))
else:
    self.register_parameter('bias', None)

⇒ 살펴보면 nn.Parameter가 미리 Linear layer 안에 구현되어 있음

3) 하이퍼파라미터 지정

# Model shape: one input feature mapped to one output value.
inputDim, outputDim = 1, 1
# Training settings: learning rate and number of epochs.
learningRate, epochs = 0.01, 100

model = LinearRegression(inputDim, outputDim)

# Move the model to the GPU when one is available.
if torch.cuda.is_available():
    model.cuda()

4) 손실함수, optimizer 지정

# Mean-squared-error loss, minimised with stochastic gradient descent.
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=learningRate)

5) 학습

# Convert the training arrays to tensors once: they do not change between
# epochs, so there is no need to rebuild (and re-upload) them every iteration.
# torch.autograd.Variable is deprecated; tensors work with autograd directly.
inputs = torch.from_numpy(x_train)
labels = torch.from_numpy(y_train)
if torch.cuda.is_available():
    inputs, labels = inputs.cuda(), labels.cuda()

for epoch in range(epochs):
    optimizer.zero_grad()  # clear gradients from the previous epoch
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    print(loss)
    loss.backward()  # compute gradients of the loss w.r.t. the parameters
    optimizer.step()  # apply the parameter update
    print(epoch, loss.item())

epoch1 - tensor(97.4232, grad_fn=<MseLossBackward0>)
epoch100 - tensor(0.0030, grad_fn=<MseLossBackward0>)

[loss.item()] = tensor가 1개의 원소를 가질때 스칼라 값 빼오기

6) test data 예측

with torch.no_grad(): # inference only: no gradients need to be tracked
    # Tensors are passed to the model directly; the deprecated Variable
    # wrapper (misspelled on the CUDA branch before) is not needed.
    if torch.cuda.is_available():
        predicted = model(torch.from_numpy(x_train).cuda()).cpu().numpy()
    else:
        predicted = model(torch.from_numpy(x_train)).numpy()
    print(predicted)

7) 학습 결과

predict - y_hat
[[ 0.41637066]
 [ 2.5004187 ]
 [ 4.584467  ]
 [ 6.6685147 ]
 [ 8.7525625 ]
 [10.836611  ]
 [12.920658  ]
 [15.004706  ]
 [17.088755  ]
 [19.172802  ]
 [21.256851  ]]
label - y
array([[ 1.],
       [ 3.],
       [ 5.],
       [ 7.],
       [ 9.],
       [11.],
       [13.],
       [15.],
       [17.],
       [19.],
       [21.]], dtype=float32)

0) requires_grad : 학습 파라미터 출력

# named_parameters() yields (name, parameter) pairs; printing p.name only
# showed None, so the name is taken from the module instead.
for name, p in model.named_parameters():
    if p.requires_grad:
        print(name, p.data)
# Output:
# linear.weight tensor([[2.0840]])
# linear.bias tensor([0.4164])

Logistic regression 전체 코드 구현

: backward 를 직접 구현하는 예제

forward : h_{\theta}(x)=\frac{1}{1+e^{-\theta^{T} \mathbf{x}}}

z = \theta^{T} \mathbf{x} = w.T*x

output = self.sigmoid(z)

⇒ forward : self.sigmoid(torch.mm(self.w.T, x) + self.b): w.T*x + b 에 시그모이드를 씌운것

backward : weight 미분

\frac{\partial}{\partial \theta_{j}} J(\theta)=\frac{1}{m} \sum_{i=1}^{m}\left(h_{\theta}\left(x^{i}\right)-y^{i}\right) x_{j}^{i}

self.grads["dw"] = (1/x.shape[1])*torch.mm(x, (y_hat-y).T)

self.grads["db"] = (1/x.shape[1])*torch.sum(y_hat-y): x에 1이 들어가면 bias 의 미분

optimizer : gradient update 수식

\begin{aligned}\theta_{j} &:=\theta_{j}-\alpha \frac{\partial}{\partial \theta_{j}} J(\theta) \\&:=\theta_{j}-\alpha \frac{1}{m} \sum_{i=1}^{m}\left(h_{\theta}\left(x^{i}\right)-y^{i}\right) x_{j}^{i}\end{aligned}

self.w = self.w - self.lr*self.grads["dw"]

self.b = self.b - self.lr*self.grads["db"]

class LogisticRegression(nn.Module):
    """Logistic regression with hand-written gradients instead of autograd.

    Samples are laid out column-wise: ``x`` has shape (dim, m) and the
    labels / predictions have shape (1, m), where m is the number of samples.

    Args:
        dim: Number of input features.
        lr: Learning rate as a 0-dim tensor.
        device: Device on which the weights, gradients and learning rate are
            placed. Defaults to the device ``lr`` already lives on.
    """

    def __init__(self, dim, lr=torch.scalar_tensor(0.01), device=None):
        super().__init__()
        if device is None:
            device = lr.device

        # Initial weight and bias. Gradients are computed by hand in
        # backward(), so these are plain tensors rather than nn.Parameter.
        self.w = torch.zeros(dim, 1, dtype=torch.float).to(device)
        self.b = torch.scalar_tensor(0).to(device)

        self.grads = {'dw': torch.zeros(dim, 1, dtype=torch.float).to(device),
                      'db': torch.scalar_tensor(0).to(device)}
        self.lr = lr.to(device)

    def forward(self, x):
        """Return sigmoid(w.T @ x + b), shape (1, m) for ``x`` of shape (dim, m)."""
        z = torch.mm(self.w.T, x) + self.b
        return self.sigmoid(z)

    def sigmoid(self, z):
        """Return the element-wise logistic function 1 / (1 + exp(-z))."""
        return torch.sigmoid(z)

    def backward(self, x, yhat, y):
        """Store the cross-entropy gradients w.r.t. w and b in ``self.grads``.

        Args:
            x: Inputs of shape (dim, m).
            yhat: Predictions returned by ``forward`` for ``x``.
            y: Labels with the same shape as ``yhat``.
        """
        m = x.shape[1]
        error = yhat - y
        self.grads['dw'] = (1/m)*torch.mm(x, error.T)
        self.grads['db'] = (1/m)*torch.sum(error)

    def optimize(self):
        """Take one gradient-descent step using the stored gradients."""
        self.w = self.w - self.lr * self.grads['dw']
        self.b = self.b - self.lr * self.grads['db']
        
def loss(yhat, y, eps=1e-7):
    """Return the mean binary cross-entropy between ``yhat`` and ``y``.

    Args:
        yhat: Predicted probabilities, same shape as ``y``.
        y: Labels; the number of samples is read from its second dimension.
        eps: Predictions are clamped to [eps, 1 - eps] so that a prediction of
            exactly 0 or 1 cannot produce log(0) and turn the result into
            NaN or infinity.
    """
    m = y.size()[1]
    yhat = torch.clamp(yhat, eps, 1 - eps)
    return -(1/m)*torch.sum(y*torch.log(yhat) + (1-y)*torch.log(1-yhat))

def predict(yhat, y):
    """Return the accuracy, in percent, of thresholded predictions against ``y``.

    Args:
        yhat: Predicted probabilities; only the first row is used.
        y: Labels of shape (1, m).

    Returns:
        A 0-dim tensor: 100 minus the mean absolute error (in percent) between
        the 0/1 predictions and ``y``.
    """
    # Threshold all probabilities at once (values <= 0.5 map to 0, others to 1)
    # instead of looping over elements in Python. The result is placed on y's
    # device so the subtraction below works wherever yhat was computed.
    y_prediction = (yhat[0:1] > 0.5).to(dtype=torch.float32, device=y.device)
    return 100 - torch.mean(torch.abs(y_prediction-y))*100

## hyperparams
costs = []
# NOTE(review): x_flatten and device are not defined anywhere in the visible
# code; they must exist before this point — confirm.
dim = x_flatten.shape[0]
learning_rate = torch.scalar_tensor(0.0001).to(device)
num_iterations = 100
# The model class in this file is named LogisticRegression; LR was an undefined name.
lrmodel = LogisticRegression(dim, learning_rate)
lrmodel.to(device)

## transform the data
def transform_data(x, y):
    """Transpose ``x`` and add a leading axis to ``y``.

    Returns:
        The tuple ``(x.T, y.unsqueeze(0))``.
    """
    return x.T, y.unsqueeze(0)

## training the model
# An explicit loop is used so that the updated parameters carry over from one
# iteration to the next.
# NOTE(review): next(iter(...)) builds a fresh iterator on every pass, so
# depending on what train_dataset / test_dataset are, this may return the same
# batch each time — confirm.
for i in range(num_iterations):
    x, y = next(iter(train_dataset))
    test_x, test_y = next(iter(test_dataset))
    x, y = transform_data(x, y)
    test_x, test_y = transform_data(test_x, test_y)

    # Forward pass on the training batch, then cost and training accuracy.
    yhat = lrmodel.forward(x.to(device))
    cost = loss(yhat.data.cpu(), y)
    train_pred = predict(yhat, y)

    # Manual backward pass followed by one gradient-descent step.
    lrmodel.backward(x.to(device), yhat.to(device), y.to(device))
    lrmodel.optimize()

    # Evaluate the updated model on the test batch.
    yhat_test = lrmodel.forward(test_x.to(device))
    test_pred = predict(yhat_test, test_y)

    # Every 10th iteration: record the cost and report progress.
    if i % 10 == 0:
        costs.append(cost)
        print("Cost after iteration {}: {} | Train Acc: {} | Test Acc: {}".format(
            i, cost, train_pred, test_pred))

## the trend in the context of loss
plt.plot(costs)
plt.show()

☝ super(LR,self) == super() : 파이썬 3.0부터 둘이 같은 의미를 가진다.