# 9일차 요약 (Day 9 summary)
# - 오전: 강화학습 이론 및 back-propagation 실습
# - 오후: 강화학습 실습
# - MDP Grid World 실습
import math
import torch
from torch.autograd import Variable
# Minimize f(x, y) = 2x^2 + y^2 + exp(4xy) with Adam, starting from (1, 1).
# Plain tensors with requires_grad=True replace the deprecated Variable wrapper.
x = torch.tensor(1., dtype=torch.float32, requires_grad=True)
y = torch.tensor(1., dtype=torch.float32, requires_grad=True)

optimizer = torch.optim.Adam(params=[x, y], lr=0.01)

EPOCHS = 100
for epoch in range(EPOCHS):
    # NOTE: torch.exp (not math.exp) keeps the exponential term inside the
    # autograd graph; math.exp(tensor) converts the tensor to a Python float,
    # so the gradient contribution of that term was silently dropped.
    f = 2 * (x ** 2) + (y ** 2) + torch.exp(4 * x * y)
    optimizer.zero_grad()
    f.backward()
    optimizer.step()

print(x, y)
print(f)  # loss from the last iteration (before the final parameter update)

# Transition matrix P 생성 (build the state-transition tensor P)
import numpy as np
# Transition tensor P[a, s, s']: probability of landing in state s' when
# taking action a in state s, for a 12-state grid world (actions indexed
# up/down/left/right).  Rows 10 and 11 map to themselves under every action,
# so they appear to be terminal/absorbing states — TODO confirm grid layout.
P = np.array([
# up
# 0 1 2 3 4 5 6 7 8 9 10 11
[ [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]],
# down
[ [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]],
# left
[ [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]],
# right
[ [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]], dtype='float32')
P.shape  # (4, 12, 12); bare expression — a no-op here, leftover from a notebook cell
P = P.transpose(1, 0, 2)  # -> (12, 4, 12): reindex as P[s, a, s']
# Reward matrix
# Reward matrix, built as R[a, s]: immediate reward for taking action a in
# state s (-1 per ordinary move; +1 on the transitions marked below; state 11
# always yields 0 — consistent with it being absorbing).
R = np.array([
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 0], # up
[-1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, 0], # down
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0], # left
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0]], #right
dtype='float32')
R = R.transpose()  # -> (12, 4): R[s, a], matching P's (state, action, ...) layout
# MDP: policy iteration (iterative policy evaluation + greedy improvement)
pi = np.ones((12, 4), dtype='float32') * 0.25
def policy_eval(P, R, pi, maxiter=30, gamma=0.6):
    """Iterative policy evaluation for a finite MDP.

    Parameters
    ----------
    P : ndarray, shape (S, A, S)
        Transition probabilities, P[s, a, s'].
    R : ndarray, shape (S, A)
        Immediate reward R[s, a].
    pi : ndarray, shape (S, A)
        Policy: probability of taking action a in state s.
    maxiter : int
        Applies maxiter - 1 Bellman backups (kept for backward compatibility
        with the original loop bounds).
    gamma : float
        Discount factor; the default 0.6 reproduces the original hard-coded value.

    Returns
    -------
    ndarray, shape (S,)
        State-value estimate V under policy pi.
    """
    # Sizes come from P instead of the hard-coded 12; only the current V is
    # kept (the original stored the full (maxiter, 12) history but returned
    # only the last row).
    n_states = P.shape[0]
    V = np.zeros(n_states, dtype='float32')
    for _ in range(maxiter - 1):
        # Q[s, a] = R[s, a] + gamma * sum_{s'} P[s, a, s'] * V[s']
        Q = R + gamma * np.dot(P, V)
        # V[s] = sum_a pi[s, a] * Q[s, a]  (replaces expand_dims/matmul/squeeze)
        V = np.sum(pi * Q, axis=1)
    return V
def policy_upd(P, R, v, gamma=0.6):
    """Greedy policy improvement step.

    Returns a deterministic (one-hot) policy pi of shape (S, A) that picks,
    in each state, the action maximizing the one-step lookahead
    Q[s, a] = R[s, a] + gamma * sum_{s'} P[s, a, s'] * v[s'].
    Ties go to the lowest action index (np.argmax behavior).
    """
    # Compute Q once (the original evaluated the same expression twice: once
    # for a leftover debug print, once for the argmax).  The original
    # squeeze(expand_dims(..., 2)) pair was a no-op and is dropped.
    Q = R + gamma * np.dot(P, v)          # shape (S, A)
    best = np.argmax(Q, axis=1)           # greedy action per state
    n_states, n_actions = Q.shape         # sizes from data, not hard-coded (12, 4)
    pi = np.zeros((n_states, n_actions), dtype='float32')
    pi[np.arange(n_states), best] = 1.
    return pi
# Policy iteration: alternate policy evaluation and greedy improvement until
# the policy stops changing.  The original convergence test used
# np.equal(None, pi), which relies on deprecated None-vs-array comparison
# semantics; an explicit None check plus np.array_equal is well defined.
pi_old = None
pi = np.ones((12, 4), dtype='float32') * 0.25  # start from the uniform random policy
while pi_old is None or not np.array_equal(pi_old, pi):
    pi_old = pi.copy()
    v = policy_eval(P, R, pi)
    pi = policy_upd(P, R, v)
    print(pi)
# 결과값 (resulting greedy policy printed per iteration above)
