DQN
- 2013 published version: the learning target is computed with the same network that is being trained
- 2015 Nature published version: a separate Target Q-Network is kept and the Main Q-Network is trained against it
- Every few steps, the Target Q-Network is updated to match the Main Q-Network
Q_m(s_t, a_t) = Q_m(s_t, a_t) + η * (R_{t+1} + γ * max_a Q_t(s_{t+1}, a) − Q_m(s_t, a_t))
- Q_m : Main Q-Network
- Q_t : Target Q-Network
- The action a that maximizes the Q-value in the next state s_{t+1}, and that maximum Q-value itself, are both computed with the Target Q-Network (see the sketch below)
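As a rough PyTorch sketch of this teaching signal (the function and argument names here are illustrative, not from these notes):

import torch

def dqn_target(target_q_network, next_states, rewards, gamma=0.99):
    '''Sketch of the 2015 DQN teaching signal (illustrative names).'''
    with torch.no_grad():
        # both the maximizing action and its value come from the Target Q-Network
        max_next_q = target_q_network(next_states).max(1)[0]   # max_a Q_t(s_{t+1}, a)
    return rewards + gamma * max_next_q                        # R_{t+1} + gamma * max_a Q_t(s_{t+1}, a)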
DDQN
- Stabilized update rule
a_m = argmax_a Q_m(s_{t+1}, a)
Q_m(s_t, a_t) = Q_m(s_t, a_t) + η * (R_{t+1} + γ * Q_t(s_{t+1}, a_m) − Q_m(s_t, a_t))
- The action a_m that maximizes the Q-value in the next state s_{t+1} is obtained from the Main Q-Network,
- while the Q-value of that action is obtained from the Target Q-Network (see the sketch below)
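The same kind of sketch for DDQN (again with illustrative names) differs only in where the argmax is taken; this is what get_expected_state_action_values below implements:

import torch

def ddqn_target(main_q_network, target_q_network, next_states, rewards, gamma=0.99):
    '''Sketch of the DDQN teaching signal (illustrative names).'''
    with torch.no_grad():
        # choose a_m with the Main Q-Network ...
        a_m = main_q_network(next_states).max(1)[1].unsqueeze(1)           # argmax_a Q_m(s_{t+1}, a)
        # ... but evaluate its value with the Target Q-Network
        q_next = target_q_network(next_states).gather(1, a_m).squeeze(1)   # Q_t(s_{t+1}, a_m)
    return rewards + gamma * q_next                                        # R_{t+1} + gamma * Q_t(s_{t+1}, a_m)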
DDQN implementation
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self, n_in, n_mid, n_out):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(n_in, n_mid)
        self.fc2 = nn.Linear(n_mid, n_mid)
        self.fc3 = nn.Linear(n_mid, n_out)

    def forward(self, x):
        h1 = F.relu(self.fc1(x))
        h2 = F.relu(self.fc2(h1))
        output = self.fc3(h2)
        return output
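A quick sanity check of the network with CartPole-like dimensions (4 state variables, 2 actions; these values are assumptions, not part of the listing above):

import torch

net = Net(n_in=4, n_mid=32, n_out=2)   # hypothetical CartPole-like dimensions
dummy_state = torch.randn(1, 4)        # a batch containing one state
print(net(dummy_state).shape)          # torch.Size([1, 2]): one Q-value per action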
Brain class
import numpy as np
import random
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
BATCH_SIZE = 32
CAPACITY = 10000
GAMMA = 0.99  # discount factor used below (not shown in these notes; assumed value)
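The Brain class below also relies on Transition and ReplayMemory, which these notes do not show; a minimal sketch of what is assumed (a ring-buffer replay memory with uniform random sampling):

from collections import namedtuple
import random

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity   # maximum number of stored transitions
        self.memory = []
        self.index = 0             # position where the next transition is written

    def push(self, state, action, state_next, reward):
        '''Store a transition, overwriting the oldest one when full'''
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.index] = Transition(state, action, state_next, reward)
        self.index = (self.index + 1) % self.capacity

    def sample(self, batch_size):
        '''Sample batch_size transitions at random'''
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)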
class Brain:
    def __init__(self, num_states, num_actions):
        self.num_actions = num_actions

        # memory object for Experience Replay
        self.memory = ReplayMemory(CAPACITY)

        # build the Main and Target Q-Networks with the same architecture
        n_in, n_mid, n_out = num_states, 32, num_actions
        self.main_q_network = Net(n_in, n_mid, n_out)
        self.target_q_network = Net(n_in, n_mid, n_out)
        print(self.main_q_network)

        # only main_q_network.parameters() is given to the optimizer,
        # so the Target Q-Network is never updated by gradient steps
        self.optimizer = optim.Adam(self.main_q_network.parameters(), lr=0.0001)
    def replay(self):
        '''Learn the network's connection weights with Experience Replay'''
        # 1. do nothing while the memory holds fewer transitions than a minibatch
        if len(self.memory) < BATCH_SIZE:
            return

        # 2. create a minibatch
        self.batch, self.state_batch, self.action_batch, self.reward_batch, self.non_final_next_states \
            = self.make_minibatch()

        # 3. compute the teaching signal Q(s_t, a_t)
        self.expected_state_action_values = self.get_expected_state_action_values()

        # 4. update the connection weights
        self.update_main_q_network()
    def decide_action(self, state, episode):
        '''Decide an action from the current state'''
        # epsilon-greedy: favor the greedy action more as episodes progress
        epsilon = 0.5 * (1 / (episode + 1))

        if epsilon <= np.random.uniform(0, 1):
            self.main_q_network.eval()
            with torch.no_grad():
                action = self.main_q_network(state).max(1)[1].view(1, 1)
        else:
            # pick an action at random
            action = torch.LongTensor(
                [[random.randrange(self.num_actions)]])

        return action
    def make_minibatch(self):
        '''2. Create a minibatch'''
        # sample BATCH_SIZE transitions from the replay memory
        transitions = self.memory.sample(BATCH_SIZE)

        # convert a list of Transitions into one Transition of batched fields
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = torch.cat([s for s in batch.next_state
                                           if s is not None])

        return batch, state_batch, action_batch, reward_batch, non_final_next_states
    def get_expected_state_action_values(self):
        '''3. Compute Q(s_t, a_t) to be used as the teaching signal'''
        self.main_q_network.eval()
        self.target_q_network.eval()

        # Q-values of the actions that were actually taken, from the Main Q-Network
        self.state_action_values = self.main_q_network(
            self.state_batch).gather(1, self.action_batch)

        # mask of transitions whose next state is not terminal
        non_final_mask = torch.BoolTensor(tuple(map(lambda s: s is not None,
                                                    self.batch.next_state)))
        next_state_values = torch.zeros(BATCH_SIZE)
        a_m = torch.zeros(BATCH_SIZE).type(torch.LongTensor)

        # DDQN: the maximizing action a_m comes from the Main Q-Network ...
        a_m[non_final_mask] = self.main_q_network(
            self.non_final_next_states).detach().max(1)[1]
        a_m_non_final_next_states = a_m[non_final_mask].view(-1, 1)

        # ... and its Q-value is evaluated by the Target Q-Network
        next_state_values[non_final_mask] = self.target_q_network(
            self.non_final_next_states).gather(
            1, a_m_non_final_next_states).detach().squeeze()

        # teaching signal: R_{t+1} + gamma * Q_t(s_{t+1}, a_m)
        expected_state_action_values = self.reward_batch + GAMMA * next_state_values

        return expected_state_action_values
    def update_main_q_network(self):
        '''4. Update the connection weights'''
        self.main_q_network.train()

        # smooth L1 (Huber) loss between current and expected Q-values
        loss = F.smooth_l1_loss(self.state_action_values,
                                self.expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_q_network(self):
        '''Match the Target Q-Network to the Main Q-Network'''
        self.target_q_network.load_state_dict(self.main_q_network.state_dict())
- When configuring the optimizer, pass self.main_q_network.parameters() as the argument so that the Main Q-Network is the one being trained
- The get_expected_state_action_values method is modified for DDQN
- update_target_q_network is defined to match the Target Q-Network's connection weights to the Main Q-Network
Agent class
- At the end of an episode, update_target_q_function is called
class Agent:
    def __init__(self, num_states, num_actions):
        '''Set the number of states and actions for the task'''
        self.brain = Brain(num_states, num_actions)

    def update_q_function(self):
        '''Update the Q-function'''
        self.brain.replay()

    def get_action(self, state, episode):
        '''Decide an action'''
        action = self.brain.decide_action(state, episode)
        return action

    def memorize(self, state, action, state_next, reward):
        '''Store state, action, state_next, reward in the memory object'''
        self.brain.memory.push(state, action, state_next, reward)

    def update_target_q_function(self):
        '''Match the Target Q-Network to the Main Q-Network'''
        self.brain.update_target_q_network()
Environment class
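The listing below uses gym, ENV, NUM_EPISODES, MAX_STEPS, and a display_frames_as_gif helper that these notes do not show; an assumed setup is sketched here (the values are guesses consistent with the 195-step success threshold used below):

import gym

ENV = 'CartPole-v0'      # assumed task; the 195-step threshold below matches CartPole-v0
NUM_EPISODES = 500       # assumed upper limit on the number of episodes
MAX_STEPS = 200          # assumed upper limit on steps per episode

def display_frames_as_gif(frames):
    '''Placeholder for the helper that turns recorded frames into an animation
    (the real version typically uses matplotlib.animation); not shown in these notes.'''
    pass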
class Environment:
    def __init__(self):
        self.env = gym.make(ENV, render_mode='rgb_array')
        num_states = self.env.observation_space.shape[0]   # number of state variables
        num_actions = self.env.action_space.n               # number of actions
        self.agent = Agent(num_states, num_actions)

    def run(self):
        '''Run the task'''
        episode_10_list = np.zeros(10)   # step counts of the last 10 episodes
        complete_episodes = 0            # number of consecutive successful episodes
        episode_final = False            # flag for the final (rendered) episode
        frames = []                      # frames recorded in the final episode

        for episode in range(NUM_EPISODES):
            observation = self.env.reset()

            # reset() returns (observation, info); take the observation
            state = observation
            state = torch.from_numpy(state[0]).type(torch.FloatTensor)
            state = torch.unsqueeze(state, 0)

            for step in range(MAX_STEPS):
                if episode_final is True:
                    frames.append(self.env.render())

                action = self.agent.get_action(state, episode)

                # with the new gym API, the third return value is the 'terminated' flag
                observation_next, _, done, _, _ = self.env.step(action.item())

                if done:
                    state_next = None
                    episode_10_list = np.hstack((episode_10_list[1:], step + 1))

                    if step < 195:
                        reward = torch.FloatTensor([-1.0])  # penalty: the pole fell before 195 steps
                        complete_episodes = 0
                    else:
                        reward = torch.FloatTensor([1.0])   # reward: kept standing until the end
                        complete_episodes += 1
                else:
                    reward = torch.FloatTensor([0.0])
                    state_next = observation_next
                    state_next = torch.from_numpy(state_next).type(torch.FloatTensor)
                    state_next = torch.unsqueeze(state_next, 0)

                self.agent.memorize(state, action, state_next, reward)
                self.agent.update_q_function()
                state = state_next

                if done:
                    print('%d Episode: Finished after %d steps : average steps over the last 10 episodes = %.1lf' % (episode, step + 1, episode_10_list.mean()))
                    # update the Target Q-Network every 2 episodes
                    if episode % 2 == 0:
                        self.agent.update_target_q_function()
                    break

            if episode_final is True:
                display_frames_as_gif(frames)
                break

            if complete_episodes >= 10:
                print('Succeeded in 10 consecutive episodes')
                episode_final = True
Training finished much faster than with DQN.