DeepQ Implementation
The third step in our project was to create a DeepQ implementation. DeepQ is a form of reinforcement learning. A reinforcement learning task trains an agent to interact with an environment: the agent arrives at different scenarios, known as states, by performing actions, and actions lead to rewards, which can be positive or negative. Suppose we knew the expected reward of every action at every step. That would essentially be a cheat sheet for the agent! It would know exactly which action to perform and could follow the sequence of actions that eventually generates the maximum total reward. This expected total reward for taking an action in a given state is called the Q-value. Q-values are defined by a recursive equation known as the Bellman Equation, which we won't derive in detail here. Essentially, you estimate the value of the current action from its immediate reward plus the rewards obtainable from the possible states at the next time step. If you apply this update iteratively, you have Q-Learning!
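To make that iterative idea concrete, here is a minimal sketch of the one-step tabular Q-learning update. This is illustrative only, not part of our project code; the table Q, the learning rate alpha, and the discount gamma are assumed names for the example:

import numpy as np

num_states, num_actions = 16, 4          # assumed sizes for illustration
alpha, gamma = 0.1, 0.99                 # learning rate and discount factor
Q = np.zeros((num_states, num_actions))  # Q-table: expected total reward per (state, action)

def q_learning_update(state, action, reward, next_state):
    # Bellman-style target: immediate reward plus discounted best Q at the next state
    target = reward + gamma * np.max(Q[next_state])
    # Move the current estimate a small step toward the target
    Q[state, action] += alpha * (target - Q[state, action])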
Deep Q takes this a step further by using a neural network to estimate the Q-values of all possible actions for an input state in a single forward pass. The network is typically several convolutional layers that process the input images, followed by several fully connected layers that map the extracted features to a Q-value for each possible action. The agent picks the action with the maximum predicted Q-value. After taking the action, a target Q-value is formed from the reward actually received plus the discounted best Q-value of the next state. Through backpropagation, we minimize the loss between the estimated Q-value and this target Q-value. This is training! Eventually, our agent learns the appropriate action to take in its current state, the one resulting in the greatest reward!
Source: Analytics Vidhya
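To make the training step concrete, here is a self-contained toy sketch of one DQN update. A tiny linear network and random data stand in for our real model and replay batch; it mirrors what the learn() method in our agent class below does:

import torch
import torch.nn as nn
import torch.nn.functional as F

# Toy networks and a fake batch, just to show the arithmetic of one DQN update.
policy_net = nn.Linear(4, 2)      # maps a 4-dim "state" to Q values for 2 actions
target_net = nn.Linear(4, 2)
optimizer = torch.optim.Adam(policy_net.parameters(), lr=1e-3)
gamma = 0.99

states      = torch.randn(8, 4)                   # batch of 8 states
actions     = torch.randint(0, 2, (8,))           # actions that were taken
rewards     = torch.randn(8)                      # rewards received
next_states = torch.randn(8, 4)
dones       = torch.zeros(8)                      # 1.0 where the episode ended

q_expected = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
q_target   = rewards + gamma * target_net(next_states).detach().max(1)[0] * (1 - dones)

loss = F.mse_loss(q_expected, q_target)           # minimize estimated-vs-target gap
optimizer.zero_grad()
loss.backward()
optimizer.step()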
Our Code:
DeepQ Model Class:
##---------------Sources-------------------------##
# DeepQ Learning with PyTorch: https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
# DeepQ for OpenAI Atari environments: https://github.com/deepanshut041/Reinforcement-Learning
##------------------------------------------------##

import os
import sys
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.nn.functional as F

script_dir = os.path.dirname(os.path.abspath(__file__))
project_dir = os.path.abspath(script_dir + "/../..")

class DQN(nn.Module):
    def __init__(self, input_shape, num_actions, seed=0):
        super(DQN, self).__init__()
        self.input_shape = input_shape
        self.num_actions = num_actions
        self.seed = seed

        # TODO DM Changed kernel and stride to better fit our standard 28 x 40 size.
        # Currently setup to take Atari 84x84 images
        self.features = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        self.fc = nn.Sequential(
            nn.Linear(self.feature_size(), 512),
            nn.ReLU(),
            nn.Linear(512, self.num_actions)
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

    def feature_size(self):
        return self.features(autograd.Variable(torch.zeros(1, *self.input_shape))).view(1, -1).size(1)
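As a quick sanity check (not part of the project files), the model can be instantiated and run on a dummy batch. The input shape and action count below are the defaults our agent uses, and the import assumes the class above is saved as deep_q_model.py, as our agent imports it:

import torch
from deep_q_model import DQN

model = DQN(input_shape=(4, 84, 84), num_actions=7)
dummy = torch.zeros(1, 4, 84, 84)   # a batch of one stacked observation: four 84x84 frames
q_values = model(dummy)
print(q_values.shape)               # expected: torch.Size([1, 7]) -- one Q value per action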
DeepQ Agent Class:
##---------------Sources-------------------------##
# DeepQ Learning with PyTorch: https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
# DeepQ Image Processing for GymRetro: https://github.com/deepanshut041/Reinforcement-Learning
##------------------------------------------------##

import sys
import os

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
import random

script_dir = os.path.dirname(os.path.abspath(__file__))
project_dir = os.path.abspath(script_dir + "/../..")

sys.path.append(os.path.abspath(project_dir + '/source/agents'))
sys.path.append(os.path.abspath(project_dir + '/source/interface'))
sys.path.append(os.path.abspath(project_dir + '/source/learning'))
sys.path.append(os.path.abspath(project_dir + '/source/models'))

from agent_base import *
from deep_q_buffer import *
from train_deep_q import *
from deep_q_model import DQN
from datetime import datetime, date
from action_space import *

# DeepQ Neural Network.
class DeepQ(AgentBase):

    # TODO Empty Constructor
    def __init__(self, model=None):
        """Initialize an Agent object.

        Params
        ======
            input_shape (tuple): dimension of each state (C, H, W)
            action_size (int): dimension of each action
            seed (int): random seed
            device (string): use GPU or CPU
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            lr (float): learning rate
            update_every (int): how often to update the network
            replay_after (int): after how many stored experiences replay starts
            model (Model): PyTorch model
        """
        input_shape = (4, 84, 84)  # stacked frames (channels) x H x W
        self.action_size = 7       # len(possible_actions)
        self.seed = 0              # random.seed(seed)
        self.buffer_size = 100000
        self.batch_size = 32
        self.gamma = 0.99
        self.lr = 0.00001
        self.update_every = 100
        self.replay_after = 10000
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.DQN = DQN
        self.tau = 1e-3

        # Q-Network
        self.policy_net = self.DQN(input_shape, self.action_size, self.seed).to(self.device)
        if model is not None:
            os.chdir(script_dir)
            os.chdir('..')
            os.chdir('..')
            root = os.getcwd()
            checkpoint = torch.load(model, map_location=self.device)
            self.policy_net.load_state_dict(checkpoint['model_state_dict'])
            #self.policy_net.load_state_dict(torch.load(os.path.join(root, model), map_location=self.device), strict=False)
        self.target_net = self.DQN(input_shape, self.action_size, self.seed).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, self.seed, self.device)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every

        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > self.replay_after:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.):
        """Returns actions for the given state as per the current policy."""
        state = torch.from_numpy(state).unsqueeze(0).to(self.device)
        self.policy_net.eval()
        with torch.no_grad():
            action_values = self.policy_net(state)
        self.policy_net.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # Get expected Q values from the policy model
        Q_expected_current = self.policy_net(states)
        Q_expected = Q_expected_current.gather(1, actions.unsqueeze(1)).squeeze(1)

        # Get max predicted Q values (for next states) from the target model
        Q_targets_next = self.target_net(next_states).detach().max(1)[0]

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.soft_update(self.policy_net, self.target_net, self.tau)

    # Slowly blends the policy network's weights into the target network.
    def soft_update(self, policy_model, target_model, tau):
        for target_param, policy_param in zip(target_model.parameters(), policy_model.parameters()):
            target_param.data.copy_(tau*policy_param.data + (1.0-tau)*target_param.data)

    def load(self, filename):
        pass

    def save(self, filename, epoch):
        torch.save({
            'epoch': epoch,
            'model_state_dict': self.policy_net.state_dict(),
        }, filename)

    def train(self, env, n_episodes, reward_system, render, ckpt, save_rate):
        DeepQTrainer.train(self, env, n_episodes, reward_system, render, ckpt, save_rate)

    def decide(self, ob, info) -> list:
        # Quick Fix: keep the previous stacked frames between calls.
        # (Single-underscore name so hasattr() can find it; a double-underscore name gets mangled.)
        if hasattr(self, '_prev_state'):
            self._prev_state = DeepQTrainer.stack_frames(self._prev_state, ob, False)
        else:
            self._prev_state = DeepQTrainer.stack_frames(None, ob, True)

        move = self.act(self._prev_state)

        return ActionSpace.move(move)

    # Returns name of agent as a string
    def name(self) -> str:
        return "DeepQ"

    # Moves data from current memory to 'device' memory
    # ex: agent.to(torch.device("cuda")) will move neural network data to GPU memory.
    # If successful, all operations on this NN will be executed on that device (CPU or GPU).
    # Internal fields will be moved. The object itself does not need to be reassigned like tensors do.
    def to(self, device) -> None:
        self.policy_net = self.policy_net.to(device)
        self.target_net = self.target_net.to(device)
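For reference, a single epsilon-greedy decision might look like the sketch below. It assumes the project modules added to sys.path above (agent_base, deep_q_buffer, etc.) are importable and that the agent file is saved as deep_q_agent.py, as the trainer imports it:

import numpy as np
from deep_q_agent import DeepQ

agent = DeepQ()                                    # or DeepQ(model='path/to/checkpoint.pth') to load weights
state = np.zeros((4, 84, 84), dtype=np.float32)    # placeholder for four stacked, preprocessed frames
action_index = agent.act(state, eps=0.05)          # 5% chance of a random action, otherwise argmax Q
print("chosen action index:", action_index)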
DeepQ Trainer Class:
##---------------Sources-------------------------##
# DeepQ Learning with PyTorch: https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
# DeepQ Image Processing for GymRetro: https://github.com/deepanshut041/Reinforcement-Learning
# Helper Functions for Gym Retro: https://github.com/moversti/sonicNEAT
##------------------------------------------------##

import time
import retro
import random
import torch
import numpy as np
from collections import deque
import math
import os
import sys
from datetime import datetime, date

script_dir = os.path.dirname(os.path.abspath(__file__))
project_dir = os.path.abspath(script_dir + "/../..")

sys.path.append(os.path.abspath(project_dir + '/source/agents'))
sys.path.append(os.path.abspath(project_dir + '/source/interface'))
sys.path.append(os.path.abspath(project_dir + '/source/learning'))
sys.path.append(os.path.abspath(project_dir + '/source/models'))
sys.path.append(os.path.abspath(project_dir + '/source/vision'))

from all_agents import *
from checkpoint import *
from deep_q_agent import *
from deep_q_model import DQN
from image_processing import preprocess_frame, stack_frame
from action_space import *
from greyImageViewer import GreyImageViewer
from controllerViewer import ControllerViewer
from reward_system import *

class DeepQTrainer:
    @staticmethod
    def stack_frames(frames, state, is_new=False):
        """Stacks frames for broader input of environment."""
        frame = preprocess_frame(state)
        frames = stack_frame(frames, frame, is_new)
        return frames

    @staticmethod
    def train(agent, env, n_episodes=1000, reward_system=RewardSystem.Contest, render=False, ckpt=None, save_rate=10):
        """
        Params
        ======
            n_episodes (int): maximum number of training episodes
        """
        # If a GPU is available, use it
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print("Device: ", device)

        UPDATE_TARGET = 10000  # threshold after which replay is to be started
        EPS_START = 0.99       # starting value of epsilon
        EPS_END = 0.01         # ending value of epsilon
        EPS_DECAY = 100        # rate by which epsilon decays

        # Initialize agent
        start_epoch = 0
        scores = []
        max_score = -9999999   # initialize to a very small number
        scores_window = deque(maxlen=20)

        # Initialize checkpoint
        ckpt = Checkpoint(agent)
        ckpt.make_dir()  # Makes a new directory if it does not exist

        epsilon_by_episode = lambda frame_idx: EPS_END + (EPS_START - EPS_END) * math.exp(-1. * frame_idx / EPS_DECAY)

        for i_episode in range(start_epoch + 1, n_episodes + 1):
            state = DeepQTrainer.stack_frames(None, env.reset(), True)
            next_state, reward, done, info = env.step(ActionSpace.stand_still())  # make a passive move to initialize data

            score = 0
            eps = epsilon_by_episode(i_episode)
            reward_system.init(info)

            # Punish the agent for not moving forward
            prev_state = {}
            steps_stuck = 0
            timestamp = 0

            while timestamp < 5000:
                action = agent.act(state, eps)
                next_state, reward, done, info = env.step(ActionSpace.move(action))
                reward = reward_system.calc_reward(info, ActionSpace.move(action))

                if render is True:
                    env.render()

                score += reward
                timestamp += 1

                # Punish the agent for standing still for too long.
                if (prev_state == info):
                    steps_stuck += 1
                else:
                    steps_stuck = 0
                prev_state = info

                if (steps_stuck > 20):
                    reward -= 1

                next_state = DeepQTrainer.stack_frames(state, next_state, False)
                agent.step(state, action, reward, next_state, done)
                state = next_state
                if done:
                    break

            scores_window.append(score)  # save most recent score
            scores.append(score)         # save most recent score
            print("epoch:", i_episode, "score:", score)

            if score > max_score:
                max_score = score

            if (i_episode % save_rate == 0 and max_score == score):
                ckpt.epoch = i_episode
                ckpt.score = score
                fn = ckpt.generate_path()
                agent.save(fn, i_episode)
                print("Saving checkpoint to '", fn, "'. New best reward found.", sep='')

        return scores
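Finally, a training run might be launched roughly as follows. This is a sketch, not a file from our repository: the game id is just the sample ROM that ships with Gym Retro (ours differs), and it assumes the project modules above are on the path:

import retro
from deep_q_agent import DeepQ
from reward_system import RewardSystem   # provides RewardSystem.Contest, the trainer's default

env = retro.make(game='Airstriker-Genesis')   # sample Gym Retro game, used here only for illustration
agent = DeepQ()
agent.train(env, n_episodes=1000, reward_system=RewardSystem.Contest,
            render=False, ckpt=None, save_rate=10)
env.close()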