DeepQ Generalization
The fourth step in our project was DeepQ generalization, which builds directly on our DeepQ implementation.
Generalization refers to a model's ability to adapt to novel data not seen during training. In reinforcement learning, this happens when an agent applies a policy learned in one set of environments to a different deployment environment. In our case, Sonic is trained on several levels, and the agent then attempts to use that policy to conquer an unseen level. While this can decrease overall performance across environments, it is vital for developing viable agents for real-world applications.
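As a rough illustration of this setup, the sketch below samples a fresh training level each episode while holding one level out for evaluation. This is not our project code; the level split and helper names are illustrative, and it assumes a standard Gym Retro installation with the Sonic integration.

import random
import retro

# Illustrative level split: train on these states, evaluate on a held-out one.
TRAIN_STATES = ["GreenHillZone.Act1", "GreenHillZone.Act2", "MarbleZone.Act1"]
EVAL_STATE = "SpringYardZone.Act1"

def make_training_env():
    # Pick a random training level each episode so the policy cannot overfit to one layout.
    return retro.make(game="SonicTheHedgehog-Genesis", state=random.choice(TRAIN_STATES))

def make_eval_env():
    # The evaluation level is never seen during training.
    return retro.make(game="SonicTheHedgehog-Genesis", state=EVAL_STATE)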
Generalized Runs
This is Sonic's first attempt at Level 2 after generalization training:
As you can see, Sonic has a lot of problems when faced with a loop and has not figured out how to dodge enemies he hasn't seen before.
This is Sonic's first attempt at Level 4 after generalization training:
As you can see, Sonic has a lot of problems with both the lava and the moving platforms, neither of which he has seen before.
Our Code:
DeepQ Model Class:
##---------------Sources-------------------------##
# DeepQ Learning with PyTorch: https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
# DeepQ for OpenAI Atari environments: https://github.com/deepanshut041/Reinforcement-Learning
##------------------------------------------------##

import os
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F

script_dir = os.path.dirname(os.path.abspath(__file__))
project_dir = os.path.abspath(script_dir + "/../..")


class DQN(nn.Module):
    def __init__(self, input_shape, num_actions, seed=0):
        super(DQN, self).__init__()
        self.input_shape = input_shape
        self.num_actions = num_actions
        self.seed = seed

        # TODO DM Changed kernel and stride to better fit our standard 28 x 40 size.
        # Currently set up to take Atari 84x84 images.
        self.features = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        self.fc = nn.Sequential(
            nn.Linear(self.feature_size(), 512),
            nn.ReLU(),
            nn.Linear(512, self.num_actions)
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

    def feature_size(self):
        # Pass a dummy observation through the conv stack to size the first linear layer.
        return self.features(torch.zeros(1, *self.input_shape)).view(1, -1).size(1)
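For reference, a minimal smoke test of the model (not part of the project code), assuming the (4, 84, 84) stacked-frame input and 7-action space used by the agent below:

import torch

# Hypothetical quick check: one batch of 4 stacked 84x84 frames in, 7 Q-values out.
model = DQN(input_shape=(4, 84, 84), num_actions=7)
dummy_obs = torch.zeros(1, 4, 84, 84)
q_values = model(dummy_obs)
print(q_values.shape)  # torch.Size([1, 7])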
DeepQ Agent Class:
##---------------Sources-------------------------##
# DeepQ Learning with PyTorch: https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
# DeepQ Image Processing for GymRetro: https://github.com/deepanshut041/Reinforcement-Learning
##------------------------------------------------##

import sys
import os

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
import random

script_dir = os.path.dirname(os.path.abspath(__file__))
project_dir = os.path.abspath(script_dir + "/../..")

sys.path.append(os.path.abspath(project_dir + '/source/agents'))
sys.path.append(os.path.abspath(project_dir + '/source/interface'))
sys.path.append(os.path.abspath(project_dir + '/source/learning'))
sys.path.append(os.path.abspath(project_dir + '/source/models'))

from agent_base import *
from deep_q_buffer import *
from train_deep_q import *
from deep_q_model import DQN
from datetime import datetime, date
from action_space import *

# DeepQ agent.
class DeepQ(AgentBase):

    # TODO Empty Constructor
    def __init__(self, model=None):
        """Initialize an Agent object.

        Params
        ======
            input_shape (tuple): dimension of each state (C, H, W)
            action_size (int): dimension of each action
            seed (int): random seed
            device (string): use GPU or CPU
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            lr (float): learning rate
            update_every (int): how often to update the network
            replay_after (int): number of experiences collected before replay starts
            model (Model): PyTorch model
        """
        input_shape = (4, 84, 84)  # 4 stacked 84 x 84 frames
        self.action_size = 7  # len(possible_actions)
        self.seed = 0  # random.seed(seed)
        self.buffer_size = 100000
        self.batch_size = 32
        self.gamma = 0.99
        self.lr = 0.00001
        self.update_every = 100
        self.replay_after = 10000
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.DQN = DQN
        self.tau = 1e-3

        # Q-Network
        self.policy_net = self.DQN(input_shape, self.action_size, self.seed).to(self.device)
        if model is not None:
            os.chdir(script_dir)
            os.chdir('..')
            os.chdir('..')
            root = os.getcwd()
            checkpoint = torch.load(model, map_location=self.device)

            self.policy_net.load_state_dict(checkpoint['model_state_dict'])

            #self.policy_net.load_state_dict(torch.load(os.path.join(root, model), map_location=self.device), strict=False)
        self.target_net = self.DQN(input_shape, self.action_size, self.seed).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, self.seed, self.device)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every

        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > self.replay_after:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.03):
        """Returns an action for the given state as per the current policy."""
        state = torch.from_numpy(state).unsqueeze(0).to(self.device)
        self.policy_net.eval()
        with torch.no_grad():
            action_values = self.policy_net(state)
        self.policy_net.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # Get expected Q values from the policy model
        Q_expected_current = self.policy_net(states)
        Q_expected = Q_expected_current.gather(1, actions.unsqueeze(1)).squeeze(1)

        # Get max predicted Q values (for next states) from the target model
        Q_targets_next = self.target_net(next_states).detach().max(1)[0]

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.soft_update(self.policy_net, self.target_net, self.tau)

    def soft_update(self, policy_model, target_model, tau):
        # Blend a small fraction of the policy weights into the target network.
        for target_param, policy_param in zip(target_model.parameters(), policy_model.parameters()):
            target_param.data.copy_(tau * policy_param.data + (1.0 - tau) * target_param.data)

    def load(self, filename):
        pass

    def save(self, filename, epoch):
        torch.save({
            'epoch': epoch,
            'model_state_dict': self.policy_net.state_dict(),
        }, filename)

    def train(self, env, n_episodes, reward_system, render, ckpt, save_rate):
        DeepQTrainer.train(self, env, n_episodes, reward_system, render, ckpt, save_rate)

    def decide(self, ob, info) -> list:
        # Keep a running stack of the most recent frames between calls.
        if hasattr(self, '_prev_state'):
            self._prev_state = DeepQTrainer.stack_frames(self._prev_state, ob, False)
        else:
            self._prev_state = DeepQTrainer.stack_frames(None, ob, True)

        move = self.act(self._prev_state)

        return ActionSpace.move(move)

    # Returns the name of the agent as a string
    def name(self) -> str:
        return "DeepQ"

    # Moves data from current memory to 'device' memory
    # ex: agent.to(torch.device("cuda")) will move neural network data to GPU memory.
    # If successful, all operations on this NN will be executed on that device (CPU or GPU).
    # Internal fields will be moved. The object itself does not need to be reassigned like tensors do.
    def to(self, device) -> None:
        self.policy_net = self.policy_net.to(device)
        self.target_net = self.target_net.to(device)
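As a quick sanity check of the target computed in learn() above, here is the same Bellman update worked out with made-up numbers (not from a real run):

# Toy values: reward 1.0, discount 0.99, best next-state Q-value from the target net 2.5,
# and the episode is not done.
gamma = 0.99
reward = 1.0
q_next_max = 2.5
done = 0
q_target = reward + gamma * q_next_max * (1 - done)
print(q_target)  # 3.475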
DeepQ Training Class:
##---------------Sources-------------------------##
# DeepQ Learning with PyTorch: https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
# DeepQ Image Processing for GymRetro: https://github.com/deepanshut041/Reinforcement-Learning
# Helper Functions for Gym Retro: https://github.com/moversti/sonicNEAT
##------------------------------------------------##

import time
import retro
import random
import torch
import numpy as np
from collections import deque
import math
import os
import sys
from datetime import datetime, date

script_dir = os.path.dirname(os.path.abspath(__file__))
project_dir = os.path.abspath(script_dir + "/../..")

sys.path.append(os.path.abspath(project_dir + '/source/agents'))
sys.path.append(os.path.abspath(project_dir + '/source/interface'))
sys.path.append(os.path.abspath(project_dir + '/source/learning'))
sys.path.append(os.path.abspath(project_dir + '/source/models'))
sys.path.append(os.path.abspath(project_dir + '/source/vision'))

from all_agents import *
from checkpoint import *
from deep_q_agent import *
from deep_q_model import DQN
from image_processing import preprocess_frame, stack_frame
from action_space import *
from greyImageViewer import GreyImageViewer
from controllerViewer import ControllerViewer
from reward_system import *


class DeepQTrainer:
    @staticmethod
    def stack_frames(frames, state, is_new=False):
        """Stacks frames for a broader input of the environment."""
        frame = preprocess_frame(state)
        frames = stack_frame(frames, frame, is_new)
        return frames

    @staticmethod
    def train(agent, env, n_episodes=1000, reward_system=RewardSystem.Contest, render=False, ckpt=None, save_rate=10):
        """
        Params
        ======
            n_episodes (int): maximum number of training episodes
        """
        # If a GPU is available, use it
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print("Device: ", device)

        UPDATE_TARGET = 10000  # threshold after which replay is to be started
        EPS_START = 0.99       # starting value of epsilon
        EPS_END = 0.01         # ending value of epsilon
        EPS_DECAY = 100        # rate at which epsilon decays

        # Initialize training bookkeeping
        start_epoch = 0
        scores = []
        best_ckpt_score = -9999999  # initialize to a very small number
        scores_window = deque(maxlen=20)

        # Initialize checkpoint
        ckpt = Checkpoint(agent)
        ckpt.make_dir()  # Makes a new directory if it does not exist

        epsilon_by_episode = lambda frame_idx: EPS_END + (EPS_START - EPS_END) * math.exp(-1. * frame_idx / EPS_DECAY)
        for i_episode in range(start_epoch + 1, n_episodes + 1):
            state = DeepQTrainer.stack_frames(None, env.reset(), True)
            next_state, reward, done, info = env.step(ActionSpace.stand_still())  # make a passive move to initialize data

            score = 0
            eps = epsilon_by_episode(i_episode)
            reward_system.init(info)

            # Punish the agent for not moving forward
            prev_state = {}
            steps_stuck = 0
            timestamp = 0

            while timestamp < 5000:
                action = agent.act(state, eps)
                next_state, reward, done, info = env.step(ActionSpace.move(action))
                reward = reward_system.calc_reward(info, ActionSpace.move(action))

                if render is True:
                    env.render()

                score += reward

                timestamp += 1

                # Punish the agent for standing still for too long.
                if (prev_state == info):
                    steps_stuck += 1
                else:
                    steps_stuck = 0
                prev_state = info

                if (steps_stuck > 20):
                    reward -= 1

                next_state = DeepQTrainer.stack_frames(state, next_state, False)
                agent.step(state, action, reward, next_state, done)
                state = next_state
                if done:
                    break

            scores_window.append(score)  # save most recent score
            scores.append(score)         # save most recent score
            print("epoch:", i_episode, "score:", score)

            if (i_episode % save_rate == 0 and score > best_ckpt_score):
                ckpt.epoch = i_episode
                ckpt.score = score
                best_ckpt_score = score
                fn = ckpt.generate_path()
                agent.save(fn, i_episode)
                print(f"Saving checkpoint with new best score {best_ckpt_score}")

        return scores
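For intuition on how exploration tapers off, the epsilon schedule used in train() can be evaluated at a few episode counts. The snippet below is only an illustration of that schedule; the printed values are approximate.

import math

EPS_START, EPS_END, EPS_DECAY = 0.99, 0.01, 100
epsilon_by_episode = lambda ep: EPS_END + (EPS_START - EPS_END) * math.exp(-1. * ep / EPS_DECAY)

for ep in (1, 50, 100, 500):
    print(ep, round(epsilon_by_episode(ep), 3))
# approximately: 1 -> 0.980, 50 -> 0.604, 100 -> 0.371, 500 -> 0.017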