DeepQ Generalization
The fourth step in our project was DeepQ generalization, which builds directly on our DeepQ implementation.
Generalization refers to a model's ability to adapt to novel data not seen during training. In reinforcement learning, this happens when an agent applies a policy learned in one set of environments to a different deployment environment. In our case, Sonic is trained on several levels, and the agent then attempts to use that policy to conquer an unseen level. While this can decrease overall performance across environments, it is vital for developing viable agents for real-world applications.
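As a rough illustration of this setup, the sketch below samples a fresh training level each episode while holding one level out for evaluation. This is not our project code; the level split and helper names are illustrative, and it assumes a standard Gym Retro installation with the Sonic integration.

import random
import retro

# Illustrative level split: train on these states, evaluate on a held-out one.
TRAIN_STATES = ["GreenHillZone.Act1", "GreenHillZone.Act2", "MarbleZone.Act1"]
EVAL_STATE = "SpringYardZone.Act1"

def make_training_env():
    # Pick a random training level each episode so the policy cannot overfit to one layout.
    return retro.make(game="SonicTheHedgehog-Genesis", state=random.choice(TRAIN_STATES))

def make_eval_env():
    # The evaluation level is never seen during training.
    return retro.make(game="SonicTheHedgehog-Genesis", state=EVAL_STATE)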
Generalized Runs
This is Sonic's first attempt at Level 2 after generalization training:
As you can see, Sonic has a lot of problems when faced with a loop and has not figured out how to dodge enemies he hasn't seen before.
This is Sonic's first attempt at Level 4 after generalization training:
As you can see, Sonic has a lot of problems with both the lava and the moving platforms, neither of which he has seen before.
Our Code:
DeepQ Model Class:
##---------------Sources-------------------------##
# DeepQ Learning with PyTorch: https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
# DeepQ for OpenAI Atari environments: https://github.com/deepanshut041/Reinforcement-Learning
##------------------------------------------------##

import os
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F

script_dir = os.path.dirname(os.path.abspath(__file__))
project_dir = os.path.abspath(script_dir + "/../..")


class DQN(nn.Module):
    def __init__(self, input_shape, num_actions, seed=0):
        super(DQN, self).__init__()
        self.input_shape = input_shape
        self.num_actions = num_actions
        self.seed = seed

        # TODO DM Changed kernel and stride to better fit our standard 28 x 40 size.
        # Currently set up to take Atari 84x84 images.
        self.features = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        self.fc = nn.Sequential(
            nn.Linear(self.feature_size(), 512),
            nn.ReLU(),
            nn.Linear(512, self.num_actions)
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

    def feature_size(self):
        # Pass a dummy observation through the conv stack to size the first linear layer.
        return self.features(torch.zeros(1, *self.input_shape)).view(1, -1).size(1)
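For reference, a minimal smoke test of the model (not part of the project code), assuming the (4, 84, 84) stacked-frame input and 7-action space used by the agent below:

import torch

# Hypothetical quick check: one batch of 4 stacked 84x84 frames in, 7 Q-values out.
model = DQN(input_shape=(4, 84, 84), num_actions=7)
dummy_obs = torch.zeros(1, 4, 84, 84)
q_values = model(dummy_obs)
print(q_values.shape)  # torch.Size([1, 7])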
DeepQ Agent Class:
##---------------Sources-------------------------##
# DeepQ Learning with PyTorch: https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
# DeepQ Image Processing for GymRetro: https://github.com/deepanshut041/Reinforcement-Learning
##------------------------------------------------##

import sys
import os

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
import random

script_dir = os.path.dirname(os.path.abspath(__file__))
project_dir = os.path.abspath(script_dir + "/../..")

sys.path.append(os.path.abspath(project_dir + '/source/agents'))
sys.path.append(os.path.abspath(project_dir + '/source/interface'))
sys.path.append(os.path.abspath(project_dir + '/source/learning'))
sys.path.append(os.path.abspath(project_dir + '/source/models'))

from agent_base import *
from deep_q_buffer import *
from train_deep_q import *
from deep_q_model import DQN
from datetime import datetime, date
from action_space import *

# DeepQ agent.
class DeepQ(AgentBase):

    # TODO Empty Constructor
    def __init__(self, model=None):
        """Initialize an Agent object.

        Params
        ======
            input_shape (tuple): dimension of each state (C, H, W)
            action_size (int): dimension of each action
            seed (int): random seed
            device (string): use GPU or CPU
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            lr (float): learning rate
            update_every (int): how often to update the network
            replay_after (int): number of experiences collected before replay starts
            model (Model): PyTorch model
        """
        input_shape = (4, 84, 84)  # 4 stacked 84 x 84 frames
        self.action_size = 7  # len(possible_actions)
        self.seed = 0  # random.seed(seed)
        self.buffer_size = 100000
        self.batch_size = 32
        self.gamma = 0.99
        self.lr = 0.00001
        self.update_every = 100
        self.replay_after = 10000
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.DQN = DQN
        self.tau = 1e-3

        # Q-Network
        self.policy_net = self.DQN(input_shape, self.action_size, self.seed).to(self.device)
        if model is not None:
            os.chdir(script_dir)
            os.chdir('..')
            os.chdir('..')
            root = os.getcwd()
            checkpoint = torch.load(model, map_location=self.device)

            self.policy_net.load_state_dict(checkpoint['model_state_dict'])

            #self.policy_net.load_state_dict(torch.load(os.path.join(root, model), map_location=self.device), strict=False)
        self.target_net = self.DQN(input_shape, self.action_size, self.seed).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, self.seed, self.device)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every

        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > self.replay_after:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.03):
        """Returns an action for the given state as per the current policy."""
        state = torch.from_numpy(state).unsqueeze(0).to(self.device)
        self.policy_net.eval()
        with torch.no_grad():
            action_values = self.policy_net(state)
        self.policy_net.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # Get expected Q values from the policy model
        Q_expected_current = self.policy_net(states)
        Q_expected = Q_expected_current.gather(1, actions.unsqueeze(1)).squeeze(1)

        # Get max predicted Q values (for next states) from the target model
        Q_targets_next = self.target_net(next_states).detach().max(1)[0]

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.soft_update(self.policy_net, self.target_net, self.tau)

    def soft_update(self, policy_model, target_model, tau):
        # Blend a small fraction of the policy weights into the target network.
        for target_param, policy_param in zip(target_model.parameters(), policy_model.parameters()):
            target_param.data.copy_(tau * policy_param.data + (1.0 - tau) * target_param.data)

    def load(self, filename):
        pass

    def save(self, filename, epoch):
        torch.save({
            'epoch': epoch,
            'model_state_dict': self.policy_net.state_dict(),
        }, filename)

    def train(self, env, n_episodes, reward_system, render, ckpt, save_rate):
        DeepQTrainer.train(self, env, n_episodes, reward_system, render, ckpt, save_rate)

    def decide(self, ob, info) -> list:
        # Keep a running stack of the most recent frames between calls.
        if hasattr(self, '_prev_state'):
            self._prev_state = DeepQTrainer.stack_frames(self._prev_state, ob, False)
        else:
            self._prev_state = DeepQTrainer.stack_frames(None, ob, True)

        move = self.act(self._prev_state)

        return ActionSpace.move(move)

    # Returns the name of the agent as a string
    def name(self) -> str:
        return "DeepQ"

    # Moves data from current memory to 'device' memory
    # ex: agent.to(torch.device("cuda")) will move neural network data to GPU memory.
    # If successful, all operations on this NN will be executed on that device (CPU or GPU).
    # Internal fields will be moved. The object itself does not need to be reassigned like tensors do.
    def to(self, device) -> None:
        self.policy_net = self.policy_net.to(device)
        self.target_net = self.target_net.to(device)
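As a quick sanity check of the target computed in learn() above, here is the same Bellman update worked out with made-up numbers (not from a real run):

# Toy values: reward 1.0, discount 0.99, best next-state Q-value from the target net 2.5,
# and the episode is not done.
gamma = 0.99
reward = 1.0
q_next_max = 2.5
done = 0
q_target = reward + gamma * q_next_max * (1 - done)
print(q_target)  # 3.475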
DeepQ Training Class:
##---------------Sources-------------------------##
# DeepQ Learning with PyTorch: https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
# DeepQ Image Processing for GymRetro: https://github.com/deepanshut041/Reinforcement-Learning
# Helper Functions for Gym Retro: https://github.com/moversti/sonicNEAT
##------------------------------------------------##

import time
import retro
import random
import torch
import numpy as np
from collections import deque
import math
import os
import sys
from datetime import datetime, date

script_dir = os.path.dirname(os.path.abspath(__file__))
project_dir = os.path.abspath(script_dir + "/../..")

sys.path.append(os.path.abspath(project_dir + '/source/agents'))
sys.path.append(os.path.abspath(project_dir + '/source/interface'))
sys.path.append(os.path.abspath(project_dir + '/source/learning'))
sys.path.append(os.path.abspath(project_dir + '/source/models'))
sys.path.append(os.path.abspath(project_dir + '/source/vision'))

from all_agents import *
from checkpoint import *
from deep_q_agent import *
from deep_q_model import DQN
from image_processing import preprocess_frame, stack_frame
from action_space import *
from greyImageViewer import GreyImageViewer
from controllerViewer import ControllerViewer
from reward_system import *


class DeepQTrainer:
    @staticmethod
    def stack_frames(frames, state, is_new=False):
        """Stacks frames for a broader input of the environment."""
        frame = preprocess_frame(state)
        frames = stack_frame(frames, frame, is_new)
        return frames

    @staticmethod
    def train(agent, env, n_episodes=1000, reward_system=RewardSystem.Contest, render=False, ckpt=None, save_rate=10):
        """
        Params
        ======
            n_episodes (int): maximum number of training episodes
        """
        # If a GPU is available, use it
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print("Device: ", device)

        UPDATE_TARGET = 10000  # threshold after which replay is to be started
        EPS_START = 0.99       # starting value of epsilon
        EPS_END = 0.01         # ending value of epsilon
        EPS_DECAY = 100        # rate at which epsilon decays

        # Initialize training bookkeeping
        start_epoch = 0
        scores = []
        best_ckpt_score = -9999999  # initialize to a very small number
        scores_window = deque(maxlen=20)

        # Initialize checkpoint
        ckpt = Checkpoint(agent)
        ckpt.make_dir()  # Makes a new directory if it does not exist

        epsilon_by_episode = lambda frame_idx: EPS_END + (EPS_START - EPS_END) * math.exp(-1. * frame_idx / EPS_DECAY)
        for i_episode in range(start_epoch + 1, n_episodes + 1):
            state = DeepQTrainer.stack_frames(None, env.reset(), True)
            next_state, reward, done, info = env.step(ActionSpace.stand_still())  # make a passive move to initialize data

            score = 0
            eps = epsilon_by_episode(i_episode)
            reward_system.init(info)

            # Punish the agent for not moving forward
            prev_state = {}
            steps_stuck = 0
            timestamp = 0

            while timestamp < 5000:
                action = agent.act(state, eps)
                next_state, reward, done, info = env.step(ActionSpace.move(action))
                reward = reward_system.calc_reward(info, ActionSpace.move(action))

                if render is True:
                    env.render()

                score += reward

                timestamp += 1

                # Punish the agent for standing still for too long.
                if (prev_state == info):
                    steps_stuck += 1
                else:
                    steps_stuck = 0
                prev_state = info

                if (steps_stuck > 20):
                    reward -= 1

                next_state = DeepQTrainer.stack_frames(state, next_state, False)
                agent.step(state, action, reward, next_state, done)
                state = next_state
                if done:
                    break

            scores_window.append(score)  # save most recent score
            scores.append(score)         # save most recent score
            print("epoch:", i_episode, "score:", score)

            if (i_episode % save_rate == 0 and score > best_ckpt_score):
                ckpt.epoch = i_episode
                ckpt.score = score
                best_ckpt_score = score
                fn = ckpt.generate_path()
                agent.save(fn, i_episode)
                print(f"Saving checkpoint with new best score {best_ckpt_score}")

        return scores
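For intuition on how exploration tapers off, the epsilon schedule used in train() can be evaluated at a few episode counts. The snippet below is only an illustration of that schedule; the printed values are approximate.

import math

EPS_START, EPS_END, EPS_DECAY = 0.99, 0.01, 100
epsilon_by_episode = lambda ep: EPS_END + (EPS_START - EPS_END) * math.exp(-1. * ep / EPS_DECAY)

for ep in (1, 50, 100, 500):
    print(ep, round(epsilon_by_episode(ep), 3))
# approximately: 1 -> 0.980, 50 -> 0.604, 100 -> 0.371, 500 -> 0.017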