Note: Please install the packages below before running.

!pip install tqdm numpy scikit-learn pyglet setuptools && \
!pip install gym asciinema pandas tabulate tornado==5.* PyBullet && \
!pip install git+https://github.com/pybox2d/pybox2d#egg=Box2D && \
!pip install git+https://github.com/mimoralea/gym-bandits#egg=gym-bandits && \
!pip install git+https://github.com/mimoralea/gym-walk#egg=gym-walk && \
!pip install git+https://github.com/mimoralea/gym-aima#egg=gym-aima && \
!pip install gym[atari]
!pip install torch torchvision
import warnings ; warnings.filterwarnings('ignore')
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
from IPython.display import display
from collections import namedtuple, deque
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from itertools import cycle, count
from textwrap import wrap

import matplotlib
import subprocess
import os.path
import tempfile
import random
import base64
import pprint
import glob
import time
import json
import sys
import gym
import io
import os
import gc
import platform

from gym import wrappers
from subprocess import check_output
from IPython.display import HTML

LEAVE_PRINT_EVERY_N_SECS = 60
ERASE_LINE = '\x1b[2K'
EPS = 1e-6
RESULTS_DIR = os.path.join('.', 'gym-results')
SEEDS = (12, 34, 56, 78, 90)

%matplotlib inline
plt.style.use('fivethirtyeight')
params = {
    'figure.figsize': (15, 8),
    'font.size': 24,
    'legend.fontsize': 20,
    'axes.titlesize': 28,
    'axes.labelsize': 24,
    'xtick.labelsize': 20,
    'ytick.labelsize': 20
}
pylab.rcParams.update(params)
np.set_printoptions(suppress=True)
torch.cuda.is_available()
True
def get_make_env_fn(**kargs):
    def make_env_fn(env_name, seed=None, render=None, record=False,
                    unwrapped=False, monitor_mode=None, 
                    inner_wrappers=None, outer_wrappers=None):
        mdir = tempfile.mkdtemp()
        env = None
        if render:
            try:
                env = gym.make(env_name, render=render)
            except Exception:
                pass
        if env is None:
            env = gym.make(env_name)
        if seed is not None: env.seed(seed)
        env = env.unwrapped if unwrapped else env
        if inner_wrappers:
            for wrapper in inner_wrappers:
                env = wrapper(env)
        env = wrappers.Monitor(
            env, mdir, force=True, 
            mode=monitor_mode, 
            video_callable=lambda e_idx: record) if monitor_mode else env
        if outer_wrappers:
            for wrapper in outer_wrappers:
                env = wrapper(env)
        return env
    return make_env_fn, kargs
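A quick usage sketch (not part of the original notebook): the factory returns the inner make_env_fn together with the keyword arguments it was created with, and those are forwarded when the environment is actually built.

demo_make_env_fn, demo_make_env_kargs = get_make_env_fn(env_name='CartPole-v1')
demo_env = demo_make_env_fn(**demo_make_env_kargs, seed=12)
print(demo_env.observation_space.shape, demo_env.action_space.n)  # (4,) 2
demo_env.close()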
def get_videos_html(env_videos, title, max_n_videos=5):
    videos = np.array(env_videos)
    if len(videos) == 0:
        return
    
    n_videos = max(1, min(max_n_videos, len(videos)))
    idxs = np.linspace(0, len(videos) - 1, n_videos).astype(int) if n_videos > 1 else [-1,]
    videos = videos[idxs,...]

    strm = '<h2>{}</h2>'.format(title)
    for video_path, meta_path in videos:
        video = io.open(video_path, 'r+b').read()
        encoded = base64.b64encode(video)

        with open(meta_path) as data_file:    
            meta = json.load(data_file)

        html_tag = """
        <h3>{0}</h3>
        <video width="960" height="540" controls>
            <source src="data:video/mp4;base64,{1}" type="video/mp4" />
        </video>"""
        strm += html_tag.format('Episode ' + str(meta['episode_id']), encoded.decode('ascii'))
    return strm
platform.system()
'Windows'
def get_gif_html(env_videos, title, subtitle_eps=None, max_n_videos=4):
    videos = np.array(env_videos)
    if len(videos) == 0:
        return
    
    n_videos = max(1, min(max_n_videos, len(videos)))
    idxs = np.linspace(0, len(videos) - 1, n_videos).astype(int) if n_videos > 1 else [-1,]
    videos = videos[idxs,...]

    strm = '<h2>{}</h2>'.format(title)
    for video_path, meta_path in videos:
        basename = os.path.splitext(video_path)[0]
        gif_path = basename + '.gif'
        if not os.path.exists(gif_path):
            if platform.system() == 'Linux':
                ps = subprocess.Popen(
                    ('ffmpeg', 
                     '-i', video_path, 
                     '-r', '7',
                     '-f', 'image2pipe', 
                     '-vcodec', 'ppm',
                     '-crf', '20',
                     '-vf', 'scale=512:-1',
                     '-'), 
                    stdout=subprocess.PIPE,
                    universal_newlines=True)
                output = subprocess.check_output(
                    ('convert',
                     '-coalesce',
                     '-delay', '7',
                     '-loop', '0',
                     '-fuzz', '2%',
                     '+dither',
                     '-deconstruct',
                     '-layers', 'Optimize',
                     '-', gif_path), 
                    stdin=ps.stdout)
                ps.wait()
            else:
                ps = subprocess.Popen('ffmpeg -i {} -r 7 -f image2pipe \
                                      -vcodec ppm -crf 20 -vf scale=512:-1 - | \
                                      convert -coalesce -delay 7 -loop 0 -fuzz 2% \
                                      +dither -deconstruct -layers Optimize \
                                      - {}'.format(video_path, gif_path), 
                                      stdin=subprocess.PIPE, 
                                      shell=True)
                ps.wait()

        gif = io.open(gif_path, 'r+b').read()
        encoded = base64.b64encode(gif)
            
        with open(meta_path) as data_file:    
            meta = json.load(data_file)

        html_tag = """
        <h3>{0}</h3>
        <img src="data:image/gif;base64,{1}" />"""
        prefix = 'Trial ' if subtitle_eps is None else 'Episode '
        suffix = str(meta['episode_id'] if subtitle_eps is None \
                     else subtitle_eps[meta['episode_id']])
        strm += html_tag.format(prefix + suffix, encoded.decode('ascii'))
    return strm

Dueling DDQN
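The dueling architecture (Wang et al., 2016) splits the Q-network head into a state-value stream V(s) and an advantage stream A(s, a), recombined as Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a'); subtracting the mean advantage keeps the decomposition identifiable. The agent below pairs this network (FCDuelingQ) with double DQN targets and Polyak-averaged target-network updates. A minimal sketch of the aggregation step on dummy tensors (not part of the original notebook):

v = torch.tensor([[1.5]])        # state-value stream, shape (batch, 1)
a = torch.tensor([[0.4, -0.4]])  # advantage stream, shape (batch, n_actions)
q = v.expand_as(a) + a - a.mean(1, keepdim=True).expand_as(a)
print(q)                         # Q(s, .) = [1.9, 1.1]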

class FCQ(nn.Module):
    def __init__(self, 
                 input_dim, 
                 output_dim, 
                 hidden_dims=(32,32), 
                 activation_fc=F.relu):
        super(FCQ, self).__init__()
        self.activation_fc = activation_fc

        self.input_layer = nn.Linear(input_dim, hidden_dims[0])

        self.hidden_layers = nn.ModuleList()
        for i in range(len(hidden_dims)-1):
            hidden_layer = nn.Linear(hidden_dims[i], hidden_dims[i+1])
            self.hidden_layers.append(hidden_layer)
        self.output_layer = nn.Linear(hidden_dims[-1], output_dim)

        device = "cpu"
        if torch.cuda.is_available():
            device = "cuda:0"
        self.device = torch.device(device)
        self.to(self.device)
        
    def _format(self, state):
        x = state
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, 
                             device=self.device, 
                             dtype=torch.float32)
            x = x.unsqueeze(0)
        return x

    def forward(self, state):
        x = self._format(state)
        x = self.activation_fc(self.input_layer(x))
        for hidden_layer in self.hidden_layers:
            x = self.activation_fc(hidden_layer(x))
        x = self.output_layer(x)
        return x
    
    def numpy_float_to_device(self, variable):
        variable = torch.from_numpy(variable).float().to(self.device)
        return variable
    
    def load(self, experiences):
        states, actions, new_states, rewards, is_terminals = experiences
        states = torch.from_numpy(states).float().to(self.device)
        actions = torch.from_numpy(actions).long().to(self.device)
        new_states = torch.from_numpy(new_states).float().to(self.device)
        rewards = torch.from_numpy(rewards).float().to(self.device)
        is_terminals = torch.from_numpy(is_terminals).float().to(self.device)
        return states, actions, new_states, rewards, is_terminals
class GreedyStrategy():
    def __init__(self):
        self.exploratory_action_taken = False

    def select_action(self, model, state):
        with torch.no_grad():
            q_values = model(state).cpu().detach().data.numpy().squeeze()
            return np.argmax(q_values)
class EGreedyStrategy():
    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon
        self.exploratory_action_taken = None

    def select_action(self, model, state):
        self.exploratory_action_taken = False
        with torch.no_grad():
            q_values = model(state).cpu().detach().data.numpy().squeeze()

        if np.random.rand() > self.epsilon:
            action = np.argmax(q_values)
        else: 
            action = np.random.randint(len(q_values))

        self.exploratory_action_taken = action != np.argmax(q_values)
        return action
class EGreedyLinearStrategy():
    def __init__(self, init_epsilon=1.0, min_epsilon=0.1, max_steps=20000):
        self.t = 0
        self.epsilon = init_epsilon
        self.init_epsilon = init_epsilon
        self.min_epsilon = min_epsilon
        self.max_steps = max_steps
        self.exploratory_action_taken = None
        
    def _epsilon_update(self):
        epsilon = 1 - self.t / self.max_steps
        epsilon = (self.init_epsilon - self.min_epsilon) * epsilon + self.min_epsilon
        epsilon = np.clip(epsilon, self.min_epsilon, self.init_epsilon)
        self.t += 1
        return epsilon

    def select_action(self, model, state):
        self.exploratory_action_taken = False
        with torch.no_grad():
            q_values = model(state).cpu().detach().data.numpy().squeeze()

        if np.random.rand() > self.epsilon:
            action = np.argmax(q_values)
        else: 
            action = np.random.randint(len(q_values))

        self.epsilon = self._epsilon_update()
        self.exploratory_action_taken = action != np.argmax(q_values)
        return action
class EGreedyExpStrategy():
    def __init__(self, init_epsilon=1.0, min_epsilon=0.1, decay_steps=20000):
        self.epsilon = init_epsilon
        self.init_epsilon = init_epsilon
        self.decay_steps = decay_steps
        self.min_epsilon = min_epsilon
        self.epsilons = 0.01 / np.logspace(-2, 0, decay_steps, endpoint=False) - 0.01
        self.epsilons = self.epsilons * (init_epsilon - min_epsilon) + min_epsilon
        self.t = 0
        self.exploratory_action_taken = None

    def _epsilon_update(self):
        self.epsilon = self.min_epsilon if self.t >= self.decay_steps else self.epsilons[self.t]
        self.t += 1
        return self.epsilon

    def select_action(self, model, state):
        self.exploratory_action_taken = False
        with torch.no_grad():
            q_values = model(state).detach().cpu().data.numpy().squeeze()

        if np.random.rand() > self.epsilon:
            action = np.argmax(q_values)
        else:
            action = np.random.randint(len(q_values))

        self._epsilon_update()
        self.exploratory_action_taken = action != np.argmax(q_values)
        return action
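The training runs in this notebook use EGreedyExpStrategy with init_epsilon=1.0, min_epsilon=0.3, and decay_steps=20000. As a quick sanity check (not part of the original notebook), the schedule can be plotted the same way the PER beta is plotted later:

strategy = EGreedyExpStrategy(init_epsilon=1.0, min_epsilon=0.3, decay_steps=20000)
plt.plot([strategy._epsilon_update() for _ in range(30000)])
plt.title('EGreedyExpStrategy epsilon')
plt.xlabel('Step')
plt.ylabel('Epsilon')
plt.show()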
class SoftMaxStrategy():
    def __init__(self, 
                 init_temp=1.0, 
                 min_temp=0.3, 
                 exploration_ratio=0.8, 
                 max_steps=25000):
        self.t = 0
        self.init_temp = init_temp
        self.exploration_ratio = exploration_ratio
        self.min_temp = min_temp
        self.max_steps = max_steps
        self.exploratory_action_taken = None
        
    def _update_temp(self):
        temp = 1 - self.t / (self.max_steps * self.exploration_ratio)
        temp = (self.init_temp - self.min_temp) * temp + self.min_temp
        temp = np.clip(temp, self.min_temp, self.init_temp)
        self.t += 1
        return temp

    def select_action(self, model, state):
        self.exploratory_action_taken = False
        temp = self._update_temp()

        with torch.no_grad():
            q_values = model(state).cpu().detach().data.numpy().squeeze()
            scaled_qs = q_values/temp
            norm_qs = scaled_qs - scaled_qs.max()            
            e = np.exp(norm_qs)
            probs = e / np.sum(e)
            assert np.isclose(probs.sum(), 1.0)

        action = np.random.choice(np.arange(len(probs)), size=1, p=probs)[0]
        self.exploratory_action_taken = action != np.argmax(q_values)
        return action
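For intuition (a sketch, not from the original notebook), this shows how the temperature in SoftMaxStrategy reshapes action probabilities for a fixed set of Q-value estimates; lower temperatures concentrate probability on the greedy action:

qs = np.array([1.0, 2.0, 3.0])
for temp in (1.0, 0.3):
    scaled = qs / temp
    e = np.exp(scaled - scaled.max())
    print('temp', temp, '->', np.round(e / e.sum(), 3))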
class ReplayBuffer():
    def __init__(self, 
                 max_size=10000, 
                 batch_size=64):
        self.ss_mem = np.empty(shape=(max_size), dtype=np.ndarray)
        self.as_mem = np.empty(shape=(max_size), dtype=np.ndarray)
        self.rs_mem = np.empty(shape=(max_size), dtype=np.ndarray)
        self.ps_mem = np.empty(shape=(max_size), dtype=np.ndarray)
        self.ds_mem = np.empty(shape=(max_size), dtype=np.ndarray)

        self.max_size = max_size
        self.batch_size = batch_size
        self._idx = 0
        self.size = 0
    
    def store(self, sample):
        s, a, r, p, d = sample
        self.ss_mem[self._idx] = s
        self.as_mem[self._idx] = a
        self.rs_mem[self._idx] = r
        self.ps_mem[self._idx] = p
        self.ds_mem[self._idx] = d
        
        self._idx += 1
        self._idx = self._idx % self.max_size

        self.size += 1
        self.size = min(self.size, self.max_size)

    def sample(self, batch_size=None):
        if batch_size is None:
            batch_size = self.batch_size

        idxs = np.random.choice(
            self.size, batch_size, replace=False)
        experiences = np.vstack(self.ss_mem[idxs]), \
                      np.vstack(self.as_mem[idxs]), \
                      np.vstack(self.rs_mem[idxs]), \
                      np.vstack(self.ps_mem[idxs]), \
                      np.vstack(self.ds_mem[idxs])
        return experiences

    def __len__(self):
        return self.size
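A minimal usage sketch (not in the original notebook): store a few fake CartPole-sized transitions and sample a small batch to inspect the returned shapes.

buffer = ReplayBuffer(max_size=100, batch_size=4)
for _ in range(10):
    buffer.store((np.random.rand(4), np.random.randint(2), 1.0, np.random.rand(4), 0.0))
s_b, a_b, r_b, p_b, d_b = buffer.sample()
print(s_b.shape, a_b.shape, r_b.shape, p_b.shape, d_b.shape)  # expected: (4, 4) (4, 1) (4, 1) (4, 4) (4, 1)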
class FCDuelingQ(nn.Module):
    def __init__(self, 
                 input_dim, 
                 output_dim, 
                 hidden_dims=(32,32), 
                 activation_fc=F.relu):
        super(FCDuelingQ, self).__init__()
        self.activation_fc = activation_fc

        self.input_layer = nn.Linear(input_dim, hidden_dims[0])
        self.hidden_layers = nn.ModuleList()
        for i in range(len(hidden_dims)-1):
            hidden_layer = nn.Linear(hidden_dims[i], hidden_dims[i+1])
            self.hidden_layers.append(hidden_layer)
        self.output_value = nn.Linear(hidden_dims[-1], 1)
        self.output_layer = nn.Linear(hidden_dims[-1], output_dim)

        device = "cpu"
        if torch.cuda.is_available():
            device = "cuda:0"
        self.device = torch.device(device)
        self.to(self.device)
        
    def _format(self, state):
        x = state
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, 
                             device=self.device, 
                             dtype=torch.float32)
            x = x.unsqueeze(0)      
        return x

    def forward(self, state):
        x = self._format(state)
        x = self.activation_fc(self.input_layer(x))
        for hidden_layer in self.hidden_layers:
            x = self.activation_fc(hidden_layer(x))
        a = self.output_layer(x)
        v = self.output_value(x).expand_as(a)
        q = v + a - a.mean(1, keepdim=True).expand_as(a)
        return q

    def numpy_float_to_device(self, variable):
        variable = torch.from_numpy(variable).float().to(self.device)
        return variable

    def load(self, experiences):
        states, actions, new_states, rewards, is_terminals = experiences
        states = torch.from_numpy(states).float().to(self.device)
        actions = torch.from_numpy(actions).long().to(self.device)
        new_states = torch.from_numpy(new_states).float().to(self.device)
        rewards = torch.from_numpy(rewards).float().to(self.device)
        is_terminals = torch.from_numpy(is_terminals).float().to(self.device)
        return states, actions, new_states, rewards, is_terminals
class DuelingDDQN():
    def __init__(self, 
                 replay_buffer_fn, 
                 value_model_fn, 
                 value_optimizer_fn, 
                 value_optimizer_lr,
                 max_gradient_norm,
                 training_strategy_fn,
                 evaluation_strategy_fn,
                 n_warmup_batches,
                 update_target_every_steps,
                 tau):
        self.replay_buffer_fn = replay_buffer_fn
        self.value_model_fn = value_model_fn
        self.value_optimizer_fn = value_optimizer_fn
        self.value_optimizer_lr = value_optimizer_lr
        self.max_gradient_norm = max_gradient_norm
        self.training_strategy_fn = training_strategy_fn
        self.evaluation_strategy_fn = evaluation_strategy_fn
        self.n_warmup_batches = n_warmup_batches
        self.update_target_every_steps = update_target_every_steps
        self.tau = tau

    def optimize_model(self, experiences):
        states, actions, rewards, next_states, is_terminals = experiences
        batch_size = len(is_terminals)

        argmax_a_q_sp = self.online_model(next_states).max(1)[1]
        q_sp = self.target_model(next_states).detach()
        max_a_q_sp = q_sp[
            np.arange(batch_size), argmax_a_q_sp].unsqueeze(1)
        target_q_sa = rewards + (self.gamma * max_a_q_sp * (1 - is_terminals))
        q_sa = self.online_model(states).gather(1, actions)

        td_error = q_sa - target_q_sa
        value_loss = td_error.pow(2).mul(0.5).mean()
        self.value_optimizer.zero_grad()
        value_loss.backward()        
        torch.nn.utils.clip_grad_norm_(self.online_model.parameters(), 
                                       self.max_gradient_norm)
        self.value_optimizer.step()

    def interaction_step(self, state, env):
        action = self.training_strategy.select_action(self.online_model, state)
        new_state, reward, is_terminal, info = env.step(action)
        is_truncated = 'TimeLimit.truncated' in info and info['TimeLimit.truncated']
        is_failure = is_terminal and not is_truncated
        experience = (state, action, reward, new_state, float(is_failure))
        self.replay_buffer.store(experience)
        self.episode_reward[-1] += reward
        self.episode_timestep[-1] += 1
        self.episode_exploration[-1] += int(self.training_strategy.exploratory_action_taken)
        return new_state, is_terminal
    
    def update_network(self, tau=None):
        tau = self.tau if tau is None else tau
        for target, online in zip(self.target_model.parameters(), 
                                  self.online_model.parameters()):
            target_ratio = (1.0 - tau) * target.data
            online_ratio = tau * online.data
            mixed_weights = target_ratio + online_ratio
            target.data.copy_(mixed_weights)

    def train(self, make_env_fn, make_env_kargs, seed, gamma, 
              max_minutes, max_episodes, goal_mean_100_reward):
        training_start, last_debug_time = time.time(), float('-inf')

        self.checkpoint_dir = tempfile.mkdtemp()
        self.make_env_fn = make_env_fn
        self.make_env_kargs = make_env_kargs
        self.seed = seed
        self.gamma = gamma
        
        env = self.make_env_fn(**self.make_env_kargs, seed=self.seed)
        torch.manual_seed(self.seed) ; np.random.seed(self.seed) ; random.seed(self.seed)
    
        nS, nA = env.observation_space.shape[0], env.action_space.n
        self.episode_timestep = []
        self.episode_reward = []
        self.episode_seconds = []
        self.evaluation_scores = []        
        self.episode_exploration = []
        
        self.target_model = self.value_model_fn(nS, nA)
        self.online_model = self.value_model_fn(nS, nA)
        self.update_network(tau=1.0)

        self.value_optimizer = self.value_optimizer_fn(self.online_model, 
                                                       self.value_optimizer_lr)

        self.replay_buffer = self.replay_buffer_fn()
        self.training_strategy = self.training_strategy_fn()
        self.evaluation_strategy = self.evaluation_strategy_fn()
                    
        result = np.empty((max_episodes, 5))
        result[:] = np.nan
        training_time = 0
        for episode in range(1, max_episodes + 1):
            episode_start = time.time()
            
            state, is_terminal = env.reset(), False
            self.episode_reward.append(0.0)
            self.episode_timestep.append(0.0)
            self.episode_exploration.append(0.0)

            for step in count():
                state, is_terminal = self.interaction_step(state, env)
                
                min_samples = self.replay_buffer.batch_size * self.n_warmup_batches
                if len(self.replay_buffer) > min_samples:
                    experiences = self.replay_buffer.sample()
                    experiences = self.online_model.load(experiences)
                    self.optimize_model(experiences)
                
                if np.sum(self.episode_timestep) % self.update_target_every_steps == 0:
                    self.update_network()

                if is_terminal:
                    gc.collect()
                    break
            
            # stats
            episode_elapsed = time.time() - episode_start
            self.episode_seconds.append(episode_elapsed)
            training_time += episode_elapsed
            evaluation_score, _ = self.evaluate(self.online_model, env)
            self.save_checkpoint(episode-1, self.online_model)
            
            total_step = int(np.sum(self.episode_timestep))
            self.evaluation_scores.append(evaluation_score)
            
            mean_10_reward = np.mean(self.episode_reward[-10:])
            std_10_reward = np.std(self.episode_reward[-10:])
            mean_100_reward = np.mean(self.episode_reward[-100:])
            std_100_reward = np.std(self.episode_reward[-100:])
            mean_100_eval_score = np.mean(self.evaluation_scores[-100:])
            std_100_eval_score = np.std(self.evaluation_scores[-100:])
            lst_100_exp_rat = np.array(
                self.episode_exploration[-100:])/np.array(self.episode_timestep[-100:])
            mean_100_exp_rat = np.mean(lst_100_exp_rat)
            std_100_exp_rat = np.std(lst_100_exp_rat)
            
            wallclock_elapsed = time.time() - training_start
            result[episode-1] = total_step, mean_100_reward, \
                mean_100_eval_score, training_time, wallclock_elapsed
            
            reached_debug_time = time.time() - last_debug_time >= LEAVE_PRINT_EVERY_N_SECS
            reached_max_minutes = wallclock_elapsed >= max_minutes * 60
            reached_max_episodes = episode >= max_episodes
            reached_goal_mean_reward = mean_100_eval_score >= goal_mean_100_reward
            training_is_over = reached_max_minutes or \
                               reached_max_episodes or \
                               reached_goal_mean_reward

            elapsed_str = time.strftime("%H:%M:%S", time.gmtime(time.time() - training_start))
            debug_message = 'el {}, ep {:04}, ts {:06}, '
            debug_message += 'ar 10 {:05.1f}\u00B1{:05.1f}, '
            debug_message += '100 {:05.1f}\u00B1{:05.1f}, '
            debug_message += 'ex 100 {:02.1f}\u00B1{:02.1f}, '
            debug_message += 'ev {:05.1f}\u00B1{:05.1f}'
            debug_message = debug_message.format(
                elapsed_str, episode-1, total_step, mean_10_reward, std_10_reward, 
                mean_100_reward, std_100_reward, mean_100_exp_rat, std_100_exp_rat,
                mean_100_eval_score, std_100_eval_score)
            print(debug_message, end='\r', flush=True)
            if reached_debug_time or training_is_over:
                print(ERASE_LINE + debug_message, flush=True)
                last_debug_time = time.time()
            if training_is_over:
                if reached_max_minutes: print(u'--> reached_max_minutes \u2715')
                if reached_max_episodes: print(u'--> reached_max_episodes \u2715')
                if reached_goal_mean_reward: print(u'--> reached_goal_mean_reward \u2713')
                break
                
        final_eval_score, score_std = self.evaluate(self.online_model, env, n_episodes=100)
        wallclock_time = time.time() - training_start
        print('Training complete.')
        print('Final evaluation score {:.2f}\u00B1{:.2f} in {:.2f}s training time,'
              ' {:.2f}s wall-clock time.\n'.format(
                  final_eval_score, score_std, training_time, wallclock_time))
        env.close() ; del env
        self.get_cleaned_checkpoints()
        return result, final_eval_score, training_time, wallclock_time
    
    def evaluate(self, eval_policy_model, eval_env, n_episodes=1):
        rs = []
        for _ in range(n_episodes):
            s, d = eval_env.reset(), False
            rs.append(0)
            for _ in count():
                a = self.evaluation_strategy.select_action(eval_policy_model, s)
                s, r, d, _ = eval_env.step(a)
                rs[-1] += r
                if d: break
        return np.mean(rs), np.std(rs)

    def get_cleaned_checkpoints(self, n_checkpoints=5):
        try: 
            return self.checkpoint_paths
        except AttributeError:
            self.checkpoint_paths = {}

        paths = glob.glob(os.path.join(self.checkpoint_dir, '*.tar'))
        paths_dic = {int(path.split('.')[-2]):path for path in paths}
        last_ep = max(paths_dic.keys())
        # checkpoint_idxs = np.geomspace(1, last_ep+1, n_checkpoints, endpoint=True, dtype=np.int)-1
        checkpoint_idxs = np.linspace(1, last_ep+1, n_checkpoints, endpoint=True, dtype=int)-1

        for idx, path in paths_dic.items():
            if idx in checkpoint_idxs:
                self.checkpoint_paths[idx] = path
            else:
                os.unlink(path)

        return self.checkpoint_paths

    def demo_last(self, title='Fully-trained {} Agent', n_episodes=3, max_n_videos=3):
        env = self.make_env_fn(**self.make_env_kargs, monitor_mode='evaluation', render=True, record=True)

        checkpoint_paths = self.get_cleaned_checkpoints()
        last_ep = max(checkpoint_paths.keys())
        self.online_model.load_state_dict(torch.load(checkpoint_paths[last_ep]))

        self.evaluate(self.online_model, env, n_episodes=n_episodes)
        env.close()
        data = get_gif_html(env_videos=env.videos, 
                            title=title.format(self.__class__.__name__),
                            max_n_videos=max_n_videos)
        del env
        return HTML(data=data)

    def demo_progression(self, title='{} Agent progression', max_n_videos=5):
        env = self.make_env_fn(**self.make_env_kargs, monitor_mode='evaluation', render=True, record=True)

        checkpoint_paths = self.get_cleaned_checkpoints()
        for i in sorted(checkpoint_paths.keys()):
            self.online_model.load_state_dict(torch.load(checkpoint_paths[i]))
            self.evaluate(self.online_model, env, n_episodes=1)

        env.close()
        data = get_gif_html(env_videos=env.videos, 
                            title=title.format(self.__class__.__name__),
                            subtitle_eps=sorted(checkpoint_paths.keys()),
                            max_n_videos=max_n_videos)
        del env
        return HTML(data=data)

    def save_checkpoint(self, episode_idx, model):
        torch.save(model.state_dict(), 
                   os.path.join(self.checkpoint_dir, 'model.{}.tar'.format(episode_idx)))
dueling_ddqn_results = []
dueling_ddqn_agents, best_dueling_ddqn_agent_key, best_eval_score = {}, None, float('-inf')
for seed in SEEDS:
    environment_settings = {
        'env_name': 'CartPole-v1',
        'gamma': 1.00,
        'max_minutes': 20,
        'max_episodes': 10000,
        'goal_mean_100_reward': 475
    }
    
    # value_model_fn = lambda nS, nA: FCQ(nS, nA, hidden_dims=(512,128))
    value_model_fn = lambda nS, nA: FCDuelingQ(nS, nA, hidden_dims=(512,128))
    value_optimizer_fn = lambda net, lr: optim.RMSprop(net.parameters(), lr=lr)
    value_optimizer_lr = 0.0005
    max_gradient_norm = float('inf')

    training_strategy_fn = lambda: EGreedyExpStrategy(init_epsilon=1.0,  
                                                      min_epsilon=0.3, 
                                                      decay_steps=20000)
    evaluation_strategy_fn = lambda: GreedyStrategy()

    replay_buffer_fn = lambda: ReplayBuffer(max_size=50000, batch_size=64)
    n_warmup_batches = 5
    update_target_every_steps = 1
    tau = 0.1

    env_name, gamma, max_minutes, \
    max_episodes, goal_mean_100_reward = environment_settings.values()
    agent = DuelingDDQN(replay_buffer_fn,
                        value_model_fn,
                        value_optimizer_fn,
                        value_optimizer_lr,
                        max_gradient_norm,
                        training_strategy_fn,
                        evaluation_strategy_fn,
                        n_warmup_batches,
                        update_target_every_steps,
                        tau)

    make_env_fn, make_env_kargs = get_make_env_fn(env_name=env_name)
    result, final_eval_score, training_time, wallclock_time = agent.train(
        make_env_fn, make_env_kargs, seed, gamma, max_minutes, max_episodes, goal_mean_100_reward)
    dueling_ddqn_results.append(result)
    dueling_ddqn_agents[seed] = agent
    if final_eval_score > best_eval_score:
        best_eval_score = final_eval_score
        best_dueling_ddqn_agent_key = seed
dueling_ddqn_results = np.array(dueling_ddqn_results)
el 00:00:01, ep 0000, ts 000016, ar 10 016.0±000.0, 100 016.0±000.0, ex 100 0.3±0.0, ev 009.0±000.0
el 00:01:02, ep 0131, ts 010190, ar 10 189.8±108.5, 100 092.9±081.8, ex 100 0.3±0.1, ev 324.9±096.6
el 00:02:03, ep 0169, ts 022591, ar 10 458.9±101.1, 100 198.1±148.2, ex 100 0.2±0.1, ev 345.4±103.2
el 00:03:05, ep 0195, ts 035136, ar 10 454.5±089.6, 100 304.9±166.0, ex 100 0.2±0.1, ev 393.3±111.0
el 00:04:07, ep 0222, ts 046773, ar 10 459.3±081.5, 100 382.7±147.9, ex 100 0.2±0.0, ev 441.4±097.0
el 00:04:51, ep 0240, ts 055065, ar 10 453.6±070.4, 100 427.2±120.0, ex 100 0.2±0.0, ev 476.1±063.8
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 253.77s training time, 311.37s wall-clock time.

el 00:00:00, ep 0000, ts 000034, ar 10 034.0±000.0, 100 034.0±000.0, ex 100 0.6±0.0, ev 008.0±000.0
el 00:01:00, ep 0147, ts 010996, ar 10 184.9±072.3, 100 096.4±085.5, ex 100 0.3±0.1, ev 253.6±099.4
el 00:02:01, ep 0195, ts 023123, ar 10 356.0±103.3, 100 196.7±106.2, ex 100 0.2±0.1, ev 316.2±113.4
el 00:03:01, ep 0220, ts 035234, ar 10 500.0±000.0, 100 296.7±138.3, ex 100 0.2±0.0, ev 377.4±119.4
el 00:04:03, ep 0245, ts 047149, ar 10 449.1±104.8, 100 365.6±139.9, ex 100 0.2±0.0, ev 424.7±110.8
el 00:05:04, ep 0268, ts 058543, ar 10 489.4±031.8, 100 431.8±114.2, ex 100 0.2±0.0, ev 471.2±071.3
el 00:05:12, ep 0271, ts 060043, ar 10 489.4±031.8, 100 439.6±109.5, ex 100 0.2±0.0, ev 477.1±063.8
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 277.92s training time, 332.44s wall-clock time.

el 00:00:00, ep 0000, ts 000012, ar 10 012.0±000.0, 100 012.0±000.0, ex 100 0.4±0.0, ev 009.0±000.0
el 00:01:00, ep 0149, ts 011372, ar 10 231.7±040.8, 100 100.6±085.6, ex 100 0.3±0.1, ev 224.6±095.8
el 00:02:01, ep 0186, ts 023906, ar 10 455.9±082.6, 100 214.8±147.6, ex 100 0.2±0.1, ev 309.1±132.2
el 00:03:01, ep 0214, ts 035806, ar 10 494.5±016.5, 100 308.9±158.8, ex 100 0.2±0.1, ev 395.0±124.4
el 00:04:03, ep 0241, ts 047454, ar 10 398.6±156.7, 100 379.9±146.5, ex 100 0.2±0.0, ev 449.2±103.6
el 00:04:34, ep 0254, ts 053255, ar 10 456.3±131.1, 100 410.7±138.4, ex 100 0.2±0.0, ev 475.5±078.4
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 243.35s training time, 294.46s wall-clock time.

el 00:00:00, ep 0000, ts 000012, ar 10 012.0±000.0, 100 012.0±000.0, ex 100 0.2±0.0, ev 011.0±000.0
el 00:01:00, ep 0129, ts 011756, ar 10 233.4±107.3, 100 110.2±091.5, ex 100 0.3±0.1, ev 267.4±102.5
el 00:02:01, ep 0172, ts 024414, ar 10 349.2±138.0, 100 217.6±123.4, ex 100 0.2±0.1, ev 347.5±100.8
el 00:03:03, ep 0199, ts 036949, ar 10 500.0±000.0, 100 314.9±145.4, ex 100 0.2±0.0, ev 403.5±104.8
el 00:04:04, ep 0225, ts 048902, ar 10 497.8±006.6, 100 381.8±140.5, ex 100 0.2±0.0, ev 450.1±087.1
el 00:04:38, ep 0239, ts 055370, ar 10 446.8±108.6, 100 407.1±135.9, ex 100 0.2±0.0, ev 475.0±062.6
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 247.27s training time, 298.50s wall-clock time.

el 00:00:00, ep 0000, ts 000039, ar 10 039.0±000.0, 100 039.0±000.0, ex 100 0.4±0.0, ev 038.0±000.0
el 00:01:00, ep 0142, ts 011535, ar 10 184.4±054.1, 100 102.8±083.3, ex 100 0.3±0.1, ev 255.2±095.3
el 00:02:02, ep 0180, ts 024393, ar 10 485.7±042.9, 100 215.3±148.5, ex 100 0.2±0.1, ev 322.0±119.7
el 00:03:03, ep 0206, ts 036765, ar 10 457.3±085.5, 100 313.3±162.8, ex 100 0.2±0.1, ev 386.2±124.2
el 00:04:05, ep 0232, ts 048735, ar 10 439.3±113.1, 100 390.4±143.2, ex 100 0.2±0.0, ev 436.3±105.9
el 00:04:52, ep 0253, ts 057655, ar 10 422.1±155.8, 100 440.0±113.5, ex 100 0.2±0.0, ev 477.1±071.1
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 259.97s training time, 311.73s wall-clock time.

dueling_ddqn_agents[best_dueling_ddqn_agent_key].demo_progression()

[GIF output] DuelingDDQN Agent progression: Episode 0, 60, 120, 180, 240

dueling_ddqn_agents[best_dueling_ddqn_agent_key].demo_last()

[GIF output] Fully-trained DuelingDDQN Agent: Trial 0, 1, 2

ddqn_root_dir = os.path.join(RESULTS_DIR, 'ddqn')
ddqn_x = np.load(os.path.join(ddqn_root_dir, 'x.npy'))

ddqn_max_r = np.load(os.path.join(ddqn_root_dir, 'max_r.npy'))
ddqn_min_r = np.load(os.path.join(ddqn_root_dir, 'min_r.npy'))
ddqn_mean_r = np.load(os.path.join(ddqn_root_dir, 'mean_r.npy'))

ddqn_max_s = np.load(os.path.join(ddqn_root_dir, 'max_s.npy'))
ddqn_min_s = np.load(os.path.join(ddqn_root_dir, 'min_s.npy'))
ddqn_mean_s = np.load(os.path.join(ddqn_root_dir, 'mean_s.npy'))

ddqn_max_t = np.load(os.path.join(ddqn_root_dir, 'max_t.npy'))
ddqn_min_t = np.load(os.path.join(ddqn_root_dir, 'min_t.npy'))
ddqn_mean_t = np.load(os.path.join(ddqn_root_dir, 'mean_t.npy'))

ddqn_max_sec = np.load(os.path.join(ddqn_root_dir, 'max_sec.npy'))
ddqn_min_sec = np.load(os.path.join(ddqn_root_dir, 'min_sec.npy'))
ddqn_mean_sec = np.load(os.path.join(ddqn_root_dir, 'mean_sec.npy'))

ddqn_max_rt = np.load(os.path.join(ddqn_root_dir, 'max_rt.npy'))
ddqn_min_rt = np.load(os.path.join(ddqn_root_dir, 'min_rt.npy'))
ddqn_mean_rt = np.load(os.path.join(ddqn_root_dir, 'mean_rt.npy'))
dueling_ddqn_max_t, dueling_ddqn_max_r, dueling_ddqn_max_s, \
dueling_ddqn_max_sec, dueling_ddqn_max_rt = np.max(dueling_ddqn_results, axis=0).T
dueling_ddqn_min_t, dueling_ddqn_min_r, dueling_ddqn_min_s, \
dueling_ddqn_min_sec, dueling_ddqn_min_rt = np.min(dueling_ddqn_results, axis=0).T
dueling_ddqn_mean_t, dueling_ddqn_mean_r, dueling_ddqn_mean_s, \
dueling_ddqn_mean_sec, dueling_ddqn_mean_rt = np.mean(dueling_ddqn_results, axis=0).T
dueling_ddqn_x = np.arange(np.max(
    (len(dueling_ddqn_mean_s), len(ddqn_mean_s))))
fig, axs = plt.subplots(5, 1, figsize=(15,30), sharey=False, sharex=True)

# DDQN
axs[0].plot(ddqn_max_r, 'g', linewidth=1)
axs[0].plot(ddqn_min_r, 'g', linewidth=1)
axs[0].plot(ddqn_mean_r, 'g-.', label='DDQN', linewidth=2)
axs[0].fill_between(ddqn_x, ddqn_min_r, ddqn_max_r, facecolor='g', alpha=0.3)

axs[1].plot(ddqn_max_s, 'g', linewidth=1)
axs[1].plot(ddqn_min_s, 'g', linewidth=1)
axs[1].plot(ddqn_mean_s, 'g-.', label='DDQN', linewidth=2)
axs[1].fill_between(ddqn_x, ddqn_min_s, ddqn_max_s, facecolor='g', alpha=0.3)

axs[2].plot(ddqn_max_t, 'g', linewidth=1)
axs[2].plot(ddqn_min_t, 'g', linewidth=1)
axs[2].plot(ddqn_mean_t, 'g-.', label='DDQN', linewidth=2)
axs[2].fill_between(ddqn_x, ddqn_min_t, ddqn_max_t, facecolor='g', alpha=0.3)

axs[3].plot(ddqn_max_sec, 'g', linewidth=1)
axs[3].plot(ddqn_min_sec, 'g', linewidth=1)
axs[3].plot(ddqn_mean_sec, 'g-.', label='DDQN', linewidth=2)
axs[3].fill_between(ddqn_x, ddqn_min_sec, ddqn_max_sec, facecolor='g', alpha=0.3)

axs[4].plot(ddqn_max_rt, 'g', linewidth=1)
axs[4].plot(ddqn_min_rt, 'g', linewidth=1)
axs[4].plot(ddqn_mean_rt, 'g-.', label='DDQN', linewidth=2)
axs[4].fill_between(ddqn_x, ddqn_min_rt, ddqn_max_rt, facecolor='g', alpha=0.3)

# Dueling DDQN
axs[0].plot(dueling_ddqn_max_r, 'r', linewidth=1)
axs[0].plot(dueling_ddqn_min_r, 'r', linewidth=1)
axs[0].plot(dueling_ddqn_mean_r, 'r:', label='Dueling DDQN', linewidth=2)
axs[0].fill_between(
    dueling_ddqn_x, dueling_ddqn_min_r, dueling_ddqn_max_r, facecolor='r', alpha=0.3)

axs[1].plot(dueling_ddqn_max_s, 'r', linewidth=1)
axs[1].plot(dueling_ddqn_min_s, 'r', linewidth=1)
axs[1].plot(dueling_ddqn_mean_s, 'r:', label='Dueling DDQN', linewidth=2)
axs[1].fill_between(
    dueling_ddqn_x, dueling_ddqn_min_s, dueling_ddqn_max_s, facecolor='r', alpha=0.3)

axs[2].plot(dueling_ddqn_max_t, 'r', linewidth=1)
axs[2].plot(dueling_ddqn_min_t, 'r', linewidth=1)
axs[2].plot(dueling_ddqn_mean_t, 'r:', label='Dueling DDQN', linewidth=2)
axs[2].fill_between(
    dueling_ddqn_x, dueling_ddqn_min_t, dueling_ddqn_max_t, facecolor='r', alpha=0.3)

axs[3].plot(dueling_ddqn_max_sec, 'r', linewidth=1)
axs[3].plot(dueling_ddqn_min_sec, 'r', linewidth=1)
axs[3].plot(dueling_ddqn_mean_sec, 'r:', label='Dueling DDQN', linewidth=2)
axs[3].fill_between(
    dueling_ddqn_x, dueling_ddqn_min_sec, dueling_ddqn_max_sec, facecolor='r', alpha=0.3)

axs[4].plot(dueling_ddqn_max_rt, 'r', linewidth=1)
axs[4].plot(dueling_ddqn_min_rt, 'r', linewidth=1)
axs[4].plot(dueling_ddqn_mean_rt, 'r:', label='Dueling DDQN', linewidth=2)
axs[4].fill_between(
    dueling_ddqn_x, dueling_ddqn_min_rt, dueling_ddqn_max_rt, facecolor='r', alpha=0.3)

# ALL
axs[0].set_title('Moving Avg Reward (Training)')
axs[1].set_title('Moving Avg Reward (Evaluation)')
axs[2].set_title('Total Steps')
axs[3].set_title('Training Time')
axs[4].set_title('Wall-clock Time')
plt.xlabel('Episodes')
axs[0].legend(loc='upper left')
plt.show()
dueling_ddqn_root_dir = os.path.join(RESULTS_DIR, 'dueling_ddqn')
not os.path.exists(dueling_ddqn_root_dir) and os.makedirs(dueling_ddqn_root_dir)

np.save(os.path.join(dueling_ddqn_root_dir, 'x'), dueling_ddqn_x)

np.save(os.path.join(dueling_ddqn_root_dir, 'max_r'), dueling_ddqn_max_r)
np.save(os.path.join(dueling_ddqn_root_dir, 'min_r'), dueling_ddqn_min_r)
np.save(os.path.join(dueling_ddqn_root_dir, 'mean_r'), dueling_ddqn_mean_r)

np.save(os.path.join(dueling_ddqn_root_dir, 'max_s'), dueling_ddqn_max_s)
np.save(os.path.join(dueling_ddqn_root_dir, 'min_s'), dueling_ddqn_min_s )
np.save(os.path.join(dueling_ddqn_root_dir, 'mean_s'), dueling_ddqn_mean_s)

np.save(os.path.join(dueling_ddqn_root_dir, 'max_t'), dueling_ddqn_max_t)
np.save(os.path.join(dueling_ddqn_root_dir, 'min_t'), dueling_ddqn_min_t)
np.save(os.path.join(dueling_ddqn_root_dir, 'mean_t'), dueling_ddqn_mean_t)

np.save(os.path.join(dueling_ddqn_root_dir, 'max_sec'), dueling_ddqn_max_sec)
np.save(os.path.join(dueling_ddqn_root_dir, 'min_sec'), dueling_ddqn_min_sec)
np.save(os.path.join(dueling_ddqn_root_dir, 'mean_sec'), dueling_ddqn_mean_sec)

np.save(os.path.join(dueling_ddqn_root_dir, 'max_rt'), dueling_ddqn_max_rt)
np.save(os.path.join(dueling_ddqn_root_dir, 'min_rt'), dueling_ddqn_min_rt)
np.save(os.path.join(dueling_ddqn_root_dir, 'mean_rt'), dueling_ddqn_mean_rt)
env = make_env_fn(**make_env_kargs, seed=123, monitor_mode='evaluation')
state = env.reset()
img = env.render(mode='rgb_array')
env.close()
del env
print(state)
[ 0.02078762 -0.01301236 -0.0209893  -0.03935255]
plt.imshow(img)
plt.axis('off')
plt.title("State s=" + str(np.round(state,2)))
plt.show()
q_values = dueling_ddqn_agents[best_dueling_ddqn_agent_key].online_model(state).detach().cpu().numpy()[0]
print(q_values)
[1850956.5 1843644. ]
q_s = q_values
v_s = q_values.mean()
a_s = q_values - q_values.mean()
plt.bar(('Left (idx=0)','Right (idx=1)'), q_s)
plt.xlabel('Action')
plt.ylabel('Estimate')
plt.title("Action-value function, Q(" + str(np.round(state,2)) + ")")
plt.show()
plt.bar('s='+str(np.round(state,2)), v_s, width=0.1)
plt.xlabel('State')
plt.ylabel('Estimate')
plt.title("State-value function, V("+str(np.round(state,2))+")")
plt.show()
plt.bar(('Left (idx=0)','Right (idx=1)'), a_s)
plt.xlabel('Action')
plt.ylabel('Estimate')
plt.title("Advantage function, (" + str(np.round(state,2)) + ")")
plt.show()
env = make_env_fn(**make_env_kargs, seed=123, monitor_mode='evaluation')

state, states, imgs, t = env.reset(), [], [], False
while not t:
    states.append(state)
    state, r, t, _ = env.step(0)
    imgs.append(env.render(mode='rgb_array'))

env.close()
del env
states[-2]
array([-0.09048686, -1.57504301,  0.13510693,  2.34025535])
plt.imshow(imgs[-2])
plt.axis('off')
plt.title("State s=" + str(np.round(state,2)))
plt.show()
q_values = dueling_ddqn_agents[best_dueling_ddqn_agent_key].online_model(state).detach().cpu().numpy()[0]
print(q_values)
[683447.06 838859.2 ]
q_s = q_values
v_s = q_values.mean()
a_s = q_values - q_values.mean()
plt.bar(('Left (idx=0)','Right (idx=1)'), q_s)
plt.xlabel('Action')
plt.ylabel('Estimate')
plt.title("Action-value function, Q(" + str(np.round(state,2)) + ")")
plt.show()
plt.bar('s='+str(np.round(state,2)), v_s, width=0.1)
plt.xlabel('State')
plt.ylabel('Estimate')
plt.title("State-value function, V("+str(np.round(state,2))+")")
plt.show()
plt.bar(('Left (idx=0)','Right (idx=1)'), a_s)
plt.xlabel('Action')
plt.ylabel('Estimate')
plt.title("Advantage function, (" + str(np.round(state,2)) + ")")
plt.show()
env = make_env_fn(**make_env_kargs, seed=123, monitor_mode='evaluation')

states = []
for agent in dueling_ddqn_agents.values():
    for episode in range(100):
        state, done = env.reset(), False
        while not done:
            states.append(state)
            action = agent.evaluation_strategy.select_action(agent.online_model, state)
            state, _, done, _ = env.step(action)
env.close()
del env

x = np.array(states)[:,0]
xd = np.array(states)[:,1]
a = np.array(states)[:,2]
ad = np.array(states)[:,3]
parts = plt.violinplot((x, xd, a, ad), 
                       vert=False, showmeans=False, showmedians=False, showextrema=False)

colors = ['red','green','yellow','blue']
for i, pc in enumerate(parts['bodies']):
    pc.set_facecolor(colors[i])
    pc.set_edgecolor(colors[i])
    pc.set_alpha(0.5)

plt.yticks(range(1,5), ["cart position", "cart velocity", "pole angle", "pole velocity"])
plt.yticks(rotation=45)
plt.title('Range of state-variable values for ' + str(
    dueling_ddqn_agents[best_dueling_ddqn_agent_key].__class__.__name__))

plt.show()

Prioritized Experience Replay (PER)
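Prioritized experience replay (Schaul et al., 2015) samples transitions in proportion to their absolute TD error. Each priority p_i = |delta_i| + EPS is raised to the power alpha (alpha=0 recovers uniform sampling), and importance-sampling weights w_i = (N * P(i))^-beta, normalized by their maximum, correct the bias this introduces; beta is annealed toward 1 over training. A small worked example of the proportional variant (a sketch, not from the book):

td_errors = np.array([2.0, 0.5, 0.1, 1.0])
alpha, beta = 0.6, 0.1
priorities = np.abs(td_errors) + EPS
probs = priorities**alpha / np.sum(priorities**alpha)
weights = (len(td_errors) * probs)**-beta
print(np.round(probs, 3))                    # sampling probabilities
print(np.round(weights / weights.max(), 3))  # normalized IS weights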

class PrioritizedReplayBuffer():
    def __init__(self, 
                 max_samples=10000, 
                 batch_size=64, 
                 rank_based=False,
                 alpha=0.6, 
                 beta0=0.1, 
                 beta_rate=0.99992):
        self.max_samples = max_samples
        self.memory = np.empty(shape=(self.max_samples, 2), dtype=np.ndarray)
        self.batch_size = batch_size
        self.n_entries = 0
        self.next_index = 0
        self.td_error_index = 0
        self.sample_index = 1
        self.rank_based = rank_based # if not rank_based, then proportional
        self.alpha = alpha # how much prioritization to use 0 is uniform (no priority), 1 is full priority
        self.beta = beta0 # bias correction 0 is no correction 1 is full correction
        self.beta0 = beta0 # beta0 is just beta's initial value
        self.beta_rate = beta_rate

    def update(self, idxs, td_errors):
        self.memory[idxs, self.td_error_index] = np.abs(td_errors)
        if self.rank_based:
            sorted_arg = self.memory[:self.n_entries, self.td_error_index].argsort()[::-1]
            self.memory[:self.n_entries] = self.memory[sorted_arg]

    def store(self, sample):
        priority = 1.0
        if self.n_entries > 0:
            priority = self.memory[
                :self.n_entries, 
                self.td_error_index].max()
        self.memory[self.next_index, 
                    self.td_error_index] = priority
        self.memory[self.next_index, 
                    self.sample_index] = np.array(sample)
        self.n_entries = min(self.n_entries + 1, self.max_samples)
        self.next_index += 1
        self.next_index = self.next_index % self.max_samples

    def _update_beta(self):
        self.beta = min(1.0, self.beta * self.beta_rate**-1)
        return self.beta

    def sample(self, batch_size=None):
        batch_size = self.batch_size if batch_size is None else batch_size
        self._update_beta()
        entries = self.memory[:self.n_entries]

        if self.rank_based:
            priorities = 1/(np.arange(self.n_entries) + 1)
        else: # proportional
            priorities = entries[:, self.td_error_index] + EPS
        scaled_priorities = priorities**self.alpha        
        probs = np.array(scaled_priorities/np.sum(scaled_priorities), dtype=np.float64)

        weights = (self.n_entries * probs)**-self.beta
        normalized_weights = weights/weights.max()
        idxs = np.random.choice(self.n_entries, batch_size, replace=False, p=probs)
        samples = np.array([entries[idx] for idx in idxs])
        
        samples_stacks = [np.vstack(batch_type) for batch_type in np.vstack(samples[:, self.sample_index]).T]
        idxs_stack = np.vstack(idxs)
        weights_stack = np.vstack(normalized_weights[idxs])
        return idxs_stack, weights_stack, samples_stacks

    def __len__(self):
        return self.n_entries
    
    def __repr__(self):
        return str(self.memory[:self.n_entries])
    
    def __str__(self):
        return str(self.memory[:self.n_entries])
b = PrioritizedReplayBuffer()
plt.plot([b._update_beta() for _ in range(100000)])
plt.title('PER Beta')
plt.xticks(rotation=45)
plt.show()
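Another quick usage sketch (not in the original notebook): fill the buffer with a few fake transitions, draw one prioritized batch, and feed updated absolute TD errors back in, mirroring what PER.optimize_model does below.

prb = PrioritizedReplayBuffer(max_samples=100, batch_size=4)
for _ in range(10):
    prb.store((np.random.rand(4), np.random.randint(2), 1.0, np.random.rand(4), 0.0))
idxs, weights, (s_b, a_b, r_b, p_b, d_b) = prb.sample()
prb.update(idxs, np.random.rand(len(idxs), 1))  # new |TD errors| for the sampled entries
print(idxs.shape, weights.shape, s_b.shape)     # expected: (4, 1) (4, 1) (4, 4)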
class PER():
    def __init__(self, 
                 replay_buffer_fn, 
                 value_model_fn, 
                 value_optimizer_fn, 
                 value_optimizer_lr,
                 max_gradient_norm,
                 training_strategy_fn,
                 evaluation_strategy_fn,
                 n_warmup_batches,
                 update_target_every_steps,
                 tau):
        self.replay_buffer_fn = replay_buffer_fn
        self.value_model_fn = value_model_fn
        self.value_optimizer_fn = value_optimizer_fn
        self.value_optimizer_lr = value_optimizer_lr
        self.max_gradient_norm = max_gradient_norm
        self.training_strategy_fn = training_strategy_fn
        self.evaluation_strategy_fn = evaluation_strategy_fn
        self.n_warmup_batches = n_warmup_batches
        self.update_target_every_steps = update_target_every_steps
        self.tau = tau

    def optimize_model(self, experiences):
        idxs, weights, \
        (states, actions, rewards, next_states, is_terminals) = experiences
        weights = self.online_model.numpy_float_to_device(weights)
        batch_size = len(is_terminals)
        
        argmax_a_q_sp = self.online_model(next_states).max(1)[1]
        q_sp = self.target_model(next_states).detach()
        max_a_q_sp = q_sp[
            np.arange(batch_size), argmax_a_q_sp].unsqueeze(1)
        target_q_sa = rewards + (self.gamma * max_a_q_sp * (1 - is_terminals))
        q_sa = self.online_model(states).gather(1, actions)

        td_error = q_sa - target_q_sa
        value_loss = (weights * td_error).pow(2).mul(0.5).mean()
        self.value_optimizer.zero_grad()
        value_loss.backward()        
        torch.nn.utils.clip_grad_norm_(self.online_model.parameters(), 
                                       self.max_gradient_norm)
        self.value_optimizer.step()

        priorities = np.abs(td_error.detach().cpu().numpy())
        self.replay_buffer.update(idxs, priorities)

    def interaction_step(self, state, env):
        action = self.training_strategy.select_action(self.online_model, state)
        new_state, reward, is_terminal, info = env.step(action)
        is_truncated = 'TimeLimit.truncated' in info and info['TimeLimit.truncated']
        is_failure = is_terminal and not is_truncated
        experience = (state, action, reward, new_state, float(is_failure))

        self.replay_buffer.store(experience)
        self.episode_reward[-1] += reward
        self.episode_timestep[-1] += 1
        self.episode_exploration[-1] += int(self.training_strategy.exploratory_action_taken)
        return new_state, is_terminal
    
    def update_network(self, tau=None):
        tau = self.tau if tau is None else tau
        for target, online in zip(self.target_model.parameters(), 
                                  self.online_model.parameters()):
            target_ratio = (1.0 - tau) * target.data
            online_ratio = tau * online.data
            mixed_weights = target_ratio + online_ratio
            target.data.copy_(mixed_weights)

    def train(self, make_env_fn, make_env_kargs, seed, gamma, 
              max_minutes, max_episodes, goal_mean_100_reward):
        training_start, last_debug_time = time.time(), float('-inf')

        self.checkpoint_dir = tempfile.mkdtemp()
        self.make_env_fn = make_env_fn
        self.make_env_kargs = make_env_kargs
        self.seed = seed
        self.gamma = gamma
        
        env = self.make_env_fn(**self.make_env_kargs, seed=self.seed)
        torch.manual_seed(self.seed) ; np.random.seed(self.seed) ; random.seed(self.seed)
    
        nS, nA = env.observation_space.shape[0], env.action_space.n
        self.episode_timestep = []
        self.episode_reward = []
        self.episode_seconds = []
        self.evaluation_scores = []        
        self.episode_exploration = []
        
        self.target_model = self.value_model_fn(nS, nA)
        self.online_model = self.value_model_fn(nS, nA)
        self.update_network(tau=1.0)

        self.value_optimizer = self.value_optimizer_fn(self.online_model, 
                                                       self.value_optimizer_lr)

        self.replay_buffer = self.replay_buffer_fn()
        self.training_strategy = self.training_strategy_fn()
        self.evaluation_strategy = self.evaluation_strategy_fn()
                    
        result = np.empty((max_episodes, 5))
        result[:] = np.nan
        training_time = 0
        for episode in range(1, max_episodes + 1):
            episode_start = time.time()
            
            state, is_terminal = env.reset(), False
            self.episode_reward.append(0.0)
            self.episode_timestep.append(0.0)
            self.episode_exploration.append(0.0)

            for step in count():
                state, is_terminal = self.interaction_step(state, env)
                
                min_samples = self.replay_buffer.batch_size * self.n_warmup_batches
                if len(self.replay_buffer) > min_samples:
                    experiences = self.replay_buffer.sample()
                    idxs, weights, samples = experiences
                    experiences = self.online_model.load(samples)
                    experiences = (idxs, weights) + (experiences,)
                    self.optimize_model(experiences)
                
                if np.sum(self.episode_timestep) % self.update_target_every_steps == 0:
                    self.update_network()

                if is_terminal:
                    gc.collect()
                    break
            
            # stats
            episode_elapsed = time.time() - episode_start
            self.episode_seconds.append(episode_elapsed)
            training_time += episode_elapsed
            evaluation_score, _ = self.evaluate(self.online_model, env)
            self.save_checkpoint(episode-1, self.online_model)

            total_step = int(np.sum(self.episode_timestep))
            self.evaluation_scores.append(evaluation_score)
            
            mean_10_reward = np.mean(self.episode_reward[-10:])
            std_10_reward = np.std(self.episode_reward[-10:])
            mean_100_reward = np.mean(self.episode_reward[-100:])
            std_100_reward = np.std(self.episode_reward[-100:])
            mean_100_eval_score = np.mean(self.evaluation_scores[-100:])
            std_100_eval_score = np.std(self.evaluation_scores[-100:])
            lst_100_exp_rat = np.array(
                self.episode_exploration[-100:])/np.array(self.episode_timestep[-100:])
            mean_100_exp_rat = np.mean(lst_100_exp_rat)
            std_100_exp_rat = np.std(lst_100_exp_rat)
            
            wallclock_elapsed = time.time() - training_start
            result[episode-1] = total_step, mean_100_reward, \
                mean_100_eval_score, training_time, wallclock_elapsed
            
            reached_debug_time = time.time() - last_debug_time >= LEAVE_PRINT_EVERY_N_SECS
            reached_max_minutes = wallclock_elapsed >= max_minutes * 60
            reached_max_episodes = episode >= max_episodes
            reached_goal_mean_reward = mean_100_eval_score >= goal_mean_100_reward
            training_is_over = reached_max_minutes or \
                               reached_max_episodes or \
                               reached_goal_mean_reward

            elapsed_str = time.strftime("%H:%M:%S", time.gmtime(time.time() - training_start))
            debug_message = 'el {}, ep {:04}, ts {:06}, '
            debug_message += 'ar 10 {:05.1f}\u00B1{:05.1f}, '
            debug_message += '100 {:05.1f}\u00B1{:05.1f}, '
            debug_message += 'ex 100 {:02.1f}\u00B1{:02.1f}, '
            debug_message += 'ev {:05.1f}\u00B1{:05.1f}'
            debug_message = debug_message.format(
                elapsed_str, episode-1, total_step, mean_10_reward, std_10_reward, 
                mean_100_reward, std_100_reward, mean_100_exp_rat, std_100_exp_rat,
                mean_100_eval_score, std_100_eval_score)
            print(debug_message, end='\r', flush=True)
            if reached_debug_time or training_is_over:
                print(ERASE_LINE + debug_message, flush=True)
                last_debug_time = time.time()
            if training_is_over:
                if reached_max_minutes: print(u'--> reached_max_minutes \u2715')
                if reached_max_episodes: print(u'--> reached_max_episodes \u2715')
                if reached_goal_mean_reward: print(u'--> reached_goal_mean_reward \u2713')
                break
                
        final_eval_score, score_std = self.evaluate(self.online_model, env, n_episodes=100)
        wallclock_time = time.time() - training_start
        print('Training complete.')
        print('Final evaluation score {:.2f}\u00B1{:.2f} in {:.2f}s training time,'
              ' {:.2f}s wall-clock time.\n'.format(
                  final_eval_score, score_std, training_time, wallclock_time))
        env.close() ; del env
        self.get_cleaned_checkpoints()
        return result, final_eval_score, training_time, wallclock_time
    
    def evaluate(self, eval_policy_model, eval_env, n_episodes=1):
        rs = []
        for _ in range(n_episodes):
            s, d = eval_env.reset(), False
            rs.append(0)
            for _ in count():
                a = self.evaluation_strategy.select_action(eval_policy_model, s)
                s, r, d, _ = eval_env.step(a)
                rs[-1] += r
                if d: break
        return np.mean(rs), np.std(rs)

    def get_cleaned_checkpoints(self, n_checkpoints=5):
        try: 
            return self.checkpoint_paths
        except AttributeError:
            self.checkpoint_paths = {}

        paths = glob.glob(os.path.join(self.checkpoint_dir, '*.tar'))
        paths_dic = {int(path.split('.')[-2]):path for path in paths}
        last_ep = max(paths_dic.keys())
        # checkpoint_idxs = np.geomspace(1, last_ep+1, n_checkpoints, endpoint=True, dtype=np.int)-1
        checkpoint_idxs = np.linspace(1, last_ep+1, n_checkpoints, endpoint=True, dtype=int)-1

        for idx, path in paths_dic.items():
            if idx in checkpoint_idxs:
                self.checkpoint_paths[idx] = path
            else:
                os.unlink(path)

        return self.checkpoint_paths

    def demo_last(self, title='Fully-trained {} Agent', n_episodes=3, max_n_videos=3):
        env = self.make_env_fn(**self.make_env_kargs, monitor_mode='evaluation', render=True, record=True)

        checkpoint_paths = self.get_cleaned_checkpoints()
        last_ep = max(checkpoint_paths.keys())
        self.online_model.load_state_dict(torch.load(checkpoint_paths[last_ep]))

        self.evaluate(self.online_model, env, n_episodes=n_episodes)
        env.close()
        data = get_gif_html(env_videos=env.videos, 
                            title=title.format(self.__class__.__name__),
                            max_n_videos=max_n_videos)
        del env
        return HTML(data=data)

    def demo_progression(self, title='{} Agent progression', max_n_videos=5):
        env = self.make_env_fn(**self.make_env_kargs, monitor_mode='evaluation', render=True, record=True)

        checkpoint_paths = self.get_cleaned_checkpoints()
        for i in sorted(checkpoint_paths.keys()):
            self.online_model.load_state_dict(torch.load(checkpoint_paths[i]))
            self.evaluate(self.online_model, env, n_episodes=1)

        env.close()
        data = get_gif_html(env_videos=env.videos, 
                            title=title.format(self.__class__.__name__),
                            subtitle_eps=sorted(checkpoint_paths.keys()),
                            max_n_videos=max_n_videos)
        del env
        return HTML(data=data)

    def save_checkpoint(self, episode_idx, model):
        torch.save(model.state_dict(), 
                   os.path.join(self.checkpoint_dir, 'model.{}.tar'.format(episode_idx)))
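
The checkpoint-thinning in get_cleaned_checkpoints keeps only n_checkpoints evenly spaced model files and deletes the rest; those surviving files are what demo_progression replays later. Below is a quick standalone check (not part of the agent class, with illustrative numbers) of which episode indices survive when the last saved checkpoint is episode 253, as in the progression demo further down:

import numpy as np

# Reproduce the index selection used by get_cleaned_checkpoints.
last_ep, n_checkpoints = 253, 5
checkpoint_idxs = np.linspace(1, last_ep + 1, n_checkpoints, endpoint=True, dtype=int) - 1
print(checkpoint_idxs)  # [  0  63 126 189 253], the episodes demo_progression replays
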
per_results = []
best_agent, best_eval_score = None, float('-inf')
for seed in SEEDS:
    environment_settings = {
        'env_name': 'CartPole-v1',
        'gamma': 1.00,
        'max_minutes': 30,
        'max_episodes': 10000,
        'goal_mean_100_reward': 475
    }

    value_model_fn = lambda nS, nA: FCDuelingQ(nS, nA, hidden_dims=(512,128))
    value_optimizer_fn = lambda net, lr: optim.RMSprop(net.parameters(), lr=lr)
    value_optimizer_lr = 0.0005
    max_gradient_norm = float('inf')

    training_strategy_fn = lambda: EGreedyExpStrategy(init_epsilon=1.0,  
                                                      min_epsilon=0.3, 
                                                      decay_steps=20000)
    evaluation_strategy_fn = lambda: GreedyStrategy()

    # replay_buffer_fn = lambda: ReplayBuffer(max_size=10000, batch_size=64)
    # replay_buffer_fn = lambda: PrioritizedReplayBuffer(
    #     max_samples=10000, batch_size=64, rank_based=True, 
    #     alpha=0.6, beta0=0.1, beta_rate=0.99995)
    replay_buffer_fn = lambda: PrioritizedReplayBuffer(
        max_samples=20000, batch_size=64, rank_based=False,
        alpha=0.6, beta0=0.1, beta_rate=0.99995)
    n_warmup_batches = 5
    update_target_every_steps = 1
    tau = 0.1

    env_name, gamma, max_minutes, \
    max_episodes, goal_mean_100_reward = environment_settings.values()
    agent = PER(replay_buffer_fn, 
                value_model_fn, 
                value_optimizer_fn, 
                value_optimizer_lr,
                max_gradient_norm,
                training_strategy_fn,
                evaluation_strategy_fn,
                n_warmup_batches,
                update_target_every_steps,
                tau)

    make_env_fn, make_env_kargs = get_make_env_fn(env_name=env_name)
    result, final_eval_score, training_time, wallclock_time = agent.train(
        make_env_fn, make_env_kargs, seed, gamma, max_minutes, max_episodes, goal_mean_100_reward)
    per_results.append(result)
    if final_eval_score > best_eval_score:
        best_eval_score = final_eval_score
        best_agent = agent
per_results = np.array(per_results)
el 00:00:00, ep 0000, ts 000016, ar 10 016.0±000.0, 100 016.0±000.0, ex 100 0.3±0.0, ev 009.0±000.0
el 00:01:00, ep 0114, ts 008195, ar 10 201.3±077.3, 100 077.5±074.4, ex 100 0.4±0.1, ev 249.5±092.0
el 00:02:01, ep 0147, ts 015589, ar 10 214.1±041.9, 100 139.9±098.4, ex 100 0.3±0.1, ev 274.4±088.5
el 00:03:02, ep 0172, ts 021758, ar 10 331.9±141.8, 100 188.4±107.7, ex 100 0.2±0.1, ev 312.0±108.1
el 00:04:02, ep 0185, ts 027603, ar 10 477.6±053.6, 100 237.3±133.0, ex 100 0.2±0.1, ev 345.2±119.8
el 00:05:04, ep 0197, ts 033460, ar 10 500.0±000.0, 100 284.9±142.2, ex 100 0.2±0.0, ev 367.5±124.7
el 00:06:09, ep 0210, ts 039626, ar 10 500.0±000.0, 100 321.6±149.6, ex 100 0.2±0.0, ev 389.6±126.9
el 00:07:12, ep 0222, ts 045626, ar 10 500.0±000.0, 100 356.6±151.1, ex 100 0.2±0.0, ev 409.6±125.3
el 00:08:12, ep 0234, ts 051599, ar 10 497.3±008.1, 100 386.5±148.6, ex 100 0.1±0.0, ev 435.8±112.1
el 00:09:14, ep 0250, ts 057696, ar 10 434.3±133.0, 100 415.7±143.2, ex 100 0.2±0.0, ev 472.2±083.6
el 00:09:29, ep 0253, ts 059187, ar 10 433.4±132.5, 100 422.4±141.5, ex 100 0.2±0.0, ev 476.0±080.4
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 535.72s training time, 589.28s wall-clock time.

el 00:00:00, ep 0000, ts 000034, ar 10 034.0±000.0, 100 034.0±000.0, ex 100 0.6±0.0, ev 008.0±000.0
el 00:01:02, ep 0122, ts 008486, ar 10 198.0±084.3, 100 079.3±070.4, ex 100 0.4±0.1, ev 247.9±093.5
el 00:02:02, ep 0158, ts 015616, ar 10 185.6±058.9, 100 135.0±088.8, ex 100 0.3±0.1, ev 255.4±091.9
el 00:03:03, ep 0186, ts 021644, ar 10 228.0±051.7, 100 179.8±082.1, ex 100 0.2±0.1, ev 270.1±098.8
el 00:04:06, ep 0199, ts 027497, ar 10 464.1±071.6, 100 228.1±114.9, ex 100 0.2±0.1, ev 303.3±115.3
el 00:05:08, ep 0213, ts 033219, ar 10 418.5±146.4, 100 265.1±136.0, ex 100 0.2±0.0, ev 338.4±128.6
el 00:06:08, ep 0249, ts 038526, ar 10 018.2±003.3, 100 246.4±181.8, ex 100 0.2±0.0, ev 309.2±191.7
el 00:07:09, ep 0313, ts 043396, ar 10 053.2±034.6, 100 101.8±163.0, ex 100 0.2±0.1, ev 240.6±238.8
el 00:08:09, ep 0524, ts 048111, ar 10 017.8±004.7, 100 018.3±004.4, ex 100 0.1±0.1, ev 019.1±003.1
el 00:09:09, ep 0701, ts 053099, ar 10 024.6±006.0, 100 027.2±017.9, ex 100 0.1±0.1, ev 030.8±024.7
el 00:10:09, ep 0931, ts 057875, ar 10 015.5±002.6, 100 015.6±003.3, ex 100 0.1±0.1, ev 015.0±002.0
el 00:11:09, ep 1171, ts 062577, ar 10 040.2±017.0, 100 024.2±009.0, ex 100 0.1±0.1, ev 027.0±011.4
el 00:12:09, ep 1303, ts 067680, ar 10 034.4±011.4, 100 041.0±020.5, ex 100 0.2±0.1, ev 054.0±016.7
el 00:13:09, ep 1573, ts 072256, ar 10 011.5±002.2, 100 011.8±002.6, ex 100 0.1±0.1, ev 010.3±001.4
el 00:14:09, ep 1940, ts 076402, ar 10 011.3±001.7, 100 011.4±002.7, ex 100 0.1±0.1, ev 009.4±000.7
el 00:15:09, ep 2308, ts 080536, ar 10 011.7±002.5, 100 011.2±002.1, ex 100 0.1±0.1, ev 009.3±000.8
el 00:16:09, ep 2675, ts 084680, ar 10 010.1±001.6, 100 011.4±002.6, ex 100 0.1±0.1, ev 009.3±000.7
el 00:17:09, ep 3045, ts 088810, ar 10 010.9±002.3, 100 011.2±002.1, ex 100 0.1±0.1, ev 009.3±000.8
el 00:18:10, ep 3382, ts 093083, ar 10 011.3±003.2, 100 015.8±011.8, ex 100 0.1±0.1, ev 013.6±008.3
el 00:19:10, ep 3741, ts 097242, ar 10 012.6±003.7, 100 011.7±002.8, ex 100 0.1±0.1, ev 009.4±000.7
el 00:20:10, ep 4106, ts 101354, ar 10 010.7±001.6, 100 011.4±002.3, ex 100 0.1±0.1, ev 009.5±000.7
el 00:21:10, ep 4439, ts 105573, ar 10 016.1±002.6, 100 014.2±003.0, ex 100 0.1±0.1, ev 012.7±001.5
el 00:22:10, ep 4573, ts 110637, ar 10 084.0±021.5, 100 045.4±027.2, ex 100 0.2±0.1, ev 052.0±049.0
el 00:23:10, ep 4745, ts 115531, ar 10 027.4±005.5, 100 027.3±008.9, ex 100 0.2±0.1, ev 025.9±005.4
el 00:24:11, ep 4856, ts 120737, ar 10 061.3±012.7, 100 048.9±016.5, ex 100 0.2±0.1, ev 051.9±019.3
el 00:25:11, ep 4917, ts 126095, ar 10 088.6±028.7, 100 077.0±030.3, ex 100 0.1±0.0, ev 084.7±034.7
el 00:26:11, ep 4981, ts 131018, ar 10 097.2±053.8, 100 083.2±037.1, ex 100 0.2±0.1, ev 103.6±039.7
el 00:27:12, ep 5043, ts 136095, ar 10 067.0±023.8, 100 079.6±035.5, ex 100 0.2±0.0, ev 090.9±036.6
el 00:28:12, ep 5113, ts 141154, ar 10 071.8±021.8, 100 073.7±019.7, ex 100 0.2±0.0, ev 078.1±014.0
el 00:29:17, ep 5160, ts 146794, ar 10 277.7±133.1, 100 093.7±077.0, ex 100 0.2±0.0, ev 107.9±091.7
el 00:30:02, ep 5170, ts 150923, ar 10 412.9±115.2, 100 127.0±127.6, ex 100 0.2±0.0, ev 142.8±138.0
--> reached_max_minutes ✕
Training complete.
Final evaluation score 467.15±82.72 in 1709.67s training time, 1820.96s wall-clock time.

el 00:00:00, ep 0000, ts 000012, ar 10 012.0±000.0, 100 012.0±000.0, ex 100 0.4±0.0, ev 009.0±000.0
el 00:01:01, ep 0122, ts 008475, ar 10 172.6±071.1, 100 079.5±075.3, ex 100 0.4±0.1, ev 199.4±102.0
el 00:02:01, ep 0154, ts 015513, ar 10 220.0±056.3, 100 138.7±103.9, ex 100 0.3±0.1, ev 258.6±094.6
el 00:03:01, ep 0175, ts 021061, ar 10 297.5±062.6, 100 186.7±104.0, ex 100 0.2±0.1, ev 282.1±090.5
el 00:04:02, ep 0197, ts 026345, ar 10 233.9±065.7, 100 219.9±091.3, ex 100 0.2±0.1, ev 301.6±084.4
el 00:05:06, ep 0213, ts 031711, ar 10 401.5±080.4, 100 247.4±098.5, ex 100 0.2±0.0, ev 324.6±095.0
el 00:06:07, ep 0225, ts 037142, ar 10 443.1±116.9, 100 283.5±114.6, ex 100 0.2±0.0, ev 345.8±107.3
el 00:07:10, ep 0241, ts 042511, ar 10 414.3±144.6, 100 300.6±126.7, ex 100 0.2±0.0, ev 373.6±116.9
el 00:08:10, ep 0254, ts 047785, ar 10 387.5±124.9, 100 322.7±132.8, ex 100 0.2±0.0, ev 398.0±118.3
el 00:09:12, ep 0267, ts 053628, ar 10 435.5±108.0, 100 350.6±135.8, ex 100 0.2±0.0, ev 421.4±112.3
el 00:10:15, ep 0280, ts 059545, ar 10 456.8±091.1, 100 372.7±138.6, ex 100 0.2±0.0, ev 449.5±096.9
el 00:11:10, ep 0293, ts 064692, ar 10 406.2±150.4, 100 394.2±138.7, ex 100 0.2±0.0, ev 475.4±071.7
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 632.94s training time, 690.34s wall-clock time.

el 00:00:00, ep 0000, ts 000012, ar 10 012.0±000.0, 100 012.0±000.0, ex 100 0.2±0.0, ev 011.0±000.0
el 00:01:00, ep 0131, ts 007898, ar 10 166.4±056.3, 100 071.7±058.6, ex 100 0.4±0.1, ev 212.9±086.7
el 00:02:01, ep 0174, ts 015106, ar 10 214.9±084.9, 100 127.0±074.3, ex 100 0.3±0.1, ev 214.6±078.4
el 00:03:03, ep 0195, ts 021079, ar 10 307.3±124.0, 100 174.3±102.1, ex 100 0.2±0.1, ev 259.2±117.2
el 00:04:03, ep 0207, ts 026571, ar 10 494.5±016.5, 100 219.2±136.3, ex 100 0.2±0.1, ev 291.1±137.0
el 00:05:06, ep 0219, ts 032133, ar 10 460.0±120.0, 100 261.9±152.7, ex 100 0.2±0.0, ev 326.6±146.2
el 00:06:10, ep 0232, ts 037842, ar 10 436.4±138.2, 100 298.6±161.9, ex 100 0.2±0.0, ev 361.5±144.8
el 00:07:13, ep 0244, ts 043290, ar 10 479.8±060.6, 100 337.3±159.9, ex 100 0.2±0.0, ev 396.6±137.7
el 00:08:17, ep 0256, ts 049040, ar 10 500.0±000.0, 100 374.6±153.1, ex 100 0.2±0.0, ev 429.5±121.0
el 00:09:21, ep 0268, ts 054925, ar 10 488.5±034.5, 100 412.8±135.5, ex 100 0.2±0.0, ev 463.3±091.1
el 00:09:44, ep 0272, ts 056925, ar 10 500.0±000.0, 100 422.8±130.7, ex 100 0.2±0.0, ev 475.1±074.2
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 551.45s training time, 604.56s wall-clock time.

el 00:00:00, ep 0000, ts 000039, ar 10 039.0±000.0, 100 039.0±000.0, ex 100 0.4±0.0, ev 038.0±000.0
el 00:01:00, ep 0121, ts 008058, ar 10 162.4±078.2, 100 075.3±065.9, ex 100 0.4±0.1, ev 216.7±089.3
el 00:02:04, ep 0162, ts 015542, ar 10 243.1±131.8, 100 135.6±089.3, ex 100 0.3±0.1, ev 219.5±083.3
el 00:03:08, ep 0182, ts 021733, ar 10 343.2±163.8, 100 188.8±117.0, ex 100 0.2±0.1, ev 274.5±127.8
el 00:04:12, ep 0198, ts 027564, ar 10 407.8±161.2, 100 229.7±139.8, ex 100 0.2±0.0, ev 317.1±146.5
el 00:05:15, ep 0211, ts 033310, ar 10 424.6±151.1, 100 268.8±157.8, ex 100 0.2±0.0, ev 345.7±155.8
el 00:06:16, ep 0225, ts 038808, ar 10 414.1±137.0, 100 301.9±163.2, ex 100 0.2±0.0, ev 390.4±146.7
el 00:07:19, ep 0237, ts 044502, ar 10 469.4±063.8, 100 340.3±161.9, ex 100 0.2±0.0, ev 430.2±123.4
el 00:08:19, ep 0250, ts 050042, ar 10 404.0±146.7, 100 372.1±156.6, ex 100 0.2±0.0, ev 465.9±092.4
el 00:08:35, ep 0253, ts 051542, ar 10 435.3±129.4, 100 382.8±152.7, ex 100 0.2±0.0, ev 475.2±078.2
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 483.03s training time, 535.03s wall-clock time.
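
Four of the five seeds reach the 475 moving-average goal in roughly eight to ten minutes; the second seed collapses after a strong start and runs into the 30-minute cap, although its final 100-episode evaluation still averages 467.15. The buffer driving these runs is the proportional variant (rank_based=False): absolute TD errors become sampling probabilities through the exponent alpha, and the induced bias is corrected with importance-sampling weights annealed by beta. Here is a minimal NumPy sketch of that arithmetic, using made-up TD errors and the alpha/beta0 values configured above; it is a standalone illustration, not the notebook's PrioritizedReplayBuffer class:

import numpy as np

# Hypothetical absolute TD errors for five stored transitions.
td_errors = np.array([0.1, 0.5, 2.0, 0.05, 1.0])
alpha, beta, eps = 0.6, 0.1, 1e-6                # alpha and beta0 as configured above

priorities = (np.abs(td_errors) + eps) ** alpha  # larger error -> larger priority
probs = priorities / priorities.sum()            # P(i): the sampling distribution

n = len(td_errors)
weights = (n * probs) ** -beta                   # importance-sampling corrections
weights /= weights.max()                         # normalize so the largest weight is 1
print(probs.round(3), weights.round(3))
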

best_agent.demo_progression()

[video output] PER Agent progression: Episodes 0, 63, 126, 189, 253

best_agent.demo_last()

[video output] Fully-trained PER Agent: Trials 0, 1, 2

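Each per-seed result array stacks one row per episode; judging from the unpacking order and the subplot titles below, the five columns are total steps, the 100-episode training-reward average, the 100-episode evaluation-score average, training seconds, and wall-clock seconds. Reducing over axis 0 and transposing therefore yields one across-seed curve per statistic. A shape check on a dummy stand-in with that assumed layout:

import numpy as np

# Dummy stand-in: (n_seeds, n_episodes, 5 statistics), same reduction as the next cell.
dummy = np.random.rand(5, 300, 5)
mean_t, mean_r, mean_s, mean_sec, mean_rt = np.mean(dummy, axis=0).T
print(mean_r.shape)  # (300,): one averaged value per episode
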
per_max_t, per_max_r, per_max_s, per_max_sec, per_max_rt = np.max(per_results, axis=0).T
per_min_t, per_min_r, per_min_s, per_min_sec, per_min_rt = np.min(per_results, axis=0).T
per_mean_t, per_mean_r, per_mean_s, per_mean_sec, per_mean_rt = np.mean(per_results, axis=0).T
per_x = np.arange(np.max(
    (len(per_mean_s), len(dueling_ddqn_mean_s))))
fig, axs = plt.subplots(5, 1, figsize=(15,30), sharey=False, sharex=True)

# Dueling DDQN
axs[0].plot(dueling_ddqn_max_r, 'r', linewidth=1)
axs[0].plot(dueling_ddqn_min_r, 'r', linewidth=1)
axs[0].plot(dueling_ddqn_mean_r, 'r:', label='Dueling DDQN', linewidth=2)
axs[0].fill_between(
    dueling_ddqn_x, dueling_ddqn_min_r, dueling_ddqn_max_r, facecolor='r', alpha=0.3)

axs[1].plot(dueling_ddqn_max_s, 'r', linewidth=1)
axs[1].plot(dueling_ddqn_min_s, 'r', linewidth=1)
axs[1].plot(dueling_ddqn_mean_s, 'r:', label='Dueling DDQN', linewidth=2)
axs[1].fill_between(
    dueling_ddqn_x, dueling_ddqn_min_s, dueling_ddqn_max_s, facecolor='r', alpha=0.3)

axs[2].plot(dueling_ddqn_max_t, 'r', linewidth=1)
axs[2].plot(dueling_ddqn_min_t, 'r', linewidth=1)
axs[2].plot(dueling_ddqn_mean_t, 'r:', label='Dueling DDQN', linewidth=2)
axs[2].fill_between(
    dueling_ddqn_x, dueling_ddqn_min_t, dueling_ddqn_max_t, facecolor='r', alpha=0.3)

axs[3].plot(dueling_ddqn_max_sec, 'r', linewidth=1)
axs[3].plot(dueling_ddqn_min_sec, 'r', linewidth=1)
axs[3].plot(dueling_ddqn_mean_sec, 'r:', label='Dueling DDQN', linewidth=2)
axs[3].fill_between(
    dueling_ddqn_x, dueling_ddqn_min_sec, dueling_ddqn_max_sec, facecolor='r', alpha=0.3)

axs[4].plot(dueling_ddqn_max_rt, 'r', linewidth=1)
axs[4].plot(dueling_ddqn_min_rt, 'r', linewidth=1)
axs[4].plot(dueling_ddqn_mean_rt, 'r:', label='Dueling DDQN', linewidth=2)
axs[4].fill_between(
    dueling_ddqn_x, dueling_ddqn_min_rt, dueling_ddqn_max_rt, facecolor='r', alpha=0.3)

# PER
axs[0].plot(per_max_r, 'k', linewidth=1)
axs[0].plot(per_min_r, 'k', linewidth=1)
axs[0].plot(per_mean_r, 'k', label='PER', linewidth=2)
axs[0].fill_between(per_x, per_min_r, per_max_r, facecolor='k', alpha=0.3)

axs[1].plot(per_max_s, 'k', linewidth=1)
axs[1].plot(per_min_s, 'k', linewidth=1)
axs[1].plot(per_mean_s, 'k', label='PER', linewidth=2)
axs[1].fill_between(per_x, per_min_s, per_max_s, facecolor='k', alpha=0.3)

axs[2].plot(per_max_t, 'k', linewidth=1)
axs[2].plot(per_min_t, 'k', linewidth=1)
axs[2].plot(per_mean_t, 'k', label='PER', linewidth=2)
axs[2].fill_between(per_x, per_min_t, per_max_t, facecolor='k', alpha=0.3)

axs[3].plot(per_max_sec, 'k', linewidth=1)
axs[3].plot(per_min_sec, 'k', linewidth=1)
axs[3].plot(per_mean_sec, 'k', label='PER', linewidth=2)
axs[3].fill_between(per_x, per_min_sec, per_max_sec, facecolor='k', alpha=0.3)

axs[4].plot(per_max_rt, 'k', linewidth=1)
axs[4].plot(per_min_rt, 'k', linewidth=1)
axs[4].plot(per_mean_rt, 'k', label='PER', linewidth=2)
axs[4].fill_between(per_x, per_min_rt, per_max_rt, facecolor='k', alpha=0.3)

# ALL
axs[0].set_title('Moving Avg Reward (Training)')
axs[1].set_title('Moving Avg Reward (Evaluation)')
axs[2].set_title('Total Steps')
axs[3].set_title('Training Time')
axs[4].set_title('Wall-clock Time')
plt.xlabel('Episodes')
axs[0].legend(loc='upper left')
plt.show()
per_root_dir = os.path.join(RESULTS_DIR, 'per')
not os.path.exists(per_root_dir) and os.makedirs(per_root_dir)

np.save(os.path.join(per_root_dir, 'x'), per_x)

np.save(os.path.join(per_root_dir, 'max_r'), per_max_r)
np.save(os.path.join(per_root_dir, 'min_r'), per_min_r)
np.save(os.path.join(per_root_dir, 'mean_r'), per_mean_r)

np.save(os.path.join(per_root_dir, 'max_s'), per_max_s)
np.save(os.path.join(per_root_dir, 'min_s'), per_min_s )
np.save(os.path.join(per_root_dir, 'mean_s'), per_mean_s)

np.save(os.path.join(per_root_dir, 'max_t'), per_max_t)
np.save(os.path.join(per_root_dir, 'min_t'), per_min_t)
np.save(os.path.join(per_root_dir, 'mean_t'), per_mean_t)

np.save(os.path.join(per_root_dir, 'max_sec'), per_max_sec)
np.save(os.path.join(per_root_dir, 'min_sec'), per_min_sec)
np.save(os.path.join(per_root_dir, 'mean_sec'), per_mean_sec)

np.save(os.path.join(per_root_dir, 'max_rt'), per_max_rt)
np.save(os.path.join(per_root_dir, 'min_rt'), per_min_rt)
np.save(os.path.join(per_root_dir, 'mean_rt'), per_mean_rt)
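
The saved curves can be reloaded in a later session (np.save appends the .npy extension) to overlay PER against the other agents without retraining. A minimal loading sketch, assuming the same RESULTS_DIR layout used above:

import os
import numpy as np

per_root_dir = os.path.join(RESULTS_DIR, 'per')  # RESULTS_DIR as defined at the top of the notebook
per_x = np.load(os.path.join(per_root_dir, 'x.npy'))
per_mean_s = np.load(os.path.join(per_root_dir, 'mean_s.npy'))
print(per_x.shape, per_mean_s.shape)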