Note: please install the packages below before running this notebook.

!pip install tqdm numpy scikit-learn pyglet setuptools && \
!pip install gym asciinema pandas tabulate tornado==5.* PyBullet && \
!pip install git+https://github.com/pybox2d/pybox2d#egg=Box2D && \
!pip install git+https://github.com/mimoralea/gym-bandits#egg=gym-bandits && \
!pip install git+https://github.com/mimoralea/gym-walk#egg=gym-walk && \
!pip install git+https://github.com/mimoralea/gym-aima#egg=gym-aima && \
!pip install gym[atari]

MC Control, SARSA, Q-learning, and Double Q-learning

import warnings ; warnings.filterwarnings('ignore')

import itertools
import gym, gym_walk, gym_aima
import numpy as np
from tabulate import tabulate
from pprint import pprint
from tqdm import tqdm_notebook as tqdm

from itertools import cycle, count

import random
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
SEEDS = (12, 34, 56, 78, 90)

%matplotlib inline
plt.style.use('fivethirtyeight')
params = {
    'figure.figsize': (15, 8),
    'font.size': 24,
    'legend.fontsize': 20,
    'axes.titlesize': 28,
    'axes.labelsize': 24,
    'xtick.labelsize': 20,
    'ytick.labelsize': 20
}
pylab.rcParams.update(params)
np.set_printoptions(suppress=True)

Helper functions needed to run the notebook

def value_iteration(P, gamma=1.0, theta=1e-10):
    V = np.zeros(len(P), dtype=np.float64)
    while True:
        Q = np.zeros((len(P), len(P[0])), dtype=np.float64)
        for s in range(len(P)):
            for a in range(len(P[s])):
                for prob, next_state, reward, done in P[s][a]:
                    Q[s][a] += prob * (reward + gamma * V[next_state] * (not done))
        if np.max(np.abs(V - np.max(Q, axis=1))) < theta:
            break
        V = np.max(Q, axis=1)
    pi = lambda s: {s:a for s, a in enumerate(np.argmax(Q, axis=1))}[s]
    return Q, V, pi
def print_policy(pi, P, action_symbols=('<', 'v', '>', '^'), n_cols=4, title='Policy:'):
    print(title)
    arrs = {k:v for k,v in enumerate(action_symbols)}
    for s in range(len(P)):
        a = pi(s)
        print("| ", end="")
        if np.all([done for action in P[s].values() for _, _, _, done in action]):
            print("".rjust(9), end=" ")
        else:
            print(str(s).zfill(2), arrs[a].rjust(6), end=" ")
        if (s + 1) % n_cols == 0: print("|")
def print_state_value_function(V, P, n_cols=4, prec=3, title='State-value function:'):
    print(title)
    for s in range(len(P)):
        v = V[s]
        print("| ", end="")
        if np.all([done for action in P[s].values() for _, _, _, done in action]):
            print("".rjust(9), end=" ")
        else:
            print(str(s).zfill(2), '{}'.format(np.round(v, prec)).rjust(6), end=" ")
        if (s + 1) % n_cols == 0: print("|")
def print_action_value_function(Q, 
                                optimal_Q=None, 
                                action_symbols=('<', '>'), 
                                prec=3, 
                                title='Action-value function:'):
    vf_types=('',) if optimal_Q is None else ('', '*', 'er')
    headers = ['s',] + [' '.join(i) for i in list(itertools.product(vf_types, action_symbols))]
    print(title)
    states = np.arange(len(Q))[..., np.newaxis]
    arr = np.hstack((states, np.round(Q, prec)))
    if not (optimal_Q is None):
        arr = np.hstack((arr, np.round(optimal_Q, prec), np.round(optimal_Q-Q, prec)))
    print(tabulate(arr, headers, tablefmt="fancy_grid"))
def get_policy_metrics(env, gamma, pi, goal_state, optimal_Q, 
                       n_episodes=100, max_steps=200):
    random.seed(123); np.random.seed(123) ; env.seed(123)
    reached_goal, episode_reward, episode_regret = [], [], []
    for _ in range(n_episodes):
        state, done, steps = env.reset(), False, 0
        episode_reward.append(0.0)
        episode_regret.append(0.0)
        while not done and steps < max_steps:
            action = pi(state)
            regret = np.max(optimal_Q[state]) - optimal_Q[state][action]
            episode_regret[-1] += regret
            
            state, reward, done, _ = env.step(action)
            episode_reward[-1] += (gamma**steps * reward)
            
            steps += 1

        reached_goal.append(state == goal_state)
    results = np.array((np.sum(reached_goal)/len(reached_goal)*100, 
                        np.mean(episode_reward), 
                        np.mean(episode_regret)))
    return results
def get_metrics_from_tracks(env, gamma, goal_state, optimal_Q, pi_track, coverage=0.1):
    total_samples = len(pi_track)
    n_samples = int(total_samples * coverage)
    samples_e = np.linspace(0, total_samples, n_samples, endpoint=True, dtype=int)
    metrics = []
    for e, pi in enumerate(tqdm(pi_track)):
        if e in samples_e:
            metrics.append(get_policy_metrics(
                env, 
                gamma=gamma, 
                pi=lambda s: pi[s], 
                goal_state=goal_state, 
                optimal_Q=optimal_Q))
        else:
            metrics.append(metrics[-1])
    metrics = np.array(metrics)
    success_rate_ma, mean_return_ma, mean_regret_ma = np.apply_along_axis(moving_average, axis=0, arr=metrics).T
    return success_rate_ma, mean_return_ma, mean_regret_ma 
def rmse(x, y, dp=4):
    return np.round(np.sqrt(np.mean((x - y)**2)), dp)
def moving_average(a, n=100):
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n
def plot_value_function(title, V_track, V_true=None, log=False, limit_value=0.05, limit_items=5):
    np.random.seed(123)
    per_col = 25
    linecycler = cycle(["-","--",":","-."])
    legends = []

    valid_values = np.argwhere(V_track[-1] > limit_value).squeeze()
    items_idxs = np.random.choice(valid_values, 
                                  min(len(valid_values), limit_items), 
                                  replace=False)
    # First, draw the true values as horizontal reference lines.
    if V_true is not None:
        for i, state in enumerate(V_track.T):
            if i not in items_idxs:
                continue
            if state[-1] < limit_value:
                continue

            label = 'v*({})'.format(i)
            plt.axhline(y=V_true[i], color='k', linestyle='-', linewidth=1)
            plt.text(int(len(V_track)*1.02), V_true[i]+.01, label)

    # Then plot the estimated values over episodes.
    for i, state in enumerate(V_track.T):
        if i not in items_idxs:
            continue
        if state[-1] < limit_value:
            continue
        line_type = next(linecycler)
        label = 'V({})'.format(i)
        p, = plt.plot(state, line_type, label=label, linewidth=3)
        legends.append(p)
        
    legends.reverse()

    ls = []
    for loc, idx in enumerate(range(0, len(legends), per_col)):
        subset = legends[idx:idx+per_col]
        l = plt.legend(subset, [p.get_label() for p in subset], 
                       loc='center right', bbox_to_anchor=(1.25, 0.5))
        ls.append(l)
    [plt.gca().add_artist(l) for l in ls[:-1]]
    if log: plt.xscale('log')
    plt.title(title)
    plt.ylabel('State-value function')
    plt.xlabel('Episodes (log scale)' if log else 'Episodes')
    plt.show()
def decay_schedule(init_value, min_value, decay_ratio, max_steps, log_start=-2, log_base=10):
    decay_steps = int(max_steps * decay_ratio)
    rem_steps = max_steps - decay_steps
    values = np.logspace(log_start, 0, decay_steps, base=log_base, endpoint=True)[::-1]
    values = (values - values.min()) / (values.max() - values.min())
    values = (init_value - min_value) * values + min_value
    values = np.pad(values, (0, rem_steps), 'edge')
    return values
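
A quick sanity check of decay_schedule (an illustrative cell added here, not part of the original notebook): the schedule starts at init_value, falls along a reversed log curve over the first decay_ratio fraction of the steps, and then stays flat at min_value.

# Hypothetical sanity check: a 10-step schedule decaying from 0.5 to 0.01 over the first 50% of steps.
example_schedule = decay_schedule(init_value=0.5, min_value=0.01, decay_ratio=0.5, max_steps=10)
print(np.round(example_schedule, 3))  # five decaying values followed by five values pinned at 0.01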

Slippery Walk Seven

env = gym.make('SlipperyWalkSeven-v0')
init_state = env.reset()
goal_state = 8
gamma = 0.99
n_episodes = 3000
P = env.env.P
n_cols, svf_prec, err_prec, avf_prec=9, 4, 2, 3
action_symbols=('<', '>')
limit_items, limit_value = 5, 0.0
cu_limit_items, cu_limit_value, cu_episodes = 10, 0.0, 100
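
To see the stochastic dynamics the agent faces, we can peek at the transition tuples for a single state-action pair; a minimal sketch, assuming the setup cell above has already run.

# Each P[s][a] is a list of (probability, next_state, reward, done) tuples.
# State 1, action 1 ('>' in this notebook's symbol ordering) as an example.
pprint(P[1][1])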

Alpha and epsilon scheduling

plt.plot(decay_schedule(0.5, 0.01, 0.5, n_episodes), 
         '-', linewidth=2, 
         label='Alpha schedule')
plt.plot(decay_schedule(1.0, 0.1, 0.9, n_episodes), 
         ':', linewidth=2, 
         label='Epsilon schedule')
plt.legend(loc=1, ncol=1)

plt.title('Alpha and epsilon schedules')
plt.xlabel('Episodes')
plt.ylabel('Hyperparameter values')
plt.xticks(rotation=45)

plt.show()
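
All four control methods below consume these schedules the same way: at episode e, alphas[e] is the learning rate and epsilons[e] drives epsilon-greedy action selection. A hypothetical standalone version of the rule that the select_action lambdas in the algorithms below implement:

def epsilon_greedy(Q, state, epsilon):
    # With probability epsilon explore uniformly at random, otherwise exploit the current estimates.
    if np.random.random() > epsilon:
        return np.argmax(Q[state])
    return np.random.randint(len(Q[state]))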

Optimal value function and policy

optimal_Q, optimal_V, optimal_pi = value_iteration(P, gamma=gamma)
print_state_value_function(optimal_V, P, n_cols=n_cols, prec=svf_prec, title='Optimal state-value function:')
print()

print_action_value_function(optimal_Q, 
                            None, 
                            action_symbols=action_symbols, 
                            prec=avf_prec, 
                            title='Optimal action-value function:')
print()
print_policy(optimal_pi, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_op, mean_return_op, mean_regret_op = get_policy_metrics(
    env, gamma=gamma, pi=optimal_pi, goal_state=goal_state, optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_op, mean_return_op, mean_regret_op))
Optimal state-value function:
|           | 01 0.5637 | 02  0.763 | 03 0.8449 | 04 0.8892 | 05  0.922 | 06 0.9515 | 07 0.9806 |           |

Optimal action-value function:
╒═════╤═══════╤═══════╕
│   s │     < │     > │
╞═════╪═══════╪═══════╡
│   0 │ 0     │ 0     │
├─────┼───────┼───────┤
│   1 │ 0.312 │ 0.564 │
├─────┼───────┼───────┤
│   2 │ 0.67  │ 0.763 │
├─────┼───────┼───────┤
│   3 │ 0.803 │ 0.845 │
├─────┼───────┼───────┤
│   4 │ 0.864 │ 0.889 │
├─────┼───────┼───────┤
│   5 │ 0.901 │ 0.922 │
├─────┼───────┼───────┤
│   6 │ 0.932 │ 0.952 │
├─────┼───────┼───────┤
│   7 │ 0.961 │ 0.981 │
├─────┼───────┼───────┤
│   8 │ 0     │ 0     │
╘═════╧═══════╧═══════╛

Policy:
|           | 01      > | 02      > | 03      > | 04      > | 05      > | 06      > | 07      > |           |
Reaches goal 96.00%. Obtains an average return of 0.8548. Regret of 0.0000

First-Visit Monte Carlo Control (FVMC)
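
FVMC updates each first-visited (state, action) pair toward the full discounted return observed from that visit onward. A toy example (illustrative numbers only) of the vectorized return computation that mc_control below uses:

# Rewards observed after the first visit, with gamma = 0.99.
rewards = np.array([0.0, 0.0, 1.0])
discounts = np.logspace(0, len(rewards), num=len(rewards), base=0.99, endpoint=False)  # [1.0, 0.99, 0.9801]
G = np.sum(discounts * rewards)
print(np.round(G, 4))  # 0.9801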

def generate_trajectory(select_action, Q, epsilon, env, max_steps=200):
    done, trajectory = False, []
    while not done:
        state = env.reset()
        for t in count():
            action = select_action(state, Q, epsilon)
            next_state, reward, done, _ = env.step(action)
            experience = (state, action, reward, next_state, done)
            trajectory.append(experience)
            if done:
                break
            if t >= max_steps - 1:
                trajectory = []
                break
            state = next_state
    return np.array(trajectory, dtype=object)
def mc_control(env,
               gamma=1.0,
               init_alpha=0.5,
               min_alpha=0.01,
               alpha_decay_ratio=0.5,
               init_epsilon=1.0,
               min_epsilon=0.1,
               epsilon_decay_ratio=0.9,
               n_episodes=3000,
               max_steps=200,
               first_visit=True):
    nS, nA = env.observation_space.n, env.action_space.n
    discounts = np.logspace(0, 
                            max_steps, 
                            num=max_steps, 
                            base=gamma, 
                            endpoint=False) 
    alphas = decay_schedule(init_alpha, 
                           min_alpha, 
                           alpha_decay_ratio, 
                           n_episodes)
    epsilons = decay_schedule(init_epsilon, 
                              min_epsilon, 
                              epsilon_decay_ratio, 
                              n_episodes)
    pi_track = []
    Q = np.zeros((nS, nA), dtype=np.float64)
    Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float64)
    select_action = lambda state, Q, epsilon: np.argmax(Q[state]) \
        if np.random.random() > epsilon \
        else np.random.randint(len(Q[state]))

    for e in tqdm(range(n_episodes), leave=False):
        
        trajectory = generate_trajectory(select_action,
                                         Q,
                                         epsilons[e],
                                         env, 
                                         max_steps)
        visited = np.zeros((nS, nA), dtype=bool)
        for t, (state, action, reward, _, _) in enumerate(trajectory):
            if visited[state][action] and first_visit:
                continue
            visited[state][action] = True
            
            n_steps = len(trajectory[t:])
            G = np.sum(discounts[:n_steps] * trajectory[t:, 2])
            Q[state][action] = Q[state][action] + alphas[e] * (G - Q[state][action])

        Q_track[e] = Q
        pi_track.append(np.argmax(Q, axis=1))

    V = np.max(Q, axis=1)
    pi = lambda s: {s:a for s, a in enumerate(np.argmax(Q, axis=1))}[s]
    return Q, V, pi, Q_track, pi_track
Q_mcs, V_mcs, Q_track_mcs = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed); np.random.seed(seed) ; env.seed(seed)
    Q_mc, V_mc, pi_mc, Q_track_mc, pi_track_mc = mc_control(env, gamma=gamma, n_episodes=n_episodes)
    Q_mcs.append(Q_mc) ; V_mcs.append(V_mc) ; Q_track_mcs.append(Q_track_mc)
Q_mc, V_mc, Q_track_mc = np.mean(Q_mcs, axis=0), np.mean(V_mcs, axis=0), np.mean(Q_track_mcs, axis=0)
del Q_mcs ; del V_mcs ; del Q_track_mcs
print_state_value_function(V_mc, P, n_cols=n_cols, 
                           prec=svf_prec, title='State-value function found by FVMC:')
print_state_value_function(optimal_V, P, n_cols=n_cols, 
                           prec=svf_prec, title='Optimal state-value function:')
print_state_value_function(V_mc - optimal_V, P, n_cols=n_cols, 
                           prec=err_prec, title='State-value function errors:')
print('State-value function RMSE: {}'.format(rmse(V_mc, optimal_V)))
print()
print_action_value_function(Q_mc, 
                            optimal_Q, 
                            action_symbols=action_symbols, 
                            prec=avf_prec, 
                            title='FVMC action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_mc, optimal_Q)))
print()
print_policy(pi_mc, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_mc, mean_return_mc, mean_regret_mc = get_policy_metrics(
    env, gamma=gamma, pi=pi_mc, goal_state=goal_state, optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_mc, mean_return_mc, mean_regret_mc))
State-value function found by FVMC:
|           | 01 0.4895 | 02 0.7209 | 03 0.8311 | 04 0.8766 | 05 0.9137 | 06 0.9463 | 07 0.9788 |           |
Optimal state-value function:
|           | 01 0.5637 | 02  0.763 | 03 0.8449 | 04 0.8892 | 05  0.922 | 06 0.9515 | 07 0.9806 |           |
State-value function errors:
|           | 01  -0.07 | 02  -0.04 | 03  -0.01 | 04  -0.01 | 05  -0.01 | 06  -0.01 | 07   -0.0 |           |
State-value function RMSE: 0.0293

FVMC action-value function:
╒═════╤═══════╤═══════╤═══════╤═══════╤════════╤════════╕
│   s │     < │     > │   * < │   * > │   er < │   er > │
╞═════╪═══════╪═══════╪═══════╪═══════╪════════╪════════╡
│   0 │ 0     │ 0     │ 0     │ 0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   1 │ 0.194 │ 0.489 │ 0.312 │ 0.564 │  0.118 │  0.074 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   2 │ 0.549 │ 0.721 │ 0.67  │ 0.763 │  0.121 │  0.042 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   3 │ 0.73  │ 0.831 │ 0.803 │ 0.845 │  0.073 │  0.014 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   4 │ 0.843 │ 0.877 │ 0.864 │ 0.889 │  0.021 │  0.013 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   5 │ 0.883 │ 0.914 │ 0.901 │ 0.922 │  0.019 │  0.008 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   6 │ 0.925 │ 0.946 │ 0.932 │ 0.952 │  0.007 │  0.005 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   7 │ 0.955 │ 0.979 │ 0.961 │ 0.981 │  0.006 │  0.002 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   8 │ 0     │ 0     │ 0     │ 0     │  0     │  0     │
╘═════╧═══════╧═══════╧═══════╧═══════╧════════╧════════╛
Action-value function RMSE: 0.0486

Policy:
|           | 01      > | 02      > | 03      > | 04      > | 05      > | 06      > | 07      > |           |
Reaches goal 96.00%. Obtains an average return of 0.8548. Regret of 0.0000

SARSA
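
SARSA is on-policy TD control: its target bootstraps on the action the behavior policy actually takes next, reward + gamma * Q[next_state][next_action]. A toy, hypothetical single update (numbers chosen only for illustration):

# Suppose Q[s][a] = 0.5, the next (state, action) pair is currently valued at 0.8,
# the transition reward is 0, gamma = 0.99, and alpha = 0.1.
q_sa, q_next_sa, reward, gamma_, alpha = 0.5, 0.8, 0.0, 0.99, 0.1
td_target = reward + gamma_ * q_next_sa  # about 0.792
td_error = td_target - q_sa              # about 0.292
q_sa = q_sa + alpha * td_error
print(round(q_sa, 4))                    # 0.5292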

def sarsa(env,
          gamma=1.0,
          init_alpha=0.5,
          min_alpha=0.01,
          alpha_decay_ratio=0.5,
          init_epsilon=1.0,
          min_epsilon=0.1,
          epsilon_decay_ratio=0.9,
          n_episodes=3000):
    nS, nA = env.observation_space.n, env.action_space.n
    pi_track = []
    Q = np.zeros((nS, nA), dtype=np.float64)
    Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float64)
    select_action = lambda state, Q, epsilon: np.argmax(Q[state]) \
        if np.random.random() > epsilon \
        else np.random.randint(len(Q[state]))
    alphas = decay_schedule(init_alpha, 
                           min_alpha, 
                           alpha_decay_ratio, 
                           n_episodes)
    epsilons = decay_schedule(init_epsilon, 
                              min_epsilon, 
                              epsilon_decay_ratio, 
                              n_episodes)
    
    for e in tqdm(range(n_episodes), leave=False):
        state, done = env.reset(), False
        action = select_action(state, Q, epsilons[e])
        while not done:
            next_state, reward, done, _ = env.step(action)
            next_action = select_action(next_state, Q, epsilons[e])
            td_target = reward + gamma * Q[next_state][next_action] * (not done)
            td_error = td_target - Q[state][action]
            Q[state][action] = Q[state][action] + alphas[e] * td_error
            state, action = next_state, next_action
        Q_track[e] = Q
        pi_track.append(np.argmax(Q, axis=1))

    V = np.max(Q, axis=1)
    pi = lambda s: {s:a for s, a in enumerate(np.argmax(Q, axis=1))}[s]
    return Q, V, pi, Q_track, pi_track
Q_sarsas, V_sarsas, Q_track_sarsas = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed); np.random.seed(seed) ; env.seed(seed)
    Q_sarsa, V_sarsa, pi_sarsa, Q_track_sarsa, pi_track_sarsa = sarsa(env, gamma=gamma, n_episodes=n_episodes)
    Q_sarsas.append(Q_sarsa) ; V_sarsas.append(V_sarsa) ; Q_track_sarsas.append(Q_track_sarsa)
Q_sarsa = np.mean(Q_sarsas, axis=0)
V_sarsa = np.mean(V_sarsas, axis=0)
Q_track_sarsa = np.mean(Q_track_sarsas, axis=0)
del Q_sarsas ; del V_sarsas ; del Q_track_sarsas
print_state_value_function(V_sarsa, P, n_cols=n_cols, 
                           prec=svf_prec, title='State-value function found by Sarsa:')
print_state_value_function(optimal_V, P, n_cols=n_cols, 
                           prec=svf_prec, title='Optimal state-value function:')
print_state_value_function(V_sarsa - optimal_V, P, n_cols=n_cols, 
                           prec=err_prec, title='State-value function errors:')
print('State-value function RMSE: {}'.format(rmse(V_sarsa, optimal_V)))
print()
print_action_value_function(Q_sarsa, 
                            optimal_Q, 
                            action_symbols=action_symbols, 
                            prec=avf_prec, 
                            title='Sarsa action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_sarsa, optimal_Q)))
print()
print_policy(pi_sarsa, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_sarsa, mean_return_sarsa, mean_regret_sarsa = get_policy_metrics(
    env, gamma=gamma, pi=pi_sarsa, goal_state=goal_state, optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_sarsa, mean_return_sarsa, mean_regret_sarsa))
State-value function found by Sarsa:
|           | 01  0.461 | 02 0.6868 | 03  0.797 | 04  0.863 | 05 0.9075 | 06 0.9461 | 07 0.9767 |           |
Optimal state-value function:
|           | 01 0.5637 | 02  0.763 | 03 0.8449 | 04 0.8892 | 05  0.922 | 06 0.9515 | 07 0.9806 |           |
State-value function errors:
|           | 01   -0.1 | 02  -0.08 | 03  -0.05 | 04  -0.03 | 05  -0.01 | 06  -0.01 | 07   -0.0 |           |
State-value function RMSE: 0.0467

Sarsa action-value function:
╒═════╤═══════╤═══════╤═══════╤═══════╤════════╤════════╕
│   s │     < │     > │   * < │   * > │   er < │   er > │
╞═════╪═══════╪═══════╪═══════╪═══════╪════════╪════════╡
│   0 │ 0     │ 0     │ 0     │ 0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   1 │ 0.163 │ 0.461 │ 0.312 │ 0.564 │  0.149 │  0.103 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   2 │ 0.5   │ 0.687 │ 0.67  │ 0.763 │  0.17  │  0.076 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   3 │ 0.7   │ 0.797 │ 0.803 │ 0.845 │  0.103 │  0.048 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   4 │ 0.817 │ 0.863 │ 0.864 │ 0.889 │  0.047 │  0.026 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   5 │ 0.874 │ 0.908 │ 0.901 │ 0.922 │  0.028 │  0.014 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   6 │ 0.917 │ 0.946 │ 0.932 │ 0.952 │  0.016 │  0.005 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   7 │ 0.951 │ 0.977 │ 0.961 │ 0.981 │  0.01  │  0.004 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   8 │ 0     │ 0     │ 0     │ 0     │  0     │  0     │
╘═════╧═══════╧═══════╧═══════╧═══════╧════════╧════════╛
Action-value function RMSE: 0.0687

Policy:
|           | 01      > | 02      > | 03      > | 04      > | 05      > | 06      > | 07      > |           |
Reaches goal 96.00%. Obtains an average return of 0.8548. Regret of 0.0000

Q-learning
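
Q-learning is off-policy: the target bootstraps on the greedy value max_a Q[next_state][a], no matter which action the exploratory policy takes next. A hypothetical one-step comparison with SARSA's target (toy numbers only):

# The next state has two action values; the behavior policy happened to pick the worse one.
q_next = [0.8, 0.6]
reward, gamma_ = 0.0, 0.99
sarsa_target = reward + gamma_ * q_next[1]         # bootstraps on the taken action: about 0.594
q_learning_target = reward + gamma_ * max(q_next)  # bootstraps on the greedy action: about 0.792
print(round(sarsa_target, 3), round(q_learning_target, 3))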

def q_learning(env, 
               gamma=1.0,
               init_alpha=0.5,
               min_alpha=0.01,
               alpha_decay_ratio=0.5,
               init_epsilon=1.0,
               min_epsilon=0.1,
               epsilon_decay_ratio=0.9,
               n_episodes=3000):
    nS, nA = env.observation_space.n, env.action_space.n
    pi_track = []
    Q = np.zeros((nS, nA), dtype=np.float64)
    Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float64)
    select_action = lambda state, Q, epsilon: np.argmax(Q[state]) \
        if np.random.random() > epsilon \
        else np.random.randint(len(Q[state]))
    alphas = decay_schedule(init_alpha, 
                           min_alpha, 
                           alpha_decay_ratio, 
                           n_episodes)
    epsilons = decay_schedule(init_epsilon, 
                              min_epsilon, 
                              epsilon_decay_ratio, 
                              n_episodes)
    for e in tqdm(range(n_episodes), leave=False):
        state, done = env.reset(), False
        while not done:
            action = select_action(state, Q, epsilons[e])
            next_state, reward, done, _ = env.step(action)
            td_target = reward + gamma * Q[next_state].max() * (not done)
            td_error = td_target - Q[state][action]
            Q[state][action] = Q[state][action] + alphas[e] * td_error
            state = next_state

        Q_track[e] = Q
        pi_track.append(np.argmax(Q, axis=1))

    V = np.max(Q, axis=1)        
    pi = lambda s: {s:a for s, a in enumerate(np.argmax(Q, axis=1))}[s]
    return Q, V, pi, Q_track, pi_track
Q_qls, V_qls, Q_track_qls = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed); np.random.seed(seed) ; env.seed(seed)
    Q_ql, V_ql, pi_ql, Q_track_ql, pi_track_ql = q_learning(env, gamma=gamma, n_episodes=n_episodes)
    Q_qls.append(Q_ql) ; V_qls.append(V_ql) ; Q_track_qls.append(Q_track_ql)
Q_ql = np.mean(Q_qls, axis=0)
V_ql = np.mean(V_qls, axis=0)
Q_track_ql = np.mean(Q_track_qls, axis=0)
del Q_qls ; del V_qls ; del Q_track_qls
print_state_value_function(V_ql, P, n_cols=n_cols, 
                           prec=svf_prec, title='State-value function found by Q-learning:')
print_state_value_function(optimal_V, P, n_cols=n_cols, 
                           prec=svf_prec, title='Optimal state-value function:')
print_state_value_function(V_ql - optimal_V, P, n_cols=n_cols, 
                           prec=err_prec, title='State-value function errors:')
print('State-value function RMSE: {}'.format(rmse(V_ql, optimal_V)))
print()
print_action_value_function(Q_ql, 
                            optimal_Q, 
                            action_symbols=action_symbols, 
                            prec=avf_prec, 
                            title='Q-learning action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_ql, optimal_Q)))
print()
print_policy(pi_ql, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_ql, mean_return_ql, mean_regret_ql = get_policy_metrics(
    env, gamma=gamma, pi=pi_ql, goal_state=goal_state, optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_ql, mean_return_ql, mean_regret_ql))
State-value function found by Q-learning:
|           | 01 0.5523 | 02  0.754 | 03 0.8432 | 04 0.8893 | 05 0.9215 | 06 0.9509 | 07   0.98 |           |
Optimal state-value function:
|           | 01 0.5637 | 02  0.763 | 03 0.8449 | 04 0.8892 | 05  0.922 | 06 0.9515 | 07 0.9806 |           |
State-value function errors:
|           | 01  -0.01 | 02  -0.01 | 03   -0.0 | 04    0.0 | 05   -0.0 | 06   -0.0 | 07   -0.0 |           |
State-value function RMSE: 0.0049

Q-learning action-value function:
╒═════╤═══════╤═══════╤═══════╤═══════╤════════╤════════╕
│   s │     < │     > │   * < │   * > │   er < │   er > │
╞═════╪═══════╪═══════╪═══════╪═══════╪════════╪════════╡
│   0 │ 0     │ 0     │ 0     │ 0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   1 │ 0.303 │ 0.552 │ 0.312 │ 0.564 │  0.009 │  0.011 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   2 │ 0.659 │ 0.754 │ 0.67  │ 0.763 │  0.011 │  0.009 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   3 │ 0.795 │ 0.843 │ 0.803 │ 0.845 │  0.008 │  0.002 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   4 │ 0.864 │ 0.889 │ 0.864 │ 0.889 │ -0.001 │ -0     │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   5 │ 0.901 │ 0.922 │ 0.901 │ 0.922 │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   6 │ 0.932 │ 0.951 │ 0.932 │ 0.952 │  0     │  0.001 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   7 │ 0.961 │ 0.98  │ 0.961 │ 0.981 │ -0     │  0.001 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   8 │ 0     │ 0     │ 0     │ 0     │  0     │  0     │
╘═════╧═══════╧═══════╧═══════╧═══════╧════════╧════════╛
Action-value function RMSE: 0.0052

Policy:
|           | 01      > | 02      > | 03      > | 04      > | 05      > | 06      > | 07      > |           |
Reaches goal 96.00%. Obtains an average return of 0.8548. Regret of 0.0000

Double Q-learning
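
Double Q-learning keeps two estimates and decouples action selection from evaluation: one table picks the argmax action for the next state, the other supplies that action's value, which reduces the maximization bias of plain Q-learning. A toy, hypothetical illustration of the cross-estimator target (numbers for illustration only):

# Q1 happens to overestimate action 0 in the next state; Q2 gives it a more sober value.
Q1_next, Q2_next = np.array([0.9, 0.4]), np.array([0.7, 0.5])
reward, gamma_ = 0.0, 0.99
a_star = np.argmax(Q1_next)                    # Q1 selects the action (index 0)
td_target = reward + gamma_ * Q2_next[a_star]  # Q2 evaluates it: about 0.693 rather than 0.891
print(round(td_target, 3))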

def double_q_learning(env,
                      gamma=1.0,
                      init_alpha=0.5,
                      min_alpha=0.01,
                      alpha_decay_ratio=0.5,
                      init_epsilon=1.0,
                      min_epsilon=0.1,
                      epsilon_decay_ratio=0.9,
                      n_episodes=3000):
    nS, nA = env.observation_space.n, env.action_space.n
    pi_track = []
    Q1 = np.zeros((nS, nA), dtype=np.float64)
    Q2 = np.zeros((nS, nA), dtype=np.float64)
    Q_track1 = np.zeros((n_episodes, nS, nA), dtype=np.float64)
    Q_track2 = np.zeros((n_episodes, nS, nA), dtype=np.float64)
    select_action = lambda state, Q, epsilon: np.argmax(Q[state]) \
        if np.random.random() > epsilon \
        else np.random.randint(len(Q[state]))
    alphas = decay_schedule(init_alpha, 
                           min_alpha, 
                           alpha_decay_ratio, 
                           n_episodes)
    epsilons = decay_schedule(init_epsilon, 
                              min_epsilon, 
                              epsilon_decay_ratio, 
                              n_episodes)
    for e in tqdm(range(n_episodes), leave=False):
        state, done = env.reset(), False
        while not done:
            action = select_action(state, (Q1 + Q2)/2, epsilons[e])
            next_state, reward, done, _ = env.step(action)

            if np.random.randint(2):
                argmax_Q1 = np.argmax(Q1[next_state])
                td_target = reward + gamma * Q2[next_state][argmax_Q1] * (not done)
                td_error = td_target - Q1[state][action]
                Q1[state][action] = Q1[state][action] + alphas[e] * td_error
            else:
                argmax_Q2 = np.argmax(Q2[next_state])
                td_target = reward + gamma * Q1[next_state][argmax_Q2] * (not done)
                td_error = td_target - Q2[state][action]
                Q2[state][action] = Q2[state][action] + alphas[e] * td_error
            state = next_state

        Q_track1[e] = Q1
        Q_track2[e] = Q2        
        pi_track.append(np.argmax((Q1 + Q2)/2, axis=1))

    Q = (Q1 + Q2)/2.
    V = np.max(Q, axis=1)    
    pi = lambda s: {s:a for s, a in enumerate(np.argmax(Q, axis=1))}[s]
    return Q, V, pi, (Q_track1 + Q_track2)/2., pi_track
Q_dqls, V_dqls, Q_track_dqls = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed); np.random.seed(seed) ; env.seed(seed)
    Q_dql, V_dql, pi_dql, Q_track_dql, pi_track_dql = double_q_learning(env, gamma=gamma, n_episodes=n_episodes)
    Q_dqls.append(Q_dql) ; V_dqls.append(V_dql) ; Q_track_dqls.append(Q_track_dql)
Q_dql, V_dql, Q_track_dql = np.mean(Q_dqls, axis=0), np.mean(V_dqls, axis=0), np.mean(Q_track_dqls, axis=0)
del Q_dqls ; del V_dqls ; del Q_track_dqls
print_state_value_function(V_dql, P, n_cols=n_cols, 
                           prec=svf_prec, title='State-value function found by Double Q-Learning:')
print_state_value_function(optimal_V, P, n_cols=n_cols, 
                           prec=svf_prec, title='Optimal state-value function:')
print_state_value_function(V_dql - optimal_V, P, n_cols=n_cols, 
                           prec=err_prec, title='State-value function errors:')
print('State-value function RMSE: {}'.format(rmse(V_dql, optimal_V)))
print()
print_action_value_function(Q_dql, 
                            optimal_Q, 
                            action_symbols=action_symbols, 
                            prec=avf_prec, 
                            title='Double Q-Learning action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_dql, optimal_Q)))
print()
print_policy(pi_dql, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_dql, mean_return_dql, mean_regret_dql = get_policy_metrics(
    env, gamma=gamma, pi=pi_dql, goal_state=goal_state, optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_dql, mean_return_dql, mean_regret_dql))
State-value function found by Double Q-Learning:
|           | 01  0.576 | 02 0.7688 | 03 0.8467 | 04 0.8896 | 05 0.9221 | 06 0.9515 | 07 0.9804 |           |
Optimal state-value function:
|           | 01 0.5637 | 02  0.763 | 03 0.8449 | 04 0.8892 | 05  0.922 | 06 0.9515 | 07 0.9806 |           |
State-value function errors:
|           | 01   0.01 | 02   0.01 | 03    0.0 | 04    0.0 | 05    0.0 | 06   -0.0 | 07   -0.0 |           |
State-value function RMSE: 0.0046

Double Q-Learning action-value function:
╒═════╤═══════╤═══════╤═══════╤═══════╤════════╤════════╕
│   s │     < │     > │   * < │   * > │   er < │   er > │
╞═════╪═══════╪═══════╪═══════╪═══════╪════════╪════════╡
│   0 │ 0     │ 0     │ 0     │ 0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   1 │ 0.292 │ 0.576 │ 0.312 │ 0.564 │  0.02  │ -0.012 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   2 │ 0.692 │ 0.769 │ 0.67  │ 0.763 │ -0.021 │ -0.006 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   3 │ 0.811 │ 0.847 │ 0.803 │ 0.845 │ -0.007 │ -0.002 │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   4 │ 0.866 │ 0.89  │ 0.864 │ 0.889 │ -0.002 │ -0     │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   5 │ 0.903 │ 0.922 │ 0.901 │ 0.922 │ -0.001 │ -0     │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   6 │ 0.933 │ 0.951 │ 0.932 │ 0.952 │ -0.001 │  0     │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   7 │ 0.963 │ 0.98  │ 0.961 │ 0.981 │ -0.001 │  0     │
├─────┼───────┼───────┼───────┼───────┼────────┼────────┤
│   8 │ 0     │ 0     │ 0     │ 0     │  0     │  0     │
╘═════╧═══════╧═══════╧═══════╧═══════╧════════╧════════╛
Action-value function RMSE: 0.0078

Policy:
|           | 01      > | 02      > | 03      > | 04      > | 05      > | 06      > | 07      > |           |
Reaches goal 96.00%. Obtains an average return of 0.8548. Regret of 0.0000

Comparing max(Q) per episode

First-Visit Monte Carlo

plot_value_function(
    'FVMC estimates through time vs. true values', 
    np.max(Q_track_mc, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=False)
plot_value_function(
    'FVMC estimates through time vs. true values (log scale)', 
    np.max(Q_track_mc, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=True)
plot_value_function(
    'FVMC estimates through time (close up)', 
    np.max(Q_track_mc, axis=2)[:cu_episodes], 
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False)

SARSA

plot_value_function(
    'Sarsa estimates through time vs. true values', 
    np.max(Q_track_sarsa, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=False)
plot_value_function(
    'Sarsa estimates through time vs. true values (log scale)', 
    np.max(Q_track_sarsa, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=True)
plot_value_function(
    'Sarsa estimates through time (close up)', 
    np.max(Q_track_sarsa, axis=2)[:cu_episodes], 
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False)

Q-learning

plot_value_function(
    'Q-Learning estimates through time vs. true values', 
    np.max(Q_track_ql, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=False)
plot_value_function(
    'Q-Learning estimates through time vs. true values (log scale)', 
    np.max(Q_track_ql, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=True)
plot_value_function(
    'Q-Learning estimates through time (close up)', 
    np.max(Q_track_ql, axis=2)[:cu_episodes], 
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False)

Double Q-learning

plot_value_function(
    'Double Q-Learning estimates through time vs. true values', 
    np.max(Q_track_dql, axis=2), 
    optimal_V,
    limit_items=limit_items,
    limit_value=limit_value,
    log=False)
plot_value_function(
    'Double Q-Learning estimates through time vs. true values (log scale)', 
    np.max(Q_track_dql, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=True)
plot_value_function(
    'Double Q-Learning estimates through time (close up)', 
    np.max(Q_track_dql, axis=2)[:cu_episodes], 
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False)

Policy evaluation comparison

mc_success_rate_ma, mc_mean_return_ma, mc_mean_regret_ma = get_metrics_from_tracks(
    env, gamma, goal_state, optimal_Q, pi_track_mc)
sarsa_success_rate_ma, sarsa_mean_return_ma, sarsa_mean_regret_ma = get_metrics_from_tracks(
    env, gamma, goal_state, optimal_Q, pi_track_sarsa)
ql_success_rate_ma, ql_mean_return_ma, ql_mean_regret_ma = get_metrics_from_tracks(
    env, gamma, goal_state, optimal_Q, pi_track_ql)
dql_success_rate_ma, dql_mean_return_ma, dql_mean_regret_ma = get_metrics_from_tracks(
    env, gamma, goal_state, optimal_Q, pi_track_dql)
plt.axhline(y=success_rate_op, color='k', linestyle='-', linewidth=1)
plt.text(int(len(mc_success_rate_ma)*1.02), success_rate_op*1.01, 'π*')

plt.plot(mc_success_rate_ma, '-', linewidth=2, label='FVMC')
plt.plot(sarsa_success_rate_ma, '--', linewidth=2, label='Sarsa')
plt.plot(ql_success_rate_ma, ':', linewidth=2, label='Q-Learning')
plt.plot(dql_success_rate_ma, '-.', linewidth=2, label='Double Q-Learning')
plt.legend(loc=4, ncol=1)

plt.title('Policy success rate (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Success rate %')
plt.ylim(-1, 101)
plt.xticks(rotation=45)

plt.show()
plt.axhline(y=mean_return_op, color='k', linestyle='-', linewidth=1)
plt.text(int(len(mc_mean_return_ma)*1.02), mean_return_op*1.01, 'π*')

plt.plot(mc_mean_return_ma, '-', linewidth=2, label='FVMC')
plt.plot(sarsa_mean_return_ma, '--', linewidth=2, label='Sarsa')
plt.plot(ql_mean_return_ma, ':', linewidth=2, label='Q-Learning')
plt.plot(dql_mean_return_ma, '-.', linewidth=2, label='Double Q-Learning')
plt.legend(loc=4, ncol=1)

plt.title('Policy episode return (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Return (Gt:T)')

plt.xticks(rotation=45)

plt.show()
plt.plot(mc_mean_regret_ma, '-', linewidth=2, label='FVMC')
plt.plot(sarsa_mean_regret_ma, '--', linewidth=2, label='Sarsa')
plt.plot(ql_mean_regret_ma, ':', linewidth=2, label='Q-Learning')
plt.plot(dql_mean_regret_ma, '-.', linewidth=2, label='Double Q-Learning')
plt.legend(loc=1, ncol=1)

plt.title('Policy episode regret (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Regret (q* - Q)')
plt.xticks(rotation=45)

plt.show()
plt.axhline(y=optimal_V[init_state], color='k', linestyle='-', linewidth=1)
plt.text(int(len(Q_track_mc)*1.05), optimal_V[init_state]+.01, 'v*({})'.format(init_state))

plt.plot(moving_average(np.max(Q_track_mc, axis=2).T[init_state]), 
         '-', linewidth=2, label='FVMC')
plt.plot(moving_average(np.max(Q_track_sarsa, axis=2).T[init_state]), 
         '--', linewidth=2, label='Sarsa')
plt.plot(moving_average(np.max(Q_track_ql, axis=2).T[init_state]), 
         ':', linewidth=2, label='Q-Learning')
plt.plot(moving_average(np.max(Q_track_dql, axis=2).T[init_state]), 
         '-.', linewidth=2, label='Double Q-Learning')
plt.legend(loc=4, ncol=1)

plt.title('Estimated expected return (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Estimated value of initial state V({})'.format(init_state))
plt.xticks(rotation=45)

plt.show()
plt.plot(moving_average(np.mean(np.abs(np.max(Q_track_mc, axis=2) - optimal_V), axis=1)), 
         '-', linewidth=2, label='FVMC')
plt.plot(moving_average(np.mean(np.abs(np.max(Q_track_sarsa, axis=2) - optimal_V), axis=1)), 
         '--', linewidth=2, label='Sarsa')
plt.plot(moving_average(np.mean(np.abs(np.max(Q_track_ql, axis=2) - optimal_V), axis=1)), 
         ':', linewidth=2, label='Q-Learning')
plt.plot(moving_average(np.mean(np.abs(np.max(Q_track_dql, axis=2) - optimal_V), axis=1)), 
         '-.', linewidth=2, label='Double Q-Learning')
plt.legend(loc=1, ncol=1)

plt.title('State-value function estimation error (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Mean Absolute Error MAE(V, v*)')
plt.xticks(rotation=45)

plt.show()
plt.plot(moving_average(np.mean(np.abs(Q_track_mc - optimal_Q), axis=(1,2))), 
         '-', linewidth=2, label='FVMC')
plt.plot(moving_average(np.mean(np.abs(Q_track_sarsa - optimal_Q), axis=(1,2))), 
         '--', linewidth=2, label='Sarsa')
plt.plot(moving_average(np.mean(np.abs(Q_track_ql - optimal_Q), axis=(1,2))), 
         ':', linewidth=2, label='Q-Learning')
plt.plot(moving_average(np.mean(np.abs(Q_track_dql - optimal_Q), axis=(1,2))), 
         '-.', linewidth=2, label='Double Q-Learning')
plt.legend(loc=1, ncol=1)

plt.title('Action-value function estimation error (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Mean Absolute Error MAE(Q, q*)')
plt.xticks(rotation=45)

plt.show()

Russell & Norvig의 Gridworld 환경

env = gym.make('RussellNorvigGridworld-v0')
init_state = env.reset()
goal_state = 3
gamma = 1.0
n_episodes = 4000
P = env.env.P
n_cols, svf_prec, err_prec, avf_prec=4, 4, 2, 3
action_symbols=('<', 'v', '>', '^')
limit_items, limit_value = 5, 0.01
cu_limit_items, cu_limit_value, cu_episodes = 10, 0.0, 1000
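
As with the walk environment, we can peek at this gridworld's transition tuples before learning to see its stochastic dynamics; a minimal sketch, assuming the setup cell above has already run.

# Each P[s][a] is again a list of (probability, next_state, reward, done) tuples.
# State 0, action 2 ('>' in this notebook's symbol ordering) as an example.
pprint(P[0][2])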

Alpha and epsilon scheduling

plt.plot(decay_schedule(0.5, 0.01, 0.5, n_episodes), 
         '-', linewidth=2, 
         label='Alpha schedule')
plt.plot(decay_schedule(1.0, 0.1, 0.9, n_episodes), 
         ':', linewidth=2, 
         label='Epsilon schedule')
plt.legend(loc=1, ncol=1)

plt.title('Alpha and epsilon schedules')
plt.xlabel('Episodes')
plt.ylabel('Hyperparameter values')
plt.xticks(rotation=45)

plt.show()

Optimal value function and policy

optimal_Q, optimal_V, optimal_pi = value_iteration(P, gamma=gamma)
print_state_value_function(optimal_V, P, n_cols=n_cols, prec=svf_prec, title='Optimal state-value function:')
print()

print_action_value_function(optimal_Q, 
                            None, 
                            action_symbols=action_symbols, 
                            prec=avf_prec, 
                            title='Optimal action-value function:')
print()
print_policy(optimal_pi, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_op, mean_return_op, mean_regret_op = get_policy_metrics(
    env, gamma=gamma, pi=optimal_pi, goal_state=goal_state, optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_op, mean_return_op, mean_regret_op))
Optimal state-value function:
| 00 0.8116 | 01 0.8678 | 02 0.9178 |           |
| 04 0.7616 |           | 06 0.6603 |           |
| 08 0.7053 | 09 0.6553 | 10 0.6114 | 11 0.3879 |

Optimal action-value function:
╒═════╤═══════╤═══════╤════════╤════════╕
│   s │     < │     v │      > │      ^ │
╞═════╪═══════╪═══════╪════════╪════════╡
│   0 │ 0.767 │ 0.737 │  0.812 │  0.777 │
├─────┼───────┼───────┼────────┼────────┤
│   1 │ 0.783 │ 0.827 │  0.868 │  0.827 │
├─────┼───────┼───────┼────────┼────────┤
│   2 │ 0.812 │ 0.675 │  0.918 │  0.881 │
├─────┼───────┼───────┼────────┼────────┤
│   3 │ 0     │ 0     │  0     │  0     │
├─────┼───────┼───────┼────────┼────────┤
│   4 │ 0.721 │ 0.677 │  0.721 │  0.762 │
├─────┼───────┼───────┼────────┼────────┤
│   5 │ 0     │ 0     │  0     │  0     │
├─────┼───────┼───────┼────────┼────────┤
│   6 │ 0.641 │ 0.415 │ -0.687 │  0.66  │
├─────┼───────┼───────┼────────┼────────┤
│   7 │ 0     │ 0     │  0     │  0     │
├─────┼───────┼───────┼────────┼────────┤
│   8 │ 0.671 │ 0.66  │  0.631 │  0.705 │
├─────┼───────┼───────┼────────┼────────┤
│   9 │ 0.655 │ 0.616 │  0.58  │  0.616 │
├─────┼───────┼───────┼────────┼────────┤
│  10 │ 0.611 │ 0.553 │  0.398 │  0.593 │
├─────┼───────┼───────┼────────┼────────┤
│  11 │ 0.388 │ 0.37  │  0.209 │ -0.74  │
╘═════╧═══════╧═══════╧════════╧════════╛

Policy:
| 00      > | 01      > | 02      > |           |
| 04      ^ |           | 06      ^ |           |
| 08      ^ | 09      < | 10      < | 11      < |
Reaches goal 96.00%. Obtains an average return of 0.6424. Regret of 0.0000

Monte Carlo control

Q_mcs, V_mcs, Q_track_mcs = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed); np.random.seed(seed) ; env.seed(seed)
    Q_mc, V_mc, pi_mc, Q_track_mc, pi_track_mc = mc_control(env, gamma=gamma, n_episodes=n_episodes)
    Q_mcs.append(Q_mc) ; V_mcs.append(V_mc) ; Q_track_mcs.append(Q_track_mc)
Q_mc, V_mc, Q_track_mc = np.mean(Q_mcs, axis=0), np.mean(V_mcs, axis=0), np.mean(Q_track_mcs, axis=0)
del Q_mcs ; del V_mcs ; del Q_track_mcs
print_state_value_function(V_mc, P, n_cols=n_cols, 
                           prec=svf_prec, title='State-value function found by FVMC:')
print_state_value_function(optimal_V, P, n_cols=n_cols, 
                           prec=svf_prec, title='Optimal state-value function:')
print_state_value_function(V_mc - optimal_V, P, n_cols=n_cols, 
                           prec=err_prec, title='State-value function errors:')
print('State-value function RMSE: {}'.format(rmse(V_mc, optimal_V)))
print()
print_action_value_function(Q_mc, 
                            optimal_Q, 
                            action_symbols=action_symbols, 
                            prec=avf_prec, 
                            title='FVMC action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_mc, optimal_Q)))
print()
print_policy(pi_mc, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_mc, mean_return_mc, mean_regret_mc = get_policy_metrics(
    env, gamma=gamma, pi=pi_mc, goal_state=goal_state, optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_mc, mean_return_mc, mean_regret_mc))
State-value function found by FVMC:
| 00 0.7899 | 01 0.8518 | 02 0.9191 |           |
| 04 0.7349 |           | 06 0.6609 |           |
| 08 0.6698 | 09 0.5816 | 10 0.2939 | 11 -0.2194 |
Optimal state-value function:
| 00 0.8116 | 01 0.8678 | 02 0.9178 |           |
| 04 0.7616 |           | 06 0.6603 |           |
| 08 0.7053 | 09 0.6553 | 10 0.6114 | 11 0.3879 |
State-value function errors:
| 00  -0.02 | 01  -0.02 | 02    0.0 |           |
| 04  -0.03 |           | 06    0.0 |           |
| 08  -0.04 | 09  -0.07 | 10  -0.32 | 11  -0.61 |
State-value function RMSE: 0.1995

FVMC action-value function:
╒═════╤════════╤════════╤════════╤════════╤═══════╤═══════╤════════╤════════╤════════╤════════╤════════╤════════╕
│   s │      < │      v │      > │      ^ │   * < │   * v │    * > │    * ^ │   er < │   er v │   er > │   er ^ │
╞═════╪════════╪════════╪════════╪════════╪═══════╪═══════╪════════╪════════╪════════╪════════╪════════╪════════╡
│   0 │  0.66  │  0.618 │  0.79  │  0.668 │ 0.767 │ 0.737 │  0.812 │  0.777 │  0.106 │  0.119 │  0.022 │  0.109 │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   1 │  0.661 │  0.747 │  0.852 │  0.76  │ 0.783 │ 0.827 │  0.868 │  0.827 │  0.121 │  0.08  │  0.016 │  0.068 │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   2 │  0.727 │  0.571 │  0.919 │  0.816 │ 0.812 │ 0.675 │  0.918 │  0.881 │  0.085 │  0.104 │ -0.001 │  0.065 │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   3 │  0     │  0     │  0     │  0     │ 0     │ 0     │  0     │  0     │  0     │  0     │  0     │  0     │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   4 │  0.587 │  0.542 │  0.582 │  0.735 │ 0.721 │ 0.677 │  0.721 │  0.762 │  0.134 │  0.134 │  0.138 │  0.027 │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   5 │  0     │  0     │  0     │  0     │ 0     │ 0     │  0     │  0     │  0     │  0     │  0     │  0     │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   6 │  0.248 │  0.052 │ -0.698 │  0.661 │ 0.641 │ 0.415 │ -0.687 │  0.66  │  0.393 │  0.363 │  0.011 │ -0.001 │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   7 │  0     │  0     │  0     │  0     │ 0     │ 0     │  0     │  0     │  0     │  0     │  0     │  0     │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   8 │  0.541 │  0.505 │  0.441 │  0.67  │ 0.671 │ 0.66  │  0.631 │  0.705 │  0.13  │  0.155 │  0.19  │  0.036 │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   9 │  0.526 │  0.097 │  0.132 │  0.041 │ 0.655 │ 0.616 │  0.58  │  0.616 │  0.13  │  0.519 │  0.448 │  0.575 │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│  10 │ -0.074 │ -0.218 │ -0.332 │  0.185 │ 0.611 │ 0.553 │  0.398 │  0.593 │  0.685 │  0.771 │  0.73  │  0.408 │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│  11 │ -0.242 │ -0.75  │ -0.829 │ -0.905 │ 0.388 │ 0.37  │  0.209 │ -0.74  │  0.63  │  1.12  │  1.038 │  0.165 │
╘═════╧════════╧════════╧════════╧════════╧═══════╧═══════╧════════╧════════╧════════╧════════╧════════╧════════╛
Action-value function RMSE: 0.3489

Policy:
| 00      > | 01      > | 02      > |           |
| 04      ^ |           | 06      ^ |           |
| 08      ^ | 09      < | 10      < | 11      < |
Reaches goal 96.00%. Obtains an average return of 0.6424. Regret of 0.0000

SARSA

Q_sarsas, V_sarsas, Q_track_sarsas = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed); np.random.seed(seed) ; env.seed(seed)
    Q_sarsa, V_sarsa, pi_sarsa, Q_track_sarsa, pi_track_sarsa = sarsa(env, gamma=gamma, n_episodes=n_episodes)
    Q_sarsas.append(Q_sarsa) ; V_sarsas.append(V_sarsa) ; Q_track_sarsas.append(Q_track_sarsa)
Q_sarsa = np.mean(Q_sarsas, axis=0)
V_sarsa = np.mean(V_sarsas, axis=0)
Q_track_sarsa = np.mean(Q_track_sarsas, axis=0)
del Q_sarsas ; del V_sarsas ; del Q_track_sarsas
print_state_value_function(V_sarsa, P, n_cols=n_cols, 
                           prec=svf_prec, title='State-value function found by Sarsa:')
print_state_value_function(optimal_V, P, n_cols=n_cols, 
                           prec=svf_prec, title='Optimal state-value function:')
print_state_value_function(V_sarsa - optimal_V, P, n_cols=n_cols, 
                           prec=err_prec, title='State-value function errors:')
print('State-value function RMSE: {}'.format(rmse(V_sarsa, optimal_V)))
print()
print_action_value_function(Q_sarsa, 
                            optimal_Q, 
                            action_symbols=action_symbols, 
                            prec=avf_prec, 
                            title='Sarsa action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_sarsa, optimal_Q)))
print()
print_policy(pi_sarsa, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_sarsa, mean_return_sarsa, mean_regret_sarsa = get_policy_metrics(
    env, gamma=gamma, pi=pi_sarsa, goal_state=goal_state, optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_sarsa, mean_return_sarsa, mean_regret_sarsa))
State-value function found by Sarsa:
| 00 0.7646 | 01 0.8317 | 02 0.9003 |           |
| 04 0.7009 |           | 06 0.6164 |           |
| 08 0.6212 | 09 0.5314 | 10 0.1956 | 11 -0.4743 |
Optimal state-value function:
| 00 0.8116 | 01 0.8678 | 02 0.9178 |           |
| 04 0.7616 |           | 06 0.6603 |           |
| 08 0.7053 | 09 0.6553 | 10 0.6114 | 11 0.3879 |
State-value function errors:
| 00  -0.05 | 01  -0.04 | 02  -0.02 |           |
| 04  -0.06 |           | 06  -0.04 |           |
| 08  -0.08 | 09  -0.12 | 10  -0.42 | 11  -0.86 |
State-value function RMSE: 0.2811

Sarsa action-value function:
╒═════╤════════╤════════╤════════╤════════╤═══════╤═══════╤════════╤════════╤════════╤════════╤════════╤════════╕
│   s │      < │      v │      > │      ^ │   * < │   * v │    * > │    * ^ │   er < │   er v │   er > │   er ^ │
╞═════╪════════╪════════╪════════╪════════╪═══════╪═══════╪════════╪════════╪════════╪════════╪════════╪════════╡
│   0 │  0.645 │  0.603 │  0.765 │  0.667 │ 0.767 │ 0.737 │  0.812 │  0.777 │  0.121 │  0.134 │  0.047 │  0.11  │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   1 │  0.67  │  0.739 │  0.832 │  0.738 │ 0.783 │ 0.827 │  0.868 │  0.827 │  0.113 │  0.088 │  0.036 │  0.089 │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   2 │  0.713 │  0.492 │  0.9   │  0.82  │ 0.812 │ 0.675 │  0.918 │  0.881 │  0.1   │  0.183 │  0.018 │  0.061 │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   3 │  0     │  0     │  0     │  0     │ 0     │ 0     │  0     │  0     │  0     │  0     │  0     │  0     │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   4 │  0.583 │  0.511 │  0.586 │  0.701 │ 0.721 │ 0.677 │  0.721 │  0.762 │  0.138 │  0.166 │  0.135 │  0.061 │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   5 │  0     │  0     │  0     │  0     │ 0     │ 0     │  0     │  0     │  0     │  0     │  0     │  0     │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   6 │  0.221 │ -0.22  │ -0.768 │  0.616 │ 0.641 │ 0.415 │ -0.687 │  0.66  │  0.42  │  0.635 │  0.081 │  0.044 │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   7 │  0     │  0     │  0     │  0     │ 0     │ 0     │  0     │  0     │  0     │  0     │  0     │  0     │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   8 │  0.485 │  0.468 │  0.371 │  0.621 │ 0.671 │ 0.66  │  0.631 │  0.705 │  0.186 │  0.192 │  0.26  │  0.084 │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   9 │  0.531 │  0.103 │ -0.047 │  0.125 │ 0.655 │ 0.616 │  0.58  │  0.616 │  0.124 │  0.513 │  0.627 │  0.491 │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│  10 │ -0.052 │ -0.341 │ -0.722 │  0.045 │ 0.611 │ 0.553 │  0.398 │  0.593 │  0.663 │  0.894 │  1.12  │  0.548 │
├─────┼────────┼────────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│  11 │ -0.474 │ -0.828 │ -0.9   │ -0.976 │ 0.388 │ 0.37  │  0.209 │ -0.74  │  0.862 │  1.198 │  1.109 │  0.236 │
╘═════╧════════╧════════╧════════╧════════╧═══════╧═══════╧════════╧════════╧════════╧════════╧════════╧════════╛
Action-value function RMSE: 0.4107

Policy:
| 00      > | 01      > | 02      > |           |
| 04      ^ |           | 06      ^ |           |
| 08      ^ | 09      < | 10      ^ | 11      < |
Reaches goal 96.00%. Obtains an average return of 0.6424. Regret of 0.0000

Q-learning

Q_qls, V_qls, Q_track_qls = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed); np.random.seed(seed) ; env.seed(seed)
    Q_ql, V_ql, pi_ql, Q_track_ql, pi_track_ql = q_learning(env, gamma=gamma, n_episodes=n_episodes)
    Q_qls.append(Q_ql) ; V_qls.append(V_ql) ; Q_track_qls.append(Q_track_ql)
Q_ql = np.mean(Q_qls, axis=0)
V_ql = np.mean(V_qls, axis=0)
Q_track_ql = np.mean(Q_track_qls, axis=0)
del Q_qls ; del V_qls ; del Q_track_qls
print_state_value_function(V_ql, P, n_cols=n_cols, 
                           prec=svf_prec, title='State-value function found by Q-learning:')
print_state_value_function(optimal_V, P, n_cols=n_cols, 
                           prec=svf_prec, title='Optimal state-value function:')
print_state_value_function(V_ql - optimal_V, P, n_cols=n_cols, 
                           prec=err_prec, title='State-value function errors:')
print('State-value function RMSE: {}'.format(rmse(V_ql, optimal_V)))
print()
print_action_value_function(Q_ql, 
                            optimal_Q, 
                            action_symbols=action_symbols, 
                            prec=avf_prec, 
                            title='Q-learning action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_ql, optimal_Q)))
print()
print_policy(pi_ql, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_ql, mean_return_ql, mean_regret_ql = get_policy_metrics(
    env, gamma=gamma, pi=pi_ql, goal_state=goal_state, optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_ql, mean_return_ql, mean_regret_ql))
State-value function found by Q-learning:
| 00 0.8146 | 01 0.8688 | 02 0.9164 |           |
| 04 0.7645 |           | 06 0.6005 |           |
| 08 0.7068 | 09 0.6558 | 10 0.6114 | 11  0.438 |
Optimal state-value function:
| 00 0.8116 | 01 0.8678 | 02 0.9178 |           |
| 04 0.7616 |           | 06 0.6603 |           |
| 08 0.7053 | 09 0.6553 | 10 0.6114 | 11 0.3879 |
State-value function errors:
| 00    0.0 | 01    0.0 | 02   -0.0 |           |
| 04    0.0 |           | 06  -0.06 |           |
| 08    0.0 | 09    0.0 | 10    0.0 | 11   0.05 |
State-value function RMSE: 0.0225

Q-learning action-value function:
╒═════╤═══════╤═══════╤════════╤════════╤═══════╤═══════╤════════╤════════╤════════╤════════╤════════╤════════╕
│   s │     < │     v │      > │      ^ │   * < │   * v │    * > │    * ^ │   er < │   er v │   er > │   er ^ │
╞═════╪═══════╪═══════╪════════╪════════╪═══════╪═══════╪════════╪════════╪════════╪════════╪════════╪════════╡
│   0 │ 0.768 │ 0.741 │  0.815 │  0.779 │ 0.767 │ 0.737 │  0.812 │  0.777 │ -0.002 │ -0.004 │ -0.003 │ -0.002 │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   1 │ 0.785 │ 0.828 │  0.869 │  0.829 │ 0.783 │ 0.827 │  0.868 │  0.827 │ -0.002 │ -0.001 │ -0.001 │ -0.002 │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   2 │ 0.812 │ 0.675 │  0.916 │  0.882 │ 0.812 │ 0.675 │  0.918 │  0.881 │ -0     │  0     │  0.001 │ -0.001 │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   3 │ 0     │ 0     │  0     │  0     │ 0     │ 0     │  0     │  0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   4 │ 0.722 │ 0.679 │  0.722 │  0.764 │ 0.721 │ 0.677 │  0.721 │  0.762 │ -0.001 │ -0.002 │ -0.001 │ -0.003 │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   5 │ 0     │ 0     │  0     │  0     │ 0     │ 0     │  0     │  0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   6 │ 0.582 │ 0.435 │ -0.716 │  0.593 │ 0.641 │ 0.415 │ -0.687 │  0.66  │  0.059 │ -0.02  │  0.029 │  0.068 │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   7 │ 0     │ 0     │  0     │  0     │ 0     │ 0     │  0     │  0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   8 │ 0.674 │ 0.664 │  0.635 │  0.707 │ 0.671 │ 0.66  │  0.631 │  0.705 │ -0.003 │ -0.004 │ -0.004 │ -0.002 │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   9 │ 0.656 │ 0.621 │  0.586 │  0.623 │ 0.655 │ 0.616 │  0.58  │  0.616 │ -0.001 │ -0.005 │ -0.006 │ -0.007 │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│  10 │ 0.611 │ 0.579 │  0.463 │  0.586 │ 0.611 │ 0.553 │  0.398 │  0.593 │ -0     │ -0.026 │ -0.065 │  0.007 │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│  11 │ 0.402 │ 0.423 │  0.298 │ -0.795 │ 0.388 │ 0.37  │  0.209 │ -0.74  │ -0.015 │ -0.052 │ -0.089 │  0.055 │
╘═════╧═══════╧═══════╧════════╧════════╧═══════╧═══════╧════════╧════════╧════════╧════════╧════════╧════════╛
Action-value function RMSE: 0.0243

정책:
| 00      > | 01      > | 02      > |           |
| 04      ^ |           | 06      ^ |           |
| 08      ^ | 09      < | 10      < | 11      < |
Reaches goal 96.00%. Obtains an average return of 0.6424. Regret of 0.0000

Double Q-learning
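
Double Q-learning keeps two estimates and uses one to pick the greedy next action while the other evaluates it, which reduces the maximization bias of plain Q-learning. The double_q_learning function called below is defined earlier in the notebook; a minimal, hedged sketch of the update it is assumed to perform (names illustrative) is:

# Sketch of one double Q-learning update with two tables Q1, Q2,
# each an np.ndarray of shape (n_states, n_actions).
def double_q_learning_step(Q1, Q2, state, action, reward, next_state, done, alpha, gamma):
    if np.random.randint(2):
        best_a = np.argmax(Q1[next_state])                                # select with Q1
        td_target = reward + gamma * Q2[next_state][best_a] * (not done)  # evaluate with Q2
        Q1[state][action] += alpha * (td_target - Q1[state][action])
    else:
        best_a = np.argmax(Q2[next_state])                                # select with Q2
        td_target = reward + gamma * Q1[next_state][best_a] * (not done)  # evaluate with Q1
        Q2[state][action] += alpha * (td_target - Q2[state][action])
    return Q1, Q2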

Q_dqls, V_dqls, Q_track_dqls = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed); np.random.seed(seed) ; env.seed(seed)
    Q_dql, V_dql, pi_dql, Q_track_dql, pi_track_dql = double_q_learning(env, gamma=gamma, n_episodes=n_episodes)
    Q_dqls.append(Q_dql) ; V_dqls.append(V_dql) ; Q_track_dqls.append(Q_track_dql)
Q_dql, V_dql, Q_track_dql = np.mean(Q_dqls, axis=0), np.mean(V_dqls, axis=0), np.mean(Q_track_dqls, axis=0)
del Q_dqls ; del V_dqls ; del Q_track_dqls
print_state_value_function(V_dql, P, n_cols=n_cols, 
                           prec=svf_prec, title='State-value function found by Double Q-Learning:')
print_state_value_function(optimal_V, P, n_cols=n_cols, 
                           prec=svf_prec, title='Optimal state-value function:')
print_state_value_function(V_dql - optimal_V, P, n_cols=n_cols, 
                           prec=err_prec, title='State-value function errors:')
print('State-value function RMSE: {}'.format(rmse(V_dql, optimal_V)))
print()
print_action_value_function(Q_dql, 
                            optimal_Q, 
                            action_symbols=action_symbols, 
                            prec=avf_prec, 
                            title='Double Q-Learning action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_dql, optimal_Q)))
print()
print_policy(pi_dql, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_dql, mean_return_dql, mean_regret_dql = get_policy_metrics(
    env, gamma=gamma, pi=pi_dql, goal_state=goal_state, optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_dql, mean_return_dql, mean_regret_dql))
State-value function found by Double Q-Learning:
| 00 0.8098 | 01 0.8667 | 02 0.9186 |           |
| 04 0.7589 |           | 06 0.6613 |           |
| 08 0.7035 | 09 0.6532 | 10 0.5885 | 11 0.3222 |
Optimal state-value function:
| 00 0.8116 | 01 0.8678 | 02 0.9178 |           |
| 04 0.7616 |           | 06 0.6603 |           |
| 08 0.7053 | 09 0.6553 | 10 0.6114 | 11 0.3879 |
State-value function errors:
| 00   -0.0 | 01   -0.0 | 02    0.0 |           |
| 04   -0.0 |           | 06    0.0 |           |
| 08   -0.0 | 09   -0.0 | 10  -0.02 | 11  -0.07 |
State-value function RMSE: 0.0201

Double Q-Learning action-value function:
╒═════╤═══════╤═══════╤════════╤════════╤═══════╤═══════╤════════╤════════╤════════╤════════╤════════╤════════╕
│   s │     < │     v │      > │      ^ │   * < │   * v │    * > │    * ^ │   er < │   er v │   er > │   er ^ │
╞═════╪═══════╪═══════╪════════╪════════╪═══════╪═══════╪════════╪════════╪════════╪════════╪════════╪════════╡
│   0 │ 0.761 │ 0.734 │  0.81  │  0.772 │ 0.767 │ 0.737 │  0.812 │  0.777 │  0.005 │  0.003 │  0.002 │  0.005 │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   1 │ 0.778 │ 0.821 │  0.867 │  0.822 │ 0.783 │ 0.827 │  0.868 │  0.827 │  0.005 │  0.006 │  0.001 │  0.006 │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   2 │ 0.807 │ 0.654 │  0.919 │  0.875 │ 0.812 │ 0.675 │  0.918 │  0.881 │  0.005 │  0.021 │ -0.001 │  0.006 │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   3 │ 0     │ 0     │  0     │  0     │ 0     │ 0     │  0     │  0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   4 │ 0.716 │ 0.673 │  0.717 │  0.759 │ 0.721 │ 0.677 │  0.721 │  0.762 │  0.005 │  0.004 │  0.004 │  0.003 │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   5 │ 0     │ 0     │  0     │  0     │ 0     │ 0     │  0     │  0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   6 │ 0.536 │ 0.367 │ -0.689 │  0.661 │ 0.641 │ 0.415 │ -0.687 │  0.66  │  0.105 │  0.048 │  0.002 │ -0.001 │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   7 │ 0     │ 0     │  0     │  0     │ 0     │ 0     │  0     │  0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   8 │ 0.665 │ 0.656 │  0.622 │  0.704 │ 0.671 │ 0.66  │  0.631 │  0.705 │  0.006 │  0.004 │  0.009 │  0.002 │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│   9 │ 0.653 │ 0.601 │  0.534 │  0.598 │ 0.655 │ 0.616 │  0.58  │  0.616 │  0.002 │  0.015 │  0.047 │  0.017 │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│  10 │ 0.588 │ 0.458 │  0.259 │  0.494 │ 0.611 │ 0.553 │  0.398 │  0.593 │  0.023 │  0.096 │  0.138 │  0.099 │
├─────┼───────┼───────┼────────┼────────┼───────┼───────┼────────┼────────┼────────┼────────┼────────┼────────┤
│  11 │ 0.314 │ 0.13  │  0.014 │ -0.741 │ 0.388 │ 0.37  │  0.209 │ -0.74  │  0.074 │  0.241 │  0.196 │  0.001 │
╘═════╧═══════╧═══════╧════════╧════════╧═══════╧═══════╧════════╧════════╧════════╧════════╧════════╧════════╛
Action-value function RMSE: 0.0572

정책:
| 00      > | 01      > | 02      > |           |
| 04      ^ |           | 06      ^ |           |
| 08      ^ | 09      < | 10      < | 11      < |
Reaches goal 96.00%. Obtains an average return of 0.6424. Regret of 0.0000

Comparing max(Q) per episode
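
The Q_track_* arrays collected above are assumed to have shape (n_episodes, n_states, n_actions); taking the maximum over the action axis turns each per-episode Q estimate into a greedy state-value estimate that can be plotted against optimal_V, which is what the plot_value_function calls below do:

# Collapsing the action dimension of the tracked estimates (shape assumption noted above).
V_track_mc = np.max(Q_track_mc, axis=2)      # shape: (n_episodes, n_states)
print(V_track_mc.shape, optimal_V.shape)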

First-visit Monte Carlo

plot_value_function(
    'FVMC estimates through time vs. true values', 
    np.max(Q_track_mc, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=False)
plot_value_function(
    'FVMC estimates through time vs. true values (log scale)', 
    np.max(Q_track_mc, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=True)
plot_value_function(
    'FVMC estimates through time (close up)', 
    np.max(Q_track_mc, axis=2)[:cu_episodes], 
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False)

SARSA

plot_value_function(
    'Sarsa estimates through time vs. true values', 
    np.max(Q_track_sarsa, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=False)
plot_value_function(
    'Sarsa estimates through time vs. true values (log scale)', 
    np.max(Q_track_sarsa, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=True)
plot_value_function(
    'Sarsa estimates through time (close up)', 
    np.max(Q_track_sarsa, axis=2)[:cu_episodes], 
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False)

Q-learning

plot_value_function(
    'Q-Learning estimates through time vs. true values', 
    np.max(Q_track_ql, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=False)
plot_value_function(
    'Q-Learning estimates through time vs. true values (log scale)', 
    np.max(Q_track_ql, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=True)
plot_value_function(
    'Q-Learning estimates through time (close up)', 
    np.max(Q_track_ql, axis=2)[:cu_episodes], 
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False)

Double Q-learning

plot_value_function(
    'Double Q-Learning estimates through time vs. true values', 
    np.max(Q_track_dql, axis=2), 
    optimal_V,
    limit_items=limit_items,
    limit_value=limit_value,
    log=False)
plot_value_function(
    'Double Q-Learning estimates through time vs. true values (log scale)', 
    np.max(Q_track_dql, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=True)
plot_value_function(
    'Double Q-Learning estimates through time (close up)', 
    np.max(Q_track_dql, axis=2)[:cu_episodes], 
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False)

Policy evaluation comparison

mc_success_rate_ma, mc_mean_return_ma, mc_mean_regret_ma = get_metrics_from_tracks(
    env, gamma, goal_state, optimal_Q, pi_track_mc)
sarsa_success_rate_ma, sarsa_mean_return_ma, sarsa_mean_regret_ma = get_metrics_from_tracks(
    env, gamma, goal_state, optimal_Q, pi_track_sarsa)
ql_success_rate_ma, ql_mean_return_ma, ql_mean_regret_ma = get_metrics_from_tracks(
    env, gamma, goal_state, optimal_Q, pi_track_ql)
dql_success_rate_ma, dql_mean_return_ma, dql_mean_regret_ma = get_metrics_from_tracks(
    env, gamma, goal_state, optimal_Q, pi_track_dql)
plt.axhline(y=success_rate_op, color='k', linestyle='-', linewidth=1)
plt.text(int(len(mc_success_rate_ma)*1.02), success_rate_op*1.01, 'π*')

plt.plot(mc_success_rate_ma, '-', linewidth=2, label='FVMC')
plt.plot(sarsa_success_rate_ma, '--', linewidth=2, label='Sarsa')
plt.plot(ql_success_rate_ma, ':', linewidth=2, label='Q-Learning')
plt.plot(dql_success_rate_ma, '-.', linewidth=2, label='Double Q-Learning')
plt.legend(loc=4, ncol=1)

plt.title('Policy success rate (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Success rate %')
plt.ylim(-1, 101)
plt.xticks(rotation=45)

plt.show()
plt.axhline(y=mean_return_op, color='k', linestyle='-', linewidth=1)
plt.text(int(len(mc_mean_return_ma)*1.02), mean_return_op*1.01, 'π*')

plt.plot(mc_mean_return_ma, '-', linewidth=2, label='FVMC')
plt.plot(sarsa_mean_return_ma, '--', linewidth=2, label='Sarsa')
plt.plot(ql_mean_return_ma, ':', linewidth=2, label='Q-Learning')
plt.plot(dql_mean_return_ma, '-.', linewidth=2, label='Double Q-Learning')
plt.legend(loc=4, ncol=1)

plt.title('Policy episode return (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Return (Gt:T)')

plt.xticks(rotation=45)

plt.show()
plt.plot(mc_mean_regret_ma, '-', linewidth=2, label='FVMC')
plt.plot(sarsa_mean_regret_ma, '--', linewidth=2, label='Sarsa')
plt.plot(ql_mean_regret_ma, ':', linewidth=2, label='Q-Learning')
plt.plot(dql_mean_regret_ma, '-.', linewidth=2, label='Double Q-Learning')
plt.legend(loc=1, ncol=1)

plt.title('Policy episode regret (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Regret (q* - Q)')
plt.xticks(rotation=45)

plt.show()
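
The remaining comparison plots smooth each per-episode curve with the notebook's moving_average helper, which is defined earlier and not shown here. A minimal stand-in, assuming a plain rolling mean with a window of 100 to match the '(ma 100)' in the plot titles, could look like this:

# Hypothetical rolling-mean helper; the notebook's own moving_average
# may differ in window size or edge handling.
def moving_average_sketch(xs, window=100):
    return np.convolve(xs, np.ones(window) / window, mode='valid')
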
plt.axhline(y=optimal_V[init_state], color='k', linestyle='-', linewidth=1)
plt.text(int(len(Q_track_mc)*1.05), optimal_V[init_state]+.01, 'v*({})'.format(init_state))

plt.plot(moving_average(np.max(Q_track_mc, axis=2).T[init_state]), 
         '-', linewidth=2, label='FVMC')
plt.plot(moving_average(np.max(Q_track_sarsa, axis=2).T[init_state]), 
         '--', linewidth=2, label='Sarsa')
plt.plot(moving_average(np.max(Q_track_ql, axis=2).T[init_state]), 
         ':', linewidth=2, label='Q-Learning')
plt.plot(moving_average(np.max(Q_track_dql, axis=2).T[init_state]), 
         '-.', linewidth=2, label='Double Q-Learning')
plt.legend(loc=4, ncol=1)

plt.title('Estimated expected return (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Estimated value of initial state V({})'.format(init_state))
plt.xticks(rotation=45)

plt.show()
plt.plot(moving_average(np.mean(np.abs(np.max(Q_track_mc, axis=2) - optimal_V), axis=1)), 
         '-', linewidth=2, label='FVMC')
plt.plot(moving_average(np.mean(np.abs(np.max(Q_track_sarsa, axis=2) - optimal_V), axis=1)), 
         '--', linewidth=2, label='Sarsa')
plt.plot(moving_average(np.mean(np.abs(np.max(Q_track_ql, axis=2) - optimal_V), axis=1)), 
         ':', linewidth=2, label='Q-Learning')
plt.plot(moving_average(np.mean(np.abs(np.max(Q_track_dql, axis=2) - optimal_V), axis=1)), 
         '-.', linewidth=2, label='Double Q-Learning')
plt.legend(loc=1, ncol=1)

plt.title('State-value function estimation error (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Mean Absolute Error MAE(V, v*)')
plt.xticks(rotation=45)

plt.show()
plt.plot(moving_average(np.mean(np.abs(Q_track_mc - optimal_Q), axis=(1,2))), 
         '-', linewidth=2, label='FVMC')
plt.plot(moving_average(np.mean(np.abs(Q_track_sarsa - optimal_Q), axis=(1,2))), 
         '--', linewidth=2, label='Sarsa')
plt.plot(moving_average(np.mean(np.abs(Q_track_ql - optimal_Q), axis=(1,2))), 
         ':', linewidth=2, label='Q-Learning')
plt.plot(moving_average(np.mean(np.abs(Q_track_dql - optimal_Q), axis=(1,2))), 
         '-.', linewidth=2, label='Double Q-Learning')
plt.legend(loc=1, ncol=1)

plt.title('Action-value function estimation error (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Mean Absolute Error MAE(Q, q*)')
plt.xticks(rotation=45)

plt.show()

The FrozenLake environment

env = gym.make('FrozenLake-v0')
init_state = env.reset()
goal_state = 15
gamma = 0.99
n_episodes = 10000
P = env.env.P
n_cols, svf_prec, err_prec, avf_prec=4, 4, 2, 3
action_symbols=('<', 'v', '>', '^')
limit_items, limit_value = 5, 0.0
cu_limit_items, cu_limit_value, cu_episodes = 10, 0.01, 2000

Alpha and epsilon scheduling
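
Both the learning rate (alpha) and the exploration rate (epsilon) are annealed over the 10,000 episodes with the decay_schedule helper defined earlier in the notebook. Its exact definition is not repeated here; a rough sketch, assuming a log-spaced decay toward the minimum over the first decay_ratio fraction of episodes followed by a constant tail, is:

# Hypothetical decay schedule; parameter names mirror the call
# decay_schedule(init_value, min_value, decay_ratio, max_steps) used below.
def decay_schedule_sketch(init_value, min_value, decay_ratio, max_steps):
    decay_steps = int(max_steps * decay_ratio)
    values = np.logspace(-2, 0, decay_steps)[::-1]                # high-to-low log curve
    values = (values - values.min()) / (values.max() - values.min())
    values = (init_value - min_value) * values + min_value
    return np.pad(values, (0, max_steps - decay_steps), 'edge')   # hold at min_value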

plt.plot(decay_schedule(0.5, 0.01, 0.5, n_episodes), 
         '-', linewidth=2, 
         label='Alpha schedule')
plt.plot(decay_schedule(1.0, 0.1, 0.9, n_episodes), 
         ':', linewidth=2, 
         label='Epsilon schedule')
plt.legend(loc=1, ncol=1)

plt.title('Alpha and epsilon schedules')
plt.xlabel('Episodes')
plt.ylabel('Hyperparameter values')
plt.xticks(rotation=45)

plt.show()

Optimal value function and policy

optimal_Q, optimal_V, optimal_pi = value_iteration(P, gamma=gamma)
print_state_value_function(optimal_V, P, n_cols=n_cols, prec=svf_prec, title='Optimal state-value function:')
print()

print_action_value_function(optimal_Q, 
                            None, 
                            action_symbols=action_symbols, 
                            prec=avf_prec, 
                            title='Optimal action-value function:')
print()
print_policy(optimal_pi, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_op, mean_return_op, mean_regret_op = get_policy_metrics(
    env, gamma=gamma, pi=optimal_pi, goal_state=goal_state, optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_op, mean_return_op, mean_regret_op))
Optimal state-value function:
| 00  0.542 | 01 0.4988 | 02 0.4707 | 03 0.4569 |
| 04 0.5585 |           | 06 0.3583 |           |
| 08 0.5918 | 09 0.6431 | 10 0.6152 |           |
|           | 13 0.7417 | 14 0.8628 |           |

Optimal action-value function:
╒═════╤═══════╤═══════╤═══════╤═══════╕
│   s │     < │     v │     > │     ^ │
╞═════╪═══════╪═══════╪═══════╪═══════╡
│   0 │ 0.542 │ 0.528 │ 0.528 │ 0.522 │
├─────┼───────┼───────┼───────┼───────┤
│   1 │ 0.343 │ 0.334 │ 0.32  │ 0.499 │
├─────┼───────┼───────┼───────┼───────┤
│   2 │ 0.438 │ 0.434 │ 0.424 │ 0.471 │
├─────┼───────┼───────┼───────┼───────┤
│   3 │ 0.306 │ 0.306 │ 0.302 │ 0.457 │
├─────┼───────┼───────┼───────┼───────┤
│   4 │ 0.558 │ 0.38  │ 0.374 │ 0.363 │
├─────┼───────┼───────┼───────┼───────┤
│   5 │ 0     │ 0     │ 0     │ 0     │
├─────┼───────┼───────┼───────┼───────┤
│   6 │ 0.358 │ 0.203 │ 0.358 │ 0.155 │
├─────┼───────┼───────┼───────┼───────┤
│   7 │ 0     │ 0     │ 0     │ 0     │
├─────┼───────┼───────┼───────┼───────┤
│   8 │ 0.38  │ 0.408 │ 0.397 │ 0.592 │
├─────┼───────┼───────┼───────┼───────┤
│   9 │ 0.44  │ 0.643 │ 0.448 │ 0.398 │
├─────┼───────┼───────┼───────┼───────┤
│  10 │ 0.615 │ 0.497 │ 0.403 │ 0.33  │
├─────┼───────┼───────┼───────┼───────┤
│  11 │ 0     │ 0     │ 0     │ 0     │
├─────┼───────┼───────┼───────┼───────┤
│  12 │ 0     │ 0     │ 0     │ 0     │
├─────┼───────┼───────┼───────┼───────┤
│  13 │ 0.457 │ 0.53  │ 0.742 │ 0.497 │
├─────┼───────┼───────┼───────┼───────┤
│  14 │ 0.733 │ 0.863 │ 0.821 │ 0.781 │
├─────┼───────┼───────┼───────┼───────┤
│  15 │ 0     │ 0     │ 0     │ 0     │
╘═════╧═══════╧═══════╧═══════╧═══════╛

정책:
| 00      < | 01      ^ | 02      ^ | 03      ^ |
| 04      < |           | 06      < |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |
Reaches goal 74.00%. Obtains an average return of 0.5116. Regret of 0.0000

Monte Carlo control
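
mc_control learns from complete episodes: it rolls out an epsilon-greedy trajectory, computes the discounted return from the first visit of each state-action pair, and nudges Q toward that return. The function itself is defined earlier in the notebook; a hedged sketch of the per-episode update step (names illustrative) is:

# First-visit MC update for one finished episode.
# trajectory: list of (state, action, reward) tuples in the order visited.
def fvmc_episode_update(Q, trajectory, alpha, gamma):
    returns = []
    G = 0.0
    for state, action, reward in reversed(trajectory):
        G = reward + gamma * G                 # discounted return from this step onward
        returns.append((state, action, G))
    visited = set()
    for state, action, G in reversed(returns): # forward order again
        if (state, action) in visited:
            continue                           # only the first visit updates Q
        visited.add((state, action))
        Q[state][action] += alpha * (G - Q[state][action])
    return Q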

Q_mcs, V_mcs, Q_track_mcs = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed); np.random.seed(seed) ; env.seed(seed)
    Q_mc, V_mc, pi_mc, Q_track_mc, pi_track_mc = mc_control(env, gamma=gamma, n_episodes=n_episodes)
    Q_mcs.append(Q_mc) ; V_mcs.append(V_mc) ; Q_track_mcs.append(Q_track_mc)
Q_mc, V_mc, Q_track_mc = np.mean(Q_mcs, axis=0), np.mean(V_mcs, axis=0), np.mean(Q_track_mcs, axis=0)
del Q_mcs ; del V_mcs ; del Q_track_mcs
print_state_value_function(V_mc, P, n_cols=n_cols, 
                           prec=svf_prec, title='State-value function found by FVMC:')
print_state_value_function(optimal_V, P, n_cols=n_cols, 
                           prec=svf_prec, title='Optimal state-value function:')
print_state_value_function(V_mc - optimal_V, P, n_cols=n_cols, 
                           prec=err_prec, title='State-value function errors:')
print('State-value function RMSE: {}'.format(rmse(V_mc, optimal_V)))
print()
print_action_value_function(Q_mc, 
                            optimal_Q, 
                            action_symbols=action_symbols, 
                            prec=avf_prec, 
                            title='FVMC action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_mc, optimal_Q)))
print()
print_policy(pi_mc, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_mc, mean_return_mc, mean_regret_mc = get_policy_metrics(
    env, gamma=gamma, pi=pi_mc, goal_state=goal_state, optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_mc, mean_return_mc, mean_regret_mc))
State-value function found by FVMC:
| 00 0.2924 | 01 0.1962 | 02 0.1745 | 03 0.0798 |
| 04 0.3093 |           | 06 0.2139 |           |
| 08 0.3592 | 09 0.4479 | 10 0.4471 |           |
|           | 13 0.5975 | 14 0.7784 |           |
Optimal state-value function:
| 00  0.542 | 01 0.4988 | 02 0.4707 | 03 0.4569 |
| 04 0.5585 |           | 06 0.3583 |           |
| 08 0.5918 | 09 0.6431 | 10 0.6152 |           |
|           | 13 0.7417 | 14 0.8628 |           |
State-value function errors:
| 00  -0.25 | 01   -0.3 | 02   -0.3 | 03  -0.38 |
| 04  -0.25 |           | 06  -0.14 |           |
| 08  -0.23 | 09   -0.2 | 10  -0.17 |           |
|           | 13  -0.14 | 14  -0.08 |           |
State-value function RMSE: 0.1961

FVMC action-value function:
╒═════╤═══════╤═══════╤═══════╤═══════╤═══════╤═══════╤═══════╤═══════╤════════╤════════╤════════╤════════╕
│   s │     < │     v │     > │     ^ │   * < │   * v │   * > │   * ^ │   er < │   er v │   er > │   er ^ │
╞═════╪═══════╪═══════╪═══════╪═══════╪═══════╪═══════╪═══════╪═══════╪════════╪════════╪════════╪════════╡
│   0 │ 0.292 │ 0.232 │ 0.239 │ 0.243 │ 0.542 │ 0.528 │ 0.528 │ 0.522 │  0.25  │  0.296 │  0.289 │  0.279 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   1 │ 0.09  │ 0.108 │ 0.079 │ 0.196 │ 0.343 │ 0.334 │ 0.32  │ 0.499 │  0.253 │  0.226 │  0.241 │  0.303 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   2 │ 0.149 │ 0.134 │ 0.114 │ 0.109 │ 0.438 │ 0.434 │ 0.424 │ 0.471 │  0.289 │  0.3   │  0.31  │  0.362 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   3 │ 0.057 │ 0.049 │ 0.013 │ 0.041 │ 0.306 │ 0.306 │ 0.302 │ 0.457 │  0.25  │  0.257 │  0.289 │  0.416 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   4 │ 0.309 │ 0.203 │ 0.192 │ 0.171 │ 0.558 │ 0.38  │ 0.374 │ 0.363 │  0.249 │  0.177 │  0.182 │  0.192 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   5 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   6 │ 0.152 │ 0.094 │ 0.18  │ 0.03  │ 0.358 │ 0.203 │ 0.358 │ 0.155 │  0.207 │  0.109 │  0.178 │  0.125 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   7 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   8 │ 0.189 │ 0.235 │ 0.232 │ 0.359 │ 0.38  │ 0.408 │ 0.397 │ 0.592 │  0.191 │  0.172 │  0.165 │  0.233 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   9 │ 0.261 │ 0.448 │ 0.317 │ 0.2   │ 0.44  │ 0.643 │ 0.448 │ 0.398 │  0.179 │  0.195 │  0.131 │  0.198 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  10 │ 0.405 │ 0.34  │ 0.269 │ 0.162 │ 0.615 │ 0.497 │ 0.403 │ 0.33  │  0.21  │  0.157 │  0.134 │  0.168 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  11 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  12 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  13 │ 0.27  │ 0.378 │ 0.598 │ 0.356 │ 0.457 │ 0.53  │ 0.742 │ 0.497 │  0.187 │  0.151 │  0.144 │  0.141 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  14 │ 0.543 │ 0.778 │ 0.667 │ 0.607 │ 0.733 │ 0.863 │ 0.821 │ 0.781 │  0.19  │  0.084 │  0.154 │  0.174 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  15 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
╘═════╧═══════╧═══════╧═══════╧═══════╧═══════╧═══════╧═══════╧═══════╧════════╧════════╧════════╧════════╛
Action-value function RMSE: 0.1859

정책:
| 00      < | 01      ^ | 02      > | 03      < |
| 04      < |           | 06      < |           |
| 08      ^ | 09      v | 10      v |           |
|           | 13      > | 14      v |           |
Reaches goal 62.00%. Obtains an average return of 0.4355. Regret of 0.1419

SARSA
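
Sarsa differs from Q-learning only in its target: it bootstraps on the action the epsilon-greedy behavior policy actually takes in the next state, making the update on-policy. A minimal sketch of that step follows (the sarsa function used below is defined earlier in the notebook; names here are illustrative):

# On-policy TD update: uses Q(s', a') for the chosen next action a',
# not max_a' Q(s', a') as in Q-learning.
def sarsa_step(Q, state, action, reward, next_state, next_action, done, alpha, gamma):
    td_target = reward + gamma * Q[next_state][next_action] * (not done)
    Q[state][action] += alpha * (td_target - Q[state][action])
    return Q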

Q_sarsas, V_sarsas, Q_track_sarsas = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed); np.random.seed(seed) ; env.seed(seed)
    Q_sarsa, V_sarsa, pi_sarsa, Q_track_sarsa, pi_track_sarsa = sarsa(env, gamma=gamma, n_episodes=n_episodes)
    Q_sarsas.append(Q_sarsa) ; V_sarsas.append(V_sarsa) ; Q_track_sarsas.append(Q_track_sarsa)
Q_sarsa = np.mean(Q_sarsas, axis=0)
V_sarsa = np.mean(V_sarsas, axis=0)
Q_track_sarsa = np.mean(Q_track_sarsas, axis=0)
del Q_sarsas ; del V_sarsas ; del Q_track_sarsas
print_state_value_function(V_sarsa, P, n_cols=n_cols, 
                           prec=svf_prec, title='State-value function found by Sarsa:')
print_state_value_function(optimal_V, P, n_cols=n_cols, 
                           prec=svf_prec, title='Optimal state-value function:')
print_state_value_function(V_sarsa - optimal_V, P, n_cols=n_cols, 
                           prec=err_prec, title='State-value function errors:')
print('State-value function RMSE: {}'.format(rmse(V_sarsa, optimal_V)))
print()
print_action_value_function(Q_sarsa, 
                            optimal_Q, 
                            action_symbols=action_symbols, 
                            prec=avf_prec, 
                            title='Sarsa action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_sarsa, optimal_Q)))
print()
print_policy(pi_sarsa, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_sarsa, mean_return_sarsa, mean_regret_sarsa = get_policy_metrics(
    env, gamma=gamma, pi=pi_sarsa, goal_state=goal_state, optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_sarsa, mean_return_sarsa, mean_regret_sarsa))
State-value function found by Sarsa:
| 00 0.2822 | 01 0.2237 | 02 0.1984 | 03 0.1127 |
| 04 0.3003 |           | 06 0.2074 |           |
| 08 0.3473 | 09 0.4417 | 10 0.4533 |           |
|           | 13 0.5771 | 14 0.7754 |           |
Optimal state-value function:
| 00  0.542 | 01 0.4988 | 02 0.4707 | 03 0.4569 |
| 04 0.5585 |           | 06 0.3583 |           |
| 08 0.5918 | 09 0.6431 | 10 0.6152 |           |
|           | 13 0.7417 | 14 0.8628 |           |
State-value function errors:
| 00  -0.26 | 01  -0.28 | 02  -0.27 | 03  -0.34 |
| 04  -0.26 |           | 06  -0.15 |           |
| 08  -0.24 | 09   -0.2 | 10  -0.16 |           |
|           | 13  -0.16 | 14  -0.09 |           |
State-value function RMSE: 0.1915

Sarsa action-value function:
╒═════╤═══════╤═══════╤═══════╤═══════╤═══════╤═══════╤═══════╤═══════╤════════╤════════╤════════╤════════╕
│   s │     < │     v │     > │     ^ │   * < │   * v │   * > │   * ^ │   er < │   er v │   er > │   er ^ │
╞═════╪═══════╪═══════╪═══════╪═══════╪═══════╪═══════╪═══════╪═══════╪════════╪════════╪════════╪════════╡
│   0 │ 0.282 │ 0.257 │ 0.257 │ 0.253 │ 0.542 │ 0.528 │ 0.528 │ 0.522 │  0.26  │  0.271 │  0.271 │  0.27  │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   1 │ 0.118 │ 0.105 │ 0.092 │ 0.224 │ 0.343 │ 0.334 │ 0.32  │ 0.499 │  0.225 │  0.229 │  0.228 │  0.275 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   2 │ 0.198 │ 0.117 │ 0.115 │ 0.113 │ 0.438 │ 0.434 │ 0.424 │ 0.471 │  0.24  │  0.317 │  0.309 │  0.358 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   3 │ 0.035 │ 0.033 │ 0.024 │ 0.113 │ 0.306 │ 0.306 │ 0.302 │ 0.457 │  0.271 │  0.273 │  0.277 │  0.344 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   4 │ 0.3   │ 0.209 │ 0.209 │ 0.188 │ 0.558 │ 0.38  │ 0.374 │ 0.363 │  0.258 │  0.171 │  0.165 │  0.176 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   5 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   6 │ 0.171 │ 0.114 │ 0.184 │ 0.038 │ 0.358 │ 0.203 │ 0.358 │ 0.155 │  0.188 │  0.089 │  0.174 │  0.117 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   7 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   8 │ 0.203 │ 0.254 │ 0.235 │ 0.347 │ 0.38  │ 0.408 │ 0.397 │ 0.592 │  0.177 │  0.153 │  0.162 │  0.244 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   9 │ 0.281 │ 0.442 │ 0.321 │ 0.246 │ 0.44  │ 0.643 │ 0.448 │ 0.398 │  0.159 │  0.201 │  0.127 │  0.152 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  10 │ 0.453 │ 0.362 │ 0.29  │ 0.166 │ 0.615 │ 0.497 │ 0.403 │ 0.33  │  0.162 │  0.135 │  0.113 │  0.165 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  11 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  12 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  13 │ 0.305 │ 0.404 │ 0.577 │ 0.367 │ 0.457 │ 0.53  │ 0.742 │ 0.497 │  0.152 │  0.125 │  0.165 │  0.13  │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  14 │ 0.536 │ 0.775 │ 0.703 │ 0.621 │ 0.733 │ 0.863 │ 0.821 │ 0.781 │  0.197 │  0.087 │  0.118 │  0.161 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  15 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
╘═════╧═══════╧═══════╧═══════╧═══════╧═══════╧═══════╧═══════╧═══════╧════════╧════════╧════════╧════════╛
Action-value function RMSE: 0.176

정책:
| 00      < | 01      ^ | 02      < | 03      ^ |
| 04      < |           | 06      > |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |
Reaches goal 70.00%. Obtains an average return of 0.4864. Regret of 0.0156

Q-learning

Q_qls, V_qls, Q_track_qls = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed); np.random.seed(seed) ; env.seed(seed)
    Q_ql, V_ql, pi_ql, Q_track_ql, pi_track_ql = q_learning(env, gamma=gamma, n_episodes=n_episodes)
    Q_qls.append(Q_ql) ; V_qls.append(V_ql) ; Q_track_qls.append(Q_track_ql)
Q_ql = np.mean(Q_qls, axis=0)
V_ql = np.mean(V_qls, axis=0)
Q_track_ql = np.mean(Q_track_qls, axis=0)
del Q_qls ; del V_qls ; del Q_track_qls
print_state_value_function(V_ql, P, n_cols=n_cols, 
                           prec=svf_prec, title='State-value function found by Q-learning:')
print_state_value_function(optimal_V, P, n_cols=n_cols, 
                           prec=svf_prec, title='Optimal state-value function:')
print_state_value_function(V_ql - optimal_V, P, n_cols=n_cols, 
                           prec=err_prec, title='State-value function errors:')
print('State-value function RMSE: {}'.format(rmse(V_ql, optimal_V)))
print()
print_action_value_function(Q_ql, 
                            optimal_Q, 
                            action_symbols=action_symbols, 
                            prec=avf_prec, 
                            title='Q-learning action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_ql, optimal_Q)))
print()
print_policy(pi_ql, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_ql, mean_return_ql, mean_regret_ql = get_policy_metrics(
    env, gamma=gamma, pi=pi_ql, goal_state=goal_state, optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_ql, mean_return_ql, mean_regret_ql))
State-value function found by Q-learning:
| 00 0.5219 | 01 0.4762 | 02 0.4434 | 03 0.4284 |
| 04  0.539 |           | 06 0.3521 |           |
| 08 0.5742 | 09 0.6247 | 10 0.6011 |           |
|           | 13 0.7321 | 14 0.8545 |           |
Optimal state-value function:
| 00  0.542 | 01 0.4988 | 02 0.4707 | 03 0.4569 |
| 04 0.5585 |           | 06 0.3583 |           |
| 08 0.5918 | 09 0.6431 | 10 0.6152 |           |
|           | 13 0.7417 | 14 0.8628 |           |
State-value function errors:
| 00  -0.02 | 01  -0.02 | 02  -0.03 | 03  -0.03 |
| 04  -0.02 |           | 06  -0.01 |           |
| 08  -0.02 | 09  -0.02 | 10  -0.01 |           |
|           | 13  -0.01 | 14  -0.01 |           |
State-value function RMSE: 0.0156

Q-learning action-value function:
╒═════╤═══════╤═══════╤═══════╤═══════╤═══════╤═══════╤═══════╤═══════╤════════╤════════╤════════╤════════╕
│   s │     < │     v │     > │     ^ │   * < │   * v │   * > │   * ^ │   er < │   er v │   er > │   er ^ │
╞═════╪═══════╪═══════╪═══════╪═══════╪═══════╪═══════╪═══════╪═══════╪════════╪════════╪════════╪════════╡
│   0 │ 0.522 │ 0.506 │ 0.506 │ 0.501 │ 0.542 │ 0.528 │ 0.528 │ 0.522 │  0.02  │  0.022 │  0.022 │  0.022 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   1 │ 0.338 │ 0.324 │ 0.307 │ 0.476 │ 0.343 │ 0.334 │ 0.32  │ 0.499 │  0.006 │  0.01  │  0.013 │  0.023 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   2 │ 0.429 │ 0.426 │ 0.414 │ 0.443 │ 0.438 │ 0.434 │ 0.424 │ 0.471 │  0.01  │  0.008 │  0.011 │  0.027 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   3 │ 0.297 │ 0.311 │ 0.289 │ 0.428 │ 0.306 │ 0.306 │ 0.302 │ 0.457 │  0.009 │ -0.005 │  0.012 │  0.028 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   4 │ 0.539 │ 0.368 │ 0.353 │ 0.354 │ 0.558 │ 0.38  │ 0.374 │ 0.363 │  0.019 │  0.011 │  0.021 │  0.009 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   5 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   6 │ 0.339 │ 0.222 │ 0.331 │ 0.161 │ 0.358 │ 0.203 │ 0.358 │ 0.155 │  0.02  │ -0.019 │  0.028 │ -0.005 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   7 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   8 │ 0.379 │ 0.399 │ 0.375 │ 0.574 │ 0.38  │ 0.408 │ 0.397 │ 0.592 │  0.001 │  0.009 │  0.021 │  0.018 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   9 │ 0.424 │ 0.625 │ 0.437 │ 0.378 │ 0.44  │ 0.643 │ 0.448 │ 0.398 │  0.016 │  0.018 │  0.01  │  0.021 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  10 │ 0.601 │ 0.484 │ 0.41  │ 0.322 │ 0.615 │ 0.497 │ 0.403 │ 0.33  │  0.014 │  0.013 │ -0.007 │  0.009 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  11 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  12 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  13 │ 0.451 │ 0.534 │ 0.732 │ 0.47  │ 0.457 │ 0.53  │ 0.742 │ 0.497 │  0.006 │ -0.005 │  0.01  │  0.027 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  14 │ 0.724 │ 0.854 │ 0.814 │ 0.774 │ 0.733 │ 0.863 │ 0.821 │ 0.781 │  0.009 │  0.008 │  0.007 │  0.007 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  15 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
╘═════╧═══════╧═══════╧═══════╧═══════╧═══════╧═══════╧═══════╧═══════╧════════╧════════╧════════╧════════╛
Action-value function RMSE: 0.013

정책:
| 00      < | 01      ^ | 02      ^ | 03      ^ |
| 04      < |           | 06      < |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |
Reaches goal 74.00%. Obtains an average return of 0.5116. Regret of 0.0000

Double Q-learning

Q_dqls, V_dqls, Q_track_dqls = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed); np.random.seed(seed) ; env.seed(seed)
    Q_dql, V_dql, pi_dql, Q_track_dql, pi_track_dql = double_q_learning(env, gamma=gamma, n_episodes=n_episodes)
    Q_dqls.append(Q_dql) ; V_dqls.append(V_dql) ; Q_track_dqls.append(Q_track_dql)
Q_dql, V_dql, Q_track_dql = np.mean(Q_dqls, axis=0), np.mean(V_dqls, axis=0), np.mean(Q_track_dqls, axis=0)
del Q_dqls ; del V_dqls ; del Q_track_dqls
print_state_value_function(V_dql, P, n_cols=n_cols, 
                           prec=svf_prec, title='State-value function found by Double Q-Learning:')
print_state_value_function(optimal_V, P, n_cols=n_cols, 
                           prec=svf_prec, title='Optimal state-value function:')
print_state_value_function(V_dql - optimal_V, P, n_cols=n_cols, 
                           prec=err_prec, title='State-value function errors:')
print('State-value function RMSE: {}'.format(rmse(V_dql, optimal_V)))
print()
print_action_value_function(Q_dql, 
                            optimal_Q, 
                            action_symbols=action_symbols, 
                            prec=avf_prec, 
                            title='Double Q-Learning action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_dql, optimal_Q)))
print()
print_policy(pi_dql, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_dql, mean_return_dql, mean_regret_dql = get_policy_metrics(
    env, gamma=gamma, pi=pi_dql, goal_state=goal_state, optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_dql, mean_return_dql, mean_regret_dql))
State-value function found by Double Q-Learning:
| 00 0.5184 | 01 0.4354 | 02 0.3635 | 03 0.1936 |
| 04  0.535 |           | 06 0.3091 |           |
| 08 0.5681 | 09 0.6211 | 10 0.5848 |           |
|           | 13 0.7279 | 14 0.8563 |           |
Optimal state-value function:
| 00  0.542 | 01 0.4988 | 02 0.4707 | 03 0.4569 |
| 04 0.5585 |           | 06 0.3583 |           |
| 08 0.5918 | 09 0.6431 | 10 0.6152 |           |
|           | 13 0.7417 | 14 0.8628 |           |
State-value function errors:
| 00  -0.02 | 01  -0.06 | 02  -0.11 | 03  -0.26 |
| 04  -0.02 |           | 06  -0.05 |           |
| 08  -0.02 | 09  -0.02 | 10  -0.03 |           |
|           | 13  -0.01 | 14  -0.01 |           |
State-value function RMSE: 0.0752

Double Q-Learning action-value function:
╒═════╤═══════╤═══════╤═══════╤═══════╤═══════╤═══════╤═══════╤═══════╤════════╤════════╤════════╤════════╕
│   s │     < │     v │     > │     ^ │   * < │   * v │   * > │   * ^ │   er < │   er v │   er > │   er ^ │
╞═════╪═══════╪═══════╪═══════╪═══════╪═══════╪═══════╪═══════╪═══════╪════════╪════════╪════════╪════════╡
│   0 │ 0.518 │ 0.486 │ 0.485 │ 0.482 │ 0.542 │ 0.528 │ 0.528 │ 0.522 │  0.024 │  0.042 │  0.043 │  0.04  │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   1 │ 0.262 │ 0.245 │ 0.201 │ 0.435 │ 0.343 │ 0.334 │ 0.32  │ 0.499 │  0.081 │  0.089 │  0.119 │  0.063 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   2 │ 0.364 │ 0.226 │ 0.208 │ 0.237 │ 0.438 │ 0.434 │ 0.424 │ 0.471 │  0.075 │  0.208 │  0.216 │  0.234 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   3 │ 0.076 │ 0.063 │ 0.04  │ 0.171 │ 0.306 │ 0.306 │ 0.302 │ 0.457 │  0.231 │  0.243 │  0.261 │  0.286 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   4 │ 0.535 │ 0.371 │ 0.354 │ 0.36  │ 0.558 │ 0.38  │ 0.374 │ 0.363 │  0.023 │  0.009 │  0.02  │  0.003 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   5 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   6 │ 0.273 │ 0.136 │ 0.246 │ 0.069 │ 0.358 │ 0.203 │ 0.358 │ 0.155 │  0.086 │  0.067 │  0.113 │  0.086 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   7 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   8 │ 0.357 │ 0.387 │ 0.385 │ 0.568 │ 0.38  │ 0.408 │ 0.397 │ 0.592 │  0.023 │  0.02  │  0.011 │  0.024 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│   9 │ 0.41  │ 0.621 │ 0.406 │ 0.358 │ 0.44  │ 0.643 │ 0.448 │ 0.398 │  0.03  │  0.022 │  0.042 │  0.04  │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  10 │ 0.585 │ 0.45  │ 0.359 │ 0.255 │ 0.615 │ 0.497 │ 0.403 │ 0.33  │  0.03  │  0.047 │  0.044 │  0.076 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  11 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  12 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  13 │ 0.406 │ 0.493 │ 0.728 │ 0.466 │ 0.457 │ 0.53  │ 0.742 │ 0.497 │  0.051 │  0.037 │  0.014 │  0.031 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  14 │ 0.659 │ 0.856 │ 0.766 │ 0.722 │ 0.733 │ 0.863 │ 0.821 │ 0.781 │  0.073 │  0.007 │  0.055 │  0.059 │
├─────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼────────┤
│  15 │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │ 0     │  0     │  0     │  0     │  0     │
╘═════╧═══════╧═══════╧═══════╧═══════╧═══════╧═══════╧═══════╧═══════╧════════╧════════╧════════╧════════╛
Action-value function RMSE: 0.0899

정책:
| 00      < | 01      ^ | 02      < | 03      v |
| 04      < |           | 06      > |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |
Reaches goal 70.00%. Obtains an average return of 0.4864. Regret of 0.0156

Comparing max(Q) per episode

First-visit Monte Carlo

plot_value_function(
    'FVMC estimates through time vs. true values', 
    np.max(Q_track_mc, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=False)
plot_value_function(
    'FVMC estimates through time vs. true values (log scale)', 
    np.max(Q_track_mc, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=True)
plot_value_function(
    'FVMC estimates through time (close up)', 
    np.max(Q_track_mc, axis=2)[:cu_episodes], 
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False)

SARSA

plot_value_function(
    'Sarsa estimates through time vs. true values', 
    np.max(Q_track_sarsa, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=False)
plot_value_function(
    'Sarsa estimates through time vs. true values (log scale)', 
    np.max(Q_track_sarsa, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=True)
plot_value_function(
    'Sarsa estimates through time (close up)', 
    np.max(Q_track_sarsa, axis=2)[:cu_episodes], 
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False)

Q-learning

plot_value_function(
    'Q-Learning estimates through time vs. true values', 
    np.max(Q_track_ql, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=False)
plot_value_function(
    'Q-Learning estimates through time vs. true values (log scale)', 
    np.max(Q_track_ql, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=True)
plot_value_function(
    'Q-Learning estimates through time (close up)', 
    np.max(Q_track_ql, axis=2)[:cu_episodes], 
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False)

Double Q-learning

plot_value_function(
    'Double Q-Learning estimates through time vs. true values', 
    np.max(Q_track_dql, axis=2), 
    optimal_V,
    limit_items=limit_items,
    limit_value=limit_value,
    log=False)
plot_value_function(
    'Double Q-Learning estimates through time vs. true values (log scale)', 
    np.max(Q_track_dql, axis=2), 
    optimal_V, 
    limit_items=limit_items,
    limit_value=limit_value,
    log=True)
plot_value_function(
    'Double Q-Learning estimates through time (close up)', 
    np.max(Q_track_dql, axis=2)[:cu_episodes], 
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False)

Policy evaluation comparison

mc_success_rate_ma, mc_mean_return_ma, mc_mean_regret_ma = get_metrics_from_tracks(
    env, gamma, goal_state, optimal_Q, pi_track_mc)
sarsa_success_rate_ma, sarsa_mean_return_ma, sarsa_mean_regret_ma = get_metrics_from_tracks(
    env, gamma, goal_state, optimal_Q, pi_track_sarsa)
ql_success_rate_ma, ql_mean_return_ma, ql_mean_regret_ma = get_metrics_from_tracks(
    env, gamma, goal_state, optimal_Q, pi_track_ql)
dql_success_rate_ma, dql_mean_return_ma, dql_mean_regret_ma = get_metrics_from_tracks(
    env, gamma, goal_state, optimal_Q, pi_track_dql)
plt.axhline(y=success_rate_op, color='k', linestyle='-', linewidth=1)
plt.text(int(len(mc_success_rate_ma)*1.02), success_rate_op*1.01, 'π*')

plt.plot(mc_success_rate_ma, '-', linewidth=2, label='FVMC')
plt.plot(sarsa_success_rate_ma, '--', linewidth=2, label='Sarsa')
plt.plot(ql_success_rate_ma, ':', linewidth=2, label='Q-Learning')
plt.plot(dql_success_rate_ma, '-.', linewidth=2, label='Double Q-Learning')
plt.legend(loc=4, ncol=1)

plt.title('Policy success rate (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Success rate %')
plt.ylim(-1, 101)
plt.xticks(rotation=45)

plt.show()
plt.axhline(y=mean_return_op, color='k', linestyle='-', linewidth=1)
plt.text(int(len(mc_mean_return_ma)*1.02), mean_return_op*1.01, 'π*')

plt.plot(mc_mean_return_ma, '-', linewidth=2, label='FVMC')
plt.plot(sarsa_mean_return_ma, '--', linewidth=2, label='Sarsa')
plt.plot(ql_mean_return_ma, ':', linewidth=2, label='Q-Learning')
plt.plot(dql_mean_return_ma, '-.', linewidth=2, label='Double Q-Learning')
plt.legend(loc=4, ncol=1)

plt.title('Policy episode return (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Return (Gt:T)')

plt.xticks(rotation=45)

plt.show()
plt.plot(mc_mean_regret_ma, '-', linewidth=2, label='FVMC')
plt.plot(sarsa_mean_regret_ma, '--', linewidth=2, label='Sarsa')
plt.plot(ql_mean_regret_ma, ':', linewidth=2, label='Q-Learning')
plt.plot(dql_mean_regret_ma, '-.', linewidth=2, label='Double Q-Learning')
plt.legend(loc=1, ncol=1)

plt.title('Policy episode regret (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Regret (q* - Q)')
plt.xticks(rotation=45)

plt.show()
plt.axhline(y=optimal_V[init_state], color='k', linestyle='-', linewidth=1)
plt.text(int(len(Q_track_mc)*1.05), optimal_V[init_state]+.01, 'v*({})'.format(init_state))

plt.plot(moving_average(np.max(Q_track_mc, axis=2).T[init_state]), 
         '-', linewidth=2, label='FVMC')
plt.plot(moving_average(np.max(Q_track_sarsa, axis=2).T[init_state]), 
         '--', linewidth=2, label='Sarsa')
plt.plot(moving_average(np.max(Q_track_ql, axis=2).T[init_state]), 
         ':', linewidth=2, label='Q-Learning')
plt.plot(moving_average(np.max(Q_track_dql, axis=2).T[init_state]), 
         '-.', linewidth=2, label='Double Q-Learning')
plt.legend(loc=4, ncol=1)

plt.title('Estimated expected return (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Estimated value of initial state V({})'.format(init_state))
plt.xticks(rotation=45)

plt.show()
plt.plot(moving_average(np.mean(np.abs(np.max(Q_track_mc, axis=2) - optimal_V), axis=1)), 
         '-', linewidth=2, label='FVMC')
plt.plot(moving_average(np.mean(np.abs(np.max(Q_track_sarsa, axis=2) - optimal_V), axis=1)), 
         '--', linewidth=2, label='Sarsa')
plt.plot(moving_average(np.mean(np.abs(np.max(Q_track_ql, axis=2) - optimal_V), axis=1)), 
         ':', linewidth=2, label='Q-Learning')
plt.plot(moving_average(np.mean(np.abs(np.max(Q_track_dql, axis=2) - optimal_V), axis=1)), 
         '-.', linewidth=2, label='Double Q-Learning')
plt.legend(loc=1, ncol=1)

plt.title('State-value function estimation error (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Mean Absolute Error MAE(V, v*)')
plt.xticks(rotation=45)

plt.show()
plt.plot(moving_average(np.mean(np.abs(Q_track_mc - optimal_Q), axis=(1,2))), 
         '-', linewidth=2, label='FVMC')
plt.plot(moving_average(np.mean(np.abs(Q_track_sarsa - optimal_Q), axis=(1,2))), 
         '--', linewidth=2, label='Sarsa')
plt.plot(moving_average(np.mean(np.abs(Q_track_ql - optimal_Q), axis=(1,2))), 
         ':', linewidth=2, label='Q-Learning')
plt.plot(moving_average(np.mean(np.abs(Q_track_dql - optimal_Q), axis=(1,2))), 
         '-.', linewidth=2, label='Double Q-Learning')
plt.legend(loc=1, ncol=1)

plt.title('Action-value function estimation error (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Mean Absolute Error MAE(Q, q*)')
plt.xticks(rotation=45)

plt.show()