Improving Agents' Behaviors
그로킹 심층 강화학습 중 6장 내용인 "에이전트의 행동 개선"에 대한 내용입니다.
- MC 제어, SARSA, Q학습, 이중 Q학습
Note: 실행을 위해 아래의 패키지들을 설치해 주시기 바랍니다.
!pip install tqdm numpy scikit-learn pyglet setuptools
!pip install gym asciinema pandas tabulate tornado==5.* PyBullet
!pip install git+https://github.com/pybox2d/pybox2d#egg=Box2D
!pip install git+https://github.com/mimoralea/gym-bandits#egg=gym-bandits
!pip install git+https://github.com/mimoralea/gym-walk#egg=gym-walk
!pip install git+https://github.com/mimoralea/gym-aima#egg=gym-aima
!pip install gym[atari]
import warnings ; warnings.filterwarnings('ignore')
import itertools
import gym, gym_walk, gym_aima
import numpy as np
from tabulate import tabulate
from pprint import pprint
from tqdm import tqdm_notebook as tqdm
from itertools import cycle, count
import random
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
# Random seeds; each experiment below is repeated once per seed.
SEEDS = (12, 34, 56, 78, 90)
# IPython magic (not plain Python): render matplotlib figures inline in the notebook.
%matplotlib inline
plt.style.use('fivethirtyeight')
# Global matplotlib defaults (figure size and font sizes) applied via rcParams.
params = {
'figure.figsize': (15, 8),
'font.size': 24,
'legend.fontsize': 20,
'axes.titlesize': 28,
'axes.labelsize': 24,
'xtick.labelsize': 20,
'ytick.labelsize': 20
}
pylab.rcParams.update(params)
# Print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)
def value_iteration(P, gamma=1.0, theta=1e-10):
    """Solve a tabular MDP with value iteration.

    Args:
        P: Transition model indexed as ``P[state][action]``; each entry is an
            iterable of ``(prob, next_state, reward, done)`` tuples. States
            and actions are addressed by the integers ``0..len-1``.
        gamma: Discount factor.
        theta: Stop once the largest change of any state value in a sweep is
            below this threshold.

    Returns:
        ``(Q, V, pi)``: the action-value table of the last sweep, the
        state-value vector the last sweep was computed from (it differs from
        ``max_a Q`` by less than ``theta``), and a callable mapping a state
        to its greedy action under ``Q``.
    """
    n_states, n_actions = len(P), len(P[0])
    V = np.zeros(n_states, dtype=np.float64)
    while True:
        Q = np.zeros((n_states, n_actions), dtype=np.float64)
        for s in range(n_states):
            for a in range(len(P[s])):
                for prob, next_state, reward, done in P[s][a]:
                    # Terminal transitions contribute no bootstrapped value.
                    bootstrap = gamma * V[next_state] * (not done)
                    Q[s][a] += prob * (reward + bootstrap)
        greedy_values = np.max(Q, axis=1)
        if np.max(np.abs(V - greedy_values)) < theta:
            break
        V = greedy_values

    def pi(s):
        # Greedy action with respect to the final Q table.
        return dict(enumerate(np.argmax(Q, axis=1)))[s]

    return Q, V, pi
def print_policy(pi, P, action_symbols=('<', 'v', '>', '^'), n_cols=4, title='정책:'):
    """Print the action ``pi`` picks in each state as a grid of text cells.

    Args:
        pi: Callable mapping a state index to an action index.
        P: Transition model; ``P[s]`` is a dict of action -> list of
            ``(prob, next_state, reward, done)`` tuples.
        action_symbols: Symbol printed for each action index.
        n_cols: Number of cells per printed row.
        title: Heading printed before the grid.

    States whose every transition is flagged ``done`` are printed as blank
    cells.
    """
    print(title)
    symbol_of = dict(enumerate(action_symbols))
    for s in range(len(P)):
        a = pi(s)
        print("| ", end="")
        is_terminal = np.all([done
                              for transitions in P[s].values()
                              for _, _, _, done in transitions])
        if is_terminal:
            cell = "".rjust(9)
        else:
            cell = " ".join((str(s).zfill(2), symbol_of[a].rjust(6)))
        print(cell, end=" ")
        # Close the row after every n_cols cells.
        if (s + 1) % n_cols == 0:
            print("|")
def print_state_value_function(V, P, n_cols=4, prec=3, title='상태-가치 함수:'):
    """Print the value of each state as a grid of text cells.

    Args:
        V: Sequence of state values indexed by state.
        P: Transition model; ``P[s]`` is a dict of action -> list of
            ``(prob, next_state, reward, done)`` tuples.
        n_cols: Number of cells per printed row.
        prec: Decimal places the values are rounded to.
        title: Heading printed before the grid.

    States whose every transition is flagged ``done`` are printed as blank
    cells.
    """
    print(title)
    for s in range(len(P)):
        v = V[s]
        print("| ", end="")
        is_terminal = np.all([done
                              for transitions in P[s].values()
                              for _, _, _, done in transitions])
        if is_terminal:
            cell = "".rjust(9)
        else:
            rounded = '{}'.format(np.round(v, prec))
            cell = " ".join((str(s).zfill(2), rounded.rjust(6)))
        print(cell, end=" ")
        # Close the row after every n_cols cells.
        if (s + 1) % n_cols == 0:
            print("|")
def print_action_value_function(Q,
                                optimal_Q=None,
                                action_symbols=('<', '>'),
                                prec=3,
                                title='행동-가치 함수:'):
    """Print an action-value table, optionally next to a reference table.

    Args:
        Q: Action-value array with one row per state.
        optimal_Q: Optional reference array of the same shape. When given,
            its values and the difference ``optimal_Q - Q`` are appended as
            extra column groups (headers suffixed ``*`` and ``er``).
        action_symbols: Column label for each action.
        prec: Decimal places the values are rounded to.
        title: Heading printed before the table.
    """
    has_reference = optimal_Q is not None
    vf_types = ('', '*', 'er') if has_reference else ('',)
    headers = ['s']
    headers.extend(' '.join(pair)
                   for pair in itertools.product(vf_types, action_symbols))
    print(title)
    # First column is the state index, followed by the value columns.
    state_column = np.arange(len(Q))[..., np.newaxis]
    columns = [state_column, np.round(Q, prec)]
    if has_reference:
        columns.append(np.round(optimal_Q, prec))
        columns.append(np.round(optimal_Q - Q, prec))
    arr = np.hstack(columns)
    print(tabulate(arr, headers, tablefmt="fancy_grid"))
def get_policy_metrics(env, gamma, pi, goal_state, optimal_Q,
                       n_episodes=100, max_steps=200):
    """Evaluate a policy by rolling it out in ``env`` with fixed seeds.

    Args:
        env: Environment exposing ``seed``, ``reset`` and ``step`` (returning
            ``(state, reward, done, info)``).
        gamma: Discount factor applied to the rewards.
        pi: Callable mapping a state to an action.
        goal_state: State that counts as success when an episode ends in it.
        optimal_Q: Reference action values used to measure regret.
        n_episodes: Number of evaluation episodes.
        max_steps: Step cap per episode.

    Returns:
        ``np.ndarray`` of three values: success rate in percent, mean
        discounted return, and mean accumulated regret per episode.
    """
    # Reseed every RNG source so repeated evaluations are comparable.
    random.seed(123)
    np.random.seed(123)
    env.seed(123)
    reached_goal, episode_reward, episode_regret = [], [], []
    for _ in range(n_episodes):
        state = env.reset()
        discounted_return, total_regret = 0.0, 0.0
        for step in range(max_steps):
            action = pi(state)
            # Regret: value lost by not taking the best action in this state.
            total_regret += np.max(optimal_Q[state]) - optimal_Q[state][action]
            state, reward, done, _ = env.step(action)
            discounted_return += (gamma**step * reward)
            if done:
                break
        episode_reward.append(discounted_return)
        episode_regret.append(total_regret)
        reached_goal.append(state == goal_state)
    success_pct = np.sum(reached_goal)/len(reached_goal)*100
    return np.array((success_pct,
                     np.mean(episode_reward),
                     np.mean(episode_regret)))
def get_metrics_from_tracks(env, gamma, goal_state, optimal_Q, pi_track, coverage=0.1):
    """Evaluate a subsample of the policies in ``pi_track`` and smooth the results.

    Only about ``coverage * len(pi_track)`` evenly spaced policies are
    evaluated with ``get_policy_metrics``; every other episode reuses the most
    recent evaluation.

    Args:
        env: Environment forwarded to ``get_policy_metrics``.
        gamma: Discount factor.
        goal_state: State that counts as success.
        optimal_Q: Reference action values used to measure regret.
        pi_track: Sequence of policies, one per episode; each is indexable by
            state and yields an action.
        coverage: Fraction of the policies that are actually evaluated.

    Returns:
        ``(success_rate_ma, mean_return_ma, mean_regret_ma)``: the three
        metric series after ``moving_average``.
    """
    total_samples = len(pi_track)
    n_samples = int(total_samples * coverage)
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
    # A set gives O(1) membership tests inside the loop. Note that the
    # endpoint `total_samples` itself is never a valid episode index.
    samples_e = set(np.linspace(0, total_samples, n_samples,
                                endpoint=True, dtype=int).tolist())
    metrics = []
    for e, policy in enumerate(tqdm(pi_track)):
        # Always evaluate when nothing has been evaluated yet, so there is a
        # previous result to reuse even if no sample index was selected.
        if e in samples_e or not metrics:
            metrics.append(get_policy_metrics(
                env,
                gamma=gamma,
                pi=lambda s, policy=policy: policy[s],
                goal_state=goal_state,
                optimal_Q=optimal_Q))
        else:
            metrics.append(metrics[-1])
    metrics = np.array(metrics)
    success_rate_ma, mean_return_ma, mean_regret_ma = np.apply_along_axis(
        moving_average, axis=0, arr=metrics).T
    return success_rate_ma, mean_return_ma, mean_regret_ma
def rmse(x, y, dp=4):
    """Return the root-mean-square error between ``x`` and ``y``, rounded to ``dp`` decimals."""
    squared_error = (x - y)**2
    root_mean = np.sqrt(np.mean(squared_error))
    return np.round(root_mean, dp)
def moving_average(a, n=100):
    """Return the moving average of ``a`` over a window of ``n`` items.

    Only full windows are reported, so the result has ``len(a) - n + 1``
    entries.
    """
    running = np.cumsum(a, dtype=float)
    # Difference of cumulative sums n apart is the sum of each n-item window.
    window_sums = running[n:] - running[:-n]
    running[n:] = window_sums
    return running[n - 1:] / n
def plot_value_function(title, V_track, V_true=None, log=False, limit_value=0.05, limit_items=5):
    """Plot how the value estimates of a few states evolve over episodes.

    Args:
        title: Figure title.
        V_track: 2-D array indexed ``[episode, state]`` of value estimates.
        V_true: Optional true values per state; drawn as horizontal
            reference lines for the plotted states.
        log: Use a logarithmic x axis when true.
        limit_value: Only states whose final estimate exceeds this value are
            candidates for plotting.
        limit_items: Maximum number of states to plot.

    The states to plot are drawn at random from the candidates with a fixed
    seed, so the same states are picked on every call.
    """
    np.random.seed(123)
    per_col = 25
    linecycler = cycle(["-","--",":","-."])
    legends = []
    # flatnonzero always yields a 1-D index array; the previous
    # argwhere(...).squeeze() collapsed to a 0-d array (breaking len()) when
    # exactly one state passed the threshold.
    valid_values = np.flatnonzero(V_track[-1] > limit_value)
    items_idxs = np.random.choice(valid_values,
                                  min(len(valid_values), limit_items),
                                  replace=False)
    # Every selected state already has a final estimate above limit_value,
    # so no further threshold check is needed below.
    selected = set(items_idxs.tolist())

    # Draw the true values first, as reference lines.
    if V_true is not None:
        for i in range(V_track.shape[1]):
            if i not in selected:
                continue
            label = 'v*({})'.format(i)
            plt.axhline(y=V_true[i], color='k', linestyle='-', linewidth=1)
            plt.text(int(len(V_track)*1.02), V_true[i]+.01, label)

    # Then draw the estimates for the same states.
    for i, state in enumerate(V_track.T):
        if i not in selected:
            continue
        line_type = next(linecycler)
        label = 'V({})'.format(i)
        p, = plt.plot(state, line_type, label=label, linewidth=3)
        legends.append(p)

    legends.reverse()

    # Build one legend per chunk of per_col lines. plt.legend replaces the
    # axes' current legend, so earlier ones are re-added as plain artists.
    ls = []
    for idx in range(0, len(legends), per_col):
        subset = legends[idx:idx+per_col]
        l = plt.legend(subset, [p.get_label() for p in subset],
                       loc='center right', bbox_to_anchor=(1.25, 0.5))
        ls.append(l)
    for l in ls[:-1]:
        plt.gca().add_artist(l)
    if log:
        plt.xscale('log')
    plt.title(title)
    plt.ylabel('State-value function')
    plt.xlabel('Episodes (log scale)' if log else 'Episodes')
    plt.show()
def decay_schedule(init_value, min_value, decay_ratio, max_steps, log_start=-2, log_base=10):
    """Build a per-step schedule that decays from ``init_value`` to ``min_value``.

    The first ``int(max_steps * decay_ratio)`` entries follow a reversed
    log-spaced curve rescaled to run from ``init_value`` down to
    ``min_value``; the remaining entries repeat the final value.

    Args:
        init_value: First value of the schedule.
        min_value: Value reached at the end of the decay.
        decay_ratio: Fraction of ``max_steps`` spent decaying.
        max_steps: Total length of the returned array.
        log_start: Starting exponent of the log-spaced curve (it ends at 0).
        log_base: Base of the log-spaced curve.

    Returns:
        1-D ``np.ndarray`` of length ``max_steps``.
    """
    decay_steps = int(max_steps * decay_ratio)
    rem_steps = max_steps - decay_steps
    curve = np.logspace(log_start, 0, decay_steps, base=log_base, endpoint=True)[::-1]
    # Normalise the curve to [0, 1] before scaling it to the requested range.
    lowest, highest = curve.min(), curve.max()
    curve = (curve - lowest) / (highest - lowest)
    curve = (init_value - min_value) * curve + min_value
    return np.pad(curve, (0, rem_steps), 'edge')
# --- Experiment configuration for the 'SlipperyWalkSeven-v0' environment ---
env = gym.make('SlipperyWalkSeven-v0')
init_state = env.reset()
goal_state = 8
gamma = 0.99
n_episodes = 3000
P = env.env.P

# Display settings: grid width and rounding precisions for the printed tables.
n_cols = 9
svf_prec = 4
err_prec = 2
avf_prec = 3
action_symbols = ('<', '>')

# State-selection settings for the value plots (full view and close-up).
limit_items = 5
limit_value = 0.0
cu_limit_items = 10
cu_limit_value = 0.0
cu_episodes = 100

# Visualise the alpha and epsilon schedules (the plotted values match the
# defaults of the learning functions).
for _schedule_args, _line_style, _label in (
        ((0.5, 0.01, 0.5), '-', 'Alpha schedule'),
        ((1.0, 0.1, 0.9), ':', 'Epsilon schedule')):
    plt.plot(decay_schedule(*_schedule_args, n_episodes),
             _line_style, linewidth=2, label=_label)
plt.legend(loc=1, ncol=1)
plt.title('Alpha and epsilon schedules')
plt.xlabel('Episodes')
plt.ylabel('Hyperparameter values')
plt.xticks(rotation=45)
plt.show()
# Solve the MDP exactly; these results are the reference for all learners.
optimal_Q, optimal_V, optimal_pi = value_iteration(P, gamma=gamma)

print_state_value_function(
    optimal_V, P,
    n_cols=n_cols,
    prec=svf_prec,
    title='Optimal state-value function:')
print()
print_action_value_function(
    optimal_Q,
    None,
    action_symbols=action_symbols,
    prec=avf_prec,
    title='Optimal action-value function:')
print()
print_policy(optimal_pi, P, action_symbols=action_symbols, n_cols=n_cols)

# Roll out the optimal policy to get its success rate, return and regret.
success_rate_op, mean_return_op, mean_regret_op = get_policy_metrics(
    env,
    gamma=gamma,
    pi=optimal_pi,
    goal_state=goal_state,
    optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_op, mean_return_op, mean_regret_op))
def generate_trajectory(select_action, Q, epsilon, env, max_steps=200):
    """Roll out one complete episode and return it as an array of experiences.

    Episodes that have not terminated after ``max_steps`` steps are discarded
    and a fresh episode is started, so the returned trajectory always ends in
    a terminal transition.

    Args:
        select_action: Callable ``(state, Q, epsilon) -> action``.
        Q: Action-value table forwarded to ``select_action``.
        epsilon: Exploration rate forwarded to ``select_action``.
        env: Environment exposing ``reset`` and ``step`` (returning
            ``(next_state, reward, done, info)``).
        max_steps: Maximum number of steps an episode may take.

    Returns:
        Object-dtype ``np.ndarray`` with one row per step:
        ``(state, action, reward, next_state, done)``.
    """
    done, trajectory = False, []
    while not done:
        state = env.reset()
        for t in count():
            action = select_action(state, Q, epsilon)
            next_state, reward, done, _ = env.step(action)
            experience = (state, action, reward, next_state, done)
            trajectory.append(experience)
            if done:
                break
            if t >= max_steps - 1:
                # Truncated episode: throw it away and try again.
                trajectory = []
                break
            state = next_state
    # `np.object` was removed in NumPy 1.24; the builtin `object` is the
    # same dtype.
    return np.array(trajectory, dtype=object)
def mc_control(env,
               gamma=1.0,
               init_alpha=0.5,
               min_alpha=0.01,
               alpha_decay_ratio=0.5,
               init_epsilon=1.0,
               min_epsilon=0.1,
               epsilon_decay_ratio=0.9,
               n_episodes=3000,
               max_steps=200,
               first_visit=True):
    """Monte Carlo control with an epsilon-greedy behaviour policy.

    Args:
        env: Environment with discrete ``observation_space`` and
            ``action_space`` (both exposing ``.n``).
        gamma: Discount factor.
        init_alpha, min_alpha, alpha_decay_ratio: Learning-rate schedule
            parameters passed to ``decay_schedule``.
        init_epsilon, min_epsilon, epsilon_decay_ratio: Exploration-rate
            schedule parameters passed to ``decay_schedule``.
        n_episodes: Number of episodes to learn from.
        max_steps: Maximum episode length accepted by ``generate_trajectory``.
        first_visit: If true, update each state-action pair only on its first
            occurrence in an episode; otherwise on every occurrence.

    Returns:
        ``(Q, V, pi, Q_track, pi_track)``: final action values, the greedy
        state values, a greedy-policy callable, the action values after each
        episode, and the greedy actions after each episode.
    """
    nS, nA = env.observation_space.n, env.action_space.n
    # discounts[k] == gamma**k for k in 0..max_steps-1.
    discounts = np.logspace(0,
                            max_steps,
                            num=max_steps,
                            base=gamma,
                            endpoint=False)
    alphas = decay_schedule(init_alpha,
                            min_alpha,
                            alpha_decay_ratio,
                            n_episodes)
    epsilons = decay_schedule(init_epsilon,
                              min_epsilon,
                              epsilon_decay_ratio,
                              n_episodes)
    pi_track = []
    Q = np.zeros((nS, nA), dtype=np.float64)
    Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float64)

    def select_action(state, Q, epsilon):
        # Epsilon-greedy: exploit with probability 1 - epsilon.
        if np.random.random() > epsilon:
            return np.argmax(Q[state])
        return np.random.randint(len(Q[state]))

    for e in tqdm(range(n_episodes), leave=False):
        trajectory = generate_trajectory(select_action,
                                         Q,
                                         epsilons[e],
                                         env,
                                         max_steps)
        # `np.bool` was removed in NumPy 1.24; the builtin `bool` is the
        # same dtype.
        visited = np.zeros((nS, nA), dtype=bool)
        for t, (state, action, reward, _, _) in enumerate(trajectory):
            if visited[state][action] and first_visit:
                continue
            visited[state][action] = True

            # Discounted return from step t to the end of the episode.
            n_steps = len(trajectory[t:])
            G = np.sum(discounts[:n_steps] * trajectory[t:, 2])
            Q[state][action] = Q[state][action] + alphas[e] * (G - Q[state][action])

        Q_track[e] = Q
        pi_track.append(np.argmax(Q, axis=1))

    V = np.max(Q, axis=1)
    pi = lambda s: {s:a for s, a in enumerate(np.argmax(Q, axis=1))}[s]
    return Q, V, pi, Q_track, pi_track
# Run MC control once per seed and collect each run's estimates.
Q_mcs, V_mcs, Q_track_mcs = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    Q_mc, V_mc, pi_mc, Q_track_mc, pi_track_mc = mc_control(
        env, gamma=gamma, n_episodes=n_episodes)
    Q_mcs.append(Q_mc)
    V_mcs.append(V_mc)
    Q_track_mcs.append(Q_track_mc)

# Average the estimates over seeds; pi_mc and pi_track_mc stay those of the
# last seed.
Q_mc = np.mean(Q_mcs, axis=0)
V_mc = np.mean(V_mcs, axis=0)
Q_track_mc = np.mean(Q_track_mcs, axis=0)
del Q_mcs, V_mcs, Q_track_mcs

# Compare the learned values with the optimal ones.
for _values, _prec, _title in (
        (V_mc, svf_prec, 'State-value function found by FVMC:'),
        (optimal_V, svf_prec, 'Optimal state-value function:'),
        (V_mc - optimal_V, err_prec, 'State-value function errors:')):
    print_state_value_function(_values, P, n_cols=n_cols, prec=_prec, title=_title)
print('State-value function RMSE: {}'.format(rmse(V_mc, optimal_V)))
print()
print_action_value_function(
    Q_mc,
    optimal_Q,
    action_symbols=action_symbols,
    prec=avf_prec,
    title='FVMC action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_mc, optimal_Q)))
print()
print_policy(pi_mc, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_mc, mean_return_mc, mean_regret_mc = get_policy_metrics(
    env,
    gamma=gamma,
    pi=pi_mc,
    goal_state=goal_state,
    optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_mc, mean_return_mc, mean_regret_mc))
def sarsa(env,
          gamma=1.0,
          init_alpha=0.5,
          min_alpha=0.01,
          alpha_decay_ratio=0.5,
          init_epsilon=1.0,
          min_epsilon=0.1,
          epsilon_decay_ratio=0.9,
          n_episodes=3000):
    """SARSA: on-policy TD control with an epsilon-greedy policy.

    Args:
        env: Environment with discrete ``observation_space`` and
            ``action_space`` (both exposing ``.n``).
        gamma: Discount factor.
        init_alpha, min_alpha, alpha_decay_ratio: Learning-rate schedule
            parameters passed to ``decay_schedule``.
        init_epsilon, min_epsilon, epsilon_decay_ratio: Exploration-rate
            schedule parameters passed to ``decay_schedule``.
        n_episodes: Number of episodes to learn from.

    Returns:
        ``(Q, V, pi, Q_track, pi_track)``: final action values, the greedy
        state values, a greedy-policy callable, the action values after each
        episode, and the greedy actions after each episode.
    """
    nS, nA = env.observation_space.n, env.action_space.n
    alphas = decay_schedule(init_alpha, min_alpha, alpha_decay_ratio, n_episodes)
    epsilons = decay_schedule(init_epsilon, min_epsilon, epsilon_decay_ratio, n_episodes)

    Q = np.zeros((nS, nA), dtype=np.float64)
    Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float64)
    pi_track = []

    def select_action(state, Q, epsilon):
        # Epsilon-greedy: exploit with probability 1 - epsilon.
        if np.random.random() > epsilon:
            return np.argmax(Q[state])
        return np.random.randint(len(Q[state]))

    for e in tqdm(range(n_episodes), leave=False):
        alpha, epsilon = alphas[e], epsilons[e]
        state, done = env.reset(), False
        action = select_action(state, Q, epsilon)
        while not done:
            next_state, reward, done, _ = env.step(action)
            next_action = select_action(next_state, Q, epsilon)
            # Bootstrap from the action the policy will actually take next.
            td_target = reward + gamma * Q[next_state][next_action] * (not done)
            td_error = td_target - Q[state][action]
            Q[state][action] = Q[state][action] + alpha * td_error
            state, action = next_state, next_action
        Q_track[e] = Q
        pi_track.append(np.argmax(Q, axis=1))

    V = np.max(Q, axis=1)

    def pi(s):
        # Greedy action with respect to the final Q table.
        return dict(enumerate(np.argmax(Q, axis=1)))[s]

    return Q, V, pi, Q_track, pi_track
# Run SARSA once per seed and collect each run's estimates.
Q_sarsas, V_sarsas, Q_track_sarsas = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    Q_sarsa, V_sarsa, pi_sarsa, Q_track_sarsa, pi_track_sarsa = sarsa(
        env, gamma=gamma, n_episodes=n_episodes)
    Q_sarsas.append(Q_sarsa)
    V_sarsas.append(V_sarsa)
    Q_track_sarsas.append(Q_track_sarsa)

# Average the estimates over seeds; pi_sarsa and pi_track_sarsa stay those of
# the last seed.
Q_sarsa = np.mean(Q_sarsas, axis=0)
V_sarsa = np.mean(V_sarsas, axis=0)
Q_track_sarsa = np.mean(Q_track_sarsas, axis=0)
del Q_sarsas, V_sarsas, Q_track_sarsas

# Compare the learned values with the optimal ones.
for _values, _prec, _title in (
        (V_sarsa, svf_prec, 'State-value function found by Sarsa:'),
        (optimal_V, svf_prec, 'Optimal state-value function:'),
        (V_sarsa - optimal_V, err_prec, 'State-value function errors:')):
    print_state_value_function(_values, P, n_cols=n_cols, prec=_prec, title=_title)
print('State-value function RMSE: {}'.format(rmse(V_sarsa, optimal_V)))
print()
print_action_value_function(
    Q_sarsa,
    optimal_Q,
    action_symbols=action_symbols,
    prec=avf_prec,
    title='Sarsa action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_sarsa, optimal_Q)))
print()
print_policy(pi_sarsa, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_sarsa, mean_return_sarsa, mean_regret_sarsa = get_policy_metrics(
    env,
    gamma=gamma,
    pi=pi_sarsa,
    goal_state=goal_state,
    optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_sarsa, mean_return_sarsa, mean_regret_sarsa))
def q_learning(env,
               gamma=1.0,
               init_alpha=0.5,
               min_alpha=0.01,
               alpha_decay_ratio=0.5,
               init_epsilon=1.0,
               min_epsilon=0.1,
               epsilon_decay_ratio=0.9,
               n_episodes=3000):
    """Q-learning: off-policy TD control with epsilon-greedy exploration.

    Args:
        env: Environment with discrete ``observation_space`` and
            ``action_space`` (both exposing ``.n``).
        gamma: Discount factor.
        init_alpha, min_alpha, alpha_decay_ratio: Learning-rate schedule
            parameters passed to ``decay_schedule``.
        init_epsilon, min_epsilon, epsilon_decay_ratio: Exploration-rate
            schedule parameters passed to ``decay_schedule``.
        n_episodes: Number of episodes to learn from.

    Returns:
        ``(Q, V, pi, Q_track, pi_track)``: final action values, the greedy
        state values, a greedy-policy callable, the action values after each
        episode, and the greedy actions after each episode.
    """
    nS, nA = env.observation_space.n, env.action_space.n
    alphas = decay_schedule(init_alpha, min_alpha, alpha_decay_ratio, n_episodes)
    epsilons = decay_schedule(init_epsilon, min_epsilon, epsilon_decay_ratio, n_episodes)

    Q = np.zeros((nS, nA), dtype=np.float64)
    Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float64)
    pi_track = []

    def select_action(state, Q, epsilon):
        # Epsilon-greedy: exploit with probability 1 - epsilon.
        if np.random.random() > epsilon:
            return np.argmax(Q[state])
        return np.random.randint(len(Q[state]))

    for e in tqdm(range(n_episodes), leave=False):
        alpha, epsilon = alphas[e], epsilons[e]
        state, done = env.reset(), False
        while not done:
            action = select_action(state, Q, epsilon)
            next_state, reward, done, _ = env.step(action)
            # Bootstrap from the best next action, whatever is taken next.
            td_target = reward + gamma * Q[next_state].max() * (not done)
            td_error = td_target - Q[state][action]
            Q[state][action] = Q[state][action] + alpha * td_error
            state = next_state
        Q_track[e] = Q
        pi_track.append(np.argmax(Q, axis=1))

    V = np.max(Q, axis=1)

    def pi(s):
        # Greedy action with respect to the final Q table.
        return dict(enumerate(np.argmax(Q, axis=1)))[s]

    return Q, V, pi, Q_track, pi_track
# Run Q-learning once per seed and collect each run's estimates.
Q_qls, V_qls, Q_track_qls = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    Q_ql, V_ql, pi_ql, Q_track_ql, pi_track_ql = q_learning(
        env, gamma=gamma, n_episodes=n_episodes)
    Q_qls.append(Q_ql)
    V_qls.append(V_ql)
    Q_track_qls.append(Q_track_ql)

# Average the estimates over seeds; pi_ql and pi_track_ql stay those of the
# last seed.
Q_ql = np.mean(Q_qls, axis=0)
V_ql = np.mean(V_qls, axis=0)
Q_track_ql = np.mean(Q_track_qls, axis=0)
del Q_qls, V_qls, Q_track_qls

# Compare the learned values with the optimal ones.
for _values, _prec, _title in (
        (V_ql, svf_prec, 'State-value function found by Q-learning:'),
        (optimal_V, svf_prec, 'Optimal state-value function:'),
        (V_ql - optimal_V, err_prec, 'State-value function errors:')):
    print_state_value_function(_values, P, n_cols=n_cols, prec=_prec, title=_title)
print('State-value function RMSE: {}'.format(rmse(V_ql, optimal_V)))
print()
print_action_value_function(
    Q_ql,
    optimal_Q,
    action_symbols=action_symbols,
    prec=avf_prec,
    title='Q-learning action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_ql, optimal_Q)))
print()
print_policy(pi_ql, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_ql, mean_return_ql, mean_regret_ql = get_policy_metrics(
    env,
    gamma=gamma,
    pi=pi_ql,
    goal_state=goal_state,
    optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_ql, mean_return_ql, mean_regret_ql))
def double_q_learning(env,
                      gamma=1.0,
                      init_alpha=0.5,
                      min_alpha=0.01,
                      alpha_decay_ratio=0.5,
                      init_epsilon=1.0,
                      min_epsilon=0.1,
                      epsilon_decay_ratio=0.9,
                      n_episodes=3000):
    """Double Q-learning: two action-value tables that evaluate each other.

    On every step one of the two tables is picked at random and updated; it
    chooses the greedy next action while the other table supplies that
    action's value. Behaviour is epsilon-greedy on the mean of both tables.

    Args:
        env: Environment with discrete ``observation_space`` and
            ``action_space`` (both exposing ``.n``).
        gamma: Discount factor.
        init_alpha, min_alpha, alpha_decay_ratio: Learning-rate schedule
            parameters passed to ``decay_schedule``.
        init_epsilon, min_epsilon, epsilon_decay_ratio: Exploration-rate
            schedule parameters passed to ``decay_schedule``.
        n_episodes: Number of episodes to learn from.

    Returns:
        ``(Q, V, pi, Q_track, pi_track)``: mean of the two final tables, its
        greedy state values, a greedy-policy callable, the mean of the two
        tables after each episode, and the greedy actions after each episode.
    """
    nS, nA = env.observation_space.n, env.action_space.n
    alphas = decay_schedule(init_alpha, min_alpha, alpha_decay_ratio, n_episodes)
    epsilons = decay_schedule(init_epsilon, min_epsilon, epsilon_decay_ratio, n_episodes)

    Q1 = np.zeros((nS, nA), dtype=np.float64)
    Q2 = np.zeros((nS, nA), dtype=np.float64)
    Q_track1 = np.zeros((n_episodes, nS, nA), dtype=np.float64)
    Q_track2 = np.zeros((n_episodes, nS, nA), dtype=np.float64)
    pi_track = []

    def select_action(state, Q, epsilon):
        # Epsilon-greedy: exploit with probability 1 - epsilon.
        if np.random.random() > epsilon:
            return np.argmax(Q[state])
        return np.random.randint(len(Q[state]))

    for e in tqdm(range(n_episodes), leave=False):
        alpha, epsilon = alphas[e], epsilons[e]
        state, done = env.reset(), False
        while not done:
            action = select_action(state, (Q1 + Q2)/2, epsilon)
            next_state, reward, done, _ = env.step(action)

            # Coin flip: which table learns (and picks the next action) and
            # which one evaluates that action.
            Q_learn, Q_eval = (Q1, Q2) if np.random.randint(2) else (Q2, Q1)
            greedy_next = np.argmax(Q_learn[next_state])
            td_target = reward + gamma * Q_eval[next_state][greedy_next] * (not done)
            td_error = td_target - Q_learn[state][action]
            Q_learn[state][action] = Q_learn[state][action] + alpha * td_error
            state = next_state

        Q_track1[e] = Q1
        Q_track2[e] = Q2
        pi_track.append(np.argmax((Q1 + Q2)/2, axis=1))

    Q = (Q1 + Q2)/2.
    V = np.max(Q, axis=1)

    def pi(s):
        # Greedy action with respect to the averaged final table.
        return dict(enumerate(np.argmax(Q, axis=1)))[s]

    return Q, V, pi, (Q_track1 + Q_track2)/2., pi_track
# Run double Q-learning once per seed and collect each run's estimates.
Q_dqls, V_dqls, Q_track_dqls = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    Q_dql, V_dql, pi_dql, Q_track_dql, pi_track_dql = double_q_learning(
        env, gamma=gamma, n_episodes=n_episodes)
    Q_dqls.append(Q_dql)
    V_dqls.append(V_dql)
    Q_track_dqls.append(Q_track_dql)

# Average the estimates over seeds; pi_dql and pi_track_dql stay those of the
# last seed.
Q_dql = np.mean(Q_dqls, axis=0)
V_dql = np.mean(V_dqls, axis=0)
Q_track_dql = np.mean(Q_track_dqls, axis=0)
del Q_dqls, V_dqls, Q_track_dqls

# Compare the learned values with the optimal ones.
for _values, _prec, _title in (
        (V_dql, svf_prec, 'State-value function found by Double Q-Learning:'),
        (optimal_V, svf_prec, 'Optimal state-value function:'),
        (V_dql - optimal_V, err_prec, 'State-value function errors:')):
    print_state_value_function(_values, P, n_cols=n_cols, prec=_prec, title=_title)
print('State-value function RMSE: {}'.format(rmse(V_dql, optimal_V)))
print()
print_action_value_function(
    Q_dql,
    optimal_Q,
    action_symbols=action_symbols,
    prec=avf_prec,
    title='Double Q-Learning action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_dql, optimal_Q)))
print()
print_policy(pi_dql, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_dql, mean_return_dql, mean_regret_dql = get_policy_metrics(
    env,
    gamma=gamma,
    pi=pi_dql,
    goal_state=goal_state,
    optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_dql, mean_return_dql, mean_regret_dql))
# For each algorithm, in turn, draw three views of how its state-value
# estimates (max over actions of the seed-averaged Q track) evolve:
# linear x axis, log x axis, and a close-up of the first cu_episodes.
for _algo_name, _q_track in (('FVMC', Q_track_mc),
                             ('Sarsa', Q_track_sarsa),
                             ('Q-Learning', Q_track_ql),
                             ('Double Q-Learning', Q_track_dql)):
    plot_value_function(
        _algo_name + ' estimates through time vs. true values',
        np.max(_q_track, axis=2),
        optimal_V,
        limit_items=limit_items,
        limit_value=limit_value,
        log=False)
    plot_value_function(
        _algo_name + ' estimates through time vs. true values (log scale)',
        np.max(_q_track, axis=2),
        optimal_V,
        limit_items=limit_items,
        limit_value=limit_value,
        log=True)
    plot_value_function(
        _algo_name + ' estimates through time (close up)',
        np.max(_q_track, axis=2)[:cu_episodes],
        None,
        limit_items=cu_limit_items,
        limit_value=cu_limit_value,
        log=False)
# Evaluate the per-episode greedy policies of each algorithm (last seed's
# policy track) and smooth the resulting metric series.
mc_success_rate_ma, mc_mean_return_ma, mc_mean_regret_ma = get_metrics_from_tracks(
    env, gamma, goal_state, optimal_Q, pi_track_mc)
sarsa_success_rate_ma, sarsa_mean_return_ma, sarsa_mean_regret_ma = get_metrics_from_tracks(
    env, gamma, goal_state, optimal_Q, pi_track_sarsa)
ql_success_rate_ma, ql_mean_return_ma, ql_mean_regret_ma = get_metrics_from_tracks(
    env, gamma, goal_state, optimal_Q, pi_track_ql)
dql_success_rate_ma, dql_mean_return_ma, dql_mean_regret_ma = get_metrics_from_tracks(
    env, gamma, goal_state, optimal_Q, pi_track_dql)

# Line style and legend label per algorithm, in plotting order.
_curve_styles = (('-', 'FVMC'),
                 ('--', 'Sarsa'),
                 (':', 'Q-Learning'),
                 ('-.', 'Double Q-Learning'))

# Success rate, with the optimal policy's rate as a reference line.
plt.axhline(y=success_rate_op, color='k', linestyle='-', linewidth=1)
plt.text(int(len(mc_success_rate_ma)*1.02), success_rate_op*1.01, 'π*')
for _series, (_line_style, _label) in zip(
        (mc_success_rate_ma, sarsa_success_rate_ma,
         ql_success_rate_ma, dql_success_rate_ma),
        _curve_styles):
    plt.plot(_series, _line_style, linewidth=2, label=_label)
plt.legend(loc=4, ncol=1)
plt.title('Policy success rate (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Success rate %')
plt.ylim(-1, 101)
plt.xticks(rotation=45)
plt.show()

# Discounted return, with the optimal policy's return as a reference line.
plt.axhline(y=mean_return_op, color='k', linestyle='-', linewidth=1)
plt.text(int(len(mc_mean_return_ma)*1.02), mean_return_op*1.01, 'π*')
for _series, (_line_style, _label) in zip(
        (mc_mean_return_ma, sarsa_mean_return_ma,
         ql_mean_return_ma, dql_mean_return_ma),
        _curve_styles):
    plt.plot(_series, _line_style, linewidth=2, label=_label)
plt.legend(loc=4, ncol=1)
plt.title('Policy episode return (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Return (Gt:T)')
plt.xticks(rotation=45)
plt.show()

# Regret accumulated per episode.
for _series, (_line_style, _label) in zip(
        (mc_mean_regret_ma, sarsa_mean_regret_ma,
         ql_mean_regret_ma, dql_mean_regret_ma),
        _curve_styles):
    plt.plot(_series, _line_style, linewidth=2, label=_label)
plt.legend(loc=1, ncol=1)
plt.title('Policy episode regret (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Regret (q* - Q)')
plt.xticks(rotation=45)
plt.show()
# Q track, line style and legend label per algorithm, in plotting order.
_tracked_curves = ((Q_track_mc, '-', 'FVMC'),
                   (Q_track_sarsa, '--', 'Sarsa'),
                   (Q_track_ql, ':', 'Q-Learning'),
                   (Q_track_dql, '-.', 'Double Q-Learning'))

# Estimated value of the initial state, against its optimal value.
plt.axhline(y=optimal_V[init_state], color='k', linestyle='-', linewidth=1)
plt.text(int(len(Q_track_mc)*1.05), optimal_V[init_state]+.01, 'v*({})'.format(init_state))
for _q_track, _line_style, _label in _tracked_curves:
    plt.plot(moving_average(np.max(_q_track, axis=2).T[init_state]),
             _line_style, linewidth=2, label=_label)
plt.legend(loc=4, ncol=1)
plt.title('Estimated expected return (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Estimated value of initial state V({})'.format(init_state))
plt.xticks(rotation=45)
plt.show()

# Mean absolute error of the state-value estimates per episode.
for _q_track, _line_style, _label in _tracked_curves:
    plt.plot(moving_average(np.mean(np.abs(np.max(_q_track, axis=2) - optimal_V), axis=1)),
             _line_style, linewidth=2, label=_label)
plt.legend(loc=1, ncol=1)
plt.title('State-value function estimation error (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Mean Absolute Error MAE(V, v*)')
plt.xticks(rotation=45)
plt.show()

# Mean absolute error of the action-value estimates per episode.
for _q_track, _line_style, _label in _tracked_curves:
    plt.plot(moving_average(np.mean(np.abs(_q_track - optimal_Q), axis=(1,2))),
             _line_style, linewidth=2, label=_label)
plt.legend(loc=1, ncol=1)
plt.title('Action-value function estimation error (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Mean Absolute Error MAE(Q, q*)')
plt.xticks(rotation=45)
plt.show()
# --- Experiment configuration for the 'RussellNorvigGridworld-v0' environment ---
env = gym.make('RussellNorvigGridworld-v0')
init_state = env.reset()
goal_state = 3
gamma = 1.0
n_episodes = 4000
P = env.env.P

# Display settings: grid width and rounding precisions for the printed tables.
n_cols = 4
svf_prec = 4
err_prec = 2
avf_prec = 3
action_symbols = ('<', 'v', '>', '^')

# State-selection settings for the value plots (full view and close-up).
limit_items = 5
limit_value = 0.01
cu_limit_items = 10
cu_limit_value = 0.0
cu_episodes = 1000

# Visualise the alpha and epsilon schedules (the plotted values match the
# defaults of the learning functions).
for _schedule_args, _line_style, _label in (
        ((0.5, 0.01, 0.5), '-', 'Alpha schedule'),
        ((1.0, 0.1, 0.9), ':', 'Epsilon schedule')):
    plt.plot(decay_schedule(*_schedule_args, n_episodes),
             _line_style, linewidth=2, label=_label)
plt.legend(loc=1, ncol=1)
plt.title('Alpha and epsilon schedules')
plt.xlabel('Episodes')
plt.ylabel('Hyperparameter values')
plt.xticks(rotation=45)
plt.show()
# Solve the MDP exactly; these results are the reference for all learners.
optimal_Q, optimal_V, optimal_pi = value_iteration(P, gamma=gamma)

print_state_value_function(
    optimal_V, P,
    n_cols=n_cols,
    prec=svf_prec,
    title='Optimal state-value function:')
print()
print_action_value_function(
    optimal_Q,
    None,
    action_symbols=action_symbols,
    prec=avf_prec,
    title='Optimal action-value function:')
print()
print_policy(optimal_pi, P, action_symbols=action_symbols, n_cols=n_cols)

# Roll out the optimal policy to get its success rate, return and regret.
success_rate_op, mean_return_op, mean_regret_op = get_policy_metrics(
    env,
    gamma=gamma,
    pi=optimal_pi,
    goal_state=goal_state,
    optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_op, mean_return_op, mean_regret_op))
# Run MC control once per seed and collect each run's estimates.
Q_mcs, V_mcs, Q_track_mcs = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    Q_mc, V_mc, pi_mc, Q_track_mc, pi_track_mc = mc_control(
        env, gamma=gamma, n_episodes=n_episodes)
    Q_mcs.append(Q_mc)
    V_mcs.append(V_mc)
    Q_track_mcs.append(Q_track_mc)

# Average the estimates over seeds; pi_mc and pi_track_mc stay those of the
# last seed.
Q_mc = np.mean(Q_mcs, axis=0)
V_mc = np.mean(V_mcs, axis=0)
Q_track_mc = np.mean(Q_track_mcs, axis=0)
del Q_mcs, V_mcs, Q_track_mcs

# Compare the learned values with the optimal ones.
for _values, _prec, _title in (
        (V_mc, svf_prec, 'State-value function found by FVMC:'),
        (optimal_V, svf_prec, 'Optimal state-value function:'),
        (V_mc - optimal_V, err_prec, 'State-value function errors:')):
    print_state_value_function(_values, P, n_cols=n_cols, prec=_prec, title=_title)
print('State-value function RMSE: {}'.format(rmse(V_mc, optimal_V)))
print()
print_action_value_function(
    Q_mc,
    optimal_Q,
    action_symbols=action_symbols,
    prec=avf_prec,
    title='FVMC action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_mc, optimal_Q)))
print()
print_policy(pi_mc, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_mc, mean_return_mc, mean_regret_mc = get_policy_metrics(
    env,
    gamma=gamma,
    pi=pi_mc,
    goal_state=goal_state,
    optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_mc, mean_return_mc, mean_regret_mc))
# Run SARSA once per seed and collect each run's estimates.
Q_sarsas, V_sarsas, Q_track_sarsas = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    Q_sarsa, V_sarsa, pi_sarsa, Q_track_sarsa, pi_track_sarsa = sarsa(
        env, gamma=gamma, n_episodes=n_episodes)
    Q_sarsas.append(Q_sarsa)
    V_sarsas.append(V_sarsa)
    Q_track_sarsas.append(Q_track_sarsa)

# Average the estimates over seeds; pi_sarsa and pi_track_sarsa stay those of
# the last seed.
Q_sarsa = np.mean(Q_sarsas, axis=0)
V_sarsa = np.mean(V_sarsas, axis=0)
Q_track_sarsa = np.mean(Q_track_sarsas, axis=0)
del Q_sarsas, V_sarsas, Q_track_sarsas

# Compare the learned values with the optimal ones.
for _values, _prec, _title in (
        (V_sarsa, svf_prec, 'State-value function found by Sarsa:'),
        (optimal_V, svf_prec, 'Optimal state-value function:'),
        (V_sarsa - optimal_V, err_prec, 'State-value function errors:')):
    print_state_value_function(_values, P, n_cols=n_cols, prec=_prec, title=_title)
print('State-value function RMSE: {}'.format(rmse(V_sarsa, optimal_V)))
print()
print_action_value_function(
    Q_sarsa,
    optimal_Q,
    action_symbols=action_symbols,
    prec=avf_prec,
    title='Sarsa action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_sarsa, optimal_Q)))
print()
print_policy(pi_sarsa, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_sarsa, mean_return_sarsa, mean_regret_sarsa = get_policy_metrics(
    env,
    gamma=gamma,
    pi=pi_sarsa,
    goal_state=goal_state,
    optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_sarsa, mean_return_sarsa, mean_regret_sarsa))
# Run Q-learning once per seed and collect each run's estimates.
Q_qls, V_qls, Q_track_qls = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    Q_ql, V_ql, pi_ql, Q_track_ql, pi_track_ql = q_learning(
        env, gamma=gamma, n_episodes=n_episodes)
    Q_qls.append(Q_ql)
    V_qls.append(V_ql)
    Q_track_qls.append(Q_track_ql)

# Average the estimates over seeds; pi_ql and pi_track_ql stay those of the
# last seed.
Q_ql = np.mean(Q_qls, axis=0)
V_ql = np.mean(V_qls, axis=0)
Q_track_ql = np.mean(Q_track_qls, axis=0)
del Q_qls, V_qls, Q_track_qls

# Compare the learned values with the optimal ones.
for _values, _prec, _title in (
        (V_ql, svf_prec, 'State-value function found by Q-learning:'),
        (optimal_V, svf_prec, 'Optimal state-value function:'),
        (V_ql - optimal_V, err_prec, 'State-value function errors:')):
    print_state_value_function(_values, P, n_cols=n_cols, prec=_prec, title=_title)
print('State-value function RMSE: {}'.format(rmse(V_ql, optimal_V)))
print()
print_action_value_function(
    Q_ql,
    optimal_Q,
    action_symbols=action_symbols,
    prec=avf_prec,
    title='Q-learning action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_ql, optimal_Q)))
print()
print_policy(pi_ql, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_ql, mean_return_ql, mean_regret_ql = get_policy_metrics(
    env,
    gamma=gamma,
    pi=pi_ql,
    goal_state=goal_state,
    optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_ql, mean_return_ql, mean_regret_ql))
# Run double Q-learning once per seed in SEEDS and average the resulting
# estimates element-wise across seeds.
Q_dqls, V_dqls, Q_track_dqls = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    # Seed Python's RNG, NumPy's RNG and the environment with the same value.
    random.seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    # pi_dql and pi_track_dql are overwritten on every pass, so after the
    # loop they hold the result of the last seed only (they are not averaged).
    Q_dql, V_dql, pi_dql, Q_track_dql, pi_track_dql = double_q_learning(
        env, gamma=gamma, n_episodes=n_episodes)
    Q_dqls.append(Q_dql)
    V_dqls.append(V_dql)
    Q_track_dqls.append(Q_track_dql)
# Mean over the seed axis (axis 0 of the stacked per-seed results).
Q_dql = np.mean(Q_dqls, axis=0)
V_dql = np.mean(V_dqls, axis=0)
Q_track_dql = np.mean(Q_track_dqls, axis=0)
# Release the per-seed lists now that they have been averaged.
del Q_dqls, V_dqls, Q_track_dqls
# Print the double Q-learning estimates next to the optimal solution: three
# state-value tables (estimate, optimum, difference), the RMSEs, the
# action-value table, the policy, and the policy metrics.
for _table, _precision, _heading in (
        (V_dql, svf_prec, 'State-value function found by Double Q-Learning:'),
        (optimal_V, svf_prec, 'Optimal state-value function:'),
        (V_dql - optimal_V, err_prec, 'State-value function errors:')):
    print_state_value_function(
        _table, P, n_cols=n_cols, prec=_precision, title=_heading)
_v_rmse = rmse(V_dql, optimal_V)
print('State-value function RMSE: {}'.format(_v_rmse))
print()

print_action_value_function(
    Q_dql, optimal_Q,
    action_symbols=action_symbols,
    prec=avf_prec,
    title='Double Q-Learning action-value function:')
_q_rmse = rmse(Q_dql, optimal_Q)
print('Action-value function RMSE: {}'.format(_q_rmse))
print()

print_policy(pi_dql, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_dql, mean_return_dql, mean_regret_dql = get_policy_metrics(
    env, gamma=gamma, pi=pi_dql, goal_state=goal_state, optimal_Q=optimal_Q)
print(
    'Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
        success_rate_dql, mean_return_dql, mean_regret_dql))
# Three views of the FVMC estimates (np.max over axis 2 of Q_track_mc):
# the full track against optimal_V, the same with log=True, and a close-up
# of the first cu_episodes entries with no true values passed.
for _title, _true_values, _stop, _items, _value, _log in (
        ('FVMC estimates through time vs. true values',
         optimal_V, None, limit_items, limit_value, False),
        ('FVMC estimates through time vs. true values (log scale)',
         optimal_V, None, limit_items, limit_value, True),
        ('FVMC estimates through time (close up)',
         None, cu_episodes, cu_limit_items, cu_limit_value, False)):
    # A stop of None keeps the whole track; otherwise only the leading slice.
    plot_value_function(
        _title,
        np.max(Q_track_mc, axis=2)[:_stop],
        _true_values,
        limit_items=_items,
        limit_value=_value,
        log=_log)
# Three views of the Sarsa estimates (np.max over axis 2 of Q_track_sarsa):
# the full track against optimal_V, the same with log=True, and a close-up
# of the first cu_episodes entries with no true values passed.
for _title, _true_values, _stop, _items, _value, _log in (
        ('Sarsa estimates through time vs. true values',
         optimal_V, None, limit_items, limit_value, False),
        ('Sarsa estimates through time vs. true values (log scale)',
         optimal_V, None, limit_items, limit_value, True),
        ('Sarsa estimates through time (close up)',
         None, cu_episodes, cu_limit_items, cu_limit_value, False)):
    # A stop of None keeps the whole track; otherwise only the leading slice.
    plot_value_function(
        _title,
        np.max(Q_track_sarsa, axis=2)[:_stop],
        _true_values,
        limit_items=_items,
        limit_value=_value,
        log=_log)
# Three views of the Q-learning estimates (np.max over axis 2 of Q_track_ql):
# the full track against optimal_V, the same with log=True, and a close-up
# of the first cu_episodes entries with no true values passed.
for _title, _true_values, _stop, _items, _value, _log in (
        ('Q-Learning estimates through time vs. true values',
         optimal_V, None, limit_items, limit_value, False),
        ('Q-Learning estimates through time vs. true values (log scale)',
         optimal_V, None, limit_items, limit_value, True),
        ('Q-Learning estimates through time (close up)',
         None, cu_episodes, cu_limit_items, cu_limit_value, False)):
    # A stop of None keeps the whole track; otherwise only the leading slice.
    plot_value_function(
        _title,
        np.max(Q_track_ql, axis=2)[:_stop],
        _true_values,
        limit_items=_items,
        limit_value=_value,
        log=_log)
# Three views of the double Q-learning estimates (np.max over axis 2 of
# Q_track_dql): the full track against optimal_V, the same with log=True,
# and a close-up of the first cu_episodes entries with no true values passed.
for _title, _true_values, _stop, _items, _value, _log in (
        ('Double Q-Learning estimates through time vs. true values',
         optimal_V, None, limit_items, limit_value, False),
        ('Double Q-Learning estimates through time vs. true values (log scale)',
         optimal_V, None, limit_items, limit_value, True),
        ('Double Q-Learning estimates through time (close up)',
         None, cu_episodes, cu_limit_items, cu_limit_value, False)):
    # A stop of None keeps the whole track; otherwise only the leading slice.
    plot_value_function(
        _title,
        np.max(Q_track_dql, axis=2)[:_stop],
        _true_values,
        limit_items=_items,
        limit_value=_value,
        log=_log)
def _track_metrics(pi_track):
    """Call get_metrics_from_tracks for one policy track with the shared arguments."""
    return get_metrics_from_tracks(env, gamma, goal_state, optimal_Q, pi_track)


# One (success rate, mean return, mean regret) triple per algorithm, computed
# from the policy track kept from that algorithm's training.
mc_success_rate_ma, mc_mean_return_ma, mc_mean_regret_ma = _track_metrics(pi_track_mc)
sarsa_success_rate_ma, sarsa_mean_return_ma, sarsa_mean_regret_ma = _track_metrics(pi_track_sarsa)
ql_success_rate_ma, ql_mean_return_ma, ql_mean_regret_ma = _track_metrics(pi_track_ql)
dql_success_rate_ma, dql_mean_return_ma, dql_mean_regret_ma = _track_metrics(pi_track_dql)
# Success-rate curves of the four algorithms, with a horizontal reference
# line at the optimal policy's success rate labelled just past the last index.
plt.axhline(y=success_rate_op, color='k', linestyle='-', linewidth=1)
plt.text(int(len(mc_success_rate_ma)*1.02), success_rate_op*1.01, 'π*')
for _curve, _line_style, _legend_name in (
        (mc_success_rate_ma, '-', 'FVMC'),
        (sarsa_success_rate_ma, '--', 'Sarsa'),
        (ql_success_rate_ma, ':', 'Q-Learning'),
        (dql_success_rate_ma, '-.', 'Double Q-Learning')):
    plt.plot(_curve, _line_style, linewidth=2, label=_legend_name)
plt.legend(loc=4, ncol=1)
plt.title('Policy success rate (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Success rate %')
plt.ylim(-1, 101)
plt.xticks(rotation=45)
plt.show()
# Mean-return curves of the four algorithms, with a horizontal reference
# line at the optimal policy's mean return labelled just past the last index.
plt.axhline(y=mean_return_op, color='k', linestyle='-', linewidth=1)
plt.text(int(len(mc_mean_return_ma)*1.02), mean_return_op*1.01, 'π*')
for _curve, _line_style, _legend_name in (
        (mc_mean_return_ma, '-', 'FVMC'),
        (sarsa_mean_return_ma, '--', 'Sarsa'),
        (ql_mean_return_ma, ':', 'Q-Learning'),
        (dql_mean_return_ma, '-.', 'Double Q-Learning')):
    plt.plot(_curve, _line_style, linewidth=2, label=_legend_name)
plt.legend(loc=4, ncol=1)
plt.title('Policy episode return (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Return (Gt:T)')
plt.xticks(rotation=45)
plt.show()
# Mean-regret curves of the four algorithms on one set of axes.
for _curve, _line_style, _legend_name in (
        (mc_mean_regret_ma, '-', 'FVMC'),
        (sarsa_mean_regret_ma, '--', 'Sarsa'),
        (ql_mean_regret_ma, ':', 'Q-Learning'),
        (dql_mean_regret_ma, '-.', 'Double Q-Learning')):
    plt.plot(_curve, _line_style, linewidth=2, label=_legend_name)
plt.legend(loc=1, ncol=1)
plt.title('Policy episode regret (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Regret (q* - Q)')
plt.xticks(rotation=45)
plt.show()
# Estimated value of init_state over time for each algorithm (max over
# axis 2 of the Q track, passed through moving_average), against a
# horizontal reference line at optimal_V[init_state].
plt.axhline(y=optimal_V[init_state], color='k', linestyle='-', linewidth=1)
plt.text(int(len(Q_track_mc)*1.05), optimal_V[init_state]+.01, 'v*({})'.format(init_state))
for _q_track, _line_style, _legend_name in (
        (Q_track_mc, '-', 'FVMC'),
        (Q_track_sarsa, '--', 'Sarsa'),
        (Q_track_ql, ':', 'Q-Learning'),
        (Q_track_dql, '-.', 'Double Q-Learning')):
    # Transposing puts the state index first, so [init_state] selects that
    # state's values along the track.
    _init_state_values = np.max(_q_track, axis=2).T[init_state]
    plt.plot(moving_average(_init_state_values),
             _line_style, linewidth=2, label=_legend_name)
plt.legend(loc=4, ncol=1)
plt.title('Estimated expected return (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Estimated value of initial state V({})'.format(init_state))
plt.xticks(rotation=45)
plt.show()
# Mean absolute error between each algorithm's state-value estimates (max
# over axis 2 of the Q track) and optimal_V, smoothed with moving_average.
for _q_track, _line_style, _legend_name in (
        (Q_track_mc, '-', 'FVMC'),
        (Q_track_sarsa, '--', 'Sarsa'),
        (Q_track_ql, ':', 'Q-Learning'),
        (Q_track_dql, '-.', 'Double Q-Learning')):
    _abs_error = np.abs(np.max(_q_track, axis=2) - optimal_V)
    plt.plot(moving_average(np.mean(_abs_error, axis=1)),
             _line_style, linewidth=2, label=_legend_name)
plt.legend(loc=1, ncol=1)
plt.title('State-value function estimation error (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Mean Absolute Error MAE(V, v*)')
plt.xticks(rotation=45)
plt.show()
# Mean absolute error between each algorithm's Q track and optimal_Q
# (averaged over axes 1 and 2), smoothed with moving_average.
for _q_track, _line_style, _legend_name in (
        (Q_track_mc, '-', 'FVMC'),
        (Q_track_sarsa, '--', 'Sarsa'),
        (Q_track_ql, ':', 'Q-Learning'),
        (Q_track_dql, '-.', 'Double Q-Learning')):
    _abs_error = np.abs(_q_track - optimal_Q)
    plt.plot(moving_average(np.mean(_abs_error, axis=(1, 2))),
             _line_style, linewidth=2, label=_legend_name)
plt.legend(loc=1, ncol=1)
plt.title('Action-value function estimation error (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Mean Absolute Error MAE(Q, q*)')
plt.xticks(rotation=45)
plt.show()
# Set up the FrozenLake experiment: the environment, the learning settings,
# and the display settings used by the printing and plotting code below.
env = gym.make('FrozenLake-v0')
init_state = env.reset()
P = env.env.P  # transition table read from the wrapped environment

goal_state = 15
gamma = 0.99
n_episodes = 10000

# Table layout and print precisions.
n_cols = 4
svf_prec = 4
err_prec = 2
avf_prec = 3
action_symbols = ('<', 'v', '>', '^')

# Arguments for the full-run plots ...
limit_items = 5
limit_value = 0.0
# ... and for the close-up plots.
cu_limit_items = 10
cu_limit_value = 0.01
cu_episodes = 2000
# Plot two decay_schedule curves over n_episodes, labelled as the alpha and
# epsilon schedules.
for _schedule_args, _line_style, _legend_name in (
        ((0.5, 0.01, 0.5), '-', 'Alpha schedule'),
        ((1.0, 0.1, 0.9), ':', 'Epsilon schedule')):
    _first, _second, _third = _schedule_args
    plt.plot(decay_schedule(_first, _second, _third, n_episodes),
             _line_style, linewidth=2,
             label=_legend_name)
plt.legend(loc=1, ncol=1)
plt.title('Alpha and epsilon schedules')
plt.xlabel('Episodes')
plt.ylabel('Hyperparameter values')
plt.xticks(rotation=45)
plt.show()
# Solve the MDP with value iteration, then print the optimal value
# functions, the optimal policy and that policy's metrics.
optimal_Q, optimal_V, optimal_pi = value_iteration(P, gamma=gamma)

print_state_value_function(
    optimal_V, P,
    n_cols=n_cols,
    prec=svf_prec,
    title='Optimal state-value function:')
print()
# None is passed where the learned-method reports pass optimal_Q.
print_action_value_function(
    optimal_Q, None,
    action_symbols=action_symbols,
    prec=avf_prec,
    title='Optimal action-value function:')
print()

print_policy(optimal_pi, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_op, mean_return_op, mean_regret_op = get_policy_metrics(
    env,
    gamma=gamma,
    pi=optimal_pi,
    goal_state=goal_state,
    optimal_Q=optimal_Q)
print(
    'Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
        success_rate_op, mean_return_op, mean_regret_op))
# Run MC control once per seed in SEEDS and average the resulting estimates
# element-wise across seeds.
Q_mcs, V_mcs, Q_track_mcs = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    # Seed Python's RNG, NumPy's RNG and the environment with the same value.
    random.seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    # pi_mc and pi_track_mc are overwritten on every pass, so after the loop
    # they hold the result of the last seed only (they are not averaged).
    Q_mc, V_mc, pi_mc, Q_track_mc, pi_track_mc = mc_control(
        env, gamma=gamma, n_episodes=n_episodes)
    Q_mcs.append(Q_mc)
    V_mcs.append(V_mc)
    Q_track_mcs.append(Q_track_mc)
# Mean over the seed axis (axis 0 of the stacked per-seed results).
Q_mc = np.mean(Q_mcs, axis=0)
V_mc = np.mean(V_mcs, axis=0)
Q_track_mc = np.mean(Q_track_mcs, axis=0)
# Release the per-seed lists now that they have been averaged.
del Q_mcs, V_mcs, Q_track_mcs
# Print the FVMC estimates next to the optimal solution: three state-value
# tables (estimate, optimum, difference), the RMSEs, the action-value table,
# the policy, and the policy metrics.
for _table, _precision, _heading in (
        (V_mc, svf_prec, 'State-value function found by FVMC:'),
        (optimal_V, svf_prec, 'Optimal state-value function:'),
        (V_mc - optimal_V, err_prec, 'State-value function errors:')):
    print_state_value_function(
        _table, P, n_cols=n_cols, prec=_precision, title=_heading)
_v_rmse = rmse(V_mc, optimal_V)
print('State-value function RMSE: {}'.format(_v_rmse))
print()

print_action_value_function(
    Q_mc, optimal_Q,
    action_symbols=action_symbols,
    prec=avf_prec,
    title='FVMC action-value function:')
_q_rmse = rmse(Q_mc, optimal_Q)
print('Action-value function RMSE: {}'.format(_q_rmse))
print()

print_policy(pi_mc, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_mc, mean_return_mc, mean_regret_mc = get_policy_metrics(
    env, gamma=gamma, pi=pi_mc, goal_state=goal_state, optimal_Q=optimal_Q)
print(
    'Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
        success_rate_mc, mean_return_mc, mean_regret_mc))
# Run Sarsa once per seed in SEEDS and average the resulting estimates
# element-wise across seeds.
Q_sarsas, V_sarsas, Q_track_sarsas = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    # Seed Python's RNG, NumPy's RNG and the environment with the same value.
    random.seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    # pi_sarsa and pi_track_sarsa are overwritten on every pass, so after the
    # loop they hold the result of the last seed only (they are not averaged).
    Q_sarsa, V_sarsa, pi_sarsa, Q_track_sarsa, pi_track_sarsa = sarsa(
        env, gamma=gamma, n_episodes=n_episodes)
    Q_sarsas.append(Q_sarsa)
    V_sarsas.append(V_sarsa)
    Q_track_sarsas.append(Q_track_sarsa)
# Mean over the seed axis (axis 0 of the stacked per-seed results).
Q_sarsa = np.mean(Q_sarsas, axis=0)
V_sarsa = np.mean(V_sarsas, axis=0)
Q_track_sarsa = np.mean(Q_track_sarsas, axis=0)
# Release the per-seed lists now that they have been averaged.
del Q_sarsas, V_sarsas, Q_track_sarsas
# Print the Sarsa estimates next to the optimal solution: three state-value
# tables (estimate, optimum, difference), the RMSEs, the action-value table,
# the policy, and the policy metrics.
for _table, _precision, _heading in (
        (V_sarsa, svf_prec, 'State-value function found by Sarsa:'),
        (optimal_V, svf_prec, 'Optimal state-value function:'),
        (V_sarsa - optimal_V, err_prec, 'State-value function errors:')):
    print_state_value_function(
        _table, P, n_cols=n_cols, prec=_precision, title=_heading)
_v_rmse = rmse(V_sarsa, optimal_V)
print('State-value function RMSE: {}'.format(_v_rmse))
print()

print_action_value_function(
    Q_sarsa, optimal_Q,
    action_symbols=action_symbols,
    prec=avf_prec,
    title='Sarsa action-value function:')
_q_rmse = rmse(Q_sarsa, optimal_Q)
print('Action-value function RMSE: {}'.format(_q_rmse))
print()

print_policy(pi_sarsa, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_sarsa, mean_return_sarsa, mean_regret_sarsa = get_policy_metrics(
    env, gamma=gamma, pi=pi_sarsa, goal_state=goal_state, optimal_Q=optimal_Q)
print(
    'Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
        success_rate_sarsa, mean_return_sarsa, mean_regret_sarsa))
# Run Q-learning once per seed in SEEDS and average the resulting estimates
# element-wise across seeds.
Q_qls, V_qls, Q_track_qls = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    # Seed Python's RNG, NumPy's RNG and the environment with the same value.
    random.seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    # pi_ql and pi_track_ql are overwritten on every pass, so after the loop
    # they hold the result of the last seed only (they are not averaged).
    Q_ql, V_ql, pi_ql, Q_track_ql, pi_track_ql = q_learning(
        env, gamma=gamma, n_episodes=n_episodes)
    Q_qls.append(Q_ql)
    V_qls.append(V_ql)
    Q_track_qls.append(Q_track_ql)
# Mean over the seed axis (axis 0 of the stacked per-seed results).
Q_ql = np.mean(Q_qls, axis=0)
V_ql = np.mean(V_qls, axis=0)
Q_track_ql = np.mean(Q_track_qls, axis=0)
# Release the per-seed lists now that they have been averaged.
del Q_qls, V_qls, Q_track_qls
# Print the Q-learning estimates next to the optimal solution: three
# state-value tables (estimate, optimum, difference), the RMSEs, the
# action-value table, the policy, and the policy metrics.
for _table, _precision, _heading in (
        (V_ql, svf_prec, 'State-value function found by Q-learning:'),
        (optimal_V, svf_prec, 'Optimal state-value function:'),
        (V_ql - optimal_V, err_prec, 'State-value function errors:')):
    print_state_value_function(
        _table, P, n_cols=n_cols, prec=_precision, title=_heading)
_v_rmse = rmse(V_ql, optimal_V)
print('State-value function RMSE: {}'.format(_v_rmse))
print()

print_action_value_function(
    Q_ql, optimal_Q,
    action_symbols=action_symbols,
    prec=avf_prec,
    title='Q-learning action-value function:')
_q_rmse = rmse(Q_ql, optimal_Q)
print('Action-value function RMSE: {}'.format(_q_rmse))
print()

print_policy(pi_ql, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_ql, mean_return_ql, mean_regret_ql = get_policy_metrics(
    env, gamma=gamma, pi=pi_ql, goal_state=goal_state, optimal_Q=optimal_Q)
print(
    'Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
        success_rate_ql, mean_return_ql, mean_regret_ql))
# Run double Q-learning once per seed in SEEDS and average the resulting
# estimates element-wise across seeds.
Q_dqls, V_dqls, Q_track_dqls = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    # Seed Python's RNG, NumPy's RNG and the environment with the same value.
    random.seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    # pi_dql and pi_track_dql are overwritten on every pass, so after the
    # loop they hold the result of the last seed only (they are not averaged).
    Q_dql, V_dql, pi_dql, Q_track_dql, pi_track_dql = double_q_learning(
        env, gamma=gamma, n_episodes=n_episodes)
    Q_dqls.append(Q_dql)
    V_dqls.append(V_dql)
    Q_track_dqls.append(Q_track_dql)
# Mean over the seed axis (axis 0 of the stacked per-seed results).
Q_dql = np.mean(Q_dqls, axis=0)
V_dql = np.mean(V_dqls, axis=0)
Q_track_dql = np.mean(Q_track_dqls, axis=0)
# Release the per-seed lists now that they have been averaged.
del Q_dqls, V_dqls, Q_track_dqls
# Print the double Q-learning estimates next to the optimal solution: three
# state-value tables (estimate, optimum, difference), the RMSEs, the
# action-value table, the policy, and the policy metrics.
for _table, _precision, _heading in (
        (V_dql, svf_prec, 'State-value function found by Double Q-Learning:'),
        (optimal_V, svf_prec, 'Optimal state-value function:'),
        (V_dql - optimal_V, err_prec, 'State-value function errors:')):
    print_state_value_function(
        _table, P, n_cols=n_cols, prec=_precision, title=_heading)
_v_rmse = rmse(V_dql, optimal_V)
print('State-value function RMSE: {}'.format(_v_rmse))
print()

print_action_value_function(
    Q_dql, optimal_Q,
    action_symbols=action_symbols,
    prec=avf_prec,
    title='Double Q-Learning action-value function:')
_q_rmse = rmse(Q_dql, optimal_Q)
print('Action-value function RMSE: {}'.format(_q_rmse))
print()

print_policy(pi_dql, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_dql, mean_return_dql, mean_regret_dql = get_policy_metrics(
    env, gamma=gamma, pi=pi_dql, goal_state=goal_state, optimal_Q=optimal_Q)
print(
    'Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
        success_rate_dql, mean_return_dql, mean_regret_dql))
# Three views of the FVMC estimates (np.max over axis 2 of Q_track_mc):
# the full track against optimal_V, the same with log=True, and a close-up
# of the first cu_episodes entries with no true values passed.
for _title, _true_values, _stop, _items, _value, _log in (
        ('FVMC estimates through time vs. true values',
         optimal_V, None, limit_items, limit_value, False),
        ('FVMC estimates through time vs. true values (log scale)',
         optimal_V, None, limit_items, limit_value, True),
        ('FVMC estimates through time (close up)',
         None, cu_episodes, cu_limit_items, cu_limit_value, False)):
    # A stop of None keeps the whole track; otherwise only the leading slice.
    plot_value_function(
        _title,
        np.max(Q_track_mc, axis=2)[:_stop],
        _true_values,
        limit_items=_items,
        limit_value=_value,
        log=_log)
# Three views of the Sarsa estimates (np.max over axis 2 of Q_track_sarsa):
# the full track against optimal_V, the same with log=True, and a close-up
# of the first cu_episodes entries with no true values passed.
for _title, _true_values, _stop, _items, _value, _log in (
        ('Sarsa estimates through time vs. true values',
         optimal_V, None, limit_items, limit_value, False),
        ('Sarsa estimates through time vs. true values (log scale)',
         optimal_V, None, limit_items, limit_value, True),
        ('Sarsa estimates through time (close up)',
         None, cu_episodes, cu_limit_items, cu_limit_value, False)):
    # A stop of None keeps the whole track; otherwise only the leading slice.
    plot_value_function(
        _title,
        np.max(Q_track_sarsa, axis=2)[:_stop],
        _true_values,
        limit_items=_items,
        limit_value=_value,
        log=_log)
# Three views of the Q-learning estimates (np.max over axis 2 of Q_track_ql):
# the full track against optimal_V, the same with log=True, and a close-up
# of the first cu_episodes entries with no true values passed.
for _title, _true_values, _stop, _items, _value, _log in (
        ('Q-Learning estimates through time vs. true values',
         optimal_V, None, limit_items, limit_value, False),
        ('Q-Learning estimates through time vs. true values (log scale)',
         optimal_V, None, limit_items, limit_value, True),
        ('Q-Learning estimates through time (close up)',
         None, cu_episodes, cu_limit_items, cu_limit_value, False)):
    # A stop of None keeps the whole track; otherwise only the leading slice.
    plot_value_function(
        _title,
        np.max(Q_track_ql, axis=2)[:_stop],
        _true_values,
        limit_items=_items,
        limit_value=_value,
        log=_log)
# Three views of the double Q-learning estimates (np.max over axis 2 of
# Q_track_dql): the full track against optimal_V, the same with log=True,
# and a close-up of the first cu_episodes entries with no true values passed.
for _title, _true_values, _stop, _items, _value, _log in (
        ('Double Q-Learning estimates through time vs. true values',
         optimal_V, None, limit_items, limit_value, False),
        ('Double Q-Learning estimates through time vs. true values (log scale)',
         optimal_V, None, limit_items, limit_value, True),
        ('Double Q-Learning estimates through time (close up)',
         None, cu_episodes, cu_limit_items, cu_limit_value, False)):
    # A stop of None keeps the whole track; otherwise only the leading slice.
    plot_value_function(
        _title,
        np.max(Q_track_dql, axis=2)[:_stop],
        _true_values,
        limit_items=_items,
        limit_value=_value,
        log=_log)
def _track_metrics(pi_track):
    """Call get_metrics_from_tracks for one policy track with the shared arguments."""
    return get_metrics_from_tracks(env, gamma, goal_state, optimal_Q, pi_track)


# One (success rate, mean return, mean regret) triple per algorithm, computed
# from the policy track kept from that algorithm's training.
mc_success_rate_ma, mc_mean_return_ma, mc_mean_regret_ma = _track_metrics(pi_track_mc)
sarsa_success_rate_ma, sarsa_mean_return_ma, sarsa_mean_regret_ma = _track_metrics(pi_track_sarsa)
ql_success_rate_ma, ql_mean_return_ma, ql_mean_regret_ma = _track_metrics(pi_track_ql)
dql_success_rate_ma, dql_mean_return_ma, dql_mean_regret_ma = _track_metrics(pi_track_dql)
# Success-rate curves of the four algorithms, with a horizontal reference
# line at the optimal policy's success rate labelled just past the last index.
plt.axhline(y=success_rate_op, color='k', linestyle='-', linewidth=1)
plt.text(int(len(mc_success_rate_ma)*1.02), success_rate_op*1.01, 'π*')
for _curve, _line_style, _legend_name in (
        (mc_success_rate_ma, '-', 'FVMC'),
        (sarsa_success_rate_ma, '--', 'Sarsa'),
        (ql_success_rate_ma, ':', 'Q-Learning'),
        (dql_success_rate_ma, '-.', 'Double Q-Learning')):
    plt.plot(_curve, _line_style, linewidth=2, label=_legend_name)
plt.legend(loc=4, ncol=1)
plt.title('Policy success rate (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Success rate %')
plt.ylim(-1, 101)
plt.xticks(rotation=45)
plt.show()
# Mean-return curves of the four algorithms, with a horizontal reference
# line at the optimal policy's mean return labelled just past the last index.
plt.axhline(y=mean_return_op, color='k', linestyle='-', linewidth=1)
plt.text(int(len(mc_mean_return_ma)*1.02), mean_return_op*1.01, 'π*')
for _curve, _line_style, _legend_name in (
        (mc_mean_return_ma, '-', 'FVMC'),
        (sarsa_mean_return_ma, '--', 'Sarsa'),
        (ql_mean_return_ma, ':', 'Q-Learning'),
        (dql_mean_return_ma, '-.', 'Double Q-Learning')):
    plt.plot(_curve, _line_style, linewidth=2, label=_legend_name)
plt.legend(loc=4, ncol=1)
plt.title('Policy episode return (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Return (Gt:T)')
plt.xticks(rotation=45)
plt.show()
# Mean-regret curves of the four algorithms on one set of axes.
for _curve, _line_style, _legend_name in (
        (mc_mean_regret_ma, '-', 'FVMC'),
        (sarsa_mean_regret_ma, '--', 'Sarsa'),
        (ql_mean_regret_ma, ':', 'Q-Learning'),
        (dql_mean_regret_ma, '-.', 'Double Q-Learning')):
    plt.plot(_curve, _line_style, linewidth=2, label=_legend_name)
plt.legend(loc=1, ncol=1)
plt.title('Policy episode regret (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Regret (q* - Q)')
plt.xticks(rotation=45)
plt.show()
# Estimated value of init_state over time for each algorithm (max over
# axis 2 of the Q track, passed through moving_average), against a
# horizontal reference line at optimal_V[init_state].
plt.axhline(y=optimal_V[init_state], color='k', linestyle='-', linewidth=1)
plt.text(int(len(Q_track_mc)*1.05), optimal_V[init_state]+.01, 'v*({})'.format(init_state))
for _q_track, _line_style, _legend_name in (
        (Q_track_mc, '-', 'FVMC'),
        (Q_track_sarsa, '--', 'Sarsa'),
        (Q_track_ql, ':', 'Q-Learning'),
        (Q_track_dql, '-.', 'Double Q-Learning')):
    # Transposing puts the state index first, so [init_state] selects that
    # state's values along the track.
    _init_state_values = np.max(_q_track, axis=2).T[init_state]
    plt.plot(moving_average(_init_state_values),
             _line_style, linewidth=2, label=_legend_name)
plt.legend(loc=4, ncol=1)
plt.title('Estimated expected return (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Estimated value of initial state V({})'.format(init_state))
plt.xticks(rotation=45)
plt.show()
# Mean absolute error between each algorithm's state-value estimates (max
# over axis 2 of the Q track) and optimal_V, smoothed with moving_average.
for _q_track, _line_style, _legend_name in (
        (Q_track_mc, '-', 'FVMC'),
        (Q_track_sarsa, '--', 'Sarsa'),
        (Q_track_ql, ':', 'Q-Learning'),
        (Q_track_dql, '-.', 'Double Q-Learning')):
    _abs_error = np.abs(np.max(_q_track, axis=2) - optimal_V)
    plt.plot(moving_average(np.mean(_abs_error, axis=1)),
             _line_style, linewidth=2, label=_legend_name)
plt.legend(loc=1, ncol=1)
plt.title('State-value function estimation error (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Mean Absolute Error MAE(V, v*)')
plt.xticks(rotation=45)
plt.show()
# Mean absolute error between each algorithm's Q track and optimal_Q
# (averaged over axes 1 and 2), smoothed with moving_average.
for _q_track, _line_style, _legend_name in (
        (Q_track_mc, '-', 'FVMC'),
        (Q_track_sarsa, '--', 'Sarsa'),
        (Q_track_ql, ':', 'Q-Learning'),
        (Q_track_dql, '-.', 'Double Q-Learning')):
    _abs_error = np.abs(_q_track - optimal_Q)
    plt.plot(moving_average(np.mean(_abs_error, axis=(1, 2))),
             _line_style, linewidth=2, label=_legend_name)
plt.legend(loc=1, ncol=1)
plt.title('Action-value function estimation error (ma 100)')
plt.xlabel('Episodes')
plt.ylabel('Mean Absolute Error MAE(Q, q*)')
plt.xticks(rotation=45)
plt.show()