nb_states_codes = 5
nb_actions_codes = 5
nb_rewards_codes = 3
-nb_lookahead_rewards_codes = 3
+nb_lookahead_rewards_codes = 4 # one code each for lookahead rewards -1, 0 and +1, plus one for UNKNOWN
first_states_code = 0
first_actions_code = first_states_code + nb_states_codes
def lookahead_reward2code(r):
+ # r is -1, 0 or +1, or 2 for UNKNOWN; adding 1 maps it to an offset 0..3 from first_lookahead_rewards_code
return r + 1 + first_lookahead_rewards_code
######################################################################
-def generate_episodes(nb, height=6, width=6, T=10, nb_walls=3, nb_coins=3):
+def generate_episodes(nb, height=6, width=6, T=10, nb_walls=3, nb_coins=2):
rnd = torch.rand(nb, height, width)
rnd[:, 0, :] = 0
rnd[:, -1, :] = 0
t >= first_lookahead_rewards_code
and t < first_lookahead_rewards_code + nb_lookahead_rewards_codes
):
- return "n.p"[t - first_lookahead_rewards_code]
+ return "n.pU"[t - first_lookahead_rewards_code]
else:
return "?"