projects
/
picoclvr.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
Update.
[picoclvr.git]
/
tasks.py
diff --git
a/tasks.py
b/tasks.py
index
6b6b8f2
..
29f1e5a
100755
(executable)
--- a/
tasks.py
+++ b/
tasks.py
@@
-1944,13
+1944,14
@@
class Escape(Task):
# Generate iteration after iteration
optimistic_bias = result.new_zeros(self.nb_codes, device=result.device)
# Generate iteration after iteration
optimistic_bias = result.new_zeros(self.nb_codes, device=result.device)
- optimistic_bias[
(-1) + escape.first_lookahead_rewards_code + 1] = math.log(1e-
1)
- optimistic_bias[
(1) + escape.first_lookahead_rewards_code + 1
] = math.log(1e1)
+ optimistic_bias[
escape.lookahead_reward2code(-1)] = -math.log(1e
1)
+ optimistic_bias[
escape.lookahead_reward2code(1)
] = math.log(1e1)
for u in tqdm.tqdm(
range(it_len, result.size(1) - it_len + 1, it_len), desc="thinking"
):
for u in tqdm.tqdm(
range(it_len, result.size(1) - it_len + 1, it_len), desc="thinking"
):
- # Generate the lookahead_reward pessimistically
+ # Re-generate the lookahead_reward pessimistically in the
+ # previous iterations
ar_mask = (t < u).long() * (t % it_len == index_lookahead_reward).long()
ar(result, ar_mask, logit_biases=-optimistic_bias)
ar_mask = (t < u).long() * (t % it_len == index_lookahead_reward).long()
ar(result, ar_mask, logit_biases=-optimistic_bias)
@@
-1958,7
+1959,8
@@
class Escape(Task):
ar_mask = (t >= u).long() * (t < u + state_len).long()
ar(result, ar_mask)
ar_mask = (t >= u).long() * (t < u + state_len).long()
ar(result, ar_mask)
- # Generate the lookahead_reward optimistically
+ # Re-generate the lookahead_reward optimistically in the
+ # previous iterations
ar_mask = (t < u).long() * (t % it_len == index_lookahead_reward).long()
ar(result, ar_mask, logit_biases=optimistic_bias)
ar_mask = (t < u).long() * (t % it_len == index_lookahead_reward).long()
ar(result, ar_mask, logit_biases=optimistic_bias)