# Written by Francois Fleuret <francois@fleuret.org>
-import math, os, tqdm
+import math, os, tqdm, warnings
import torch, torchvision
result[:, it_len:] = -1
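+ # Snapshots of the generation process, appended to by ar() after each pass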
+ snapshots = []
+
def ar(result, ar_mask, logit_biases=None):
ar_mask = ar_mask.expand_as(result)
result *= 1 - ar_mask
device=self.device,
progress_bar_desc=None,
)
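+ # Keep a copy of the first 10 sequences after every generation pass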
+ warnings.warn("keeping thinking snapshots", RuntimeWarning)
+ snapshots.append(result[:10].detach().clone())
# Generate iteration after iteration
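+ # Logit biases of -log(10) / +log(10): sampling a lookahead_reward of -1
+ # becomes ten times less likely, +1 ten times more likely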
optimistic_bias[escape.lookahead_reward2code(-1)] = -math.log(1e1)
optimistic_bias[escape.lookahead_reward2code(1)] = math.log(1e1)
- snapshots = []
-
for u in tqdm.tqdm(
range(it_len, result.size(1) - it_len + 1, it_len), desc="thinking"
):
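+ # Decode the prefix generated so far; lr presumably holds the per-iteration
+ # lookahead rewards (the other episode components are discarded)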
+ lr, _, _, _ = escape.seq2episodes(result[:, :u], self.height, self.width)
+
- # Generate the lookahead_reward and state
+ # Generate the lookahead_rewards of every iteration up to the current one
- ar_mask = (t >= u + index_lookahead_reward).long() * (
+ ar_mask = (t % it_len == index_lookahead_reward).long() * (
+ t <= u + index_lookahead_reward
+ ).long()
+ ar(result, ar_mask)
+
+ # Generate the state of the current iteration
+ ar_mask = (t >= u + index_states).long() * (
t < u + index_states + state_len
).long()
ar(result, ar_mask)
- snapshots.append(result[:10].detach().clone())
- backup_lookahead_reward = result[:, u + index_lookahead_reward]
- # Re-generate the lookahead_reward
+ # Re-generate the lookahead_rewards, this time with the optimistic bias
- ar_mask = (t == u + index_lookahead_reward).long()
+ ar_mask = (t % it_len == index_lookahead_reward).long() * (
+ t <= u + index_lookahead_reward
+ ).long()
ar(result, ar_mask, logit_biases=optimistic_bias)
- snapshots.append(result[:10].detach().clone())
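+ # The action is now sampled conditioned on the optimistically re-drawn
+ # lookahead rewards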
# Generate the action and reward
ar_mask = (t >= u + index_action).long() * (t <= u + index_reward).long()
ar(result, ar_mask)
- snapshots.append(result[:10].detach().clone())
-
- result[:, u + index_lookahead_reward] = backup_lookahead_reward
filename = os.path.join(result_dir, f"test_thinking_compute_{n_epoch:04d}.txt")
with open(filename, "w") as f: