+ # Generate iteration after iteration
+
+ optimistic_bias = result.new_zeros(self.nb_codes, device=result.device)
+ optimistic_bias[(-1) + escape.first_lookahead_rewards_code + 1] = math.log(1e-1)
+ optimistic_bias[(1) + escape.first_lookahead_rewards_code + 1] = math.log(1e1)
+
+ for u in tqdm.tqdm(
+ range(it_len, result.size(1) - it_len + 1, it_len), desc="thinking"