+ # Generate iteration after iteration
+
+ # Build a per-code bias vector with one entry per code (length self.nb_codes),
+ # initialised to zero. It is allocated via result.new_zeros, so (assuming
+ # result is a torch.Tensor -- TODO confirm) it takes result's dtype; the
+ # device is also passed explicitly.
+ optimistic_bias = result.new_zeros(self.nb_codes, device=result.device)
+ # Only two entries are non-zero: the code that escape.lookahead_reward2code
+ # returns for reward -1 gets -ln(10), and the code for reward +1 gets +ln(10)
+ # (math.log(1e1) is the natural log of 10).
+ # NOTE(review): presumably this is added to logits during sampling, which
+ # would make the +1 reward code 10x more likely and the -1 reward code 10x
+ # less likely before normalisation -- confirm at the use site.
+ optimistic_bias[escape.lookahead_reward2code(-1)] = -math.log(1e1)
+ optimistic_bias[escape.lookahead_reward2code(1)] = math.log(1e1)
+
+ for u in tqdm.tqdm(
+ range(it_len, result.size(1) - it_len + 1, it_len), desc="thinking"