- # Generate the lookahead_rewards up to the current iteration
- ar_mask = (t % self.it_len == self.index_lookahead_reward).long() * (
-     t <= u + self.index_lookahead_reward
- ).long()
- ar(result, ar_mask)
-
- # Generate the state
- ar_mask = (t >= u + self.index_states).long() * (
-     t < u + self.index_states + self.state_len
- ).long()
- ar(result, ar_mask)
-
- # Re-generate the lookahead_reward
- ar_mask = (t % self.it_len == self.index_lookahead_reward).long() * (
-     t <= u + self.index_lookahead_reward
- ).long()
- ar(result, ar_mask, logit_biases=optimistic_bias)
-
- # Generate the action and reward
+ # Generate the next state but keep the initial one; the
+ # lookahead_rewards of the previous iterations are set to
+ # UNKNOWN
+ if u > 0:
+     result[
+         :, u + self.index_lookahead_reward
+     ] = escape.lookahead_reward2code(2)
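+ # the product of the two 0/1 masks below is a logical AND: it
+ # selects exactly the state tokens of the current iteration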
+ ar_mask = (t >= u + self.index_states).long() * (
+     t < u + self.index_states + self.state_len
+ ).long()
+ ar(result, ar_mask)
+
+ # Generate the action and reward with the lookahead_reward set to +1
+ result[:, u + self.index_lookahead_reward] = escape.lookahead_reward2code(1)
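
For readers unfamiliar with the masked-generation pattern used throughout this hunk, here is a minimal, self-contained sketch of how a 0/1 `ar_mask` drives partial regeneration. It is an illustration only: `it_len`, `state_len`, the index constants, and the toy `ar` function are assumptions standing in for the project's actual sampler, which resamples each masked position from the model's logits conditioned on the prefix.

```python
import torch

# Toy sizes, for illustration only (assumptions, not the project's values).
it_len, num_iterations = 8, 4
seq_len = it_len * num_iterations
index_lookahead_reward, index_states, state_len = 0, 1, 5

t = torch.arange(seq_len)[None, :]  # position indices, shape (1, seq_len)
u = 2 * it_len                      # offset of the current iteration

# The elementwise product of two 0/1 tensors is their logical AND: the
# mask is 1 exactly on the state tokens of the current iteration.
ar_mask = (t >= u + index_states).long() * (
    t < u + index_states + state_len
).long()

def ar(result, ar_mask):
    # Sketch of masked autoregressive generation: scan left to right and
    # resample only positions where ar_mask is 1, leaving the rest (the
    # conditioning prefix) untouched. A real implementation would sample
    # from model logits instead of torch.randint.
    for i in range(result.size(1)):
        if ar_mask[0, i].item() == 1:
            result[:, i] = torch.randint(10, (result.size(0),))
    return result

result = torch.zeros(1, seq_len, dtype=torch.long)
ar(result, ar_mask)
print(ar_mask[0, u : u + it_len])  # tensor([0, 1, 1, 1, 1, 1, 0, 0])
```

Multiplying the two `.long()` comparisons is equivalent to a logical AND kept in integer form, which is why the mask can be consumed directly by the sampler.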