- lr, _, _, _ = escape.seq2episodes(result[:, :u], self.height, self.width)
-
- # Generate the lookahead_reward and state
- ar_mask = (t % it_len == index_lookahead_reward).long() * (
- t <= u + index_lookahead_reward
+ # Generate the next state but keep the initial one; the
+ # lookahead_reward values of previous iterations are set to
+ # UNKNOWN.
+ if u > 0:
+ result[
+ :, u + self.index_lookahead_reward
+ ] = greed.lookahead_reward2code(2)
+ ar_mask = (t >= u + self.index_states).long() * (
+ t < u + self.index_states + self.state_len
+ ).long()
+ ar(result, ar_mask)
+
+ # Generate the action and reward with lookahead_reward set to +1
+ result[:, u + self.index_lookahead_reward] = greed.lookahead_reward2code(1)
+ ar_mask = (t >= u + self.index_action).long() * (
+ t <= u + self.index_reward