- # Generate the lookahead_rewards up to the current iteration
- ar_mask = (t % self.it_len == self.index_lookahead_reward).long() * (
-     t <= u + self.index_lookahead_reward
- ).long()
- ar(result, ar_mask)
-
- # Generate the state
- ar_mask = (t >= u + self.index_states).long() * (
-     t < u + self.index_states + self.state_len
- ).long()
- ar(result, ar_mask)
-
- # Re-generate the lookahead_reward
- ar_mask = (t % self.it_len == self.index_lookahead_reward).long() * (
-     t <= u + self.index_lookahead_reward
- ).long()
- ar(result, ar_mask, logit_biases=optimistic_bias)
-
- # Generate the action and reward
+ # Generate the next state but keep the initial one; the
+ # lookahead_rewards of the previous iterations are set to
+ # UNKNOWN
+ if u > 0:
+     result[
+         :, u + self.index_lookahead_reward
+     ] = escape.lookahead_reward2code(2)
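+ # the product of the two 0/1 masks below is a logical AND: it
+ # selects exactly the state tokens of the current iteration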
+ ar_mask = (t >= u + self.index_states).long() * (
+     t < u + self.index_states + self.state_len
+ ).long()
+ ar(result, ar_mask)
+
+ # Generate the action and reward with the lookahead_reward set to +1
+ result[:, u + self.index_lookahead_reward] = escape.lookahead_reward2code(1)
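
For readers unfamiliar with the masked-generation pattern used throughout this hunk, here is a minimal, self-contained sketch of how a 0/1 `ar_mask` drives partial regeneration. It is an illustration only: `it_len`, `state_len`, the index constants, and the toy `ar` function are assumptions standing in for the project's actual sampler, which resamples each masked position from the model's logits conditioned on the prefix.

```python
import torch

# Toy sizes, for illustration only (assumptions, not the project's values).
it_len, num_iterations = 8, 4
seq_len = it_len * num_iterations
index_lookahead_reward, index_states, state_len = 0, 1, 5

t = torch.arange(seq_len)[None, :]  # position indices, shape (1, seq_len)
u = 2 * it_len                      # offset of the current iteration

# The elementwise product of two 0/1 tensors is their logical AND: the
# mask is 1 exactly on the state tokens of the current iteration.
ar_mask = (t >= u + index_states).long() * (
    t < u + index_states + state_len
).long()

def ar(result, ar_mask):
    # Sketch of masked autoregressive generation: scan left to right and
    # resample only positions where ar_mask is 1, leaving the rest (the
    # conditioning prefix) untouched. A real implementation would sample
    # from model logits instead of torch.randint.
    for i in range(result.size(1)):
        if ar_mask[0, i].item() == 1:
            result[:, i] = torch.randint(10, (result.size(0),))
    return result

result = torch.zeros(1, seq_len, dtype=torch.long)
ar(result, ar_mask)
print(ar_mask[0, u : u + it_len])  # tensor([0, 1, 1, 1, 1, 1, 0, 0])
```

Multiplying the two `.long()` comparisons is equivalent to a logical AND kept in integer form, which is why the mask can be consumed directly by the sampler.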