X-Git-Url: https://fleuret.org/cgi-bin/gitweb/gitweb.cgi?a=blobdiff_plain;f=tasks.py;h=6b6b8f2a5ed055c3f80473f70fa7b0ac87f6a526;hb=621231cc5bb94f983c556a1b450b66067bec4165;hp=51538366383be455f160ab6158392627ecea5190;hpb=19ec7f3e4030ddece2647983dcf1bed5eb0d9544;p=picoclvr.git

diff --git a/tasks.py b/tasks.py
index 5153836..6b6b8f2 100755
--- a/tasks.py
+++ b/tasks.py
@@ -27,6 +27,7 @@ def masked_inplace_autoregression(
     ar_mask,
     deterministic_synthesis,
     forbidden_tokens=None,
+    logit_biases=None,
     progress_bar_desc="autoregression",
     device=torch.device("cpu"),
 ):
@@ -48,7 +49,11 @@ def masked_inplace_autoregression(
 
         for input, ar_mask in batches:
             model.masked_inplace_autoregression(
-                input, ar_mask, forbidden_tokens, deterministic_synthesis
+                input,
+                ar_mask,
+                deterministic_synthesis,
+                forbidden_tokens,
+                logit_biases,
             )
 
         model.train(t)
@@ -1917,9 +1922,12 @@ class Escape(Task):
         t = torch.arange(result.size(1), device=result.device)[None, :]
 
         state_len = self.height * self.width
+        index_action = state_len
+        index_reward = state_len + 1
+        index_lookahead_reward = state_len + 2
         it_len = state_len + 3  # state / action / reward / lookahead_reward
 
-        def ar(result, ar_mask):
+        def ar(result, ar_mask, logit_biases=None):
             ar_mask = ar_mask.expand_as(result)
             result *= 1 - ar_mask
             masked_inplace_autoregression(
@@ -1927,47 +1935,36 @@ class Escape(Task):
                 self.batch_size,
                 result,
                 ar_mask,
-                deterministic_synthesis,
+                deterministic_synthesis=deterministic_synthesis,
+                logit_biases=logit_biases,
                 device=self.device,
                 progress_bar_desc=None,
             )
 
         # Generate iteration after iteration
 
+        optimistic_bias = torch.zeros(self.nb_codes, device=result.device)  # float logit biases; result.new_zeros() would inherit result's dtype and truncate the logs if it is integer
+        optimistic_bias[(-1) + escape.first_lookahead_rewards_code + 1] = math.log(1e-1)  # code of lookahead reward -1: logit lowered by log(10)
+        optimistic_bias[(1) + escape.first_lookahead_rewards_code + 1] = math.log(1e1)  # code of lookahead reward +1: logit raised by log(10)
+
         for u in tqdm.tqdm(
             range(it_len, result.size(1) - it_len + 1, it_len), desc="thinking"
         ):
-            # Put the lookahead reward to either 0 or -1 for the
-            # current iteration, with a proba that depends with the
-            # sequence index, so that we have diverse examples, sample
-            # the next state
-            s = -(
-                torch.rand(result.size(0), device=result.device)
-                <= torch.linspace(0, 1, result.size(0), device=result.device)
-            ).long()
-            result[:, u - 1] = s + 1 + escape.first_lookahead_rewards_code
+            # Generate the lookahead_reward pessimistically
+            ar_mask = (t < u).long() * (t % it_len == index_lookahead_reward).long()
+            ar(result, ar_mask, logit_biases=-optimistic_bias)
+
+            # Generate the state
             ar_mask = (t >= u).long() * (t < u + state_len).long()
             ar(result, ar_mask)
 
-            # Put the lookahead reward to +1 for the current
-            # iteration, sample the action and reward
-            s = 1
-            result[:, u - 1] = s + 1 + escape.first_lookahead_rewards_code
-            ar_mask = (t >= u + state_len).long() * (t < u + state_len + 2).long()
-            ar(result, ar_mask)
+            # Generate the lookahead_reward optimistically
+            ar_mask = (t < u).long() * (t % it_len == index_lookahead_reward).long()
+            ar(result, ar_mask, logit_biases=optimistic_bias)
 
-            # Fix the previous lookahead rewards in a consistant state
-            for v in range(0, u, it_len):
-                # Extract the rewards
-                r = result[:, range(v + state_len + 1 + it_len, u + it_len - 1, it_len)]
-                r = r - escape.first_rewards_code - 1
-                r = r.clamp(min=-1, max=1)  # the reward is predicted hence can be weird
-                a = r.min(dim=1).values
-                b = r.max(dim=1).values
-                s = (a < 0).long() * a + (a >= 0).long() * b
-                result[:, v + state_len + 2] = (
-                    s + 1 + escape.first_lookahead_rewards_code
-                )
+            # Generate the action and reward
+            ar_mask = (t >= u + index_action).long() * (t <= u + index_reward).long()
+            ar(result, ar_mask)
 
         # Saving the generated sequences