From: François Fleuret <francois@fleuret.org>
Date: Sat, 31 Aug 2024 07:50:05 +0000 (+0200)
Subject: Update.
X-Git-Url: https://fleuret.org/cgi-bin/gitweb/gitweb.cgi?a=commitdiff_plain;ds=sidebyside;h=refs%2Fheads%2Fmaster;hp=408f2335af43590ee2d99c3286cbe3762c76887a;p=mygptrnn.git

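Update.

Rework add_memex_v3 in main.py: the memex segment is now a eighth of the
sequence length, every yielded batch is a single tensor extended by that
length, and the memex variant is selected per sample with probability
memex_proba.  Only wrap the training batches with add_memex_v3 when
args.memex_proba > 0, and only write to the lambda file when memex_mask
is not None.  ProblemMemory.seq2str prints "?" for codes outside
token_string.  The timing loop in pscan.py is commented out.  nb_codes in
tasks.py is converted with .item().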
---

diff --git a/main.py b/main.py
index 00b8301..88f56b3 100755
--- a/main.py
+++ b/main.py
@@ -598,43 +598,58 @@ def add_memex_v2(batches, memex_proba, marker_token):
 
 def add_memex_v3(batches, memex_proba, marker_token):
     for input in batches:
-        if torch.rand(1).item() < memex_proba:
-            memex_len = input.size(1) // 4
-
-            t = torch.arange(input.size(1) + memex_len, device=input.device)[
-                None, :
-            ].expand(input.size(0), -1)
-
-            # Call me the tensor-spaghetti master
-
-            trigger = torch.rand(t.size(), device=t.device)
-            trigger[:, -memex_len:] = 1.0
-            trigger = (trigger.sort(dim=1).indices == 0).long()
-            memex_mask = trigger.clone()
-            memex_mask[:, memex_len:] -= memex_mask[:, :-memex_len]
-            memex_mask = memex_mask.cumsum(dim=1)
-            u = 1 - memex_mask
-            u[:, 0] = 0
-            u = u.cumsum(dim=1)
-            # assert u.min() == 0
-            # assert u.max() == input.size(1) - 1
-            v = (
-                (trigger.cumsum(dim=1) - trigger).cumsum(dim=1)
-                + torch.randint(input.size(1), (input.size(0), 1), device=t.device)
-            ) * memex_mask
-            u = u * (1 - memex_mask) + v * memex_mask
-            n = torch.arange(input.size(0), device=input.device)[:, None].expand(
-                -1, t.size(1)
+        memex_len = input.size(1) // 8  # width of the inserted memex segment
+
+        t = torch.arange(input.size(1) + memex_len, device=input.device)[
+            None, :
+        ].expand(input.size(0), -1)
+        n = torch.arange(input.size(0), device=input.device)[:, None].expand(
+            -1, t.size(1)
+        )  # row index of every output position, paired with u in input[n, u]
+
+        t = (t - 1).clamp(min=0)
+
+        # Call me the tensor-spaghetti master
+
+        trigger = torch.rand(t.size(), device=t.device)
+        trigger[:, -memex_len:] = 2.0  # rand() < 1, so 2.0 is never the row minimum
+        trigger[:, : memex_len + 1] = 2.0  # same exclusion for the first columns
+        trigger = (trigger == trigger.min(dim=1, keepdim=True).values).long()
+        memex_mask = trigger.clone()
+        memex_mask[:, memex_len:] -= trigger[:, :-memex_len]
+        memex_mask = memex_mask.cumsum(dim=1)  # 1 on the memex_len columns from the trigger
+
+        u = 1 - memex_mask
+        u[:, 0] = 0
+        u = u.cumsum(dim=1)  # source column in input for positions outside the mask
+
+        v = (
+            (trigger.cumsum(dim=1) - trigger).cumsum(dim=1)
+            + torch.randint(
+                input.size(1) - memex_len, (input.size(0), 1), device=t.device
             )
-            new_input = input[n, u]
-            limits = trigger.clone()
-            limits[:, memex_len - 1 :] += limits[:, : -(memex_len - 1)]
-            new_input = new_input * (1 - limits) + memex_marker * limits
+        ) * memex_mask  # inside the mask: consecutive columns from a random start
+        u = u * (1 - memex_mask) + v * memex_mask
+
+        new_input = input[n, u]
+        limits = trigger.clone()
+        limits[:, memex_len - 1 :] += trigger[:, : -(memex_len - 1)]  # read trigger, not limits: no self-overlapping in-place add
+        new_input = new_input * (1 - limits) + marker_token * limits
+        new_input[:, 0] = marker_token
+
+        orig = torch.cat(
+            [
+                input,
+                torch.full((input.size(0), memex_len), marker_token, device=t.device),
+            ],
+            dim=1,
+        )  # unmodified input padded with marker_token to the same width
 
-            yield new_input, memex_mask
+        a = (torch.rand(input.size(0), 1, device=t.device) <= memex_proba).long()  # per-row 0/1 switch
 
-        else:
-            yield input
+        new_input = (1 - a) * orig + a * new_input
+
+        yield new_input  # memex_mask
 
 
 ######################################################################
@@ -1054,12 +1069,15 @@ for n_epoch in range(nb_epochs_finished, nb_epochs):
 
     log_string(f"memex_proba {memex_proba}")
 
-    warnings.warn("memex v3", RuntimeWarning)
-    train_batches = add_memex_v3(
-        batches=task.batches(split="train"),
-        memex_proba=memex_proba,
-        marker_token=memex_marker,
-    )
+    if args.memex_proba > 0:
+        warnings.warn("memex v3", RuntimeWarning)
+        train_batches = add_memex_v3(
+            batches=task.batches(split="train"),
+            memex_proba=memex_proba,
+            marker_token=memex_marker,
+        )
+    else:
+        train_batches = task.batches(split="train")
 
     def add_none(it):
         for x in it:
@@ -1126,9 +1144,10 @@ for n_epoch in range(nb_epochs_finished, nb_epochs):
             optimizer.step()
             grad_norm = sum([p.grad.pow(2).sum() for p in model.parameters()]).sqrt()
             loss_file.write(f"{n_epoch} {n_batch} {loss.item()} {grad_norm.item()}\n")
-            lambda_file.write(
-                f"{n_epoch} {n_batch} {l_memex} {norm_regular} {norm_memex}\n"
-            )
+            if memex_mask is not None:
+                lambda_file.write(
+                    f"{n_epoch} {n_batch} {l_memex} {norm_regular} {norm_memex}\n"
+                )
             optimizer.zero_grad()
             nb_acc_samples = 0
             n_batch += 1
diff --git a/problems.py b/problems.py
index 9e368c2..3cdd374 100755
--- a/problems.py
+++ b/problems.py
@@ -149,7 +149,13 @@ class ProblemMemory(Problem):
         return sequences, ar_mask
 
     def seq2str(self, seq):
-        return "".join(self.token_string[x.item()] for x in seq)
+        def symbol(code):
+            # Codes past the end of the token table are rendered as "?".
+            if code < len(self.token_string):
+                return self.token_string[code]
+            return "?"
+
+        return "".join(symbol(tok.item()) for tok in seq)
 
 
 class ProblemTwoTargets(Problem):
diff --git a/pscan.py b/pscan.py
index 0bb0d14..b533164 100755
--- a/pscan.py
+++ b/pscan.py
@@ -124,17 +124,17 @@ if __name__ == "__main__":
 
     ######################################################################
 
-    N, T, D = 16, 4096, 32
+    # N, T, D = 16, 4096, 32
 
-    for r in range(timing.size(0)):
-        A = 0.9 + 0.1 * torch.rand(N, T, dtype=torch.float64).requires_grad_()
-        X = torch.randn(N, T, D, dtype=torch.float64).requires_grad_()
-        Y_init = torch.randn(N, D, dtype=torch.float64).requires_grad_()
+    # for r in range(timing.size(0)):
+    # A = 0.9 + 0.1 * torch.rand(N, T, dtype=torch.float64).requires_grad_()
+    # X = torch.randn(N, T, D, dtype=torch.float64).requires_grad_()
+    # Y_init = torch.randn(N, D, dtype=torch.float64).requires_grad_()
 
-        start_time = time.perf_counter()
-        for _ in range(1000):
-            Y = pscan(A, X, Y_init)
-        duration = time.perf_counter() - start_time
+    # start_time = time.perf_counter()
+    # for _ in range(1000):
+    # Y = pscan(A, X, Y_init)
+    # duration = time.perf_counter() - start_time
 
     ######################################################################
 
diff --git a/tasks.py b/tasks.py
index 218ff36..57c6801 100755
--- a/tasks.py
+++ b/tasks.py
@@ -106,7 +106,7 @@ class SandBox(Task):
             device
         ), self.test_ar_mask.to(device)
 
-        self.nb_codes = max(self.train_input.max(), self.test_input.max()) + 1
+        self.nb_codes = (max(self.train_input.max(), self.test_input.max()) + 1).item()
 
         # A bit of paranoia never hurts
         assert self.nb_codes <= max_nb_codes
@@ -579,7 +579,7 @@ class Maze(Task):
         )
         self.test_input = self.map2seq(test_mazes.to(device), test_paths.to(device))
 
-        self.nb_codes = max(self.train_input.max(), self.test_input.max()) + 1
+        self.nb_codes = (max(self.train_input.max(), self.test_input.max()) + 1).item()
 
     def batches(self, split="train", nb_to_use=-1, desc=None):
         assert split in {"train", "test"}
@@ -756,7 +756,7 @@ class Snake(Task):
             self.device,
         )
 
-        self.nb_codes = max(self.train_input.max(), self.test_input.max()) + 1
+        self.nb_codes = (max(self.train_input.max(), self.test_input.max()) + 1).item()
 
     def batches(self, split="train", nb_to_use=-1, desc=None):
         assert split in {"train", "test"}
@@ -871,7 +871,7 @@ class Stack(Task):
         counts = F.one_hot(counts).sum(0)
         logger(f"test_pop_stack_counts {counts}")
 
-        self.nb_codes = max(self.train_input.max(), self.test_input.max()) + 1
+        self.nb_codes = (max(self.train_input.max(), self.test_input.max()) + 1).item()
 
     def batches(self, split="train", nb_to_use=-1, desc=None):
         assert split in {"train", "test"}
@@ -1078,7 +1078,7 @@ class RPL(Task):
                 s = " ".join(seq)
                 logger(f"example_seq {s}")
 
-        self.nb_codes = max(self.train_input.max(), self.test_input.max()) + 1
+        self.nb_codes = (max(self.train_input.max(), self.test_input.max()) + 1).item()
 
     def batches(self, split="train", nb_to_use=-1, desc=None):
         assert split in {"train", "test"}
@@ -1308,7 +1308,7 @@ class Expr(Task):
         self.train_input = self.tensorize(train_sequences)
         self.test_input = self.tensorize(test_sequences)
 
-        self.nb_codes = max(self.train_input.max(), self.test_input.max()) + 1
+        self.nb_codes = (max(self.train_input.max(), self.test_input.max()) + 1).item()
 
     def batches(self, split="train", nb_to_use=-1, desc=None):
         assert split in {"train", "test"}
@@ -1639,7 +1639,7 @@ class QMLP(Task):
             for e in self.test_ref_test_errors:
                 f.write(f"{e}\n")
 
-        self.nb_codes = max(self.train_input.max(), self.test_input.max()) + 1
+        self.nb_codes = (max(self.train_input.max(), self.test_input.max()) + 1).item()
 
     def batches(self, split="train", desc=None):
         assert split in {"train", "test"}