+ # Build a per-(sample, head) one-hot "gate dropout" position over the
+ # time slice [t0, t1): after sorting uniform noise along dim 3, the
+ # position where the originally-first element landed (`indices == 0`)
+ # is a single uniformly-random time step per (n, h).
+ dropout_head = (
+ (torch.rand(N, H, 1, t1 - t0, device=G.device).sort(dim=3).indices == 0)
+ .expand_as(G)
+ .float()
+ )
+
+ # 1.0 at every time step strictly AFTER the one-hot position, 0.0 at and
+ # before it (the cumsum includes the head itself, so it is subtracted out).
+ dropout_tail = dropout_head.cumsum(dim=3) - dropout_head
+
+ # Per-sample Bernoulli switch: with probability self.proba_gate_dropout
+ # the head/tail perturbation below is applied; otherwise both masks are
+ # zeroed and the block is a no-op on G.
+ dropout_active = (
+ torch.rand(N, 1, 1, 1, device=G.device) < self.proba_gate_dropout
+ ).long()
+
+ dropout_head *= dropout_active
+ dropout_tail *= dropout_active
+
+ # Straight-through-style edit of the gates: at the sampled head position
+ # the forward value of G is forced to 1 - epsilon, and every later step
+ # is forced to 0; the .detach() calls change only the forward values
+ # while the gradient still flows through the original G term.
+ G = (
+ G
+ + dropout_head * (1 - epsilon - G.detach())
+ - dropout_tail * G.detach()
+ )
+
+ ######################################################################
+
+ # We prepare the arguments for the parallel scan
+
+ # Clip the gating to avoid values greater than 1 when several
+ # heads hit the same row
+
+ # Renormalize across heads (dim 1) only where the head-sum exceeds 1
+ # (clamp(min=1) leaves smaller sums untouched), so the per-row total
+ # gating never exceeds 1 and A below stays non-negative.
+ G = G / G.sum(1, keepdim=True).clamp(min=1)
+
+ # Recurrence decay coefficient: the fraction of the previous recurrent
+ # state that survives after all heads have written their share.
+ A = 1 - G.sum(1)
+
+ # NOTE(review): commented-out "harmonic recurrence" experiment, kept as
+ # in the original diff.
+ # warnings.warn("harmonic recurrence", RuntimeWarning)
+ # har = torch.arange(t0, t1, device = G.device).float() + 1
+ # A = har / (har + 1)
+ # G = G / har
+
+ # Per the einsum spec: G is (N, H, R, T) and V/K are (N, H, T, D);
+ # heads (h) are summed out, producing per-row gated updates (N, R, T, D).
+ gated_V = torch.einsum("nhrt,nhtd->nrtd", G, V)
+ gated_K = torch.einsum("nhrt,nhtd->nrtd", G, K)
+
+ # We start from cached values, which matters in inference
+
+ # Slice of length L of the cached recurrent state ending just before t0.
+ # NOTE(review): L is defined outside this chunk (presumably the
+ # recurrent-window length); if t0 < L the negative start index would
+ # wrap around Python-slice style — presumably t0 >= L is guaranteed
+ # upstream; verify against the enclosing method.
+ init_rec_V = self.rec_V[:, :, t0 - L : t0]
+ init_rec_K = self.rec_K[:, :, t0 - L : t0]