- V = torch.einsum("ntc,hdc->nhtd", X, self.w_V)
- K = torch.einsum("ntc,hdc->nhtd", X, self.w_K)
+ # warnings.warn("softmax gating", RuntimeWarning)
+
+ # G = (
+ # torch.einsum("ntc,hrc->nhrt", X, self.w_G) + self.b_G[None, :, :, None]
+ # ).softmax(dim=2)
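+ # (disabled alternative: a softmax over dim=2, i.e. over the
+ # caterpillar rows r of the nhrt tensor, which would make each
+ # head's gates sum to 1 across rows instead of relying on the
+ # clipping done in recurrence() below)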
+
+ ######################################################################
+ # The "flashbacks"
+
+ if self.training and self.proba_gate_dropout > 0.0:
+ # This is a better implementation of "flashbacks".
+
+ # G is NxHxRxT, where r indexes the caterpillar's rows.
+
+ warnings.warn("gate dropout", RuntimeWarning)
+
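+ # kill is a Bernoulli mask: each entry is 1 (gate dropped) with
+ # probability proba_gate_dropout, 0 otherwise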
+ kill = (
+ torch.rand(G.size(), device=G.device) <= self.proba_gate_dropout
+ ).float()
+
+ alpha = G / (1 - self.proba_gate_dropout)
+
+ G = alpha * (1 - kill)
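+
+ # Scaling the survivors by 1/(1 - p) makes the dropout unbiased:
+ # E[alpha * (1 - kill)] = G / (1 - p) * (1 - p) = G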
+
+ def recurrence(G, V, K):
+ # Clip the gating so that the total write weight into a row,
+ # summed over heads, cannot exceed 1 when several heads hit
+ # the same row
+
+ G = G / G.sum(1, keepdim=True).clamp(min=1)
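+
+ # e.g. if the heads write a total of 1.5 into some row at some
+ # time step, its gates are scaled by 1/1.5; a total <= 1 is
+ # left untouched by the clamp(min=1)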
+
+ # We prepare the arguments for the parallel scan
+
+ A = 1 - G.sum(1)
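+
+ # A is NxRxT; presumably the parallel scan downstream (not in
+ # this hunk) computes rec[t] = A[t] * rec[t-1] + gated_V[t],
+ # so A is the per-row retention coefficient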
+
+ gated_V = torch.einsum("nhrt,nhtd->nrtd", G, V)
+ gated_K = torch.einsum("nhrt,nhtd->nrtd", G, K)
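+
+ # the einsums sum over the head dimension h, accumulating every
+ # head's gated contribution into its target row r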