+ # warnings.warn("softmax gating", RuntimeWarning)
+
+ # G = (
+ # torch.einsum("ntc,hrc->nhrt", X, self.w_G) + self.b_G[None, :, :, None]
+ # ).softmax(dim=2)
+
+ ######################################################################
+ # The "flashbacks"
+
+ if self.training and self.proba_gate_dropout > 0.0:
+ # This is a better implementation of "flashbacks".
+
+ # G is NxHxExT, where the E axis indexes the caterpillar's rows.
+
+ warnings.warn("gate dropout", RuntimeWarning)
+
+ kill = (
+ torch.rand(G.size(), device=G.device) <= self.proba_gate_dropout
+ ).float()
+
+ alpha = G / (1 - self.proba_gate_dropout)
+
+ G = alpha * (1 - kill)
+
+ ######################################################################