+ ######################################################################
+
+ if self.training and self.proba_gate_dropout > 0.0:
+ warnings.warn("gate dropout", RuntimeWarning)
+ epsilon = 0.5
+
+ #################################################################
+ # Associative scan
+
+ # There is a trick here: since the stack at position t is
+ # computed by updating the stack at position t-CL, the parallel
+ # scan operates with a period of CL. To do so, we split the
+ # sequence index into two axes, the second of size CL, and
+ # run the parallel scan using the first as the sequence index.
+