- # Here there is a trick: Since the stack at time t is computed
- # by updating that at time t-L, the parallel scan operates
- # with a period of L. To do so we split the time indexing in
- # two axes, the second of size CL, and run the parallel scan
- # using the other as the sequence index.
+ ######################################################################
+
+ if self.training and self.proba_gate_dropout > 0.0:
+ warnings.warn("gate dropout", RuntimeWarning)
+ epsilon = 0.5
+
+ #################################################################
+ # Associative scan
+
+ # The trick here: since the stack at position t is computed
+ # by updating the stack at position t-CL, the parallel scan
+ # operates with a period of CL. To do so, we split the
+ # sequence indexing into two axes, the second of size CL, and
+ # run the parallel scan using the first as the sequence index.