X-Git-Url: https://fleuret.org/cgi-bin/gitweb/gitweb.cgi?a=blobdiff_plain;f=mygpt.py;h=0e94672777ad3df99016ffd0da855a927f8a6e6a;hb=597f3a01de92be1a4f05114df11d9e39b3608e29;hp=9bacaffbe7100507c93297fb7acd418474e68729;hpb=f06a70eca52e988857ee043f1379d41b09dd365d;p=mygptrnn.git

diff --git a/mygpt.py b/mygpt.py
index 9bacaff..0e94672 100755
--- a/mygpt.py
+++ b/mygpt.py
@@ -514,7 +514,7 @@ class Caterpillar(nn.Module):
         T = bs.x.size(1)
         DV = self.w_V.size(1)
         DK = self.w_K.size(1)
-        Dout = self.w_O.size(1)
+        DM = self.w_O.size(1)
         CH = self.caterpillar_height
         CL = self.caterpillar_length
 
@@ -522,6 +522,8 @@ class Caterpillar(nn.Module):
             t0 >= CL and (t1 - t0) % CL == 0
         ), f"bs.first should be greater than caterpillar_length, and bs.nb should be a multiple of caterpillar_length"
 
+        # We cache values to deal efficiently with auto-regression
+
         if bs.init_cache:
             self.rec_V = X.new_zeros(N, CH, T, DV)
             self.rec_K = X.new_zeros(N, CH, T, DK)
@@ -530,7 +532,7 @@ class Caterpillar(nn.Module):
             self.rec_V[:, :, t0 - CL : t0] = self.init_V_rec[None, :, :, :]
             self.rec_K[:, :, t0 - CL : t0] = self.init_K_rec[None, :, :, :]
 
-            self.cache_Y = X.new_zeros(N, T, Dout)
+            self.cache_Y = X.new_zeros(N, T, DM)
 
         ######################################################################
         # Compute the recurrent state
@@ -545,6 +547,8 @@ class Caterpillar(nn.Module):
             torch.einsum("ntc,hec->nhet", X, self.w_G) + self.b_G[None, :, :, None]
         ).sigmoid()
 
+        G = F.dropout(G, self.attention_dropout, self.training)
+
         V = torch.einsum("ntc,hdc->nhtd", X, self.w_V)
         K = torch.einsum("ntc,hdc->nhtd", X, self.w_K)
 
@@ -561,7 +565,7 @@ class Caterpillar(nn.Module):
         # by updating that at time t-L, the parallel scan operates
         # with a period of L. To do so we split the time indexing in
         # two axes, the second of size CL, and run the parallel scan
-        # using the other alone as the sequence index.
+        # using the other as the sequence index.
 
         A = A.unflatten(2, (-1, CL))
         gated_V = gated_V.unflatten(2, (-1, CL))
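
Below is a minimal, self-contained sketch of the period-CL trick that the comment in the last hunk describes: a recurrence that looks back CL steps becomes an ordinary one-step recurrence once the time axis is split into (T // CL, CL) with unflatten, so a parallel scan can run over the outer axis alone. The helper names and the explicit Python loop standing in for the repository's parallel scan are illustrative assumptions, not code from the commit.

import torch


def naive_periodic_recurrence(A, X, CL):
    # Reference implementation: rec[t] = A[t] * rec[t - CL] + X[t],
    # computed with an explicit loop over the time dimension (dim 2).
    rec = torch.zeros_like(X)
    for t in range(X.size(2)):
        prev = rec[:, :, t - CL] if t >= CL else torch.zeros_like(X[:, :, 0])
        rec[:, :, t] = A[:, :, t] * prev + X[:, :, t]
    return rec


def reshaped_periodic_recurrence(A, X, CL):
    # Split time into (T // CL, CL): the period-CL recurrence becomes a
    # period-1 recurrence along the first of the two new axes, which is
    # what lets a parallel scan be applied. The loop over u below stands
    # in for that scan; it processes all CL offsets at once.
    A, X = A.unflatten(2, (-1, CL)), X.unflatten(2, (-1, CL))
    rec = torch.zeros_like(X)
    acc = torch.zeros_like(X[:, :, 0])
    for u in range(X.size(2)):
        acc = A[:, :, u] * acc + X[:, :, u]
        rec[:, :, u] = acc
    return rec.flatten(2, 3)


if __name__ == "__main__":
    N, H, T, D, CL = 2, 3, 12, 5, 4
    A, X = torch.rand(N, H, T, D), torch.randn(N, H, T, D)
    r1 = naive_periodic_recurrence(A, X, CL)
    r2 = reshaped_periodic_recurrence(A, X, CL)
    assert torch.allclose(r1, r2, atol=1e-6)
    print("period-CL recurrence matches after the unflatten trick")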