X-Git-Url: https://fleuret.org/cgi-bin/gitweb/gitweb.cgi?a=blobdiff_plain;f=mygpt.py;h=0e94672777ad3df99016ffd0da855a927f8a6e6a;hb=597f3a01de92be1a4f05114df11d9e39b3608e29;hp=33c6fee0f89f60f53014d9b8defebf38a7d5f1b7;hpb=a3abd0f58cfb2f2448c82db836093d20dc2954f2;p=mygptrnn.git diff --git a/mygpt.py b/mygpt.py index 33c6fee..0e94672 100755 --- a/mygpt.py +++ b/mygpt.py @@ -441,6 +441,11 @@ class KVRec(nn.Module): ############################## +# Returns a tensor with an additional index at rank win_dim, that moves +# along the same dimension as dim, over the domain {0...win_size-1}, and +# dim is restricted to a domain reduced by win_size-1 values. + + def moving_window(x, dim, win_dim, win_size): size, stride = x.size(), x.stride() size = size[:dim] + (size[dim] - win_size + 1,) + size[dim + 1 :] @@ -509,7 +514,7 @@ class Caterpillar(nn.Module): T = bs.x.size(1) DV = self.w_V.size(1) DK = self.w_K.size(1) - Dout = self.w_O.size(1) + DM = self.w_O.size(1) CH = self.caterpillar_height CL = self.caterpillar_length @@ -517,6 +522,8 @@ class Caterpillar(nn.Module): t0 >= CL and (t1 - t0) % CL == 0 ), f"bs.first should be greater than caterpillar_length, and bs.nb should be a multiple of caterpillar_length" + # We cache values to deal efficiently with auto-regression + if bs.init_cache: self.rec_V = X.new_zeros(N, CH, T, DV) self.rec_K = X.new_zeros(N, CH, T, DK) @@ -525,7 +532,7 @@ class Caterpillar(nn.Module): self.rec_V[:, :, t0 - CL : t0] = self.init_V_rec[None, :, :, :] self.rec_K[:, :, t0 - CL : t0] = self.init_K_rec[None, :, :, :] - self.cache_Y = X.new_zeros(N, T, Dout) + self.cache_Y = X.new_zeros(N, T, DM) ###################################################################### # Compute the recurrent state @@ -540,6 +547,8 @@ class Caterpillar(nn.Module): torch.einsum("ntc,hec->nhet", X, self.w_G) + self.b_G[None, :, :, None] ).sigmoid() + G = F.dropout(G, self.attention_dropout, self.training) + V = torch.einsum("ntc,hdc->nhtd", X, self.w_V) K =
torch.einsum("ntc,hdc->nhtd", X, self.w_K) @@ -556,7 +565,7 @@ class Caterpillar(nn.Module): # by updating that at time t-L, the parallel scan operates # with a period of L. To do so we split the time indexing in # two axes, the second of size CL, and run the parallel scan - # using the other alone as the sequence index. + # using the other as the sequence index. A = A.unflatten(2, (-1, CL)) gated_V = gated_V.unflatten(2, (-1, CL))