X-Git-Url: https://fleuret.org/cgi-bin/gitweb/gitweb.cgi?a=blobdiff_plain;ds=inline;f=main.py;h=83227bb43a24897b506f0b82035a69dc33f7acfe;hb=e68f19634d3282e39a488d146480b19bb23e8652;hp=339d18569da3945b0f5617a654e17f99d469ccb6;hpb=915db2eb89076dca35cc89df3ad895ddf346475f;p=mygpt.git

diff --git a/main.py b/main.py
index 339d185..83227bb 100755
--- a/main.py
+++ b/main.py
@@ -126,7 +126,7 @@ def autoregression(
     results = torch.cat((primer, results), 1)
 
     for input in results.split(batch_size):
-        for s in tqdm.tqdm(range(first, input.size(1)), desc = 'synth'):
+        for s in range(first, input.size(1)):
             output = model(input)
             logits = output[:, s]
             if args.synthesis_sampling:
@@ -156,28 +156,32 @@ import picoclvr
 
 class TaskPicoCLVR(Task):
 
-    def descr2tensor(self, descr):
-        t = [ [ self.token2id[u] for u in s ] for s in descr ]
-        return torch.tensor(t, device = self.device)
+    # Make a tensor from a list of strings
+    def tensorize(self, descr):
+        token_descr = [ s.strip().split(' ') for s in descr ]
+        l = max([ len(s) for s in token_descr ])
+        #token_descr = [ [ '<nul>' ] * (l - len(s)) + s for s in token_descr ]
+        token_descr = [ s + [ '<nul>' ] * (l - len(s)) for s in token_descr ]
+        id_descr = [ [ self.token2id[u] for u in s ] for s in token_descr ]
+        return torch.tensor(id_descr, device = self.device)
+
+    def trim(self, x, token = '<nul>'):
+        n = self.token2id[token]
+        i = (1 - (F.pad(x, (1, 1), value = n) == n).min(0).values.long()).cumsum(0)
+        a, b = (i == 0).nonzero().max(), (i == i.max()).nonzero().min()
+        return x[:, a:b]
 
     def __init__(self, batch_size, height, width, nb_colors = 5, device = torch.device('cpu')):
 
         def generate_descr(nb):
-            descr = picoclvr.generate(
+            return picoclvr.generate(
                 nb, height = self.height, width = self.width, nb_colors = nb_colors
             )
 
-            descr = [ s.strip().split(' ') for s in descr ]
-            l = max([ len(s) for s in descr ])
-            #descr = [ [ '<nul>' ] * (l - len(s)) + s for s in descr ]
-            descr = [ s + [ '<nul>' ] * (l - len(s)) for s in descr ]
-
-            return descr
-
         self.height = height
         self.width = width
         self.batch_size = batch_size
@@ -188,36 +192,28 @@ class TaskPicoCLVR(Task):
         self.test_descr = generate_descr((nb * 1) // 5)
 
         # Build the tokenizer
-        tokens = set()
+        tokens = { '<nul>' }
         for d in [ self.train_descr, self.test_descr ]:
             for s in d:
-                for t in s: tokens.add(t)
+                for t in s.strip().split(' '): tokens.add(t)
         self.token2id = dict([ (t, n) for n, t in enumerate(tokens) ])
         self.id2token = dict([ (n, t) for n, t in enumerate(tokens) ])
 
         # Tokenize the train and test sets
-        self.train_input = descr2tensor(self.train_descr)
-        self.test_input = descr2tensor(self.test_descr)
+        self.train_input = self.tensorize(self.train_descr)
+        self.test_input = self.tensorize(self.test_descr)
 
     def batches(self, split = 'train'):
         assert split in { 'train', 'test' }
         input = self.train_input if split == 'train' else self.test_input
         for batch in tqdm.tqdm(input.split(self.batch_size), desc = f'epoch-{split}'):
-            yield batch
+            yield self.trim(batch)
 
     def vocabulary_size(self):
         return len(self.token2id)
 
-    def generate(self, primer_descr, model, nb_tokens):
-        results = autoregression(
-            model, self.batch_size,
-            nb_samples = 1, nb_tokens = nb_tokens, primer = descr2tensor(primer_descr),
-            device = self.device
-        )
-        return ' '.join([ self.id2token[t.item()] for t in results.flatten() ])
-
     def produce_results(self, n_epoch, model):
-        nb_tokens = self.height * self.width + 3
+        nb_tokens_to_generate = self.height * self.width + 3
         result_descr = [ ]
         nb_per_primer = 8
 
@@ -228,18 +224,17 @@ class TaskPicoCLVR(Task):
             'green bottom <sep> yellow bottom <sep> green left of blue <sep> yellow right of blue <sep> blue top <img>',
         ]:
 
-            for k in range(nb_per_primer):
-                result_descr.append(self.generate(primer_descr, model, nb_tokens))
+            results = autoregression(
+                model,
+                self.batch_size,
+                nb_samples = nb_per_primer,
+                nb_tokens_to_generate = nb_tokens_to_generate,
+                primer = self.tensorize([ primer_descr ]).expand(nb_per_primer, -1),
+                device = self.device
+            )
 
-        img = [ picoclvr.descr2img(d, height = self.height, width = self.width)
-                for d in result_descr ]
-        img = torch.cat(img, 0)
-        image_name = f'result_picoclvr_{n_epoch:04d}.png'
-        torchvision.utils.save_image(
-            img / 255.,
-            image_name, nrow = nb_per_primer, pad_value = 0.8
-        )
-        log_string(f'wrote {image_name}')
+            l = [ ' '.join([ self.id2token[t.item()] for t in r ]) for r in results ]
+            result_descr += l
 
         np = picoclvr.nb_properties(
             result_descr,
@@ -250,6 +245,19 @@ class TaskPicoCLVR(Task):
 
         log_string(f'nb_requested_properties {sum(nb_requested_properties) / len(result_descr):.02f} nb_missing_properties {sum(nb_missing_properties) / len(result_descr):.02f}')
 
+        img = [
+            picoclvr.descr2img(d, height = self.height, width = self.width)
+            for d in result_descr
+        ]
+
+        img = torch.cat(img, 0)
+        image_name = f'result_picoclvr_{n_epoch:04d}.png'
+        torchvision.utils.save_image(
+            img / 255.,
+            image_name, nrow = nb_per_primer, pad_value = 0.8
+        )
+        log_string(f'wrote {image_name}')
+
 ######################################################################
 
 class TaskWiki103(Task):
@@ -276,15 +284,16 @@ class TaskWiki103(Task):
 
         self.vocab = torchtext.vocab.build_vocab_from_iterator(
             yield_tokens(),
-            specials = [ '<unk>', '<non>' ],
+            specials = [ '<unk>', '<nul>' ],
            min_freq = self.min_freq
         )
 
         self.vocab.set_default_index(self.vocab[ '<unk>' ])
 
+    # makes a tensor from a list of list of tokens
     def tensorize(self, s):
         a = max(len(x) for x in s)
-        return torch.tensor([ self.vocab(x + [ '<non>' ] * (a - len(x))) for x in s ])
+        return torch.tensor([ self.vocab(x + [ '<nul>' ] * (a - len(x))) for x in s ])
 
     def yield_batches(self, ds):
         s = [ ]
@@ -342,7 +351,7 @@ class TaskWiki103(Task):
                 else:
                     t_next = logits.argmax()
                 t_generated.append(self.vocab.lookup_token(t_next))
-                if t_generated[-1] == '<non>': break
+                if t_generated[-1] == '<nul>': break
 
             s = ' '.join(t_generated)
 
@@ -461,7 +470,6 @@ for input in task.batches(split = 'train'):
 
 token_probas = token_count / token_count.sum()
 entropy = -torch.xlogy(token_probas, token_probas).sum()
 train_set_perplexity = math.exp(entropy)
-#log_string(f'train set perplexity {train_set_perplexity}')
 
 for k in range(nb_epochs_finished, nb_epochs):
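
Below, a minimal self-contained sketch of the padding behaviour this patch
moves out of generate_descr() and into TaskPicoCLVR.tensorize(): every
description is right-padded with '<nul>' to the length of the longest one.
The toy token2id vocabulary here is illustrative only, not the one the task
builds from its training data.

    import torch

    token2id = { '<nul>': 0, 'red': 1, 'top': 2, 'blue': 3, 'left': 4 }

    def tensorize(descr):
        # split each description string into tokens
        token_descr = [ s.strip().split(' ') for s in descr ]
        l = max(len(s) for s in token_descr)
        # right-pad with '<nul>' so every row has the same length
        token_descr = [ s + [ '<nul>' ] * (l - len(s)) for s in token_descr ]
        return torch.tensor([ [ token2id[u] for u in s ] for s in token_descr ])

    print(tensorize([ 'red top', 'blue left red' ]))
    # tensor([[1, 2, 0],
    #         [3, 4, 1]])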
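The new trim() works in the other direction at batch time: batches() now
yields self.trim(batch), which drops the leading and trailing columns that
are '<nul>' in every row of the batch, while keeping interior padding. A
sketch of the same tensor logic, under the assumption that pad id 0 stands
for '<nul>':

    import torch
    import torch.nn.functional as F

    def trim(x, n = 0):
        # x: (batch, length) tensor of token ids, padded with n.
        # min(0) is 1 only for columns that are padding in *every* row,
        # so the cumulative sum i stays flat across all-padding columns.
        i = (1 - (F.pad(x, (1, 1), value = n) == n).min(0).values.long()).cumsum(0)
        # a: end of the left all-padding margin; b: start of the right one
        a, b = (i == 0).nonzero().max(), (i == i.max()).nonzero().min()
        return x[:, a:b]

    x = torch.tensor([
        [ 0, 0, 5, 7, 0, 0 ],
        [ 0, 3, 4, 0, 0, 0 ],
    ])
    print(trim(x))
    # tensor([[0, 5, 7],
    #         [3, 4, 0]])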