Update.
[picoclvr.git] / main.py
1 #!/usr/bin/env python
2
3 # Any copyright is dedicated to the Public Domain.
4 # https://creativecommons.org/publicdomain/zero/1.0/
5
6 # Written by Francois Fleuret <francois@fleuret.org>
7
8 # torch.backends.cuda.matmul.allow_tf32
9 # torch.autocast(torch.bfloat16)
10
11 import math, sys, argparse, time, tqdm, itertools, os
12
13 import torch, torchvision
14 from torch import nn
15 from torch.nn import functional as F
16
17 import mygpt, tensorstack
18
19 ######################################################################
20
# Run on the GPU when one is available, on the CPU otherwise. TF32
# matrix products are enabled only in the CUDA case.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    torch.backends.cuda.matmul.allow_tf32 = True
26
27 ######################################################################
28
# Command-line interface. Every option is read later through the
# `args` namespace, so the destination names must not change.
parser = argparse.ArgumentParser(
    description="An implementation of GPT with cache to solve a toy geometric reasoning task."
)

# Values accepted here are the ones handled when the task is built
parser.add_argument(
    "--task", type=str, default="picoclvr", choices=["picoclvr", "maze"]
)

parser.add_argument("--log_filename", type=str, default="train.log")

parser.add_argument("--result_dir", type=str, default="results_default")

# A negative seed leaves the random generators unseeded
parser.add_argument("--seed", type=int, default=0)

parser.add_argument("--nb_epochs", type=int, default=25)

parser.add_argument("--batch_size", type=int, default=25)

parser.add_argument("--nb_train_samples", type=int, default=250000)

parser.add_argument("--nb_test_samples", type=int, default=10000)

# Values accepted here are the ones handled by the training loop
parser.add_argument(
    "--optim", type=str, default="adam", choices=["sgd", "adam", "adamw"]
)

parser.add_argument("--learning_rate", type=float, default=1e-4)

# Either "cos" or a comma-separated list of "epoch: rate" pairs
parser.add_argument("--learning_rate_schedule", type=str, default="10: 2e-5,30: 4e-6")

parser.add_argument("--dim_model", type=int, default=512)

parser.add_argument("--dim_keys", type=int, default=64)

parser.add_argument("--dim_hidden", type=int, default=2048)

parser.add_argument("--nb_heads", type=int, default=8)

parser.add_argument("--nb_blocks", type=int, default=12)

parser.add_argument("--dropout", type=float, default=0.1)

parser.add_argument("--deterministic_synthesis", action="store_true", default=False)

parser.add_argument("--no_checkpoint", action="store_true", default=False)

parser.add_argument("--overwrite_results", action="store_true", default=False)

parser.add_argument("--checkpoint_name", type=str, default="checkpoint.pth")

##############################
# picoclvr options

parser.add_argument("--picoclvr_nb_colors", type=int, default=5)

parser.add_argument("--picoclvr_height", type=int, default=12)

parser.add_argument("--picoclvr_width", type=int, default=16)

# The historical spelling "picocvlr" is kept both as an accepted
# option string and as the destination, since the rest of the script
# reads args.picocvlr_prune_properties. The correctly spelled option
# is an alias for it.
parser.add_argument(
    "--picoclvr_prune_properties",
    "--picocvlr_prune_properties",
    dest="picocvlr_prune_properties",
    type=str,
    default="none",
    choices=["none", "train+eval", "eval"],
)

##############################
# Maze options

parser.add_argument("--maze_height", type=int, default=13)

parser.add_argument("--maze_width", type=int, default=21)

parser.add_argument("--maze_nb_walls", type=int, default=15)
94
95 ######################################################################
96
args = parser.parse_args()

# Explicit validation rather than an assert, which is stripped when
# Python runs with -O
if args.picocvlr_prune_properties not in {"none", "train+eval", "eval"}:
    parser.error(
        f"invalid prune properties mode {args.picocvlr_prune_properties!r}"
    )

# Create the result directory, including missing parents. An existing
# directory is reused only when --overwrite_results is given.
try:
    os.makedirs(args.result_dir)
except FileExistsError:
    if not args.overwrite_results:
        print(f"result directory {args.result_dir} already exists")
        sys.exit(1)

# Opened in append mode so that successive runs in the same directory
# extend the same log. The file stays open for the whole run.
log_file = open(os.path.join(args.result_dir, args.log_filename), "a")

if args.seed >= 0:
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False
    # torch.use_deterministic_algorithms(True)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
117
118 ######################################################################
119
120
def log_string(s):
    """Write the string s, prefixed with a local timestamp, to the log
    file (when there is one) and to the standard output, flushing both."""
    stamped = time.strftime("%Y%m%d-%H:%M:%S ", time.localtime()) + s

    if log_file is not None:
        log_file.write(stamped + "\n")
        log_file.flush()

    print(stamped, flush=True)
130
131
# Record every command-line argument with its value in the log
for n, value in vars(args).items():
    log_string(f"args.{n} {value}")
134
135 ######################################################################
136
137
def masked_inplace_autoregression(
    model, batch_size, input, ar_mask, forbidden_tokens=None, device=torch.device("cpu")
):
    """Fill in place the positions of `input` selected by `ar_mask`.

    `input` and `ar_mask` are processed by chunks of `batch_size`
    rows. In each chunk, the columns from the first to the last one
    having a non-zero mask value are visited in order; at each of them
    the model's output at that column is used as logits to pick a
    token, which overwrites `input` where the mask is 1 and leaves it
    unchanged where it is 0.

    The token is the argmax when args.deterministic_synthesis is set,
    and is sampled from the categorical distribution otherwise. Tokens
    flagged in `forbidden_tokens` (when given) get a logit of -inf.

    A chunk whose mask is entirely zero is skipped. `device` is
    accepted for compatibility with the callers and is not used.
    Nothing is returned.
    """
    for batch_input, batch_mask in zip(
        input.split(batch_size), ar_mask.split(batch_size)
    ):
        positions = (batch_mask.sum(0) > 0).nonzero()

        # Nothing to generate here, and min() / max() of an empty
        # tensor would raise
        if positions.numel() == 0:
            continue

        first, last = positions.min(), positions.max()

        if first > 0:
            model(
                mygpt.BracketedSequence(batch_input, 0, first)
            )  # Needed to initialize the model's cache

        for s in range(first, last + 1):
            output = model(mygpt.BracketedSequence(batch_input, s, 1)).x
            logits = output[:, s]
            if forbidden_tokens is not None:
                logits = logits.masked_fill(forbidden_tokens, float("-inf"))
            if args.deterministic_synthesis:
                t_next = logits.argmax(1)
            else:
                dist = torch.distributions.categorical.Categorical(logits=logits)
                t_next = dist.sample()
            # The chunks are views, so this updates the caller's tensor
            batch_input[:, s] = (
                batch_mask[:, s] * t_next + (1 - batch_mask[:, s]) * batch_input[:, s]
            )
158
159
160 ######################################################################
161
162
class Task:
    """Interface shared by the tasks. The methods here do nothing and
    return None; the task classes below override them."""

    def batches(self, split="train"):
        """Placeholder: iterate over the batches of the given split."""

    def vocabulary_size(self):
        """Placeholder: number of distinct tokens."""

    def produce_results(self, n_epoch, model):
        """Placeholder: evaluate the model and write the results."""
172
173
174 ######################################################################
175
176 import picoclvr
177
178
class TaskPicoCLVR(Task):
    """Task whose samples are token sequences produced by
    picoclvr.generate.

    A sample is a string of space-separated tokens. As used in this
    class, an image is the token <img> followed by height * width
    tokens, and sequences are padded on the right with <nul> when
    stacked in a tensor.
    """

    # Make a tensor from a list of strings
    def tensorize(self, descr):
        token_descr = [s.strip().split(" ") for s in descr]
        max_len = max(len(s) for s in token_descr)
        # Pad on the right with <nul> so that all the rows have the
        # same length
        token_descr = [s + ["<nul>"] * (max_len - len(s)) for s in token_descr]
        id_descr = [[self.token2id[u] for u in s] for s in token_descr]
        return torch.tensor(id_descr, device=self.device)

    # Make a list of strings from a tensor
    def detensorize(self, x):
        return [" ".join([self.id2token[t.item()] for t in r]) for r in x]

    # Trim the tensor z to remove as many columns as possible from
    # left and right, a column being removable when it contains only
    # the given token. If z is a tuple, all its elements are trimmed
    # according to the trimming of the first
    def trim(self, z, token="<nul>"):
        n = self.token2id[token]
        is_tuple = isinstance(z, tuple)
        x = z[0] if is_tuple else z
        # The padding adds one removable column on each side so that a
        # and b below always exist. i counts, at each padded column,
        # the number of non-removable columns seen so far.
        i = (1 - (F.pad(x, (1, 1), value=n) == n).min(0).values.long()).cumsum(0)
        a, b = (i == 0).nonzero().max(), (i == i.max()).nonzero().min()
        if is_tuple:
            return tuple([t[:, a:b] for t in z])
        return z[:, a:b]

    ######################
    # Not the cleanest part of the code

    # Extract the last image of each sequence, from the last <img>
    # included, and set to <nul> all the tokens from the beginning of
    # that image to the end. Returns the trimmed sequences, the loss
    # masks (0 from the last <img> on, 1 before) and the images.
    # NOTE(review): assumes every row contains an <img> followed by a
    # full image -- confirm against the generator
    def excise_last_image(self, input):
        t_img, t_nul = self.token2id["<img>"], self.token2id["<nul>"]
        nb_img_tokens = self.height * self.width + 1

        input = input.clone()
        t = (input == t_img).long()
        # 1 at the positions where all the <img> of the row have
        # already been seen, that is from the last <img> to the end
        tail_masks = (t.cumsum(dim=1) == t.sum(dim=1, keepdim=True)).long()
        i = (t * tail_masks).nonzero(as_tuple=True)
        # Row and column indices of the nb_img_tokens positions
        # starting at the last <img> of each row
        j = (
            i[0][:, None],
            i[1][:, None] + torch.arange(nb_img_tokens, device=input.device)[None, :],
        )
        images = self.trim(input[j])
        input[j] = t_nul
        loss_masks = 1 - tail_masks
        input, loss_masks = self.trim((input, loss_masks))
        return input, loss_masks, images

    # Write the given images in each sequence, starting at its first
    # <nul>, and set the loss masks to 1 there. Returns the trimmed
    # sequences and loss masks.
    # NOTE(review): the first-<nul> detection assumes no regular token
    # follows the first <nul> of a row -- confirm
    def add_true_image(self, input, images, loss_masks):
        t_nul = self.token2id["<nul>"]
        nb_img_tokens = self.height * self.width + 1
        # Make room for one image at the end of every row
        input = F.pad(input, (0, nb_img_tokens), value=t_nul)
        loss_masks = F.pad(loss_masks, (0, nb_img_tokens), value=0)
        t = (input == t_nul).long()
        i = (t.cumsum(dim=1) == 1).nonzero(as_tuple=True)
        j = (
            i[0][:, None],
            i[1][:, None] + torch.arange(nb_img_tokens, device=input.device)[None, :],
        )
        input[j] = images
        loss_masks[j] = 1
        input, loss_masks = self.trim((input, loss_masks))
        return input, loss_masks

    # Write <img> at the first <nul> of each sequence and let the
    # model generate the height * width tokens that follow, <nul>
    # being forbidden. The loss masks are only padded and trimmed
    # like the sequences. The model is run in eval mode without
    # gradient, and put back in its previous mode afterward.
    def add_generated_image(self, input, loss_masks, model):
        t_img, t_nul = self.token2id["<img>"], self.token2id["<nul>"]
        nb_img_tokens = self.height * self.width + 1

        input = F.pad(input, (0, nb_img_tokens), value=t_nul)
        loss_masks = F.pad(loss_masks, (0, nb_img_tokens), value=0)
        t = (input == t_nul).long()
        i = (t.cumsum(dim=1) == 1).nonzero(as_tuple=True)
        input[i] = t_img

        # Positions to generate: the nb_img_tokens - 1 tokens that
        # follow the <img> just written
        j = (
            i[0][:, None],
            i[1][:, None]
            + 1
            + torch.arange(nb_img_tokens - 1, device=input.device)[None, :],
        )
        ar_masks = input.new_zeros(input.size(), dtype=torch.int64)
        ar_masks[j] = 1
        forbidden_tokens = (
            torch.arange(self.vocabulary_size(), device=input.device) == t_nul
        )
        with torch.autograd.no_grad():
            was_training = model.training
            model.eval()
            try:
                masked_inplace_autoregression(
                    model,
                    self.batch_size,
                    input,
                    ar_masks,
                    forbidden_tokens,
                    device=self.device,
                )
            finally:
                # Restore the mode even if the generation fails
                model.train(was_training)

        input, loss_masks = self.trim((input, loss_masks))

        return input, loss_masks

    ######################

    # Generate nb_train_samples and nb_test_samples descriptions with
    # picoclvr.generate (pruner_train being applied to the train set
    # only), build the tokenizer from all of them, and tokenize both
    # sets on the given device. pruner_eval is only stored, for
    # produce_results.
    def __init__(
        self,
        nb_train_samples,
        nb_test_samples,
        batch_size,
        height,
        width,
        nb_colors=5,
        device=torch.device("cpu"),
        pruner_train=None,
        pruner_eval=None,
    ):
        self.height = height
        self.width = width
        self.batch_size = batch_size
        self.device = device
        self.pruner_train = pruner_train
        self.pruner_eval = pruner_eval

        def generate_descr(nb, pruner):
            return picoclvr.generate(
                nb,
                height=self.height,
                width=self.width,
                nb_colors=nb_colors,
                pruner=pruner,
            )

        log_string(
            f"generating {nb_train_samples+nb_test_samples} samples (can take some time)"
        )
        self.train_descr = generate_descr(nb_train_samples, pruner=self.pruner_train)
        self.test_descr = generate_descr(nb_test_samples, pruner=None)

        # Build the tokenizer
        tokens = {"<nul>", "<img>"}
        for d in [self.train_descr, self.test_descr]:
            for s in d:
                tokens.update(s.strip().split(" "))
        # make this set a sorted list to get the same tensors given
        # the same descr
        tokens = sorted(tokens)
        self.token2id = {t: n for n, t in enumerate(tokens)}
        self.id2token = dict(enumerate(tokens))

        # Tokenize the train and test sets
        self.train_input = self.tensorize(self.train_descr)
        self.test_input = self.tensorize(self.test_descr)

    # Yield the trimmed batches of the requested split, with a
    # progress bar
    def batches(self, split="train"):
        assert split in {"train", "test"}
        input = self.train_input if split == "train" else self.test_input
        for batch in tqdm.tqdm(
            input.split(self.batch_size), dynamic_ncols=True, desc=f"epoch-{split}"
        ):
            yield self.trim(batch)

    def vocabulary_size(self):
        return len(self.token2id)

    # Log the number of samples, the numbers of requested and missing
    # properties, and the percentage of missing ones. The percentage
    # is reported as 0 when no property was requested, which avoids a
    # division by zero.
    def _log_property_stats(self, prefix, n_epoch, nb_results, nb_requested, nb_missing):
        log_string(f"nb_{prefix}samples {n_epoch} {nb_results}")
        log_string(
            f"property_{prefix}nb {n_epoch} requested {nb_requested} missing {nb_missing}"
        )
        miss_rate = 100 * nb_missing / nb_requested if nb_requested > 0 else 0.0
        log_string(f"property_{prefix}miss {n_epoch} {miss_rate:.02f}%")

    # Replace the last image of every test sequence with one generated
    # by the model, and log how many of the properties counted by
    # picoclvr.nb_properties are missing. The log keys are prefixed
    # with "pruned_" when a pruner is given.
    def compute_missing_properties(self, n_epoch, model, pruner=None):
        acc_nb_requested_properties = []
        acc_nb_missing_properties = []
        acc_nb_results = 0

        for input in tqdm.tqdm(
            self.test_input.split(self.batch_size),
            dynamic_ncols=True,
            desc="test-properties",
        ):
            tape, loss_masks, _ = self.excise_last_image(input)
            tape, loss_masks = self.add_generated_image(tape, loss_masks, model)
            result_descr = self.detensorize(tape)
            nb_props = picoclvr.nb_properties(
                result_descr,
                height=self.height,
                width=self.width,
                pruner=pruner,
            )
            nb_requested_properties, _, nb_missing_properties = zip(*nb_props)
            acc_nb_requested_properties += nb_requested_properties
            acc_nb_missing_properties += nb_missing_properties
            acc_nb_results += len(result_descr)

        self._log_property_stats(
            "" if pruner is None else "pruned_",
            n_epoch,
            acc_nb_results,
            sum(acc_nb_requested_properties),
            sum(acc_nb_missing_properties),
        )

    ######################################################################

    # Log the missing-property statistics on the test set (and again
    # with pruner_eval when there is one), then generate images for a
    # few hard-coded descriptions, log their statistics under the
    # "demo_" prefix, and save them in a single image file in
    # args.result_dir.
    def produce_results(self, n_epoch, model):
        self.compute_missing_properties(n_epoch, model)

        if self.pruner_eval is not None:
            self.compute_missing_properties(n_epoch, model, self.pruner_eval)

        nb_per_primer = 8
        primer = []

        for primer_descr in [
            "red above green <sep> green top <sep> blue right of red",
            "there is red <sep> there is yellow <sep> there is blue",
            "red below yellow <sep> yellow below green <sep> green below blue <sep> red right <sep> yellow left <sep> green right <sep> blue left",
            "green bottom <sep> yellow bottom <sep> green left of blue <sep> yellow right of blue <sep> blue top",
        ]:
            primer += [primer_descr] * nb_per_primer

        tape = self.tensorize(primer)
        loss_masks = 1 - (tape == self.token2id["<nul>"]).long()
        tape, loss_masks = self.add_generated_image(tape, loss_masks, model)
        result_descr = self.detensorize(tape)

        nb_props = picoclvr.nb_properties(
            result_descr, height=self.height, width=self.width
        )

        acc_nb_requested_properties, _, acc_nb_missing_properties = zip(*nb_props)

        self._log_property_stats(
            "demo_",
            n_epoch,
            len(result_descr),
            sum(acc_nb_requested_properties),
            sum(acc_nb_missing_properties),
        )

        img = picoclvr.descr2img(result_descr, height=self.height, width=self.width)

        # NOTE(review): a 5d result presumably holds several images
        # per description -- confirm against picoclvr.descr2img
        if img.dim() == 5:
            if img.size(1) == 1:
                img = F.pad(img.squeeze(1), pad=(1, 1, 1, 1), value=64)
            else:
                img = torch.cat(
                    [
                        torchvision.utils.make_grid(x, padding=1, pad_value=64)[None]
                        for x in img
                    ],
                    0,
                )

        image_name = os.path.join(args.result_dir, f"result_{n_epoch:04d}.png")
        torchvision.utils.save_image(
            img / 255.0, image_name, nrow=nb_per_primer, padding=1, pad_value=1.0
        )
        log_string(f"wrote {image_name}")
459
460
461 ######################################################################
462
463 import maze
464
465
class TaskMaze(Task):
    """Task whose samples are produced by maze.create_maze_data.

    A sample is the flattened maze followed by the flattened path, so
    the first height * width tokens of a sequence are the maze and
    the remaining ones the path.
    """

    # Flatten each map from its second dimension on and concatenate
    # the results along the sequence dimension
    def map2seq(self, *m):
        return torch.cat([x.flatten(1) for x in m], 1)

    # Inverse of map2seq: split each sequence into maps of size
    # height x width and return a generator over them
    def seq2map(self, s):
        s = s.reshape(s.size(0), -1, self.height, self.width)
        return (s[:, k] for k in range(s.size(1)))

    # Generate the train and test mazes, paths and policies, and store
    # them on the given device as sequences (mazes and paths) and
    # flattened policies.
    def __init__(
        self,
        nb_train_samples,
        nb_test_samples,
        batch_size,
        height,
        width,
        nb_walls,
        device=torch.device("cpu"),
    ):
        self.batch_size = batch_size
        self.height = height
        self.width = width
        self.device = device

        train_mazes, train_paths, train_policies = maze.create_maze_data(
            nb_train_samples,
            height=height,
            width=width,
            nb_walls=nb_walls,
            progress_bar=lambda x: tqdm.tqdm(x, dynamic_ncols=True, desc="data-train"),
        )
        self.train_input = self.map2seq(train_mazes.to(device), train_paths.to(device))
        self.train_policies = train_policies.flatten(-2).to(device)

        test_mazes, test_paths, test_policies = maze.create_maze_data(
            nb_test_samples,
            height=height,
            width=width,
            nb_walls=nb_walls,
            progress_bar=lambda x: tqdm.tqdm(x, dynamic_ncols=True, desc="data-test"),
        )
        self.test_input = self.map2seq(test_mazes.to(device), test_paths.to(device))
        self.test_policies = test_policies.flatten(-2).to(device)

        # Stored as a Python int so that vocabulary_size() returns an
        # int, as the other task does
        self.nb_codes = int(self.train_input.max()) + 1

    # Yield the batches of the requested split, restricted to its
    # first nb_to_use samples when nb_to_use is positive
    def batches(self, split="train", nb_to_use=-1, desc=None):
        assert split in {"train", "test"}
        input = self.train_input if split == "train" else self.test_input
        if nb_to_use > 0:
            input = input[:nb_to_use]
        if desc is None:
            desc = f"epoch-{split}"
        for batch in tqdm.tqdm(
            input.split(self.batch_size), dynamic_ncols=True, desc=desc
        ):
            yield batch

    # Yield pairs (mazes, policies) by batches, the mazes being the
    # first height * width tokens of the sequences, and the policies
    # being zeroed at the wall cells
    def policy_batches(self, split="train", nb_to_use=-1, desc=None):
        assert split in {"train", "test"}
        input = self.train_input if split == "train" else self.test_input
        policies = self.train_policies if split == "train" else self.test_policies
        input = input[:, : self.height * self.width]
        policies = policies * (input != maze.v_wall)[:, None]

        if nb_to_use > 0:
            input = input[:nb_to_use]
            policies = policies[:nb_to_use]

        if desc is None:
            desc = f"epoch-{split}"
        for batch in tqdm.tqdm(
            zip(input.split(self.batch_size), policies.split(self.batch_size)),
            dynamic_ncols=True,
            desc=desc,
        ):
            yield batch

    def vocabulary_size(self):
        return self.nb_codes

    # Return a copy of the sequences whose path part (everything
    # after the first height * width tokens) has been erased and
    # generated again by the model
    def _complete_paths(self, model, input):
        result = input.clone()
        ar_mask = result.new_zeros(result.size())
        ar_mask[:, self.height * self.width :] = 1
        result *= 1 - ar_mask
        masked_inplace_autoregression(
            model, self.batch_size, result, ar_mask, device=self.device
        )
        return result

    # Return the number of samples of the split on which the model
    # was evaluated, and the number of them for which
    # maze.path_correctness accepts the generated path
    def compute_error(self, model, split="train", nb_to_use=-1):
        nb_total, nb_correct = 0, 0
        # Iterate over this object's data, not over the module-level
        # task
        for input in self.batches(split, nb_to_use):
            result = self._complete_paths(model, input)
            mazes, paths = self.seq2map(result)
            nb_correct += maze.path_correctness(mazes, paths).long().sum().item()
            nb_total += mazes.size(0)

        return nb_total, nb_correct

    # Log the accuracy on the first 1000 train and test samples, and
    # save an image of the predictions on the first 48 test samples
    # in args.result_dir. The model is run in eval mode without
    # gradient, and put back in its previous mode afterward.
    def produce_results(self, n_epoch, model):
        with torch.autograd.no_grad():
            was_training = model.training
            model.eval()

            try:
                train_nb_total, train_nb_correct = self.compute_error(
                    model, "train", nb_to_use=1000
                )
                log_string(
                    f"accuracy_train nb_total {train_nb_total} nb_correct {train_nb_correct} accuracy {(100.0*train_nb_correct)/train_nb_total:.02f}%"
                )

                test_nb_total, test_nb_correct = self.compute_error(
                    model, "test", nb_to_use=1000
                )
                log_string(
                    f"accuracy_test nb_total {test_nb_total} nb_correct {test_nb_correct} accuracy {(100.0*test_nb_correct)/test_nb_total:.02f}%"
                )

                input = self.test_input[:48]
                result = self._complete_paths(model, input)

                mazes, paths = self.seq2map(input)
                _, predicted_paths = self.seq2map(result)
                filename = f"result_{n_epoch:04d}.png"
                maze.save_image(
                    os.path.join(args.result_dir, filename),
                    mazes=mazes,
                    target_paths=paths,
                    predicted_paths=predicted_paths,
                    path_correct=maze.path_correctness(mazes, predicted_paths),
                )
                log_string(f"wrote {filename}")
            finally:
                # Restore the mode even if the evaluation fails
                model.train(was_training)
603
604
605 ######################################################################
606
607
def picoclvr_pruner_horizontal_green(p):
    """Return False when p contains "green" together with "left" or
    "right", and True otherwise."""
    if "green" not in p:
        return True
    return "left" not in p and "right" not in p
610
611
# The train pruner is used only in the "train+eval" mode
if args.picocvlr_prune_properties == "train+eval":
    picoclvr_pruner_train = picoclvr_pruner_horizontal_green
else:
    picoclvr_pruner_train = None

# The eval pruner is the negation of the train one, and is used in
# both the "train+eval" and "eval" modes
if args.picocvlr_prune_properties in ("train+eval", "eval"):

    def picoclvr_pruner_eval(p):
        return not picoclvr_pruner_horizontal_green(p)

else:
    picoclvr_pruner_eval = None
623
624 ######################################################################
625
# Build the task selected on the command line
if args.task not in ("picoclvr", "maze"):
    raise ValueError(f"Unknown task {args.task}")

# Arguments common to all the tasks
common_task_args = dict(
    nb_train_samples=args.nb_train_samples,
    nb_test_samples=args.nb_test_samples,
    batch_size=args.batch_size,
    device=device,
)

if args.task == "picoclvr":
    task = TaskPicoCLVR(
        height=args.picoclvr_height,
        width=args.picoclvr_width,
        nb_colors=args.picoclvr_nb_colors,
        pruner_train=picoclvr_pruner_train,
        pruner_eval=picoclvr_pruner_eval,
        **common_task_args,
    )
else:
    task = TaskMaze(
        height=args.maze_height,
        width=args.maze_width,
        nb_walls=args.maze_nb_walls,
        **common_task_args,
    )
652
653 ######################################################################
654
log_string(f"device {device}")

# The size of the model's vocabulary is given by the task
vocabulary_size = task.vocabulary_size()
log_string(f"vocabulary_size {vocabulary_size}")

##############################

# Causal model whose dimensions come from the command line
model = mygpt.MyGPT(
    vocabulary_size=vocabulary_size,
    dim_model=args.dim_model,
    dim_keys=args.dim_keys,
    dim_hidden=args.dim_hidden,
    nb_heads=args.nb_heads,
    nb_blocks=args.nb_blocks,
    causal=True,
    dropout=args.dropout,
)
model.to(device)

# Total number of scalar parameters, also logged in millions
nb_parameters = 0
for p in model.parameters():
    nb_parameters += p.numel()

log_string(f"nb_parameters {nb_parameters} ({int(nb_parameters/1e6)}M)")
678
679 ######################################################################
680
# Number of epochs already done, updated from the checkpoint if one
# is loaded
nb_epochs_finished = 0

if args.no_checkpoint:
    log_string(f"not trying to load checkpoint.")

else:
    checkpoint_name = os.path.join(args.result_dir, args.checkpoint_name)

    try:
        # Everything is loaded on the CPU: load_state_dict copies the
        # values into the model's own parameters, and the RNG states
        # are CPU tensors. This also lets a checkpoint written on a
        # GPU machine be read on a CPU-only one.
        checkpoint = torch.load(checkpoint_name, map_location="cpu")
        nb_epochs_finished = checkpoint["nb_epochs_finished"]
        model.load_state_dict(checkpoint["model_state"])
        torch.set_rng_state(checkpoint["rng_state"])
        # The CUDA state is only in checkpoints written with CUDA
        # available
        if torch.cuda.is_available() and "cuda_rng_state" in checkpoint:
            torch.cuda.set_rng_state(checkpoint["cuda_rng_state"])

        log_string(f"checkpoint loaded with {nb_epochs_finished} epochs finished.")

    except FileNotFoundError:
        log_string("starting from scratch.")

    except Exception as e:
        # Report the cause instead of hiding it; SystemExit and
        # KeyboardInterrupt are not caught here
        log_string(f"error when loading the checkpoint. ({e!r})")
        sys.exit(1)
704
705 ######################################################################
706
# A non-positive --nb_epochs means "use the default number of epochs",
# which is the default value declared in the parser
if args.nb_epochs > 0:
    nb_epochs = args.nb_epochs
else:
    nb_epochs = parser.get_default("nb_epochs")

# Perplexity of the empirical token distribution of the train set,
# that is exp of its entropy. It is logged next to the model's
# perplexities at every epoch.
nb_tokens = int(task.vocabulary_size())
token_count = torch.zeros(nb_tokens, dtype=torch.int64, device=device)
for batch in task.batches(split="train"):
    # Per-token counts, without building a one-hot tensor
    token_count += torch.bincount(batch.flatten(), minlength=nb_tokens)
token_probas = token_count / token_count.sum()
# xlogy gives 0 for the tokens that never appear
entropy = -torch.xlogy(token_probas, token_probas).sum()
train_set_perplexity = math.exp(entropy.item())
715
716 ##############################
717
# Build the dictionary giving the learning rate of every epoch. It
# covers range(nb_epochs), which is the range the training loop
# iterates over.
if args.learning_rate_schedule == "cos":
    # Half-cosine decay starting at args.learning_rate
    learning_rate_schedule = {}
    for n_epoch in range(nb_epochs):
        u = n_epoch / nb_epochs * math.pi
        learning_rate_schedule[n_epoch] = args.learning_rate * 0.5 * (1 + math.cos(u))
else:
    # Comma-separated "epoch: rate" pairs, each giving the rate to use
    # from that epoch on. Blank entries are ignored, so an empty
    # schedule keeps args.learning_rate for all the epochs.
    u = {}
    for entry in args.learning_rate_schedule.split(","):
        if entry.strip() == "":
            continue
        k, v = entry.split(":")
        u[int(k)] = float(v)

    learning_rate_schedule = {}
    learning_rate = args.learning_rate
    for n_epoch in range(nb_epochs):
        if n_epoch in u:
            learning_rate = u[n_epoch]
        learning_rate_schedule[n_epoch] = learning_rate

log_string(f"learning_rate_schedule {learning_rate_schedule}")
739
740 ##############################
741
nb_samples_seen = 0

# All the requested epochs are already done: only produce the results
# of the loaded model
if nb_epochs_finished >= nb_epochs:
    task.produce_results(nb_epochs_finished, model)

for n_epoch in range(nb_epochs_finished, nb_epochs):
    learning_rate = learning_rate_schedule[n_epoch]

    log_string(f"learning_rate {learning_rate}")

    # A new optimizer is created at every epoch, with the learning
    # rate of that epoch
    optimizer_classes = {
        "sgd": torch.optim.SGD,
        "adam": torch.optim.Adam,
        "adamw": torch.optim.AdamW,
    }
    if args.optim not in optimizer_classes:
        raise ValueError(f"Unknown optimizer {args.optim}.")
    optimizer = optimizer_classes[args.optim](model.parameters(), lr=learning_rate)

    model.train()

    nb_train_samples, acc_train_loss = 0, 0.0

    # One pass over the train set, the loss being the cross-entropy
    # between the model's output and the input sequence itself
    for input in task.batches(split="train"):
        input = input.to(device)
        nb_in_batch = input.size(0)

        output = model(mygpt.BracketedSequence(input)).x
        loss = F.cross_entropy(output.transpose(1, 2), input)

        acc_train_loss += loss.item() * nb_in_batch
        nb_train_samples += nb_in_batch
        nb_samples_seen += nb_in_batch

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    with torch.autograd.no_grad():
        model.eval()

        nb_test_samples, acc_test_loss = 0, 0.0

        # Same loss on the test set, without any update
        for input in task.batches(split="test"):
            input = input.to(device)
            nb_in_batch = input.size(0)

            # input, loss_masks, true_images = task.excise_last_image(input)
            # input, loss_masks = task.add_true_image(input, true_images, loss_masks)

            output = model(mygpt.BracketedSequence(input)).x
            loss = F.cross_entropy(output.transpose(1, 2), input)

            acc_test_loss += loss.item() * nb_in_batch
            nb_test_samples += nb_in_batch

        # The mean losses are clamped at 100 before the exponential
        mean_train_loss = acc_train_loss / nb_train_samples
        mean_test_loss = acc_test_loss / nb_test_samples
        train_perplexity = math.exp(min(100, mean_train_loss))
        test_perplexity = math.exp(min(100, mean_test_loss))

        log_string(
            f"perplexity {n_epoch} train_set {train_set_perplexity} train_prediction {train_perplexity} test_prediction {test_perplexity}"
        )

        task.produce_results(n_epoch, model)

    # Save the model and the random generator states at the end of
    # every epoch, so that training can be resumed from there
    checkpoint_name = os.path.join(args.result_dir, args.checkpoint_name)

    checkpoint = {
        "nb_epochs_finished": n_epoch + 1,
        "model_state": model.state_dict(),
        "rng_state": torch.get_rng_state(),
    }
    if torch.cuda.is_available():
        checkpoint["cuda_rng_state"] = torch.cuda.get_rng_state()

    torch.save(checkpoint, checkpoint_name)
    log_string(f"saved checkpoint {checkpoint_name}")
814
815 ######################################################################