Update.
[picoclvr.git] / main.py
1 #!/usr/bin/env python
2
3 # Any copyright is dedicated to the Public Domain.
4 # https://creativecommons.org/publicdomain/zero/1.0/
5
6 # Written by Francois Fleuret <francois@fleuret.org>
7
# torch.backends.cuda.matmul.allow_tf32
9 # torch.autocast(torch.bfloat16)
10
11 import math, sys, argparse, time, tqdm, itertools, os
12
13 import torch, torchvision
14 from torch import nn
15 from torch.nn import functional as F
16
17 import mygpt, tensorstack
18
19 ######################################################################
20
# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Allow TF32 in CUDA matrix multiplications.
    torch.backends.cuda.matmul.allow_tf32 = True
26
27 ######################################################################
28
# Command-line interface of the training script. Every option has a
# default, so the script can be launched without any argument.
parser = argparse.ArgumentParser(
    description="An implementation of GPT with cache to solve a toy geometric reasoning task."
)

parser.add_argument("--task", type=str, default="picoclvr")

parser.add_argument("--log_filename", type=str, default="train.log")

parser.add_argument("--result_dir", type=str, default="results_default")

parser.add_argument("--seed", type=int, default=0)

parser.add_argument("--nb_epochs", type=int, default=25)

parser.add_argument("--batch_size", type=int, default=25)

parser.add_argument("--nb_train_samples", type=int, default=250000)

parser.add_argument("--nb_test_samples", type=int, default=10000)

parser.add_argument("--optim", type=str, default="adam")

parser.add_argument("--learning_rate", type=float, default=1e-4)

# Either "cos" or a comma-separated list of "<epoch>: <learning rate>".
parser.add_argument("--learning_rate_schedule", type=str, default="10: 2e-5,30: 4e-6")

parser.add_argument("--dim_model", type=int, default=512)

parser.add_argument("--dim_keys", type=int, default=64)

parser.add_argument("--dim_hidden", type=int, default=2048)

parser.add_argument("--nb_heads", type=int, default=8)

parser.add_argument("--nb_blocks", type=int, default=12)

parser.add_argument("--dropout", type=float, default=0.1)

parser.add_argument("--deterministic_synthesis", action="store_true", default=False)

parser.add_argument("--no_checkpoint", action="store_true", default=False)

parser.add_argument("--overwrite_results", action="store_true", default=False)

parser.add_argument("--checkpoint_name", type=str, default="checkpoint.pth")

##############################
# picoclvr options

parser.add_argument("--picoclvr_nb_colors", type=int, default=5)

parser.add_argument("--picoclvr_height", type=int, default=12)

parser.add_argument("--picoclvr_width", type=int, default=16)

# The historical spelling "picocvlr" is a typo. The correctly spelled
# option is now accepted too, and both write to the historical
# destination so that `args.picocvlr_prune_properties` keeps working.
parser.add_argument(
    "--picoclvr_prune_properties",
    "--picocvlr_prune_properties",
    dest="picocvlr_prune_properties",
    type=str,
    default="none",
    choices=["none", "train+eval", "eval"],
)

##############################
# Maze options

parser.add_argument("--maze_height", type=int, default=13)

parser.add_argument("--maze_width", type=int, default=21)

parser.add_argument("--maze_nb_walls", type=int, default=15)
94
95 ######################################################################
96
args = parser.parse_args()

# Validate with an explicit check: an `assert` would be stripped when
# Python runs with -O and the bad value would go unnoticed.
if args.picocvlr_prune_properties not in {"none", "train+eval", "eval"}:
    parser.error(
        "--picocvlr_prune_properties must be one of none, train+eval, eval "
        f"(got {args.picocvlr_prune_properties!r})"
    )

# Create the result directory (and its missing parents). An existing
# directory is only reused when --overwrite_results is given.
try:
    os.makedirs(args.result_dir)
except FileExistsError:
    if not args.overwrite_results:
        print(f"result directory {args.result_dir} already exists")
        sys.exit(1)

# Opened in append mode and kept open for the whole run; log_string()
# writes to it.
log_file = open(os.path.join(args.result_dir, args.log_filename), "a")

# A negative seed leaves the random generators unseeded.
if args.seed >= 0:
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False
    # torch.use_deterministic_algorithms(True)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
117
118 ######################################################################
119
120
def log_string(s):
    """Prefix `s` with a timestamp, append it to the log file and echo it.

    The line is written to the module-level `log_file` when that is not
    None, then printed on the standard output. Both are flushed right
    away.
    """
    stamped = time.strftime("%Y%m%d-%H:%M:%S ", time.localtime()) + s

    if log_file is not None:
        log_file.write(stamped + "\n")
        log_file.flush()

    print(stamped, flush=True)
130
131
# Record the complete configuration of the run at the top of the log.
for arg_name, arg_value in vars(args).items():
    log_string(f"args.{arg_name} {arg_value}")
134
135 ######################################################################
136
137
def masked_inplace_autoregression(
    model, batch_size, input, ar_mask, forbidden_tokens=None, device=torch.device("cpu")
):
    """Fill in place the positions of `input` flagged in `ar_mask`.

    `input` and `ar_mask` are processed by chunks of `batch_size` rows.
    For every chunk, the columns between the first and the last one
    having a non-zero mask entry are visited from left to right. At
    each of them a token is drawn from the model's logits (arg-max when
    `args.deterministic_synthesis` is set, sampling otherwise) and
    written where the mask is 1; where it is 0 the existing token is
    kept.

    Args:
        model: callable taking and returning a `mygpt.BracketedSequence`.
        batch_size: number of rows processed together.
        input: 2d integer tensor of tokens, modified in place.
        ar_mask: tensor of the same size as `input`, 1 where a token has
            to be generated and 0 elsewhere.
        forbidden_tokens: optional boolean mask over the vocabulary;
            flagged tokens get a logit of -inf and so are never produced.
        device: unused, kept for backward compatibility.

    Returns:
        None, the result is stored in `input`.
    """
    # split() returns views, so writing into a chunk updates `input`.
    for chunk, chunk_mask in zip(input.split(batch_size), ar_mask.split(batch_size)):
        columns = (chunk_mask.sum(0) > 0).nonzero()

        # Nothing to generate in this chunk; min()/max() of an empty
        # tensor would raise.
        if columns.numel() == 0:
            continue

        first, last = columns.min().item(), columns.max().item()

        if first > 0:
            model(
                mygpt.BracketedSequence(chunk, 0, first)
            )  # Needed to initialize the model's cache

        for s in range(first, last + 1):
            output = model(mygpt.BracketedSequence(chunk, s, 1)).x
            logits = output[:, s]
            if forbidden_tokens is not None:
                logits = logits.masked_fill(forbidden_tokens, float("-inf"))
            if args.deterministic_synthesis:
                t_next = logits.argmax(1)
            else:
                dist = torch.distributions.categorical.Categorical(logits=logits)
                t_next = dist.sample()
            chunk[:, s] = chunk_mask[:, s] * t_next + (1 - chunk_mask[:, s]) * chunk[:, s]
158
159
160 ######################################################################
161
162
class Task:
    """Common interface of the tasks; every method here is a no-op."""

    def batches(self, split="train"):
        """Placeholder for the iteration over the batches of `split`."""
        return None

    def vocabulary_size(self):
        """Placeholder for the number of distinct tokens of the task."""
        return None

    def produce_results(self, n_epoch, model):
        """Placeholder for the evaluation of `model` after epoch `n_epoch`."""
        return None
172
173
174 ######################################################################
175
176 import picoclvr
177
178
class TaskPicoCLVR(Task):
    """Task whose samples are token sequences generated by `picoclvr`.

    The descriptions returned by `picoclvr.generate` are split on spaces
    and mapped to integer ids. "<nul>" is the padding token and "<img>"
    marks the beginning of an image of `height * width` tokens.
    """

    def tensorize(self, descr):
        """Make a tensor of token ids from a list of strings.

        Shorter sequences are padded on the right with "<nul>".
        """
        token_descr = [s.strip().split(" ") for s in descr]
        max_len = max(len(s) for s in token_descr)
        token_descr = [s + ["<nul>"] * (max_len - len(s)) for s in token_descr]
        id_descr = [[self.token2id[u] for u in s] for s in token_descr]
        return torch.tensor(id_descr, device=self.device)

    def detensorize(self, x):
        """Make a list of strings from a 2d tensor of token ids."""
        return [" ".join(self.id2token[t.item()] for t in r) for r in x]

    def _trim_bounds(self, x, n):
        """Return (a, b) such that x[:, a:b] drops the outer columns full of `n`."""
        # x is padded with one column of n on each side so that both
        # bounds always exist. `i` counts, from left to right, the
        # columns that contain something other than n.
        i = (1 - (F.pad(x, (1, 1), value=n) == n).min(0).values.long()).cumsum(0)
        return (i == 0).nonzero().max(), (i == i.max()).nonzero().min()

    def trim(self, z, token="<nul>"):
        """Remove as many columns of `token` as possible on the left and right.

        If z is a tuple, the bounds are computed on its first tensor and
        all its elements are trimmed accordingly.
        """
        n = self.token2id[token]
        if isinstance(z, tuple):
            a, b = self._trim_bounds(z[0], n)
            return tuple(t[:, a:b] for t in z)
        a, b = self._trim_bounds(z, n)
        return z[:, a:b]

    ######################
    # Not the cleanest part of the code

    def excise_last_image(self, input):
        """Extract the last image of each sequence.

        The image starts at the last "<img>" (included). In a copy of
        `input`, all the tokens of that image are set to "<nul>".

        Returns:
            (input, loss_masks, images), where `loss_masks` is 0 from
            the last "<img>" onward and 1 before it. `input` and
            `loss_masks` are trimmed together, `images` separately.
        """
        t_img, t_nul = self.token2id["<img>"], self.token2id["<nul>"]
        nb_img_tokens = self.height * self.width + 1

        input = input.clone()
        t = (input == t_img).long()
        # 1 from the last <img> of each row to the end of the row
        tail_masks = (t.cumsum(dim=1) == t.sum(dim=1, keepdim=True)).long()
        i = (t * tail_masks).nonzero(as_tuple=True)
        # Indices of the nb_img_tokens positions starting at that <img>
        j = (
            i[0][:, None],
            i[1][:, None] + torch.arange(nb_img_tokens, device=input.device)[None, :],
        )
        images = self.trim(input[j])
        input[j] = t_nul
        loss_masks = 1 - tail_masks
        input, loss_masks = self.trim((input, loss_masks))
        return input, loss_masks, images

    def add_true_image(self, input, images, loss_masks):
        """Write `images` into `input` and set `loss_masks` to 1 there.

        The image is written from the position selected by
        `cumsum == 1` on the "<nul>" indicator, which is the first
        "<nul>" of the row provided "<nul>" only appears as trailing
        padding (NOTE(review): assumption, to be confirmed).
        """
        t_nul = self.token2id["<nul>"]
        nb_img_tokens = self.height * self.width + 1
        # Make room for a full image at the end of every row
        input = F.pad(input, (0, nb_img_tokens), value=t_nul)
        loss_masks = F.pad(loss_masks, (0, nb_img_tokens), value=0)
        t = (input == t_nul).long()
        i = (t.cumsum(dim=1) == 1).nonzero(as_tuple=True)
        j = (
            i[0][:, None],
            i[1][:, None] + torch.arange(nb_img_tokens, device=input.device)[None, :],
        )
        input[j] = images
        loss_masks[j] = 1
        input, loss_masks = self.trim((input, loss_masks))
        return input, loss_masks

    def add_generated_image(self, input, loss_masks, model):
        """Append "<img>" to every sequence and let `model` generate the image.

        The `height * width` tokens following the added "<img>" are
        generated with masked_inplace_autoregression(), with "<nul>"
        forbidden. The model is put in eval mode for the generation and
        its previous mode is restored afterward.

        Returns:
            (input, loss_masks), trimmed together. `loss_masks` is only
            padded with zeros, it is not set on the generated image.
        """
        t_img, t_nul = self.token2id["<img>"], self.token2id["<nul>"]
        nb_img_tokens = self.height * self.width + 1

        input = F.pad(input, (0, nb_img_tokens), value=t_nul)
        loss_masks = F.pad(loss_masks, (0, nb_img_tokens), value=0)
        t = (input == t_nul).long()
        i = (t.cumsum(dim=1) == 1).nonzero(as_tuple=True)
        input[i] = t_img

        # The positions to generate are those right after the <img>
        j = (
            i[0][:, None],
            i[1][:, None]
            + 1
            + torch.arange(nb_img_tokens - 1, device=input.device)[None, :],
        )
        ar_masks = input.new_zeros(input.size(), dtype=torch.int64)
        ar_masks[j] = 1
        forbidden_tokens = (
            torch.arange(self.vocabulary_size(), device=input.device) == t_nul
        )
        with torch.autograd.no_grad():
            t = model.training
            model.eval()
            masked_inplace_autoregression(
                model,
                self.batch_size,
                input,
                ar_masks,
                forbidden_tokens,
                device=self.device,
            )
            model.train(t)

        input, loss_masks = self.trim((input, loss_masks))

        return input, loss_masks

    ######################

    def __init__(
        self,
        nb_train_samples,
        nb_test_samples,
        batch_size,
        height,
        width,
        nb_colors=5,
        device=torch.device("cpu"),
        pruner_train=None,
        pruner_eval=None,
    ):
        """Generate the train and test descriptions and tokenize them.

        Args:
            nb_train_samples: number of training descriptions.
            nb_test_samples: number of test descriptions.
            batch_size: number of sequences per batch.
            height, width: size of the images, in tokens.
            nb_colors: forwarded to `picoclvr.generate`.
            device: device of the tokenized tensors.
            pruner_train: pruner forwarded to `picoclvr.generate` for
                the training set only; the test set is never pruned.
            pruner_eval: pruner used by produce_results() for an
                additional evaluation.
        """
        self.height = height
        self.width = width
        self.batch_size = batch_size
        self.device = device
        self.pruner_train = pruner_train
        self.pruner_eval = pruner_eval

        def generate_descr(nb, pruner):
            return picoclvr.generate(
                nb,
                height=self.height,
                width=self.width,
                nb_colors=nb_colors,
                pruner=pruner,
            )

        log_string(
            f"generating {nb_train_samples+nb_test_samples} samples (can take some time)"
        )
        self.train_descr = generate_descr(nb_train_samples, pruner=self.pruner_train)
        self.test_descr = generate_descr(nb_test_samples, pruner=None)

        # Build the tokenizer
        tokens = {"<nul>", "<img>"}
        for d in (self.train_descr, self.test_descr):
            for s in d:
                tokens.update(s.strip().split(" "))
        # make this set a sorted list to get the same tensors given
        # the same descr
        tokens = sorted(tokens)
        self.token2id = {t: n for n, t in enumerate(tokens)}
        self.id2token = dict(enumerate(tokens))

        # Tokenize the train and test sets
        self.train_input = self.tensorize(self.train_descr)
        self.test_input = self.tensorize(self.test_descr)

    def batches(self, split="train"):
        """Yield the trimmed batches of the "train" or "test" split."""
        assert split in {"train", "test"}
        input = self.train_input if split == "train" else self.test_input
        for batch in tqdm.tqdm(
            input.split(self.batch_size), dynamic_ncols=True, desc=f"epoch-{split}"
        ):
            yield self.trim(batch)

    def vocabulary_size(self):
        """Return the number of distinct tokens."""
        return len(self.token2id)

    def _log_property_stats(
        self, prefix, n_epoch, nb_results, nb_requested_properties, nb_missing_properties
    ):
        """Log the number of samples and the requested / missing properties."""
        # Without any requested property the rate is undefined; log nan
        # instead of failing with a ZeroDivisionError.
        if nb_requested_properties:
            miss_rate = 100 * nb_missing_properties / nb_requested_properties
        else:
            miss_rate = float("nan")

        log_string(f"nb_{prefix}samples {n_epoch} {nb_results}")
        log_string(
            f"property_{prefix}nb {n_epoch} requested {nb_requested_properties} missing {nb_missing_properties}"
        )
        log_string(f"property_{prefix}miss {n_epoch} {miss_rate:.02f}%")

    def compute_missing_properties(self, n_epoch, model, pruner=None):
        """Log how many requested properties the generated test images miss.

        For every test sequence the last image is removed and generated
        again by `model`; the result is scored with
        `picoclvr.nb_properties`. The log entries are prefixed with
        "pruned_" when a `pruner` is given.
        """
        acc_nb_requested_properties = []
        acc_nb_missing_properties = []
        acc_nb_results = 0

        for input in tqdm.tqdm(
            self.test_input.split(self.batch_size),
            dynamic_ncols=True,
            desc="test-properties",
        ):
            tape, loss_masks, _ = self.excise_last_image(input)
            tape, loss_masks = self.add_generated_image(tape, loss_masks, model)
            result_descr = self.detensorize(tape)
            properties = picoclvr.nb_properties(
                result_descr,
                height=self.height,
                width=self.width,
                pruner=pruner,
            )
            nb_requested_properties, _, nb_missing_properties = zip(*properties)
            acc_nb_requested_properties += nb_requested_properties
            acc_nb_missing_properties += nb_missing_properties
            acc_nb_results += len(result_descr)

        self._log_property_stats(
            "" if pruner is None else "pruned_",
            n_epoch,
            acc_nb_results,
            sum(acc_nb_requested_properties),
            sum(acc_nb_missing_properties),
        )

    ######################################################################

    def produce_results(self, n_epoch, model):
        """Log the property statistics and save images generated by `model`.

        The statistics are computed on the test set, again with
        `pruner_eval` when it is set, and finally on a few hand-written
        primers whose generated images are written to
        `picoclvr_result_<n_epoch>.png` in the result directory.
        """
        self.compute_missing_properties(n_epoch, model)

        if self.pruner_eval is not None:
            self.compute_missing_properties(n_epoch, model, self.pruner_eval)

        nb_per_primer = 8
        primer = []

        for primer_descr in [
            "red above green <sep> green top <sep> blue right of red",
            "there is red <sep> there is yellow <sep> there is blue",
            "red below yellow <sep> yellow below green <sep> green below blue <sep> red right <sep> yellow left <sep> green right <sep> blue left",
            "green bottom <sep> yellow bottom <sep> green left of blue <sep> yellow right of blue <sep> blue top",
        ]:
            primer += [primer_descr] * nb_per_primer

        tape = self.tensorize(primer)
        loss_masks = 1 - (tape == self.token2id["<nul>"]).long()
        tape, loss_masks = self.add_generated_image(tape, loss_masks, model)
        result_descr = self.detensorize(tape)

        properties = picoclvr.nb_properties(
            result_descr, height=self.height, width=self.width
        )
        acc_nb_requested_properties, _, acc_nb_missing_properties = zip(*properties)

        self._log_property_stats(
            "demo_",
            n_epoch,
            len(result_descr),
            sum(acc_nb_requested_properties),
            sum(acc_nb_missing_properties),
        )

        img = picoclvr.descr2img(result_descr, height=self.height, width=self.width)

        # A 5d result holds several images per description: keep the
        # single one, or tile them into one grid per description.
        if img.dim() == 5:
            if img.size(1) == 1:
                img = F.pad(img.squeeze(1), pad=(1, 1, 1, 1), value=64)
            else:
                img = torch.cat(
                    [
                        torchvision.utils.make_grid(x, padding=1, pad_value=64)[None]
                        for x in img
                    ],
                    0,
                )

        image_name = os.path.join(args.result_dir, f"picoclvr_result_{n_epoch:04d}.png")
        torchvision.utils.save_image(
            img / 255.0, image_name, nrow=nb_per_primer, padding=1, pad_value=1.0
        )
        log_string(f"wrote {image_name}")
459
460
461 ######################################################################
462
463
class TaskMNIST(Task):
    """Task whose sequences are the 28x28 MNIST images, one token per pixel."""

    def __init__(self, batch_size, device=torch.device("cpu")):
        """Store the batch size and the device used for generation."""
        self.device = device
        self.batch_size = batch_size

    def batches(self, split="train"):
        """Yield batches of flattened images of the "train" or "test" split.

        The split is limited to its first `args.nb_train_samples`
        (train) or `args.nb_test_samples` (test) images. The batches
        are not moved to `self.device`.
        """
        assert split in {"train", "test"}
        data_set = torchvision.datasets.MNIST(
            root="./data", train=(split == "train"), download=True
        )
        data_input = data_set.data.view(-1, 28 * 28).long()
        # Each split is truncated with its own sample count.
        nb_samples = args.nb_train_samples if split == "train" else args.nb_test_samples
        if nb_samples is not None:
            data_input = data_input[:nb_samples]
        for batch in tqdm.tqdm(
            data_input.split(self.batch_size), desc=f"epoch-{split}"
        ):
            yield batch

    def vocabulary_size(self):
        """Return the number of pixel intensities."""
        return 256

    def produce_results(self, n_epoch, model):
        """Generate 64 images with `model` and save them as a PNG file."""
        # Start from zeros rather than uninitialised memory, which could
        # hold values that are not valid tokens.
        results = torch.zeros(64, 28 * 28, device=self.device, dtype=torch.int64)
        ar_mask = torch.full_like(results, 1)
        # Same protocol as the other tasks: no gradient, eval mode, and
        # the previous mode restored afterward.
        with torch.autograd.no_grad():
            t = model.training
            model.eval()
            masked_inplace_autoregression(
                model, self.batch_size, results, ar_mask, device=self.device
            )
            model.train(t)
        image_name = os.path.join(args.result_dir, f"result_mnist_{n_epoch:04d}.png")
        torchvision.utils.save_image(
            1 - results.reshape(-1, 1, 28, 28) / 255.0,
            image_name,
            nrow=16,
            pad_value=0.8,
        )
        log_string(f"wrote {image_name}")
499
500
501 ######################################################################
502
503 import maze
504
505
class TaskMaze(Task):
    """Task whose sequences are a flattened maze followed by a flattened path."""

    def map2seq(self, *m):
        """Flatten every map of `m` and concatenate them along dimension 1."""
        return torch.cat([x.flatten(1) for x in m], 1)

    def seq2map(self, s):
        """Split sequences back into maps of size height x width.

        Returns a generator over the maps, in the order they were given
        to map2seq().
        """
        s = s.reshape(s.size(0), -1, self.height, self.width)
        return (s[:, k] for k in range(s.size(1)))

    def __init__(
        self,
        nb_train_samples,
        nb_test_samples,
        batch_size,
        height,
        width,
        nb_walls,
        device=torch.device("cpu"),
    ):
        """Create the train and test mazes with `maze.create_maze_data`.

        Args:
            nb_train_samples: number of training mazes.
            nb_test_samples: number of test mazes.
            batch_size: number of sequences per batch.
            height, width: size of the mazes.
            nb_walls: forwarded to `maze.create_maze_data`.
            device: device on which the sequences are stored.
        """
        self.batch_size = batch_size
        self.height = height
        self.width = width
        self.device = device

        train_mazes, train_paths, _ = maze.create_maze_data(
            nb_train_samples,
            height=height,
            width=width,
            nb_walls=nb_walls,
            progress_bar=lambda x: tqdm.tqdm(x, dynamic_ncols=True, desc="data-train"),
        )
        self.train_input = self.map2seq(train_mazes.to(device), train_paths.to(device))

        test_mazes, test_paths, _ = maze.create_maze_data(
            nb_test_samples,
            height=height,
            width=width,
            nb_walls=nb_walls,
            progress_bar=lambda x: tqdm.tqdm(x, dynamic_ncols=True, desc="data-test"),
        )
        self.test_input = self.map2seq(test_mazes.to(device), test_paths.to(device))

        # Stored as a plain int so that vocabulary_size() does not
        # return a tensor.
        self.nb_codes = (
            max(self.train_input.max().item(), self.test_input.max().item()) + 1
        )

    def batches(self, split="train", nb_to_use=-1, desc=None):
        """Yield the batches of the "train" or "test" split.

        Args:
            split: "train" or "test".
            nb_to_use: if positive, only the first `nb_to_use` sequences
                are used.
            desc: label of the progress bar, "epoch-<split>" by default.
        """
        assert split in {"train", "test"}
        input = self.train_input if split == "train" else self.test_input
        if nb_to_use > 0:
            input = input[:nb_to_use]
        if desc is None:
            desc = f"epoch-{split}"
        for batch in tqdm.tqdm(
            input.split(self.batch_size), dynamic_ncols=True, desc=desc
        ):
            yield batch

    def vocabulary_size(self):
        """Return the number of distinct codes of the sequences."""
        return self.nb_codes

    def compute_error(self, model, split="train", nb_to_use=-1):
        """Count the correct paths generated by `model` on a split.

        The path part of every sequence is erased and generated again
        from the maze part, then checked with `maze.path_correctness`.

        Returns:
            (nb_total, nb_correct) as Python ints.
        """
        nb_total, nb_correct = 0, 0
        # Iterate over this task's own data, not over the global `task`.
        for input in self.batches(split, nb_to_use):
            result = input.clone()
            ar_mask = result.new_zeros(result.size())
            # Everything after the maze is to be generated
            ar_mask[:, self.height * self.width :] = 1
            result *= 1 - ar_mask
            masked_inplace_autoregression(
                model, self.batch_size, result, ar_mask, device=self.device
            )
            mazes, paths = self.seq2map(result)
            # item() keeps the count a plain int, so it is not logged
            # as "tensor(...)".
            nb_correct += maze.path_correctness(mazes, paths).long().sum().item()
            nb_total += mazes.size(0)

        return nb_total, nb_correct

    def produce_results(self, n_epoch, model):
        """Log the train / test accuracies and save 48 predicted test paths.

        The accuracies are computed on at most 1000 sequences of each
        split. The model is put in eval mode and its previous mode is
        restored at the end.
        """
        with torch.autograd.no_grad():
            t = model.training
            model.eval()

            train_nb_total, train_nb_correct = self.compute_error(
                model, "train", nb_to_use=1000
            )
            log_string(
                f"accuracy_train nb_total {train_nb_total} nb_correct {train_nb_correct} accuracy {(100.0*train_nb_correct)/train_nb_total:.02f}%"
            )

            test_nb_total, test_nb_correct = self.compute_error(
                model, "test", nb_to_use=1000
            )
            log_string(
                f"accuracy_test nb_total {test_nb_total} nb_correct {test_nb_correct} accuracy {(100.0*test_nb_correct)/test_nb_total:.02f}%"
            )

            input = self.test_input[:48]
            result = input.clone()
            ar_mask = result.new_zeros(result.size())
            ar_mask[:, self.height * self.width :] = 1
            result *= 1 - ar_mask
            masked_inplace_autoregression(
                model, self.batch_size, result, ar_mask, device=self.device
            )

            mazes, paths = self.seq2map(input)
            _, predicted_paths = self.seq2map(result)

            filename = os.path.join(args.result_dir, f"result_{n_epoch:04d}.png")
            maze.save_image(
                filename,
                mazes=mazes,
                target_paths=paths,
                predicted_paths=predicted_paths,
                path_correct=maze.path_correctness(mazes, predicted_paths),
            )
            log_string(f"wrote {filename}")

            model.train(t)
620
621             model.train(t)
622
623
624 ######################################################################
625
class TaskSnake(Task):
    """Unfinished task: its train and test sequences are not generated yet.

    NOTE(review): `train_input` and `test_input` are never assigned, so
    the constructor currently raises AttributeError. The class also
    inherits the no-op vocabulary_size() and produce_results() of Task.
    """

    def __init__(
        self,
        nb_train_samples,
        nb_test_samples,
        batch_size,
        height,
        width,
        nb_walls,
        device=torch.device("cpu"),
    ):
        """Store the geometry of the task.

        `nb_train_samples`, `nb_test_samples` and `nb_walls` are not
        used yet.
        """
        self.batch_size = batch_size
        self.height = height
        self.width = width
        self.device = device

        # TODO: generate the data
        # self.train_input =
        # self.test_input =

        # The number of codes has to cover both splits (the test split
        # used to be ignored).
        self.nb_codes = (
            max(self.train_input.max().item(), self.test_input.max().item()) + 1
        )

    def batches(self, split="train", nb_to_use=-1, desc=None):
        """Yield the batches of the "train" or "test" split.

        Args:
            split: "train" or "test".
            nb_to_use: if positive, only the first `nb_to_use` sequences
                are used.
            desc: label of the progress bar, "epoch-<split>" by default.
        """
        assert split in {"train", "test"}
        input = self.train_input if split == "train" else self.test_input
        if nb_to_use > 0:
            input = input[:nb_to_use]
        if desc is None:
            desc = f"epoch-{split}"
        for batch in tqdm.tqdm(
            input.split(self.batch_size), dynamic_ncols=True, desc=desc
        ):
            yield batch
658
659
660 ######################################################################
661
662
def picoclvr_pruner_horizontal_green(p):
    """Return False iff `p` contains "green" together with "left" or "right"."""
    if "green" not in p:
        return True
    return "left" not in p and "right" not in p
665
666
# Pruner used to generate the training set: only with "train+eval".
if args.picocvlr_prune_properties in {"train+eval"}:
    picoclvr_pruner_train = picoclvr_pruner_horizontal_green
else:
    picoclvr_pruner_train = None

# Pruner used for the additional evaluation: the negation of the
# training pruner, with "train+eval" and with "eval".
if args.picocvlr_prune_properties in {"train+eval", "eval"}:

    def picoclvr_pruner_eval(p):
        return not picoclvr_pruner_horizontal_green(p)

else:
    picoclvr_pruner_eval = None
678
679 ######################################################################
680
# One constructor per task name. They are wrapped in lambdas so that
# only the selected task is built.
_task_builders = {
    "picoclvr": lambda: TaskPicoCLVR(
        nb_train_samples=args.nb_train_samples,
        nb_test_samples=args.nb_test_samples,
        batch_size=args.batch_size,
        height=args.picoclvr_height,
        width=args.picoclvr_width,
        nb_colors=args.picoclvr_nb_colors,
        device=device,
        pruner_train=picoclvr_pruner_train,
        pruner_eval=picoclvr_pruner_eval,
    ),
    "mnist": lambda: TaskMNIST(
        batch_size=args.batch_size,
        device=device,
    ),
    "maze": lambda: TaskMaze(
        nb_train_samples=args.nb_train_samples,
        nb_test_samples=args.nb_test_samples,
        batch_size=args.batch_size,
        height=args.maze_height,
        width=args.maze_width,
        nb_walls=args.maze_nb_walls,
        device=device,
    ),
}

if args.task not in _task_builders:
    raise ValueError(f"Unknown task {args.task}")

task = _task_builders[args.task]()
713
714 ######################################################################
715
log_string(f"device {device}")

vocabulary_size = task.vocabulary_size()
log_string(f"vocabulary_size {vocabulary_size}")

##############################

# The architecture comes straight from the command line; the model is
# always causal.
_model_kwargs = {
    "vocabulary_size": vocabulary_size,
    "dim_model": args.dim_model,
    "dim_keys": args.dim_keys,
    "dim_hidden": args.dim_hidden,
    "nb_heads": args.nb_heads,
    "nb_blocks": args.nb_blocks,
    "causal": True,
    "dropout": args.dropout,
}

model = mygpt.MyGPT(**_model_kwargs)
model.to(device)

nb_parameters = sum(w.numel() for w in model.parameters())
log_string(f"nb_parameters {nb_parameters} ({int(nb_parameters/1e6)}M)")
739
740 ######################################################################
741
# Number of epochs already done, updated if a checkpoint is loaded.
nb_epochs_finished = 0

if args.no_checkpoint:
    log_string("not trying to load checkpoint.")

else:
    checkpoint_name = os.path.join(args.result_dir, args.checkpoint_name)

    try:
        # Load on the CPU so that a checkpoint written on a GPU can be
        # read on a machine without one: load_state_dict() copies the
        # values into the parameters wherever they live, and the rng
        # states are set from CPU tensors.
        checkpoint = torch.load(checkpoint_name, map_location="cpu")
        nb_epochs_finished = checkpoint["nb_epochs_finished"]
        model.load_state_dict(checkpoint["model_state"])
        torch.set_rng_state(checkpoint["rng_state"])
        # A checkpoint written without CUDA has no CUDA rng state.
        if torch.cuda.is_available() and "cuda_rng_state" in checkpoint:
            torch.cuda.set_rng_state(checkpoint["cuda_rng_state"])

        log_string(f"checkpoint loaded with {nb_epochs_finished} epochs finished.")

    except FileNotFoundError:
        log_string("starting from scratch.")

    except Exception as e:
        # Any other failure aborts the run, with its cause in the log.
        # KeyboardInterrupt and SystemExit are deliberately not caught.
        log_string(f"error when loading the checkpoint ({e!r}).")
        sys.exit(1)
765
766 ######################################################################
767
# A non-positive --nb_epochs selects the default of the option. The
# name `nb_epochs_default` used here before was never defined.
nb_epochs = args.nb_epochs if args.nb_epochs > 0 else parser.get_default("nb_epochs")

# Perplexity of the empirical distribution of the tokens of the
# training set. It is logged at every epoch next to the perplexities of
# the model.
token_count = 0
for batch in task.batches(split="train"):
    token_count += F.one_hot(batch, num_classes=vocabulary_size).sum((0, 1))
token_probas = token_count / token_count.sum()
# xlogy() returns 0 where the probability is 0
entropy = -torch.xlogy(token_probas, token_probas).sum()
train_set_perplexity = math.exp(entropy)
776
777 ##############################
778
# Build the learning rate of every epoch. The schedule covers
# `nb_epochs`, the range the training loop iterates over.
if args.learning_rate_schedule == "cos":
    # Half a cosine period, from args.learning_rate down toward 0
    learning_rate_schedule = {
        n_epoch: args.learning_rate
        * 0.5
        * (1 + math.cos(n_epoch / nb_epochs * math.pi))
        for n_epoch in range(nb_epochs)
    }
else:
    # Explicit schedule "<epoch>: <learning rate>,...". Empty items
    # (empty string, trailing comma) are ignored.
    u = {}
    for item in args.learning_rate_schedule.split(","):
        if not item.strip():
            continue
        k, v = item.split(":")
        u[int(k)] = float(v)

    # The rate starts at args.learning_rate and changes at the listed
    # epochs.
    learning_rate_schedule = {}
    learning_rate = args.learning_rate
    for n_epoch in range(nb_epochs):
        if n_epoch in u:
            learning_rate = u[n_epoch]
        learning_rate_schedule[n_epoch] = learning_rate

log_string(f"learning_rate_schedule {learning_rate_schedule}")
800
801 ##############################
802
nb_samples_seen = 0

# All the epochs were done in a previous run: only produce the results
# of the loaded model.
if nb_epochs_finished >= nb_epochs:
    task.produce_results(nb_epochs_finished, model)

_optimizer_classes = {
    "sgd": torch.optim.SGD,
    "adam": torch.optim.Adam,
    "adamw": torch.optim.AdamW,
}

for n_epoch in range(nb_epochs_finished, nb_epochs):
    learning_rate = learning_rate_schedule[n_epoch]

    log_string(f"learning_rate {learning_rate}")

    # A new optimizer is created at every epoch with the learning rate
    # of that epoch.
    if args.optim not in _optimizer_classes:
        raise ValueError(f"Unknown optimizer {args.optim}.")
    optimizer = _optimizer_classes[args.optim](model.parameters(), lr=learning_rate)

    ##############################
    # Training pass

    model.train()

    nb_train_samples, acc_train_loss = 0, 0.0

    for batch in task.batches(split="train"):
        batch = batch.to(device)
        prediction = model(mygpt.BracketedSequence(batch)).x
        loss = F.cross_entropy(prediction.transpose(1, 2), batch)
        batch_len = batch.size(0)
        acc_train_loss += loss.item() * batch_len
        nb_train_samples += batch_len
        nb_samples_seen += batch_len

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    ##############################
    # Test pass and results

    with torch.autograd.no_grad():
        model.eval()

        nb_test_samples, acc_test_loss = 0, 0.0

        for batch in task.batches(split="test"):
            batch = batch.to(device)

            # input, loss_masks, true_images = task.excise_last_image(input)
            # input, loss_masks = task.add_true_image(input, true_images, loss_masks)

            prediction = model(mygpt.BracketedSequence(batch)).x
            loss = F.cross_entropy(prediction.transpose(1, 2), batch)
            acc_test_loss += loss.item() * batch.size(0)
            nb_test_samples += batch.size(0)

        # The mean loss is capped at 100 before the exponential
        mean_train_loss = acc_train_loss / nb_train_samples
        mean_test_loss = acc_test_loss / nb_test_samples
        train_perplexity = math.exp(min(100, mean_train_loss))
        test_perplexity = math.exp(min(100, mean_test_loss))

        log_string(
            f"perplexity {n_epoch} train_set {train_set_perplexity} train_prediction {train_perplexity} test_prediction {test_perplexity}"
        )

        task.produce_results(n_epoch, model)

    ##############################
    # Checkpoint: model and random generator states

    checkpoint = {}
    checkpoint["nb_epochs_finished"] = n_epoch + 1
    checkpoint["model_state"] = model.state_dict()
    checkpoint["rng_state"] = torch.get_rng_state()

    if torch.cuda.is_available():
        checkpoint["cuda_rng_state"] = torch.cuda.get_rng_state()

    checkpoint_name = os.path.join(args.result_dir, args.checkpoint_name)
    torch.save(checkpoint, checkpoint_name)
    log_string(f"saved checkpoint {checkpoint_name}")
875
876 ######################################################################