Update.
[picoclvr.git] / main.py
1 #!/usr/bin/env python
2
3 # Any copyright is dedicated to the Public Domain.
4 # https://creativecommons.org/publicdomain/zero/1.0/
5
6 # Written by Francois Fleuret <francois@fleuret.org>
7
# torch.backends.cuda.matmul.allow_tf32
9 # torch.autocast(torch.bfloat16)
10
11 import math, sys, argparse, time, tqdm, itertools, os
12
13 import torch, torchvision
14 from torch import nn
15 from torch.nn import functional as F
16
17 import mygpt, tensorstack
18
19 ######################################################################
20
# Select the compute device once for the whole script: the GPU when
# CUDA is available, the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # TF32 matrix products are only switched on in the CUDA case
    torch.backends.cuda.matmul.allow_tf32 = True
26
27 ######################################################################
28
# Command-line interface of the script. Options with a closed set of
# valid values declare it through `choices`, so that a typo is rejected
# when the arguments are parsed instead of much later in the run.
parser = argparse.ArgumentParser(
    description="An implementation of GPT with cache to solve a toy geometric reasoning task."
)

parser.add_argument(
    "--task",
    type=str,
    default="picoclvr",
    choices=["picoclvr", "mnist", "maze"],
    help="task to train on",
)

parser.add_argument(
    "--log_filename",
    type=str,
    default="train.log",
    help="name of the log file, created inside the result directory",
)

parser.add_argument(
    "--result_dir",
    type=str,
    default="results_default",
    help="directory receiving the log, the checkpoint and the result images",
)

parser.add_argument(
    "--seed", type=int, default=0, help="random seed, ignored if negative"
)

parser.add_argument("--nb_epochs", type=int, default=25)

parser.add_argument("--batch_size", type=int, default=25)

parser.add_argument("--nb_train_samples", type=int, default=250000)

parser.add_argument("--nb_test_samples", type=int, default=10000)

parser.add_argument(
    "--optim", type=str, default="adam", choices=["sgd", "adam", "adamw"]
)

parser.add_argument("--learning_rate", type=float, default=1e-4)

parser.add_argument(
    "--learning_rate_schedule",
    type=str,
    default="10: 2e-5,30: 4e-6",
    help='either "cos", or a comma-separated list of "epoch: learning_rate" pairs',
)

parser.add_argument("--dim_model", type=int, default=512)

parser.add_argument("--dim_keys", type=int, default=64)

parser.add_argument("--dim_hidden", type=int, default=2048)

parser.add_argument("--nb_heads", type=int, default=8)

parser.add_argument("--nb_blocks", type=int, default=12)

parser.add_argument("--dropout", type=float, default=0.1)

parser.add_argument(
    "--deterministic_synthesis",
    action="store_true",
    default=False,
    help="generate with the most likely token instead of sampling",
)

parser.add_argument(
    "--no_checkpoint",
    action="store_true",
    default=False,
    help="do not try to load an existing checkpoint",
)

parser.add_argument(
    "--overwrite_results",
    action="store_true",
    default=False,
    help="proceed even if the result directory already exists",
)

parser.add_argument("--checkpoint_name", type=str, default="checkpoint.pth")

##############################
# picoclvr options

parser.add_argument("--picoclvr_nb_colors", type=int, default=5)

parser.add_argument("--picoclvr_height", type=int, default=12)

parser.add_argument("--picoclvr_width", type=int, default=16)

# NOTE: the option name is spelled "picocvlr"; it is kept as is because
# the rest of the script reads args.picocvlr_prune_properties
parser.add_argument(
    "--picocvlr_prune_properties",
    type=str,
    default="none",
    choices=["none", "train+eval", "eval"],
)

##############################
# Maze options

parser.add_argument("--maze_height", type=int, default=13)

parser.add_argument("--maze_width", type=int, default=21)

parser.add_argument("--maze_nb_walls", type=int, default=15)
94
95 ######################################################################
96
args = parser.parse_args()

# Explicit check instead of an assert, which "python -O" would strip
if args.picocvlr_prune_properties not in {"none", "train+eval", "eval"}:
    parser.error(
        f"invalid --picocvlr_prune_properties {args.picocvlr_prune_properties!r}"
    )

# Create the result directory (with its parents if needed), and refuse
# to reuse an existing one unless explicitly allowed
try:
    os.makedirs(args.result_dir)
except FileExistsError:
    if not args.overwrite_results:
        print(f"result directory {args.result_dir} already exists")
        sys.exit(1)

# Opened in append mode so that a resumed run extends the previous
# log. The handle is kept open for the whole run and used by
# log_string()
log_file = open(os.path.join(args.result_dir, args.log_filename), "a")

# A negative seed leaves the random generators unseeded
if args.seed >= 0:
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False
    # torch.use_deterministic_algorithms(True)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
117
118 ######################################################################
119
120
def log_string(s):
    """Write the time-stamped message s to the log file and to stdout.

    The log file is skipped when the module-level log_file is None.
    Both streams are flushed right away.
    """
    stamped = time.strftime("%Y%m%d-%H:%M:%S ", time.localtime()) + s

    if log_file is not None:
        print(stamped, file=log_file, flush=True)

    print(stamped, flush=True)
130
131
# Keep a trace of every command-line argument in the log
for name, value in vars(args).items():
    log_string(f"args.{name} {value}")
134
135 ######################################################################
136
137
def masked_inplace_autoregression(
    model, batch_size, input, ar_mask, forbidden_tokens=None, device=torch.device("cpu")
):
    """Generate in place the tokens of input flagged by ar_mask.

    input and ar_mask are processed by chunks of batch_size rows. For
    each chunk, the positions go from the first to the last column
    where at least one row of the mask is set. At each position the
    model is evaluated, a token is picked, and it is written only in
    the rows whose mask is 1 there.

    The token is the arg max of the logits if
    args.deterministic_synthesis is set, and is sampled from them
    otherwise. If forbidden_tokens is not None, the logits it flags are
    set to -inf beforehand.

    The chunks are produced by split(), which returns views, so the
    tensor of the caller is modified. Nothing is returned.

    The device argument is not used, it is kept for the callers that
    pass it.
    """
    for chunk, chunk_mask in zip(input.split(batch_size), ar_mask.split(batch_size)):
        positions = (chunk_mask.sum(0) > 0).nonzero()

        # Nothing to generate in this chunk, and min() / max() of an
        # empty tensor would raise
        if positions.numel() == 0:
            continue

        first, last = positions.min().item(), positions.max().item()

        if first > 0:
            model(
                mygpt.BracketedSequence(chunk, 0, first)
            )  # Needed to initialize the model's cache

        for s in range(first, last + 1):
            output = model(mygpt.BracketedSequence(chunk, s, 1)).x
            logits = output[:, s]
            if forbidden_tokens is not None:
                logits = logits.masked_fill(forbidden_tokens, float("-inf"))
            if args.deterministic_synthesis:
                t_next = logits.argmax(1)
            else:
                dist = torch.distributions.categorical.Categorical(logits=logits)
                t_next = dist.sample()
            # Keep the original token wherever the mask is 0
            chunk[:, s] = chunk_mask[:, s] * t_next + (1 - chunk_mask[:, s]) * chunk[:, s]
158
159
160 ######################################################################
161
162
class Task:
    """Common interface of the tasks defined in this script.

    The three methods are placeholders which do nothing and return
    None, the task classes redefine them.
    """

    def batches(self, split="train"):
        """Placeholder for the iteration over the batches of a split."""
        return None

    def vocabulary_size(self):
        """Placeholder for the number of distinct token values."""
        return None

    def produce_results(self, n_epoch, model):
        """Placeholder for the evaluation of model after epoch n_epoch."""
        return None
172
173
174 ######################################################################
175
176 import picoclvr
177
178
179 class TaskPicoCLVR(Task):
180     # Make a tensor from a list of strings
181     def tensorize(self, descr):
182         token_descr = [s.strip().split(" ") for s in descr]
183         l = max([len(s) for s in token_descr])
184         token_descr = [s + ["<nul>"] * (l - len(s)) for s in token_descr]
185         id_descr = [[self.token2id[u] for u in s] for s in token_descr]
186         return torch.tensor(id_descr, device=self.device)
187
188     # Make a list of strings from a tensor
189     def detensorize(self, x):
190         return [" ".join([self.id2token[t.item()] for t in r]) for r in x]
191
    # Trim as many columns made only of the given token as possible
    # from the left and the right of a tensor. If z is a tuple, all its
    # elements are trimmed according to the trimming of the first one
195     def trim(self, z, token="<nul>"):
196         n = self.token2id[token]
197         if type(z) == tuple:
198             x = z[0]
199             i = (1 - (F.pad(x, (1, 1), value=n) == n).min(0).values.long()).cumsum(0)
200             a, b = (i == 0).nonzero().max(), (i == i.max()).nonzero().min()
201             return tuple([t[:, a:b] for t in z])
202         else:
203             i = (1 - (F.pad(z, (1, 1), value=n) == n).min(0).values.long()).cumsum(0)
204             a, b = (i == 0).nonzero().max(), (i == i.max()).nonzero().min()
205             return z[:, a:b]
206
207     ######################
208     # Not the cleanest part of the code
209
210     # Extract the last image of each sequence, from the last <img>
211     # included, and set to <nul> all the tokens from the beginning of
212     # that image to the end
    def excise_last_image(self, input):
        """Cut the last image out of each sequence of input.

        Works on a copy of input. Returns a triplet
        (input, loss_masks, images) where the tokens of the last image
        have been replaced with <nul> in input, loss_masks is 1 on the
        positions located before the last <img> and 0 from it on, and
        images holds the excised tokens. input and loss_masks are
        trimmed together, images is trimmed on its own.
        """
        t_img, t_nul = self.token2id["<img>"], self.token2id["<nul>"]
        # One <img> token followed by one token per pixel
        nb_img_tokens = self.height * self.width + 1

        input = input.clone()
        t = (input == t_img).long()
        # 1 from the last <img> of each row to the end of the row: these
        # are the positions where all the <img> of the row have been seen
        tail_masks = (t.cumsum(dim=1) == t.sum(dim=1, keepdim=True)).long()
        # Row and column indexes of the last <img> of each row
        i = (t * tail_masks).nonzero(as_tuple=True)
        # Indexes of the nb_img_tokens positions starting at that <img>
        # NOTE(review): assumes every row contains an <img> followed by
        # at least nb_img_tokens - 1 positions -- confirm
        j = (
            i[0][:, None],
            i[1][:, None] + torch.arange(nb_img_tokens, device=input.device)[None, :],
        )
        images = self.trim(input[j])
        input[j] = t_nul
        loss_masks = 1 - tail_masks
        input, loss_masks = self.trim((input, loss_masks))
        return input, loss_masks, images
230
    def add_true_image(self, input, images, loss_masks):
        """Write images after the content of each sequence of input.

        input and loss_masks are extended on the right by the length of
        an image, images is written starting at the first <nul> of each
        row, and loss_masks is set to 1 on these positions. Returns the
        pair (input, loss_masks), trimmed together.
        """
        t_nul = self.token2id["<nul>"]
        # One <img> token followed by one token per pixel
        nb_img_tokens = self.height * self.width + 1
        # Make room on the right for a full image in every row
        input = F.pad(input, (0, nb_img_tokens), value=t_nul)
        loss_masks = F.pad(loss_masks, (0, nb_img_tokens), value=0)
        t = (input == t_nul).long()
        # Positions where exactly one <nul> has been seen so far
        # NOTE(review): this is a single position per row, the first
        # <nul>, only if no <nul> is followed by another token --
        # confirm against the callers
        i = (t.cumsum(dim=1) == 1).nonzero(as_tuple=True)
        j = (
            i[0][:, None],
            i[1][:, None] + torch.arange(nb_img_tokens, device=input.device)[None, :],
        )
        # NOTE(review): assumes images has nb_img_tokens columns -- confirm
        input[j] = images
        loss_masks[j] = 1
        input, loss_masks = self.trim((input, loss_masks))
        return input, loss_masks
246
    def add_generated_image(self, input, loss_masks, model):
        """Complete each sequence of input with an image generated by model.

        input and loss_masks are extended on the right by the length of
        an image, an <img> token is written at the first <nul> of each
        row, and the following height * width positions are generated
        with masked_inplace_autoregression, the <nul> token being
        forbidden. Returns the pair (input, loss_masks), trimmed
        together. loss_masks is only padded and trimmed here.
        """
        t_img, t_nul = self.token2id["<img>"], self.token2id["<nul>"]
        # One <img> token followed by one token per pixel
        nb_img_tokens = self.height * self.width + 1

        # Make room on the right for a full image in every row
        input = F.pad(input, (0, nb_img_tokens), value=t_nul)
        loss_masks = F.pad(loss_masks, (0, nb_img_tokens), value=0)
        t = (input == t_nul).long()
        # Positions where exactly one <nul> has been seen so far
        # NOTE(review): this is a single position per row, the first
        # <nul>, only if no <nul> is followed by another token --
        # confirm against the callers
        i = (t.cumsum(dim=1) == 1).nonzero(as_tuple=True)
        input[i] = t_img

        # Indexes of the nb_img_tokens - 1 positions following the <img>
        j = (
            i[0][:, None],
            i[1][:, None]
            + 1
            + torch.arange(nb_img_tokens - 1, device=input.device)[None, :],
        )
        # The positions to generate are flagged with 1
        ar_masks = input.new_zeros(input.size(), dtype=torch.int64)
        ar_masks[j] = 1
        # Boolean vector over the vocabulary, True only for <nul>
        forbidden_tokens = (
            torch.arange(self.vocabulary_size(), device=input.device) == t_nul
        )
        with torch.autograd.no_grad():
            # Generate in eval mode, then restore the previous mode
            t = model.training
            model.eval()
            masked_inplace_autoregression(
                model,
                self.batch_size,
                input,
                ar_masks,
                forbidden_tokens,
                device=self.device,
            )
            model.train(t)

        input, loss_masks = self.trim((input, loss_masks))

        return input, loss_masks
284
285     ######################
286
287     def __init__(
288         self,
289         nb_train_samples,
290         nb_test_samples,
291         batch_size,
292         height,
293         width,
294         nb_colors=5,
295         device=torch.device("cpu"),
296         pruner_train=None,
297         pruner_eval=None,
298     ):
299         def generate_descr(nb, cache_suffix, pruner):
300             return picoclvr.generate(
301                 nb,
302                 height=self.height,
303                 width=self.width,
304                 nb_colors=nb_colors,
305                 pruner=pruner,
306             )
307
308         self.height = height
309         self.width = width
310         self.batch_size = batch_size
311         self.device = device
312         self.pruner_train = pruner_train
313         self.pruner_eval = pruner_eval
314
315         param = {
316             "nb_train_samples": nb_train_samples,
317             "nb_test_samples": nb_test_samples,
318             "height": height,
319             "width": width,
320             "nb_colors": nb_colors,
321             "batch_size": batch_size,
322             "rng_state": list(torch.get_rng_state()),
323         }
324
325         log_string(
326             f"generating {nb_train_samples+nb_test_samples} samples (can take some time)"
327         )
328         self.train_descr = generate_descr(
329             nb_train_samples, "train", pruner=self.pruner_train
330         )
331         self.test_descr = generate_descr(nb_test_samples, "test", pruner=None)
332
333         # Build the tokenizer
334         tokens = {"<nul>", "<img>"}
335         for d in [self.train_descr, self.test_descr]:
336             for s in d:
337                 for t in s.strip().split(" "):
338                     tokens.add(t)
339         # make this set a sorted list to get the same tensors given
340         # the same descr
341         tokens = list(tokens)
342         tokens.sort()
343         self.token2id = dict([(t, n) for n, t in enumerate(tokens)])
344         self.id2token = dict([(n, t) for n, t in enumerate(tokens)])
345
346         # Tokenize the train and test sets
347         self.train_input = self.tensorize(self.train_descr)
348         self.test_input = self.tensorize(self.test_descr)
349
350     def batches(self, split="train"):
351         assert split in {"train", "test"}
352         input = self.train_input if split == "train" else self.test_input
353         for batch in tqdm.tqdm(
354             input.split(self.batch_size), dynamic_ncols=True, desc=f"epoch-{split}"
355         ):
356             yield self.trim(batch)
357
358     def vocabulary_size(self):
359         return len(self.token2id)
360
361     def compute_missing_properties(self, n_epoch, model, pruner=None):
362         acc_nb_requested_properties = []
363         acc_nb_missing_properties = []
364         acc_nb_results = 0
365
366         for input in tqdm.tqdm(
367             self.test_input.split(self.batch_size),
368             dynamic_ncols=True,
369             desc=f"test-properties",
370         ):
371             tape, loss_masks, _ = self.excise_last_image(input)
372             tape, loss_masks = self.add_generated_image(tape, loss_masks, model)
373             result_descr = self.detensorize(tape)
374             np = picoclvr.nb_properties(
375                 result_descr,
376                 height=self.height,
377                 width=self.width,
378                 pruner=pruner,
379             )
380             nb_requested_properties, _, nb_missing_properties = zip(*np)
381             acc_nb_requested_properties += nb_requested_properties
382             acc_nb_missing_properties += nb_missing_properties
383             acc_nb_results += len(result_descr)
384
385         nb_requested_properties = sum(acc_nb_requested_properties)
386         nb_missing_properties = sum(acc_nb_missing_properties)
387
388         prefix = "" if pruner is None else "pruned_"
389         log_string(f"nb_{prefix}samples {n_epoch} {acc_nb_results}")
390         log_string(
391             f"property_{prefix}nb {n_epoch} requested {sum(acc_nb_requested_properties)} missing {sum(acc_nb_missing_properties)}"
392         )
393         log_string(
394             f"property_{prefix}miss {n_epoch} {100*nb_missing_properties/nb_requested_properties:.02f}%"
395         )
396
397     ######################################################################
398
399     def produce_results(self, n_epoch, model):
400         self.compute_missing_properties(n_epoch, model)
401
402         if self.pruner_eval is not None:
403             self.compute_missing_properties(n_epoch, model, self.pruner_eval)
404
405         nb_tokens_to_generate = self.height * self.width + 3
406         result_descr = []
407         nb_per_primer = 8
408         primer = []
409
410         for primer_descr in [
411             "red above green <sep> green top <sep> blue right of red",
412             "there is red <sep> there is yellow <sep> there is blue",
413             "red below yellow <sep> yellow below green <sep> green below blue <sep> red right <sep> yellow left <sep> green right <sep> blue left",
414             "green bottom <sep> yellow bottom <sep> green left of blue <sep> yellow right of blue <sep> blue top",
415         ]:
416             primer += [primer_descr] * nb_per_primer
417
418         tape = self.tensorize(primer)
419         loss_masks = 1 - (tape == self.token2id["<nul>"]).long()
420         tape, loss_masks = self.add_generated_image(tape, loss_masks, model)
421         result_descr = self.detensorize(tape)
422
423         np = picoclvr.nb_properties(result_descr, height=self.height, width=self.width)
424
425         acc_nb_requested_properties, _, acc_nb_missing_properties = zip(*np)
426         acc_nb_results = len(result_descr)
427
428         nb_requested_properties = sum(acc_nb_requested_properties)
429         nb_missing_properties = sum(acc_nb_missing_properties)
430
431         prefix = "demo_"
432         log_string(f"nb_{prefix}samples {n_epoch} {acc_nb_results}")
433         log_string(
434             f"property_{prefix}nb {n_epoch} requested {sum(acc_nb_requested_properties)} missing {sum(acc_nb_missing_properties)}"
435         )
436         log_string(
437             f"property_{prefix}miss {n_epoch} {100*nb_missing_properties/nb_requested_properties:.02f}%"
438         )
439
440         img = picoclvr.descr2img(result_descr, height=self.height, width=self.width)
441
442         if img.dim() == 5:
443             if img.size(1) == 1:
444                 img = F.pad(img.squeeze(1), pad=(1, 1, 1, 1), value=64)
445             else:
446                 img = torch.cat(
447                     [
448                         torchvision.utils.make_grid(x, padding=1, pad_value=64)[None]
449                         for x in img
450                     ],
451                     0,
452                 )
453
454         image_name = os.path.join(args.result_dir, f"picoclvr_result_{n_epoch:04d}.png")
455         torchvision.utils.save_image(
456             img / 255.0, image_name, nrow=nb_per_primer, padding=1, pad_value=1.0
457         )
458         log_string(f"wrote {image_name}")
459
460
461 ######################################################################
462
463
464 class TaskMNIST(Task):
465     def __init__(self, batch_size, device=torch.device("cpu")):
466         self.device = device
467         self.batch_size = batch_size
468
469     def batches(self, split="train"):
470         assert split in {"train", "test"}
471         data_set = torchvision.datasets.MNIST(
472             root="./data", train=(split == "train"), download=True
473         )
474         data_input = data_set.data.view(-1, 28 * 28).long()
475         if args.nb_train_samples is not None:
476             data_input = data_input[: args.nb_train_samples]
477         for batch in tqdm.tqdm(
478             data_input.split(self.batch_size), desc=f"epoch-{split}"
479         ):
480             yield batch
481
482     def vocabulary_size(self):
483         return 256
484
485     def produce_results(self, n_epoch, model):
486         results = torch.empty(64, 28 * 28, device=self.device, dtype=torch.int64)
487         ar_mask = torch.full_like(results, 1)
488         masked_inplace_autoregression(
489             model, self.batch_size, results, ar_mask, device=self.device
490         )
491         image_name = os.path.join(args.result_dir, f"result_mnist_{n_epoch:04d}.png")
492         torchvision.utils.save_image(
493             1 - results.reshape(-1, 1, 28, 28) / 255.0,
494             image_name,
495             nrow=16,
496             pad_value=0.8,
497         )
498         log_string(f"wrote {image_name}")
499
500
501 ######################################################################
502
503 import maze
504
505
506 class TaskMaze(Task):
507     def map2seq(self, *m):
508         return torch.cat([x.flatten(1) for x in m], 1)
509
510     def seq2map(self, s):
511         s = s.reshape(s.size(0), -1, self.height, self.width)
512         return (s[:, k] for k in range(s.size(1)))
513
514     def __init__(
515         self,
516         nb_train_samples,
517         nb_test_samples,
518         batch_size,
519         height,
520         width,
521         nb_walls,
522         device=torch.device("cpu"),
523     ):
524         self.batch_size = batch_size
525         self.height = height
526         self.width = width
527         self.device = device
528
529         train_mazes, train_paths, train_policies = maze.create_maze_data(
530             nb_train_samples,
531             height=height,
532             width=width,
533             nb_walls=nb_walls,
534             progress_bar=lambda x: tqdm.tqdm(x, dynamic_ncols=True, desc=f"data-train"),
535         )
536         self.train_input = self.map2seq(train_mazes.to(device), train_paths.to(device))
537         self.train_policies = train_policies.flatten(-2).to(device)
538
539         test_mazes, test_paths, test_policies = maze.create_maze_data(
540             nb_test_samples,
541             height=height,
542             width=width,
543             nb_walls=nb_walls,
544             progress_bar=lambda x: tqdm.tqdm(x, dynamic_ncols=True, desc=f"data-test"),
545         )
546         self.test_input = self.map2seq(test_mazes.to(device), test_paths.to(device))
547         self.test_policies = test_policies.flatten(-2).to(device)
548
549         self.nb_codes = self.train_input.max() + 1
550
551     def batches(self, split="train", nb_to_use=-1, desc=None):
552         assert split in {"train", "test"}
553         input = self.train_input if split == "train" else self.test_input
554         if nb_to_use > 0:
555             input = input[:nb_to_use]
556         if desc is None:
557             desc = f"epoch-{split}"
558         for batch in tqdm.tqdm(
559             input.split(self.batch_size), dynamic_ncols=True, desc=desc
560         ):
561             yield batch
562
563     def policy_batches(self, split="train", nb_to_use=-1, desc=None):
564         assert split in {"train", "test"}
565         input = self.train_input if split == "train" else self.test_input
566         policies = self.train_policies if split == "train" else self.test_policies
567         input = input[:, : self.height * self.width]
568         policies = policies * (input != maze.v_wall)[:, None]
569
570         if nb_to_use > 0:
571             input = input[:nb_to_use]
572             policies = policies[:nb_to_use]
573
574         if desc is None:
575             desc = f"epoch-{split}"
576         for batch in tqdm.tqdm(
577             zip(input.split(self.batch_size), policies.split(self.batch_size)),
578             dynamic_ncols=True,
579             desc=desc,
580         ):
581             yield batch
582
583     def vocabulary_size(self):
584         return self.nb_codes
585
586     def compute_error(self, model, split="train", nb_to_use=-1):
587         nb_total, nb_correct = 0, 0
588         for input in task.batches(split, nb_to_use):
589             result = input.clone()
590             ar_mask = result.new_zeros(result.size())
591             ar_mask[:, self.height * self.width :] = 1
592             result *= 1 - ar_mask
593             masked_inplace_autoregression(
594                 model, self.batch_size, result, ar_mask, device=self.device
595             )
596             mazes, paths = self.seq2map(result)
597             nb_correct += maze.path_correctness(mazes, paths).long().sum()
598             nb_total += mazes.size(0)
599
600         return nb_total, nb_correct
601
602     def produce_results(self, n_epoch, model):
603         with torch.autograd.no_grad():
604             t = model.training
605             model.eval()
606
607             train_nb_total, train_nb_correct = self.compute_error(
608                 model, "train", nb_to_use=1000
609             )
610             log_string(
611                 f"accuracy_train nb_total {train_nb_total} nb_correct {train_nb_correct} accuracy {(100.0*train_nb_correct)/train_nb_total:.02f}%"
612             )
613
614             test_nb_total, test_nb_correct = self.compute_error(
615                 model, "test", nb_to_use=1000
616             )
617             log_string(
618                 f"accuracy_test nb_total {test_nb_total} nb_correct {test_nb_correct} accuracy {(100.0*test_nb_correct)/test_nb_total:.02f}%"
619             )
620
621             input = self.test_input[:48]
622             result = input.clone()
623             ar_mask = result.new_zeros(result.size())
624             ar_mask[:, self.height * self.width :] = 1
625             result *= 1 - ar_mask
626             masked_inplace_autoregression(
627                 model, self.batch_size, result, ar_mask, device=self.device
628             )
629
630             mazes, paths = self.seq2map(input)
631             _, predicted_paths = self.seq2map(result)
632
633             filename = os.path.join(args.result_dir, f"result_{n_epoch:04d}.png")
634             maze.save_image(
635                 filename,
636                 mazes=mazes,
637                 target_paths=paths,
638                 predicted_paths=predicted_paths,
639                 path_correct=maze.path_correctness(mazes, predicted_paths),
640             )
641             log_string(f"wrote {filename}")
642
643             model.train(t)
644
645
646 ######################################################################
647
648
def picoclvr_pruner_horizontal_green(p):
    """Return False if p contains "green" together with "left" or
    "right", and True otherwise."""
    if "green" not in p:
        return True
    return "left" not in p and "right" not in p
651
652
# The train pruner is only used in the "train+eval" mode
if args.picocvlr_prune_properties == "train+eval":
    picoclvr_pruner_train = picoclvr_pruner_horizontal_green
else:
    picoclvr_pruner_train = None

# The eval pruner is the negation of the pruning function, and is used
# in both the "train+eval" and the "eval" modes
if args.picocvlr_prune_properties in ("train+eval", "eval"):
    picoclvr_pruner_eval = lambda p: not picoclvr_pruner_horizontal_green(p)
else:
    picoclvr_pruner_eval = None
664
665 ######################################################################
666
# Instantiate the task requested on the command line

if args.task not in ("picoclvr", "mnist", "maze"):
    raise ValueError(f"Unknown task {args.task}")

if args.task == "mnist":
    task = TaskMNIST(
        batch_size=args.batch_size,
        device=device,
    )

else:
    # The two other tasks share these options
    _shared_task_options = dict(
        nb_train_samples=args.nb_train_samples,
        nb_test_samples=args.nb_test_samples,
        batch_size=args.batch_size,
        device=device,
    )

    if args.task == "maze":
        task = TaskMaze(
            height=args.maze_height,
            width=args.maze_width,
            nb_walls=args.maze_nb_walls,
            **_shared_task_options,
        )

    else:
        task = TaskPicoCLVR(
            height=args.picoclvr_height,
            width=args.picoclvr_width,
            nb_colors=args.picoclvr_nb_colors,
            pruner_train=picoclvr_pruner_train,
            pruner_eval=picoclvr_pruner_eval,
            **_shared_task_options,
        )
699
700 ######################################################################
701
log_string(f"device {device}")

# The size of the vocabulary is given by the task
vocabulary_size = task.vocabulary_size()
log_string(f"vocabulary_size {vocabulary_size}")

##############################

# Causal model, its dimensions come from the command line
model = mygpt.MyGPT(
    causal=True,
    vocabulary_size=vocabulary_size,
    nb_blocks=args.nb_blocks,
    nb_heads=args.nb_heads,
    dim_model=args.dim_model,
    dim_keys=args.dim_keys,
    dim_hidden=args.dim_hidden,
    dropout=args.dropout,
)

model.to(device)

# Total number of scalar parameters of the model
nb_parameters = 0
for p in model.parameters():
    nb_parameters += p.numel()

log_string(f"nb_parameters {nb_parameters} ({int(nb_parameters/1e6)}M)")
725
726 ######################################################################
727
# Resume from the checkpoint of the result directory if there is one,
# unless --no_checkpoint was given. A missing checkpoint file means a
# fresh start, any other failure stops the script.
nb_epochs_finished = 0

if args.no_checkpoint:
    log_string("not trying to load checkpoint.")

else:
    try:
        checkpoint_name = os.path.join(args.result_dir, args.checkpoint_name)
        checkpoint = torch.load(checkpoint_name)
        nb_epochs_finished = checkpoint["nb_epochs_finished"]
        model.load_state_dict(checkpoint["model_state"])
        torch.set_rng_state(checkpoint["rng_state"])
        # The CUDA state is only saved when CUDA was available, so it
        # is missing from a checkpoint written on a machine without it
        if torch.cuda.is_available() and "cuda_rng_state" in checkpoint:
            torch.cuda.set_rng_state(checkpoint["cuda_rng_state"])

        log_string(f"checkpoint loaded with {nb_epochs_finished} epochs finished.")

    except FileNotFoundError:
        log_string("starting from scratch.")

    # Exception, and not a bare except, to let KeyboardInterrupt and
    # SystemExit through. The cause of the failure is logged
    except Exception as e:
        log_string("error when loading the checkpoint.")
        log_string(f"{type(e).__name__}: {e}")
        sys.exit(1)
751
752 ######################################################################
753
# A non-positive --nb_epochs falls back to the default value of the
# option
nb_epochs = args.nb_epochs if args.nb_epochs > 0 else parser.get_default("nb_epochs")

# Perplexity of the empirical distribution of the tokens of the train
# set, logged at every epoch next to the perplexities of the model
nb_token_values = task.vocabulary_size()
token_count = 0
for batch in task.batches(split="train"):
    token_count += F.one_hot(batch, num_classes=nb_token_values).sum((0, 1))
token_probas = token_count / token_count.sum()
# xlogy is zero for the tokens whose probability is zero
entropy = -torch.xlogy(token_probas, token_probas).sum()
train_set_perplexity = math.exp(entropy)
762
763 ##############################
764
# Build the dictionary giving the learning rate of every epoch. It is
# indexed by the training loop up to nb_epochs, so it is built over
# that same range.

if args.learning_rate_schedule == "cos":
    # Half a period of cosine, starting at args.learning_rate and
    # decreasing toward zero
    learning_rate_schedule = {}
    for n_epoch in range(nb_epochs):
        u = n_epoch / nb_epochs * math.pi
        learning_rate_schedule[n_epoch] = args.learning_rate * 0.5 * (1 + math.cos(u))
else:
    # Comma-separated list of "epoch: learning_rate" pairs. The empty
    # entries are ignored, so that an empty schedule is valid
    u = {
        int(k): float(v)
        for k, v in [
            tuple(x.split(":"))
            for x in args.learning_rate_schedule.split(",")
            if x.strip()
        ]
    }

    # The learning rate is args.learning_rate until the first listed
    # epoch, and each value then holds until the next listed epoch
    learning_rate_schedule = {}
    learning_rate = args.learning_rate
    for n_epoch in range(nb_epochs):
        if n_epoch in u:
            learning_rate = u[n_epoch]
        learning_rate_schedule[n_epoch] = learning_rate

log_string(f"learning_rate_schedule {learning_rate_schedule}")
786
787 ##############################
788
# Main training loop. Each epoch trains on the train set, computes the
# loss on the test set, lets the task produce its results, and saves a
# checkpoint.

# Accumulated in the loop below, not read in this part of the script
nb_samples_seen = 0

# If the loaded checkpoint already covers all the epochs, the loop
# below does not run, and the results are produced once here
if nb_epochs_finished >= nb_epochs:
    task.produce_results(nb_epochs_finished, model)

for n_epoch in range(nb_epochs_finished, nb_epochs):
    learning_rate = learning_rate_schedule[n_epoch]

    log_string(f"learning_rate {learning_rate}")

    # A new optimizer is created at every epoch with the learning rate
    # of the epoch
    if args.optim == "sgd":
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    elif args.optim == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    elif args.optim == "adamw":
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    else:
        raise ValueError(f"Unknown optimizer {args.optim}.")

    model.train()

    nb_train_samples, acc_train_loss = 0, 0.0

    for input in task.batches(split="train"):
        input = input.to(device)
        output = model(mygpt.BracketedSequence(input)).x
        # The input sequence itself is the target of the cross-entropy
        loss = F.cross_entropy(output.transpose(1, 2), input)
        # The loss of a batch is weighted by its number of samples
        acc_train_loss += loss.item() * input.size(0)
        nb_train_samples += input.size(0)
        nb_samples_seen += input.size(0)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    with torch.autograd.no_grad():
        model.eval()

        nb_test_samples, acc_test_loss = 0, 0.0

        for input in task.batches(split="test"):
            input = input.to(device)

            # input, loss_masks, true_images = task.excise_last_image(input)
            # input, loss_masks = task.add_true_image(input, true_images, loss_masks)

            output = model(mygpt.BracketedSequence(input)).x
            loss = F.cross_entropy(output.transpose(1, 2), input)
            acc_test_loss += loss.item() * input.size(0)
            nb_test_samples += input.size(0)

        # The average loss is clipped to 100 before the exponential
        train_perplexity = math.exp(min(100, acc_train_loss / nb_train_samples))
        test_perplexity = math.exp(min(100, acc_test_loss / nb_test_samples))

        log_string(
            f"perplexity {n_epoch} train_set {train_set_perplexity} train_prediction {train_perplexity} test_prediction {test_perplexity}"
        )

        task.produce_results(n_epoch, model)

    # The checkpoint holds the number of epochs done, the parameters of
    # the model, and the states of the random generators. The state of
    # the optimizer is not saved
    checkpoint = {
        "nb_epochs_finished": n_epoch + 1,
        "model_state": model.state_dict(),
        "rng_state": torch.get_rng_state(),
    }

    if torch.cuda.is_available():
        checkpoint["cuda_rng_state"] = torch.cuda.get_rng_state()

    checkpoint_name = os.path.join(args.result_dir, args.checkpoint_name)
    torch.save(checkpoint, checkpoint_name)
    log_string(f"saved checkpoint {checkpoint_name}")
861
862 ######################################################################