X-Git-Url: https://fleuret.org/cgi-bin/gitweb/gitweb.cgi?a=blobdiff_plain;f=beaver.py;h=5abe39b767c13299d8dcffcc3369682bfbfff69f;hb=HEAD;hp=f5bd9245319a37f65f655ed7bb05cfbf81563383;hpb=cd3fae5ff5e9b4c76b533dac8bc64cc331ecf51e;p=beaver.git diff --git a/beaver.py b/beaver.py index f5bd924..5abe39b 100755 --- a/beaver.py +++ b/beaver.py @@ -64,6 +64,10 @@ parser.add_argument("--dropout", type=float, default=0.1) parser.add_argument("--deterministic_synthesis", action="store_true", default=False) +parser.add_argument("--random_regression_order", action="store_true", default=False) + +parser.add_argument("--noncausal_prompt", action="store_true", default=False) + parser.add_argument("--no_checkpoint", action="store_true", default=False) parser.add_argument("--overwrite_results", action="store_true", default=False) @@ -86,7 +90,7 @@ parser.add_argument("--oneshot", action="store_true", default=False) parser.add_argument("--oneshot_input", type=str, default="head") -parser.add_argument("--oneshot_output", type=str, default="policy") +parser.add_argument("--oneshot_output", type=str, default="trace") ###################################################################### @@ -123,24 +127,59 @@ def log_string(s): sys.stdout.flush() +log_string(f"cmd {' '.join(sys.argv)}") + for n in vars(args): log_string(f"args.{n} {getattr(args, n)}") ###################################################################### +def reorder(x, order, reverse=False): # x is NxTxD1x...xDk, order is NxT' + u = x.reshape(x.size()[:2] + (-1,)) + order = order.unsqueeze(-1).expand(-1, -1, u.size(-1)) + if reverse: + v = u.new(u.size()).scatter_(1, order, u) + else: + v = u.gather(1, order) + v = v.reshape(v.size()[:2] + x.size()[2:]) + return v + + +def shuffle(x, prompt_len): + if args.random_regression_order: + order = torch.rand(x.size(), device=x.device) + order[:, :prompt_len] = torch.arange(-prompt_len, 0, device=x.device) + order = order.sort(1).indices + else: + order = ( + torch.arange(x.size(1), device=x.device).unsqueeze(0).expand(x.size(0), -1) + ) + return reorder(x, order), order + + +def eval_mygpt(model, input, mode="standard", prompt_len=0): + x, order = shuffle(input, prompt_len) + x = model(mygpt.BracketedSequence(x), mode=mode, order=order).x + return reorder(x, order, reverse=True) + + +###################################################################### + # ar_mask is a Boolean matrix of same shape as input, with 1s on the # tokens that should be generated -def masked_inplace_autoregression(model, batch_size, input, ar_mask): - for input, ar_mask in zip(input.split(batch_size), ar_mask.split(batch_size)): +def masked_inplace_autoregression(model, batch_size, input, ar_mask, order=None): + for input, ar_mask, order in zip( + input.split(batch_size), ar_mask.split(batch_size), order.split(batch_size) + ): i = (ar_mask.sum(0) > 0).nonzero() if i.min() > 0: # Needed to initialize the model's cache - model(mygpt.BracketedSequence(input, 0, i.min())) + model(mygpt.BracketedSequence(input, 0, i.min()), order=order) for s in range(i.min(), i.max() + 1): - output = model(mygpt.BracketedSequence(input, s, 1)).x + output = model(mygpt.BracketedSequence(input, s, 1), order=order).x logits = output[:, s] if args.deterministic_synthesis: t_next = logits.argmax(1) @@ -153,7 +192,7 @@ def masked_inplace_autoregression(model, batch_size, input, ar_mask): ###################################################################### -def compute_perplexity(model, split="train"): +def compute_perplexity(model, task, prompt_len, split="train"): with torch.autograd.no_grad(): t = model.training model.eval() @@ -162,9 +201,12 @@ def compute_perplexity(model, split="train"): for input in task.batches(split=split): input = input.to(device) - - output = model(mygpt.BracketedSequence(input)).x - loss = F.cross_entropy(output.transpose(1, 2), input) + output = eval_mygpt(model, input, prompt_len=prompt_len) + if args.noncausal_prompt: + d = input.size(1) // 2 + loss = F.cross_entropy(output[:, d:].transpose(1, 2), input[:, d:]) + else: + loss = F.cross_entropy(output.transpose(1, 2), input) acc_loss += loss.item() * input.size(0) nb_samples += input.size(0) @@ -193,7 +235,39 @@ def oneshot_trace_loss(mazes, output, policies, height, width): return (output - targets).abs().sum() / masks.sum() -def oneshot(gpt, task): +def oneshot(model, learning_rate_scheduler, task): + t = model.training + model.eval() + mazes = task.test_input[:48].clone() + mazes[:, task.height * task.width :] = 0 + policies = task.test_policies[:48] + targets = maze.stationary_densities( + mazes[:, : task.height * task.width].view(-1, task.height, task.width), + policies.view(-1, 4, task.height, task.width), + ).flatten(-2) + output = eval_mygpt(model, mazes, prompt_len=task.height * task.width) + output = F.softmax(output, dim=2) + print(f"{output.size()=}") + proba_path = output[:, task.height * task.width :, 4].reshape( + -1, task.height, task.width + ) + mazes = mazes[:, : task.height * task.width].reshape(-1, task.height, task.width) + targets = targets.reshape(-1, task.height, task.width) + paths = task.test_input[:48, task.height * task.width :].reshape( + -1, task.height, task.width + ) + filename = f"oneshot.png" + maze.save_image( + os.path.join(args.result_dir, filename), + mazes=mazes, + # target_paths=paths, + score_paths=proba_path, + score_truth=targets, + ) + log_string(f"wrote {filename}") + + +def oneshot_old(gpt, learning_rate_scheduler, task): t = gpt.training gpt.eval() @@ -221,13 +295,19 @@ def oneshot(gpt, task): nn.Linear(args.dim_model, dim_out), ).to(device) + learning_rate_scheduler.reset() + for n_epoch in range(args.nb_epochs): - learning_rate = learning_rate_schedule[n_epoch] + learning_rate = learning_rate_scheduler.get_learning_rate() + log_string(f"learning_rate {n_epoch} {learning_rate}") + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) acc_train_loss, nb_train_samples = 0, 0 for mazes, policies in task.policy_batches(split="train"): - output_gpt = gpt(mygpt.BracketedSequence(mazes), mode=args.oneshot_input).x + output_gpt = eval_mygpt( + gpt, mazes, mode=args.oneshot_input, prompt_len=task.height * task.width + ) output = model(output_gpt) loss = compute_loss(mazes, output, policies, task.height, task.width) @@ -238,9 +318,13 @@ def oneshot(gpt, task): loss.backward() optimizer.step() + learning_rate_scheduler.update(n_epoch + 1, acc_train_loss) + acc_test_loss, nb_test_samples = 0, 0 for mazes, policies in task.policy_batches(split="test"): - output_gpt = gpt(mygpt.BracketedSequence(mazes), mode=args.oneshot_input).x + output_gpt = eval_mygpt( + gpt, mazes, mode=args.oneshot_input, prompt_len=task.height * task.width + ) output = model(output_gpt) loss = compute_loss(mazes, output, policies, task.height, task.width) acc_test_loss += loss.item() * mazes.size(0) @@ -251,9 +335,11 @@ def oneshot(gpt, task): ) # ------------------- - mazes = task.test_input[:32, : task.height * task.width] - policies = task.test_policies[:32] - output_gpt = gpt(mygpt.BracketedSequence(mazes), mode=args.oneshot_input).x + mazes = task.test_input[:48, : task.height * task.width] + policies = task.test_policies[:48] + output_gpt = eval_mygpt( + gpt, mazes, mode=args.oneshot_input, prompt_len=task.height * task.width + ) output = model(output_gpt) if args.oneshot_output == "policy": targets = policies.permute(0, 2, 1) @@ -272,15 +358,17 @@ def oneshot(gpt, task): scores = scores.reshape(-1, task.height, task.width) mazes = mazes.reshape(-1, task.height, task.width) targets = targets.reshape(-1, task.height, task.width) + filename = ( + f"oneshot_{args.oneshot_input}_{args.oneshot_output}_{n_epoch:04d}.png" + ) maze.save_image( - os.path.join( - args.result_dir, - f"oneshot_{args.oneshot_input}_{args.oneshot_output}_{n_epoch:04d}.png", - ), + os.path.join(args.result_dir, filename), mazes=mazes, score_paths=scores, score_truth=targets, ) + log_string(f"wrote {filename}") + # ------------------- gpt.train(t) @@ -289,8 +377,77 @@ def oneshot(gpt, task): ###################################################################### +class LearningRateScheduler: + def get_learning_rate(self): + pass + + def update(self, nb_finished_epochs, loss): + pass + + def reset(self): + pass + + def get_state(self): + return vars(self) + + def set_state(self, state): + print(f"{state=}") + for k, v in state.items(): + setattr(self, k, v) + + +class StepWiseScheduler(LearningRateScheduler): + def __init__(self, schedule): + self.nb_finished_epochs = 0 + self.schedule = schedule + + def get_learning_rate(self): + return self.schedule[self.nb_finished_epochs] + + def update(self, nb_finished_epochs, loss): + self.nb_finished_epochs = nb_finished_epochs + + def reset(self): + self.nb_finished_epochs = 0 + + def get_state(self): + return {"nb_finished_epochs": self.nb_finished_epochs} + + +class AutoScheduler(LearningRateScheduler): + def __init__(self, learning_rate_init, growth=1.0, degrowth=0.2): + self.learning_rate_init = learning_rate_init + self.learning_rate = learning_rate_init + self.growth = growth + self.degrowth = degrowth + self.pred_loss = None + + def get_learning_rate(self): + return self.learning_rate + + def update(self, nb_finished_epochs, loss): + if self.pred_loss is not None: + if loss >= self.pred_loss: + self.learning_rate *= self.degrowth + else: + self.learning_rate *= self.growth + self.pred_loss = loss + + def reset(self): + self.learning_rate = self.learning_rate_init + + def get_state(self): + return { + "learning_rate_init": self.learning_rate_init, + "pred_loss": self.pred_loss, + } + + +###################################################################### + + class Task: - def batches(self, split="train"): + def batches(self, split="train", nb_to_use=-1, desc=None): pass def vocabulary_size(self): @@ -350,17 +507,19 @@ class TaskMaze(Task): self.nb_codes = self.train_input.max() + 1 - def batches(self, split="train", nb_to_use=-1): + def batches(self, split="train", nb_to_use=-1, desc=None): assert split in {"train", "test"} input = self.train_input if split == "train" else self.test_input if nb_to_use > 0: input = input[:nb_to_use] + if desc is None: + desc = f"epoch-{split}" for batch in tqdm.tqdm( - input.split(self.batch_size), dynamic_ncols=True, desc=f"epoch-{split}" + input.split(self.batch_size), dynamic_ncols=True, desc=desc ): yield batch - def policy_batches(self, split="train", nb_to_use=-1): + def policy_batches(self, split="train", nb_to_use=-1, desc=None): assert split in {"train", "test"} input = self.train_input if split == "train" else self.test_input policies = self.train_policies if split == "train" else self.test_policies @@ -371,10 +530,12 @@ class TaskMaze(Task): input = input[:nb_to_use] policies = policies[:nb_to_use] + if desc is None: + desc = f"epoch-{split}" for batch in tqdm.tqdm( zip(input.split(self.batch_size), policies.split(self.batch_size)), dynamic_ncols=True, - desc=f"epoch-{split}", + desc=desc, ): yield batch @@ -388,7 +549,11 @@ class TaskMaze(Task): ar_mask = result.new_zeros(result.size()) ar_mask[:, self.height * self.width :] = 1 result *= 1 - ar_mask - masked_inplace_autoregression(model, self.batch_size, result, ar_mask) + x, order = shuffle(result, self.height * self.width) + masked_inplace_autoregression( + model, self.batch_size, x, ar_mask, order=order + ) + result = reorder(x, order, reverse=True) mazes, paths = self.seq2map(result) nb_correct += maze.path_correctness(mazes, paths).long().sum() nb_total += mazes.size(0) @@ -414,22 +579,28 @@ class TaskMaze(Task): f"accuracy_test nb_total {test_nb_total} nb_correct {test_nb_correct} accuracy {(100.0*test_nb_correct)/test_nb_total:.02f}%" ) - input = self.test_input[:32] + input = self.test_input[:48] result = input.clone() ar_mask = result.new_zeros(result.size()) ar_mask[:, self.height * self.width :] = 1 result *= 1 - ar_mask - masked_inplace_autoregression(model, self.batch_size, result, ar_mask) + x, order = shuffle(result, self.height * self.width) + masked_inplace_autoregression( + model, self.batch_size, x, ar_mask, order=order + ) + result = reorder(x, order, reverse=True) mazes, paths = self.seq2map(input) _, predicted_paths = self.seq2map(result) + filename = f"result_{n_epoch:04d}.png" maze.save_image( - os.path.join(args.result_dir, f"result_{n_epoch:04d}.png"), + os.path.join(args.result_dir, filename), mazes=mazes, target_paths=paths, predicted_paths=predicted_paths, path_correct=maze.path_correctness(mazes, predicted_paths), ) + log_string(f"wrote {filename}") model.train(t) @@ -456,6 +627,30 @@ log_string(f"vocabulary_size {vocabulary_size}") ############################## + +def noncausal_prompt_amm_generator(d): + q = torch.arange(d)[:, None] + k = torch.arange(d)[None, :] + s = args.maze_height * args.maze_width + return torch.logical_and(q < k, torch.logical_or(q >= s, k >= s)) + # return q < k + + +def noncausal_prompt_oneshot_amm_generator(d): + q = torch.arange(d)[:, None] + k = torch.arange(d)[None, :] + s = args.maze_height * args.maze_width + return k >= s + # return q < k + + +if args.oneshot: + amm_generator = noncausal_prompt_oneshot_amm_generator +elif args.noncausal_prompt: + amm_generator = noncausal_prompt_amm_generator +else: + amm_generator = None + model = mygpt.MyGPT( vocabulary_size=vocabulary_size, dim_model=args.dim_model, @@ -465,6 +660,7 @@ model = mygpt.MyGPT( nb_blocks=args.nb_blocks, causal=True, dropout=args.dropout, + amm_generator=amm_generator, ) model.to(device) @@ -474,6 +670,36 @@ log_string(f"nb_parameters {nb_parameters} ({int(nb_parameters/1e6)}M)") ###################################################################### +if args.learning_rate_schedule == "auto": + learning_rate_scheduler = AutoScheduler(args.learning_rate) + +elif args.learning_rate_schedule == "cos": + schedule = {} + for n_epoch in range(args.nb_epochs): + u = n_epoch / args.nb_epochs * math.pi + schedule[n_epoch] = args.learning_rate * 0.5 * (1 + math.cos(u)) + learning_rate_scheduler = StepWiseScheduler(schedule) + log_string(f"learning_rate_schedule {schedule}") + +else: + u = { + int(k): float(v) + for k, v in [ + tuple(x.split(":")) for x in args.learning_rate_schedule.split(",") + ] + } + + schedule = {} + learning_rate = args.learning_rate + for n_epoch in range(args.nb_epochs): + if n_epoch in u: + learning_rate = u[n_epoch] + schedule[n_epoch] = learning_rate + learning_rate_scheduler = StepWiseScheduler(schedule) + log_string(f"learning_rate_schedule {schedule}") + +###################################################################### + nb_epochs_finished = 0 if args.no_checkpoint: @@ -485,6 +711,7 @@ else: checkpoint = torch.load(checkpoint_name) nb_epochs_finished = checkpoint["nb_epochs_finished"] model.load_state_dict(checkpoint["model_state"]) + learning_rate_scheduler.set_state(checkpoint["learning_rate_scheduler_state"]) torch.set_rng_state(checkpoint["rng_state"]) if torch.cuda.is_available(): torch.cuda.set_rng_state(checkpoint["cuda_rng_state"]) @@ -494,9 +721,15 @@ else: except FileNotFoundError: log_string("starting from scratch.") - except: - log_string("error when loading the checkpoint.") - exit(1) + # except: + # log_string("error when loading the checkpoint.") + # exit(1) + +###################################################################### + +if args.oneshot: + oneshot(model, learning_rate_scheduler, task) + exit(0) ###################################################################### @@ -509,34 +742,14 @@ train_set_perplexity = math.exp(entropy) ############################## -if args.learning_rate_schedule == "cos": - learning_rate_schedule = {} - for n_epoch in range(args.nb_epochs): - u = n_epoch / args.nb_epochs * math.pi - learning_rate_schedule[n_epoch] = args.learning_rate * 0.5 * (1 + math.cos(u)) -else: - u = { - int(k): float(v) - for k, v in [ - tuple(x.split(":")) for x in args.learning_rate_schedule.split(",") - ] - } - - learning_rate_schedule = {} - learning_rate = args.learning_rate - for n_epoch in range(args.nb_epochs): - if n_epoch in u: - learning_rate = u[n_epoch] - learning_rate_schedule[n_epoch] = learning_rate - -log_string(f"learning_rate_schedule {learning_rate_schedule}") - -############################## - if nb_epochs_finished >= args.nb_epochs: n_epoch = nb_epochs_finished - train_perplexity = compute_perplexity(model, split="train") - test_perplexity = compute_perplexity(model, split="test") + train_perplexity = compute_perplexity( + model, task, prompt_len=task.height * task.width, split="train" + ) + test_perplexity = compute_perplexity( + model, task, prompt_len=task.height * task.width, split="test" + ) log_string( f"perplexity {n_epoch} train_set {train_set_perplexity} train_prediction {train_perplexity} test_prediction {test_perplexity}" @@ -544,14 +757,13 @@ if nb_epochs_finished >= args.nb_epochs: task.produce_results(n_epoch, model) - exit(0) - ############################## -for n_epoch in range(nb_epochs_finished, args.nb_epochs): - learning_rate = learning_rate_schedule[n_epoch] +learning_rate_scheduler.reset() - log_string(f"learning_rate {learning_rate}") +for n_epoch in range(nb_epochs_finished, args.nb_epochs): + learning_rate = learning_rate_scheduler.get_learning_rate() + log_string(f"learning_rate {n_epoch} {learning_rate}") if args.optim == "sgd": optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) @@ -568,8 +780,12 @@ for n_epoch in range(nb_epochs_finished, args.nb_epochs): for input in task.batches(split="train"): input = input.to(device) - output = model(mygpt.BracketedSequence(input)).x - loss = F.cross_entropy(output.transpose(1, 2), input) + output = eval_mygpt(model, input, prompt_len=task.height * task.width) + if args.noncausal_prompt: + d = input.size(1) // 2 + loss = F.cross_entropy(output[:, d:].transpose(1, 2), input[:, d:]) + else: + loss = F.cross_entropy(output.transpose(1, 2), input) acc_train_loss += loss.item() * input.size(0) nb_train_samples += input.size(0) @@ -577,8 +793,12 @@ for n_epoch in range(nb_epochs_finished, args.nb_epochs): loss.backward() optimizer.step() + learning_rate_scheduler.update(n_epoch + 1, acc_train_loss) + train_perplexity = math.exp(min(100, acc_train_loss / nb_train_samples)) - test_perplexity = compute_perplexity(model, split="test") + test_perplexity = compute_perplexity( + model, task, prompt_len=task.height * task.width, split="test" + ) log_string( f"perplexity {n_epoch} train_set {train_set_perplexity} train_prediction {train_perplexity} test_prediction {test_perplexity}" @@ -589,6 +809,7 @@ for n_epoch in range(nb_epochs_finished, args.nb_epochs): checkpoint = { "nb_epochs_finished": n_epoch + 1, "model_state": model.state_dict(), + "learning_rate_scheduler_state": learning_rate_scheduler.get_state(), "rng_state": torch.get_rng_state(), } @@ -600,8 +821,3 @@ for n_epoch in range(nb_epochs_finished, args.nb_epochs): log_string(f"saved checkpoint {checkpoint_name}") ###################################################################### - -if args.oneshot: - oneshot(model, task) - -######################################################################