diff --git a/tasks.py b/tasks.py
index 6b6b8f2..1d967f9 100755
--- a/tasks.py
+++ b/tasks.py
@@ -1898,8 +1898,6 @@ class Escape(Task):
         self.train_input = seq[:nb_train_samples].to(self.device)
         self.test_input = seq[nb_train_samples:].to(self.device)
 
-        self.nb_codes = max(self.train_input.max(), self.test_input.max()) + 1
-
     def batches(self, split="train", nb_to_use=-1, desc=None):
         assert split in {"train", "test"}
         input = self.train_input if split == "train" else self.test_input
@@ -1913,7 +1911,7 @@ class Escape(Task):
             yield batch
 
     def vocabulary_size(self):
-        return self.nb_codes
+        return escape.nb_codes
 
     def thinking_autoregression(
         self, n_epoch, model, result_dir, logger, deterministic_synthesis, nmax=1000
@@ -1927,6 +1925,8 @@ class Escape(Task):
         index_lookahead_reward = state_len + 2
         it_len = state_len + 3  # state / action / reward / lookahead_reward
 
+        result[:, it_len:] = -1
+
         def ar(result, ar_mask, logit_biases=None):
             ar_mask = ar_mask.expand_as(result)
             result *= 1 - ar_mask
@@ -1943,28 +1943,49 @@ class Escape(Task):
 
         # Generate iteration after iteration
 
-        optimistic_bias = result.new_zeros(self.nb_codes, device=result.device)
-        optimistic_bias[(-1) + escape.first_lookahead_rewards_code + 1] = math.log(1e-1)
-        optimistic_bias[(1) + escape.first_lookahead_rewards_code + 1] = math.log(1e1)
+        optimistic_bias = result.new_zeros(escape.nb_codes, device=result.device)
+        optimistic_bias[escape.lookahead_reward2code(-1)] = -math.log(1e1)
+        optimistic_bias[escape.lookahead_reward2code(1)] = math.log(1e1)
+
+        snapshots = []
 
         for u in tqdm.tqdm(
             range(it_len, result.size(1) - it_len + 1, it_len), desc="thinking"
         ):
-            # Generate the lookahead_reward pessimistically
+            # Re-generate the lookahead_reward pessimistically in the
+            # previous iterations
             ar_mask = (t < u).long() * (t % it_len == index_lookahead_reward).long()
             ar(result, ar_mask, logit_biases=-optimistic_bias)
+            snapshots.append(result[:10].detach().clone())
 
             # Generate the state
             ar_mask = (t >= u).long() * (t < u + state_len).long()
             ar(result, ar_mask)
+            snapshots.append(result[:10].detach().clone())
 
-            # Generate the lookahead_reward optimistically
+            # Re-generate the lookahead_reward optimistically in the
+            # previous iterations
             ar_mask = (t < u).long() * (t % it_len == index_lookahead_reward).long()
             ar(result, ar_mask, logit_biases=optimistic_bias)
+            snapshots.append(result[:10].detach().clone())
 
             # Generate the action and reward
             ar_mask = (t >= u + index_action).long() * (t <= u + index_reward).long()
             ar(result, ar_mask)
+            snapshots.append(result[:10].detach().clone())
+
+        filename = os.path.join(result_dir, f"test_thinking_compute_{n_epoch:04d}.txt")
+        with open(filename, "w") as f:
+            for n in range(10):
+                for snapshot in snapshots:
+                    s, a, r, lr = escape.seq2episodes(
+                        snapshot[n : n + 1], self.height, self.width, lookahead=True
+                    )
+                    episodes_str = escape.episodes2str(
+                        s, a, r, lookahead_rewards=lr, unicode=True, ansi_colors=True
+                    )
+                    f.write(episodes_str)
+                    f.write("\n\n")
 
         # Saving the generated sequences
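
Note: the new code paths all go through the ar() helper shown in the diff, which delegates to masked_inplace_autoregression with an optional logit_biases argument. As a minimal, self-contained sketch of how such a bias plausibly acts during decoding (the function name, the model interface mapping an (N, T) token tensor to (N, T, nb_codes) logits, and the greedy decoding below are assumptions for illustration, not taken from picoclvr):

    import torch

    def biased_masked_ar(model, result, ar_mask, logit_biases=None):
        # Clear the positions to be re-generated, mirroring ar()'s
        # result *= 1 - ar_mask.
        ar_mask = ar_mask.expand_as(result)
        result *= 1 - ar_mask
        for u in range(result.size(1)):
            if ar_mask[:, u].any():
                logits = model(result)[:, u]        # assumed shape (N, nb_codes)
                if logit_biases is not None:
                    logits = logits + logit_biases  # shift token preferences
                tokens = logits.argmax(dim=-1)      # assumed deterministic decoding
                # Only overwrite the masked positions.
                result[:, u] = torch.where(ar_mask[:, u] > 0, tokens, result[:, u])
        return result

Because optimistic_bias puts +log(10) on the code for lookahead reward +1 and -log(10) on the code for reward -1, passing logit_biases=-optimistic_bias simply inverts the preference, which is how one tensor serves both the pessimistic and the optimistic re-generation passes of the thinking loop.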
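
The commit also replaces the raw index arithmetic (-1) + escape.first_lookahead_rewards_code + 1 with escape.lookahead_reward2code(-1), and moves the vocabulary size from a data-dependent maximum to the module-level constant escape.nb_codes, which keeps the bias tensor's size consistent with the model's output. The mapping below is reconstructed from the removed expression and is hypothetical in its details (the offset value is invented):

    first_lookahead_rewards_code = 42  # hypothetical offset into the vocabulary

    def lookahead_reward2code(r):
        # Maps a lookahead reward r in {-1, 0, 1} onto three consecutive
        # token codes, matching the removed expression
        # r + first_lookahead_rewards_code + 1.
        return first_lookahead_rewards_code + r + 1

Note also that -math.log(1e1) in the new code equals math.log(1e-1) in the old, so the pessimistic and optimistic weights are numerically unchanged; the rewrite only makes the antisymmetry of the bias explicit.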