+ self.test_descr = self.grid_factory.generate_samples(
+ nb_test_samples, lambda r: tqdm.tqdm(r)
+ )
+
+ # Build the tokenizer
+ tokens = set()
+ for d in [self.train_descr, self.test_descr]:
+ for s in d:
+ for t in s.strip().split(" "):
+ tokens.add(t)
+ # make this set a sorted list to get the same tensors given
+ # the same descr
+ tokens = list(tokens)
+ tokens.sort()
+ tokens = ["#"] + tokens
+ self.token2id = dict([(t, n) for n, t in enumerate(tokens)])
+ self.id2token = dict([(n, t) for n, t in enumerate(tokens)])
+ self.t_nul = self.token2id["#"]
+ self.t_true = self.token2id["true"]
+ self.t_false = self.token2id["false"]
+
+ # Tokenize the train and test sets
+ self.train_input = self.str2tensor(self.train_descr)
+ self.test_input = self.str2tensor(self.test_descr)
+
+ def batches(self, split="train"):
+ assert split in {"train", "test"}
+ input = self.train_input if split == "train" else self.test_input
+ for batch in tqdm.tqdm(
+ input.split(self.batch_size), dynamic_ncols=True, desc=f"epoch-{split}"
+ ):
+ yield self.trim(batch)