X-Git-Url: https://fleuret.org/cgi-bin/gitweb/gitweb.cgi?a=blobdiff_plain;f=tasks.py;h=0ab18233a72e27b5b0b02e96a21abd5009d37ad6;hb=ce969e8372fb161d86be29042a20b044ee6efe2a;hp=c7348d50653cbb58ace6e040bf861b7028513e9e;hpb=6f61f9438799d65c980726e28546f8775bf83a60;p=picoclvr.git diff --git a/tasks.py b/tasks.py index c7348d5..0ab1823 100755 --- a/tasks.py +++ b/tasks.py @@ -1429,7 +1429,7 @@ class Grid(Task): def tensorize(self, descr): token_descr = [s.strip().split(" ") for s in descr] l = max([len(s) for s in token_descr]) - token_descr = [s + [""] * (l - len(s)) for s in token_descr] + token_descr = [s + ["#"] * (l - len(s)) for s in token_descr] id_descr = [[self.token2id[u] for u in s] for s in token_descr] return torch.tensor(id_descr, device=self.device) @@ -1440,7 +1440,7 @@ class Grid(Task): # trim all the tensors in the tuple z to remove as much token from # left and right in the first tensor. If z is a tuple, all its # elements are trimed according to the triming for the first - def trim(self, z, token=""): + def trim(self, z, token="#"): n = self.token2id[token] if type(z) == tuple: x = z[0] @@ -1483,7 +1483,7 @@ class Grid(Task): ) # Build the tokenizer - tokens = {} + tokens = set() for d in [self.train_descr, self.test_descr]: for s in d: for t in s.strip().split(" "): @@ -1492,10 +1492,10 @@ class Grid(Task): # the same descr tokens = list(tokens) tokens.sort() - tokens = [""] + tokens + tokens = ["#"] + tokens self.token2id = dict([(t, n) for n, t in enumerate(tokens)]) self.id2token = dict([(n, t) for n, t in enumerate(tokens)]) - self.t_nul = self.token2id[""] + self.t_nul = self.token2id["#"] self.t_true = self.token2id[""] self.t_false = self.token2id[""]