def tensorize(self, descr):
    """Convert text descriptions into a right-padded tensor of token ids.

    Each description is stripped, split on single spaces, padded at the
    end with the '<nul>' token up to the length of the longest
    description in the batch, and mapped through ``self.token2id``.

    Args:
        descr: iterable of strings, one description per entry.

    Returns:
        A 2-D tensor on ``self.device`` with one row per description and
        one column per token position. An empty ``descr`` yields an
        empty tensor of shape (0, 0) with dtype int64.

    Raises:
        Whatever ``self.token2id[...]`` raises for a token it does not
        contain (KeyError if it is a plain dict -- TODO confirm).
    """
    token_descr = [s.strip().split(' ') for s in descr]

    if not token_descr:
        # max() over an empty sequence raises ValueError; return an empty
        # batch instead so callers can pass an empty list safely.
        return torch.empty((0, 0), dtype=torch.int64, device=self.device)

    width = max(len(s) for s in token_descr)

    # Padding goes on the right: real tokens first, then '<nul>' fillers.
    token_descr = [s + ['<nul>'] * (width - len(s)) for s in token_descr]
    id_descr = [[self.token2id[u] for u in s] for s in token_descr]
    return torch.tensor(id_descr, device=self.device)
self.device = device
nb = args.data_size if args.data_size > 0 else 250000
+ log_string(f'generating {nb} samples (can take some time)')
self.train_descr = generate_descr((nb * 4) // 5)
self.test_descr = generate_descr((nb * 1) // 5)