######################################################################
+
+def str2bool(x):
+    x = x.lower()
+    if x in {"1", "true", "yes"}:
+        return True
+    elif x in {"0", "false", "no"}:
+        return False
+    else:
+        raise ValueError
+
+
parser = argparse.ArgumentParser(
    description="An implementation of GPT with cache.",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# legacy
-parser.add_argument("--legacy_lr_schedule", action="store_true", default=False)
+parser.add_argument("--legacy_lr_schedule", type=str2bool, default=True)
parser.add_argument("--legacy_large_lr", type=float, default=1e-4)
fb_body = fb_body.cumsum(dim=2)
fb_start = fb_start * (fb_body == 1)
-# pick past starting source times
-src_time = (
-    fb_start
+# pick a past starting source time a whole number of blocks back:
+# t_s = t0 - (1 + floor(R * (t0 // CL - 1))) * CL, with R uniform in [0, 1)
+
+t = torch.arange(fb_start.size(2), device=fb_start.device)[None, None, :]
+src_time = fb_start * (
+    t
+    - CL
    * (
-        torch.rand(fb_start.size(), device=fb_start.device)
-        * (torch.arange(fb_start.size(2), device=fb_start.device) - CL)[
-            None, None, :
-        ]
-    ).long()
+        1
+        + (
+            torch.rand(fb_start.size(), device=fb_start.device) * (t // CL - 1)
+        ).long()
+    )
)
src_time[:, :, CL:] -= src_time.clone()[:, :, :-CL]
src_time = src_time.cumsum(dim=2)
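# Sketch, not part of the patch (the CL value, shapes, and fb_start contents are
# assumptions): a toy run of the sampling above for one sequence with a single
# start flagged at position 10. With CL = 3, t // CL - 1 = 2 there, so the sampled
# offset is 1 or 2 blocks and src_time[0, 0, 10] comes out as 7 or 4, i.e. a whole
# number of blocks behind the start and never inside the start's own block.
import torch

CL = 3
fb_start = torch.zeros(1, 1, 12)
fb_start[0, 0, 10] = 1
t = torch.arange(fb_start.size(2))[None, None, :]
src_time = fb_start * (
    t - CL * (1 + (torch.rand(fb_start.size()) * (t // CL - 1)).long())
)
assert src_time[0, 0, 10].item() in (4.0, 7.0)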