parser.add_argument("--rho", type=float, default=0.0)
-parser.add_argument("--dim_rec_v", type=int, default=None)
-
parser.add_argument("--nb_blocks", type=int, default=None)
parser.add_argument("--dropout", type=float, default=0.1)
"dim_keys": 32,
"dim_hidden": 32,
"nb_heads": 2,
- "dim_rec_v": 16,
"nb_blocks": 2,
},
"17K-C": {
"nb_heads": 2,
"nb_lines": 16,
"caterpillar_height": 4,
- "dim_rec_v": 16,
"nb_blocks": 2,
},
"4M": {
"dim_keys": 32,
"dim_hidden": 1024,
"nb_heads": 4,
- "dim_rec_v": 64,
"nb_blocks": 6,
},
"4M-C": {
"nb_heads": 4,
"nb_lines": 32,
"caterpillar_height": 4,
- "dim_rec_v": 64, # dim_model / nb_heads
"nb_blocks": 6,
},
"37M": {
"dim_keys": 64,
"dim_hidden": 2048,
"nb_heads": 8,
- "dim_rec_v": 64,
"nb_blocks": 12,
},
"37M-C": {
"nb_heads": 8,
"nb_lines": 256,
"caterpillar_height": 32,
- "dim_rec_v": 64,
"nb_blocks": 12,
},
"122M": {
"dim_keys": 64,
"dim_hidden": 2048,
"nb_heads": 8,
- "dim_rec_v": 96,
"nb_blocks": 24,
},
"122M-C": {
"dim_hidden": 2048,
"nb_heads": 8,
"nb_lines": 128,
- "dim_rec_v": 96,
"nb_blocks": 24,
},
"352M": {
"dim_keys": 64,
"dim_hidden": 2048,
"nb_heads": 8,
- "dim_rec_v": 128,
"nb_blocks": 48,
},
"352M-C": {
"dim_hidden": 2048,
"nb_heads": 8,
"nb_lines": 128,
- "dim_rec_v": 128,
"nb_blocks": 48,
},
}
nb_heads=args.nb_heads,
nb_lines=args.nb_lines,
caterpillar_height=args.caterpillar_height,
- dim_rec_v=args.dim_rec_v,
nb_blocks=args.nb_blocks,
causal=True,
dropout=args.dropout,