20241026-19:07:25 argv ./main.py --nb_models=1 --gpus=0 --model=352M --batch_size=5 --eval_batch_size=5 --result_dir=results_culture_352M --learning_rate=2e-4 20241026-19:07:25 args.log_filename train.log 20241026-19:07:25 args.result_dir results_culture_352M 20241026-19:07:25 args.seed 0 20241026-19:07:25 args.resume False 20241026-19:07:25 args.optimizer adam 20241026-19:07:25 args.nb_warmup_samples 5000 20241026-19:07:25 args.nb_epochs 10000 20241026-19:07:25 args.batch_size 5 20241026-19:07:25 args.train_batch_size None 20241026-19:07:25 args.eval_batch_size 5 20241026-19:07:25 args.nb_train_samples 50000 20241026-19:07:25 args.nb_test_samples 2500 20241026-19:07:25 args.nb_c_quizzes 5000 20241026-19:07:25 args.c_quiz_multiplier 1 20241026-19:07:25 args.learning_rate 0.0002 20241026-19:07:25 args.gradient_clipping None 20241026-19:07:25 args.nb_have_to_be_correct 3 20241026-19:07:25 args.nb_have_to_be_wrong 1 20241026-19:07:25 args.nb_mistakes_to_be_wrong 5 20241026-19:07:25 args.model_type standard 20241026-19:07:25 args.model 352M 20241026-19:07:25 args.dim_model 1024 20241026-19:07:25 args.dim_keys 64 20241026-19:07:25 args.dim_hidden 2048 20241026-19:07:25 args.nb_heads 8 20241026-19:07:25 args.nb_blocks 48 20241026-19:07:25 args.dropout 0.5 20241026-19:07:25 args.nb_threads 1 20241026-19:07:25 args.gpus 0 20241026-19:07:25 args.nb_models 1 20241026-19:07:25 args.diffusion_nb_iterations 25 20241026-19:07:25 args.diffusion_proba_corruption 0.05 20241026-19:07:25 args.accuracy_to_make_c_quizzes 0.95 20241026-19:07:25 args.proba_prompt_noise 0.05 20241026-19:07:25 args.proba_hint 0.25 20241026-19:07:25 args.quizzes None 20241026-19:07:25 args.test None 20241026-19:07:25 args.grids_world_tasks replace_color,translate,grow,frame 20241026-19:07:39 main_device cuda:0 gpus ['cuda:0'] 20241026-19:07:39 vocabulary_size 11 20241026-19:07:40 nb_parameters 352699403 (352M) 20241026-19:07:40 wrote state.pth 20241026-19:07:40 --- epoch 0 ---------------------------------------- 20241026-19:07:40 current_test_accuracies 0.0000 20241026-19:07:40 no_c_quiz 20241026-19:07:40 weakest_accuracies [0.0] 20241026-19:07:53 quiz_set nb_w_quizzes 50000 nb_c_quizzes 0 20241026-19:24:50 train_loss 0 model 0 0.545583251953125 20241026-19:24:50 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-19:25:06 test_loss 0 model 0 0.119640625 20241026-19:25:06 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-19:25:24 test_accuracy 0 model 0 nb_correct 56 / 2500 (2.24%) 20241026-19:25:24 quiz_set nb_w_quizzes 150 nb_c_quizzes 0 20241026-19:25:55 wrote ae_*.pth 20241026-19:25:55 epoch_duration 18min14s next_finish 19:44:10 20241026-19:25:55 wrote state.pth 20241026-19:25:55 --- epoch 1 ---------------------------------------- 20241026-19:25:55 current_test_accuracies 0.0224 20241026-19:25:55 no_c_quiz 20241026-19:25:55 weakest_accuracies [0.0224] 20241026-19:25:55 quiz_set nb_w_quizzes 50000 nb_c_quizzes 0 20241026-19:44:19 train_loss 1 model 0 0.11347545166015625 20241026-19:44:19 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-19:44:35 test_loss 1 model 0 0.07401565551757812 20241026-19:44:35 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-19:44:51 test_accuracy 1 model 0 nb_correct 637 / 2500 (25.48%) 20241026-19:44:51 quiz_set nb_w_quizzes 150 nb_c_quizzes 0 20241026-19:45:21 wrote ae_*.pth 20241026-19:45:21 epoch_duration 19min26s next_finish 20:04:48 20241026-19:45:21 wrote state.pth 20241026-19:45:21 --- epoch 2 ---------------------------------------- 20241026-19:45:21 current_test_accuracies 0.2548 20241026-19:45:21 no_c_quiz 20241026-19:45:21 weakest_accuracies [0.2548] 20241026-19:45:22 quiz_set nb_w_quizzes 50000 nb_c_quizzes 0 20241026-20:03:47 train_loss 2 model 0 0.0843360122680664 20241026-20:03:47 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-20:04:03 test_loss 2 model 0 0.05526161193847656 20241026-20:04:03 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-20:04:19 test_accuracy 2 model 0 nb_correct 929 / 2500 (37.16%) 20241026-20:04:19 quiz_set nb_w_quizzes 150 nb_c_quizzes 0 20241026-20:04:49 wrote ae_*.pth 20241026-20:04:49 epoch_duration 19min27s next_finish 20:24:17 20241026-20:04:49 wrote state.pth 20241026-20:04:49 --- epoch 3 ---------------------------------------- 20241026-20:04:49 current_test_accuracies 0.3716 20241026-20:04:49 no_c_quiz 20241026-20:04:49 weakest_accuracies [0.3716] 20241026-20:04:49 quiz_set nb_w_quizzes 50000 nb_c_quizzes 0 20241026-20:23:17 train_loss 3 model 0 0.07174256896972656 20241026-20:23:17 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-20:23:34 test_loss 3 model 0 0.048340813636779785 20241026-20:23:34 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-20:23:50 test_accuracy 3 model 0 nb_correct 1225 / 2500 (49.00%) 20241026-20:23:50 quiz_set nb_w_quizzes 150 nb_c_quizzes 0 20241026-20:24:20 wrote ae_*.pth 20241026-20:24:20 epoch_duration 19min30s next_finish 20:43:50 20241026-20:24:20 wrote state.pth 20241026-20:24:20 --- epoch 4 ---------------------------------------- 20241026-20:24:20 current_test_accuracies 0.4900 20241026-20:24:20 no_c_quiz 20241026-20:24:20 weakest_accuracies [0.49] 20241026-20:24:20 quiz_set nb_w_quizzes 50000 nb_c_quizzes 0 20241026-20:43:00 train_loss 4 model 0 0.0622249698638916 20241026-20:43:00 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-20:43:15 test_loss 4 model 0 0.0432762336730957 20241026-20:43:15 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-20:43:32 test_accuracy 4 model 0 nb_correct 1427 / 2500 (57.08%) 20241026-20:43:32 quiz_set nb_w_quizzes 150 nb_c_quizzes 0 20241026-20:44:02 wrote ae_*.pth 20241026-20:44:02 epoch_duration 19min42s next_finish 21:03:45 20241026-20:44:02 wrote state.pth 20241026-20:44:02 --- epoch 5 ---------------------------------------- 20241026-20:44:02 current_test_accuracies 0.5708 20241026-20:44:02 no_c_quiz 20241026-20:44:02 weakest_accuracies [0.5708] 20241026-20:44:02 quiz_set nb_w_quizzes 50000 nb_c_quizzes 0 20241026-21:02:25 train_loss 5 model 0 0.0573727952003479 20241026-21:02:25 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-21:02:42 test_loss 5 model 0 0.03737013578414917 20241026-21:02:42 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-21:02:59 test_accuracy 5 model 0 nb_correct 1456 / 2500 (58.24%) 20241026-21:02:59 quiz_set nb_w_quizzes 150 nb_c_quizzes 0 20241026-21:03:29 wrote ae_*.pth 20241026-21:03:29 epoch_duration 19min26s next_finish 21:22:55 20241026-21:03:29 wrote state.pth 20241026-21:03:29 --- epoch 6 ---------------------------------------- 20241026-21:03:29 current_test_accuracies 0.5824 20241026-21:03:29 no_c_quiz 20241026-21:03:29 weakest_accuracies [0.5824] 20241026-21:03:29 quiz_set nb_w_quizzes 50000 nb_c_quizzes 0 20241026-21:21:55 train_loss 6 model 0 0.052121820497512815 20241026-21:21:55 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-21:22:11 test_loss 6 model 0 0.038954888343811035 20241026-21:22:11 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-21:22:27 test_accuracy 6 model 0 nb_correct 1540 / 2500 (61.60%) 20241026-21:22:27 quiz_set nb_w_quizzes 150 nb_c_quizzes 0 20241026-21:22:56 wrote ae_*.pth 20241026-21:22:56 epoch_duration 19min26s next_finish 21:42:22 20241026-21:22:56 wrote state.pth 20241026-21:22:56 --- epoch 7 ---------------------------------------- 20241026-21:22:56 current_test_accuracies 0.6160 20241026-21:22:56 no_c_quiz 20241026-21:22:56 weakest_accuracies [0.616] 20241026-21:22:56 quiz_set nb_w_quizzes 50000 nb_c_quizzes 0 20241026-21:41:19 train_loss 7 model 0 0.04776791572570801 20241026-21:41:19 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-21:41:35 test_loss 7 model 0 0.03266408491134644 20241026-21:41:35 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-21:41:51 test_accuracy 7 model 0 nb_correct 1716 / 2500 (68.64%) 20241026-21:41:51 quiz_set nb_w_quizzes 150 nb_c_quizzes 0 20241026-21:42:21 wrote ae_*.pth 20241026-21:42:21 epoch_duration 19min25s next_finish 22:01:46 20241026-21:42:21 wrote state.pth 20241026-21:42:21 --- epoch 8 ---------------------------------------- 20241026-21:42:21 current_test_accuracies 0.6864 20241026-21:42:21 no_c_quiz 20241026-21:42:21 weakest_accuracies [0.6864] 20241026-21:42:21 quiz_set nb_w_quizzes 50000 nb_c_quizzes 0 20241026-22:00:45 train_loss 8 model 0 0.04374433674812317 20241026-22:00:45 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-22:01:01 test_loss 8 model 0 0.029519215166568757 20241026-22:01:01 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-22:01:17 test_accuracy 8 model 0 nb_correct 1944 / 2500 (77.76%) 20241026-22:01:17 quiz_set nb_w_quizzes 150 nb_c_quizzes 0 20241026-22:01:46 wrote ae_*.pth 20241026-22:01:46 epoch_duration 19min25s next_finish 22:21:11 20241026-22:01:46 wrote state.pth 20241026-22:01:46 --- epoch 9 ---------------------------------------- 20241026-22:01:46 current_test_accuracies 0.7776 20241026-22:01:46 no_c_quiz 20241026-22:01:46 weakest_accuracies [0.7776] 20241026-22:01:46 quiz_set nb_w_quizzes 50000 nb_c_quizzes 0 20241026-22:20:13 train_loss 9 model 0 0.040843696904182436 20241026-22:20:14 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-22:20:30 test_loss 9 model 0 0.028333854854106902 20241026-22:20:30 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-22:20:46 test_accuracy 9 model 0 nb_correct 2057 / 2500 (82.28%) 20241026-22:20:46 quiz_set nb_w_quizzes 150 nb_c_quizzes 0 20241026-22:21:14 wrote ae_*.pth 20241026-22:21:14 epoch_duration 19min27s next_finish 22:40:42 20241026-22:21:14 wrote state.pth 20241026-22:21:14 --- epoch 10 ---------------------------------------- 20241026-22:21:14 current_test_accuracies 0.8228 20241026-22:21:14 no_c_quiz 20241026-22:21:14 weakest_accuracies [0.8228] 20241026-22:21:14 quiz_set nb_w_quizzes 50000 nb_c_quizzes 0 20241026-22:39:41 train_loss 10 model 0 0.038202108073234556 20241026-22:39:41 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-22:39:57 test_loss 10 model 0 0.02720927083492279 20241026-22:39:57 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-22:40:14 test_accuracy 10 model 0 nb_correct 2180 / 2500 (87.20%) 20241026-22:40:14 quiz_set nb_w_quizzes 150 nb_c_quizzes 0 20241026-22:40:43 wrote ae_*.pth 20241026-22:40:43 epoch_duration 19min28s next_finish 23:00:11 20241026-22:40:43 wrote state.pth 20241026-22:40:43 --- epoch 11 ---------------------------------------- 20241026-22:40:43 current_test_accuracies 0.8720 20241026-22:40:43 no_c_quiz 20241026-22:40:43 weakest_accuracies [0.872] 20241026-22:40:43 quiz_set nb_w_quizzes 50000 nb_c_quizzes 0 20241026-22:59:14 train_loss 11 model 0 0.0375050094217062 20241026-22:59:14 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-22:59:30 test_loss 11 model 0 0.027711262971162796 20241026-22:59:30 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-22:59:46 test_accuracy 11 model 0 nb_correct 2259 / 2500 (90.36%) 20241026-22:59:46 quiz_set nb_w_quizzes 150 nb_c_quizzes 0 20241026-23:00:17 wrote ae_*.pth 20241026-23:00:17 epoch_duration 19min33s next_finish 23:19:50 20241026-23:00:17 wrote state.pth 20241026-23:00:17 --- epoch 12 ---------------------------------------- 20241026-23:00:17 current_test_accuracies 0.9036 20241026-23:00:17 no_c_quiz 20241026-23:00:17 weakest_accuracies [0.9036] 20241026-23:00:17 quiz_set nb_w_quizzes 50000 nb_c_quizzes 0 20241026-23:18:45 train_loss 12 model 0 0.034961151081323624 20241026-23:18:45 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-23:19:01 test_loss 12 model 0 0.024840730883181096 20241026-23:19:01 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-23:19:17 test_accuracy 12 model 0 nb_correct 2347 / 2500 (93.88%) 20241026-23:19:17 quiz_set nb_w_quizzes 150 nb_c_quizzes 0 20241026-23:19:46 wrote ae_*.pth 20241026-23:19:46 epoch_duration 19min29s next_finish 23:39:15 20241026-23:19:46 wrote state.pth 20241026-23:19:46 --- epoch 13 ---------------------------------------- 20241026-23:19:46 current_test_accuracies 0.9388 20241026-23:19:46 no_c_quiz 20241026-23:19:46 weakest_accuracies [0.9388] 20241026-23:19:46 quiz_set nb_w_quizzes 50000 nb_c_quizzes 0 20241026-23:38:14 train_loss 13 model 0 0.03403147121071815 20241026-23:38:14 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-23:38:30 test_loss 13 model 0 0.024604608163237573 20241026-23:38:30 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-23:38:46 test_accuracy 13 model 0 nb_correct 2358 / 2500 (94.32%) 20241026-23:38:46 quiz_set nb_w_quizzes 150 nb_c_quizzes 0 20241026-23:39:15 wrote ae_*.pth 20241026-23:39:15 epoch_duration 19min28s next_finish 23:58:44 20241026-23:39:15 wrote state.pth 20241026-23:39:15 --- epoch 14 ---------------------------------------- 20241026-23:39:15 current_test_accuracies 0.9432 20241026-23:39:15 no_c_quiz 20241026-23:39:15 weakest_accuracies [0.9432] 20241026-23:39:15 quiz_set nb_w_quizzes 50000 nb_c_quizzes 0 20241026-23:57:43 train_loss 14 model 0 0.03268781835436821 20241026-23:57:43 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-23:57:59 test_loss 14 model 0 0.031215623617172242 20241026-23:57:59 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241026-23:58:16 test_accuracy 14 model 0 nb_correct 2320 / 2500 (92.80%) 20241026-23:58:16 quiz_set nb_w_quizzes 150 nb_c_quizzes 0 20241026-23:58:46 wrote ae_*.pth 20241026-23:58:46 epoch_duration 19min30s next_finish 00:18:16 20241026-23:58:46 wrote state.pth 20241026-23:58:46 --- epoch 15 ---------------------------------------- 20241026-23:58:46 current_test_accuracies 0.9280 20241026-23:58:46 no_c_quiz 20241026-23:58:46 weakest_accuracies [0.928] 20241026-23:58:46 quiz_set nb_w_quizzes 50000 nb_c_quizzes 0 20241027-00:17:11 train_loss 15 model 0 0.031516154500842095 20241027-00:17:11 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241027-00:17:27 test_loss 15 model 0 0.025180682331323624 20241027-00:17:27 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241027-00:17:43 test_accuracy 15 model 0 nb_correct 2357 / 2500 (94.28%) 20241027-00:17:43 quiz_set nb_w_quizzes 150 nb_c_quizzes 0 20241027-00:18:11 wrote ae_*.pth 20241027-00:18:11 epoch_duration 19min25s next_finish 00:37:37 20241027-00:18:11 wrote state.pth 20241027-00:18:11 --- epoch 16 ---------------------------------------- 20241027-00:18:11 current_test_accuracies 0.9428 20241027-00:18:11 no_c_quiz 20241027-00:18:11 weakest_accuracies [0.9428] 20241027-00:18:11 quiz_set nb_w_quizzes 50000 nb_c_quizzes 0 20241027-00:36:36 train_loss 16 model 0 0.03089874274432659 20241027-00:36:36 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241027-00:36:52 test_loss 16 model 0 0.024968827426433565 20241027-00:36:52 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241027-00:37:08 test_accuracy 16 model 0 nb_correct 2293 / 2500 (91.72%) 20241027-00:37:08 quiz_set nb_w_quizzes 150 nb_c_quizzes 0 20241027-00:37:37 wrote ae_*.pth 20241027-00:37:37 epoch_duration 19min25s next_finish 00:57:02 20241027-00:37:37 wrote state.pth 20241027-00:37:37 --- epoch 17 ---------------------------------------- 20241027-00:37:37 current_test_accuracies 0.9172 20241027-00:37:37 no_c_quiz 20241027-00:37:37 weakest_accuracies [0.9172] 20241027-00:37:37 quiz_set nb_w_quizzes 50000 nb_c_quizzes 0 20241027-00:56:08 train_loss 17 model 0 0.030611307680606843 20241027-00:56:08 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241027-00:56:25 test_loss 17 model 0 0.02509162476658821 20241027-00:56:25 quiz_set nb_w_quizzes 2500 nb_c_quizzes 0 20241027-00:56:41 test_accuracy 17 model 0 nb_correct 2395 / 2500 (95.80%) 20241027-00:56:41 quiz_set nb_w_quizzes 150 nb_c_quizzes 0 20241027-00:57:10 wrote ae_*.pth 20241027-00:57:10 epoch_duration 19min33s next_finish 01:16:43 20241027-00:57:10 wrote state.pth 20241027-00:57:10 --- epoch 18 ---------------------------------------- 20241027-00:57:10 current_test_accuracies 0.9580 20241027-00:57:14 wrote ae_*_naive.pth