20240715-16:27:53 argv ./main.py --result_dir=results_grids_v6 20240715-16:27:53 args.log_filename train.log 20240715-16:27:53 args.result_dir results_grids_v6 20240715-16:27:53 args.seed 0 20240715-16:27:53 args.resume False 20240715-16:27:53 args.max_percents_of_test_in_train -1 20240715-16:27:53 args.nb_epochs 10000 20240715-16:27:53 args.batch_size 25 20240715-16:27:53 args.physical_batch_size None 20240715-16:27:53 args.nb_train_samples 100000 20240715-16:27:53 args.nb_test_samples 10000 20240715-16:27:53 args.nb_new_c_quizzes_for_train None 20240715-16:27:53 args.nb_new_c_quizzes_for_test None 20240715-16:27:53 args.learning_rate 0.0005 20240715-16:27:53 args.model 37M 20240715-16:27:53 args.dim_model 512 20240715-16:27:53 args.dim_keys 64 20240715-16:27:53 args.dim_hidden 2048 20240715-16:27:53 args.nb_heads 8 20240715-16:27:53 args.nb_blocks 12 20240715-16:27:53 args.dropout 0.1 20240715-16:27:53 args.deterministic_synthesis False 20240715-16:27:53 args.problem grids 20240715-16:27:53 args.nb_threads 1 20240715-16:27:53 args.gpus all 20240715-16:27:53 args.nb_gpts 5 20240715-16:27:53 args.accuracy_to_make_c_quizzes 0.9 20240715-16:27:53 args.proba_understands 0.9 20240715-16:27:53 args.proba_not_understands 0.5 20240715-16:27:53 args.generation_temperature 2 20240715-16:27:53 args.dirty_debug False 20240715-16:27:53 args.grids_tasks None 20240715-16:27:53 args.sky_height 6 20240715-16:27:53 args.sky_width 8 20240715-16:27:53 args.sky_nb_birds 3 20240715-16:27:53 args.sky_nb_iterations 2 20240715-16:27:53 args.sky_speed 3 20240715-16:28:15 main_device cuda:0 gpus ['cuda:0', 'cuda:1'] 20240715-16:28:15 vocabulary_size 13 20240715-16:28:15 creating model 0 and its w_quizzes 20240715-16:29:53 creating model 1 and its w_quizzes 20240715-16:31:39 creating model 2 and its w_quizzes 20240715-16:33:24 creating model 3 and its w_quizzes 20240715-16:35:10 creating model 4 and its w_quizzes 20240715-16:36:58 nb_parameters 37817357 (37M) 20240715-16:36:59 nb_new_c_quizzes_for_train 1000 nb_new_c_quizzes_for_test 100 20240715-16:36:59 --- epoch 0 ---------------------------------------- 20240715-16:36:59 current_test_accuracies 0.0000 0.0000 0.0000 0.0000 0.0000 20240715-16:36:59 training model 0 20240715-16:36:59 training model 1 20240715-16:45:55 train_perplexity 0 model 1 1.6962359633661162 20240715-16:46:18 train_perplexity 0 model 0 1.6927451994906717 20240715-16:46:25 test_perplexity 0 model 1 1.270926889772941 20240715-16:46:41 test_perplexity 0 model 0 1.2698948983866976 20240715-16:49:34 test_accuracy 0 model 1 forward 11 / 493 backward 3 / 507 20240715-16:49:34 main_test_accuracy 0 0.014000000432133675 20240715-16:49:38 test_accuracy 0 model 0 forward 7 / 484 backward 2 / 516 20240715-16:49:38 main_test_accuracy 0 0.009000000543892384 20240715-16:49:39 wrote gpt_000.pth 20240715-16:49:39 wrote gpt_001.pth 20240715-16:49:48 --- epoch 1 ---------------------------------------- 20240715-16:49:48 current_test_accuracies 0.0090 0.0140 0.0000 0.0000 0.0000 20240715-16:49:48 training model 2 20240715-16:49:48 training model 3 20240715-16:58:43 train_perplexity 1 model 3 1.7657245135749406 20240715-16:59:00 train_perplexity 1 model 2 1.706454500319667 20240715-16:59:15 test_perplexity 1 model 3 1.2697066081658515 20240715-16:59:27 test_perplexity 1 model 2 1.2656771422226274 20240715-17:02:20 test_accuracy 1 model 3 forward 9 / 506 backward 1 / 494 20240715-17:02:20 main_test_accuracy 1 0.010000000707805157 20240715-17:02:23 test_accuracy 1 model 2 forward 12 / 475 backward 3 / 525 20240715-17:02:23 main_test_accuracy 1 0.015000000596046448 20240715-17:02:24 wrote gpt_002.pth 20240715-17:02:25 wrote gpt_003.pth 20240715-17:02:33 --- epoch 2 ---------------------------------------- 20240715-17:02:33 current_test_accuracies 0.0090 0.0140 0.0150 0.0100 0.0000 20240715-17:02:33 training model 4 20240715-17:02:33 training model 0 20240715-17:11:28 train_perplexity 2 model 0 1.2411058885127244 20240715-17:11:46 train_perplexity 2 model 4 1.7647064072016874 20240715-17:12:00 test_perplexity 2 model 0 1.2038803750646887 20240715-17:12:12 test_perplexity 2 model 4 1.2727349838165485 20240715-17:15:07 test_accuracy 2 model 0 forward 151 / 484 backward 77 / 516 20240715-17:15:07 main_test_accuracy 2 0.22800001502037048 20240715-17:15:09 test_accuracy 2 model 4 forward 12 / 528 backward 2 / 472 20240715-17:15:09 main_test_accuracy 2 0.014000000432133675 20240715-17:15:10 wrote gpt_004.pth 20240715-17:15:10 wrote gpt_000.pth 20240715-17:15:18 --- epoch 3 ---------------------------------------- 20240715-17:15:18 current_test_accuracies 0.2280 0.0140 0.0150 0.0100 0.0140 20240715-17:15:18 training model 3 20240715-17:15:18 training model 1 20240715-17:24:13 train_perplexity 3 model 1 1.2486546144231132 20240715-17:24:40 train_perplexity 3 model 3 1.2484658028865774 20240715-17:24:43 test_perplexity 3 model 1 1.2075229591652603 20240715-17:25:01 test_perplexity 3 model 3 1.204365996909972 20240715-17:27:55 test_accuracy 3 model 1 forward 120 / 493 backward 62 / 507 20240715-17:27:55 main_test_accuracy 3 0.18200001120567322 20240715-17:27:59 test_accuracy 3 model 3 forward 96 / 506 backward 44 / 494 20240715-17:27:59 main_test_accuracy 3 0.14000000059604645 20240715-17:28:00 wrote gpt_003.pth 20240715-17:28:00 wrote gpt_001.pth 20240715-17:28:09 --- epoch 4 ---------------------------------------- 20240715-17:28:09 current_test_accuracies 0.2280 0.1820 0.0150 0.1400 0.0140 20240715-17:28:09 training model 4 20240715-17:28:09 training model 2 20240715-17:37:03 train_perplexity 4 model 2 1.2477534195515345 20240715-17:37:31 train_perplexity 4 model 4 1.249737868340399 20240715-17:37:33 test_perplexity 4 model 2 1.2057924259793467 20240715-17:37:52 test_perplexity 4 model 4 1.2087076547709465 20240715-17:40:44 test_accuracy 4 model 2 forward 79 / 475 backward 25 / 525 20240715-17:40:44 main_test_accuracy 4 0.10400000214576721 20240715-17:40:47 test_accuracy 4 model 4 forward 113 / 528 backward 42 / 472 20240715-17:40:47 main_test_accuracy 4 0.1550000011920929 20240715-17:40:48 wrote gpt_004.pth 20240715-17:40:48 wrote gpt_002.pth 20240715-17:40:57 --- epoch 5 ---------------------------------------- 20240715-17:40:57 current_test_accuracies 0.2280 0.1820 0.1040 0.1400 0.1550 20240715-17:40:57 training model 2 20240715-17:40:57 training model 3 20240715-17:49:52 train_perplexity 5 model 3 1.2024429315920526 20240715-17:50:17 train_perplexity 5 model 2 1.2046458259288255 20240715-17:50:22 test_perplexity 5 model 3 1.1904229676303073 20240715-17:50:39 test_perplexity 5 model 2 1.1865104530664998 20240715-17:53:30 test_accuracy 5 model 3 forward 199 / 506 backward 101 / 494 20240715-17:53:30 main_test_accuracy 5 0.30000001192092896 20240715-17:53:35 test_accuracy 5 model 2 forward 175 / 475 backward 161 / 525 20240715-17:53:35 main_test_accuracy 5 0.3360000252723694 20240715-17:53:36 wrote gpt_002.pth 20240715-17:53:36 wrote gpt_003.pth 20240715-17:53:45 --- epoch 6 ---------------------------------------- 20240715-17:53:45 current_test_accuracies 0.2280 0.1820 0.3360 0.3000 0.1550 20240715-17:53:45 training model 4 20240715-17:53:45 training model 1 20240715-18:02:40 train_perplexity 6 model 1 1.2024552353687328 20240715-18:03:07 train_perplexity 6 model 4 1.2047341990618097 20240715-18:03:10 test_perplexity 6 model 1 1.1872974373297007 20240715-18:03:28 test_perplexity 6 model 4 1.1887139678613055 20240715-18:06:21 test_accuracy 6 model 1 forward 221 / 493 backward 142 / 507 20240715-18:06:21 main_test_accuracy 6 0.3630000054836273 20240715-18:06:23 test_accuracy 6 model 4 forward 211 / 528 backward 84 / 472 20240715-18:06:23 main_test_accuracy 6 0.29500001668930054 20240715-18:06:25 wrote gpt_004.pth 20240715-18:06:25 wrote gpt_001.pth 20240715-18:06:34 --- epoch 7 ---------------------------------------- 20240715-18:06:34 current_test_accuracies 0.2280 0.3630 0.3360 0.3000 0.2950 20240715-18:06:34 training model 0 20240715-18:06:34 training model 4 20240715-18:15:29 train_perplexity 7 model 4 1.1899275702106074 20240715-18:15:55 train_perplexity 7 model 0 1.2006797793766903 20240715-18:15:59 test_perplexity 7 model 4 1.1804004427554906 20240715-18:16:17 test_perplexity 7 model 0 1.1861947857387896 20240715-18:19:08 test_accuracy 7 model 4 forward 284 / 528 backward 156 / 472 20240715-18:19:08 main_test_accuracy 7 0.4400000274181366 20240715-18:19:14 test_accuracy 7 model 0 forward 220 / 484 backward 132 / 516 20240715-18:19:14 main_test_accuracy 7 0.35200002789497375 20240715-18:19:15 wrote gpt_000.pth 20240715-18:19:15 wrote gpt_004.pth 20240715-18:19:23 --- epoch 8 ---------------------------------------- 20240715-18:19:23 current_test_accuracies 0.3520 0.3630 0.3360 0.3000 0.4400 20240715-18:19:23 training model 3 20240715-18:19:23 training model 2 20240715-18:28:18 train_perplexity 8 model 2 1.1887773446875023 20240715-18:28:44 train_perplexity 8 model 3 1.1870611081961473 20240715-18:28:48 test_perplexity 8 model 2 1.1791648361746063 20240715-18:29:06 test_perplexity 8 model 3 1.1764893018173475 20240715-18:32:00 test_accuracy 8 model 2 forward 235 / 475 backward 181 / 525 20240715-18:32:00 main_test_accuracy 8 0.41600000858306885 20240715-18:32:03 test_accuracy 8 model 3 forward 265 / 506 backward 202 / 494 20240715-18:32:03 main_test_accuracy 8 0.46700000762939453 20240715-18:32:04 wrote gpt_003.pth 20240715-18:32:05 wrote gpt_002.pth 20240715-18:32:13 --- epoch 9 ---------------------------------------- 20240715-18:32:13 current_test_accuracies 0.3520 0.3630 0.4160 0.4670 0.4400 20240715-18:32:13 training model 0 20240715-18:32:13 training model 1 20240715-18:41:08 train_perplexity 9 model 1 1.1860467570356044 20240715-18:41:37 train_perplexity 9 model 0 1.1863951412152316 20240715-18:41:37 test_perplexity 9 model 1 1.1799994311020128 20240715-18:41:57 test_perplexity 9 model 0 1.1785784018999508 20240715-18:44:49 test_accuracy 9 model 1 forward 277 / 493 backward 183 / 507 20240715-18:44:49 main_test_accuracy 9 0.46000000834465027 20240715-18:44:54 test_accuracy 9 model 0 forward 288 / 484 backward 178 / 516 20240715-18:44:54 main_test_accuracy 9 0.4660000205039978 20240715-18:44:55 wrote gpt_000.pth 20240715-18:44:55 wrote gpt_001.pth 20240715-18:45:03 --- epoch 10 ---------------------------------------- 20240715-18:45:03 current_test_accuracies 0.4660 0.4600 0.4160 0.4670 0.4400 20240715-18:45:03 training model 2 20240715-18:45:03 training model 4 20240715-18:53:59 train_perplexity 10 model 4 1.1814561558080638 20240715-18:54:26 train_perplexity 10 model 2 1.1808929404488853 20240715-18:54:28 test_perplexity 10 model 4 1.174300502385369 20240715-18:54:47 test_perplexity 10 model 2 1.1745098161074348 20240715-18:57:37 test_accuracy 10 model 4 forward 333 / 528 backward 186 / 472 20240715-18:57:37 main_test_accuracy 10 0.5190000534057617 20240715-18:57:43 test_accuracy 10 model 2 forward 292 / 475 backward 204 / 525 20240715-18:57:43 main_test_accuracy 10 0.4960000216960907 20240715-18:57:44 wrote gpt_002.pth 20240715-18:57:44 wrote gpt_004.pth 20240715-18:57:52 --- epoch 11 ---------------------------------------- 20240715-18:57:52 current_test_accuracies 0.4660 0.4600 0.4960 0.4670 0.5190 20240715-18:57:52 training model 1 20240715-18:57:52 training model 0 20240715-19:06:48 train_perplexity 11 model 0 1.1793909567394392 20240715-19:07:12 train_perplexity 11 model 1 1.1805377261447438 20240715-19:07:18 test_perplexity 11 model 0 1.174847118107629 20240715-19:07:35 test_perplexity 11 model 1 1.1749599091565883 20240715-19:10:29 test_accuracy 11 model 0 forward 306 / 484 backward 213 / 516 20240715-19:10:29 main_test_accuracy 11 0.5190000534057617 20240715-19:10:32 test_accuracy 11 model 1 forward 340 / 493 backward 237 / 507 20240715-19:10:32 main_test_accuracy 11 0.5770000219345093 20240715-19:10:34 wrote gpt_001.pth 20240715-19:10:34 wrote gpt_000.pth 20240715-19:10:42 --- epoch 12 ---------------------------------------- 20240715-19:10:42 current_test_accuracies 0.5190 0.5770 0.4960 0.4670 0.5190 20240715-19:10:42 training model 3 20240715-19:10:42 training model 2 20240715-19:19:37 train_perplexity 12 model 2 1.1762755184557174 20240715-19:20:03 train_perplexity 12 model 3 1.179923673323153 20240715-19:20:07 test_perplexity 12 model 2 1.1710831213555972 20240715-19:20:25 test_perplexity 12 model 3 1.1732365592527727 20240715-19:23:19 test_accuracy 12 model 2 forward 311 / 475 backward 249 / 525 20240715-19:23:19 main_test_accuracy 12 0.5600000023841858 20240715-19:23:22 test_accuracy 12 model 3 forward 320 / 506 backward 229 / 494 20240715-19:23:22 main_test_accuracy 12 0.5490000247955322 20240715-19:23:23 wrote gpt_003.pth 20240715-19:23:23 wrote gpt_002.pth 20240715-19:23:32 --- epoch 13 ---------------------------------------- 20240715-19:23:32 current_test_accuracies 0.5190 0.5770 0.5600 0.5490 0.5190 20240715-19:23:32 training model 0 20240715-19:23:32 training model 4 20240715-19:32:27 train_perplexity 13 model 4 1.1769178773560498 20240715-19:32:55 train_perplexity 13 model 0 1.1751063680128422 20240715-19:32:57 test_perplexity 13 model 4 1.1723348168424697 20240715-19:33:16 test_perplexity 13 model 0 1.1711649808396947 20240715-19:36:06 test_accuracy 13 model 4 forward 358 / 528 backward 196 / 472 20240715-19:36:06 main_test_accuracy 13 0.5540000200271606 20240715-19:36:11 test_accuracy 13 model 0 forward 325 / 484 backward 267 / 516 20240715-19:36:11 main_test_accuracy 13 0.5920000076293945 20240715-19:36:13 wrote gpt_000.pth 20240715-19:36:13 wrote gpt_004.pth 20240715-19:36:22 --- epoch 14 ---------------------------------------- 20240715-19:36:22 current_test_accuracies 0.5920 0.5770 0.5600 0.5490 0.5540 20240715-19:36:22 training model 3 20240715-19:36:22 training model 4 20240715-19:45:18 train_perplexity 14 model 4 1.1742961652822685 20240715-19:45:42 train_perplexity 14 model 3 1.1759862217554216 20240715-19:45:48 test_perplexity 14 model 4 1.171107329155257 20240715-19:46:05 test_perplexity 14 model 3 1.1705121622049666 20240715-19:48:56 test_accuracy 14 model 4 forward 377 / 528 backward 233 / 472 20240715-19:48:56 main_test_accuracy 14 0.6100000143051147 20240715-19:49:00 test_accuracy 14 model 3 forward 332 / 506 backward 231 / 494 20240715-19:49:00 main_test_accuracy 14 0.5630000233650208 20240715-19:49:01 wrote gpt_003.pth 20240715-19:49:02 wrote gpt_004.pth 20240715-19:49:10 --- epoch 15 ---------------------------------------- 20240715-19:49:10 current_test_accuracies 0.5920 0.5770 0.5600 0.5630 0.6100 20240715-19:49:10 training model 2 20240715-19:49:10 training model 3 20240715-19:58:06 train_perplexity 15 model 3 1.1724367392902866 20240715-19:58:29 train_perplexity 15 model 2 1.17358006746899 20240715-19:58:36 test_perplexity 15 model 3 1.167631982500388 20240715-19:58:52 test_perplexity 15 model 2 1.1699887626054923 20240715-20:01:45 test_accuracy 15 model 3 forward 363 / 506 backward 262 / 494 20240715-20:01:45 main_test_accuracy 15 0.625 20240715-20:01:49 test_accuracy 15 model 2 forward 333 / 475 backward 285 / 525 20240715-20:01:49 main_test_accuracy 15 0.6180000305175781 20240715-20:01:51 wrote gpt_002.pth 20240715-20:01:51 wrote gpt_003.pth 20240715-20:01:58 --- epoch 16 ---------------------------------------- 20240715-20:01:58 current_test_accuracies 0.5920 0.5770 0.6180 0.6250 0.6100 20240715-20:01:58 training model 1 20240715-20:01:58 training model 0 20240715-20:10:55 train_perplexity 16 model 0 1.172644219434567 20240715-20:11:18 train_perplexity 16 model 1 1.175089031922472 20240715-20:11:25 test_perplexity 16 model 0 1.1696385890109293 20240715-20:11:41 test_perplexity 16 model 1 1.172811510469653 20240715-20:14:36 test_accuracy 16 model 0 forward 355 / 484 backward 303 / 516 20240715-20:14:36 main_test_accuracy 16 0.6580000519752502 20240715-20:14:40 test_accuracy 16 model 1 forward 366 / 493 backward 290 / 507 20240715-20:14:40 main_test_accuracy 16 0.656000018119812 20240715-20:14:41 wrote gpt_001.pth 20240715-20:14:41 wrote gpt_000.pth 20240715-20:14:50 --- epoch 17 ---------------------------------------- 20240715-20:14:50 current_test_accuracies 0.6580 0.6560 0.6180 0.6250 0.6100 20240715-20:14:50 training model 4 20240715-20:14:50 training model 2 20240715-20:23:45 train_perplexity 17 model 2 1.1712131846748688 20240715-20:24:11 train_perplexity 17 model 4 1.1714530071505147 20240715-20:24:15 test_perplexity 17 model 2 1.1683157570856095 20240715-20:24:32 test_perplexity 17 model 4 1.1691343250029471 20240715-20:27:25 test_accuracy 17 model 2 forward 344 / 475 backward 285 / 525 20240715-20:27:25 main_test_accuracy 17 0.6290000081062317 20240715-20:27:27 test_accuracy 17 model 4 forward 393 / 528 backward 240 / 472 20240715-20:27:27 main_test_accuracy 17 0.6330000162124634 20240715-20:27:29 wrote gpt_004.pth 20240715-20:27:29 wrote gpt_002.pth 20240715-20:27:37 --- epoch 18 ---------------------------------------- 20240715-20:27:37 current_test_accuracies 0.6580 0.6560 0.6290 0.6250 0.6330 20240715-20:27:37 training model 3 20240715-20:27:37 training model 2 20240715-20:36:33 train_perplexity 18 model 2 1.1700817668566275 20240715-20:36:58 train_perplexity 18 model 3 1.1705111555845278 20240715-20:37:03 test_perplexity 18 model 2 1.1668778790498655 20240715-20:37:20 test_perplexity 18 model 3 1.166153519205439 20240715-20:40:14 test_accuracy 18 model 2 forward 358 / 475 backward 304 / 525 20240715-20:40:14 main_test_accuracy 18 0.6620000600814819 20240715-20:40:17 test_accuracy 18 model 3 forward 368 / 506 backward 281 / 494 20240715-20:40:17 main_test_accuracy 18 0.6490000486373901 20240715-20:40:18 wrote gpt_003.pth 20240715-20:40:19 wrote gpt_002.pth 20240715-20:40:28 --- epoch 19 ---------------------------------------- 20240715-20:40:28 current_test_accuracies 0.6580 0.6560 0.6620 0.6490 0.6330 20240715-20:40:28 training model 4 20240715-20:40:28 training model 3 20240715-20:49:23 train_perplexity 19 model 3 1.1692673574381276 20240715-20:49:49 train_perplexity 19 model 4 1.1701120163762246 20240715-20:49:53 test_perplexity 19 model 3 1.1649494398341158 20240715-20:50:11 test_perplexity 19 model 4 1.167223843206261 20240715-20:53:01 test_accuracy 19 model 3 forward 392 / 506 backward 290 / 494 20240715-20:53:01 main_test_accuracy 19 0.6820000410079956 20240715-20:53:05 test_accuracy 19 model 4 forward 398 / 528 backward 272 / 472 20240715-20:53:05 main_test_accuracy 19 0.6700000166893005 20240715-20:53:06 wrote gpt_004.pth 20240715-20:53:06 wrote gpt_003.pth 20240715-20:53:15 --- epoch 20 ---------------------------------------- 20240715-20:53:15 current_test_accuracies 0.6580 0.6560 0.6620 0.6820 0.6700 20240715-20:53:15 training model 1 20240715-20:53:15 training model 0 20240715-21:02:11 train_perplexity 20 model 0 1.169896050108601 20240715-21:02:34 train_perplexity 20 model 1 1.17294460842681 20240715-21:02:41 test_perplexity 20 model 0 1.1677458150422304 20240715-21:02:57 test_perplexity 20 model 1 1.1714833241090448 20240715-21:05:51 test_accuracy 20 model 0 forward 353 / 484 backward 325 / 516 20240715-21:05:51 main_test_accuracy 20 0.6780000329017639 20240715-21:05:54 test_accuracy 20 model 1 forward 380 / 493 backward 292 / 507 20240715-21:05:54 main_test_accuracy 20 0.6720000505447388 20240715-21:05:55 wrote gpt_001.pth 20240715-21:05:56 wrote gpt_000.pth 20240715-21:06:04 --- epoch 21 ---------------------------------------- 20240715-21:06:04 current_test_accuracies 0.6780 0.6720 0.6620 0.6820 0.6700 20240715-21:06:04 training model 2 20240715-21:06:04 training model 4 20240715-21:14:59 train_perplexity 21 model 4 1.1689778574751686 20240715-21:15:22 train_perplexity 21 model 2 1.1680351623012462 20240715-21:15:33 test_perplexity 21 model 4 1.1666195908932457 20240715-21:15:47 test_perplexity 21 model 2 1.1672975064330555 20240715-21:18:40 test_accuracy 21 model 4 forward 414 / 528 backward 280 / 472 20240715-21:18:40 main_test_accuracy 21 0.6940000057220459 20240715-21:18:44 test_accuracy 21 model 2 forward 350 / 475 backward 328 / 525 20240715-21:18:44 main_test_accuracy 21 0.6780000329017639 20240715-21:18:46 wrote gpt_002.pth 20240715-21:18:46 wrote gpt_004.pth 20240715-21:18:54 --- epoch 22 ---------------------------------------- 20240715-21:18:54 current_test_accuracies 0.6780 0.6720 0.6780 0.6820 0.6940 20240715-21:18:54 training model 1 20240715-21:18:54 training model 0 20240715-21:27:50 train_perplexity 22 model 0 1.1685482953865127 20240715-21:28:14 train_perplexity 22 model 1 1.1704900474918538 20240715-21:28:20 test_perplexity 22 model 0 1.166679467986111 20240715-21:28:37 test_perplexity 22 model 1 1.1708365455785255 20240715-21:31:31 test_accuracy 22 model 0 forward 366 / 484 backward 332 / 516 20240715-21:31:31 main_test_accuracy 22 0.6980000138282776 20240715-21:31:35 test_accuracy 22 model 1 forward 377 / 493 backward 305 / 507 20240715-21:31:35 main_test_accuracy 22 0.6820000410079956 20240715-21:31:36 wrote gpt_001.pth 20240715-21:31:37 wrote gpt_000.pth 20240715-21:31:45 --- epoch 23 ---------------------------------------- 20240715-21:31:45 current_test_accuracies 0.6980 0.6820 0.6780 0.6820 0.6940 20240715-21:31:45 training model 2 20240715-21:31:45 training model 1 20240715-21:40:40 train_perplexity 23 model 1 1.168715881157189 20240715-21:41:06 train_perplexity 23 model 2 1.167968844107784 20240715-21:41:11 test_perplexity 23 model 1 1.168378660246657 20240715-21:41:29 test_perplexity 23 model 2 1.163832623425657 20240715-21:44:21 test_accuracy 23 model 1 forward 396 / 493 backward 336 / 507 20240715-21:44:21 main_test_accuracy 23 0.7320000529289246 20240715-21:44:25 test_accuracy 23 model 2 forward 372 / 475 backward 317 / 525 20240715-21:44:25 main_test_accuracy 23 0.6890000104904175 20240715-21:44:26 wrote gpt_002.pth 20240715-21:44:27 wrote gpt_001.pth 20240715-21:44:35 --- epoch 24 ---------------------------------------- 20240715-21:44:35 current_test_accuracies 0.6980 0.7320 0.6890 0.6820 0.6940 20240715-21:44:35 training model 3 20240715-21:44:35 training model 2 20240715-21:53:31 train_perplexity 24 model 2 1.1660771835458397 20240715-21:53:57 train_perplexity 24 model 3 1.168118211577099 20240715-21:54:01 test_perplexity 24 model 2 1.1631129592662688 20240715-21:54:19 test_perplexity 24 model 3 1.1644743056952902 20240715-21:57:12 test_accuracy 24 model 2 forward 381 / 475 backward 345 / 525 20240715-21:57:12 main_test_accuracy 24 0.7260000109672546 20240715-21:57:15 test_accuracy 24 model 3 forward 405 / 506 backward 314 / 494 20240715-21:57:15 main_test_accuracy 24 0.7190000414848328 20240715-21:57:17 wrote gpt_003.pth 20240715-21:57:17 wrote gpt_002.pth 20240715-21:57:25 --- epoch 25 ---------------------------------------- 20240715-21:57:25 current_test_accuracies 0.6980 0.7320 0.7260 0.7190 0.6940 20240715-21:57:25 training model 4 20240715-21:57:25 training model 0 20240715-22:06:21 train_perplexity 25 model 0 1.1671032171767446 20240715-22:06:46 train_perplexity 25 model 4 1.1682154181416815 20240715-22:06:51 test_perplexity 25 model 0 1.1661922147829324 20240715-22:07:08 test_perplexity 25 model 4 1.1656109425605414 20240715-22:10:02 test_accuracy 25 model 0 forward 373 / 484 backward 327 / 516 20240715-22:10:02 main_test_accuracy 25 0.7000000476837158 20240715-22:10:04 test_accuracy 25 model 4 forward 410 / 528 backward 286 / 472 20240715-22:10:04 main_test_accuracy 25 0.6960000395774841 20240715-22:10:05 wrote gpt_004.pth 20240715-22:10:05 wrote gpt_000.pth 20240715-22:10:14 --- epoch 26 ---------------------------------------- 20240715-22:10:14 current_test_accuracies 0.7000 0.7320 0.7260 0.7190 0.6960 20240715-22:10:14 training model 4 20240715-22:10:14 training model 0 20240715-22:19:09 train_perplexity 26 model 0 1.1659825606311938 20240715-22:19:35 train_perplexity 26 model 4 1.1668103504397218 20240715-22:19:39 test_perplexity 26 model 0 1.165275162658801 20240715-22:19:57 test_perplexity 26 model 4 1.1640821661046212 20240715-22:22:51 test_accuracy 26 model 0 forward 372 / 484 backward 336 / 516 20240715-22:22:51 main_test_accuracy 26 0.7080000042915344 20240715-22:22:53 test_accuracy 26 model 4 forward 422 / 528 backward 295 / 472 20240715-22:22:53 main_test_accuracy 26 0.7170000076293945 20240715-22:22:55 wrote gpt_004.pth 20240715-22:22:55 wrote gpt_000.pth 20240715-22:23:04 --- epoch 27 ---------------------------------------- 20240715-22:23:04 current_test_accuracies 0.7080 0.7320 0.7260 0.7190 0.7170 20240715-22:23:04 training model 0 20240715-22:23:04 training model 4 20240715-22:31:59 train_perplexity 27 model 4 1.1654629430431782 20240715-22:32:21 train_perplexity 27 model 0 1.1651104936743868 20240715-22:32:32 test_perplexity 27 model 4 1.1630204496172947 20240715-22:32:46 test_perplexity 27 model 0 1.1639659489277983 20240715-22:35:39 test_accuracy 27 model 4 forward 423 / 528 backward 308 / 472 20240715-22:35:39 main_test_accuracy 27 0.7310000061988831 20240715-22:35:43 test_accuracy 27 model 0 forward 407 / 484 backward 328 / 516 20240715-22:35:43 main_test_accuracy 27 0.7350000143051147 20240715-22:35:45 wrote gpt_000.pth 20240715-22:35:45 wrote gpt_004.pth 20240715-22:35:54 --- epoch 28 ---------------------------------------- 20240715-22:35:54 current_test_accuracies 0.7350 0.7320 0.7260 0.7190 0.7310 20240715-22:35:54 training model 3 20240715-22:35:54 training model 2 20240715-22:44:49 train_perplexity 28 model 2 1.1657030913940218 20240715-22:45:14 train_perplexity 28 model 3 1.1663051819100465 20240715-22:45:20 test_perplexity 28 model 2 1.162806812520627 20240715-22:45:36 test_perplexity 28 model 3 1.1635968892744717 20240715-22:48:31 test_accuracy 28 model 2 forward 389 / 475 backward 326 / 525 20240715-22:48:31 main_test_accuracy 28 0.7150000333786011 20240715-22:48:34 test_accuracy 28 model 3 forward 391 / 506 backward 334 / 494 20240715-22:48:34 main_test_accuracy 28 0.7250000238418579 20240715-22:48:35 wrote gpt_003.pth 20240715-22:48:35 wrote gpt_002.pth 20240715-22:48:43 --- epoch 29 ---------------------------------------- 20240715-22:48:43 current_test_accuracies 0.7350 0.7320 0.7150 0.7250 0.7310 20240715-22:48:43 training model 2 20240715-22:48:43 training model 3 20240715-22:57:38 train_perplexity 29 model 3 1.1657818554938788 20240715-22:58:04 train_perplexity 29 model 2 1.1643540590460928 20240715-22:58:08 test_perplexity 29 model 3 1.1618546713501108 20240715-22:58:26 test_perplexity 29 model 2 1.161369152437375 20240715-23:01:18 test_accuracy 29 model 3 forward 414 / 506 backward 328 / 494 20240715-23:01:18 main_test_accuracy 29 0.7420000433921814 20240715-23:01:22 test_accuracy 29 model 2 forward 393 / 475 backward 354 / 525 20240715-23:01:22 main_test_accuracy 29 0.7470000386238098 20240715-23:01:24 wrote gpt_002.pth 20240715-23:01:24 wrote gpt_003.pth 20240715-23:01:33 --- epoch 30 ---------------------------------------- 20240715-23:01:33 current_test_accuracies 0.7350 0.7320 0.7470 0.7420 0.7310 20240715-23:01:33 training model 4 20240715-23:01:33 training model 1 20240715-23:10:29 train_perplexity 30 model 1 1.1677036302420989 20240715-23:10:54 train_perplexity 30 model 4 1.1649089163528707 20240715-23:10:58 test_perplexity 30 model 1 1.1666496176618208 20240715-23:11:16 test_perplexity 30 model 4 1.1627261442523724 20240715-23:12:29 argv ./main.py --result_dir=results_grids_v6 --resume --seed=34231 20240715-23:12:29 args.log_filename train.log 20240715-23:12:29 args.result_dir results_grids_v6 20240715-23:12:29 args.seed 34231 20240715-23:12:29 args.resume True 20240715-23:12:29 args.max_percents_of_test_in_train -1 20240715-23:12:29 args.nb_epochs 10000 20240715-23:12:29 args.batch_size 25 20240715-23:12:29 args.physical_batch_size None 20240715-23:12:29 args.nb_train_samples 100000 20240715-23:12:29 args.nb_test_samples 10000 20240715-23:12:29 args.nb_new_c_quizzes_for_train None 20240715-23:12:29 args.nb_new_c_quizzes_for_test None 20240715-23:12:29 args.learning_rate 0.0005 20240715-23:12:29 args.model 37M 20240715-23:12:29 args.dim_model 512 20240715-23:12:29 args.dim_keys 64 20240715-23:12:29 args.dim_hidden 2048 20240715-23:12:29 args.nb_heads 8 20240715-23:12:29 args.nb_blocks 12 20240715-23:12:29 args.dropout 0.1 20240715-23:12:29 args.deterministic_synthesis False 20240715-23:12:29 args.problem grids 20240715-23:12:29 args.nb_threads 1 20240715-23:12:29 args.gpus all 20240715-23:12:29 args.nb_gpts 5 20240715-23:12:29 args.accuracy_to_make_c_quizzes 0.9 20240715-23:12:29 args.proba_understands 0.9 20240715-23:12:29 args.proba_not_understands 0.5 20240715-23:12:29 args.generation_temperature 2 20240715-23:12:29 args.c_quiz_validation_mode predict 20240715-23:12:29 args.dirty_debug False 20240715-23:12:29 args.grids_tasks None 20240715-23:12:29 args.sky_height 6 20240715-23:12:29 args.sky_width 8 20240715-23:12:29 args.sky_nb_birds 3 20240715-23:12:29 args.sky_nb_iterations 2 20240715-23:12:29 args.sky_speed 3 20240715-23:12:29 main_device cuda:0 gpus ['cuda:0', 'cuda:1'] 20240715-23:12:29 vocabulary_size 13 20240715-23:12:29 creating model 0 and its w_quizzes 20240715-23:14:16 creating model 1 and its w_quizzes 20240715-23:16:03 creating model 2 and its w_quizzes 20240715-23:17:50 creating model 3 and its w_quizzes 20240715-23:19:36 creating model 4 and its w_quizzes 20240715-23:21:24 successfully loaded gpt_000.pth 20240715-23:21:24 successfully loaded gpt_001.pth 20240715-23:21:24 successfully loaded gpt_002.pth 20240715-23:21:24 successfully loaded gpt_003.pth 20240715-23:21:24 successfully loaded gpt_004.pth 20240715-23:21:24 cannot find c_quizzes.pth 20240715-23:21:24 nb_parameters 37817357 (37M) 20240715-23:21:26 nb_new_c_quizzes_for_train 1000 nb_new_c_quizzes_for_test 100 20240715-23:21:26 --- epoch 0 ---------------------------------------- 20240715-23:21:26 current_test_accuracies 0.7350 0.7320 0.7470 0.7420 0.7310 20240715-23:21:26 training model 4 20240715-23:21:26 training model 1 20240715-23:30:23 train_perplexity 0 model 1 1.1681944976711691 20240715-23:30:43 train_perplexity 0 model 4 1.164876289540722 20240715-23:30:55 test_perplexity 0 model 1 1.1663235110996584 20240715-23:31:09 test_perplexity 0 model 4 1.16380195622991 20240715-23:34:02 test_accuracy 0 model 1 forward 400 / 518 backward 311 / 482 20240715-23:34:04 test_accuracy 0 model 4 forward 409 / 527 backward 304 / 473 20240715-23:34:06 wrote gpt_004.pth 20240715-23:34:06 wrote gpt_001.pth 20240715-23:34:13 --- epoch 1 ---------------------------------------- 20240715-23:34:13 current_test_accuracies 0.7350 0.7110 0.7470 0.7420 0.7130 20240715-23:34:13 training model 1 20240715-23:34:13 training model 4 20240715-23:43:09 train_perplexity 1 model 4 1.164717532437211 20240715-23:43:32 train_perplexity 1 model 1 1.1662691029360468 20240715-23:43:39 test_perplexity 1 model 4 1.1625640989502355 20240715-23:43:55 test_perplexity 1 model 1 1.1653906061288677 20240715-23:46:46 test_accuracy 1 model 4 forward 414 / 527 backward 297 / 473 20240715-23:46:50 test_accuracy 1 model 1 forward 415 / 518 backward 287 / 482 20240715-23:46:52 wrote gpt_001.pth 20240715-23:46:52 wrote gpt_004.pth 20240715-23:47:01 --- epoch 2 ---------------------------------------- 20240715-23:47:01 current_test_accuracies 0.7350 0.7020 0.7470 0.7420 0.7110 20240715-23:47:01 training model 1 20240715-23:47:01 training model 4 20240715-23:55:57 train_perplexity 2 model 4 1.1636115875279662 20240715-23:56:21 train_perplexity 2 model 1 1.1654286267286715 20240715-23:56:27 test_perplexity 2 model 4 1.162122379094477 20240715-23:56:44 test_perplexity 2 model 1 1.1645077035190705 20240715-23:59:34 test_accuracy 2 model 4 forward 432 / 527 backward 313 / 473 20240715-23:59:38 test_accuracy 2 model 1 forward 429 / 518 backward 309 / 482 20240715-23:59:40 wrote gpt_001.pth 20240715-23:59:40 wrote gpt_004.pth 20240715-23:59:48 --- epoch 3 ---------------------------------------- 20240715-23:59:48 current_test_accuracies 0.7350 0.7380 0.7470 0.7420 0.7450 20240715-23:59:48 training model 0 20240715-23:59:48 training model 1 20240716-00:08:43 train_perplexity 3 model 1 1.1647133883052014 20240716-00:09:10 train_perplexity 3 model 0 1.164374913117596 20240716-00:09:13 test_perplexity 3 model 1 1.1638636587958564 20240716-00:09:31 test_perplexity 3 model 0 1.1604957903478617 20240716-00:12:22 test_accuracy 3 model 1 forward 425 / 518 backward 324 / 482 20240716-00:12:27 test_accuracy 3 model 0 forward 416 / 493 backward 365 / 507 20240716-00:12:28 wrote gpt_000.pth 20240716-00:12:29 wrote gpt_001.pth 20240716-00:12:36 --- epoch 4 ---------------------------------------- 20240716-00:12:36 current_test_accuracies 0.7810 0.7490 0.7470 0.7420 0.7450 20240716-00:12:36 training model 3 20240716-00:12:36 training model 4 20240716-00:21:32 train_perplexity 4 model 4 1.1628591388478227 20240716-00:21:56 train_perplexity 4 model 3 1.1646216534401794 20240716-00:22:05 test_perplexity 4 model 4 1.1615093278146726 20240716-00:22:20 test_perplexity 4 model 3 1.1642830094318648 20240716-00:25:11 test_accuracy 4 model 4 forward 433 / 527 backward 334 / 473 20240716-00:25:15 test_accuracy 4 model 3 forward 395 / 508 backward 337 / 492 20240716-00:25:17 wrote gpt_003.pth 20240716-00:25:17 wrote gpt_004.pth 20240716-00:25:25 --- epoch 5 ---------------------------------------- 20240716-00:25:25 current_test_accuracies 0.7810 0.7490 0.7470 0.7320 0.7670 20240716-00:25:25 training model 3 20240716-00:25:25 training model 2 20240716-00:34:21 train_perplexity 5 model 2 1.164244148198222 20240716-00:34:46 train_perplexity 5 model 3 1.1640086717663654 20240716-00:34:51 test_perplexity 5 model 2 1.161365350370555 20240716-00:35:08 test_perplexity 5 model 3 1.1633783953270576 20240716-00:37:59 test_accuracy 5 model 2 forward 412 / 512 backward 359 / 488 20240716-00:38:03 test_accuracy 5 model 3 forward 406 / 508 backward 347 / 492 20240716-00:38:04 wrote gpt_003.pth 20240716-00:38:04 wrote gpt_002.pth 20240716-00:38:13 --- epoch 6 ---------------------------------------- 20240716-00:38:13 current_test_accuracies 0.7810 0.7490 0.7710 0.7530 0.7670 20240716-00:38:13 training model 1 20240716-00:38:13 training model 3 20240716-00:47:08 train_perplexity 6 model 3 1.1627806601086885 20240716-00:47:33 train_perplexity 6 model 1 1.1641778893442314 20240716-00:47:38 test_perplexity 6 model 3 1.1634010089780014 20240716-00:47:55 test_perplexity 6 model 1 1.162844999331882 20240716-00:50:47 test_accuracy 6 model 3 forward 407 / 508 backward 349 / 492 20240716-00:50:51 test_accuracy 6 model 1 forward 444 / 518 backward 340 / 482 20240716-00:50:52 wrote gpt_001.pth 20240716-00:50:52 wrote gpt_003.pth 20240716-00:51:01 --- epoch 7 ---------------------------------------- 20240716-00:51:01 current_test_accuracies 0.7810 0.7840 0.7710 0.7560 0.7670 20240716-00:51:01 training model 3 20240716-00:51:01 training model 4 20240716-00:59:56 train_perplexity 7 model 4 1.1625470882964364 20240716-01:00:19 train_perplexity 7 model 3 1.1630592867251963 20240716-01:00:27 test_perplexity 7 model 4 1.1613513975885366 20240716-01:00:42 test_perplexity 7 model 3 1.1628012894544302 20240716-01:03:33 test_accuracy 7 model 4 forward 434 / 527 backward 326 / 473 20240716-01:03:37 test_accuracy 7 model 3 forward 412 / 508 backward 357 / 492 20240716-01:03:39 wrote gpt_003.pth 20240716-01:03:39 wrote gpt_004.pth 20240716-01:03:47 --- epoch 8 ---------------------------------------- 20240716-01:03:47 current_test_accuracies 0.7810 0.7840 0.7710 0.7690 0.7600 20240716-01:03:47 training model 4 20240716-01:03:47 training model 3 20240716-01:12:43 train_perplexity 8 model 3 1.1626809316469238 20240716-01:13:04 train_perplexity 8 model 4 1.161878773303413 20240716-01:13:15 test_perplexity 8 model 3 1.1613891562815921 20240716-01:13:29 test_perplexity 8 model 4 1.160727514445793 20240716-01:16:23 test_accuracy 8 model 3 forward 419 / 508 backward 373 / 492 20240716-01:16:26 test_accuracy 8 model 4 forward 436 / 527 backward 337 / 473 20240716-01:16:27 wrote gpt_004.pth 20240716-01:16:27 wrote gpt_003.pth 20240716-01:16:35 --- epoch 9 ---------------------------------------- 20240716-01:16:35 current_test_accuracies 0.7810 0.7840 0.7710 0.7920 0.7730 20240716-01:16:35 training model 2 20240716-01:16:35 training model 4 20240716-01:25:31 train_perplexity 9 model 4 1.1612737813858176 20240716-01:25:53 train_perplexity 9 model 2 1.162876785355957 20240716-01:26:02 test_perplexity 9 model 4 1.161386770580562 20240716-01:26:17 test_perplexity 9 model 2 1.160122364440327 20240716-01:29:09 test_accuracy 9 model 4 forward 447 / 527 backward 343 / 473 20240716-01:29:13 test_accuracy 9 model 2 forward 440 / 512 backward 350 / 488 20240716-01:29:15 wrote gpt_002.pth 20240716-01:29:15 wrote gpt_004.pth 20240716-01:29:23 --- epoch 10 ---------------------------------------- 20240716-01:29:23 current_test_accuracies 0.7810 0.7840 0.7900 0.7920 0.7900 20240716-01:29:23 training model 0 20240716-01:29:23 training model 1 20240716-01:38:19 train_perplexity 10 model 1 1.1637389604537762 20240716-01:38:41 train_perplexity 10 model 0 1.1639329700985483 20240716-01:38:50 test_perplexity 10 model 1 1.1629297568193566 20240716-01:39:05 test_perplexity 10 model 0 1.159376496995672 20240716-01:41:57 test_accuracy 10 model 1 forward 437 / 518 backward 335 / 482 20240716-01:42:01 test_accuracy 10 model 0 forward 415 / 493 backward 360 / 507 20240716-01:42:02 wrote gpt_000.pth 20240716-01:42:02 wrote gpt_001.pth 20240716-01:42:10 --- epoch 11 ---------------------------------------- 20240716-01:42:10 current_test_accuracies 0.7750 0.7720 0.7900 0.7920 0.7900 20240716-01:42:10 training model 1 20240716-01:42:10 training model 0 20240716-01:51:06 train_perplexity 11 model 0 1.163244358405057 20240716-01:51:30 train_perplexity 11 model 1 1.1629231664489417 20240716-01:51:36 test_perplexity 11 model 0 1.1591030218181229 20240716-01:51:52 test_perplexity 11 model 1 1.1622342183091998 20240716-01:54:46 test_accuracy 11 model 0 forward 410 / 493 backward 363 / 507 20240716-01:54:49 test_accuracy 11 model 1 forward 436 / 518 backward 342 / 482 20240716-01:54:50 wrote gpt_001.pth 20240716-01:54:50 wrote gpt_000.pth 20240716-01:54:59 --- epoch 12 ---------------------------------------- 20240716-01:54:59 current_test_accuracies 0.7730 0.7780 0.7900 0.7920 0.7900 20240716-01:54:59 training model 0 20240716-01:54:59 training model 1 20240716-02:03:55 train_perplexity 12 model 1 1.1625936984636123 20240716-02:04:18 train_perplexity 12 model 0 1.1619294572248016 20240716-02:04:25 test_perplexity 12 model 1 1.1615735600581762 20240716-02:04:41 test_perplexity 12 model 0 1.1587861808668298 20240716-02:07:34 test_accuracy 12 model 1 forward 440 / 518 backward 352 / 482 20240716-02:07:38 test_accuracy 12 model 0 forward 411 / 493 backward 376 / 507 20240716-02:07:40 wrote gpt_000.pth 20240716-02:07:40 wrote gpt_001.pth 20240716-02:07:48 --- epoch 13 ---------------------------------------- 20240716-02:07:48 current_test_accuracies 0.7870 0.7920 0.7900 0.7920 0.7900 20240716-02:07:48 training model 0 20240716-02:07:48 training model 2 20240716-02:16:43 train_perplexity 13 model 2 1.162354481337884 20240716-02:17:09 train_perplexity 13 model 0 1.1623766462564102 20240716-02:17:12 test_perplexity 13 model 2 1.1597140779644217 20240716-02:17:30 test_perplexity 13 model 0 1.1594828119596952 20240716-02:20:23 test_accuracy 13 model 2 forward 442 / 512 backward 363 / 488 20240716-02:20:27 test_accuracy 13 model 0 forward 427 / 493 backward 369 / 507 20240716-02:20:28 wrote gpt_000.pth 20240716-02:20:29 wrote gpt_002.pth 20240716-02:20:37 --- epoch 14 ---------------------------------------- 20240716-02:20:37 current_test_accuracies 0.7960 0.7920 0.8050 0.7920 0.7900 20240716-02:20:37 training model 4 20240716-02:20:37 training model 1 20240716-02:29:33 train_perplexity 14 model 1 1.162338202504428 20240716-02:29:59 train_perplexity 14 model 4 1.1607271252167721 20240716-02:30:02 test_perplexity 14 model 1 1.1609135185628376 20240716-02:30:20 test_perplexity 14 model 4 1.1602774537325466 20240716-02:33:13 test_accuracy 14 model 1 forward 445 / 518 backward 347 / 482 20240716-02:33:16 test_accuracy 14 model 4 forward 446 / 527 backward 349 / 473 20240716-02:33:17 wrote gpt_004.pth 20240716-02:33:18 wrote gpt_001.pth 20240716-02:33:26 --- epoch 15 ---------------------------------------- 20240716-02:33:26 current_test_accuracies 0.7960 0.7920 0.8050 0.7920 0.7950 20240716-02:33:26 training model 1 20240716-02:33:26 training model 3 20240716-02:42:21 train_perplexity 15 model 3 1.1621087333068707 20240716-02:42:49 train_perplexity 15 model 1 1.1613004265709974 20240716-02:42:51 test_perplexity 15 model 3 1.1614653354128044 20240716-02:43:10 test_perplexity 15 model 1 1.160437896040018 20240716-02:46:01 test_accuracy 15 model 3 forward 427 / 508 backward 355 / 492 20240716-02:46:05 test_accuracy 15 model 1 forward 450 / 518 backward 341 / 482 20240716-02:46:06 wrote gpt_001.pth 20240716-02:46:07 wrote gpt_003.pth 20240716-02:46:15 --- epoch 16 ---------------------------------------- 20240716-02:46:15 current_test_accuracies 0.7960 0.7910 0.8050 0.7820 0.7950 20240716-02:46:15 training model 3 20240716-02:46:15 training model 1 20240716-02:55:10 train_perplexity 16 model 1 1.1616597817339704 20240716-02:55:38 train_perplexity 16 model 3 1.1616554467030782 20240716-02:55:40 test_perplexity 16 model 1 1.1604162772079394 20240716-02:55:58 test_perplexity 16 model 3 1.1609313012941647 20240716-02:58:50 test_accuracy 16 model 1 forward 451 / 518 backward 346 / 482 20240716-02:58:54 test_accuracy 16 model 3 forward 440 / 508 backward 364 / 492 20240716-02:58:55 wrote gpt_003.pth 20240716-02:58:55 wrote gpt_001.pth 20240716-02:59:03 --- epoch 17 ---------------------------------------- 20240716-02:59:03 current_test_accuracies 0.7960 0.7970 0.8050 0.8040 0.7950 20240716-02:59:03 training model 4 20240716-02:59:03 training model 0 20240716-03:07:58 train_perplexity 17 model 0 1.1615086282342462 20240716-03:08:26 train_perplexity 17 model 4 1.1611468739766448 20240716-03:08:28 test_perplexity 17 model 0 1.1580814272484872 20240716-03:08:46 test_perplexity 17 model 4 1.1599096235257347 20240716-03:11:40 test_accuracy 17 model 0 forward 442 / 493 backward 388 / 507 20240716-03:11:42 test_accuracy 17 model 4 forward 456 / 527 backward 335 / 473 20240716-03:11:44 wrote gpt_004.pth 20240716-03:11:44 wrote gpt_000.pth 20240716-03:11:52 --- epoch 18 ---------------------------------------- 20240716-03:11:52 current_test_accuracies 0.8300 0.7970 0.8050 0.8040 0.7910 20240716-03:11:52 training model 4 20240716-03:11:52 training model 1 20240716-03:20:48 train_perplexity 18 model 1 1.160860669094151 20240716-03:21:12 train_perplexity 18 model 4 1.160765327797207 20240716-03:21:18 test_perplexity 18 model 1 1.1595407280347287 20240716-03:21:35 test_perplexity 18 model 4 1.1593500069497615 20240716-03:24:27 test_accuracy 18 model 1 forward 458 / 518 backward 349 / 482 20240716-03:24:30 test_accuracy 18 model 4 forward 462 / 527 backward 348 / 473 20240716-03:24:31 wrote gpt_004.pth 20240716-03:24:32 wrote gpt_001.pth 20240716-03:24:39 --- epoch 19 ---------------------------------------- 20240716-03:24:39 current_test_accuracies 0.8300 0.8070 0.8050 0.8040 0.8100 20240716-03:24:39 training model 3 20240716-03:24:39 training model 2 20240716-03:33:35 train_perplexity 19 model 2 1.161724956637467 20240716-03:34:00 train_perplexity 19 model 3 1.1607931683678647 20240716-03:34:05 test_perplexity 19 model 2 1.1597464539701 20240716-03:34:22 test_perplexity 19 model 3 1.1605013766063685 20240716-03:37:14 test_accuracy 19 model 2 forward 442 / 512 backward 367 / 488 20240716-03:37:18 test_accuracy 19 model 3 forward 436 / 508 backward 367 / 492 20240716-03:37:19 wrote gpt_003.pth 20240716-03:37:20 wrote gpt_002.pth 20240716-03:37:28 --- epoch 20 ---------------------------------------- 20240716-03:37:28 current_test_accuracies 0.8300 0.8070 0.8090 0.8030 0.8100 20240716-03:37:28 training model 3 20240716-03:37:28 training model 1 20240716-03:46:24 train_perplexity 20 model 1 1.1615124566821937 20240716-03:46:50 train_perplexity 20 model 3 1.1607335457891645 20240716-03:46:54 test_perplexity 20 model 1 1.1598362485463514 20240716-03:47:12 test_perplexity 20 model 3 1.1602781802583932 20240716-03:50:05 test_accuracy 20 model 1 forward 462 / 518 backward 356 / 482 20240716-03:50:09 test_accuracy 20 model 3 forward 435 / 508 backward 373 / 492 20240716-03:50:10 wrote gpt_003.pth 20240716-03:50:10 wrote gpt_001.pth 20240716-03:50:19 --- epoch 21 ---------------------------------------- 20240716-03:50:19 current_test_accuracies 0.8300 0.8180 0.8090 0.8080 0.8100 20240716-03:50:19 training model 3 20240716-03:50:19 training model 2 20240716-03:59:15 train_perplexity 21 model 2 1.1620423541870768 20240716-03:59:39 train_perplexity 21 model 3 1.1605936271204274 20240716-03:59:45 test_perplexity 21 model 2 1.1604616455454617 20240716-04:00:02 test_perplexity 21 model 3 1.1607672926838677 20240716-04:02:53 test_accuracy 21 model 2 forward 441 / 512 backward 372 / 488 20240716-04:02:56 test_accuracy 21 model 3 forward 436 / 508 backward 377 / 492 20240716-04:02:58 wrote gpt_003.pth 20240716-04:02:58 wrote gpt_002.pth 20240716-04:03:06 --- epoch 22 ---------------------------------------- 20240716-04:03:06 current_test_accuracies 0.8300 0.8180 0.8130 0.8130 0.8100 20240716-04:03:06 training model 4 20240716-04:03:06 training model 2 20240716-04:12:03 train_perplexity 22 model 2 1.160928976122871 20240716-04:12:28 train_perplexity 22 model 4 1.160888563747833 20240716-04:12:33 test_perplexity 22 model 2 1.1589235767833839 20240716-04:12:50 test_perplexity 22 model 4 1.1593199005489936 20240716-04:15:41 test_accuracy 22 model 2 forward 453 / 512 backward 372 / 488 20240716-04:15:45 test_accuracy 22 model 4 forward 468 / 527 backward 355 / 473 20240716-04:15:46 wrote gpt_004.pth 20240716-04:15:46 wrote gpt_002.pth 20240716-04:15:54 --- epoch 23 ---------------------------------------- 20240716-04:15:54 current_test_accuracies 0.8300 0.8180 0.8250 0.8130 0.8230 20240716-04:15:54 training model 3 20240716-04:15:54 training model 1 20240716-04:24:50 train_perplexity 23 model 1 1.1599813433838502 20240716-04:25:12 train_perplexity 23 model 3 1.1603202332322389 20240716-04:25:24 test_perplexity 23 model 1 1.1588670953293396 20240716-04:25:37 test_perplexity 23 model 3 1.160129590586477 20240716-04:28:31 test_accuracy 23 model 1 forward 467 / 518 backward 356 / 482 20240716-04:28:33 test_accuracy 23 model 3 forward 427 / 508 backward 384 / 492 20240716-04:28:35 wrote gpt_003.pth 20240716-04:28:35 wrote gpt_001.pth 20240716-04:28:44 --- epoch 24 ---------------------------------------- 20240716-04:28:44 current_test_accuracies 0.8300 0.8230 0.8250 0.8110 0.8230 20240716-04:28:44 training model 3 20240716-04:28:44 training model 1 20240716-04:37:41 train_perplexity 24 model 1 1.1598169791747441 20240716-04:37:58 train_perplexity 24 model 3 1.1593191857851284 20240716-04:38:15 test_perplexity 24 model 1 1.159158501072529 20240716-04:38:25 test_perplexity 24 model 3 1.159302842416855 20240716-04:41:19 test_accuracy 24 model 1 forward 464 / 518 backward 360 / 482 20240716-04:41:22 test_accuracy 24 model 3 forward 443 / 508 backward 384 / 492 20240716-04:41:23 wrote gpt_003.pth 20240716-04:41:23 wrote gpt_001.pth 20240716-04:41:32 --- epoch 25 ---------------------------------------- 20240716-04:41:32 current_test_accuracies 0.8300 0.8240 0.8250 0.8270 0.8230 20240716-04:41:32 training model 4 20240716-04:41:32 training model 1 20240716-04:50:28 train_perplexity 25 model 1 1.1588652193020466 20240716-04:50:50 train_perplexity 25 model 4 1.1602855774464085 20240716-04:50:59 test_perplexity 25 model 1 1.1588139943312108 20240716-04:51:14 test_perplexity 25 model 4 1.159438193312184 20240716-04:54:06 test_accuracy 25 model 1 forward 470 / 518 backward 367 / 482 20240716-04:54:09 test_accuracy 25 model 4 forward 461 / 527 backward 360 / 473 20240716-04:54:10 wrote gpt_004.pth 20240716-04:54:11 wrote gpt_001.pth 20240716-04:54:19 --- epoch 26 ---------------------------------------- 20240716-04:54:19 current_test_accuracies 0.8300 0.8370 0.8250 0.8270 0.8210 20240716-04:54:19 training model 4 20240716-04:54:19 training model 2 20240716-05:03:16 train_perplexity 26 model 2 1.1609033400964806 20240716-05:03:33 train_perplexity 26 model 4 1.1598703650699327 20240716-05:03:50 test_perplexity 26 model 2 1.1583954550839972 20240716-05:04:01 test_perplexity 26 model 4 1.1590755515544842 20240716-05:06:55 test_accuracy 26 model 2 forward 454 / 512 backward 374 / 488 20240716-05:06:57 test_accuracy 26 model 4 forward 471 / 527 backward 354 / 473 20240716-05:06:58 wrote gpt_004.pth 20240716-05:06:58 wrote gpt_002.pth 20240716-05:07:07 --- epoch 27 ---------------------------------------- 20240716-05:07:07 current_test_accuracies 0.8300 0.8370 0.8280 0.8270 0.8250 20240716-05:07:07 training model 4 20240716-05:07:07 training model 3 20240716-05:16:03 train_perplexity 27 model 3 1.159471986138873 20240716-05:16:24 train_perplexity 27 model 4 1.1592705261748082 20240716-05:16:34 test_perplexity 27 model 3 1.159430653211111 20240716-05:16:48 test_perplexity 27 model 4 1.1592835763123552 20240716-05:19:41 test_accuracy 27 model 3 forward 449 / 508 backward 395 / 492 20240716-05:19:43 test_accuracy 27 model 4 forward 474 / 527 backward 353 / 473 20240716-05:19:44 wrote gpt_004.pth 20240716-05:19:45 wrote gpt_003.pth 20240716-05:19:53 --- epoch 28 ---------------------------------------- 20240716-05:19:53 current_test_accuracies 0.8300 0.8370 0.8280 0.8440 0.8270 20240716-05:19:53 training model 4 20240716-05:19:53 training model 2 20240716-05:28:50 train_perplexity 28 model 2 1.1602643797273204 20240716-05:29:11 train_perplexity 28 model 4 1.1594482517819793 20240716-05:29:21 test_perplexity 28 model 2 1.1579603340647633 20240716-05:29:35 test_perplexity 28 model 4 1.1582434711511191 20240716-05:32:27 test_accuracy 28 model 2 forward 446 / 512 backward 369 / 488 20240716-05:32:30 test_accuracy 28 model 4 forward 473 / 527 backward 367 / 473 20240716-05:32:31 wrote gpt_004.pth 20240716-05:32:32 wrote gpt_002.pth 20240716-05:32:40 --- epoch 29 ---------------------------------------- 20240716-05:32:40 current_test_accuracies 0.8300 0.8370 0.8150 0.8440 0.8400 20240716-05:32:40 training model 2 20240716-05:32:40 training model 0 20240716-05:41:36 train_perplexity 29 model 0 1.1607948978890095 20240716-05:41:57 train_perplexity 29 model 2 1.1602099999411097 20240716-05:42:07 test_perplexity 29 model 0 1.1577161552433821 20240716-05:42:22 test_perplexity 29 model 2 1.1585824743779107 20240716-05:45:17 test_accuracy 29 model 0 forward 429 / 493 backward 389 / 507 20240716-05:45:19 test_accuracy 29 model 2 forward 439 / 512 backward 376 / 488 20240716-05:45:21 wrote gpt_002.pth 20240716-05:45:21 wrote gpt_000.pth 20240716-05:45:28 --- epoch 30 ---------------------------------------- 20240716-05:45:28 current_test_accuracies 0.8180 0.8370 0.8150 0.8440 0.8400 20240716-05:45:28 training model 2 20240716-05:45:28 training model 0 20240716-05:54:24 train_perplexity 30 model 0 1.1602356387018358 20240716-05:54:46 train_perplexity 30 model 2 1.160247466016037 20240716-05:54:55 test_perplexity 30 model 0 1.1573310125511018 20240716-05:55:10 test_perplexity 30 model 2 1.1573603851361278 20240716-05:58:04 test_accuracy 30 model 0 forward 444 / 493 backward 405 / 507 20240716-05:58:07 test_accuracy 30 model 2 forward 458 / 512 backward 389 / 488 20240716-05:58:08 wrote gpt_002.pth 20240716-05:58:08 wrote gpt_000.pth 20240716-05:58:15 --- epoch 31 ---------------------------------------- 20240716-05:58:15 current_test_accuracies 0.8490 0.8370 0.8470 0.8440 0.8400 20240716-05:58:15 training model 1 20240716-05:58:15 training model 4 20240716-06:07:13 train_perplexity 31 model 4 1.1591334992661257 20240716-06:07:34 train_perplexity 31 model 1 1.1588386073611407 20240716-06:07:44 test_perplexity 31 model 4 1.1576841404799447 20240716-06:07:58 test_perplexity 31 model 1 1.1590938139457425 20240716-06:10:49 test_accuracy 31 model 4 forward 487 / 527 backward 368 / 473 20240716-06:10:53 test_accuracy 31 model 1 forward 475 / 518 backward 365 / 482 20240716-06:10:55 wrote gpt_001.pth 20240716-06:10:55 wrote gpt_004.pth 20240716-06:11:02 --- epoch 32 ---------------------------------------- 20240716-06:11:02 current_test_accuracies 0.8490 0.8400 0.8470 0.8440 0.8550 20240716-06:11:02 training model 1 20240716-06:11:02 training model 3 20240716-06:19:58 train_perplexity 32 model 3 1.1592201034812235 20240716-06:20:21 train_perplexity 32 model 1 1.1592556010234496 20240716-06:20:28 test_perplexity 32 model 3 1.1601451868197623 20240716-06:20:44 test_perplexity 32 model 1 1.1585455106234457 20240716-06:23:36 test_accuracy 32 model 3 forward 454 / 508 backward 394 / 492 20240716-06:23:40 test_accuracy 32 model 1 forward 481 / 518 backward 373 / 482 20240716-06:23:41 wrote gpt_001.pth 20240716-06:23:42 wrote gpt_003.pth 20240716-06:23:49 --- epoch 33 ---------------------------------------- 20240716-06:23:49 current_test_accuracies 0.8490 0.8540 0.8470 0.8480 0.8550 20240716-06:23:49 training model 2 20240716-06:23:49 training model 3 20240716-06:32:46 train_perplexity 33 model 3 1.1586829350067889 20240716-06:33:07 train_perplexity 33 model 2 1.159421272648845 20240716-06:33:17 test_perplexity 33 model 3 1.1588706923562626 20240716-06:33:31 test_perplexity 33 model 2 1.1573623823340202 20240716-06:36:24 test_accuracy 33 model 3 forward 462 / 508 backward 391 / 492 20240716-06:36:28 test_accuracy 33 model 2 forward 465 / 512 backward 393 / 488 20240716-06:36:29 wrote gpt_002.pth 20240716-06:36:29 wrote gpt_003.pth 20240716-06:36:38 --- epoch 34 ---------------------------------------- 20240716-06:36:38 current_test_accuracies 0.8490 0.8540 0.8580 0.8530 0.8550 20240716-06:36:38 training model 0 20240716-06:36:38 training model 3 20240716-06:45:35 train_perplexity 34 model 3 1.1589225751532157 20240716-06:45:58 train_perplexity 34 model 0 1.1603189310471211 20240716-06:46:05 test_perplexity 34 model 3 1.1592272223919107 20240716-06:46:21 test_perplexity 34 model 0 1.1567559800341192 20240716-06:49:13 test_accuracy 34 model 3 forward 465 / 508 backward 383 / 492 20240716-06:49:17 test_accuracy 34 model 0 forward 438 / 493 backward 403 / 507 20240716-06:49:19 wrote gpt_000.pth 20240716-06:49:19 wrote gpt_003.pth 20240716-06:49:28 --- epoch 35 ---------------------------------------- 20240716-06:49:28 current_test_accuracies 0.8410 0.8540 0.8580 0.8480 0.8550 20240716-06:49:28 training model 0 20240716-06:49:28 training model 3 20240716-06:58:24 train_perplexity 35 model 3 1.1588686631753855 20240716-06:58:50 train_perplexity 35 model 0 1.1597413587764198 20240716-06:58:53 test_perplexity 35 model 3 1.1587913260252005 20240716-06:59:11 test_perplexity 35 model 0 1.1569315935869422 20240716-07:02:03 test_accuracy 35 model 3 forward 470 / 508 backward 404 / 492 20240716-07:02:08 test_accuracy 35 model 0 forward 448 / 493 backward 383 / 507 20240716-07:02:09 wrote gpt_000.pth 20240716-07:02:09 wrote gpt_003.pth 20240716-07:02:18 --- epoch 36 ---------------------------------------- 20240716-07:02:18 current_test_accuracies 0.8310 0.8540 0.8580 0.8740 0.8550 20240716-07:02:18 training model 0 20240716-07:02:18 training model 1 20240716-07:11:14 train_perplexity 36 model 1 1.1594592850066885 20240716-07:11:38 train_perplexity 36 model 0 1.1600589679624906 20240716-07:11:45 test_perplexity 36 model 1 1.1581380499971454 20240716-07:12:01 test_perplexity 36 model 0 1.1565140280096768 20240716-07:14:54 test_accuracy 36 model 1 forward 469 / 518 backward 370 / 482 20240716-07:14:58 test_accuracy 36 model 0 forward 443 / 493 backward 417 / 507 20240716-07:14:59 wrote gpt_000.pth 20240716-07:15:00 wrote gpt_001.pth 20240716-07:15:08 --- epoch 37 ---------------------------------------- 20240716-07:15:08 current_test_accuracies 0.8600 0.8390 0.8580 0.8740 0.8550 20240716-07:15:08 training model 1 20240716-07:15:08 training model 4 20240716-07:24:04 train_perplexity 37 model 4 1.159013906716659 20240716-07:24:30 train_perplexity 37 model 1 1.1585327520535558 20240716-07:24:34 test_perplexity 37 model 4 1.157817252061504 20240716-07:24:52 test_perplexity 37 model 1 1.158368932174961 20240716-07:27:42 test_accuracy 37 model 4 forward 475 / 527 backward 368 / 473 20240716-07:27:47 test_accuracy 37 model 1 forward 485 / 518 backward 369 / 482 20240716-07:27:49 wrote gpt_001.pth 20240716-07:27:49 wrote gpt_004.pth 20240716-07:27:57 --- epoch 38 ---------------------------------------- 20240716-07:27:57 current_test_accuracies 0.8600 0.8540 0.8580 0.8740 0.8430 20240716-07:27:57 training model 4 20240716-07:27:57 training model 1 20240716-07:36:53 train_perplexity 38 model 1 1.158966728091178