20240715-16:27:53 argv ./main.py --result_dir=results_grids_v6
20240715-16:27:53 args.log_filename train.log
20240715-16:27:53 args.result_dir results_grids_v6
20240715-16:27:53 args.seed 0
20240715-16:27:53 args.resume False
20240715-16:27:53 args.max_percents_of_test_in_train -1
20240715-16:27:53 args.nb_epochs 10000
20240715-16:27:53 args.batch_size 25
20240715-16:27:53 args.physical_batch_size None
20240715-16:27:53 args.nb_train_samples 100000
20240715-16:27:53 args.nb_test_samples 10000
20240715-16:27:53 args.nb_new_c_quizzes_for_train None
20240715-16:27:53 args.nb_new_c_quizzes_for_test None
20240715-16:27:53 args.learning_rate 0.0005
20240715-16:27:53 args.model 37M
20240715-16:27:53 args.dim_model 512
20240715-16:27:53 args.dim_keys 64
20240715-16:27:53 args.dim_hidden 2048
20240715-16:27:53 args.nb_heads 8
20240715-16:27:53 args.nb_blocks 12
20240715-16:27:53 args.dropout 0.1
20240715-16:27:53 args.deterministic_synthesis False
20240715-16:27:53 args.problem grids
20240715-16:27:53 args.nb_threads 1
20240715-16:27:53 args.gpus all
20240715-16:27:53 args.nb_gpts 5
20240715-16:27:53 args.accuracy_to_make_c_quizzes 0.9
20240715-16:27:53 args.proba_understands 0.9
20240715-16:27:53 args.proba_not_understands 0.5
20240715-16:27:53 args.generation_temperature 2
20240715-16:27:53 args.dirty_debug False
20240715-16:27:53 args.grids_tasks None
20240715-16:27:53 args.sky_height 6
20240715-16:27:53 args.sky_width 8
20240715-16:27:53 args.sky_nb_birds 3
20240715-16:27:53 args.sky_nb_iterations 2
20240715-16:27:53 args.sky_speed 3
20240715-16:28:15 main_device cuda:0 gpus ['cuda:0', 'cuda:1']
20240715-16:28:15 vocabulary_size 13
20240715-16:28:15 creating model 0 and its w_quizzes
20240715-16:29:53 creating model 1 and its w_quizzes
20240715-16:31:39 creating model 2 and its w_quizzes
20240715-16:33:24 creating model 3 and its w_quizzes
20240715-16:35:10 creating model 4 and its w_quizzes
20240715-16:36:58 nb_parameters 37817357 (37M)
20240715-16:36:59 nb_new_c_quizzes_for_train 1000 nb_new_c_quizzes_for_test 100
20240715-16:36:59 --- epoch 0 ----------------------------------------
20240715-16:36:59 current_test_accuracies 0.0000 0.0000 0.0000 0.0000 0.0000
20240715-16:36:59 training model 0
20240715-16:36:59 training model 1
20240715-16:45:55 train_perplexity 0 model 1 1.6962359633661162
20240715-16:46:18 train_perplexity 0 model 0 1.6927451994906717
20240715-16:46:25 test_perplexity 0 model 1 1.270926889772941
20240715-16:46:41 test_perplexity 0 model 0 1.2698948983866976
20240715-16:49:34 test_accuracy 0 model 1 forward 11 / 493 backward 3 / 507
20240715-16:49:34 main_test_accuracy 0 0.014000000432133675
20240715-16:49:38 test_accuracy 0 model 0 forward 7 / 484 backward 2 / 516
20240715-16:49:38 main_test_accuracy 0 0.009000000543892384
20240715-16:49:39 wrote gpt_000.pth
20240715-16:49:39 wrote gpt_001.pth
20240715-16:49:48 --- epoch 1 ----------------------------------------
20240715-16:49:48 current_test_accuracies 0.0090 0.0140 0.0000 0.0000 0.0000
20240715-16:49:48 training model 2
20240715-16:49:48 training model 3
20240715-16:58:43 train_perplexity 1 model 3 1.7657245135749406
20240715-16:59:00 train_perplexity 1 model 2 1.706454500319667
20240715-16:59:15 test_perplexity 1 model 3 1.2697066081658515
20240715-16:59:27 test_perplexity 1 model 2 1.2656771422226274
20240715-17:02:20 test_accuracy 1 model 3 forward 9 / 506 backward 1 / 494
20240715-17:02:20 main_test_accuracy 1 0.010000000707805157
20240715-17:02:23 test_accuracy 1 model 2 forward 12 / 475 backward 3 / 525
20240715-17:02:23 main_test_accuracy 1 0.015000000596046448
20240715-17:02:24 wrote gpt_002.pth
20240715-17:02:25 wrote gpt_003.pth
20240715-17:02:33 --- epoch 2 ----------------------------------------
20240715-17:02:33 current_test_accuracies 0.0090 0.0140 0.0150 0.0100 0.0000
20240715-17:02:33 training model 4
20240715-17:02:33 training model 0
20240715-17:11:28 train_perplexity 2 model 0 1.2411058885127244
20240715-17:11:46 train_perplexity 2 model 4 1.7647064072016874
20240715-17:12:00 test_perplexity 2 model 0 1.2038803750646887
20240715-17:12:12 test_perplexity 2 model 4 1.2727349838165485
20240715-17:15:07 test_accuracy 2 model 0 forward 151 / 484 backward 77 / 516
20240715-17:15:07 main_test_accuracy 2 0.22800001502037048
20240715-17:15:09 test_accuracy 2 model 4 forward 12 / 528 backward 2 / 472
20240715-17:15:09 main_test_accuracy 2 0.014000000432133675
20240715-17:15:10 wrote gpt_004.pth
20240715-17:15:10 wrote gpt_000.pth
20240715-17:15:18 --- epoch 3 ----------------------------------------
20240715-17:15:18 current_test_accuracies 0.2280 0.0140 0.0150 0.0100 0.0140
20240715-17:15:18 training model 3
20240715-17:15:18 training model 1
20240715-17:24:13 train_perplexity 3 model 1 1.2486546144231132
20240715-17:24:40 train_perplexity 3 model 3 1.2484658028865774
20240715-17:24:43 test_perplexity 3 model 1 1.2075229591652603
20240715-17:25:01 test_perplexity 3 model 3 1.204365996909972
20240715-17:27:55 test_accuracy 3 model 1 forward 120 / 493 backward 62 / 507
20240715-17:27:55 main_test_accuracy 3 0.18200001120567322
20240715-17:27:59 test_accuracy 3 model 3 forward 96 / 506 backward 44 / 494
20240715-17:27:59 main_test_accuracy 3 0.14000000059604645
20240715-17:28:00 wrote gpt_003.pth
20240715-17:28:00 wrote gpt_001.pth
20240715-17:28:09 --- epoch 4 ----------------------------------------
20240715-17:28:09 current_test_accuracies 0.2280 0.1820 0.0150 0.1400 0.0140
20240715-17:28:09 training model 4
20240715-17:28:09 training model 2
20240715-17:37:03 train_perplexity 4 model 2 1.2477534195515345
20240715-17:37:31 train_perplexity 4 model 4 1.249737868340399
20240715-17:37:33 test_perplexity 4 model 2 1.2057924259793467
20240715-17:37:52 test_perplexity 4 model 4 1.2087076547709465
20240715-17:40:44 test_accuracy 4 model 2 forward 79 / 475 backward 25 / 525
20240715-17:40:44 main_test_accuracy 4 0.10400000214576721
20240715-17:40:47 test_accuracy 4 model 4 forward 113 / 528 backward 42 / 472
20240715-17:40:47 main_test_accuracy 4 0.1550000011920929
20240715-17:40:48 wrote gpt_004.pth
20240715-17:40:48 wrote gpt_002.pth
20240715-17:40:57 --- epoch 5 ----------------------------------------
20240715-17:40:57 current_test_accuracies 0.2280 0.1820 0.1040 0.1400 0.1550
20240715-17:40:57 training model 2
20240715-17:40:57 training model 3
20240715-17:49:52 train_perplexity 5 model 3 1.2024429315920526
20240715-17:50:17 train_perplexity 5 model 2 1.2046458259288255
20240715-17:50:22 test_perplexity 5 model 3 1.1904229676303073
20240715-17:50:39 test_perplexity 5 model 2 1.1865104530664998
20240715-17:53:30 test_accuracy 5 model 3 forward 199 / 506 backward 101 / 494
20240715-17:53:30 main_test_accuracy 5 0.30000001192092896
20240715-17:53:35 test_accuracy 5 model 2 forward 175 / 475 backward 161 / 525
20240715-17:53:35 main_test_accuracy 5 0.3360000252723694
20240715-17:53:36 wrote gpt_002.pth
20240715-17:53:36 wrote gpt_003.pth
20240715-17:53:45 --- epoch 6 ----------------------------------------
20240715-17:53:45 current_test_accuracies 0.2280 0.1820 0.3360 0.3000 0.1550
20240715-17:53:45 training model 4
20240715-17:53:45 training model 1
20240715-18:02:40 train_perplexity 6 model 1 1.2024552353687328
20240715-18:03:07 train_perplexity 6 model 4 1.2047341990618097
20240715-18:03:10 test_perplexity 6 model 1 1.1872974373297007
20240715-18:03:28 test_perplexity 6 model 4 1.1887139678613055
20240715-18:06:21 test_accuracy 6 model 1 forward 221 / 493 backward 142 / 507
20240715-18:06:21 main_test_accuracy 6 0.3630000054836273
20240715-18:06:23 test_accuracy 6 model 4 forward 211 / 528 backward 84 / 472
20240715-18:06:23 main_test_accuracy 6 0.29500001668930054
20240715-18:06:25 wrote gpt_004.pth
20240715-18:06:25 wrote gpt_001.pth
20240715-18:06:34 --- epoch 7 ----------------------------------------
20240715-18:06:34 current_test_accuracies 0.2280 0.3630 0.3360 0.3000 0.2950
20240715-18:06:34 training model 0
20240715-18:06:34 training model 4
20240715-18:15:29 train_perplexity 7 model 4 1.1899275702106074
20240715-18:15:55 train_perplexity 7 model 0 1.2006797793766903
20240715-18:15:59 test_perplexity 7 model 4 1.1804004427554906
20240715-18:16:17 test_perplexity 7 model 0 1.1861947857387896
20240715-18:19:08 test_accuracy 7 model 4 forward 284 / 528 backward 156 / 472
20240715-18:19:08 main_test_accuracy 7 0.4400000274181366
20240715-18:19:14 test_accuracy 7 model 0 forward 220 / 484 backward 132 / 516
20240715-18:19:14 main_test_accuracy 7 0.35200002789497375
20240715-18:19:15 wrote gpt_000.pth
20240715-18:19:15 wrote gpt_004.pth
20240715-18:19:23 --- epoch 8 ----------------------------------------
20240715-18:19:23 current_test_accuracies 0.3520 0.3630 0.3360 0.3000 0.4400
20240715-18:19:23 training model 3
20240715-18:19:23 training model 2
20240715-18:28:18 train_perplexity 8 model 2 1.1887773446875023
20240715-18:28:44 train_perplexity 8 model 3 1.1870611081961473
20240715-18:28:48 test_perplexity 8 model 2 1.1791648361746063
20240715-18:29:06 test_perplexity 8 model 3 1.1764893018173475
20240715-18:32:00 test_accuracy 8 model 2 forward 235 / 475 backward 181 / 525
20240715-18:32:00 main_test_accuracy 8 0.41600000858306885
20240715-18:32:03 test_accuracy 8 model 3 forward 265 / 506 backward 202 / 494
20240715-18:32:03 main_test_accuracy 8 0.46700000762939453
20240715-18:32:04 wrote gpt_003.pth
20240715-18:32:05 wrote gpt_002.pth
20240715-18:32:13 --- epoch 9 ----------------------------------------
20240715-18:32:13 current_test_accuracies 0.3520 0.3630 0.4160 0.4670 0.4400
20240715-18:32:13 training model 0
20240715-18:32:13 training model 1
20240715-18:41:08 train_perplexity 9 model 1 1.1860467570356044
20240715-18:41:37 train_perplexity 9 model 0 1.1863951412152316
20240715-18:41:37 test_perplexity 9 model 1 1.1799994311020128
20240715-18:41:57 test_perplexity 9 model 0 1.1785784018999508
20240715-18:44:49 test_accuracy 9 model 1 forward 277 / 493 backward 183 / 507
20240715-18:44:49 main_test_accuracy 9 0.46000000834465027
20240715-18:44:54 test_accuracy 9 model 0 forward 288 / 484 backward 178 / 516
20240715-18:44:54 main_test_accuracy 9 0.4660000205039978
20240715-18:44:55 wrote gpt_000.pth
20240715-18:44:55 wrote gpt_001.pth
20240715-18:45:03 --- epoch 10 ----------------------------------------
20240715-18:45:03 current_test_accuracies 0.4660 0.4600 0.4160 0.4670 0.4400
20240715-18:45:03 training model 2
20240715-18:45:03 training model 4
20240715-18:53:59 train_perplexity 10 model 4 1.1814561558080638
20240715-18:54:26 train_perplexity 10 model 2 1.1808929404488853
20240715-18:54:28 test_perplexity 10 model 4 1.174300502385369
20240715-18:54:47 test_perplexity 10 model 2 1.1745098161074348
20240715-18:57:37 test_accuracy 10 model 4 forward 333 / 528 backward 186 / 472
20240715-18:57:37 main_test_accuracy 10 0.5190000534057617
20240715-18:57:43 test_accuracy 10 model 2 forward 292 / 475 backward 204 / 525
20240715-18:57:43 main_test_accuracy 10 0.4960000216960907
20240715-18:57:44 wrote gpt_002.pth
20240715-18:57:44 wrote gpt_004.pth
20240715-18:57:52 --- epoch 11 ----------------------------------------
20240715-18:57:52 current_test_accuracies 0.4660 0.4600 0.4960 0.4670 0.5190
20240715-18:57:52 training model 1
20240715-18:57:52 training model 0
20240715-19:06:48 train_perplexity 11 model 0 1.1793909567394392
20240715-19:07:12 train_perplexity 11 model 1 1.1805377261447438
20240715-19:07:18 test_perplexity 11 model 0 1.174847118107629
20240715-19:07:35 test_perplexity 11 model 1 1.1749599091565883
20240715-19:10:29 test_accuracy 11 model 0 forward 306 / 484 backward 213 / 516
20240715-19:10:29 main_test_accuracy 11 0.5190000534057617
20240715-19:10:32 test_accuracy 11 model 1 forward 340 / 493 backward 237 / 507
20240715-19:10:32 main_test_accuracy 11 0.5770000219345093
20240715-19:10:34 wrote gpt_001.pth
20240715-19:10:34 wrote gpt_000.pth
20240715-19:10:42 --- epoch 12 ----------------------------------------
20240715-19:10:42 current_test_accuracies 0.5190 0.5770 0.4960 0.4670 0.5190
20240715-19:10:42 training model 3
20240715-19:10:42 training model 2
20240715-19:19:37 train_perplexity 12 model 2 1.1762755184557174
20240715-19:20:03 train_perplexity 12 model 3 1.179923673323153
20240715-19:20:07 test_perplexity 12 model 2 1.1710831213555972
20240715-19:20:25 test_perplexity 12 model 3 1.1732365592527727
20240715-19:23:19 test_accuracy 12 model 2 forward 311 / 475 backward 249 / 525
20240715-19:23:19 main_test_accuracy 12 0.5600000023841858
20240715-19:23:22 test_accuracy 12 model 3 forward 320 / 506 backward 229 / 494
20240715-19:23:22 main_test_accuracy 12 0.5490000247955322
20240715-19:23:23 wrote gpt_003.pth
20240715-19:23:23 wrote gpt_002.pth
20240715-19:23:32 --- epoch 13 ----------------------------------------
20240715-19:23:32 current_test_accuracies 0.5190 0.5770 0.5600 0.5490 0.5190
20240715-19:23:32 training model 0
20240715-19:23:32 training model 4
20240715-19:32:27 train_perplexity 13 model 4 1.1769178773560498
20240715-19:32:55 train_perplexity 13 model 0 1.1751063680128422
20240715-19:32:57 test_perplexity 13 model 4 1.1723348168424697
20240715-19:33:16 test_perplexity 13 model 0 1.1711649808396947
20240715-19:36:06 test_accuracy 13 model 4 forward 358 / 528 backward 196 / 472
20240715-19:36:06 main_test_accuracy 13 0.5540000200271606
20240715-19:36:11 test_accuracy 13 model 0 forward 325 / 484 backward 267 / 516
20240715-19:36:11 main_test_accuracy 13 0.5920000076293945
20240715-19:36:13 wrote gpt_000.pth
20240715-19:36:13 wrote gpt_004.pth
20240715-19:36:22 --- epoch 14 ----------------------------------------
20240715-19:36:22 current_test_accuracies 0.5920 0.5770 0.5600 0.5490 0.5540
20240715-19:36:22 training model 3
20240715-19:36:22 training model 4
20240715-19:45:18 train_perplexity 14 model 4 1.1742961652822685
20240715-19:45:42 train_perplexity 14 model 3 1.1759862217554216
20240715-19:45:48 test_perplexity 14 model 4 1.171107329155257
20240715-19:46:05 test_perplexity 14 model 3 1.1705121622049666
20240715-19:48:56 test_accuracy 14 model 4 forward 377 / 528 backward 233 / 472
20240715-19:48:56 main_test_accuracy 14 0.6100000143051147
20240715-19:49:00 test_accuracy 14 model 3 forward 332 / 506 backward 231 / 494
20240715-19:49:00 main_test_accuracy 14 0.5630000233650208
20240715-19:49:01 wrote gpt_003.pth
20240715-19:49:02 wrote gpt_004.pth
20240715-19:49:10 --- epoch 15 ----------------------------------------
20240715-19:49:10 current_test_accuracies 0.5920 0.5770 0.5600 0.5630 0.6100
20240715-19:49:10 training model 2
20240715-19:49:10 training model 3
20240715-19:58:06 train_perplexity 15 model 3 1.1724367392902866
20240715-19:58:29 train_perplexity 15 model 2 1.17358006746899
20240715-19:58:36 test_perplexity 15 model 3 1.167631982500388
20240715-19:58:52 test_perplexity 15 model 2 1.1699887626054923
20240715-20:01:45 test_accuracy 15 model 3 forward 363 / 506 backward 262 / 494
20240715-20:01:45 main_test_accuracy 15 0.625
20240715-20:01:49 test_accuracy 15 model 2 forward 333 / 475 backward 285 / 525
20240715-20:01:49 main_test_accuracy 15 0.6180000305175781
20240715-20:01:51 wrote gpt_002.pth
20240715-20:01:51 wrote gpt_003.pth
20240715-20:01:58 --- epoch 16 ----------------------------------------
20240715-20:01:58 current_test_accuracies 0.5920 0.5770 0.6180 0.6250 0.6100
20240715-20:01:58 training model 1
20240715-20:01:58 training model 0
20240715-20:10:55 train_perplexity 16 model 0 1.172644219434567
20240715-20:11:18 train_perplexity 16 model 1 1.175089031922472
20240715-20:11:25 test_perplexity 16 model 0 1.1696385890109293
20240715-20:11:41 test_perplexity 16 model 1 1.172811510469653
20240715-20:14:36 test_accuracy 16 model 0 forward 355 / 484 backward 303 / 516
20240715-20:14:36 main_test_accuracy 16 0.6580000519752502
20240715-20:14:40 test_accuracy 16 model 1 forward 366 / 493 backward 290 / 507
20240715-20:14:40 main_test_accuracy 16 0.656000018119812
20240715-20:14:41 wrote gpt_001.pth
20240715-20:14:41 wrote gpt_000.pth
20240715-20:14:50 --- epoch 17 ----------------------------------------
20240715-20:14:50 current_test_accuracies 0.6580 0.6560 0.6180 0.6250 0.6100
20240715-20:14:50 training model 4
20240715-20:14:50 training model 2
20240715-20:23:45 train_perplexity 17 model 2 1.1712131846748688
20240715-20:24:11 train_perplexity 17 model 4 1.1714530071505147
20240715-20:24:15 test_perplexity 17 model 2 1.1683157570856095
20240715-20:24:32 test_perplexity 17 model 4 1.1691343250029471
20240715-20:27:25 test_accuracy 17 model 2 forward 344 / 475 backward 285 / 525
20240715-20:27:25 main_test_accuracy 17 0.6290000081062317
20240715-20:27:27 test_accuracy 17 model 4 forward 393 / 528 backward 240 / 472
20240715-20:27:27 main_test_accuracy 17 0.6330000162124634
20240715-20:27:29 wrote gpt_004.pth
20240715-20:27:29 wrote gpt_002.pth
20240715-20:27:37 --- epoch 18 ----------------------------------------
20240715-20:27:37 current_test_accuracies 0.6580 0.6560 0.6290 0.6250 0.6330
20240715-20:27:37 training model 3
20240715-20:27:37 training model 2
20240715-20:36:33 train_perplexity 18 model 2 1.1700817668566275
20240715-20:36:58 train_perplexity 18 model 3 1.1705111555845278
20240715-20:37:03 test_perplexity 18 model 2 1.1668778790498655
20240715-20:37:20 test_perplexity 18 model 3 1.166153519205439
20240715-20:40:14 test_accuracy 18 model 2 forward 358 / 475 backward 304 / 525
20240715-20:40:14 main_test_accuracy 18 0.6620000600814819
20240715-20:40:17 test_accuracy 18 model 3 forward 368 / 506 backward 281 / 494
20240715-20:40:17 main_test_accuracy 18 0.6490000486373901
20240715-20:40:18 wrote gpt_003.pth
20240715-20:40:19 wrote gpt_002.pth
20240715-20:40:28 --- epoch 19 ----------------------------------------
20240715-20:40:28 current_test_accuracies 0.6580 0.6560 0.6620 0.6490 0.6330
20240715-20:40:28 training model 4
20240715-20:40:28 training model 3
20240715-20:49:23 train_perplexity 19 model 3 1.1692673574381276
20240715-20:49:49 train_perplexity 19 model 4 1.1701120163762246
20240715-20:49:53 test_perplexity 19 model 3 1.1649494398341158
20240715-20:50:11 test_perplexity 19 model 4 1.167223843206261
20240715-20:53:01 test_accuracy 19 model 3 forward 392 / 506 backward 290 / 494
20240715-20:53:01 main_test_accuracy 19 0.6820000410079956
20240715-20:53:05 test_accuracy 19 model 4 forward 398 / 528 backward 272 / 472
20240715-20:53:05 main_test_accuracy 19 0.6700000166893005
20240715-20:53:06 wrote gpt_004.pth
20240715-20:53:06 wrote gpt_003.pth
20240715-20:53:15 --- epoch 20 ----------------------------------------
20240715-20:53:15 current_test_accuracies 0.6580 0.6560 0.6620 0.6820 0.6700
20240715-20:53:15 training model 1
20240715-20:53:15 training model 0
20240715-21:02:11 train_perplexity 20 model 0 1.169896050108601
20240715-21:02:34 train_perplexity 20 model 1 1.17294460842681
20240715-21:02:41 test_perplexity 20 model 0 1.1677458150422304
20240715-21:02:57 test_perplexity 20 model 1 1.1714833241090448
20240715-21:05:51 test_accuracy 20 model 0 forward 353 / 484 backward 325 / 516
20240715-21:05:51 main_test_accuracy 20 0.6780000329017639
20240715-21:05:54 test_accuracy 20 model 1 forward 380 / 493 backward 292 / 507
20240715-21:05:54 main_test_accuracy 20 0.6720000505447388
20240715-21:05:55 wrote gpt_001.pth
20240715-21:05:56 wrote gpt_000.pth
20240715-21:06:04 --- epoch 21 ----------------------------------------
20240715-21:06:04 current_test_accuracies 0.6780 0.6720 0.6620 0.6820 0.6700
20240715-21:06:04 training model 2
20240715-21:06:04 training model 4
20240715-21:14:59 train_perplexity 21 model 4 1.1689778574751686
20240715-21:15:22 train_perplexity 21 model 2 1.1680351623012462
20240715-21:15:33 test_perplexity 21 model 4 1.1666195908932457
20240715-21:15:47 test_perplexity 21 model 2 1.1672975064330555
20240715-21:18:40 test_accuracy 21 model 4 forward 414 / 528 backward 280 / 472
20240715-21:18:40 main_test_accuracy 21 0.6940000057220459
20240715-21:18:44 test_accuracy 21 model 2 forward 350 / 475 backward 328 / 525
20240715-21:18:44 main_test_accuracy 21 0.6780000329017639
20240715-21:18:46 wrote gpt_002.pth
20240715-21:18:46 wrote gpt_004.pth
20240715-21:18:54 --- epoch 22 ----------------------------------------
20240715-21:18:54 current_test_accuracies 0.6780 0.6720 0.6780 0.6820 0.6940
20240715-21:18:54 training model 1
20240715-21:18:54 training model 0
20240715-21:27:50 train_perplexity 22 model 0 1.1685482953865127
20240715-21:28:14 train_perplexity 22 model 1 1.1704900474918538
20240715-21:28:20 test_perplexity 22 model 0 1.166679467986111
20240715-21:28:37 test_perplexity 22 model 1 1.1708365455785255
20240715-21:31:31 test_accuracy 22 model 0 forward 366 / 484 backward 332 / 516
20240715-21:31:31 main_test_accuracy 22 0.6980000138282776
20240715-21:31:35 test_accuracy 22 model 1 forward 377 / 493 backward 305 / 507
20240715-21:31:35 main_test_accuracy 22 0.6820000410079956
20240715-21:31:36 wrote gpt_001.pth
20240715-21:31:37 wrote gpt_000.pth
20240715-21:31:45 --- epoch 23 ----------------------------------------
20240715-21:31:45 current_test_accuracies 0.6980 0.6820 0.6780 0.6820 0.6940
20240715-21:31:45 training model 2
20240715-21:31:45 training model 1
20240715-21:40:40 train_perplexity 23 model 1 1.168715881157189
20240715-21:41:06 train_perplexity 23 model 2 1.167968844107784
20240715-21:41:11 test_perplexity 23 model 1 1.168378660246657
20240715-21:41:29 test_perplexity 23 model 2 1.163832623425657
20240715-21:44:21 test_accuracy 23 model 1 forward 396 / 493 backward 336 / 507
20240715-21:44:21 main_test_accuracy 23 0.7320000529289246
20240715-21:44:25 test_accuracy 23 model 2 forward 372 / 475 backward 317 / 525
20240715-21:44:25 main_test_accuracy 23 0.6890000104904175
20240715-21:44:26 wrote gpt_002.pth
20240715-21:44:27 wrote gpt_001.pth
20240715-21:44:35 --- epoch 24 ----------------------------------------
20240715-21:44:35 current_test_accuracies 0.6980 0.7320 0.6890 0.6820 0.6940
20240715-21:44:35 training model 3
20240715-21:44:35 training model 2
20240715-21:53:31 train_perplexity 24 model 2 1.1660771835458397
20240715-21:53:57 train_perplexity 24 model 3 1.168118211577099
20240715-21:54:01 test_perplexity 24 model 2 1.1631129592662688
20240715-21:54:19 test_perplexity 24 model 3 1.1644743056952902
20240715-21:57:12 test_accuracy 24 model 2 forward 381 / 475 backward 345 / 525
20240715-21:57:12 main_test_accuracy 24 0.7260000109672546
20240715-21:57:15 test_accuracy 24 model 3 forward 405 / 506 backward 314 / 494
20240715-21:57:15 main_test_accuracy 24 0.7190000414848328
20240715-21:57:17 wrote gpt_003.pth
20240715-21:57:17 wrote gpt_002.pth
20240715-21:57:25 --- epoch 25 ----------------------------------------
20240715-21:57:25 current_test_accuracies 0.6980 0.7320 0.7260 0.7190 0.6940
20240715-21:57:25 training model 4
20240715-21:57:25 training model 0
20240715-22:06:21 train_perplexity 25 model 0 1.1671032171767446
20240715-22:06:46 train_perplexity 25 model 4 1.1682154181416815
20240715-22:06:51 test_perplexity 25 model 0 1.1661922147829324
20240715-22:07:08 test_perplexity 25 model 4 1.1656109425605414
20240715-22:10:02 test_accuracy 25 model 0 forward 373 / 484 backward 327 / 516
20240715-22:10:02 main_test_accuracy 25 0.7000000476837158
20240715-22:10:04 test_accuracy 25 model 4 forward 410 / 528 backward 286 / 472
20240715-22:10:04 main_test_accuracy 25 0.6960000395774841
20240715-22:10:05 wrote gpt_004.pth
20240715-22:10:05 wrote gpt_000.pth
20240715-22:10:14 --- epoch 26 ----------------------------------------
20240715-22:10:14 current_test_accuracies 0.7000 0.7320 0.7260 0.7190 0.6960
20240715-22:10:14 training model 4
20240715-22:10:14 training model 0
20240715-22:19:09 train_perplexity 26 model 0 1.1659825606311938
20240715-22:19:35 train_perplexity 26 model 4 1.1668103504397218
20240715-22:19:39 test_perplexity 26 model 0 1.165275162658801
20240715-22:19:57 test_perplexity 26 model 4 1.1640821661046212
20240715-22:22:51 test_accuracy 26 model 0 forward 372 / 484 backward 336 / 516
20240715-22:22:51 main_test_accuracy 26 0.7080000042915344
20240715-22:22:53 test_accuracy 26 model 4 forward 422 / 528 backward 295 / 472
20240715-22:22:53 main_test_accuracy 26 0.7170000076293945
20240715-22:22:55 wrote gpt_004.pth
20240715-22:22:55 wrote gpt_000.pth
20240715-22:23:04 --- epoch 27 ----------------------------------------
20240715-22:23:04 current_test_accuracies 0.7080 0.7320 0.7260 0.7190 0.7170
20240715-22:23:04 training model 0
20240715-22:23:04 training model 4
20240715-22:31:59 train_perplexity 27 model 4 1.1654629430431782
20240715-22:32:21 train_perplexity 27 model 0 1.1651104936743868
20240715-22:32:32 test_perplexity 27 model 4 1.1630204496172947
20240715-22:32:46 test_perplexity 27 model 0 1.1639659489277983
20240715-22:35:39 test_accuracy 27 model 4 forward 423 / 528 backward 308 / 472
20240715-22:35:39 main_test_accuracy 27 0.7310000061988831
20240715-22:35:43 test_accuracy 27 model 0 forward 407 / 484 backward 328 / 516
20240715-22:35:43 main_test_accuracy 27 0.7350000143051147
20240715-22:35:45 wrote gpt_000.pth
20240715-22:35:45 wrote gpt_004.pth
20240715-22:35:54 --- epoch 28 ----------------------------------------
20240715-22:35:54 current_test_accuracies 0.7350 0.7320 0.7260 0.7190 0.7310
20240715-22:35:54 training model 3
20240715-22:35:54 training model 2
20240715-22:44:49 train_perplexity 28 model 2 1.1657030913940218
20240715-22:45:14 train_perplexity 28 model 3 1.1663051819100465
20240715-22:45:20 test_perplexity 28 model 2 1.162806812520627
20240715-22:45:36 test_perplexity 28 model 3 1.1635968892744717
20240715-22:48:31 test_accuracy 28 model 2 forward 389 / 475 backward 326 / 525
20240715-22:48:31 main_test_accuracy 28 0.7150000333786011
20240715-22:48:34 test_accuracy 28 model 3 forward 391 / 506 backward 334 / 494
20240715-22:48:34 main_test_accuracy 28 0.7250000238418579
20240715-22:48:35 wrote gpt_003.pth
20240715-22:48:35 wrote gpt_002.pth
20240715-22:48:43 --- epoch 29 ----------------------------------------
20240715-22:48:43 current_test_accuracies 0.7350 0.7320 0.7150 0.7250 0.7310
20240715-22:48:43 training model 2
20240715-22:48:43 training model 3
20240715-22:57:38 train_perplexity 29 model 3 1.1657818554938788
20240715-22:58:04 train_perplexity 29 model 2 1.1643540590460928
20240715-22:58:08 test_perplexity 29 model 3 1.1618546713501108
20240715-22:58:26 test_perplexity 29 model 2 1.161369152437375
20240715-23:01:18 test_accuracy 29 model 3 forward 414 / 506 backward 328 / 494
20240715-23:01:18 main_test_accuracy 29 0.7420000433921814
20240715-23:01:22 test_accuracy 29 model 2 forward 393 / 475 backward 354 / 525
20240715-23:01:22 main_test_accuracy 29 0.7470000386238098
20240715-23:01:24 wrote gpt_002.pth
20240715-23:01:24 wrote gpt_003.pth
20240715-23:01:33 --- epoch 30 ----------------------------------------
20240715-23:01:33 current_test_accuracies 0.7350 0.7320 0.7470 0.7420 0.7310
20240715-23:01:33 training model 4
20240715-23:01:33 training model 1
20240715-23:10:29 train_perplexity 30 model 1 1.1677036302420989
20240715-23:10:54 train_perplexity 30 model 4 1.1649089163528707
20240715-23:10:58 test_perplexity 30 model 1 1.1666496176618208
20240715-23:11:16 test_perplexity 30 model 4 1.1627261442523724
20240715-23:12:29 argv ./main.py --result_dir=results_grids_v6 --resume --seed=34231
20240715-23:12:29 args.log_filename train.log
20240715-23:12:29 args.result_dir results_grids_v6
20240715-23:12:29 args.seed 34231
20240715-23:12:29 args.resume True
20240715-23:12:29 args.max_percents_of_test_in_train -1
20240715-23:12:29 args.nb_epochs 10000
20240715-23:12:29 args.batch_size 25
20240715-23:12:29 args.physical_batch_size None
20240715-23:12:29 args.nb_train_samples 100000
20240715-23:12:29 args.nb_test_samples 10000
20240715-23:12:29 args.nb_new_c_quizzes_for_train None
20240715-23:12:29 args.nb_new_c_quizzes_for_test None
20240715-23:12:29 args.learning_rate 0.0005
20240715-23:12:29 args.model 37M
20240715-23:12:29 args.dim_model 512
20240715-23:12:29 args.dim_keys 64
20240715-23:12:29 args.dim_hidden 2048
20240715-23:12:29 args.nb_heads 8
20240715-23:12:29 args.nb_blocks 12
20240715-23:12:29 args.dropout 0.1
20240715-23:12:29 args.deterministic_synthesis False
20240715-23:12:29 args.problem grids
20240715-23:12:29 args.nb_threads 1
20240715-23:12:29 args.gpus all
20240715-23:12:29 args.nb_gpts 5
20240715-23:12:29 args.accuracy_to_make_c_quizzes 0.9
20240715-23:12:29 args.proba_understands 0.9
20240715-23:12:29 args.proba_not_understands 0.5
20240715-23:12:29 args.generation_temperature 2
20240715-23:12:29 args.c_quiz_validation_mode predict
20240715-23:12:29 args.dirty_debug False
20240715-23:12:29 args.grids_tasks None
20240715-23:12:29 args.sky_height 6
20240715-23:12:29 args.sky_width 8
20240715-23:12:29 args.sky_nb_birds 3
20240715-23:12:29 args.sky_nb_iterations 2
20240715-23:12:29 args.sky_speed 3
20240715-23:12:29 main_device cuda:0 gpus ['cuda:0', 'cuda:1']
20240715-23:12:29 vocabulary_size 13
20240715-23:12:29 creating model 0 and its w_quizzes
20240715-23:14:16 creating model 1 and its w_quizzes
20240715-23:16:03 creating model 2 and its w_quizzes
20240715-23:17:50 creating model 3 and its w_quizzes
20240715-23:19:36 creating model 4 and its w_quizzes
20240715-23:21:24 successfully loaded gpt_000.pth
20240715-23:21:24 successfully loaded gpt_001.pth
20240715-23:21:24 successfully loaded gpt_002.pth
20240715-23:21:24 successfully loaded gpt_003.pth
20240715-23:21:24 successfully loaded gpt_004.pth
20240715-23:21:24 cannot find c_quizzes.pth
20240715-23:21:24 nb_parameters 37817357 (37M)
20240715-23:21:26 nb_new_c_quizzes_for_train 1000 nb_new_c_quizzes_for_test 100
20240715-23:21:26 --- epoch 0 ----------------------------------------
20240715-23:21:26 current_test_accuracies 0.7350 0.7320 0.7470 0.7420 0.7310
20240715-23:21:26 training model 4
20240715-23:21:26 training model 1
20240715-23:30:23 train_perplexity 0 model 1 1.1681944976711691
20240715-23:30:43 train_perplexity 0 model 4 1.164876289540722
20240715-23:30:55 test_perplexity 0 model 1 1.1663235110996584
20240715-23:31:09 test_perplexity 0 model 4 1.16380195622991
20240715-23:34:02 test_accuracy 0 model 1 forward 400 / 518 backward 311 / 482
20240715-23:34:04 test_accuracy 0 model 4 forward 409 / 527 backward 304 / 473
20240715-23:34:06 wrote gpt_004.pth
20240715-23:34:06 wrote gpt_001.pth
20240715-23:34:13 --- epoch 1 ----------------------------------------
20240715-23:34:13 current_test_accuracies 0.7350 0.7110 0.7470 0.7420 0.7130
20240715-23:34:13 training model 1
20240715-23:34:13 training model 4
20240715-23:43:09 train_perplexity 1 model 4 1.164717532437211
20240715-23:43:32 train_perplexity 1 model 1 1.1662691029360468
20240715-23:43:39 test_perplexity 1 model 4 1.1625640989502355
20240715-23:43:55 test_perplexity 1 model 1 1.1653906061288677
20240715-23:46:46 test_accuracy 1 model 4 forward 414 / 527 backward 297 / 473
20240715-23:46:50 test_accuracy 1 model 1 forward 415 / 518 backward 287 / 482
20240715-23:46:52 wrote gpt_001.pth
20240715-23:46:52 wrote gpt_004.pth
20240715-23:47:01 --- epoch 2 ----------------------------------------
20240715-23:47:01 current_test_accuracies 0.7350 0.7020 0.7470 0.7420 0.7110
20240715-23:47:01 training model 1
20240715-23:47:01 training model 4
20240715-23:55:57 train_perplexity 2 model 4 1.1636115875279662
20240715-23:56:21 train_perplexity 2 model 1 1.1654286267286715
20240715-23:56:27 test_perplexity 2 model 4 1.162122379094477
20240715-23:56:44 test_perplexity 2 model 1 1.1645077035190705
20240715-23:59:34 test_accuracy 2 model 4 forward 432 / 527 backward 313 / 473
20240715-23:59:38 test_accuracy 2 model 1 forward 429 / 518 backward 309 / 482
20240715-23:59:40 wrote gpt_001.pth
20240715-23:59:40 wrote gpt_004.pth
20240715-23:59:48 --- epoch 3 ----------------------------------------
20240715-23:59:48 current_test_accuracies 0.7350 0.7380 0.7470 0.7420 0.7450
20240715-23:59:48 training model 0
20240715-23:59:48 training model 1
20240716-00:08:43 train_perplexity 3 model 1 1.1647133883052014
20240716-00:09:10 train_perplexity 3 model 0 1.164374913117596
20240716-00:09:13 test_perplexity 3 model 1 1.1638636587958564
20240716-00:09:31 test_perplexity 3 model 0 1.1604957903478617
20240716-00:12:22 test_accuracy 3 model 1 forward 425 / 518 backward 324 / 482
20240716-00:12:27 test_accuracy 3 model 0 forward 416 / 493 backward 365 / 507
20240716-00:12:28 wrote gpt_000.pth
20240716-00:12:29 wrote gpt_001.pth
20240716-00:12:36 --- epoch 4 ----------------------------------------
20240716-00:12:36 current_test_accuracies 0.7810 0.7490 0.7470 0.7420 0.7450
20240716-00:12:36 training model 3
20240716-00:12:36 training model 4
20240716-00:21:32 train_perplexity 4 model 4 1.1628591388478227
20240716-00:21:56 train_perplexity 4 model 3 1.1646216534401794
20240716-00:22:05 test_perplexity 4 model 4 1.1615093278146726
20240716-00:22:20 test_perplexity 4 model 3 1.1642830094318648
20240716-00:25:11 test_accuracy 4 model 4 forward 433 / 527 backward 334 / 473
20240716-00:25:15 test_accuracy 4 model 3 forward 395 / 508 backward 337 / 492
20240716-00:25:17 wrote gpt_003.pth
20240716-00:25:17 wrote gpt_004.pth
20240716-00:25:25 --- epoch 5 ----------------------------------------
20240716-00:25:25 current_test_accuracies 0.7810 0.7490 0.7470 0.7320 0.7670
20240716-00:25:25 training model 3
20240716-00:25:25 training model 2
20240716-00:34:21 train_perplexity 5 model 2 1.164244148198222
20240716-00:34:46 train_perplexity 5 model 3 1.1640086717663654
20240716-00:34:51 test_perplexity 5 model 2 1.161365350370555
20240716-00:35:08 test_perplexity 5 model 3 1.1633783953270576
20240716-00:37:59 test_accuracy 5 model 2 forward 412 / 512 backward 359 / 488
20240716-00:38:03 test_accuracy 5 model 3 forward 406 / 508 backward 347 / 492
20240716-00:38:04 wrote gpt_003.pth
20240716-00:38:04 wrote gpt_002.pth
20240716-00:38:13 --- epoch 6 ----------------------------------------
20240716-00:38:13 current_test_accuracies 0.7810 0.7490 0.7710 0.7530 0.7670
20240716-00:38:13 training model 1
20240716-00:38:13 training model 3
20240716-00:47:08 train_perplexity 6 model 3 1.1627806601086885
20240716-00:47:33 train_perplexity 6 model 1 1.1641778893442314
20240716-00:47:38 test_perplexity 6 model 3 1.1634010089780014
20240716-00:47:55 test_perplexity 6 model 1 1.162844999331882
20240716-00:50:47 test_accuracy 6 model 3 forward 407 / 508 backward 349 / 492
20240716-00:50:51 test_accuracy 6 model 1 forward 444 / 518 backward 340 / 482
20240716-00:50:52 wrote gpt_001.pth
20240716-00:50:52 wrote gpt_003.pth
20240716-00:51:01 --- epoch 7 ----------------------------------------
20240716-00:51:01 current_test_accuracies 0.7810 0.7840 0.7710 0.7560 0.7670
20240716-00:51:01 training model 3
20240716-00:51:01 training model 4
20240716-00:59:56 train_perplexity 7 model 4 1.1625470882964364
20240716-01:00:19 train_perplexity 7 model 3 1.1630592867251963
20240716-01:00:27 test_perplexity 7 model 4 1.1613513975885366
20240716-01:00:42 test_perplexity 7 model 3 1.1628012894544302
20240716-01:03:33 test_accuracy 7 model 4 forward 434 / 527 backward 326 / 473
20240716-01:03:37 test_accuracy 7 model 3 forward 412 / 508 backward 357 / 492
20240716-01:03:39 wrote gpt_003.pth
20240716-01:03:39 wrote gpt_004.pth
20240716-01:03:47 --- epoch 8 ----------------------------------------
20240716-01:03:47 current_test_accuracies 0.7810 0.7840 0.7710 0.7690 0.7600
20240716-01:03:47 training model 4
20240716-01:03:47 training model 3
20240716-01:12:43 train_perplexity 8 model 3 1.1626809316469238
20240716-01:13:04 train_perplexity 8 model 4 1.161878773303413
20240716-01:13:15 test_perplexity 8 model 3 1.1613891562815921
20240716-01:13:29 test_perplexity 8 model 4 1.160727514445793
20240716-01:16:23 test_accuracy 8 model 3 forward 419 / 508 backward 373 / 492
20240716-01:16:26 test_accuracy 8 model 4 forward 436 / 527 backward 337 / 473
20240716-01:16:27 wrote gpt_004.pth
20240716-01:16:27 wrote gpt_003.pth
20240716-01:16:35 --- epoch 9 ----------------------------------------
20240716-01:16:35 current_test_accuracies 0.7810 0.7840 0.7710 0.7920 0.7730
20240716-01:16:35 training model 2
20240716-01:16:35 training model 4
20240716-01:25:31 train_perplexity 9 model 4 1.1612737813858176
20240716-01:25:53 train_perplexity 9 model 2 1.162876785355957
20240716-01:26:02 test_perplexity 9 model 4 1.161386770580562
20240716-01:26:17 test_perplexity 9 model 2 1.160122364440327
20240716-01:29:09 test_accuracy 9 model 4 forward 447 / 527 backward 343 / 473
20240716-01:29:13 test_accuracy 9 model 2 forward 440 / 512 backward 350 / 488
20240716-01:29:15 wrote gpt_002.pth
20240716-01:29:15 wrote gpt_004.pth
20240716-01:29:23 --- epoch 10 ----------------------------------------
20240716-01:29:23 current_test_accuracies 0.7810 0.7840 0.7900 0.7920 0.7900
20240716-01:29:23 training model 0
20240716-01:29:23 training model 1
20240716-01:38:19 train_perplexity 10 model 1 1.1637389604537762
20240716-01:38:41 train_perplexity 10 model 0 1.1639329700985483
20240716-01:38:50 test_perplexity 10 model 1 1.1629297568193566
20240716-01:39:05 test_perplexity 10 model 0 1.159376496995672
20240716-01:41:57 test_accuracy 10 model 1 forward 437 / 518 backward 335 / 482
20240716-01:42:01 test_accuracy 10 model 0 forward 415 / 493 backward 360 / 507
20240716-01:42:02 wrote gpt_000.pth
20240716-01:42:02 wrote gpt_001.pth
20240716-01:42:10 --- epoch 11 ----------------------------------------
20240716-01:42:10 current_test_accuracies 0.7750 0.7720 0.7900 0.7920 0.7900
20240716-01:42:10 training model 1
20240716-01:42:10 training model 0
20240716-01:51:06 train_perplexity 11 model 0 1.163244358405057
20240716-01:51:30 train_perplexity 11 model 1 1.1629231664489417
20240716-01:51:36 test_perplexity 11 model 0 1.1591030218181229
20240716-01:51:52 test_perplexity 11 model 1 1.1622342183091998
20240716-01:54:46 test_accuracy 11 model 0 forward 410 / 493 backward 363 / 507
20240716-01:54:49 test_accuracy 11 model 1 forward 436 / 518 backward 342 / 482
20240716-01:54:50 wrote gpt_001.pth
20240716-01:54:50 wrote gpt_000.pth
20240716-01:54:59 --- epoch 12 ----------------------------------------
20240716-01:54:59 current_test_accuracies 0.7730 0.7780 0.7900 0.7920 0.7900
20240716-01:54:59 training model 0
20240716-01:54:59 training model 1
20240716-02:03:55 train_perplexity 12 model 1 1.1625936984636123
20240716-02:04:18 train_perplexity 12 model 0 1.1619294572248016
20240716-02:04:25 test_perplexity 12 model 1 1.1615735600581762
20240716-02:04:41 test_perplexity 12 model 0 1.1587861808668298
20240716-02:07:34 test_accuracy 12 model 1 forward 440 / 518 backward 352 / 482
20240716-02:07:38 test_accuracy 12 model 0 forward 411 / 493 backward 376 / 507
20240716-02:07:40 wrote gpt_000.pth
20240716-02:07:40 wrote gpt_001.pth
20240716-02:07:48 --- epoch 13 ----------------------------------------
20240716-02:07:48 current_test_accuracies 0.7870 0.7920 0.7900 0.7920 0.7900
20240716-02:07:48 training model 0
20240716-02:07:48 training model 2
20240716-02:16:43 train_perplexity 13 model 2 1.162354481337884
20240716-02:17:09 train_perplexity 13 model 0 1.1623766462564102
20240716-02:17:12 test_perplexity 13 model 2 1.1597140779644217
20240716-02:17:30 test_perplexity 13 model 0 1.1594828119596952
20240716-02:20:23 test_accuracy 13 model 2 forward 442 / 512 backward 363 / 488
20240716-02:20:27 test_accuracy 13 model 0 forward 427 / 493 backward 369 / 507
20240716-02:20:28 wrote gpt_000.pth
20240716-02:20:29 wrote gpt_002.pth
20240716-02:20:37 --- epoch 14 ----------------------------------------
20240716-02:20:37 current_test_accuracies 0.7960 0.7920 0.8050 0.7920 0.7900
20240716-02:20:37 training model 4
20240716-02:20:37 training model 1
20240716-02:29:33 train_perplexity 14 model 1 1.162338202504428
20240716-02:29:59 train_perplexity 14 model 4 1.1607271252167721
20240716-02:30:02 test_perplexity 14 model 1 1.1609135185628376
20240716-02:30:20 test_perplexity 14 model 4 1.1602774537325466
20240716-02:33:13 test_accuracy 14 model 1 forward 445 / 518 backward 347 / 482
20240716-02:33:16 test_accuracy 14 model 4 forward 446 / 527 backward 349 / 473
20240716-02:33:17 wrote gpt_004.pth
20240716-02:33:18 wrote gpt_001.pth
20240716-02:33:26 --- epoch 15 ----------------------------------------
20240716-02:33:26 current_test_accuracies 0.7960 0.7920 0.8050 0.7920 0.7950
20240716-02:33:26 training model 1
20240716-02:33:26 training model 3
20240716-02:42:21 train_perplexity 15 model 3 1.1621087333068707
20240716-02:42:49 train_perplexity 15 model 1 1.1613004265709974
20240716-02:42:51 test_perplexity 15 model 3 1.1614653354128044
20240716-02:43:10 test_perplexity 15 model 1 1.160437896040018
20240716-02:46:01 test_accuracy 15 model 3 forward 427 / 508 backward 355 / 492
20240716-02:46:05 test_accuracy 15 model 1 forward 450 / 518 backward 341 / 482
20240716-02:46:06 wrote gpt_001.pth
20240716-02:46:07 wrote gpt_003.pth
20240716-02:46:15 --- epoch 16 ----------------------------------------
20240716-02:46:15 current_test_accuracies 0.7960 0.7910 0.8050 0.7820 0.7950
20240716-02:46:15 training model 3
20240716-02:46:15 training model 1
20240716-02:55:10 train_perplexity 16 model 1 1.1616597817339704
20240716-02:55:38 train_perplexity 16 model 3 1.1616554467030782
20240716-02:55:40 test_perplexity 16 model 1 1.1604162772079394
20240716-02:55:58 test_perplexity 16 model 3 1.1609313012941647
20240716-02:58:50 test_accuracy 16 model 1 forward 451 / 518 backward 346 / 482
20240716-02:58:54 test_accuracy 16 model 3 forward 440 / 508 backward 364 / 492
20240716-02:58:55 wrote gpt_003.pth
20240716-02:58:55 wrote gpt_001.pth
20240716-02:59:03 --- epoch 17 ----------------------------------------
20240716-02:59:03 current_test_accuracies 0.7960 0.7970 0.8050 0.8040 0.7950
20240716-02:59:03 training model 4
20240716-02:59:03 training model 0
20240716-03:07:58 train_perplexity 17 model 0 1.1615086282342462
20240716-03:08:26 train_perplexity 17 model 4 1.1611468739766448
20240716-03:08:28 test_perplexity 17 model 0 1.1580814272484872
20240716-03:08:46 test_perplexity 17 model 4 1.1599096235257347
20240716-03:11:40 test_accuracy 17 model 0 forward 442 / 493 backward 388 / 507
20240716-03:11:42 test_accuracy 17 model 4 forward 456 / 527 backward 335 / 473
20240716-03:11:44 wrote gpt_004.pth
20240716-03:11:44 wrote gpt_000.pth
20240716-03:11:52 --- epoch 18 ----------------------------------------
20240716-03:11:52 current_test_accuracies 0.8300 0.7970 0.8050 0.8040 0.7910
20240716-03:11:52 training model 4
20240716-03:11:52 training model 1
20240716-03:20:48 train_perplexity 18 model 1 1.160860669094151
20240716-03:21:12 train_perplexity 18 model 4 1.160765327797207
20240716-03:21:18 test_perplexity 18 model 1 1.1595407280347287
20240716-03:21:35 test_perplexity 18 model 4 1.1593500069497615
20240716-03:24:27 test_accuracy 18 model 1 forward 458 / 518 backward 349 / 482
20240716-03:24:30 test_accuracy 18 model 4 forward 462 / 527 backward 348 / 473
20240716-03:24:31 wrote gpt_004.pth
20240716-03:24:32 wrote gpt_001.pth
20240716-03:24:39 --- epoch 19 ----------------------------------------
20240716-03:24:39 current_test_accuracies 0.8300 0.8070 0.8050 0.8040 0.8100
20240716-03:24:39 training model 3
20240716-03:24:39 training model 2
20240716-03:33:35 train_perplexity 19 model 2 1.161724956637467
20240716-03:34:00 train_perplexity 19 model 3 1.1607931683678647
20240716-03:34:05 test_perplexity 19 model 2 1.1597464539701
20240716-03:34:22 test_perplexity 19 model 3 1.1605013766063685
20240716-03:37:14 test_accuracy 19 model 2 forward 442 / 512 backward 367 / 488
20240716-03:37:18 test_accuracy 19 model 3 forward 436 / 508 backward 367 / 492
20240716-03:37:19 wrote gpt_003.pth
20240716-03:37:20 wrote gpt_002.pth
20240716-03:37:28 --- epoch 20 ----------------------------------------
20240716-03:37:28 current_test_accuracies 0.8300 0.8070 0.8090 0.8030 0.8100
20240716-03:37:28 training model 3
20240716-03:37:28 training model 1
20240716-03:46:24 train_perplexity 20 model 1 1.1615124566821937
20240716-03:46:50 train_perplexity 20 model 3 1.1607335457891645
20240716-03:46:54 test_perplexity 20 model 1 1.1598362485463514
20240716-03:47:12 test_perplexity 20 model 3 1.1602781802583932
20240716-03:50:05 test_accuracy 20 model 1 forward 462 / 518 backward 356 / 482
20240716-03:50:09 test_accuracy 20 model 3 forward 435 / 508 backward 373 / 492
20240716-03:50:10 wrote gpt_003.pth
20240716-03:50:10 wrote gpt_001.pth
20240716-03:50:19 --- epoch 21 ----------------------------------------
20240716-03:50:19 current_test_accuracies 0.8300 0.8180 0.8090 0.8080 0.8100
20240716-03:50:19 training model 3
20240716-03:50:19 training model 2
20240716-03:59:15 train_perplexity 21 model 2 1.1620423541870768
20240716-03:59:39 train_perplexity 21 model 3 1.1605936271204274
20240716-03:59:45 test_perplexity 21 model 2 1.1604616455454617
20240716-04:00:02 test_perplexity 21 model 3 1.1607672926838677
20240716-04:02:53 test_accuracy 21 model 2 forward 441 / 512 backward 372 / 488
20240716-04:02:56 test_accuracy 21 model 3 forward 436 / 508 backward 377 / 492
20240716-04:02:58 wrote gpt_003.pth
20240716-04:02:58 wrote gpt_002.pth
20240716-04:03:06 --- epoch 22 ----------------------------------------
20240716-04:03:06 current_test_accuracies 0.8300 0.8180 0.8130 0.8130 0.8100
20240716-04:03:06 training model 4
20240716-04:03:06 training model 2
20240716-04:12:03 train_perplexity 22 model 2 1.160928976122871
20240716-04:12:28 train_perplexity 22 model 4 1.160888563747833
20240716-04:12:33 test_perplexity 22 model 2 1.1589235767833839
20240716-04:12:50 test_perplexity 22 model 4 1.1593199005489936
20240716-04:15:41 test_accuracy 22 model 2 forward 453 / 512 backward 372 / 488
20240716-04:15:45 test_accuracy 22 model 4 forward 468 / 527 backward 355 / 473
20240716-04:15:46 wrote gpt_004.pth
20240716-04:15:46 wrote gpt_002.pth
20240716-04:15:54 --- epoch 23 ----------------------------------------
20240716-04:15:54 current_test_accuracies 0.8300 0.8180 0.8250 0.8130 0.8230
20240716-04:15:54 training model 3
20240716-04:15:54 training model 1
20240716-04:24:50 train_perplexity 23 model 1 1.1599813433838502
20240716-04:25:12 train_perplexity 23 model 3 1.1603202332322389
20240716-04:25:24 test_perplexity 23 model 1 1.1588670953293396
20240716-04:25:37 test_perplexity 23 model 3 1.160129590586477
20240716-04:28:31 test_accuracy 23 model 1 forward 467 / 518 backward 356 / 482
20240716-04:28:33 test_accuracy 23 model 3 forward 427 / 508 backward 384 / 492
20240716-04:28:35 wrote gpt_003.pth
20240716-04:28:35 wrote gpt_001.pth
20240716-04:28:44 --- epoch 24 ----------------------------------------
20240716-04:28:44 current_test_accuracies 0.8300 0.8230 0.8250 0.8110 0.8230
20240716-04:28:44 training model 3
20240716-04:28:44 training model 1
20240716-04:37:41 train_perplexity 24 model 1 1.1598169791747441
20240716-04:37:58 train_perplexity 24 model 3 1.1593191857851284
20240716-04:38:15 test_perplexity 24 model 1 1.159158501072529
20240716-04:38:25 test_perplexity 24 model 3 1.159302842416855
20240716-04:41:19 test_accuracy 24 model 1 forward 464 / 518 backward 360 / 482
20240716-04:41:22 test_accuracy 24 model 3 forward 443 / 508 backward 384 / 492
20240716-04:41:23 wrote gpt_003.pth
20240716-04:41:23 wrote gpt_001.pth
20240716-04:41:32 --- epoch 25 ----------------------------------------
20240716-04:41:32 current_test_accuracies 0.8300 0.8240 0.8250 0.8270 0.8230
20240716-04:41:32 training model 4
20240716-04:41:32 training model 1
20240716-04:50:28 train_perplexity 25 model 1 1.1588652193020466
20240716-04:50:50 train_perplexity 25 model 4 1.1602855774464085
20240716-04:50:59 test_perplexity 25 model 1 1.1588139943312108
20240716-04:51:14 test_perplexity 25 model 4 1.159438193312184
20240716-04:54:06 test_accuracy 25 model 1 forward 470 / 518 backward 367 / 482
20240716-04:54:09 test_accuracy 25 model 4 forward 461 / 527 backward 360 / 473
20240716-04:54:10 wrote gpt_004.pth
20240716-04:54:11 wrote gpt_001.pth
20240716-04:54:19 --- epoch 26 ----------------------------------------
20240716-04:54:19 current_test_accuracies 0.8300 0.8370 0.8250 0.8270 0.8210
20240716-04:54:19 training model 4
20240716-04:54:19 training model 2
20240716-05:03:16 train_perplexity 26 model 2 1.1609033400964806
20240716-05:03:33 train_perplexity 26 model 4 1.1598703650699327
20240716-05:03:50 test_perplexity 26 model 2 1.1583954550839972
20240716-05:04:01 test_perplexity 26 model 4 1.1590755515544842
20240716-05:06:55 test_accuracy 26 model 2 forward 454 / 512 backward 374 / 488
20240716-05:06:57 test_accuracy 26 model 4 forward 471 / 527 backward 354 / 473
20240716-05:06:58 wrote gpt_004.pth
20240716-05:06:58 wrote gpt_002.pth
20240716-05:07:07 --- epoch 27 ----------------------------------------
20240716-05:07:07 current_test_accuracies 0.8300 0.8370 0.8280 0.8270 0.8250
20240716-05:07:07 training model 4
20240716-05:07:07 training model 3
20240716-05:16:03 train_perplexity 27 model 3 1.159471986138873
20240716-05:16:24 train_perplexity 27 model 4 1.1592705261748082
20240716-05:16:34 test_perplexity 27 model 3 1.159430653211111
20240716-05:16:48 test_perplexity 27 model 4 1.1592835763123552
20240716-05:19:41 test_accuracy 27 model 3 forward 449 / 508 backward 395 / 492
20240716-05:19:43 test_accuracy 27 model 4 forward 474 / 527 backward 353 / 473
20240716-05:19:44 wrote gpt_004.pth
20240716-05:19:45 wrote gpt_003.pth
20240716-05:19:53 --- epoch 28 ----------------------------------------
20240716-05:19:53 current_test_accuracies 0.8300 0.8370 0.8280 0.8440 0.8270
20240716-05:19:53 training model 4
20240716-05:19:53 training model 2
20240716-05:28:50 train_perplexity 28 model 2 1.1602643797273204
20240716-05:29:11 train_perplexity 28 model 4 1.1594482517819793
20240716-05:29:21 test_perplexity 28 model 2 1.1579603340647633
20240716-05:29:35 test_perplexity 28 model 4 1.1582434711511191
20240716-05:32:27 test_accuracy 28 model 2 forward 446 / 512 backward 369 / 488
20240716-05:32:30 test_accuracy 28 model 4 forward 473 / 527 backward 367 / 473
20240716-05:32:31 wrote gpt_004.pth
20240716-05:32:32 wrote gpt_002.pth
20240716-05:32:40 --- epoch 29 ----------------------------------------
20240716-05:32:40 current_test_accuracies 0.8300 0.8370 0.8150 0.8440 0.8400
20240716-05:32:40 training model 2
20240716-05:32:40 training model 0
20240716-05:41:36 train_perplexity 29 model 0 1.1607948978890095
20240716-05:41:57 train_perplexity 29 model 2 1.1602099999411097
20240716-05:42:07 test_perplexity 29 model 0 1.1577161552433821
20240716-05:42:22 test_perplexity 29 model 2 1.1585824743779107
20240716-05:45:17 test_accuracy 29 model 0 forward 429 / 493 backward 389 / 507
20240716-05:45:19 test_accuracy 29 model 2 forward 439 / 512 backward 376 / 488
20240716-05:45:21 wrote gpt_002.pth
20240716-05:45:21 wrote gpt_000.pth
20240716-05:45:28 --- epoch 30 ----------------------------------------
20240716-05:45:28 current_test_accuracies 0.8180 0.8370 0.8150 0.8440 0.8400
20240716-05:45:28 training model 2
20240716-05:45:28 training model 0
20240716-05:54:24 train_perplexity 30 model 0 1.1602356387018358
20240716-05:54:46 train_perplexity 30 model 2 1.160247466016037
20240716-05:54:55 test_perplexity 30 model 0 1.1573310125511018
20240716-05:55:10 test_perplexity 30 model 2 1.1573603851361278
20240716-05:58:04 test_accuracy 30 model 0 forward 444 / 493 backward 405 / 507
20240716-05:58:07 test_accuracy 30 model 2 forward 458 / 512 backward 389 / 488
20240716-05:58:08 wrote gpt_002.pth
20240716-05:58:08 wrote gpt_000.pth
20240716-05:58:15 --- epoch 31 ----------------------------------------
20240716-05:58:15 current_test_accuracies 0.8490 0.8370 0.8470 0.8440 0.8400
20240716-05:58:15 training model 1
20240716-05:58:15 training model 4
20240716-06:07:13 train_perplexity 31 model 4 1.1591334992661257
20240716-06:07:34 train_perplexity 31 model 1 1.1588386073611407
20240716-06:07:44 test_perplexity 31 model 4 1.1576841404799447
20240716-06:07:58 test_perplexity 31 model 1 1.1590938139457425
20240716-06:10:49 test_accuracy 31 model 4 forward 487 / 527 backward 368 / 473
20240716-06:10:53 test_accuracy 31 model 1 forward 475 / 518 backward 365 / 482
20240716-06:10:55 wrote gpt_001.pth
20240716-06:10:55 wrote gpt_004.pth
20240716-06:11:02 --- epoch 32 ----------------------------------------
20240716-06:11:02 current_test_accuracies 0.8490 0.8400 0.8470 0.8440 0.8550
20240716-06:11:02 training model 1
20240716-06:11:02 training model 3
20240716-06:19:58 train_perplexity 32 model 3 1.1592201034812235
20240716-06:20:21 train_perplexity 32 model 1 1.1592556010234496
20240716-06:20:28 test_perplexity 32 model 3 1.1601451868197623
20240716-06:20:44 test_perplexity 32 model 1 1.1585455106234457
20240716-06:23:36 test_accuracy 32 model 3 forward 454 / 508 backward 394 / 492
20240716-06:23:40 test_accuracy 32 model 1 forward 481 / 518 backward 373 / 482
20240716-06:23:41 wrote gpt_001.pth
20240716-06:23:42 wrote gpt_003.pth
20240716-06:23:49 --- epoch 33 ----------------------------------------
20240716-06:23:49 current_test_accuracies 0.8490 0.8540 0.8470 0.8480 0.8550
20240716-06:23:49 training model 2
20240716-06:23:49 training model 3
20240716-06:32:46 train_perplexity 33 model 3 1.1586829350067889
20240716-06:33:07 train_perplexity 33 model 2 1.159421272648845
20240716-06:33:17 test_perplexity 33 model 3 1.1588706923562626
20240716-06:33:31 test_perplexity 33 model 2 1.1573623823340202
20240716-06:36:24 test_accuracy 33 model 3 forward 462 / 508 backward 391 / 492
20240716-06:36:28 test_accuracy 33 model 2 forward 465 / 512 backward 393 / 488
20240716-06:36:29 wrote gpt_002.pth
20240716-06:36:29 wrote gpt_003.pth
20240716-06:36:38 --- epoch 34 ----------------------------------------
20240716-06:36:38 current_test_accuracies 0.8490 0.8540 0.8580 0.8530 0.8550
20240716-06:36:38 training model 0
20240716-06:36:38 training model 3
20240716-06:45:35 train_perplexity 34 model 3 1.1589225751532157
20240716-06:45:58 train_perplexity 34 model 0 1.1603189310471211
20240716-06:46:05 test_perplexity 34 model 3 1.1592272223919107
20240716-06:46:21 test_perplexity 34 model 0 1.1567559800341192
20240716-06:49:13 test_accuracy 34 model 3 forward 465 / 508 backward 383 / 492
20240716-06:49:17 test_accuracy 34 model 0 forward 438 / 493 backward 403 / 507
20240716-06:49:19 wrote gpt_000.pth
20240716-06:49:19 wrote gpt_003.pth
20240716-06:49:28 --- epoch 35 ----------------------------------------
20240716-06:49:28 current_test_accuracies 0.8410 0.8540 0.8580 0.8480 0.8550
20240716-06:49:28 training model 0
20240716-06:49:28 training model 3
20240716-06:58:24 train_perplexity 35 model 3 1.1588686631753855
20240716-06:58:50 train_perplexity 35 model 0 1.1597413587764198
20240716-06:58:53 test_perplexity 35 model 3 1.1587913260252005
20240716-06:59:11 test_perplexity 35 model 0 1.1569315935869422
20240716-07:02:03 test_accuracy 35 model 3 forward 470 / 508 backward 404 / 492
20240716-07:02:08 test_accuracy 35 model 0 forward 448 / 493 backward 383 / 507
20240716-07:02:09 wrote gpt_000.pth
20240716-07:02:09 wrote gpt_003.pth
20240716-07:02:18 --- epoch 36 ----------------------------------------
20240716-07:02:18 current_test_accuracies 0.8310 0.8540 0.8580 0.8740 0.8550
20240716-07:02:18 training model 0
20240716-07:02:18 training model 1
20240716-07:11:14 train_perplexity 36 model 1 1.1594592850066885
20240716-07:11:38 train_perplexity 36 model 0 1.1600589679624906
20240716-07:11:45 test_perplexity 36 model 1 1.1581380499971454
20240716-07:12:01 test_perplexity 36 model 0 1.1565140280096768
20240716-07:14:54 test_accuracy 36 model 1 forward 469 / 518 backward 370 / 482
20240716-07:14:58 test_accuracy 36 model 0 forward 443 / 493 backward 417 / 507
20240716-07:14:59 wrote gpt_000.pth
20240716-07:15:00 wrote gpt_001.pth
20240716-07:15:08 --- epoch 37 ----------------------------------------
20240716-07:15:08 current_test_accuracies 0.8600 0.8390 0.8580 0.8740 0.8550
20240716-07:15:08 training model 1
20240716-07:15:08 training model 4
20240716-07:24:04 train_perplexity 37 model 4 1.159013906716659
20240716-07:24:30 train_perplexity 37 model 1 1.1585327520535558
20240716-07:24:34 test_perplexity 37 model 4 1.157817252061504
20240716-07:24:52 test_perplexity 37 model 1 1.158368932174961
20240716-07:27:42 test_accuracy 37 model 4 forward 475 / 527 backward 368 / 473
20240716-07:27:47 test_accuracy 37 model 1 forward 485 / 518 backward 369 / 482
20240716-07:27:49 wrote gpt_001.pth
20240716-07:27:49 wrote gpt_004.pth
20240716-07:27:57 --- epoch 38 ----------------------------------------
20240716-07:27:57 current_test_accuracies 0.8600 0.8540 0.8580 0.8740 0.8430
20240716-07:27:57 training model 4
20240716-07:27:57 training model 1
20240716-07:36:53 train_perplexity 38 model 1 1.158966728091178