-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHyperparameters_and_training_logs.txt
1215 lines (1215 loc) · 82.2 KB
/
Hyperparameters_and_training_logs.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
downloading to: liuhaotian/llava-v1.5-13b
downloading url: https://weights.replicate.delivery/default/llava-v1.5-13b/006818fc465ebda4c003c0998674d9141d8d95f8/config.json
downloading url: https://weights.replicate.delivery/default/llava-v1.5-13b/006818fc465ebda4c003c0998674d9141d8d95f8/generation_config.json
downloading url: https://weights.replicate.delivery/default/llava-v1.5-13b/006818fc465ebda4c003c0998674d9141d8d95f8/pytorch_model-00001-of-00003.bin
2024-03-01T12:56:35Z | INFO | [ Initiating ] dest=liuhaotian/llava-v1.5-13b/pytorch_model-00001-of-00003.bin minimum_chunk_size=150M url=https://weights.replicate.delivery/default/llava-v1.5-13b/006818fc465ebda4c003c0998674d9141d8d95f8/pytorch_model-00001-of-00003.bin
2024-03-01T12:56:41Z | INFO | [ Complete ] dest=liuhaotian/llava-v1.5-13b/pytorch_model-00001-of-00003.bin size="9.9 GB" total_elapsed=5.592s url=https://weights.replicate.delivery/default/llava-v1.5-13b/006818fc465ebda4c003c0998674d9141d8d95f8/pytorch_model-00001-of-00003.bin
downloading url: https://weights.replicate.delivery/default/llava-v1.5-13b/006818fc465ebda4c003c0998674d9141d8d95f8/pytorch_model-00002-of-00003.bin
2024-03-01T12:56:41Z | INFO | [ Initiating ] dest=liuhaotian/llava-v1.5-13b/pytorch_model-00002-of-00003.bin minimum_chunk_size=150M url=https://weights.replicate.delivery/default/llava-v1.5-13b/006818fc465ebda4c003c0998674d9141d8d95f8/pytorch_model-00002-of-00003.bin
2024-03-01T12:56:55Z | INFO | [ Complete ] dest=liuhaotian/llava-v1.5-13b/pytorch_model-00002-of-00003.bin size="9.9 GB" total_elapsed=13.674s url=https://weights.replicate.delivery/default/llava-v1.5-13b/006818fc465ebda4c003c0998674d9141d8d95f8/pytorch_model-00002-of-00003.bin
downloading url: https://weights.replicate.delivery/default/llava-v1.5-13b/006818fc465ebda4c003c0998674d9141d8d95f8/pytorch_model-00003-of-00003.bin
2024-03-01T12:56:56Z | INFO | [ Initiating ] dest=liuhaotian/llava-v1.5-13b/pytorch_model-00003-of-00003.bin minimum_chunk_size=150M url=https://weights.replicate.delivery/default/llava-v1.5-13b/006818fc465ebda4c003c0998674d9141d8d95f8/pytorch_model-00003-of-00003.bin
2024-03-01T12:57:02Z | INFO | [ Complete ] dest=liuhaotian/llava-v1.5-13b/pytorch_model-00003-of-00003.bin size="6.2 GB" total_elapsed=6.506s url=https://weights.replicate.delivery/default/llava-v1.5-13b/006818fc465ebda4c003c0998674d9141d8d95f8/pytorch_model-00003-of-00003.bin
downloading url: https://weights.replicate.delivery/default/llava-v1.5-13b/006818fc465ebda4c003c0998674d9141d8d95f8/pytorch_model.bin.index.json
downloading url: https://weights.replicate.delivery/default/llava-v1.5-13b/006818fc465ebda4c003c0998674d9141d8d95f8/special_tokens_map.json
downloading url: https://weights.replicate.delivery/default/llava-v1.5-13b/006818fc465ebda4c003c0998674d9141d8d95f8/tokenizer.model
2024-03-01T12:57:02Z | INFO | [ Initiating ] dest=liuhaotian/llava-v1.5-13b/tokenizer.model minimum_chunk_size=150M url=https://weights.replicate.delivery/default/llava-v1.5-13b/006818fc465ebda4c003c0998674d9141d8d95f8/tokenizer.model
2024-03-01T12:57:03Z | INFO | [ Complete ] dest=liuhaotian/llava-v1.5-13b/tokenizer.model size="500 kB" total_elapsed=0.024s url=https://weights.replicate.delivery/default/llava-v1.5-13b/006818fc465ebda4c003c0998674d9141d8d95f8/tokenizer.model
downloading url: https://weights.replicate.delivery/default/llava-v1.5-13b/006818fc465ebda4c003c0998674d9141d8d95f8/tokenizer_config.json
downloading took: 27.588810682296753
downloading to: openai/clip-vit-large-patch14-336
downloading url: https://weights.replicate.delivery/default/clip-vit-large-patch14-336/ce19dc912ca5cd21c8a653c79e251e808ccabcd1/config.json
downloading url: https://weights.replicate.delivery/default/clip-vit-large-patch14-336/ce19dc912ca5cd21c8a653c79e251e808ccabcd1/preprocessor_config.json
downloading url: https://weights.replicate.delivery/default/clip-vit-large-patch14-336/ce19dc912ca5cd21c8a653c79e251e808ccabcd1/pytorch_model.bin
2024-03-01T12:57:03Z | INFO | [ Initiating ] dest=openai/clip-vit-large-patch14-336/pytorch_model.bin minimum_chunk_size=150M url=https://weights.replicate.delivery/default/clip-vit-large-patch14-336/ce19dc912ca5cd21c8a653c79e251e808ccabcd1/pytorch_model.bin
2024-03-01T12:57:04Z | INFO | [ Complete ] dest=openai/clip-vit-large-patch14-336/pytorch_model.bin size="1.7 GB" total_elapsed=1.001s url=https://weights.replicate.delivery/default/clip-vit-large-patch14-336/ce19dc912ca5cd21c8a653c79e251e808ccabcd1/pytorch_model.bin
downloading took: 1.2197351455688477
[2024-03-01 12:57:07,400] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-03-01 12:57:09,118] [WARNING] [runner.py:196:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
[2024-03-01 12:57:09,118] [INFO] [runner.py:555:main] cmd = /root/.pyenv/versions/3.11.7/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None llava/train/train_mem.py --model_name_or_path liuhaotian/llava-v1.5-13b --data_path /tmp/tmph2qwkzvb/data.json --image_folder /tmp/tmph2qwkzvb/images --vision_tower openai/clip-vit-large-patch14-336 --output_dir /tmp/tmph2qwkzvb/output --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 --save_steps 500 --deepspeed ./scripts/zero3.json --version v1 --mm_projector_type mlp2x_gelu --mm_vision_select_layer -2 --mm_use_im_start_end False --mm_use_im_patch_token False --image_aspect_ratio pad --group_by_modality_length True --bf16 True --num_train_epochs 16 --per_device_train_batch_size 16 --per_device_eval_batch_size 4 --gradient_accumulation_steps 1 --evaluation_strategy no --save_strategy steps --save_total_limit 1 --learning_rate 0.0002 --weight_decay 0. --warmup_ratio 0.03 --lr_scheduler_type cosine --logging_steps 1 --tf32 True --model_max_length 2048 --gradient_checkpointing True --dataloader_num_workers 4 --lazy_preprocess True --report_to none
[2024-03-01 12:57:10,226] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-03-01 12:57:11,905] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_DEV_PACKAGE=libnccl-dev=2.15.5-1+cuda11.8
[2024-03-01 12:57:11,905] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_DEV_PACKAGE_VERSION=2.15.5-1
[2024-03-01 12:57:11,905] [INFO] [launch.py:138:main] 0 NCCL_VERSION=2.15.5-1
[2024-03-01 12:57:11,905] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_DEV_PACKAGE_NAME=libnccl-dev
[2024-03-01 12:57:11,905] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_PACKAGE=libnccl2=2.15.5-1+cuda11.8
[2024-03-01 12:57:11,905] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_PACKAGE_NAME=libnccl2
[2024-03-01 12:57:11,905] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_PACKAGE_VERSION=2.15.5-1
[2024-03-01 12:57:11,905] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0]}
[2024-03-01 12:57:11,905] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=1, node_rank=0
[2024-03-01 12:57:11,906] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0]})
[2024-03-01 12:57:11,906] [INFO] [launch.py:163:main] dist_world_size=1
[2024-03-01 12:57:11,906] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0
[2024-03-01 12:57:14,272] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-03-01 12:57:15,117] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2024-03-01 12:57:15,117] [INFO] [comm.py:594:init_distributed] cdb=None
[2024-03-01 12:57:15,117] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2024-03-01 12:57:15,708] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 13.05B parameters
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
Loading checkpoint shards: 33%|███▎ | 1/3 [00:06<00:13, 6.94s/it]
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:13<00:06, 6.93s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:18<00:00, 5.79s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:18<00:00, 6.10s/it]
Adding LoRA adapters...
[2024-03-01 12:59:50,504] [WARNING] [partition_parameters.py:836:_post_init_method] param `class_embedding` in CLIPVisionEmbeddings not on GPU so was not broadcasted from rank 0
[2024-03-01 12:59:50,608] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 13.35B parameters
Formatting inputs...Skip in lazy mode
Parameter Offload: Total persistent parameters: 749568 in 328 params
0%| | 0/384 [00:00<?, ?it/s]
0%| | 1/384 [00:11<1:14:32, 11.68s/it]
{'loss': 1.47, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.04}
0%| | 1/384 [00:11<1:14:32, 11.68s/it]
1%| | 2/384 [00:20<1:04:54, 10.20s/it]
{'loss': 1.4744, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.08}
1%| | 2/384 [00:20<1:04:54, 10.20s/it]
1%| | 3/384 [00:31<1:06:30, 10.47s/it]
{'loss': 1.4423, 'learning_rate': 5e-05, 'epoch': 0.12}
1%| | 3/384 [00:31<1:06:30, 10.47s/it]
1%| | 4/384 [00:42<1:06:10, 10.45s/it]
{'loss': 1.4187, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.17}
1%| | 4/384 [00:42<1:06:10, 10.45s/it]
1%|▏ | 5/384 [00:50<1:02:07, 9.84s/it]
{'loss': 1.3487, 'learning_rate': 8.333333333333334e-05, 'epoch': 0.21}
1%|▏ | 5/384 [00:50<1:02:07, 9.84s/it]
2%|▏ | 6/384 [01:01<1:04:06, 10.18s/it]
{'loss': 1.0805, 'learning_rate': 0.0001, 'epoch': 0.25}
[2024-03-01 13:01:06,922] [WARNING] [stage3.py:1850:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
2%|▏ | 6/384 [01:01<1:04:06, 10.18s/it]
2%|▏ | 7/384 [01:13<1:06:30, 10.58s/it]
{'loss': 1.0815, 'learning_rate': 0.00011666666666666668, 'epoch': 0.29}
2%|▏ | 7/384 [01:13<1:06:30, 10.58s/it]
2%|▏ | 8/384 [01:23<1:06:26, 10.60s/it]
{'loss': 1.0874, 'learning_rate': 0.00013333333333333334, 'epoch': 0.33}
2%|▏ | 8/384 [01:23<1:06:26, 10.60s/it]
2%|▏ | 9/384 [01:34<1:06:50, 10.69s/it]
{'loss': 0.9228, 'learning_rate': 0.00015000000000000001, 'epoch': 0.38}
2%|▏ | 9/384 [01:34<1:06:50, 10.69s/it]
3%|▎ | 10/384 [01:45<1:07:25, 10.82s/it]
{'loss': 0.9076, 'learning_rate': 0.0001666666666666667, 'epoch': 0.42}
3%|▎ | 10/384 [01:45<1:07:25, 10.82s/it]
3%|▎ | 11/384 [01:56<1:06:36, 10.71s/it]
{'loss': 0.9325, 'learning_rate': 0.00018333333333333334, 'epoch': 0.46}
3%|▎ | 11/384 [01:56<1:06:36, 10.71s/it]
3%|▎ | 12/384 [02:06<1:06:10, 10.67s/it]
{'loss': 0.9108, 'learning_rate': 0.0002, 'epoch': 0.5}
3%|▎ | 12/384 [02:06<1:06:10, 10.67s/it]
3%|▎ | 13/384 [02:16<1:04:31, 10.44s/it]
{'loss': 0.747, 'learning_rate': 0.000199996434000411, 'epoch': 0.54}
3%|▎ | 13/384 [02:16<1:04:31, 10.44s/it]
4%|▎ | 14/384 [02:27<1:05:27, 10.62s/it]
{'loss': 0.8561, 'learning_rate': 0.000199985736255971, 'epoch': 0.58}
4%|▎ | 14/384 [02:27<1:05:27, 10.62s/it]
4%|▍ | 15/384 [02:38<1:06:00, 10.73s/it]
{'loss': 0.7603, 'learning_rate': 0.00019996790752964305, 'epoch': 0.62}
4%|▍ | 15/384 [02:38<1:06:00, 10.73s/it]
4%|▍ | 16/384 [02:49<1:06:20, 10.82s/it]
{'loss': 0.7379, 'learning_rate': 0.0001999429490929718, 'epoch': 0.67}
4%|▍ | 16/384 [02:49<1:06:20, 10.82s/it]
4%|▍ | 17/384 [03:00<1:05:23, 10.69s/it]
{'loss': 0.6265, 'learning_rate': 0.00019991086272599273, 'epoch': 0.71}
4%|▍ | 17/384 [03:00<1:05:23, 10.69s/it]
5%|▍ | 18/384 [03:10<1:04:26, 10.57s/it]
{'loss': 0.6472, 'learning_rate': 0.00019987165071710527, 'epoch': 0.75}
5%|▍ | 18/384 [03:10<1:04:26, 10.57s/it]
5%|▍ | 19/384 [03:21<1:04:32, 10.61s/it]
{'loss': 0.6905, 'learning_rate': 0.00019982531586290958, 'epoch': 0.79}
5%|▍ | 19/384 [03:21<1:04:32, 10.61s/it]
5%|▌ | 20/384 [03:32<1:05:30, 10.80s/it]
{'loss': 0.7185, 'learning_rate': 0.00019977186146800707, 'epoch': 0.83}
5%|▌ | 20/384 [03:32<1:05:30, 10.80s/it]
5%|▌ | 21/384 [03:42<1:04:17, 10.63s/it]
{'loss': 0.5296, 'learning_rate': 0.00019971129134476473, 'epoch': 0.88}
5%|▌ | 21/384 [03:42<1:04:17, 10.63s/it]
6%|▌ | 22/384 [03:53<1:04:35, 10.71s/it]
{'loss': 0.7328, 'learning_rate': 0.0001996436098130433, 'epoch': 0.92}
6%|▌ | 22/384 [03:53<1:04:35, 10.71s/it]
6%|▌ | 23/384 [04:04<1:04:59, 10.80s/it]
{'loss': 0.7784, 'learning_rate': 0.00019956882169988905, 'epoch': 0.96}
6%|▌ | 23/384 [04:04<1:04:59, 10.80s/it]
6%|▋ | 24/384 [04:15<1:04:40, 10.78s/it]
{'loss': 0.7498, 'learning_rate': 0.00019948693233918952, 'epoch': 1.0}
6%|▋ | 24/384 [04:15<1:04:40, 10.78s/it]
7%|▋ | 25/384 [04:26<1:05:43, 10.98s/it]
{'loss': 0.5786, 'learning_rate': 0.00019939794757129332, 'epoch': 1.04}
7%|▋ | 25/384 [04:26<1:05:43, 10.98s/it]
7%|▋ | 26/384 [04:37<1:05:26, 10.97s/it]
{'loss': 0.6414, 'learning_rate': 0.00019930187374259337, 'epoch': 1.08}
7%|▋ | 26/384 [04:37<1:05:26, 10.97s/it]
7%|▋ | 27/384 [04:48<1:04:23, 10.82s/it]
{'loss': 0.57, 'learning_rate': 0.0001991987177050743, 'epoch': 1.12}
7%|▋ | 27/384 [04:48<1:04:23, 10.82s/it]
7%|▋ | 28/384 [04:58<1:02:55, 10.60s/it]
{'loss': 0.512, 'learning_rate': 0.00019908848681582391, 'epoch': 1.17}
7%|▋ | 28/384 [04:58<1:02:55, 10.60s/it]
8%|▊ | 29/384 [05:07<1:01:04, 10.32s/it]
{'loss': 0.4669, 'learning_rate': 0.00019897118893650825, 'epoch': 1.21}
8%|▊ | 29/384 [05:07<1:01:04, 10.32s/it]
8%|▊ | 30/384 [05:18<1:01:14, 10.38s/it]
{'loss': 0.4525, 'learning_rate': 0.00019884683243281116, 'epoch': 1.25}
8%|▊ | 30/384 [05:18<1:01:14, 10.38s/it]
8%|▊ | 31/384 [05:29<1:02:16, 10.58s/it]
{'loss': 0.5426, 'learning_rate': 0.00019871542617383743, 'epoch': 1.29}
8%|▊ | 31/384 [05:29<1:02:16, 10.58s/it]
8%|▊ | 32/384 [05:39<1:01:45, 10.53s/it]
{'loss': 0.5783, 'learning_rate': 0.00019857697953148037, 'epoch': 1.33}
8%|▊ | 32/384 [05:39<1:01:45, 10.53s/it]
9%|▊ | 33/384 [05:50<1:01:10, 10.46s/it]
{'loss': 0.527, 'learning_rate': 0.00019843150237975344, 'epoch': 1.38}
9%|▊ | 33/384 [05:50<1:01:10, 10.46s/it]
9%|▉ | 34/384 [06:00<1:00:10, 10.32s/it]
{'loss': 0.4429, 'learning_rate': 0.00019827900509408581, 'epoch': 1.42}
9%|▉ | 34/384 [06:00<1:00:10, 10.32s/it]
9%|▉ | 35/384 [06:10<1:01:04, 10.50s/it]
{'loss': 0.4537, 'learning_rate': 0.0001981194985505827, 'epoch': 1.46}
9%|▉ | 35/384 [06:10<1:01:04, 10.50s/it]
9%|▉ | 36/384 [06:22<1:01:49, 10.66s/it]
{'loss': 0.543, 'learning_rate': 0.00019795299412524945, 'epoch': 1.5}
9%|▉ | 36/384 [06:22<1:01:49, 10.66s/it]
10%|▉ | 37/384 [06:32<1:01:08, 10.57s/it]
{'loss': 0.4697, 'learning_rate': 0.00019777950369318035, 'epoch': 1.54}
10%|▉ | 37/384 [06:32<1:01:08, 10.57s/it]
10%|▉ | 38/384 [06:42<1:00:34, 10.50s/it]
{'loss': 0.497, 'learning_rate': 0.00019759903962771156, 'epoch': 1.58}
10%|▉ | 38/384 [06:42<1:00:34, 10.50s/it]
10%|█ | 39/384 [06:53<1:01:14, 10.65s/it]
{'loss': 0.4666, 'learning_rate': 0.0001974116147995387, 'epoch': 1.62}
10%|█ | 39/384 [06:53<1:01:14, 10.65s/it]
10%|█ | 40/384 [07:04<1:01:24, 10.71s/it]
{'loss': 0.5682, 'learning_rate': 0.00019721724257579907, 'epoch': 1.67}
10%|█ | 40/384 [07:04<1:01:24, 10.71s/it]
11%|█ | 41/384 [07:14<1:00:00, 10.50s/it]
{'loss': 0.4415, 'learning_rate': 0.000197015936819118, 'epoch': 1.71}
11%|█ | 41/384 [07:14<1:00:00, 10.50s/it]
11%|█ | 42/384 [07:25<1:01:10, 10.73s/it]
{'loss': 0.573, 'learning_rate': 0.00019680771188662044, 'epoch': 1.75}
11%|█ | 42/384 [07:25<1:01:10, 10.73s/it]
11%|█ | 43/384 [07:36<1:01:06, 10.75s/it]
{'loss': 0.509, 'learning_rate': 0.00019659258262890683, 'epoch': 1.79}
11%|█ | 43/384 [07:36<1:01:06, 10.75s/it]
11%|█▏ | 44/384 [07:47<1:01:24, 10.84s/it]
{'loss': 0.4964, 'learning_rate': 0.0001963705643889941, 'epoch': 1.83}
11%|█▏ | 44/384 [07:47<1:01:24, 10.84s/it]
12%|█▏ | 45/384 [07:57<59:11, 10.47s/it]
{'loss': 0.5012, 'learning_rate': 0.00019614167300122126, 'epoch': 1.88}
12%|█▏ | 45/384 [07:57<59:11, 10.47s/it]
12%|█▏ | 46/384 [08:07<58:52, 10.45s/it]
{'loss': 0.5303, 'learning_rate': 0.00019590592479012023, 'epoch': 1.92}
12%|█▏ | 46/384 [08:07<58:52, 10.45s/it]
12%|█▏ | 47/384 [08:18<59:32, 10.60s/it]
{'loss': 0.5444, 'learning_rate': 0.00019566333656925147, 'epoch': 1.96}
12%|█▏ | 47/384 [08:18<59:32, 10.60s/it]
12%|█▎ | 48/384 [08:29<1:00:31, 10.81s/it]
{'loss': 0.415, 'learning_rate': 0.00019541392564000488, 'epoch': 2.0}
12%|█▎ | 48/384 [08:29<1:00:31, 10.81s/it]
13%|█▎ | 49/384 [08:41<1:00:54, 10.91s/it]
{'loss': 0.3082, 'learning_rate': 0.00019515770979036594, 'epoch': 2.04}
13%|█▎ | 49/384 [08:41<1:00:54, 10.91s/it]
13%|█▎ | 50/384 [08:49<57:15, 10.29s/it]
{'loss': 0.3399, 'learning_rate': 0.00019489470729364692, 'epoch': 2.08}
13%|█▎ | 50/384 [08:49<57:15, 10.29s/it]
13%|█▎ | 51/384 [09:00<57:03, 10.28s/it]
{'loss': 0.285, 'learning_rate': 0.0001946249369071837, 'epoch': 2.12}
13%|█▎ | 51/384 [09:00<57:03, 10.28s/it]
14%|█▎ | 52/384 [09:10<57:41, 10.43s/it]
{'loss': 0.3239, 'learning_rate': 0.00019434841787099803, 'epoch': 2.17}
14%|█▎ | 52/384 [09:10<57:41, 10.43s/it]
14%|█▍ | 53/384 [09:22<58:56, 10.69s/it]
{'loss': 0.4116, 'learning_rate': 0.00019406516990642532, 'epoch': 2.21}
14%|█▍ | 53/384 [09:22<58:56, 10.69s/it]
14%|█▍ | 54/384 [09:32<57:35, 10.47s/it]
{'loss': 0.4075, 'learning_rate': 0.00019377521321470805, 'epoch': 2.25}
14%|█▍ | 54/384 [09:32<57:35, 10.47s/it]
14%|█▍ | 55/384 [09:42<57:22, 10.46s/it]
{'loss': 0.3368, 'learning_rate': 0.00019347856847555512, 'epoch': 2.29}
14%|█▍ | 55/384 [09:42<57:22, 10.46s/it]
15%|█▍ | 56/384 [09:52<56:53, 10.41s/it]
{'loss': 0.2766, 'learning_rate': 0.00019317525684566685, 'epoch': 2.33}
15%|█▍ | 56/384 [09:52<56:53, 10.41s/it]
15%|█▍ | 57/384 [10:03<57:19, 10.52s/it]
{'loss': 0.3271, 'learning_rate': 0.00019286529995722623, 'epoch': 2.38}
15%|█▍ | 57/384 [10:03<57:19, 10.52s/it]
15%|█▌ | 58/384 [10:14<57:53, 10.65s/it]
{'loss': 0.3147, 'learning_rate': 0.00019254871991635598, 'epoch': 2.42}
15%|█▌ | 58/384 [10:14<57:53, 10.65s/it]
15%|█▌ | 59/384 [10:25<58:23, 10.78s/it]
{'loss': 0.3653, 'learning_rate': 0.00019222553930154198, 'epoch': 2.46}
15%|█▌ | 59/384 [10:25<58:23, 10.78s/it]
16%|█▌ | 60/384 [10:35<56:56, 10.55s/it]
{'loss': 0.316, 'learning_rate': 0.00019189578116202307, 'epoch': 2.5}
16%|█▌ | 60/384 [10:35<56:56, 10.55s/it]
16%|█▌ | 61/384 [10:46<57:32, 10.69s/it]
{'loss': 0.4119, 'learning_rate': 0.00019155946901614702, 'epoch': 2.54}
16%|█▌ | 61/384 [10:46<57:32, 10.69s/it]
16%|█▌ | 62/384 [10:56<55:47, 10.40s/it]
{'loss': 0.2912, 'learning_rate': 0.00019121662684969335, 'epoch': 2.58}
16%|█▌ | 62/384 [10:56<55:47, 10.40s/it]
16%|█▋ | 63/384 [11:07<56:16, 10.52s/it]
{'loss': 0.3983, 'learning_rate': 0.0001908672791141625, 'epoch': 2.62}
16%|█▋ | 63/384 [11:07<56:16, 10.52s/it]
17%|█▋ | 64/384 [11:18<56:45, 10.64s/it]
{'loss': 0.3135, 'learning_rate': 0.00019051145072503215, 'epoch': 2.67}
17%|█▋ | 64/384 [11:18<56:45, 10.64s/it]
17%|█▋ | 65/384 [11:28<56:03, 10.54s/it]
{'loss': 0.3501, 'learning_rate': 0.00019014916705998002, 'epoch': 2.71}
17%|█▋ | 65/384 [11:28<56:03, 10.54s/it]
17%|█▋ | 66/384 [11:39<56:15, 10.62s/it]
{'loss': 0.3764, 'learning_rate': 0.00018978045395707418, 'epoch': 2.75}
17%|█▋ | 66/384 [11:39<56:15, 10.62s/it]
17%|█▋ | 67/384 [11:50<56:41, 10.73s/it]
{'loss': 0.3071, 'learning_rate': 0.00018940533771293007, 'epoch': 2.79}
17%|█▋ | 67/384 [11:50<56:41, 10.73s/it]
18%|█▊ | 68/384 [12:00<56:09, 10.66s/it]
{'loss': 0.3553, 'learning_rate': 0.00018902384508083517, 'epoch': 2.83}
18%|█▊ | 68/384 [12:00<56:09, 10.66s/it]
18%|█▊ | 69/384 [12:10<55:03, 10.49s/it]
{'loss': 0.2808, 'learning_rate': 0.00018863600326884082, 'epoch': 2.88}
18%|█▊ | 69/384 [12:10<55:03, 10.49s/it]
18%|█▊ | 70/384 [12:21<55:18, 10.57s/it]
{'loss': 0.4306, 'learning_rate': 0.00018824183993782192, 'epoch': 2.92}
18%|█▊ | 70/384 [12:21<55:18, 10.57s/it]
18%|█▊ | 71/384 [12:31<54:37, 10.47s/it]
{'loss': 0.3211, 'learning_rate': 0.00018784138319950398, 'epoch': 2.96}
18%|█▊ | 71/384 [12:31<54:37, 10.47s/it]
19%|█▉ | 72/384 [12:43<55:49, 10.74s/it]
{'loss': 0.3306, 'learning_rate': 0.00018743466161445823, 'epoch': 3.0}
19%|█▉ | 72/384 [12:43<55:49, 10.74s/it]
19%|█▉ | 73/384 [12:54<55:50, 10.77s/it]
{'loss': 0.1707, 'learning_rate': 0.00018702170419006482, 'epoch': 3.04}
19%|█▉ | 73/384 [12:54<55:50, 10.77s/it]
19%|█▉ | 74/384 [13:04<55:09, 10.68s/it]
{'loss': 0.2103, 'learning_rate': 0.00018660254037844388, 'epoch': 3.08}
19%|█▉ | 74/384 [13:04<55:09, 10.68s/it]
20%|█▉ | 75/384 [13:15<55:32, 10.79s/it]
{'loss': 0.232, 'learning_rate': 0.00018617720007435497, 'epoch': 3.12}
20%|█▉ | 75/384 [13:15<55:32, 10.79s/it]
20%|█▉ | 76/384 [13:26<54:53, 10.69s/it]
{'loss': 0.2233, 'learning_rate': 0.0001857457136130651, 'epoch': 3.17}
20%|█▉ | 76/384 [13:26<54:53, 10.69s/it]
20%|██ | 77/384 [13:37<55:04, 10.76s/it]
{'loss': 0.2397, 'learning_rate': 0.00018530811176818514, 'epoch': 3.21}
20%|██ | 77/384 [13:37<55:04, 10.76s/it]
20%|██ | 78/384 [13:48<55:32, 10.89s/it]
{'loss': 0.1759, 'learning_rate': 0.00018486442574947511, 'epoch': 3.25}
20%|██ | 78/384 [13:48<55:32, 10.89s/it]
21%|██ | 79/384 [13:58<54:24, 10.70s/it]
{'loss': 0.1734, 'learning_rate': 0.00018441468720061815, 'epoch': 3.29}
21%|██ | 79/384 [13:58<54:24, 10.70s/it]
21%|██ | 80/384 [14:09<54:15, 10.71s/it]
{'loss': 0.2073, 'learning_rate': 0.00018395892819696389, 'epoch': 3.33}
21%|██ | 80/384 [14:09<54:15, 10.71s/it]
21%|██ | 81/384 [14:20<54:15, 10.74s/it]
{'loss': 0.2405, 'learning_rate': 0.00018349718124324076, 'epoch': 3.38}
21%|██ | 81/384 [14:20<54:15, 10.74s/it]
21%|██▏ | 82/384 [14:30<53:49, 10.69s/it]
{'loss': 0.2107, 'learning_rate': 0.00018302947927123766, 'epoch': 3.42}
21%|██▏ | 82/384 [14:30<53:49, 10.69s/it]
22%|██▏ | 83/384 [14:41<53:54, 10.74s/it]
{'loss': 0.2156, 'learning_rate': 0.00018255585563745538, 'epoch': 3.46}
22%|██▏ | 83/384 [14:41<53:54, 10.74s/it]
22%|██▏ | 84/384 [14:51<53:15, 10.65s/it]
{'loss': 0.2244, 'learning_rate': 0.00018207634412072764, 'epoch': 3.5}
22%|██▏ | 84/384 [14:51<53:15, 10.65s/it]
22%|██▏ | 85/384 [15:02<53:20, 10.70s/it]
{'loss': 0.234, 'learning_rate': 0.00018159097891981186, 'epoch': 3.54}
22%|██▏ | 85/384 [15:02<53:20, 10.70s/it]
22%|██▏ | 86/384 [15:13<53:39, 10.80s/it]
{'loss': 0.1648, 'learning_rate': 0.00018109979465095013, 'epoch': 3.58}
22%|██▏ | 86/384 [15:13<53:39, 10.80s/it]
23%|██▎ | 87/384 [15:24<53:38, 10.84s/it]
{'loss': 0.2366, 'learning_rate': 0.00018060282634540053, 'epoch': 3.62}
23%|██▎ | 87/384 [15:24<53:38, 10.84s/it]
23%|██▎ | 88/384 [15:34<51:59, 10.54s/it]
{'loss': 0.2134, 'learning_rate': 0.00018010010944693848, 'epoch': 3.67}
23%|██▎ | 88/384 [15:34<51:59, 10.54s/it]
23%|██▎ | 89/384 [15:45<52:29, 10.68s/it]
{'loss': 0.2372, 'learning_rate': 0.00017959167980932908, 'epoch': 3.71}
23%|██▎ | 89/384 [15:45<52:29, 10.68s/it]
23%|██▎ | 90/384 [15:55<51:24, 10.49s/it]
{'loss': 0.2316, 'learning_rate': 0.00017907757369376985, 'epoch': 3.75}
23%|██▎ | 90/384 [15:55<51:24, 10.49s/it]
24%|██▎ | 91/384 [16:06<52:03, 10.66s/it]
{'loss': 0.241, 'learning_rate': 0.00017855782776630483, 'epoch': 3.79}
24%|██▎ | 91/384 [16:06<52:03, 10.66s/it]
24%|██▍ | 92/384 [16:17<52:00, 10.69s/it]
{'loss': 0.2186, 'learning_rate': 0.0001780324790952092, 'epoch': 3.83}
24%|██▍ | 92/384 [16:17<52:00, 10.69s/it]
24%|██▍ | 93/384 [16:27<50:52, 10.49s/it]
{'loss': 0.1967, 'learning_rate': 0.0001775015651483459, 'epoch': 3.88}
24%|██▍ | 93/384 [16:27<50:52, 10.49s/it]
24%|██▍ | 94/384 [16:38<51:48, 10.72s/it]
{'loss': 0.2389, 'learning_rate': 0.00017696512379049325, 'epoch': 3.92}
24%|██▍ | 94/384 [16:38<51:48, 10.72s/it]
25%|██▍ | 95/384 [16:49<52:06, 10.82s/it]
{'loss': 0.1848, 'learning_rate': 0.00017642319328064446, 'epoch': 3.96}
25%|██▍ | 95/384 [16:49<52:06, 10.82s/it]
25%|██▌ | 96/384 [16:59<49:56, 10.40s/it]
{'loss': 0.2082, 'learning_rate': 0.0001758758122692791, 'epoch': 4.0}
25%|██▌ | 96/384 [16:59<49:56, 10.40s/it]
25%|██▌ | 97/384 [17:10<50:48, 10.62s/it]
{'loss': 0.1328, 'learning_rate': 0.00017532301979560636, 'epoch': 4.04}
25%|██▌ | 97/384 [17:10<50:48, 10.62s/it]
26%|██▌ | 98/384 [17:21<51:01, 10.71s/it]
{'loss': 0.1262, 'learning_rate': 0.00017476485528478093, 'epoch': 4.08}
26%|██▌ | 98/384 [17:21<51:01, 10.71s/it]
26%|██▌ | 99/384 [17:31<50:30, 10.63s/it]
{'loss': 0.1373, 'learning_rate': 0.0001742013585450911, 'epoch': 4.12}
26%|██▌ | 99/384 [17:31<50:30, 10.63s/it]
26%|██▌ | 100/384 [17:41<49:42, 10.50s/it]
{'loss': 0.1182, 'learning_rate': 0.00017363256976511972, 'epoch': 4.17}
26%|██▌ | 100/384 [17:41<49:42, 10.50s/it]
26%|██▋ | 101/384 [17:52<50:00, 10.60s/it]
{'loss': 0.1016, 'learning_rate': 0.00017305852951087798, 'epoch': 4.21}
26%|██▋ | 101/384 [17:52<50:00, 10.60s/it]
27%|██▋ | 102/384 [18:03<50:45, 10.80s/it]
{'loss': 0.1286, 'learning_rate': 0.000172479278722912, 'epoch': 4.25}
27%|██▋ | 102/384 [18:03<50:45, 10.80s/it]
27%|██▋ | 103/384 [18:14<50:42, 10.83s/it]
{'loss': 0.1631, 'learning_rate': 0.0001718948587133833, 'epoch': 4.29}
27%|██▋ | 103/384 [18:14<50:42, 10.83s/it]
27%|██▋ | 104/384 [18:22<46:37, 9.99s/it]
{'loss': 0.1143, 'learning_rate': 0.00017130531116312203, 'epoch': 4.33}
27%|██▋ | 104/384 [18:22<46:37, 9.99s/it]
27%|██▋ | 105/384 [18:33<47:04, 10.12s/it]
{'loss': 0.1125, 'learning_rate': 0.00017071067811865476, 'epoch': 4.38}
27%|██▋ | 105/384 [18:33<47:04, 10.12s/it]
28%|██▊ | 106/384 [18:44<48:11, 10.40s/it]
{'loss': 0.137, 'learning_rate': 0.0001701110019892053, 'epoch': 4.42}
28%|██▊ | 106/384 [18:44<48:11, 10.40s/it]
28%|██▊ | 107/384 [18:54<47:43, 10.34s/it]
{'loss': 0.1546, 'learning_rate': 0.00016950632554367019, 'epoch': 4.46}
28%|██▊ | 107/384 [18:54<47:43, 10.34s/it]
28%|██▊ | 108/384 [19:04<47:37, 10.35s/it]
{'loss': 0.1475, 'learning_rate': 0.00016889669190756868, 'epoch': 4.5}
28%|██▊ | 108/384 [19:04<47:37, 10.35s/it]
28%|██▊ | 109/384 [19:15<48:21, 10.55s/it]
{'loss': 0.1178, 'learning_rate': 0.00016828214455996658, 'epoch': 4.54}
28%|██▊ | 109/384 [19:15<48:21, 10.55s/it]
29%|██▊ | 110/384 [19:27<49:06, 10.75s/it]
{'loss': 0.1303, 'learning_rate': 0.00016766272733037576, 'epoch': 4.58}
29%|██▊ | 110/384 [19:27<49:06, 10.75s/it]
29%|██▉ | 111/384 [19:37<48:41, 10.70s/it]
{'loss': 0.1535, 'learning_rate': 0.00016703848439562785, 'epoch': 4.62}
29%|██▉ | 111/384 [19:37<48:41, 10.70s/it]
29%|██▉ | 112/384 [19:48<48:04, 10.60s/it]
{'loss': 0.1574, 'learning_rate': 0.00016640946027672392, 'epoch': 4.67}
29%|██▉ | 112/384 [19:48<48:04, 10.60s/it]
29%|██▉ | 113/384 [19:59<48:30, 10.74s/it]
{'loss': 0.1212, 'learning_rate': 0.0001657756998356589, 'epoch': 4.71}
29%|██▉ | 113/384 [19:59<48:30, 10.74s/it]
30%|██▉ | 114/384 [20:10<48:36, 10.80s/it]
{'loss': 0.1524, 'learning_rate': 0.00016513724827222227, 'epoch': 4.75}
30%|██▉ | 114/384 [20:10<48:36, 10.80s/it]
30%|██▉ | 115/384 [20:21<48:33, 10.83s/it]
{'loss': 0.1624, 'learning_rate': 0.0001644941511207742, 'epoch': 4.79}
30%|██▉ | 115/384 [20:21<48:33, 10.83s/it]
30%|███ | 116/384 [20:30<46:45, 10.47s/it]
{'loss': 0.1289, 'learning_rate': 0.00016384645424699835, 'epoch': 4.83}
30%|███ | 116/384 [20:30<46:45, 10.47s/it]
30%|███ | 117/384 [20:40<45:52, 10.31s/it]
{'loss': 0.1471, 'learning_rate': 0.0001631942038446304, 'epoch': 4.88}
30%|███ | 117/384 [20:40<45:52, 10.31s/it]
31%|███ | 118/384 [20:50<45:14, 10.21s/it]
{'loss': 0.1318, 'learning_rate': 0.00016253744643216368, 'epoch': 4.92}
31%|███ | 118/384 [20:50<45:14, 10.21s/it]
31%|███ | 119/384 [21:00<44:59, 10.19s/it]
{'loss': 0.1916, 'learning_rate': 0.00016187622884953145, 'epoch': 4.96}
31%|███ | 119/384 [21:00<44:59, 10.19s/it]
31%|███▏ | 120/384 [21:11<46:11, 10.50s/it]
{'loss': 0.1588, 'learning_rate': 0.0001612105982547663, 'epoch': 5.0}
31%|███▏ | 120/384 [21:11<46:11, 10.50s/it]
32%|███▏ | 121/384 [21:23<47:50, 10.91s/it]
{'loss': 0.0661, 'learning_rate': 0.00016054060212063672, 'epoch': 5.04}
32%|███▏ | 121/384 [21:23<47:50, 10.91s/it]
32%|███▏ | 122/384 [21:35<48:08, 11.02s/it]
{'loss': 0.0945, 'learning_rate': 0.0001598662882312615, 'epoch': 5.08}
32%|███▏ | 122/384 [21:35<48:08, 11.02s/it]
32%|███▏ | 123/384 [21:46<47:54, 11.01s/it]
{'loss': 0.0878, 'learning_rate': 0.0001591877046787017, 'epoch': 5.12}
32%|███▏ | 123/384 [21:46<47:54, 11.01s/it]
32%|███▏ | 124/384 [21:57<48:02, 11.09s/it]
{'loss': 0.0813, 'learning_rate': 0.00015850489985953076, 'epoch': 5.17}
32%|███▏ | 124/384 [21:57<48:02, 11.09s/it]
33%|███▎ | 125/384 [22:07<46:09, 10.69s/it]
{'loss': 0.0753, 'learning_rate': 0.0001578179224713827, 'epoch': 5.21}
33%|███▎ | 125/384 [22:07<46:09, 10.69s/it]
33%|███▎ | 126/384 [22:18<46:13, 10.75s/it]
{'loss': 0.0891, 'learning_rate': 0.00015712682150947923, 'epoch': 5.25}
33%|███▎ | 126/384 [22:18<46:13, 10.75s/it]
33%|███▎ | 127/384 [22:27<44:41, 10.43s/it]
{'loss': 0.0836, 'learning_rate': 0.00015643164626313527, 'epoch': 5.29}
33%|███▎ | 127/384 [22:27<44:41, 10.43s/it]
33%|███▎ | 128/384 [22:37<44:05, 10.33s/it]
{'loss': 0.0991, 'learning_rate': 0.00015573244631224365, 'epoch': 5.33}
33%|███▎ | 128/384 [22:37<44:05, 10.33s/it]
34%|███▎ | 129/384 [22:48<43:52, 10.33s/it]
{'loss': 0.0955, 'learning_rate': 0.00015502927152373914, 'epoch': 5.38}
34%|███▎ | 129/384 [22:48<43:52, 10.33s/it]
34%|███▍ | 130/384 [22:58<43:26, 10.26s/it]
{'loss': 0.0722, 'learning_rate': 0.0001543221720480419, 'epoch': 5.42}
34%|███▍ | 130/384 [22:58<43:26, 10.26s/it]
34%|███▍ | 131/384 [23:09<44:03, 10.45s/it]
{'loss': 0.098, 'learning_rate': 0.00015361119831548069, 'epoch': 5.46}
34%|███▍ | 131/384 [23:09<44:03, 10.45s/it]
34%|███▍ | 132/384 [23:20<44:39, 10.63s/it]
{'loss': 0.0935, 'learning_rate': 0.00015289640103269625, 'epoch': 5.5}
34%|███▍ | 132/384 [23:20<44:39, 10.63s/it]
35%|███▍ | 133/384 [23:31<44:50, 10.72s/it]
{'loss': 0.0894, 'learning_rate': 0.00015217783117902497, 'epoch': 5.54}
35%|███▍ | 133/384 [23:31<44:50, 10.72s/it]
35%|███▍ | 134/384 [23:41<44:19, 10.64s/it]
{'loss': 0.0942, 'learning_rate': 0.0001514555400028629, 'epoch': 5.58}
35%|███▍ | 134/384 [23:41<44:19, 10.64s/it]
35%|███▌ | 135/384 [23:50<42:38, 10.27s/it]
{'loss': 0.1174, 'learning_rate': 0.00015072957901801076, 'epoch': 5.62}
35%|███▌ | 135/384 [23:50<42:38, 10.27s/it]
35%|███▌ | 136/384 [24:00<42:05, 10.18s/it]
{'loss': 0.1075, 'learning_rate': 0.00015000000000000001, 'epoch': 5.67}
35%|███▌ | 136/384 [24:00<42:05, 10.18s/it]
36%|███▌ | 137/384 [24:11<42:07, 10.23s/it]
{'loss': 0.0919, 'learning_rate': 0.00014926685498240028, 'epoch': 5.71}
36%|███▌ | 137/384 [24:11<42:07, 10.23s/it]
36%|███▌ | 138/384 [24:21<41:42, 10.17s/it]
{'loss': 0.0954, 'learning_rate': 0.00014853019625310813, 'epoch': 5.75}
36%|███▌ | 138/384 [24:21<41:42, 10.17s/it]
36%|███▌ | 139/384 [24:32<42:24, 10.38s/it]
{'loss': 0.1004, 'learning_rate': 0.0001477900763506181, 'epoch': 5.79}
36%|███▌ | 139/384 [24:32<42:24, 10.38s/it]
36%|███▋ | 140/384 [24:42<42:08, 10.36s/it]
{'loss': 0.1224, 'learning_rate': 0.0001470465480602756, 'epoch': 5.83}
36%|███▋ | 140/384 [24:42<42:08, 10.36s/it]
37%|███▋ | 141/384 [24:53<42:16, 10.44s/it]
{'loss': 0.0693, 'learning_rate': 0.00014629966441051208, 'epoch': 5.88}
37%|███▋ | 141/384 [24:53<42:16, 10.44s/it]
37%|███▋ | 142/384 [25:04<42:50, 10.62s/it]
{'loss': 0.0817, 'learning_rate': 0.0001455494786690634, 'epoch': 5.92}
37%|███▋ | 142/384 [25:04<42:50, 10.62s/it]
37%|███▋ | 143/384 [25:14<42:46, 10.65s/it]
{'loss': 0.0884, 'learning_rate': 0.00014479604433917045, 'epoch': 5.96}
37%|███▋ | 143/384 [25:14<42:46, 10.65s/it]
38%|███▊ | 144/384 [25:25<42:32, 10.63s/it]
{'loss': 0.1011, 'learning_rate': 0.00014403941515576344, 'epoch': 6.0}
38%|███▊ | 144/384 [25:25<42:32, 10.63s/it]
38%|███▊ | 145/384 [25:37<43:43, 10.98s/it]
{'loss': 0.0673, 'learning_rate': 0.0001432796450816295, 'epoch': 6.04}
38%|███▊ | 145/384 [25:37<43:43, 10.98s/it]
38%|███▊ | 146/384 [25:47<42:43, 10.77s/it]
{'loss': 0.0675, 'learning_rate': 0.00014251678830356408, 'epoch': 6.08}
38%|███▊ | 146/384 [25:47<42:43, 10.77s/it]
38%|███▊ | 147/384 [25:58<42:45, 10.82s/it]
{'loss': 0.0554, 'learning_rate': 0.00014175089922850633, 'epoch': 6.12}
38%|███▊ | 147/384 [25:58<42:45, 10.82s/it]
39%|███▊ | 148/384 [26:09<42:49, 10.89s/it]
{'loss': 0.0661, 'learning_rate': 0.00014098203247965875, 'epoch': 6.17}
39%|███▊ | 148/384 [26:09<42:49, 10.89s/it]
39%|███▉ | 149/384 [26:20<42:46, 10.92s/it]
{'loss': 0.0572, 'learning_rate': 0.00014021024289259158, 'epoch': 6.21}
39%|███▉ | 149/384 [26:20<42:46, 10.92s/it]
39%|███▉ | 150/384 [26:31<42:56, 11.01s/it]
{'loss': 0.0688, 'learning_rate': 0.00013943558551133186, 'epoch': 6.25}
39%|███▉ | 150/384 [26:31<42:56, 11.01s/it]
39%|███▉ | 151/384 [26:42<41:59, 10.81s/it]
{'loss': 0.0604, 'learning_rate': 0.0001386581155844376, 'epoch': 6.29}
39%|███▉ | 151/384 [26:42<41:59, 10.81s/it]
40%|███▉ | 152/384 [26:50<38:37, 9.99s/it]
{'loss': 0.0696, 'learning_rate': 0.0001378778885610576, 'epoch': 6.33}
40%|███▉ | 152/384 [26:50<38:37, 9.99s/it]
40%|███▉ | 153/384 [27:01<40:00, 10.39s/it]
{'loss': 0.0532, 'learning_rate': 0.0001370949600869768, 'epoch': 6.38}
40%|███▉ | 153/384 [27:01<40:00, 10.39s/it]
40%|████ | 154/384 [27:12<40:33, 10.58s/it]
{'loss': 0.0628, 'learning_rate': 0.00013630938600064747, 'epoch': 6.42}
40%|████ | 154/384 [27:12<40:33, 10.58s/it]
40%|████ | 155/384 [27:23<40:46, 10.68s/it]
{'loss': 0.0604, 'learning_rate': 0.00013552122232920707, 'epoch': 6.46}
40%|████ | 155/384 [27:23<40:46, 10.68s/it]
41%|████ | 156/384 [27:33<40:10, 10.57s/it]
{'loss': 0.058, 'learning_rate': 0.00013473052528448201, 'epoch': 6.5}
41%|████ | 156/384 [27:33<40:10, 10.57s/it]
41%|████ | 157/384 [27:44<40:30, 10.71s/it]
{'loss': 0.0591, 'learning_rate': 0.00013393735125897925, 'epoch': 6.54}
41%|████ | 157/384 [27:44<40:30, 10.71s/it]
41%|████ | 158/384 [27:55<40:00, 10.62s/it]
{'loss': 0.0512, 'learning_rate': 0.0001331417568218636, 'epoch': 6.58}
41%|████ | 158/384 [27:55<40:00, 10.62s/it]
41%|████▏ | 159/384 [28:05<39:40, 10.58s/it]
{'loss': 0.0631, 'learning_rate': 0.0001323437987149238, 'epoch': 6.62}
41%|████▏ | 159/384 [28:05<39:40, 10.58s/it]
42%|████▏ | 160/384 [28:15<38:50, 10.40s/it]
{'loss': 0.0674, 'learning_rate': 0.00013154353384852558, 'epoch': 6.67}
42%|████▏ | 160/384 [28:15<38:50, 10.40s/it]
42%|████▏ | 161/384 [28:26<39:01, 10.50s/it]
{'loss': 0.0607, 'learning_rate': 0.00013074101929755252, 'epoch': 6.71}
42%|████▏ | 161/384 [28:26<39:01, 10.50s/it]
42%|████▏ | 162/384 [28:36<38:44, 10.47s/it]
{'loss': 0.0691, 'learning_rate': 0.00012993631229733582, 'epoch': 6.75}
42%|████▏ | 162/384 [28:36<38:44, 10.47s/it]
42%|████▏ | 163/384 [28:47<38:20, 10.41s/it]
{'loss': 0.0553, 'learning_rate': 0.00012912947023957212, 'epoch': 6.79}
42%|████▏ | 163/384 [28:47<38:20, 10.41s/it]
43%|████▎ | 164/384 [28:57<38:32, 10.51s/it]
{'loss': 0.0671, 'learning_rate': 0.00012832055066823038, 'epoch': 6.83}
43%|████▎ | 164/384 [28:57<38:32, 10.51s/it]
43%|████▎ | 165/384 [29:08<38:05, 10.44s/it]
{'loss': 0.057, 'learning_rate': 0.0001275096112754478, 'epoch': 6.88}
43%|████▎ | 165/384 [29:08<38:05, 10.44s/it]
43%|████▎ | 166/384 [29:19<38:33, 10.61s/it]
{'loss': 0.0705, 'learning_rate': 0.00012669670989741517, 'epoch': 6.92}
43%|████▎ | 166/384 [29:19<38:33, 10.61s/it]
43%|████▎ | 167/384 [29:30<38:41, 10.70s/it]
{'loss': 0.0649, 'learning_rate': 0.00012588190451025207, 'epoch': 6.96}
43%|████▎ | 167/384 [29:30<38:41, 10.70s/it]
44%|████▍ | 168/384 [29:40<37:54, 10.53s/it]
{'loss': 0.0677, 'learning_rate': 0.00012506525322587207, 'epoch': 7.0}
44%|████▍ | 168/384 [29:40<37:54, 10.53s/it]
44%|████▍ | 169/384 [29:51<38:07, 10.64s/it]
{'loss': 0.0415, 'learning_rate': 0.000124246814287838, 'epoch': 7.04}
44%|████▍ | 169/384 [29:51<38:07, 10.64s/it]
44%|████▍ | 170/384 [30:00<37:11, 10.43s/it]
{'loss': 0.0324, 'learning_rate': 0.00012342664606720822, 'epoch': 7.08}
44%|████▍ | 170/384 [30:00<37:11, 10.43s/it]
45%|████▍ | 171/384 [30:10<36:12, 10.20s/it]
{'loss': 0.0452, 'learning_rate': 0.0001226048070583735, 'epoch': 7.12}
45%|████▍ | 171/384 [30:10<36:12, 10.20s/it]
45%|████▍ | 172/384 [30:20<35:58, 10.18s/it]
{'loss': 0.0349, 'learning_rate': 0.00012178135587488515, 'epoch': 7.17}
45%|████▍ | 172/384 [30:20<35:58, 10.18s/it]
45%|████▌ | 173/384 [30:31<36:23, 10.35s/it]
{'loss': 0.0456, 'learning_rate': 0.00012095635124527486, 'epoch': 7.21}
45%|████▌ | 173/384 [30:31<36:23, 10.35s/it]
45%|████▌ | 174/384 [30:42<37:03, 10.59s/it]
{'loss': 0.0449, 'learning_rate': 0.00012012985200886602, 'epoch': 7.25}
45%|████▌ | 174/384 [30:42<37:03, 10.59s/it]
46%|████▌ | 175/384 [30:53<36:47, 10.56s/it]
{'loss': 0.0318, 'learning_rate': 0.00011930191711157737, 'epoch': 7.29}
46%|████▌ | 175/384 [30:53<36:47, 10.56s/it]
46%|████▌ | 176/384 [31:02<35:27, 10.23s/it]
{'loss': 0.0487, 'learning_rate': 0.00011847260560171896, 'epoch': 7.33}
46%|████▌ | 176/384 [31:02<35:27, 10.23s/it]
46%|████▌ | 177/384 [31:12<34:28, 9.99s/it]
{'loss': 0.0428, 'learning_rate': 0.00011764197662578086, 'epoch': 7.38}
46%|████▌ | 177/384 [31:12<34:28, 9.99s/it]
46%|████▋ | 178/384 [31:22<34:52, 10.16s/it]
{'loss': 0.0375, 'learning_rate': 0.00011681008942421483, 'epoch': 7.42}
46%|████▋ | 178/384 [31:22<34:52, 10.16s/it]
47%|████▋ | 179/384 [31:33<35:12, 10.31s/it]
{'loss': 0.0499, 'learning_rate': 0.00011597700332720923, 'epoch': 7.46}
47%|████▋ | 179/384 [31:33<35:12, 10.31s/it]
47%|████▋ | 180/384 [31:44<35:50, 10.54s/it]
{'loss': 0.0442, 'learning_rate': 0.00011514277775045768, 'epoch': 7.5}
47%|████▋ | 180/384 [31:44<35:50, 10.54s/it]
47%|████▋ | 181/384 [31:53<34:37, 10.24s/it]
{'loss': 0.0454, 'learning_rate': 0.00011430747219092142, 'epoch': 7.54}
47%|████▋ | 181/384 [31:53<34:37, 10.24s/it]
47%|████▋ | 182/384 [32:05<35:40, 10.60s/it]
{'loss': 0.0305, 'learning_rate': 0.00011347114622258612, 'epoch': 7.58}
47%|████▋ | 182/384 [32:05<35:40, 10.60s/it]
48%|████▊ | 183/384 [32:16<35:55, 10.72s/it]
{'loss': 0.0421, 'learning_rate': 0.00011263385949221295, 'epoch': 7.62}
48%|████▊ | 183/384 [32:16<35:55, 10.72s/it]
48%|████▊ | 184/384 [32:27<36:05, 10.83s/it]
{'loss': 0.0315, 'learning_rate': 0.00011179567171508463, 'epoch': 7.67}
48%|████▊ | 184/384 [32:27<36:05, 10.83s/it]
48%|████▊ | 185/384 [32:37<35:27, 10.69s/it]
{'loss': 0.0351, 'learning_rate': 0.00011095664267074655, 'epoch': 7.71}
48%|████▊ | 185/384 [32:37<35:27, 10.69s/it]
48%|████▊ | 186/384 [32:46<33:47, 10.24s/it]
{'loss': 0.0438, 'learning_rate': 0.00011011683219874323, 'epoch': 7.75}
48%|████▊ | 186/384 [32:46<33:47, 10.24s/it]
49%|████▊ | 187/384 [32:58<34:45, 10.58s/it]
{'loss': 0.0327, 'learning_rate': 0.00010927630019435066, 'epoch': 7.79}
49%|████▊ | 187/384 [32:58<34:45, 10.58s/it]
49%|████▉ | 188/384 [33:09<34:50, 10.67s/it]
{'loss': 0.044, 'learning_rate': 0.00010843510660430447, 'epoch': 7.83}
49%|████▉ | 188/384 [33:09<34:50, 10.67s/it]
49%|████▉ | 189/384 [33:19<34:31, 10.62s/it]
{'loss': 0.0428, 'learning_rate': 0.00010759331142252462, 'epoch': 7.88}
49%|████▉ | 189/384 [33:19<34:31, 10.62s/it]
49%|████▉ | 190/384 [33:30<34:13, 10.59s/it]
{'loss': 0.0528, 'learning_rate': 0.00010675097468583652, 'epoch': 7.92}
49%|████▉ | 190/384 [33:30<34:13, 10.59s/it]
50%|████▉ | 191/384 [33:40<33:58, 10.56s/it]
{'loss': 0.0423, 'learning_rate': 0.00010590815646968934, 'epoch': 7.96}
50%|████▉ | 191/384 [33:40<33:58, 10.56s/it]
50%|█████ | 192/384 [33:51<34:14, 10.70s/it]
{'loss': 0.0371, 'learning_rate': 0.00010506491688387127, 'epoch': 8.0}
50%|█████ | 192/384 [33:51<34:14, 10.70s/it]
50%|█████ | 193/384 [34:02<33:46, 10.61s/it]
{'loss': 0.0314, 'learning_rate': 0.00010422131606822269, 'epoch': 8.04}
50%|█████ | 193/384 [34:02<33:46, 10.61s/it]
51%|█████ | 194/384 [34:12<33:22, 10.54s/it]
{'loss': 0.0228, 'learning_rate': 0.00010337741418834684, 'epoch': 8.08}
51%|█████ | 194/384 [34:12<33:22, 10.54s/it]
51%|█████ | 195/384 [34:22<32:50, 10.43s/it]
{'loss': 0.029, 'learning_rate': 0.00010253327143131879, 'epoch': 8.12}
51%|█████ | 195/384 [34:22<32:50, 10.43s/it]
51%|█████ | 196/384 [34:34<33:34, 10.72s/it]
{'loss': 0.0179, 'learning_rate': 0.0001016889480013931, 'epoch': 8.17}
51%|█████ | 196/384 [34:34<33:34, 10.72s/it]
51%|█████▏ | 197/384 [34:44<32:43, 10.50s/it]
{'loss': 0.0272, 'learning_rate': 0.00010084450411570985, 'epoch': 8.21}
51%|█████▏ | 197/384 [34:44<32:43, 10.50s/it]
52%|█████▏ | 198/384 [34:54<32:26, 10.46s/it]
{'loss': 0.025, 'learning_rate': 0.0001, 'epoch': 8.25}
52%|█████▏ | 198/384 [34:54<32:26, 10.46s/it]
52%|█████▏ | 199/384 [35:04<32:20, 10.49s/it]
{'loss': 0.0207, 'learning_rate': 9.915549588429015e-05, 'epoch': 8.29}
52%|█████▏ | 199/384 [35:04<32:20, 10.49s/it]
52%|█████▏ | 200/384 [35:15<32:11, 10.50s/it]
{'loss': 0.0356, 'learning_rate': 9.83110519986069e-05, 'epoch': 8.33}
52%|█████▏ | 200/384 [35:15<32:11, 10.50s/it]
52%|█████▏ | 201/384 [35:26<32:01, 10.50s/it]
{'loss': 0.0242, 'learning_rate': 9.746672856868123e-05, 'epoch': 8.38}
52%|█████▏ | 201/384 [35:26<32:01, 10.50s/it]
53%|█████▎ | 202/384 [35:37<32:27, 10.70s/it]
{'loss': 0.0216, 'learning_rate': 9.662258581165319e-05, 'epoch': 8.42}
53%|█████▎ | 202/384 [35:37<32:27, 10.70s/it]
53%|█████▎ | 203/384 [35:46<31:05, 10.31s/it]
{'loss': 0.0261, 'learning_rate': 9.577868393177732e-05, 'epoch': 8.46}
53%|█████▎ | 203/384 [35:46<31:05, 10.31s/it]
53%|█████▎ | 204/384 [35:57<31:24, 10.47s/it]
{'loss': 0.0224, 'learning_rate': 9.493508311612874e-05, 'epoch': 8.5}
53%|█████▎ | 204/384 [35:57<31:24, 10.47s/it]
53%|█████▎ | 205/384 [36:08<31:37, 10.60s/it]
{'loss': 0.0293, 'learning_rate': 9.409184353031068e-05, 'epoch': 8.54}
53%|█████▎ | 205/384 [36:08<31:37, 10.60s/it]
54%|█████▎ | 206/384 [36:19<31:53, 10.75s/it]
{'loss': 0.0279, 'learning_rate': 9.324902531416349e-05, 'epoch': 8.58}
54%|█████▎ | 206/384 [36:19<31:53, 10.75s/it]
54%|█████▍ | 207/384 [36:28<30:31, 10.35s/it]
{'loss': 0.0322, 'learning_rate': 9.24066885774754e-05, 'epoch': 8.62}
54%|█████▍ | 207/384 [36:28<30:31, 10.35s/it]
54%|█████▍ | 208/384 [36:39<30:28, 10.39s/it]
{'loss': 0.0192, 'learning_rate': 9.156489339569554e-05, 'epoch': 8.67}
54%|█████▍ | 208/384 [36:39<30:28, 10.39s/it]
54%|█████▍ | 209/384 [36:50<30:49, 10.57s/it]
{'loss': 0.0231, 'learning_rate': 9.072369980564935e-05, 'epoch': 8.71}
54%|█████▍ | 209/384 [36:50<30:49, 10.57s/it]
55%|█████▍ | 210/384 [37:00<30:39, 10.57s/it]
{'loss': 0.0148, 'learning_rate': 8.98831678012568e-05, 'epoch': 8.75}
55%|█████▍ | 210/384 [37:00<30:39, 10.57s/it]
55%|█████▍ | 211/384 [37:11<30:42, 10.65s/it]
{'loss': 0.0237, 'learning_rate': 8.90433573292535e-05, 'epoch': 8.79}
55%|█████▍ | 211/384 [37:11<30:42, 10.65s/it]
55%|█████▌ | 212/384 [37:22<30:29, 10.64s/it]
{'loss': 0.0269, 'learning_rate': 8.820432828491542e-05, 'epoch': 8.83}
55%|█████▌ | 212/384 [37:22<30:29, 10.64s/it]
55%|█████▌ | 213/384 [37:33<30:56, 10.86s/it]
{'loss': 0.0191, 'learning_rate': 8.73661405077871e-05, 'epoch': 8.88}
55%|█████▌ | 213/384 [37:33<30:56, 10.86s/it]
56%|█████▌ | 214/384 [37:44<31:00, 10.94s/it]
{'loss': 0.0206, 'learning_rate': 8.652885377741393e-05, 'epoch': 8.92}
56%|█████▌ | 214/384 [37:44<31:00, 10.94s/it]
56%|█████▌ | 215/384 [37:53<28:38, 10.17s/it]
{'loss': 0.0352, 'learning_rate': 8.569252780907862e-05, 'epoch': 8.96}
56%|█████▌ | 215/384 [37:53<28:38, 10.17s/it]
56%|█████▋ | 216/384 [38:03<28:47, 10.28s/it]
{'loss': 0.0312, 'learning_rate': 8.485722224954237e-05, 'epoch': 9.0}
56%|█████▋ | 216/384 [38:03<28:47, 10.28s/it]
57%|█████▋ | 217/384 [38:16<30:50, 11.08s/it]
{'loss': 0.0105, 'learning_rate': 8.402299667279078e-05, 'epoch': 9.04}
57%|█████▋ | 217/384 [38:16<30:50, 11.08s/it]
57%|█████▋ | 218/384 [38:26<29:16, 10.58s/it]
{'loss': 0.0124, 'learning_rate': 8.31899105757852e-05, 'epoch': 9.08}
57%|█████▋ | 218/384 [38:26<29:16, 10.58s/it]
57%|█████▋ | 219/384 [38:37<29:25, 10.70s/it]
{'loss': 0.012, 'learning_rate': 8.235802337421919e-05, 'epoch': 9.12}
57%|█████▋ | 219/384 [38:37<29:25, 10.70s/it]
57%|█████▋ | 220/384 [38:48<29:32, 10.81s/it]
{'loss': 0.0145, 'learning_rate': 8.15273943982811e-05, 'epoch': 9.17}
57%|█████▋ | 220/384 [38:48<29:32, 10.81s/it]
58%|█████▊ | 221/384 [38:58<29:01, 10.68s/it]
{'loss': 0.0284, 'learning_rate': 8.06980828884227e-05, 'epoch': 9.21}
58%|█████▊ | 221/384 [38:58<29:01, 10.68s/it]
58%|█████▊ | 222/384 [39:09<28:57, 10.72s/it]
{'loss': 0.0175, 'learning_rate': 7.987014799113397e-05, 'epoch': 9.25}
58%|█████▊ | 222/384 [39:09<28:57, 10.72s/it]
58%|█████▊ | 223/384 [39:20<28:56, 10.79s/it]
{'loss': 0.0195, 'learning_rate': 7.904364875472513e-05, 'epoch': 9.29}
58%|█████▊ | 223/384 [39:20<28:56, 10.79s/it]
58%|█████▊ | 224/384 [39:30<28:33, 10.71s/it]
{'loss': 0.015, 'learning_rate': 7.821864412511485e-05, 'epoch': 9.33}
58%|█████▊ | 224/384 [39:30<28:33, 10.71s/it]
59%|█████▊ | 225/384 [39:41<28:32, 10.77s/it]
{'loss': 0.0186, 'learning_rate': 7.739519294162652e-05, 'epoch': 9.38}
59%|█████▊ | 225/384 [39:41<28:32, 10.77s/it]
59%|█████▉ | 226/384 [39:52<28:19, 10.76s/it]
{'loss': 0.0121, 'learning_rate': 7.65733539327918e-05, 'epoch': 9.42}
59%|█████▉ | 226/384 [39:52<28:19, 10.76s/it]
59%|█████▉ | 227/384 [40:02<27:13, 10.40s/it]
{'loss': 0.0255, 'learning_rate': 7.5753185712162e-05, 'epoch': 9.46}
59%|█████▉ | 227/384 [40:02<27:13, 10.40s/it]
59%|█████▉ | 228/384 [40:13<27:38, 10.63s/it]
{'loss': 0.0112, 'learning_rate': 7.493474677412794e-05, 'epoch': 9.5}
59%|█████▉ | 228/384 [40:13<27:38, 10.63s/it]
60%|█████▉ | 229/384 [40:22<26:43, 10.34s/it]
{'loss': 0.0206, 'learning_rate': 7.411809548974792e-05, 'epoch': 9.54}
60%|█████▉ | 229/384 [40:22<26:43, 10.34s/it]
60%|█████▉ | 230/384 [40:34<27:15, 10.62s/it]
{'loss': 0.0164, 'learning_rate': 7.330329010258483e-05, 'epoch': 9.58}
60%|█████▉ | 230/384 [40:34<27:15, 10.62s/it]
60%|██████ | 231/384 [40:44<26:40, 10.46s/it]
{'loss': 0.014, 'learning_rate': 7.24903887245522e-05, 'epoch': 9.62}
60%|██████ | 231/384 [40:44<26:40, 10.46s/it]
60%|██████ | 232/384 [40:55<26:57, 10.64s/it]
{'loss': 0.017, 'learning_rate': 7.16794493317696e-05, 'epoch': 9.67}
60%|██████ | 232/384 [40:55<26:57, 10.64s/it]
61%|██████ | 233/384 [41:05<26:43, 10.62s/it]
{'loss': 0.0129, 'learning_rate': 7.087052976042789e-05, 'epoch': 9.71}
61%|██████ | 233/384 [41:05<26:43, 10.62s/it]
61%|██████ | 234/384 [41:13<24:28, 9.79s/it]
{'loss': 0.0292, 'learning_rate': 7.006368770266421e-05, 'epoch': 9.75}
61%|██████ | 234/384 [41:13<24:28, 9.79s/it]
61%|██████ | 235/384 [41:24<24:45, 9.97s/it]
{'loss': 0.016, 'learning_rate': 6.925898070244752e-05, 'epoch': 9.79}
61%|██████ | 235/384 [41:24<24:45, 9.97s/it]
61%|██████▏ | 236/384 [41:35<25:27, 10.32s/it]
{'loss': 0.016, 'learning_rate': 6.845646615147445e-05, 'epoch': 9.83}
61%|██████▏ | 236/384 [41:35<25:27, 10.32s/it]
62%|██████▏ | 237/384 [41:45<25:12, 10.29s/it]
{'loss': 0.0169, 'learning_rate': 6.765620128507619e-05, 'epoch': 9.88}
62%|██████▏ | 237/384 [41:45<25:12, 10.29s/it]
62%|██████▏ | 238/384 [41:56<25:34, 10.51s/it]
{'loss': 0.022, 'learning_rate': 6.685824317813643e-05, 'epoch': 9.92}
62%|██████▏ | 238/384 [41:56<25:34, 10.51s/it]
62%|██████▏ | 239/384 [42:06<25:22, 10.50s/it]
{'loss': 0.015, 'learning_rate': 6.606264874102079e-05, 'epoch': 9.96}
62%|██████▏ | 239/384 [42:06<25:22, 10.50s/it]
62%|██████▎ | 240/384 [42:17<25:24, 10.59s/it]
{'loss': 0.0153, 'learning_rate': 6.526947471551798e-05, 'epoch': 10.0}
62%|██████▎ | 240/384 [42:17<25:24, 10.59s/it]
63%|██████▎ | 241/384 [42:28<25:27, 10.68s/it]
{'loss': 0.0122, 'learning_rate': 6.447877767079298e-05, 'epoch': 10.04}
63%|██████▎ | 241/384 [42:28<25:27, 10.68s/it]
63%|██████▎ | 242/384 [42:39<25:27, 10.76s/it]
{'loss': 0.0103, 'learning_rate': 6.369061399935255e-05, 'epoch': 10.08}
63%|██████▎ | 242/384 [42:39<25:27, 10.76s/it]
63%|██████▎ | 243/384 [42:50<25:08, 10.70s/it]
{'loss': 0.0069, 'learning_rate': 6.290503991302324e-05, 'epoch': 10.12}
63%|██████▎ | 243/384 [42:50<25:08, 10.70s/it]
64%|██████▎ | 244/384 [43:00<24:38, 10.56s/it]
{'loss': 0.0149, 'learning_rate': 6.21221114389424e-05, 'epoch': 10.17}
64%|██████▎ | 244/384 [43:00<24:38, 10.56s/it]
64%|██████▍ | 245/384 [43:11<24:45, 10.68s/it]
{'loss': 0.0092, 'learning_rate': 6.134188441556241e-05, 'epoch': 10.21}
64%|██████▍ | 245/384 [43:11<24:45, 10.68s/it]
64%|██████▍ | 246/384 [43:21<24:19, 10.57s/it]
{'loss': 0.0147, 'learning_rate': 6.0564414488668165e-05, 'epoch': 10.25}
64%|██████▍ | 246/384 [43:21<24:19, 10.57s/it]
64%|██████▍ | 247/384 [43:31<23:49, 10.43s/it]
{'loss': 0.0087, 'learning_rate': 5.9789757107408416e-05, 'epoch': 10.29}
64%|██████▍ | 247/384 [43:31<23:49, 10.43s/it]
65%|██████▍ | 248/384 [43:42<24:06, 10.63s/it]
{'loss': 0.0112, 'learning_rate': 5.901796752034128e-05, 'epoch': 10.33}
65%|██████▍ | 248/384 [43:42<24:06, 10.63s/it]
65%|██████▍ | 249/384 [43:53<23:36, 10.50s/it]
{'loss': 0.0135, 'learning_rate': 5.824910077149371e-05, 'epoch': 10.38}
65%|██████▍ | 249/384 [43:53<23:36, 10.50s/it]
65%|██████▌ | 250/384 [44:04<23:47, 10.66s/it]
{'loss': 0.0061, 'learning_rate': 5.748321169643596e-05, 'epoch': 10.42}
65%|██████▌ | 250/384 [44:04<23:47, 10.66s/it]
65%|██████▌ | 251/384 [44:14<23:28, 10.59s/it]
{'loss': 0.0079, 'learning_rate': 5.672035491837053e-05, 'epoch': 10.46}
65%|██████▌ | 251/384 [44:14<23:28, 10.59s/it]
66%|██████▌ | 252/384 [44:24<23:07, 10.51s/it]
{'loss': 0.0141, 'learning_rate': 5.596058484423656e-05, 'epoch': 10.5}
66%|██████▌ | 252/384 [44:24<23:07, 10.51s/it]
66%|██████▌ | 253/384 [44:35<23:18, 10.67s/it]
{'loss': 0.0076, 'learning_rate': 5.5203955660829544e-05, 'epoch': 10.54}
66%|██████▌ | 253/384 [44:35<23:18, 10.67s/it]
66%|██████▌ | 254/384 [44:46<23:22, 10.79s/it]
{'loss': 0.0138, 'learning_rate': 5.44505213309366e-05, 'epoch': 10.58}
66%|██████▌ | 254/384 [44:46<23:22, 10.79s/it]
66%|██████▋ | 255/384 [44:57<23:14, 10.81s/it]
{'loss': 0.0088, 'learning_rate': 5.3700335589487925e-05, 'epoch': 10.62}
66%|██████▋ | 255/384 [44:57<23:14, 10.81s/it]
67%|██████▋ | 256/384 [45:09<23:23, 10.96s/it]
{'loss': 0.0068, 'learning_rate': 5.2953451939724454e-05, 'epoch': 10.67}
67%|██████▋ | 256/384 [45:09<23:23, 10.96s/it]
67%|██████▋ | 257/384 [45:19<22:49, 10.78s/it]
{'loss': 0.0101, 'learning_rate': 5.220992364938193e-05, 'epoch': 10.71}
67%|██████▋ | 257/384 [45:19<22:49, 10.78s/it]
67%|██████▋ | 258/384 [45:28<21:48, 10.38s/it]
{'loss': 0.0123, 'learning_rate': 5.146980374689192e-05, 'epoch': 10.75}
67%|██████▋ | 258/384 [45:28<21:48, 10.38s/it]
67%|██████▋ | 259/384 [45:39<21:59, 10.55s/it]
{'loss': 0.0083, 'learning_rate': 5.073314501759977e-05, 'epoch': 10.79}
67%|██████▋ | 259/384 [45:39<21:59, 10.55s/it]
68%|██████▊ | 260/384 [45:51<22:08, 10.72s/it]
{'loss': 0.0078, 'learning_rate': 5.000000000000002e-05, 'epoch': 10.83}
68%|██████▊ | 260/384 [45:51<22:08, 10.72s/it]
68%|██████▊ | 261/384 [46:02<22:11, 10.83s/it]
{'loss': 0.009, 'learning_rate': 4.9270420981989294e-05, 'epoch': 10.88}
68%|██████▊ | 261/384 [46:02<22:11, 10.83s/it]
68%|██████▊ | 262/384 [46:13<22:18, 10.97s/it]
{'loss': 0.0099, 'learning_rate': 4.854445999713715e-05, 'epoch': 10.92}
68%|██████▊ | 262/384 [46:13<22:18, 10.97s/it]
68%|██████▊ | 263/384 [46:24<22:02, 10.93s/it]
{'loss': 0.0112, 'learning_rate': 4.7822168820975066e-05, 'epoch': 10.96}
68%|██████▊ | 263/384 [46:24<22:02, 10.93s/it]
69%|██████▉ | 264/384 [46:34<21:41, 10.85s/it]
{'loss': 0.0109, 'learning_rate': 4.710359896730379e-05, 'epoch': 11.0}
69%|██████▉ | 264/384 [46:34<21:41, 10.85s/it]
69%|██████▉ | 265/384 [46:46<21:43, 10.95s/it]
{'loss': 0.0052, 'learning_rate': 4.638880168451938e-05, 'epoch': 11.04}
69%|██████▉ | 265/384 [46:46<21:43, 10.95s/it]
69%|██████▉ | 266/384 [46:56<21:09, 10.76s/it]
{'loss': 0.0065, 'learning_rate': 4.567782795195816e-05, 'epoch': 11.08}
69%|██████▉ | 266/384 [46:56<21:09, 10.76s/it]
70%|██████▉ | 267/384 [47:07<20:59, 10.76s/it]
{'loss': 0.0043, 'learning_rate': 4.497072847626087e-05, 'epoch': 11.12}
70%|██████▉ | 267/384 [47:07<20:59, 10.76s/it]
70%|██████▉ | 268/384 [47:17<20:35, 10.65s/it]
{'loss': 0.0052, 'learning_rate': 4.426755368775637e-05, 'epoch': 11.17}
70%|██████▉ | 268/384 [47:17<20:35, 10.65s/it]
70%|███████ | 269/384 [47:28<20:36, 10.75s/it]
{'loss': 0.0065, 'learning_rate': 4.356835373686475e-05, 'epoch': 11.21}
70%|███████ | 269/384 [47:28<20:36, 10.75s/it]
70%|███████ | 270/384 [47:39<20:43, 10.90s/it]
{'loss': 0.0028, 'learning_rate': 4.287317849052075e-05, 'epoch': 11.25}
70%|███████ | 270/384 [47:39<20:43, 10.90s/it]
71%|███████ | 271/384 [47:50<20:22, 10.82s/it]
{'loss': 0.0096, 'learning_rate': 4.218207752861728e-05, 'epoch': 11.29}
71%|███████ | 271/384 [47:50<20:22, 10.82s/it]
71%|███████ | 272/384 [48:00<19:50, 10.63s/it]
{'loss': 0.0034, 'learning_rate': 4.149510014046922e-05, 'epoch': 11.33}
71%|███████ | 272/384 [48:00<19:50, 10.63s/it]
71%|███████ | 273/384 [48:09<18:53, 10.21s/it]
{'loss': 0.0071, 'learning_rate': 4.081229532129827e-05, 'epoch': 11.38}
71%|███████ | 273/384 [48:09<18:53, 10.21s/it]
71%|███████▏ | 274/384 [48:20<18:57, 10.34s/it]
{'loss': 0.0049, 'learning_rate': 4.013371176873849e-05, 'epoch': 11.42}
71%|███████▏ | 274/384 [48:20<18:57, 10.34s/it]
72%|███████▏ | 275/384 [48:31<19:02, 10.49s/it]
{'loss': 0.0031, 'learning_rate': 3.945939787936329e-05, 'epoch': 11.46}
72%|███████▏ | 275/384 [48:31<19:02, 10.49s/it]
72%|███████▏ | 276/384 [48:41<18:27, 10.25s/it]
{'loss': 0.0071, 'learning_rate': 3.878940174523371e-05, 'epoch': 11.5}
72%|███████▏ | 276/384 [48:41<18:27, 10.25s/it]
72%|███████▏ | 277/384 [48:52<18:40, 10.47s/it]
{'loss': 0.0043, 'learning_rate': 3.812377115046855e-05, 'epoch': 11.54}
72%|███████▏ | 277/384 [48:52<18:40, 10.47s/it]
72%|███████▏ | 278/384 [49:03<18:48, 10.65s/it]
{'loss': 0.0087, 'learning_rate': 3.746255356783632e-05, 'epoch': 11.58}
72%|███████▏ | 278/384 [49:03<18:48, 10.65s/it]
73%|███████▎ | 279/384 [49:14<18:59, 10.85s/it]
{'loss': 0.0053, 'learning_rate': 3.680579615536961e-05, 'epoch': 11.62}
73%|███████▎ | 279/384 [49:14<18:59, 10.85s/it]
73%|███████▎ | 280/384 [49:25<19:03, 10.99s/it]
{'loss': 0.0048, 'learning_rate': 3.615354575300166e-05, 'epoch': 11.67}
73%|███████▎ | 280/384 [49:25<19:03, 10.99s/it]
73%|███████▎ | 281/384 [49:36<18:33, 10.81s/it]
{'loss': 0.0037, 'learning_rate': 3.550584887922582e-05, 'epoch': 11.71}
73%|███████▎ | 281/384 [49:36<18:33, 10.81s/it]
73%|███████▎ | 282/384 [49:47<18:26, 10.85s/it]
{'loss': 0.0093, 'learning_rate': 3.4862751727777797e-05, 'epoch': 11.75}
73%|███████▎ | 282/384 [49:47<18:26, 10.85s/it]
74%|███████▎ | 283/384 [49:57<17:59, 10.69s/it]
{'loss': 0.0058, 'learning_rate': 3.422430016434114e-05, 'epoch': 11.79}
74%|███████▎ | 283/384 [49:57<17:59, 10.69s/it]
74%|███████▍ | 284/384 [50:06<17:02, 10.22s/it]
{'loss': 0.0067, 'learning_rate': 3.3590539723276083e-05, 'epoch': 11.83}
74%|███████▍ | 284/384 [50:06<17:02, 10.22s/it]
74%|███████▍ | 285/384 [50:17<17:00, 10.31s/it]
{'loss': 0.0055, 'learning_rate': 3.296151560437214e-05, 'epoch': 11.88}
74%|███████▍ | 285/384 [50:17<17:00, 10.31s/it]
74%|███████▍ | 286/384 [50:27<17:08, 10.49s/it]
{'loss': 0.0057, 'learning_rate': 3.233727266962425e-05, 'epoch': 11.92}
74%|███████▍ | 286/384 [50:27<17:08, 10.49s/it]
75%|███████▍ | 287/384 [50:38<17:01, 10.53s/it]
{'loss': 0.004, 'learning_rate': 3.171785544003342e-05, 'epoch': 11.96}
75%|███████▍ | 287/384 [50:38<17:01, 10.53s/it]
75%|███████▌ | 288/384 [50:49<17:11, 10.75s/it]
{'loss': 0.0058, 'learning_rate': 3.110330809243134e-05, 'epoch': 12.0}
75%|███████▌ | 288/384 [50:49<17:11, 10.75s/it]
75%|███████▌ | 289/384 [51:01<17:35, 11.11s/it]
{'loss': 0.0014, 'learning_rate': 3.0493674456329813e-05, 'epoch': 12.04}
75%|███████▌ | 289/384 [51:01<17:35, 11.11s/it]
76%|███████▌ | 290/384 [51:11<16:57, 10.83s/it]
{'loss': 0.004, 'learning_rate': 2.9888998010794743e-05, 'epoch': 12.08}
76%|███████▌ | 290/384 [51:11<16:57, 10.83s/it]
76%|███████▌ | 291/384 [51:21<16:16, 10.50s/it]
{'loss': 0.003, 'learning_rate': 2.9289321881345254e-05, 'epoch': 12.12}
76%|███████▌ | 291/384 [51:21<16:16, 10.50s/it]
76%|███████▌ | 292/384 [51:29<15:05, 9.84s/it]
{'loss': 0.0054, 'learning_rate': 2.869468883687798e-05, 'epoch': 12.17}
76%|███████▌ | 292/384 [51:29<15:05, 9.84s/it]
76%|███████▋ | 293/384 [51:41<15:30, 10.22s/it]
{'loss': 0.0026, 'learning_rate': 2.8105141286616754e-05, 'epoch': 12.21}
76%|███████▋ | 293/384 [51:41<15:30, 10.22s/it]
77%|███████▋ | 294/384 [51:49<14:32, 9.69s/it]
{'loss': 0.0048, 'learning_rate': 2.7520721277088024e-05, 'epoch': 12.25}
77%|███████▋ | 294/384 [51:49<14:32, 9.69s/it]
77%|███████▋ | 295/384 [51:59<14:38, 9.87s/it]
{'loss': 0.0034, 'learning_rate': 2.6941470489122056e-05, 'epoch': 12.29}
77%|███████▋ | 295/384 [51:59<14:38, 9.87s/it]
77%|███████▋ | 296/384 [52:10<14:58, 10.21s/it]
{'loss': 0.0041, 'learning_rate': 2.6367430234880284e-05, 'epoch': 12.33}
77%|███████▋ | 296/384 [52:10<14:58, 10.21s/it]
77%|███████▋ | 297/384 [52:22<15:15, 10.53s/it]
{'loss': 0.0023, 'learning_rate': 2.5798641454908944e-05, 'epoch': 12.38}
77%|███████▋ | 297/384 [52:22<15:15, 10.53s/it]
78%|███████▊ | 298/384 [52:32<15:12, 10.61s/it]
{'loss': 0.0027, 'learning_rate': 2.523514471521913e-05, 'epoch': 12.42}
78%|███████▊ | 298/384 [52:32<15:12, 10.61s/it]
78%|███████▊ | 299/384 [52:42<14:38, 10.34s/it]
{'loss': 0.0045, 'learning_rate': 2.467698020439365e-05, 'epoch': 12.46}
78%|███████▊ | 299/384 [52:42<14:38, 10.34s/it]
78%|███████▊ | 300/384 [52:50<13:35, 9.71s/it]
{'loss': 0.0092, 'learning_rate': 2.4124187730720917e-05, 'epoch': 12.5}
78%|███████▊ | 300/384 [52:50<13:35, 9.71s/it]
78%|███████▊ | 301/384 [53:01<13:47, 9.97s/it]
{'loss': 0.0028, 'learning_rate': 2.357680671935554e-05, 'epoch': 12.54}
78%|███████▊ | 301/384 [53:01<13:47, 9.97s/it]
79%|███████▊ | 302/384 [53:10<13:22, 9.78s/it]
{'loss': 0.0032, 'learning_rate': 2.3034876209506772e-05, 'epoch': 12.58}
79%|███████▊ | 302/384 [53:10<13:22, 9.78s/it]
79%|███████▉ | 303/384 [53:21<13:40, 10.13s/it]
{'loss': 0.0024, 'learning_rate': 2.2498434851654126e-05, 'epoch': 12.62}
79%|███████▉ | 303/384 [53:21<13:40, 10.13s/it]
79%|███████▉ | 304/384 [53:32<13:36, 10.20s/it]
{'loss': 0.0028, 'learning_rate': 2.1967520904790827e-05, 'epoch': 12.67}
79%|███████▉ | 304/384 [53:32<13:36, 10.20s/it]
79%|███████▉ | 305/384 [53:43<13:49, 10.50s/it]
{'loss': 0.0015, 'learning_rate': 2.14421722336952e-05, 'epoch': 12.71}
79%|███████▉ | 305/384 [53:43<13:49, 10.50s/it]
80%|███████▉ | 306/384 [53:54<13:47, 10.61s/it]
{'loss': 0.0034, 'learning_rate': 2.092242630623016e-05, 'epoch': 12.75}
80%|███████▉ | 306/384 [53:54<13:47, 10.61s/it]
80%|███████▉ | 307/384 [54:04<13:21, 10.41s/it]
{'loss': 0.0028, 'learning_rate': 2.040832019067096e-05, 'epoch': 12.79}
80%|███████▉ | 307/384 [54:04<13:21, 10.41s/it]
80%|████████ | 308/384 [54:14<13:07, 10.36s/it]
{'loss': 0.0036, 'learning_rate': 1.9899890553061562e-05, 'epoch': 12.83}
80%|████████ | 308/384 [54:14<13:07, 10.36s/it]
80%|████████ | 309/384 [54:24<13:00, 10.41s/it]
{'loss': 0.0028, 'learning_rate': 1.939717365459952e-05, 'epoch': 12.88}
80%|████████ | 309/384 [54:24<13:00, 10.41s/it]
81%|████████ | 310/384 [54:35<13:03, 10.59s/it]
{'loss': 0.0033, 'learning_rate': 1.8900205349049904e-05, 'epoch': 12.92}
81%|████████ | 310/384 [54:35<13:03, 10.59s/it]
81%|████████ | 311/384 [54:46<12:46, 10.50s/it]
{'loss': 0.0022, 'learning_rate': 1.8409021080188193e-05, 'epoch': 12.96}
81%|████████ | 311/384 [54:46<12:46, 10.50s/it]
81%|████████▏ | 312/384 [54:57<12:54, 10.76s/it]
{'loss': 0.0037, 'learning_rate': 1.7923655879272393e-05, 'epoch': 13.0}
81%|████████▏ | 312/384 [54:57<12:54, 10.76s/it]
82%|████████▏ | 313/384 [55:08<12:57, 10.95s/it]
{'loss': 0.0018, 'learning_rate': 1.7444144362544625e-05, 'epoch': 13.04}
82%|████████▏ | 313/384 [55:08<12:57, 10.95s/it]
82%|████████▏ | 314/384 [55:19<12:48, 10.98s/it]
{'loss': 0.0014, 'learning_rate': 1.6970520728762375e-05, 'epoch': 13.08}