24B-Pretrain / trainer_state.json
Delta-Vector's picture
Upload folder using huggingface_hub
799cc5b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9996362313568571,
"eval_steps": 500,
"global_step": 1374,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007275372862859222,
"grad_norm": 1.5288920783603972,
"learning_rate": 0.0,
"loss": 2.2393,
"step": 1
},
{
"epoch": 0.0014550745725718443,
"grad_norm": 1.8644806477200417,
"learning_rate": 2.5000000000000004e-07,
"loss": 2.2191,
"step": 2
},
{
"epoch": 0.0021826118588577663,
"grad_norm": 1.7784703619023348,
"learning_rate": 5.000000000000001e-07,
"loss": 2.2159,
"step": 3
},
{
"epoch": 0.0029101491451436886,
"grad_norm": 1.5806839907563581,
"learning_rate": 7.5e-07,
"loss": 2.184,
"step": 4
},
{
"epoch": 0.0036376864314296106,
"grad_norm": 1.8190865637907,
"learning_rate": 1.0000000000000002e-06,
"loss": 2.212,
"step": 5
},
{
"epoch": 0.0043652237177155325,
"grad_norm": 1.6181433102216847,
"learning_rate": 1.25e-06,
"loss": 2.211,
"step": 6
},
{
"epoch": 0.005092761004001455,
"grad_norm": 1.7927607230934282,
"learning_rate": 1.5e-06,
"loss": 2.1433,
"step": 7
},
{
"epoch": 0.005820298290287377,
"grad_norm": 1.8873680830855888,
"learning_rate": 1.75e-06,
"loss": 2.206,
"step": 8
},
{
"epoch": 0.0065478355765733,
"grad_norm": 1.482732563101495,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.2254,
"step": 9
},
{
"epoch": 0.007275372862859221,
"grad_norm": 1.4889646677667492,
"learning_rate": 2.25e-06,
"loss": 2.1792,
"step": 10
},
{
"epoch": 0.008002910149145144,
"grad_norm": 1.4985805484052215,
"learning_rate": 2.5e-06,
"loss": 2.2684,
"step": 11
},
{
"epoch": 0.008730447435431065,
"grad_norm": 1.5013684044916331,
"learning_rate": 2.7500000000000004e-06,
"loss": 2.1795,
"step": 12
},
{
"epoch": 0.009457984721716987,
"grad_norm": 1.7555667532063568,
"learning_rate": 3e-06,
"loss": 2.2519,
"step": 13
},
{
"epoch": 0.01018552200800291,
"grad_norm": 2.7293866713537134,
"learning_rate": 3.2500000000000002e-06,
"loss": 2.2681,
"step": 14
},
{
"epoch": 0.010913059294288832,
"grad_norm": 2.200225872763007,
"learning_rate": 3.5e-06,
"loss": 2.1938,
"step": 15
},
{
"epoch": 0.011640596580574755,
"grad_norm": 1.8348578578231278,
"learning_rate": 3.7500000000000005e-06,
"loss": 2.2048,
"step": 16
},
{
"epoch": 0.012368133866860677,
"grad_norm": 1.8743984231151645,
"learning_rate": 4.000000000000001e-06,
"loss": 2.1631,
"step": 17
},
{
"epoch": 0.0130956711531466,
"grad_norm": 1.8414431627872851,
"learning_rate": 4.25e-06,
"loss": 2.1406,
"step": 18
},
{
"epoch": 0.013823208439432522,
"grad_norm": 1.4183185947241677,
"learning_rate": 4.5e-06,
"loss": 2.2254,
"step": 19
},
{
"epoch": 0.014550745725718442,
"grad_norm": 1.9481451459518573,
"learning_rate": 4.75e-06,
"loss": 2.1679,
"step": 20
},
{
"epoch": 0.015278283012004365,
"grad_norm": 1.5602129290453182,
"learning_rate": 5e-06,
"loss": 2.2265,
"step": 21
},
{
"epoch": 0.01600582029829029,
"grad_norm": 1.3810432619588962,
"learning_rate": 5.2500000000000006e-06,
"loss": 2.1534,
"step": 22
},
{
"epoch": 0.01673335758457621,
"grad_norm": 1.5968789996194426,
"learning_rate": 5.500000000000001e-06,
"loss": 2.1732,
"step": 23
},
{
"epoch": 0.01746089487086213,
"grad_norm": 1.816533437678404,
"learning_rate": 5.75e-06,
"loss": 2.2226,
"step": 24
},
{
"epoch": 0.018188432157148052,
"grad_norm": 1.5418652750157003,
"learning_rate": 6e-06,
"loss": 2.2728,
"step": 25
},
{
"epoch": 0.018915969443433975,
"grad_norm": 1.4735527362492546,
"learning_rate": 6.25e-06,
"loss": 2.2163,
"step": 26
},
{
"epoch": 0.019643506729719897,
"grad_norm": 1.4845561943479575,
"learning_rate": 6.5000000000000004e-06,
"loss": 2.1918,
"step": 27
},
{
"epoch": 0.02037104401600582,
"grad_norm": 1.7412028945444677,
"learning_rate": 6.750000000000001e-06,
"loss": 2.2396,
"step": 28
},
{
"epoch": 0.021098581302291742,
"grad_norm": 1.6561669558897179,
"learning_rate": 7e-06,
"loss": 2.2286,
"step": 29
},
{
"epoch": 0.021826118588577664,
"grad_norm": 1.7109055266319328,
"learning_rate": 7.25e-06,
"loss": 2.1781,
"step": 30
},
{
"epoch": 0.022553655874863587,
"grad_norm": 1.5704601583829316,
"learning_rate": 7.500000000000001e-06,
"loss": 2.2403,
"step": 31
},
{
"epoch": 0.02328119316114951,
"grad_norm": 1.508006475744022,
"learning_rate": 7.75e-06,
"loss": 2.2205,
"step": 32
},
{
"epoch": 0.02400873044743543,
"grad_norm": 1.803590041071739,
"learning_rate": 8.000000000000001e-06,
"loss": 2.2029,
"step": 33
},
{
"epoch": 0.024736267733721354,
"grad_norm": 1.4516903950304747,
"learning_rate": 8.25e-06,
"loss": 2.2139,
"step": 34
},
{
"epoch": 0.025463805020007276,
"grad_norm": 3.116636483017336,
"learning_rate": 8.5e-06,
"loss": 2.2323,
"step": 35
},
{
"epoch": 0.0261913423062932,
"grad_norm": 1.623617395325745,
"learning_rate": 8.750000000000001e-06,
"loss": 2.1991,
"step": 36
},
{
"epoch": 0.02691887959257912,
"grad_norm": 2.0677382830922686,
"learning_rate": 9e-06,
"loss": 2.2652,
"step": 37
},
{
"epoch": 0.027646416878865043,
"grad_norm": 3.034199810473946,
"learning_rate": 9.250000000000001e-06,
"loss": 2.1865,
"step": 38
},
{
"epoch": 0.028373954165150966,
"grad_norm": 1.7919909350659693,
"learning_rate": 9.5e-06,
"loss": 2.1912,
"step": 39
},
{
"epoch": 0.029101491451436885,
"grad_norm": 3.239893614116414,
"learning_rate": 9.75e-06,
"loss": 2.2285,
"step": 40
},
{
"epoch": 0.029829028737722807,
"grad_norm": 1.6482411098991196,
"learning_rate": 1e-05,
"loss": 2.1061,
"step": 41
},
{
"epoch": 0.03055656602400873,
"grad_norm": 4.393883055041743,
"learning_rate": 9.999986134743949e-06,
"loss": 2.1837,
"step": 42
},
{
"epoch": 0.03128410331029465,
"grad_norm": 3.640280872327228,
"learning_rate": 9.99994453905269e-06,
"loss": 2.1949,
"step": 43
},
{
"epoch": 0.03201164059658058,
"grad_norm": 2.984560444439296,
"learning_rate": 9.999875213156919e-06,
"loss": 2.1897,
"step": 44
},
{
"epoch": 0.032739177882866496,
"grad_norm": 3.643349152064925,
"learning_rate": 9.999778157441126e-06,
"loss": 2.2719,
"step": 45
},
{
"epoch": 0.03346671516915242,
"grad_norm": 1.679399697575627,
"learning_rate": 9.99965337244359e-06,
"loss": 2.1504,
"step": 46
},
{
"epoch": 0.03419425245543834,
"grad_norm": 2.9412530670758796,
"learning_rate": 9.999500858856382e-06,
"loss": 2.1945,
"step": 47
},
{
"epoch": 0.03492178974172426,
"grad_norm": 1.7766551035232812,
"learning_rate": 9.999320617525356e-06,
"loss": 2.1419,
"step": 48
},
{
"epoch": 0.035649327028010186,
"grad_norm": 2.7089668240918066,
"learning_rate": 9.999112649450154e-06,
"loss": 2.1765,
"step": 49
},
{
"epoch": 0.036376864314296105,
"grad_norm": 1.4848133526126528,
"learning_rate": 9.998876955784183e-06,
"loss": 2.1953,
"step": 50
},
{
"epoch": 0.03710440160058203,
"grad_norm": 2.799597350559271,
"learning_rate": 9.998613537834625e-06,
"loss": 2.2151,
"step": 51
},
{
"epoch": 0.03783193888686795,
"grad_norm": 2.0026253192452117,
"learning_rate": 9.998322397062426e-06,
"loss": 2.202,
"step": 52
},
{
"epoch": 0.038559476173153875,
"grad_norm": 2.779566768037998,
"learning_rate": 9.99800353508228e-06,
"loss": 2.2047,
"step": 53
},
{
"epoch": 0.039287013459439794,
"grad_norm": 1.6842181907886473,
"learning_rate": 9.997656953662627e-06,
"loss": 2.1725,
"step": 54
},
{
"epoch": 0.04001455074572572,
"grad_norm": 2.9470995907944304,
"learning_rate": 9.997282654725645e-06,
"loss": 2.22,
"step": 55
},
{
"epoch": 0.04074208803201164,
"grad_norm": 2.209113890782605,
"learning_rate": 9.996880640347234e-06,
"loss": 2.1626,
"step": 56
},
{
"epoch": 0.041469625318297565,
"grad_norm": 3.132001624857469,
"learning_rate": 9.99645091275701e-06,
"loss": 2.2366,
"step": 57
},
{
"epoch": 0.042197162604583484,
"grad_norm": 2.75629026942325,
"learning_rate": 9.99599347433828e-06,
"loss": 2.2009,
"step": 58
},
{
"epoch": 0.04292469989086941,
"grad_norm": 2.6139307213283516,
"learning_rate": 9.99550832762805e-06,
"loss": 2.249,
"step": 59
},
{
"epoch": 0.04365223717715533,
"grad_norm": 2.7751242920969674,
"learning_rate": 9.99499547531699e-06,
"loss": 2.2184,
"step": 60
},
{
"epoch": 0.044379774463441254,
"grad_norm": 1.871420864926908,
"learning_rate": 9.994454920249433e-06,
"loss": 2.1852,
"step": 61
},
{
"epoch": 0.04510731174972717,
"grad_norm": 2.382349880929508,
"learning_rate": 9.993886665423348e-06,
"loss": 2.1422,
"step": 62
},
{
"epoch": 0.04583484903601309,
"grad_norm": 2.3993230486198263,
"learning_rate": 9.993290713990343e-06,
"loss": 2.1824,
"step": 63
},
{
"epoch": 0.04656238632229902,
"grad_norm": 2.360857901310075,
"learning_rate": 9.99266706925562e-06,
"loss": 2.2415,
"step": 64
},
{
"epoch": 0.04728992360858494,
"grad_norm": 1.650711565941275,
"learning_rate": 9.992015734677979e-06,
"loss": 2.1521,
"step": 65
},
{
"epoch": 0.04801746089487086,
"grad_norm": 3.3076174611401994,
"learning_rate": 9.991336713869785e-06,
"loss": 2.181,
"step": 66
},
{
"epoch": 0.04874499818115678,
"grad_norm": 1.6769492616237789,
"learning_rate": 9.99063001059696e-06,
"loss": 2.1962,
"step": 67
},
{
"epoch": 0.04947253546744271,
"grad_norm": 1.7599941311889593,
"learning_rate": 9.989895628778952e-06,
"loss": 2.1816,
"step": 68
},
{
"epoch": 0.050200072753728626,
"grad_norm": 1.4988700213088457,
"learning_rate": 9.989133572488716e-06,
"loss": 2.2403,
"step": 69
},
{
"epoch": 0.05092761004001455,
"grad_norm": 1.6353905077011426,
"learning_rate": 9.988343845952697e-06,
"loss": 2.2182,
"step": 70
},
{
"epoch": 0.05165514732630047,
"grad_norm": 1.7111490961536142,
"learning_rate": 9.987526453550798e-06,
"loss": 2.2347,
"step": 71
},
{
"epoch": 0.0523826846125864,
"grad_norm": 1.3600773855476789,
"learning_rate": 9.98668139981636e-06,
"loss": 2.1614,
"step": 72
},
{
"epoch": 0.053110221898872316,
"grad_norm": 1.6698126117343595,
"learning_rate": 9.98580868943614e-06,
"loss": 2.2139,
"step": 73
},
{
"epoch": 0.05383775918515824,
"grad_norm": 1.6245254791288735,
"learning_rate": 9.984908327250278e-06,
"loss": 2.1661,
"step": 74
},
{
"epoch": 0.05456529647144416,
"grad_norm": 1.675160918900565,
"learning_rate": 9.983980318252274e-06,
"loss": 2.2181,
"step": 75
},
{
"epoch": 0.05529283375773009,
"grad_norm": 1.8972214787238628,
"learning_rate": 9.983024667588961e-06,
"loss": 2.1788,
"step": 76
},
{
"epoch": 0.056020371044016005,
"grad_norm": 1.5577986727860746,
"learning_rate": 9.982041380560476e-06,
"loss": 2.2095,
"step": 77
},
{
"epoch": 0.05674790833030193,
"grad_norm": 1.7480153079321283,
"learning_rate": 9.98103046262023e-06,
"loss": 2.1671,
"step": 78
},
{
"epoch": 0.05747544561658785,
"grad_norm": 1.8001870581973476,
"learning_rate": 9.979991919374877e-06,
"loss": 2.2235,
"step": 79
},
{
"epoch": 0.05820298290287377,
"grad_norm": 1.485718877859379,
"learning_rate": 9.978925756584284e-06,
"loss": 2.1956,
"step": 80
},
{
"epoch": 0.058930520189159695,
"grad_norm": 1.6300668294264538,
"learning_rate": 9.9778319801615e-06,
"loss": 2.223,
"step": 81
},
{
"epoch": 0.059658057475445614,
"grad_norm": 1.359440176512661,
"learning_rate": 9.976710596172721e-06,
"loss": 2.2371,
"step": 82
},
{
"epoch": 0.06038559476173154,
"grad_norm": 1.6330927723700759,
"learning_rate": 9.975561610837254e-06,
"loss": 2.1684,
"step": 83
},
{
"epoch": 0.06111313204801746,
"grad_norm": 1.7561578829689188,
"learning_rate": 9.974385030527496e-06,
"loss": 2.1862,
"step": 84
},
{
"epoch": 0.061840669334303384,
"grad_norm": 1.568293680618204,
"learning_rate": 9.973180861768874e-06,
"loss": 2.1988,
"step": 85
},
{
"epoch": 0.0625682066205893,
"grad_norm": 1.744615695345401,
"learning_rate": 9.971949111239838e-06,
"loss": 2.1834,
"step": 86
},
{
"epoch": 0.06329574390687523,
"grad_norm": 1.620398494899476,
"learning_rate": 9.970689785771798e-06,
"loss": 2.1635,
"step": 87
},
{
"epoch": 0.06402328119316116,
"grad_norm": 1.9224626119840165,
"learning_rate": 9.969402892349105e-06,
"loss": 2.1485,
"step": 88
},
{
"epoch": 0.06475081847944707,
"grad_norm": 1.7081277069472598,
"learning_rate": 9.968088438109002e-06,
"loss": 2.2314,
"step": 89
},
{
"epoch": 0.06547835576573299,
"grad_norm": 1.409570479234312,
"learning_rate": 9.966746430341584e-06,
"loss": 2.1926,
"step": 90
},
{
"epoch": 0.06620589305201892,
"grad_norm": 1.5601946347425566,
"learning_rate": 9.965376876489765e-06,
"loss": 2.2187,
"step": 91
},
{
"epoch": 0.06693343033830484,
"grad_norm": 1.7961828728056013,
"learning_rate": 9.963979784149232e-06,
"loss": 2.1705,
"step": 92
},
{
"epoch": 0.06766096762459076,
"grad_norm": 1.466420625246192,
"learning_rate": 9.962555161068401e-06,
"loss": 2.2048,
"step": 93
},
{
"epoch": 0.06838850491087668,
"grad_norm": 1.5126227471471592,
"learning_rate": 9.961103015148376e-06,
"loss": 2.1821,
"step": 94
},
{
"epoch": 0.06911604219716261,
"grad_norm": 1.66909649824643,
"learning_rate": 9.95962335444291e-06,
"loss": 2.1929,
"step": 95
},
{
"epoch": 0.06984357948344852,
"grad_norm": 1.5144983190620087,
"learning_rate": 9.958116187158351e-06,
"loss": 2.1943,
"step": 96
},
{
"epoch": 0.07057111676973445,
"grad_norm": 1.4373661097027508,
"learning_rate": 9.956581521653604e-06,
"loss": 2.2261,
"step": 97
},
{
"epoch": 0.07129865405602037,
"grad_norm": 1.4934873826145776,
"learning_rate": 9.955019366440082e-06,
"loss": 2.1768,
"step": 98
},
{
"epoch": 0.0720261913423063,
"grad_norm": 1.3692598853625408,
"learning_rate": 9.953429730181653e-06,
"loss": 2.2163,
"step": 99
},
{
"epoch": 0.07275372862859221,
"grad_norm": 1.4437046319559694,
"learning_rate": 9.95181262169461e-06,
"loss": 2.1791,
"step": 100
},
{
"epoch": 0.07348126591487814,
"grad_norm": 1.5090120476700792,
"learning_rate": 9.950168049947597e-06,
"loss": 2.2112,
"step": 101
},
{
"epoch": 0.07420880320116406,
"grad_norm": 1.4811337559061613,
"learning_rate": 9.948496024061577e-06,
"loss": 2.1407,
"step": 102
},
{
"epoch": 0.07493634048744999,
"grad_norm": 1.397294300464599,
"learning_rate": 9.94679655330978e-06,
"loss": 2.2462,
"step": 103
},
{
"epoch": 0.0756638777737359,
"grad_norm": 1.665042097857969,
"learning_rate": 9.945069647117645e-06,
"loss": 2.2017,
"step": 104
},
{
"epoch": 0.07639141506002183,
"grad_norm": 1.6880656948565003,
"learning_rate": 9.943315315062766e-06,
"loss": 2.1964,
"step": 105
},
{
"epoch": 0.07711895234630775,
"grad_norm": 1.6904248420523655,
"learning_rate": 9.941533566874852e-06,
"loss": 2.2072,
"step": 106
},
{
"epoch": 0.07784648963259368,
"grad_norm": 1.4904625247428034,
"learning_rate": 9.939724412435661e-06,
"loss": 2.2022,
"step": 107
},
{
"epoch": 0.07857402691887959,
"grad_norm": 1.9029842633160232,
"learning_rate": 9.937887861778947e-06,
"loss": 2.1946,
"step": 108
},
{
"epoch": 0.07930156420516551,
"grad_norm": 1.6199815657345118,
"learning_rate": 9.93602392509041e-06,
"loss": 2.2013,
"step": 109
},
{
"epoch": 0.08002910149145144,
"grad_norm": 1.9241181799327212,
"learning_rate": 9.934132612707631e-06,
"loss": 2.1813,
"step": 110
},
{
"epoch": 0.08075663877773735,
"grad_norm": 2.286463886994313,
"learning_rate": 9.932213935120025e-06,
"loss": 2.2181,
"step": 111
},
{
"epoch": 0.08148417606402328,
"grad_norm": 1.5854169026098552,
"learning_rate": 9.930267902968774e-06,
"loss": 2.2136,
"step": 112
},
{
"epoch": 0.0822117133503092,
"grad_norm": 1.9337265535465549,
"learning_rate": 9.928294527046771e-06,
"loss": 2.1435,
"step": 113
},
{
"epoch": 0.08293925063659513,
"grad_norm": 1.4724469337009167,
"learning_rate": 9.92629381829856e-06,
"loss": 2.2008,
"step": 114
},
{
"epoch": 0.08366678792288104,
"grad_norm": 1.716990512009962,
"learning_rate": 9.924265787820279e-06,
"loss": 2.2678,
"step": 115
},
{
"epoch": 0.08439432520916697,
"grad_norm": 1.9728635256737985,
"learning_rate": 9.92221044685959e-06,
"loss": 2.2334,
"step": 116
},
{
"epoch": 0.0851218624954529,
"grad_norm": 1.3829353726842706,
"learning_rate": 9.920127806815627e-06,
"loss": 2.2435,
"step": 117
},
{
"epoch": 0.08584939978173882,
"grad_norm": 1.860749893286872,
"learning_rate": 9.918017879238922e-06,
"loss": 2.2438,
"step": 118
},
{
"epoch": 0.08657693706802473,
"grad_norm": 1.5711245049229094,
"learning_rate": 9.915880675831352e-06,
"loss": 2.2131,
"step": 119
},
{
"epoch": 0.08730447435431066,
"grad_norm": 1.7631319009945081,
"learning_rate": 9.913716208446067e-06,
"loss": 2.2532,
"step": 120
},
{
"epoch": 0.08803201164059658,
"grad_norm": 1.7971608421907779,
"learning_rate": 9.91152448908742e-06,
"loss": 2.1021,
"step": 121
},
{
"epoch": 0.08875954892688251,
"grad_norm": 1.6070833156452324,
"learning_rate": 9.909305529910917e-06,
"loss": 2.2237,
"step": 122
},
{
"epoch": 0.08948708621316842,
"grad_norm": 1.5982292957619832,
"learning_rate": 9.907059343223129e-06,
"loss": 2.1703,
"step": 123
},
{
"epoch": 0.09021462349945435,
"grad_norm": 2.318593516011183,
"learning_rate": 9.904785941481638e-06,
"loss": 2.1867,
"step": 124
},
{
"epoch": 0.09094216078574027,
"grad_norm": 1.7613403582041198,
"learning_rate": 9.902485337294965e-06,
"loss": 2.1933,
"step": 125
},
{
"epoch": 0.09166969807202618,
"grad_norm": 2.3386117708504752,
"learning_rate": 9.900157543422493e-06,
"loss": 2.1844,
"step": 126
},
{
"epoch": 0.09239723535831211,
"grad_norm": 1.9798703362538914,
"learning_rate": 9.897802572774407e-06,
"loss": 2.2382,
"step": 127
},
{
"epoch": 0.09312477264459804,
"grad_norm": 2.158022738478921,
"learning_rate": 9.895420438411616e-06,
"loss": 2.2307,
"step": 128
},
{
"epoch": 0.09385230993088396,
"grad_norm": 1.6660240142136167,
"learning_rate": 9.893011153545679e-06,
"loss": 2.2348,
"step": 129
},
{
"epoch": 0.09457984721716987,
"grad_norm": 1.9632305154461456,
"learning_rate": 9.89057473153874e-06,
"loss": 2.2141,
"step": 130
},
{
"epoch": 0.0953073845034558,
"grad_norm": 1.672261371877987,
"learning_rate": 9.888111185903442e-06,
"loss": 2.1641,
"step": 131
},
{
"epoch": 0.09603492178974173,
"grad_norm": 1.6661927373979792,
"learning_rate": 9.885620530302865e-06,
"loss": 2.1819,
"step": 132
},
{
"epoch": 0.09676245907602765,
"grad_norm": 2.2520607322037947,
"learning_rate": 9.883102778550434e-06,
"loss": 2.1935,
"step": 133
},
{
"epoch": 0.09748999636231356,
"grad_norm": 1.4347704266644696,
"learning_rate": 9.880557944609863e-06,
"loss": 2.2312,
"step": 134
},
{
"epoch": 0.09821753364859949,
"grad_norm": 2.5969551378868685,
"learning_rate": 9.877986042595062e-06,
"loss": 2.2294,
"step": 135
},
{
"epoch": 0.09894507093488542,
"grad_norm": 1.5897358074382941,
"learning_rate": 9.87538708677006e-06,
"loss": 2.2595,
"step": 136
},
{
"epoch": 0.09967260822117134,
"grad_norm": 3.023355656547637,
"learning_rate": 9.872761091548933e-06,
"loss": 2.2195,
"step": 137
},
{
"epoch": 0.10040014550745725,
"grad_norm": 2.5009819811646414,
"learning_rate": 9.870108071495721e-06,
"loss": 2.2027,
"step": 138
},
{
"epoch": 0.10112768279374318,
"grad_norm": 2.649004490599995,
"learning_rate": 9.867428041324345e-06,
"loss": 2.2478,
"step": 139
},
{
"epoch": 0.1018552200800291,
"grad_norm": 2.0035185647332976,
"learning_rate": 9.864721015898524e-06,
"loss": 2.2083,
"step": 140
},
{
"epoch": 0.10258275736631503,
"grad_norm": 2.6207182124681823,
"learning_rate": 9.861987010231701e-06,
"loss": 2.1863,
"step": 141
},
{
"epoch": 0.10331029465260094,
"grad_norm": 2.262610029951531,
"learning_rate": 9.85922603948695e-06,
"loss": 2.24,
"step": 142
},
{
"epoch": 0.10403783193888687,
"grad_norm": 2.6477691672398174,
"learning_rate": 9.856438118976899e-06,
"loss": 2.2788,
"step": 143
},
{
"epoch": 0.1047653692251728,
"grad_norm": 2.582318665211976,
"learning_rate": 9.853623264163638e-06,
"loss": 2.2197,
"step": 144
},
{
"epoch": 0.1054929065114587,
"grad_norm": 2.3195328375845814,
"learning_rate": 9.850781490658643e-06,
"loss": 2.2181,
"step": 145
},
{
"epoch": 0.10622044379774463,
"grad_norm": 2.370457788315632,
"learning_rate": 9.84791281422268e-06,
"loss": 2.1894,
"step": 146
},
{
"epoch": 0.10694798108403056,
"grad_norm": 2.072643637054436,
"learning_rate": 9.845017250765721e-06,
"loss": 2.2038,
"step": 147
},
{
"epoch": 0.10767551837031648,
"grad_norm": 1.774580372637068,
"learning_rate": 9.84209481634686e-06,
"loss": 2.1874,
"step": 148
},
{
"epoch": 0.1084030556566024,
"grad_norm": 2.6456479441489185,
"learning_rate": 9.839145527174216e-06,
"loss": 2.2713,
"step": 149
},
{
"epoch": 0.10913059294288832,
"grad_norm": 2.4439242680492193,
"learning_rate": 9.836169399604846e-06,
"loss": 2.2124,
"step": 150
},
{
"epoch": 0.10985813022917425,
"grad_norm": 2.180030185422303,
"learning_rate": 9.833166450144665e-06,
"loss": 2.1744,
"step": 151
},
{
"epoch": 0.11058566751546017,
"grad_norm": 2.3348647373216513,
"learning_rate": 9.830136695448334e-06,
"loss": 2.19,
"step": 152
},
{
"epoch": 0.11131320480174609,
"grad_norm": 1.9656368778684938,
"learning_rate": 9.827080152319182e-06,
"loss": 2.2332,
"step": 153
},
{
"epoch": 0.11204074208803201,
"grad_norm": 1.8480175602182294,
"learning_rate": 9.823996837709114e-06,
"loss": 2.2119,
"step": 154
},
{
"epoch": 0.11276827937431794,
"grad_norm": 2.281426562659181,
"learning_rate": 9.820886768718503e-06,
"loss": 2.2268,
"step": 155
},
{
"epoch": 0.11349581666060386,
"grad_norm": 1.7489041448103229,
"learning_rate": 9.817749962596115e-06,
"loss": 2.2378,
"step": 156
},
{
"epoch": 0.11422335394688977,
"grad_norm": 2.782758570162382,
"learning_rate": 9.814586436738998e-06,
"loss": 2.2052,
"step": 157
},
{
"epoch": 0.1149508912331757,
"grad_norm": 2.2126011643137358,
"learning_rate": 9.811396208692387e-06,
"loss": 2.1733,
"step": 158
},
{
"epoch": 0.11567842851946163,
"grad_norm": 2.1992962836193857,
"learning_rate": 9.808179296149616e-06,
"loss": 2.1681,
"step": 159
},
{
"epoch": 0.11640596580574754,
"grad_norm": 2.492426612704936,
"learning_rate": 9.804935716952011e-06,
"loss": 2.0941,
"step": 160
},
{
"epoch": 0.11713350309203346,
"grad_norm": 1.8537517699892123,
"learning_rate": 9.801665489088795e-06,
"loss": 2.2002,
"step": 161
},
{
"epoch": 0.11786104037831939,
"grad_norm": 1.7969112288772866,
"learning_rate": 9.798368630696984e-06,
"loss": 2.1987,
"step": 162
},
{
"epoch": 0.11858857766460532,
"grad_norm": 2.5513186249381343,
"learning_rate": 9.795045160061295e-06,
"loss": 2.2223,
"step": 163
},
{
"epoch": 0.11931611495089123,
"grad_norm": 1.9389767804192248,
"learning_rate": 9.791695095614036e-06,
"loss": 2.2047,
"step": 164
},
{
"epoch": 0.12004365223717715,
"grad_norm": 2.0700871590965413,
"learning_rate": 9.788318455935008e-06,
"loss": 2.2499,
"step": 165
},
{
"epoch": 0.12077118952346308,
"grad_norm": 1.6399304993048427,
"learning_rate": 9.7849152597514e-06,
"loss": 2.1931,
"step": 166
},
{
"epoch": 0.121498726809749,
"grad_norm": 2.736143568317565,
"learning_rate": 9.781485525937683e-06,
"loss": 2.2173,
"step": 167
},
{
"epoch": 0.12222626409603492,
"grad_norm": 2.3472764088553877,
"learning_rate": 9.778029273515519e-06,
"loss": 2.1949,
"step": 168
},
{
"epoch": 0.12295380138232084,
"grad_norm": 2.144561428349233,
"learning_rate": 9.774546521653633e-06,
"loss": 2.2077,
"step": 169
},
{
"epoch": 0.12368133866860677,
"grad_norm": 2.0898248402072728,
"learning_rate": 9.771037289667726e-06,
"loss": 2.218,
"step": 170
},
{
"epoch": 0.1244088759548927,
"grad_norm": 2.303654173365009,
"learning_rate": 9.767501597020357e-06,
"loss": 2.1504,
"step": 171
},
{
"epoch": 0.1251364132411786,
"grad_norm": 1.9710676759990309,
"learning_rate": 9.76393946332084e-06,
"loss": 2.2076,
"step": 172
},
{
"epoch": 0.12586395052746452,
"grad_norm": 2.0524996430866356,
"learning_rate": 9.760350908325131e-06,
"loss": 2.1814,
"step": 173
},
{
"epoch": 0.12659148781375046,
"grad_norm": 1.980263193179824,
"learning_rate": 9.756735951935725e-06,
"loss": 2.2261,
"step": 174
},
{
"epoch": 0.12731902510003637,
"grad_norm": 2.3228604960740595,
"learning_rate": 9.753094614201542e-06,
"loss": 2.2165,
"step": 175
},
{
"epoch": 0.1280465623863223,
"grad_norm": 2.244176450327055,
"learning_rate": 9.749426915317812e-06,
"loss": 2.1894,
"step": 176
},
{
"epoch": 0.12877409967260822,
"grad_norm": 1.9074936064390875,
"learning_rate": 9.74573287562597e-06,
"loss": 2.231,
"step": 177
},
{
"epoch": 0.12950163695889413,
"grad_norm": 2.1129031026943457,
"learning_rate": 9.742012515613536e-06,
"loss": 2.2113,
"step": 178
},
{
"epoch": 0.13022917424518007,
"grad_norm": 2.1556088711700983,
"learning_rate": 9.738265855914014e-06,
"loss": 2.2041,
"step": 179
},
{
"epoch": 0.13095671153146599,
"grad_norm": 1.8507560510757504,
"learning_rate": 9.734492917306754e-06,
"loss": 2.2195,
"step": 180
},
{
"epoch": 0.1316842488177519,
"grad_norm": 1.8920978000890483,
"learning_rate": 9.730693720716866e-06,
"loss": 2.2247,
"step": 181
},
{
"epoch": 0.13241178610403784,
"grad_norm": 1.825970624258127,
"learning_rate": 9.72686828721508e-06,
"loss": 2.1789,
"step": 182
},
{
"epoch": 0.13313932339032375,
"grad_norm": 2.0662381173028095,
"learning_rate": 9.723016638017644e-06,
"loss": 2.1699,
"step": 183
},
{
"epoch": 0.1338668606766097,
"grad_norm": 1.5517316847482627,
"learning_rate": 9.719138794486198e-06,
"loss": 2.2443,
"step": 184
},
{
"epoch": 0.1345943979628956,
"grad_norm": 2.1385006346730355,
"learning_rate": 9.715234778127658e-06,
"loss": 2.1759,
"step": 185
},
{
"epoch": 0.1353219352491815,
"grad_norm": 1.5551074423625981,
"learning_rate": 9.711304610594104e-06,
"loss": 2.1856,
"step": 186
},
{
"epoch": 0.13604947253546745,
"grad_norm": 2.4865192981684436,
"learning_rate": 9.70734831368264e-06,
"loss": 2.1771,
"step": 187
},
{
"epoch": 0.13677700982175336,
"grad_norm": 2.074018167734129,
"learning_rate": 9.7033659093353e-06,
"loss": 2.2207,
"step": 188
},
{
"epoch": 0.13750454710803928,
"grad_norm": 2.3773220250146934,
"learning_rate": 9.699357419638904e-06,
"loss": 2.161,
"step": 189
},
{
"epoch": 0.13823208439432522,
"grad_norm": 2.325983230196079,
"learning_rate": 9.695322866824948e-06,
"loss": 2.1559,
"step": 190
},
{
"epoch": 0.13895962168061113,
"grad_norm": 2.1999495816652206,
"learning_rate": 9.691262273269472e-06,
"loss": 2.2153,
"step": 191
},
{
"epoch": 0.13968715896689704,
"grad_norm": 2.013387564938728,
"learning_rate": 9.687175661492944e-06,
"loss": 2.1737,
"step": 192
},
{
"epoch": 0.14041469625318298,
"grad_norm": 2.216898719559254,
"learning_rate": 9.683063054160136e-06,
"loss": 2.1715,
"step": 193
},
{
"epoch": 0.1411422335394689,
"grad_norm": 1.7993920418582208,
"learning_rate": 9.678924474079986e-06,
"loss": 2.1593,
"step": 194
},
{
"epoch": 0.14186977082575483,
"grad_norm": 2.144459012819178,
"learning_rate": 9.67475994420548e-06,
"loss": 2.1626,
"step": 195
},
{
"epoch": 0.14259730811204074,
"grad_norm": 2.0013966050180674,
"learning_rate": 9.670569487633534e-06,
"loss": 2.2098,
"step": 196
},
{
"epoch": 0.14332484539832666,
"grad_norm": 1.8221881690950872,
"learning_rate": 9.666353127604845e-06,
"loss": 2.1566,
"step": 197
},
{
"epoch": 0.1440523826846126,
"grad_norm": 1.8758800889845013,
"learning_rate": 9.66211088750378e-06,
"loss": 2.1654,
"step": 198
},
{
"epoch": 0.1447799199708985,
"grad_norm": 1.8952323085883283,
"learning_rate": 9.657842790858235e-06,
"loss": 2.2293,
"step": 199
},
{
"epoch": 0.14550745725718442,
"grad_norm": 1.527686543636565,
"learning_rate": 9.65354886133951e-06,
"loss": 2.2333,
"step": 200
},
{
"epoch": 0.14623499454347036,
"grad_norm": 2.097008587133337,
"learning_rate": 9.64922912276218e-06,
"loss": 2.2221,
"step": 201
},
{
"epoch": 0.14696253182975627,
"grad_norm": 1.7030718610790336,
"learning_rate": 9.644883599083959e-06,
"loss": 2.2129,
"step": 202
},
{
"epoch": 0.1476900691160422,
"grad_norm": 2.368365632717726,
"learning_rate": 9.640512314405563e-06,
"loss": 2.234,
"step": 203
},
{
"epoch": 0.14841760640232812,
"grad_norm": 1.880961780226356,
"learning_rate": 9.636115292970587e-06,
"loss": 2.1554,
"step": 204
},
{
"epoch": 0.14914514368861403,
"grad_norm": 2.3389376664065287,
"learning_rate": 9.63169255916536e-06,
"loss": 2.245,
"step": 205
},
{
"epoch": 0.14987268097489997,
"grad_norm": 1.9804703905200902,
"learning_rate": 9.627244137518821e-06,
"loss": 2.2199,
"step": 206
},
{
"epoch": 0.1506002182611859,
"grad_norm": 2.1750768527829174,
"learning_rate": 9.622770052702366e-06,
"loss": 2.1969,
"step": 207
},
{
"epoch": 0.1513277555474718,
"grad_norm": 2.0242161987667577,
"learning_rate": 9.618270329529734e-06,
"loss": 2.196,
"step": 208
},
{
"epoch": 0.15205529283375774,
"grad_norm": 2.2298649889244593,
"learning_rate": 9.613744992956844e-06,
"loss": 2.1373,
"step": 209
},
{
"epoch": 0.15278283012004365,
"grad_norm": 2.3350565636212828,
"learning_rate": 9.609194068081682e-06,
"loss": 2.1951,
"step": 210
},
{
"epoch": 0.15351036740632956,
"grad_norm": 1.557747949310923,
"learning_rate": 9.60461758014414e-06,
"loss": 2.139,
"step": 211
},
{
"epoch": 0.1542379046926155,
"grad_norm": 1.7716141898834719,
"learning_rate": 9.60001555452589e-06,
"loss": 2.1823,
"step": 212
},
{
"epoch": 0.1549654419789014,
"grad_norm": 1.462177931771256,
"learning_rate": 9.595388016750236e-06,
"loss": 2.2043,
"step": 213
},
{
"epoch": 0.15569297926518735,
"grad_norm": 1.4834208523290262,
"learning_rate": 9.590734992481978e-06,
"loss": 2.2348,
"step": 214
},
{
"epoch": 0.15642051655147327,
"grad_norm": 1.6233545338448112,
"learning_rate": 9.586056507527266e-06,
"loss": 2.1397,
"step": 215
},
{
"epoch": 0.15714805383775918,
"grad_norm": 1.8871156366419548,
"learning_rate": 9.581352587833455e-06,
"loss": 2.2117,
"step": 216
},
{
"epoch": 0.15787559112404512,
"grad_norm": 1.5564465487258499,
"learning_rate": 9.576623259488966e-06,
"loss": 2.2173,
"step": 217
},
{
"epoch": 0.15860312841033103,
"grad_norm": 1.3672789573676138,
"learning_rate": 9.571868548723137e-06,
"loss": 2.1513,
"step": 218
},
{
"epoch": 0.15933066569661694,
"grad_norm": 2.41665190233232,
"learning_rate": 9.567088481906084e-06,
"loss": 2.1893,
"step": 219
},
{
"epoch": 0.16005820298290288,
"grad_norm": 1.5217704051852425,
"learning_rate": 9.562283085548546e-06,
"loss": 2.2071,
"step": 220
},
{
"epoch": 0.1607857402691888,
"grad_norm": 1.7545928854710164,
"learning_rate": 9.55745238630174e-06,
"loss": 2.1944,
"step": 221
},
{
"epoch": 0.1615132775554747,
"grad_norm": 1.341908696911756,
"learning_rate": 9.552596410957224e-06,
"loss": 2.2336,
"step": 222
},
{
"epoch": 0.16224081484176064,
"grad_norm": 1.6189063753553028,
"learning_rate": 9.547715186446732e-06,
"loss": 2.1882,
"step": 223
},
{
"epoch": 0.16296835212804656,
"grad_norm": 1.6872635637497055,
"learning_rate": 9.542808739842034e-06,
"loss": 2.2141,
"step": 224
},
{
"epoch": 0.1636958894143325,
"grad_norm": 1.6381480046906143,
"learning_rate": 9.537877098354787e-06,
"loss": 2.2367,
"step": 225
},
{
"epoch": 0.1644234267006184,
"grad_norm": 1.6166356354012765,
"learning_rate": 9.532920289336378e-06,
"loss": 2.1759,
"step": 226
},
{
"epoch": 0.16515096398690432,
"grad_norm": 2.561546132872073,
"learning_rate": 9.52793834027778e-06,
"loss": 2.2114,
"step": 227
},
{
"epoch": 0.16587850127319026,
"grad_norm": 1.4895519618659814,
"learning_rate": 9.522931278809393e-06,
"loss": 2.2076,
"step": 228
},
{
"epoch": 0.16660603855947617,
"grad_norm": 1.4028314670269106,
"learning_rate": 9.517899132700889e-06,
"loss": 2.1887,
"step": 229
},
{
"epoch": 0.16733357584576208,
"grad_norm": 1.7663091760320573,
"learning_rate": 9.512841929861069e-06,
"loss": 2.2005,
"step": 230
},
{
"epoch": 0.16806111313204802,
"grad_norm": 1.6922932498061918,
"learning_rate": 9.507759698337698e-06,
"loss": 2.1554,
"step": 231
},
{
"epoch": 0.16878865041833394,
"grad_norm": 1.6345258058405243,
"learning_rate": 9.50265246631735e-06,
"loss": 2.1903,
"step": 232
},
{
"epoch": 0.16951618770461988,
"grad_norm": 1.515813866060611,
"learning_rate": 9.49752026212526e-06,
"loss": 2.2268,
"step": 233
},
{
"epoch": 0.1702437249909058,
"grad_norm": 1.8331281614451886,
"learning_rate": 9.492363114225156e-06,
"loss": 2.2079,
"step": 234
},
{
"epoch": 0.1709712622771917,
"grad_norm": 1.4423645928995934,
"learning_rate": 9.487181051219107e-06,
"loss": 2.1809,
"step": 235
},
{
"epoch": 0.17169879956347764,
"grad_norm": 1.7178706823962022,
"learning_rate": 9.481974101847371e-06,
"loss": 2.1769,
"step": 236
},
{
"epoch": 0.17242633684976355,
"grad_norm": 1.888610932855882,
"learning_rate": 9.476742294988214e-06,
"loss": 2.1868,
"step": 237
},
{
"epoch": 0.17315387413604946,
"grad_norm": 1.452915043299178,
"learning_rate": 9.471485659657782e-06,
"loss": 2.2193,
"step": 238
},
{
"epoch": 0.1738814114223354,
"grad_norm": 1.5293848443309745,
"learning_rate": 9.466204225009905e-06,
"loss": 2.1811,
"step": 239
},
{
"epoch": 0.17460894870862131,
"grad_norm": 1.5433952042546901,
"learning_rate": 9.460898020335964e-06,
"loss": 2.2406,
"step": 240
},
{
"epoch": 0.17533648599490723,
"grad_norm": 1.4510435811175628,
"learning_rate": 9.455567075064715e-06,
"loss": 2.2222,
"step": 241
},
{
"epoch": 0.17606402328119317,
"grad_norm": 1.5139877812880527,
"learning_rate": 9.450211418762123e-06,
"loss": 2.195,
"step": 242
},
{
"epoch": 0.17679156056747908,
"grad_norm": 1.376158748717973,
"learning_rate": 9.444831081131209e-06,
"loss": 2.2291,
"step": 243
},
{
"epoch": 0.17751909785376502,
"grad_norm": 1.6584629058122018,
"learning_rate": 9.439426092011877e-06,
"loss": 2.1576,
"step": 244
},
{
"epoch": 0.17824663514005093,
"grad_norm": 1.794121606128276,
"learning_rate": 9.433996481380747e-06,
"loss": 2.2483,
"step": 245
},
{
"epoch": 0.17897417242633684,
"grad_norm": 1.3376769119817673,
"learning_rate": 9.428542279351e-06,
"loss": 2.1706,
"step": 246
},
{
"epoch": 0.17970170971262278,
"grad_norm": 1.7822658414273862,
"learning_rate": 9.423063516172195e-06,
"loss": 2.2478,
"step": 247
},
{
"epoch": 0.1804292469989087,
"grad_norm": 1.3332375488317492,
"learning_rate": 9.417560222230115e-06,
"loss": 2.1941,
"step": 248
},
{
"epoch": 0.1811567842851946,
"grad_norm": 1.8874563547747,
"learning_rate": 9.412032428046594e-06,
"loss": 2.2222,
"step": 249
},
{
"epoch": 0.18188432157148055,
"grad_norm": 1.4438456592645297,
"learning_rate": 9.40648016427934e-06,
"loss": 2.1695,
"step": 250
},
{
"epoch": 0.18261185885776646,
"grad_norm": 1.53967957798226,
"learning_rate": 9.400903461721783e-06,
"loss": 2.1682,
"step": 251
},
{
"epoch": 0.18333939614405237,
"grad_norm": 1.4287131829796078,
"learning_rate": 9.395302351302881e-06,
"loss": 2.213,
"step": 252
},
{
"epoch": 0.1840669334303383,
"grad_norm": 1.7632259286712169,
"learning_rate": 9.38967686408697e-06,
"loss": 2.1377,
"step": 253
},
{
"epoch": 0.18479447071662422,
"grad_norm": 1.549901098906108,
"learning_rate": 9.384027031273575e-06,
"loss": 2.1412,
"step": 254
},
{
"epoch": 0.18552200800291016,
"grad_norm": 1.3651034612955162,
"learning_rate": 9.37835288419725e-06,
"loss": 2.1985,
"step": 255
},
{
"epoch": 0.18624954528919607,
"grad_norm": 1.5072473907035329,
"learning_rate": 9.372654454327394e-06,
"loss": 2.1843,
"step": 256
},
{
"epoch": 0.18697708257548198,
"grad_norm": 1.7799488412583535,
"learning_rate": 9.366931773268083e-06,
"loss": 2.1649,
"step": 257
},
{
"epoch": 0.18770461986176792,
"grad_norm": 1.4893473914063207,
"learning_rate": 9.361184872757894e-06,
"loss": 2.2202,
"step": 258
},
{
"epoch": 0.18843215714805384,
"grad_norm": 1.5304343937588376,
"learning_rate": 9.355413784669722e-06,
"loss": 2.1655,
"step": 259
},
{
"epoch": 0.18915969443433975,
"grad_norm": 1.3086425612708208,
"learning_rate": 9.349618541010616e-06,
"loss": 2.1999,
"step": 260
},
{
"epoch": 0.1898872317206257,
"grad_norm": 1.4216843105367578,
"learning_rate": 9.343799173921591e-06,
"loss": 2.1698,
"step": 261
},
{
"epoch": 0.1906147690069116,
"grad_norm": 1.3292751193395729,
"learning_rate": 9.337955715677452e-06,
"loss": 2.1372,
"step": 262
},
{
"epoch": 0.19134230629319754,
"grad_norm": 1.496179279166623,
"learning_rate": 9.332088198686618e-06,
"loss": 2.1554,
"step": 263
},
{
"epoch": 0.19206984357948345,
"grad_norm": 1.474351638026655,
"learning_rate": 9.326196655490935e-06,
"loss": 2.1968,
"step": 264
},
{
"epoch": 0.19279738086576936,
"grad_norm": 1.6409542250950013,
"learning_rate": 9.32028111876551e-06,
"loss": 2.1943,
"step": 265
},
{
"epoch": 0.1935249181520553,
"grad_norm": 1.6888420163354685,
"learning_rate": 9.314341621318512e-06,
"loss": 2.2244,
"step": 266
},
{
"epoch": 0.19425245543834121,
"grad_norm": 1.4494064790337373,
"learning_rate": 9.308378196091006e-06,
"loss": 2.2073,
"step": 267
},
{
"epoch": 0.19497999272462713,
"grad_norm": 1.4538249282337905,
"learning_rate": 9.302390876156756e-06,
"loss": 2.2282,
"step": 268
},
{
"epoch": 0.19570753001091307,
"grad_norm": 1.4843118954742103,
"learning_rate": 9.296379694722051e-06,
"loss": 2.1769,
"step": 269
},
{
"epoch": 0.19643506729719898,
"grad_norm": 1.6020059875422126,
"learning_rate": 9.29034468512552e-06,
"loss": 2.181,
"step": 270
},
{
"epoch": 0.1971626045834849,
"grad_norm": 1.5715088934862171,
"learning_rate": 9.284285880837947e-06,
"loss": 2.2187,
"step": 271
},
{
"epoch": 0.19789014186977083,
"grad_norm": 1.3806795102455487,
"learning_rate": 9.278203315462078e-06,
"loss": 2.1691,
"step": 272
},
{
"epoch": 0.19861767915605674,
"grad_norm": 1.4921891885676057,
"learning_rate": 9.272097022732444e-06,
"loss": 2.1588,
"step": 273
},
{
"epoch": 0.19934521644234268,
"grad_norm": 1.6009588514320718,
"learning_rate": 9.26596703651517e-06,
"loss": 2.1602,
"step": 274
},
{
"epoch": 0.2000727537286286,
"grad_norm": 1.3637408490300853,
"learning_rate": 9.259813390807788e-06,
"loss": 2.1776,
"step": 275
},
{
"epoch": 0.2008002910149145,
"grad_norm": 1.666708428684375,
"learning_rate": 9.253636119739046e-06,
"loss": 2.1571,
"step": 276
},
{
"epoch": 0.20152782830120045,
"grad_norm": 1.6890538727100897,
"learning_rate": 9.247435257568724e-06,
"loss": 2.1871,
"step": 277
},
{
"epoch": 0.20225536558748636,
"grad_norm": 1.472406127157111,
"learning_rate": 9.241210838687438e-06,
"loss": 2.1659,
"step": 278
},
{
"epoch": 0.20298290287377227,
"grad_norm": 1.99532715556326,
"learning_rate": 9.23496289761645e-06,
"loss": 2.2414,
"step": 279
},
{
"epoch": 0.2037104401600582,
"grad_norm": 1.632025214846479,
"learning_rate": 9.228691469007487e-06,
"loss": 2.1852,
"step": 280
},
{
"epoch": 0.20443797744634412,
"grad_norm": 1.9471642003418521,
"learning_rate": 9.222396587642528e-06,
"loss": 2.1996,
"step": 281
},
{
"epoch": 0.20516551473263006,
"grad_norm": 1.6939593957799761,
"learning_rate": 9.216078288433632e-06,
"loss": 2.234,
"step": 282
},
{
"epoch": 0.20589305201891597,
"grad_norm": 1.6546532687407742,
"learning_rate": 9.209736606422736e-06,
"loss": 2.2038,
"step": 283
},
{
"epoch": 0.20662058930520188,
"grad_norm": 1.6921033877152574,
"learning_rate": 9.203371576781457e-06,
"loss": 2.1592,
"step": 284
},
{
"epoch": 0.20734812659148782,
"grad_norm": 1.9393189311581847,
"learning_rate": 9.1969832348109e-06,
"loss": 2.1711,
"step": 285
},
{
"epoch": 0.20807566387777374,
"grad_norm": 1.712493550054773,
"learning_rate": 9.190571615941462e-06,
"loss": 2.2575,
"step": 286
},
{
"epoch": 0.20880320116405965,
"grad_norm": 1.3814311046647583,
"learning_rate": 9.18413675573264e-06,
"loss": 2.2285,
"step": 287
},
{
"epoch": 0.2095307384503456,
"grad_norm": 1.898478684842769,
"learning_rate": 9.177678689872831e-06,
"loss": 2.1563,
"step": 288
},
{
"epoch": 0.2102582757366315,
"grad_norm": 1.4946721876330928,
"learning_rate": 9.171197454179124e-06,
"loss": 2.2157,
"step": 289
},
{
"epoch": 0.2109858130229174,
"grad_norm": 1.6336273362064742,
"learning_rate": 9.16469308459712e-06,
"loss": 2.2124,
"step": 290
},
{
"epoch": 0.21171335030920335,
"grad_norm": 1.4908709457226583,
"learning_rate": 9.158165617200717e-06,
"loss": 2.1911,
"step": 291
},
{
"epoch": 0.21244088759548926,
"grad_norm": 1.3493887453363558,
"learning_rate": 9.151615088191918e-06,
"loss": 2.2009,
"step": 292
},
{
"epoch": 0.2131684248817752,
"grad_norm": 1.502238823825621,
"learning_rate": 9.14504153390063e-06,
"loss": 2.1473,
"step": 293
},
{
"epoch": 0.21389596216806112,
"grad_norm": 1.5170516607198907,
"learning_rate": 9.138444990784455e-06,
"loss": 2.2,
"step": 294
},
{
"epoch": 0.21462349945434703,
"grad_norm": 1.3075648105696178,
"learning_rate": 9.131825495428496e-06,
"loss": 2.2107,
"step": 295
},
{
"epoch": 0.21535103674063297,
"grad_norm": 1.5657553421243682,
"learning_rate": 9.125183084545158e-06,
"loss": 2.179,
"step": 296
},
{
"epoch": 0.21607857402691888,
"grad_norm": 1.513376266046179,
"learning_rate": 9.118517794973925e-06,
"loss": 2.2405,
"step": 297
},
{
"epoch": 0.2168061113132048,
"grad_norm": 1.7142209859560977,
"learning_rate": 9.111829663681182e-06,
"loss": 2.2293,
"step": 298
},
{
"epoch": 0.21753364859949073,
"grad_norm": 1.4146055271200115,
"learning_rate": 9.105118727759984e-06,
"loss": 2.2409,
"step": 299
},
{
"epoch": 0.21826118588577664,
"grad_norm": 1.3274975658992578,
"learning_rate": 9.098385024429875e-06,
"loss": 2.1765,
"step": 300
},
{
"epoch": 0.21898872317206255,
"grad_norm": 1.5492595246341485,
"learning_rate": 9.09162859103666e-06,
"loss": 2.223,
"step": 301
},
{
"epoch": 0.2197162604583485,
"grad_norm": 1.4496566082639615,
"learning_rate": 9.08484946505221e-06,
"loss": 2.2214,
"step": 302
},
{
"epoch": 0.2204437977446344,
"grad_norm": 1.4095738528344526,
"learning_rate": 9.078047684074254e-06,
"loss": 2.2012,
"step": 303
},
{
"epoch": 0.22117133503092035,
"grad_norm": 1.6379763890733414,
"learning_rate": 9.071223285826166e-06,
"loss": 2.1833,
"step": 304
},
{
"epoch": 0.22189887231720626,
"grad_norm": 1.500367513516407,
"learning_rate": 9.064376308156754e-06,
"loss": 2.1929,
"step": 305
},
{
"epoch": 0.22262640960349217,
"grad_norm": 1.4210992503367728,
"learning_rate": 9.057506789040063e-06,
"loss": 2.189,
"step": 306
},
{
"epoch": 0.2233539468897781,
"grad_norm": 1.3727183620185701,
"learning_rate": 9.050614766575147e-06,
"loss": 2.183,
"step": 307
},
{
"epoch": 0.22408148417606402,
"grad_norm": 1.2406796013559476,
"learning_rate": 9.043700278985867e-06,
"loss": 2.1905,
"step": 308
},
{
"epoch": 0.22480902146234993,
"grad_norm": 1.597577469216686,
"learning_rate": 9.03676336462068e-06,
"loss": 2.2035,
"step": 309
},
{
"epoch": 0.22553655874863587,
"grad_norm": 1.4225372502609417,
"learning_rate": 9.029804061952426e-06,
"loss": 2.1641,
"step": 310
},
{
"epoch": 0.22626409603492179,
"grad_norm": 1.2629130268864135,
"learning_rate": 9.022822409578106e-06,
"loss": 2.1931,
"step": 311
},
{
"epoch": 0.22699163332120773,
"grad_norm": 1.3676865135238048,
"learning_rate": 9.015818446218683e-06,
"loss": 2.1984,
"step": 312
},
{
"epoch": 0.22771917060749364,
"grad_norm": 1.3928409928251604,
"learning_rate": 9.008792210718854e-06,
"loss": 2.1726,
"step": 313
},
{
"epoch": 0.22844670789377955,
"grad_norm": 1.6216502611547985,
"learning_rate": 9.00174374204684e-06,
"loss": 2.1881,
"step": 314
},
{
"epoch": 0.2291742451800655,
"grad_norm": 4.52807672247779,
"learning_rate": 8.994673079294171e-06,
"loss": 2.2517,
"step": 315
},
{
"epoch": 0.2299017824663514,
"grad_norm": 1.8101091064623136,
"learning_rate": 8.987580261675466e-06,
"loss": 2.1813,
"step": 316
},
{
"epoch": 0.2306293197526373,
"grad_norm": 1.5021589664831807,
"learning_rate": 8.98046532852822e-06,
"loss": 2.1709,
"step": 317
},
{
"epoch": 0.23135685703892325,
"grad_norm": 1.2800291466011613,
"learning_rate": 8.973328319312577e-06,
"loss": 2.1939,
"step": 318
},
{
"epoch": 0.23208439432520916,
"grad_norm": 1.7339241475054041,
"learning_rate": 8.966169273611125e-06,
"loss": 2.1608,
"step": 319
},
{
"epoch": 0.23281193161149508,
"grad_norm": 1.4736261516509148,
"learning_rate": 8.958988231128665e-06,
"loss": 2.1797,
"step": 320
},
{
"epoch": 0.23353946889778102,
"grad_norm": 1.4139860896226535,
"learning_rate": 8.95178523169199e-06,
"loss": 2.2057,
"step": 321
},
{
"epoch": 0.23426700618406693,
"grad_norm": 1.6671639145660242,
"learning_rate": 8.944560315249676e-06,
"loss": 2.1635,
"step": 322
},
{
"epoch": 0.23499454347035287,
"grad_norm": 1.598924274653862,
"learning_rate": 8.937313521871846e-06,
"loss": 2.224,
"step": 323
},
{
"epoch": 0.23572208075663878,
"grad_norm": 1.8671189985221208,
"learning_rate": 8.930044891749962e-06,
"loss": 2.1746,
"step": 324
},
{
"epoch": 0.2364496180429247,
"grad_norm": 1.3561015874043998,
"learning_rate": 8.922754465196591e-06,
"loss": 2.2098,
"step": 325
},
{
"epoch": 0.23717715532921063,
"grad_norm": 2.0307652448169926,
"learning_rate": 8.915442282645183e-06,
"loss": 2.16,
"step": 326
},
{
"epoch": 0.23790469261549654,
"grad_norm": 1.6434640576558963,
"learning_rate": 8.908108384649856e-06,
"loss": 2.2079,
"step": 327
},
{
"epoch": 0.23863222990178246,
"grad_norm": 1.4279872738065433,
"learning_rate": 8.900752811885152e-06,
"loss": 2.2,
"step": 328
},
{
"epoch": 0.2393597671880684,
"grad_norm": 1.6970394563321913,
"learning_rate": 8.893375605145837e-06,
"loss": 2.1893,
"step": 329
},
{
"epoch": 0.2400873044743543,
"grad_norm": 1.5059035467757602,
"learning_rate": 8.885976805346651e-06,
"loss": 2.1675,
"step": 330
},
{
"epoch": 0.24081484176064022,
"grad_norm": 1.5043481771028886,
"learning_rate": 8.8785564535221e-06,
"loss": 2.1719,
"step": 331
},
{
"epoch": 0.24154237904692616,
"grad_norm": 1.4989135816810697,
"learning_rate": 8.871114590826211e-06,
"loss": 2.1782,
"step": 332
},
{
"epoch": 0.24226991633321207,
"grad_norm": 1.3065841727798744,
"learning_rate": 8.86365125853232e-06,
"loss": 2.2057,
"step": 333
},
{
"epoch": 0.242997453619498,
"grad_norm": 1.8903212768191282,
"learning_rate": 8.85616649803283e-06,
"loss": 2.1798,
"step": 334
},
{
"epoch": 0.24372499090578392,
"grad_norm": 1.419329884971465,
"learning_rate": 8.84866035083899e-06,
"loss": 2.2423,
"step": 335
},
{
"epoch": 0.24445252819206983,
"grad_norm": 1.5054024361601408,
"learning_rate": 8.841132858580661e-06,
"loss": 2.1573,
"step": 336
},
{
"epoch": 0.24518006547835577,
"grad_norm": 1.7230388656951334,
"learning_rate": 8.833584063006088e-06,
"loss": 2.0773,
"step": 337
},
{
"epoch": 0.2459076027646417,
"grad_norm": 1.4222728150757113,
"learning_rate": 8.826014005981662e-06,
"loss": 2.1373,
"step": 338
},
{
"epoch": 0.2466351400509276,
"grad_norm": 1.4823414448646695,
"learning_rate": 8.818422729491693e-06,
"loss": 2.2306,
"step": 339
},
{
"epoch": 0.24736267733721354,
"grad_norm": 1.494452896957868,
"learning_rate": 8.810810275638183e-06,
"loss": 2.1501,
"step": 340
},
{
"epoch": 0.24809021462349945,
"grad_norm": 1.4702282798707498,
"learning_rate": 8.803176686640577e-06,
"loss": 2.1816,
"step": 341
},
{
"epoch": 0.2488177519097854,
"grad_norm": 1.3618536433099357,
"learning_rate": 8.795522004835543e-06,
"loss": 2.2072,
"step": 342
},
{
"epoch": 0.2495452891960713,
"grad_norm": 1.3113305408053801,
"learning_rate": 8.787846272676728e-06,
"loss": 2.2214,
"step": 343
},
{
"epoch": 0.2502728264823572,
"grad_norm": 1.9543942178819584,
"learning_rate": 8.780149532734531e-06,
"loss": 2.1721,
"step": 344
},
{
"epoch": 0.25100036376864315,
"grad_norm": 1.7571304927742484,
"learning_rate": 8.772431827695862e-06,
"loss": 2.1813,
"step": 345
},
{
"epoch": 0.25172790105492904,
"grad_norm": 2.204086860989567,
"learning_rate": 8.764693200363897e-06,
"loss": 2.1743,
"step": 346
},
{
"epoch": 0.252455438341215,
"grad_norm": 1.5999845528923904,
"learning_rate": 8.756933693657863e-06,
"loss": 2.1814,
"step": 347
},
{
"epoch": 0.2531829756275009,
"grad_norm": 1.8635454967928038,
"learning_rate": 8.749153350612774e-06,
"loss": 2.2211,
"step": 348
},
{
"epoch": 0.25391051291378686,
"grad_norm": 1.4428311175390514,
"learning_rate": 8.74135221437921e-06,
"loss": 2.2052,
"step": 349
},
{
"epoch": 0.25463805020007274,
"grad_norm": 2.106620577197869,
"learning_rate": 8.733530328223076e-06,
"loss": 2.1838,
"step": 350
},
{
"epoch": 0.2553655874863587,
"grad_norm": 1.6014903315993732,
"learning_rate": 8.725687735525347e-06,
"loss": 2.2324,
"step": 351
},
{
"epoch": 0.2560931247726446,
"grad_norm": 1.5943126144634416,
"learning_rate": 8.71782447978185e-06,
"loss": 2.1376,
"step": 352
},
{
"epoch": 0.2568206620589305,
"grad_norm": 1.6101881809656868,
"learning_rate": 8.709940604603006e-06,
"loss": 2.1742,
"step": 353
},
{
"epoch": 0.25754819934521644,
"grad_norm": 1.3439129152348759,
"learning_rate": 8.702036153713594e-06,
"loss": 2.238,
"step": 354
},
{
"epoch": 0.2582757366315024,
"grad_norm": 1.569010680613162,
"learning_rate": 8.694111170952508e-06,
"loss": 2.2009,
"step": 355
},
{
"epoch": 0.25900327391778827,
"grad_norm": 1.542834123668552,
"learning_rate": 8.686165700272513e-06,
"loss": 2.1963,
"step": 356
},
{
"epoch": 0.2597308112040742,
"grad_norm": 1.8941251098493939,
"learning_rate": 8.678199785740003e-06,
"loss": 2.1238,
"step": 357
},
{
"epoch": 0.26045834849036015,
"grad_norm": 1.497786911579719,
"learning_rate": 8.670213471534759e-06,
"loss": 2.1888,
"step": 358
},
{
"epoch": 0.26118588577664603,
"grad_norm": 1.5438129971287569,
"learning_rate": 8.662206801949694e-06,
"loss": 2.2097,
"step": 359
},
{
"epoch": 0.26191342306293197,
"grad_norm": 1.7304337859507464,
"learning_rate": 8.65417982139062e-06,
"loss": 2.1145,
"step": 360
},
{
"epoch": 0.2626409603492179,
"grad_norm": 1.324711597748057,
"learning_rate": 8.646132574375994e-06,
"loss": 2.2167,
"step": 361
},
{
"epoch": 0.2633684976355038,
"grad_norm": 1.764099789083863,
"learning_rate": 8.638065105536669e-06,
"loss": 2.2043,
"step": 362
},
{
"epoch": 0.26409603492178974,
"grad_norm": 1.4187103713769413,
"learning_rate": 8.629977459615655e-06,
"loss": 2.1899,
"step": 363
},
{
"epoch": 0.2648235722080757,
"grad_norm": 1.7175448008154528,
"learning_rate": 8.621869681467865e-06,
"loss": 2.2032,
"step": 364
},
{
"epoch": 0.26555110949436156,
"grad_norm": 1.584647884308209,
"learning_rate": 8.613741816059867e-06,
"loss": 2.1902,
"step": 365
},
{
"epoch": 0.2662786467806475,
"grad_norm": 1.496402910794325,
"learning_rate": 8.605593908469635e-06,
"loss": 2.1673,
"step": 366
},
{
"epoch": 0.26700618406693344,
"grad_norm": 1.698578959322338,
"learning_rate": 8.597426003886295e-06,
"loss": 2.2301,
"step": 367
},
{
"epoch": 0.2677337213532194,
"grad_norm": 1.4974994622957056,
"learning_rate": 8.58923814760989e-06,
"loss": 2.2378,
"step": 368
},
{
"epoch": 0.26846125863950526,
"grad_norm": 1.646949899011469,
"learning_rate": 8.581030385051105e-06,
"loss": 2.2092,
"step": 369
},
{
"epoch": 0.2691887959257912,
"grad_norm": 1.3378903417269772,
"learning_rate": 8.572802761731031e-06,
"loss": 2.1916,
"step": 370
},
{
"epoch": 0.26991633321207714,
"grad_norm": 1.4087448945095493,
"learning_rate": 8.564555323280913e-06,
"loss": 2.2059,
"step": 371
},
{
"epoch": 0.270643870498363,
"grad_norm": 1.5838365104583751,
"learning_rate": 8.556288115441887e-06,
"loss": 2.2018,
"step": 372
},
{
"epoch": 0.27137140778464897,
"grad_norm": 1.5540637646051088,
"learning_rate": 8.548001184064733e-06,
"loss": 2.2461,
"step": 373
},
{
"epoch": 0.2720989450709349,
"grad_norm": 1.7111502870886097,
"learning_rate": 8.539694575109626e-06,
"loss": 2.1165,
"step": 374
},
{
"epoch": 0.2728264823572208,
"grad_norm": 1.5424590640718876,
"learning_rate": 8.531368334645865e-06,
"loss": 2.2229,
"step": 375
},
{
"epoch": 0.27355401964350673,
"grad_norm": 1.812143369112376,
"learning_rate": 8.523022508851634e-06,
"loss": 2.1401,
"step": 376
},
{
"epoch": 0.27428155692979267,
"grad_norm": 1.5062727694529647,
"learning_rate": 8.514657144013738e-06,
"loss": 2.2003,
"step": 377
},
{
"epoch": 0.27500909421607855,
"grad_norm": 1.413424435228159,
"learning_rate": 8.506272286527346e-06,
"loss": 2.1805,
"step": 378
},
{
"epoch": 0.2757366315023645,
"grad_norm": 1.5890140441579645,
"learning_rate": 8.497867982895741e-06,
"loss": 2.2219,
"step": 379
},
{
"epoch": 0.27646416878865043,
"grad_norm": 1.281171640918851,
"learning_rate": 8.489444279730046e-06,
"loss": 2.1925,
"step": 380
},
{
"epoch": 0.2771917060749363,
"grad_norm": 1.4263623146922937,
"learning_rate": 8.481001223748986e-06,
"loss": 2.1471,
"step": 381
},
{
"epoch": 0.27791924336122226,
"grad_norm": 2.1371121352259546,
"learning_rate": 8.47253886177861e-06,
"loss": 2.1793,
"step": 382
},
{
"epoch": 0.2786467806475082,
"grad_norm": 1.3412182076022034,
"learning_rate": 8.464057240752046e-06,
"loss": 2.2298,
"step": 383
},
{
"epoch": 0.2793743179337941,
"grad_norm": 2.071607077435448,
"learning_rate": 8.455556407709235e-06,
"loss": 2.2543,
"step": 384
},
{
"epoch": 0.28010185522008,
"grad_norm": 1.9534003413176857,
"learning_rate": 8.447036409796663e-06,
"loss": 2.1475,
"step": 385
},
{
"epoch": 0.28082939250636596,
"grad_norm": 1.6769392968093242,
"learning_rate": 8.438497294267117e-06,
"loss": 2.1668,
"step": 386
},
{
"epoch": 0.2815569297926519,
"grad_norm": 1.557532120926547,
"learning_rate": 8.429939108479403e-06,
"loss": 2.134,
"step": 387
},
{
"epoch": 0.2822844670789378,
"grad_norm": 2.1612204380901043,
"learning_rate": 8.421361899898095e-06,
"loss": 2.1667,
"step": 388
},
{
"epoch": 0.2830120043652237,
"grad_norm": 1.4942618996422175,
"learning_rate": 8.412765716093273e-06,
"loss": 2.1555,
"step": 389
},
{
"epoch": 0.28373954165150966,
"grad_norm": 1.4344662772626562,
"learning_rate": 8.404150604740248e-06,
"loss": 2.2251,
"step": 390
},
{
"epoch": 0.28446707893779555,
"grad_norm": 1.2683804314574985,
"learning_rate": 8.395516613619315e-06,
"loss": 2.1955,
"step": 391
},
{
"epoch": 0.2851946162240815,
"grad_norm": 1.3922747189250564,
"learning_rate": 8.386863790615472e-06,
"loss": 2.2443,
"step": 392
},
{
"epoch": 0.2859221535103674,
"grad_norm": 1.5881319507851879,
"learning_rate": 8.378192183718158e-06,
"loss": 2.1906,
"step": 393
},
{
"epoch": 0.2866496907966533,
"grad_norm": 1.473240672349657,
"learning_rate": 8.369501841021e-06,
"loss": 2.1416,
"step": 394
},
{
"epoch": 0.28737722808293925,
"grad_norm": 1.4687165410998146,
"learning_rate": 8.360792810721522e-06,
"loss": 2.1452,
"step": 395
},
{
"epoch": 0.2881047653692252,
"grad_norm": 1.579771253507425,
"learning_rate": 8.352065141120902e-06,
"loss": 2.2256,
"step": 396
},
{
"epoch": 0.2888323026555111,
"grad_norm": 1.5712685675891773,
"learning_rate": 8.343318880623688e-06,
"loss": 2.2189,
"step": 397
},
{
"epoch": 0.289559839941797,
"grad_norm": 1.5420480636071308,
"learning_rate": 8.334554077737535e-06,
"loss": 2.2173,
"step": 398
},
{
"epoch": 0.29028737722808295,
"grad_norm": 1.2859318999500267,
"learning_rate": 8.325770781072939e-06,
"loss": 2.2135,
"step": 399
},
{
"epoch": 0.29101491451436884,
"grad_norm": 1.4805265877565563,
"learning_rate": 8.316969039342963e-06,
"loss": 2.2217,
"step": 400
},
{
"epoch": 0.2917424518006548,
"grad_norm": 1.5545540894337095,
"learning_rate": 8.30814890136297e-06,
"loss": 2.1985,
"step": 401
},
{
"epoch": 0.2924699890869407,
"grad_norm": 1.3383510734550588,
"learning_rate": 8.299310416050345e-06,
"loss": 2.1915,
"step": 402
},
{
"epoch": 0.2931975263732266,
"grad_norm": 1.915208096416909,
"learning_rate": 8.290453632424236e-06,
"loss": 2.1534,
"step": 403
},
{
"epoch": 0.29392506365951254,
"grad_norm": 1.664670781828968,
"learning_rate": 8.281578599605269e-06,
"loss": 2.1847,
"step": 404
},
{
"epoch": 0.2946526009457985,
"grad_norm": 1.733732203769642,
"learning_rate": 8.272685366815287e-06,
"loss": 2.211,
"step": 405
},
{
"epoch": 0.2953801382320844,
"grad_norm": 1.3602606814344493,
"learning_rate": 8.26377398337707e-06,
"loss": 2.1909,
"step": 406
},
{
"epoch": 0.2961076755183703,
"grad_norm": 1.5107817144001174,
"learning_rate": 8.254844498714063e-06,
"loss": 2.1868,
"step": 407
},
{
"epoch": 0.29683521280465625,
"grad_norm": 1.8836347431425295,
"learning_rate": 8.2458969623501e-06,
"loss": 2.2369,
"step": 408
},
{
"epoch": 0.2975627500909422,
"grad_norm": 1.490350939718598,
"learning_rate": 8.23693142390914e-06,
"loss": 2.2138,
"step": 409
},
{
"epoch": 0.29829028737722807,
"grad_norm": 2.1389350266540177,
"learning_rate": 8.227947933114971e-06,
"loss": 2.2343,
"step": 410
},
{
"epoch": 0.299017824663514,
"grad_norm": 1.9494550174812935,
"learning_rate": 8.218946539790957e-06,
"loss": 2.2259,
"step": 411
},
{
"epoch": 0.29974536194979995,
"grad_norm": 2.0558110266623886,
"learning_rate": 8.209927293859746e-06,
"loss": 2.1916,
"step": 412
},
{
"epoch": 0.30047289923608583,
"grad_norm": 1.6565866622694128,
"learning_rate": 8.200890245342999e-06,
"loss": 2.1939,
"step": 413
},
{
"epoch": 0.3012004365223718,
"grad_norm": 2.0124988863289284,
"learning_rate": 8.191835444361113e-06,
"loss": 2.1739,
"step": 414
},
{
"epoch": 0.3019279738086577,
"grad_norm": 1.5885211482360033,
"learning_rate": 8.182762941132944e-06,
"loss": 2.1427,
"step": 415
},
{
"epoch": 0.3026555110949436,
"grad_norm": 1.9064461762750247,
"learning_rate": 8.173672785975522e-06,
"loss": 2.174,
"step": 416
},
{
"epoch": 0.30338304838122954,
"grad_norm": 1.404065532109891,
"learning_rate": 8.16456502930378e-06,
"loss": 2.1558,
"step": 417
},
{
"epoch": 0.3041105856675155,
"grad_norm": 2.326646391688612,
"learning_rate": 8.155439721630265e-06,
"loss": 2.2072,
"step": 418
},
{
"epoch": 0.30483812295380136,
"grad_norm": 1.7482726501120724,
"learning_rate": 8.146296913564872e-06,
"loss": 2.2442,
"step": 419
},
{
"epoch": 0.3055656602400873,
"grad_norm": 2.337940156742116,
"learning_rate": 8.13713665581455e-06,
"loss": 2.1828,
"step": 420
},
{
"epoch": 0.30629319752637324,
"grad_norm": 2.1322634931803286,
"learning_rate": 8.127958999183027e-06,
"loss": 2.1796,
"step": 421
},
{
"epoch": 0.3070207348126591,
"grad_norm": 1.9097123883738019,
"learning_rate": 8.118763994570528e-06,
"loss": 2.1734,
"step": 422
},
{
"epoch": 0.30774827209894506,
"grad_norm": 2.1980133785757396,
"learning_rate": 8.109551692973487e-06,
"loss": 2.1898,
"step": 423
},
{
"epoch": 0.308475809385231,
"grad_norm": 1.789583137950799,
"learning_rate": 8.100322145484275e-06,
"loss": 2.1475,
"step": 424
},
{
"epoch": 0.3092033466715169,
"grad_norm": 2.344482041800149,
"learning_rate": 8.091075403290905e-06,
"loss": 2.1686,
"step": 425
},
{
"epoch": 0.3099308839578028,
"grad_norm": 1.8283336967274593,
"learning_rate": 8.081811517676759e-06,
"loss": 2.1467,
"step": 426
},
{
"epoch": 0.31065842124408877,
"grad_norm": 1.8002355815592943,
"learning_rate": 8.072530540020294e-06,
"loss": 2.191,
"step": 427
},
{
"epoch": 0.3113859585303747,
"grad_norm": 1.5927825576828603,
"learning_rate": 8.063232521794762e-06,
"loss": 2.1891,
"step": 428
},
{
"epoch": 0.3121134958166606,
"grad_norm": 1.451643393359493,
"learning_rate": 8.053917514567927e-06,
"loss": 2.1759,
"step": 429
},
{
"epoch": 0.31284103310294653,
"grad_norm": 1.7610648505587718,
"learning_rate": 8.04458557000177e-06,
"loss": 2.1871,
"step": 430
},
{
"epoch": 0.31356857038923247,
"grad_norm": 1.473618048365197,
"learning_rate": 8.035236739852214e-06,
"loss": 2.1437,
"step": 431
},
{
"epoch": 0.31429610767551835,
"grad_norm": 1.8221809772213533,
"learning_rate": 8.025871075968828e-06,
"loss": 2.2207,
"step": 432
},
{
"epoch": 0.3150236449618043,
"grad_norm": 1.2813989706691338,
"learning_rate": 8.016488630294539e-06,
"loss": 2.2069,
"step": 433
},
{
"epoch": 0.31575118224809023,
"grad_norm": 2.0251930508569145,
"learning_rate": 8.007089454865358e-06,
"loss": 2.156,
"step": 434
},
{
"epoch": 0.3164787195343761,
"grad_norm": 1.4958585375734768,
"learning_rate": 7.997673601810071e-06,
"loss": 2.1811,
"step": 435
},
{
"epoch": 0.31720625682066206,
"grad_norm": 2.1481478251388233,
"learning_rate": 7.988241123349965e-06,
"loss": 2.2135,
"step": 436
},
{
"epoch": 0.317933794106948,
"grad_norm": 1.5434770811180365,
"learning_rate": 7.97879207179853e-06,
"loss": 2.1987,
"step": 437
},
{
"epoch": 0.3186613313932339,
"grad_norm": 1.9931565162404845,
"learning_rate": 7.969326499561173e-06,
"loss": 2.1962,
"step": 438
},
{
"epoch": 0.3193888686795198,
"grad_norm": 1.5486850673722312,
"learning_rate": 7.95984445913493e-06,
"loss": 2.1802,
"step": 439
},
{
"epoch": 0.32011640596580576,
"grad_norm": 1.8117775512143597,
"learning_rate": 7.950346003108167e-06,
"loss": 2.1405,
"step": 440
},
{
"epoch": 0.32084394325209165,
"grad_norm": 1.2724580035235387,
"learning_rate": 7.940831184160294e-06,
"loss": 2.1457,
"step": 441
},
{
"epoch": 0.3215714805383776,
"grad_norm": 1.8000824307049166,
"learning_rate": 7.93130005506147e-06,
"loss": 2.1788,
"step": 442
},
{
"epoch": 0.3222990178246635,
"grad_norm": 1.2920020781067392,
"learning_rate": 7.921752668672316e-06,
"loss": 2.1797,
"step": 443
},
{
"epoch": 0.3230265551109494,
"grad_norm": 1.426393538509052,
"learning_rate": 7.912189077943613e-06,
"loss": 2.1948,
"step": 444
},
{
"epoch": 0.32375409239723535,
"grad_norm": 1.4072389270233285,
"learning_rate": 7.902609335916015e-06,
"loss": 2.2199,
"step": 445
},
{
"epoch": 0.3244816296835213,
"grad_norm": 1.5274959145306195,
"learning_rate": 7.893013495719752e-06,
"loss": 2.2308,
"step": 446
},
{
"epoch": 0.32520916696980723,
"grad_norm": 1.6391686446377485,
"learning_rate": 7.883401610574338e-06,
"loss": 2.2187,
"step": 447
},
{
"epoch": 0.3259367042560931,
"grad_norm": 1.3663896663037896,
"learning_rate": 7.873773733788268e-06,
"loss": 2.1845,
"step": 448
},
{
"epoch": 0.32666424154237905,
"grad_norm": 1.4500864060090617,
"learning_rate": 7.864129918758738e-06,
"loss": 2.1479,
"step": 449
},
{
"epoch": 0.327391778828665,
"grad_norm": 2.1191880844490396,
"learning_rate": 7.854470218971333e-06,
"loss": 2.1975,
"step": 450
},
{
"epoch": 0.3281193161149509,
"grad_norm": 1.3905443689989125,
"learning_rate": 7.844794687999737e-06,
"loss": 2.2096,
"step": 451
},
{
"epoch": 0.3288468534012368,
"grad_norm": 1.3979439713741857,
"learning_rate": 7.835103379505433e-06,
"loss": 2.1892,
"step": 452
},
{
"epoch": 0.32957439068752276,
"grad_norm": 1.4535005426658243,
"learning_rate": 7.825396347237413e-06,
"loss": 2.232,
"step": 453
},
{
"epoch": 0.33030192797380864,
"grad_norm": 1.5115384501349436,
"learning_rate": 7.815673645031871e-06,
"loss": 2.1301,
"step": 454
},
{
"epoch": 0.3310294652600946,
"grad_norm": 1.3115282469872074,
"learning_rate": 7.805935326811913e-06,
"loss": 2.2099,
"step": 455
},
{
"epoch": 0.3317570025463805,
"grad_norm": 1.5010012823097187,
"learning_rate": 7.796181446587244e-06,
"loss": 2.145,
"step": 456
},
{
"epoch": 0.3324845398326664,
"grad_norm": 1.404187219427933,
"learning_rate": 7.786412058453886e-06,
"loss": 2.1492,
"step": 457
},
{
"epoch": 0.33321207711895234,
"grad_norm": 1.4296996293876862,
"learning_rate": 7.776627216593863e-06,
"loss": 2.2032,
"step": 458
},
{
"epoch": 0.3339396144052383,
"grad_norm": 1.4288606512177882,
"learning_rate": 7.766826975274916e-06,
"loss": 2.1794,
"step": 459
},
{
"epoch": 0.33466715169152417,
"grad_norm": 1.521228766731058,
"learning_rate": 7.75701138885018e-06,
"loss": 2.1904,
"step": 460
},
{
"epoch": 0.3353946889778101,
"grad_norm": 1.6173905994542024,
"learning_rate": 7.747180511757908e-06,
"loss": 2.1972,
"step": 461
},
{
"epoch": 0.33612222626409605,
"grad_norm": 1.303059707860955,
"learning_rate": 7.737334398521149e-06,
"loss": 2.1815,
"step": 462
},
{
"epoch": 0.33684976355038193,
"grad_norm": 1.6029388549317818,
"learning_rate": 7.727473103747456e-06,
"loss": 2.1548,
"step": 463
},
{
"epoch": 0.33757730083666787,
"grad_norm": 1.4310714925242356,
"learning_rate": 7.717596682128578e-06,
"loss": 2.2169,
"step": 464
},
{
"epoch": 0.3383048381229538,
"grad_norm": 1.5636696059731714,
"learning_rate": 7.707705188440165e-06,
"loss": 2.1424,
"step": 465
},
{
"epoch": 0.33903237540923975,
"grad_norm": 1.5700744253848755,
"learning_rate": 7.697798677541448e-06,
"loss": 2.2076,
"step": 466
},
{
"epoch": 0.33975991269552563,
"grad_norm": 1.4436465913463896,
"learning_rate": 7.687877204374957e-06,
"loss": 2.2143,
"step": 467
},
{
"epoch": 0.3404874499818116,
"grad_norm": 1.765254733044937,
"learning_rate": 7.677940823966196e-06,
"loss": 2.2218,
"step": 468
},
{
"epoch": 0.3412149872680975,
"grad_norm": 1.4691603784424878,
"learning_rate": 7.667989591423349e-06,
"loss": 2.1976,
"step": 469
},
{
"epoch": 0.3419425245543834,
"grad_norm": 1.3603055735560683,
"learning_rate": 7.658023561936966e-06,
"loss": 2.2193,
"step": 470
},
{
"epoch": 0.34267006184066934,
"grad_norm": 1.4465116666140336,
"learning_rate": 7.648042790779677e-06,
"loss": 2.1955,
"step": 471
},
{
"epoch": 0.3433975991269553,
"grad_norm": 1.3063737828027806,
"learning_rate": 7.638047333305853e-06,
"loss": 2.176,
"step": 472
},
{
"epoch": 0.34412513641324116,
"grad_norm": 1.4867284794499172,
"learning_rate": 7.628037244951328e-06,
"loss": 2.2157,
"step": 473
},
{
"epoch": 0.3448526736995271,
"grad_norm": 1.3664879462629613,
"learning_rate": 7.618012581233076e-06,
"loss": 2.2188,
"step": 474
},
{
"epoch": 0.34558021098581304,
"grad_norm": 1.7470519475565276,
"learning_rate": 7.607973397748909e-06,
"loss": 2.1446,
"step": 475
},
{
"epoch": 0.3463077482720989,
"grad_norm": 1.6154246963221328,
"learning_rate": 7.597919750177168e-06,
"loss": 2.2107,
"step": 476
},
{
"epoch": 0.34703528555838487,
"grad_norm": 1.3638455698941867,
"learning_rate": 7.587851694276412e-06,
"loss": 2.2398,
"step": 477
},
{
"epoch": 0.3477628228446708,
"grad_norm": 1.3982722629120483,
"learning_rate": 7.57776928588511e-06,
"loss": 2.1973,
"step": 478
},
{
"epoch": 0.3484903601309567,
"grad_norm": 1.357914570235351,
"learning_rate": 7.56767258092133e-06,
"loss": 2.1843,
"step": 479
},
{
"epoch": 0.34921789741724263,
"grad_norm": 1.6309991413499052,
"learning_rate": 7.557561635382433e-06,
"loss": 2.1556,
"step": 480
},
{
"epoch": 0.34994543470352857,
"grad_norm": 1.6193509733209566,
"learning_rate": 7.54743650534476e-06,
"loss": 2.1584,
"step": 481
},
{
"epoch": 0.35067297198981445,
"grad_norm": 1.2099067965626962,
"learning_rate": 7.537297246963316e-06,
"loss": 2.2221,
"step": 482
},
{
"epoch": 0.3514005092761004,
"grad_norm": 1.5900525315340603,
"learning_rate": 7.5271439164714695e-06,
"loss": 2.1933,
"step": 483
},
{
"epoch": 0.35212804656238633,
"grad_norm": 1.5438986652266296,
"learning_rate": 7.5169765701806295e-06,
"loss": 2.1357,
"step": 484
},
{
"epoch": 0.35285558384867227,
"grad_norm": 1.3208757653456373,
"learning_rate": 7.506795264479941e-06,
"loss": 2.1879,
"step": 485
},
{
"epoch": 0.35358312113495816,
"grad_norm": 1.4301990530187678,
"learning_rate": 7.4966000558359675e-06,
"loss": 2.1783,
"step": 486
},
{
"epoch": 0.3543106584212441,
"grad_norm": 1.4812716326736017,
"learning_rate": 7.486391000792379e-06,
"loss": 2.1626,
"step": 487
},
{
"epoch": 0.35503819570753004,
"grad_norm": 1.417579122162795,
"learning_rate": 7.476168155969643e-06,
"loss": 2.2251,
"step": 488
},
{
"epoch": 0.3557657329938159,
"grad_norm": 1.3774973192524718,
"learning_rate": 7.465931578064703e-06,
"loss": 2.1823,
"step": 489
},
{
"epoch": 0.35649327028010186,
"grad_norm": 1.5296783044040714,
"learning_rate": 7.455681323850669e-06,
"loss": 2.1694,
"step": 490
},
{
"epoch": 0.3572208075663878,
"grad_norm": 1.6575720924854538,
"learning_rate": 7.4454174501765e-06,
"loss": 2.1799,
"step": 491
},
{
"epoch": 0.3579483448526737,
"grad_norm": 1.546046770665215,
"learning_rate": 7.4351400139666894e-06,
"loss": 2.2169,
"step": 492
},
{
"epoch": 0.3586758821389596,
"grad_norm": 1.540389159578989,
"learning_rate": 7.424849072220953e-06,
"loss": 2.2173,
"step": 493
},
{
"epoch": 0.35940341942524556,
"grad_norm": 1.4043240669644461,
"learning_rate": 7.414544682013907e-06,
"loss": 2.2141,
"step": 494
},
{
"epoch": 0.36013095671153145,
"grad_norm": 1.8638300064501658,
"learning_rate": 7.404226900494753e-06,
"loss": 2.1952,
"step": 495
},
{
"epoch": 0.3608584939978174,
"grad_norm": 1.342885944687377,
"learning_rate": 7.3938957848869684e-06,
"loss": 2.1466,
"step": 496
},
{
"epoch": 0.3615860312841033,
"grad_norm": 1.5882952997860806,
"learning_rate": 7.3835513924879755e-06,
"loss": 2.1954,
"step": 497
},
{
"epoch": 0.3623135685703892,
"grad_norm": 1.2373309543768045,
"learning_rate": 7.373193780668835e-06,
"loss": 2.1891,
"step": 498
},
{
"epoch": 0.36304110585667515,
"grad_norm": 2.0443919878347168,
"learning_rate": 7.36282300687392e-06,
"loss": 2.1675,
"step": 499
},
{
"epoch": 0.3637686431429611,
"grad_norm": 1.3887286548737836,
"learning_rate": 7.35243912862061e-06,
"loss": 2.1893,
"step": 500
},
{
"epoch": 0.364496180429247,
"grad_norm": 2.719239252549507,
"learning_rate": 7.342042203498952e-06,
"loss": 2.2446,
"step": 501
},
{
"epoch": 0.3652237177155329,
"grad_norm": 1.7604260800939213,
"learning_rate": 7.33163228917136e-06,
"loss": 2.19,
"step": 502
},
{
"epoch": 0.36595125500181885,
"grad_norm": 1.8242987699374724,
"learning_rate": 7.321209443372284e-06,
"loss": 2.1801,
"step": 503
},
{
"epoch": 0.36667879228810474,
"grad_norm": 1.7293101420517358,
"learning_rate": 7.310773723907895e-06,
"loss": 2.1508,
"step": 504
},
{
"epoch": 0.3674063295743907,
"grad_norm": 2.0764703155554733,
"learning_rate": 7.300325188655762e-06,
"loss": 2.173,
"step": 505
},
{
"epoch": 0.3681338668606766,
"grad_norm": 1.588249317170812,
"learning_rate": 7.289863895564531e-06,
"loss": 2.1604,
"step": 506
},
{
"epoch": 0.36886140414696256,
"grad_norm": 2.1777553622237917,
"learning_rate": 7.279389902653606e-06,
"loss": 2.1974,
"step": 507
},
{
"epoch": 0.36958894143324844,
"grad_norm": 1.7809060067522566,
"learning_rate": 7.268903268012823e-06,
"loss": 2.1729,
"step": 508
},
{
"epoch": 0.3703164787195344,
"grad_norm": 1.9174174704695481,
"learning_rate": 7.258404049802135e-06,
"loss": 2.2177,
"step": 509
},
{
"epoch": 0.3710440160058203,
"grad_norm": 2.2243845689710615,
"learning_rate": 7.247892306251276e-06,
"loss": 2.1902,
"step": 510
},
{
"epoch": 0.3717715532921062,
"grad_norm": 1.7631413862695091,
"learning_rate": 7.237368095659459e-06,
"loss": 2.1981,
"step": 511
},
{
"epoch": 0.37249909057839214,
"grad_norm": 1.8123427455098935,
"learning_rate": 7.226831476395028e-06,
"loss": 2.1863,
"step": 512
},
{
"epoch": 0.3732266278646781,
"grad_norm": 1.5649100589426284,
"learning_rate": 7.216282506895155e-06,
"loss": 2.2349,
"step": 513
},
{
"epoch": 0.37395416515096397,
"grad_norm": 1.9927093639757405,
"learning_rate": 7.2057212456655055e-06,
"loss": 2.2133,
"step": 514
},
{
"epoch": 0.3746817024372499,
"grad_norm": 1.4433401145184241,
"learning_rate": 7.195147751279915e-06,
"loss": 2.1875,
"step": 515
},
{
"epoch": 0.37540923972353585,
"grad_norm": 1.4702187969117708,
"learning_rate": 7.184562082380069e-06,
"loss": 2.1786,
"step": 516
},
{
"epoch": 0.37613677700982173,
"grad_norm": 1.3663494773812568,
"learning_rate": 7.173964297675168e-06,
"loss": 2.2144,
"step": 517
},
{
"epoch": 0.37686431429610767,
"grad_norm": 1.346520463150049,
"learning_rate": 7.163354455941614e-06,
"loss": 2.2495,
"step": 518
},
{
"epoch": 0.3775918515823936,
"grad_norm": 1.5135411330745128,
"learning_rate": 7.152732616022675e-06,
"loss": 2.1856,
"step": 519
},
{
"epoch": 0.3783193888686795,
"grad_norm": 1.631804860578378,
"learning_rate": 7.142098836828162e-06,
"loss": 2.1518,
"step": 520
},
{
"epoch": 0.37904692615496544,
"grad_norm": 1.5375776438606996,
"learning_rate": 7.131453177334103e-06,
"loss": 2.2006,
"step": 521
},
{
"epoch": 0.3797744634412514,
"grad_norm": 1.3613025917303885,
"learning_rate": 7.120795696582419e-06,
"loss": 2.2173,
"step": 522
},
{
"epoch": 0.38050200072753726,
"grad_norm": 3.7985413414970797,
"learning_rate": 7.1101264536805885e-06,
"loss": 2.196,
"step": 523
},
{
"epoch": 0.3812295380138232,
"grad_norm": 1.4859354434913918,
"learning_rate": 7.099445507801324e-06,
"loss": 2.1631,
"step": 524
},
{
"epoch": 0.38195707530010914,
"grad_norm": 1.3409270310054628,
"learning_rate": 7.088752918182247e-06,
"loss": 2.2036,
"step": 525
},
{
"epoch": 0.3826846125863951,
"grad_norm": 1.3337351630271117,
"learning_rate": 7.078048744125553e-06,
"loss": 2.188,
"step": 526
},
{
"epoch": 0.38341214987268096,
"grad_norm": 1.4962256806518117,
"learning_rate": 7.067333044997689e-06,
"loss": 2.159,
"step": 527
},
{
"epoch": 0.3841396871589669,
"grad_norm": 1.6612181443786198,
"learning_rate": 7.0566058802290196e-06,
"loss": 2.1896,
"step": 528
},
{
"epoch": 0.38486722444525284,
"grad_norm": 1.4131231742415042,
"learning_rate": 7.045867309313499e-06,
"loss": 2.1807,
"step": 529
},
{
"epoch": 0.3855947617315387,
"grad_norm": 1.3191716183997426,
"learning_rate": 7.035117391808341e-06,
"loss": 2.159,
"step": 530
},
{
"epoch": 0.38632229901782467,
"grad_norm": 1.4960990390145708,
"learning_rate": 7.024356187333692e-06,
"loss": 2.0966,
"step": 531
},
{
"epoch": 0.3870498363041106,
"grad_norm": 1.429956783351808,
"learning_rate": 7.01358375557229e-06,
"loss": 2.1894,
"step": 532
},
{
"epoch": 0.3877773735903965,
"grad_norm": 1.4572049756953669,
"learning_rate": 7.0028001562691475e-06,
"loss": 2.1212,
"step": 533
},
{
"epoch": 0.38850491087668243,
"grad_norm": 1.5966434548753567,
"learning_rate": 6.9920054492312086e-06,
"loss": 2.1895,
"step": 534
},
{
"epoch": 0.38923244816296837,
"grad_norm": 1.33802025979571,
"learning_rate": 6.981199694327024e-06,
"loss": 2.1844,
"step": 535
},
{
"epoch": 0.38995998544925425,
"grad_norm": 1.3802399864961623,
"learning_rate": 6.97038295148642e-06,
"loss": 2.1677,
"step": 536
},
{
"epoch": 0.3906875227355402,
"grad_norm": 1.6091030572317013,
"learning_rate": 6.959555280700162e-06,
"loss": 2.1643,
"step": 537
},
{
"epoch": 0.39141506002182613,
"grad_norm": 2.7837498605385913,
"learning_rate": 6.948716742019616e-06,
"loss": 2.1977,
"step": 538
},
{
"epoch": 0.392142597308112,
"grad_norm": 1.500029077202491,
"learning_rate": 6.937867395556428e-06,
"loss": 2.1591,
"step": 539
},
{
"epoch": 0.39287013459439796,
"grad_norm": 1.3796722880177077,
"learning_rate": 6.927007301482187e-06,
"loss": 2.1181,
"step": 540
},
{
"epoch": 0.3935976718806839,
"grad_norm": 1.2841069999862686,
"learning_rate": 6.916136520028087e-06,
"loss": 2.1519,
"step": 541
},
{
"epoch": 0.3943252091669698,
"grad_norm": 1.4995288048414561,
"learning_rate": 6.905255111484592e-06,
"loss": 2.2125,
"step": 542
},
{
"epoch": 0.3950527464532557,
"grad_norm": 2.0316965689982607,
"learning_rate": 6.894363136201114e-06,
"loss": 2.1612,
"step": 543
},
{
"epoch": 0.39578028373954166,
"grad_norm": 1.3579977814916835,
"learning_rate": 6.88346065458566e-06,
"loss": 2.2381,
"step": 544
},
{
"epoch": 0.3965078210258276,
"grad_norm": 1.3472184874244,
"learning_rate": 6.8725477271045085e-06,
"loss": 2.1687,
"step": 545
},
{
"epoch": 0.3972353583121135,
"grad_norm": 1.485886897353197,
"learning_rate": 6.861624414281875e-06,
"loss": 2.1931,
"step": 546
},
{
"epoch": 0.3979628955983994,
"grad_norm": 1.4551297785369353,
"learning_rate": 6.850690776699574e-06,
"loss": 2.147,
"step": 547
},
{
"epoch": 0.39869043288468536,
"grad_norm": 1.4380183589510607,
"learning_rate": 6.8397468749966735e-06,
"loss": 2.1952,
"step": 548
},
{
"epoch": 0.39941797017097125,
"grad_norm": 1.5714557268967253,
"learning_rate": 6.8287927698691745e-06,
"loss": 2.1955,
"step": 549
},
{
"epoch": 0.4001455074572572,
"grad_norm": 1.279188232189353,
"learning_rate": 6.8178285220696686e-06,
"loss": 2.1981,
"step": 550
},
{
"epoch": 0.4008730447435431,
"grad_norm": 1.4424478452820921,
"learning_rate": 6.806854192406995e-06,
"loss": 2.1119,
"step": 551
},
{
"epoch": 0.401600582029829,
"grad_norm": 1.3026202589943117,
"learning_rate": 6.795869841745912e-06,
"loss": 2.1805,
"step": 552
},
{
"epoch": 0.40232811931611495,
"grad_norm": 1.409446508641355,
"learning_rate": 6.784875531006751e-06,
"loss": 2.2104,
"step": 553
},
{
"epoch": 0.4030556566024009,
"grad_norm": 1.3443877470647068,
"learning_rate": 6.7738713211650885e-06,
"loss": 2.1898,
"step": 554
},
{
"epoch": 0.4037831938886868,
"grad_norm": 1.6892639463777863,
"learning_rate": 6.762857273251396e-06,
"loss": 2.2186,
"step": 555
},
{
"epoch": 0.4045107311749727,
"grad_norm": 1.5688437019416017,
"learning_rate": 6.751833448350713e-06,
"loss": 2.1855,
"step": 556
},
{
"epoch": 0.40523826846125865,
"grad_norm": 1.603112976516178,
"learning_rate": 6.740799907602302e-06,
"loss": 2.2116,
"step": 557
},
{
"epoch": 0.40596580574754454,
"grad_norm": 1.4321804286899071,
"learning_rate": 6.729756712199309e-06,
"loss": 2.1305,
"step": 558
},
{
"epoch": 0.4066933430338305,
"grad_norm": 1.5339389129789849,
"learning_rate": 6.718703923388427e-06,
"loss": 2.1777,
"step": 559
},
{
"epoch": 0.4074208803201164,
"grad_norm": 1.503291365726574,
"learning_rate": 6.707641602469554e-06,
"loss": 2.1939,
"step": 560
},
{
"epoch": 0.4081484176064023,
"grad_norm": 1.9104614902741786,
"learning_rate": 6.696569810795455e-06,
"loss": 2.1462,
"step": 561
},
{
"epoch": 0.40887595489268824,
"grad_norm": 1.5181941385544446,
"learning_rate": 6.685488609771422e-06,
"loss": 2.2068,
"step": 562
},
{
"epoch": 0.4096034921789742,
"grad_norm": 1.854212603906569,
"learning_rate": 6.674398060854931e-06,
"loss": 2.1901,
"step": 563
},
{
"epoch": 0.4103310294652601,
"grad_norm": 1.5249028077188949,
"learning_rate": 6.6632982255553004e-06,
"loss": 2.207,
"step": 564
},
{
"epoch": 0.411058566751546,
"grad_norm": 2.0569427143628647,
"learning_rate": 6.652189165433356e-06,
"loss": 2.1998,
"step": 565
},
{
"epoch": 0.41178610403783195,
"grad_norm": 1.7548253682980153,
"learning_rate": 6.64107094210108e-06,
"loss": 2.1028,
"step": 566
},
{
"epoch": 0.4125136413241179,
"grad_norm": 1.641162449194734,
"learning_rate": 6.62994361722128e-06,
"loss": 2.1805,
"step": 567
},
{
"epoch": 0.41324117861040377,
"grad_norm": 1.4868121115076225,
"learning_rate": 6.618807252507238e-06,
"loss": 2.1483,
"step": 568
},
{
"epoch": 0.4139687158966897,
"grad_norm": 1.7597038884989522,
"learning_rate": 6.6076619097223735e-06,
"loss": 2.2063,
"step": 569
},
{
"epoch": 0.41469625318297565,
"grad_norm": 1.5240304099422757,
"learning_rate": 6.5965076506799e-06,
"loss": 2.2156,
"step": 570
},
{
"epoch": 0.41542379046926153,
"grad_norm": 1.564601016318413,
"learning_rate": 6.5853445372424805e-06,
"loss": 2.2211,
"step": 571
},
{
"epoch": 0.4161513277555475,
"grad_norm": 1.480576244672552,
"learning_rate": 6.574172631321885e-06,
"loss": 2.1956,
"step": 572
},
{
"epoch": 0.4168788650418334,
"grad_norm": 1.7476087188506109,
"learning_rate": 6.562991994878649e-06,
"loss": 2.1871,
"step": 573
},
{
"epoch": 0.4176064023281193,
"grad_norm": 1.3207274146769286,
"learning_rate": 6.551802689921726e-06,
"loss": 2.1756,
"step": 574
},
{
"epoch": 0.41833393961440524,
"grad_norm": 1.7068135107861524,
"learning_rate": 6.5406047785081485e-06,
"loss": 2.1858,
"step": 575
},
{
"epoch": 0.4190614769006912,
"grad_norm": 1.4042743643084192,
"learning_rate": 6.529398322742677e-06,
"loss": 2.1722,
"step": 576
},
{
"epoch": 0.41978901418697706,
"grad_norm": 1.9755798708905363,
"learning_rate": 6.518183384777468e-06,
"loss": 2.1576,
"step": 577
},
{
"epoch": 0.420516551473263,
"grad_norm": 1.5994303461102262,
"learning_rate": 6.506960026811712e-06,
"loss": 2.2132,
"step": 578
},
{
"epoch": 0.42124408875954894,
"grad_norm": 2.0649893491282114,
"learning_rate": 6.495728311091303e-06,
"loss": 2.1971,
"step": 579
},
{
"epoch": 0.4219716260458348,
"grad_norm": 1.8098677800737781,
"learning_rate": 6.484488299908487e-06,
"loss": 2.2014,
"step": 580
},
{
"epoch": 0.42269916333212076,
"grad_norm": 1.6214666551666128,
"learning_rate": 6.473240055601517e-06,
"loss": 2.2096,
"step": 581
},
{
"epoch": 0.4234267006184067,
"grad_norm": 1.7878777932661953,
"learning_rate": 6.46198364055431e-06,
"loss": 2.1419,
"step": 582
},
{
"epoch": 0.4241542379046926,
"grad_norm": 2.159509168849858,
"learning_rate": 6.450719117196094e-06,
"loss": 2.194,
"step": 583
},
{
"epoch": 0.42488177519097853,
"grad_norm": 1.605971453076131,
"learning_rate": 6.439446548001069e-06,
"loss": 2.2016,
"step": 584
},
{
"epoch": 0.42560931247726447,
"grad_norm": 1.2909853506916047,
"learning_rate": 6.4281659954880605e-06,
"loss": 2.1941,
"step": 585
},
{
"epoch": 0.4263368497635504,
"grad_norm": 1.6094780157573472,
"learning_rate": 6.416877522220167e-06,
"loss": 2.1441,
"step": 586
},
{
"epoch": 0.4270643870498363,
"grad_norm": 1.2477800378388897,
"learning_rate": 6.405581190804418e-06,
"loss": 2.1539,
"step": 587
},
{
"epoch": 0.42779192433612223,
"grad_norm": 1.4114854405252097,
"learning_rate": 6.394277063891422e-06,
"loss": 2.1303,
"step": 588
},
{
"epoch": 0.42851946162240817,
"grad_norm": 1.365404913537153,
"learning_rate": 6.382965204175027e-06,
"loss": 2.1426,
"step": 589
},
{
"epoch": 0.42924699890869406,
"grad_norm": 1.6876040695944547,
"learning_rate": 6.371645674391967e-06,
"loss": 2.1633,
"step": 590
},
{
"epoch": 0.42997453619498,
"grad_norm": 1.3968277764396695,
"learning_rate": 6.3603185373215105e-06,
"loss": 2.1505,
"step": 591
},
{
"epoch": 0.43070207348126593,
"grad_norm": 1.4586710459080776,
"learning_rate": 6.348983855785122e-06,
"loss": 2.1557,
"step": 592
},
{
"epoch": 0.4314296107675518,
"grad_norm": 1.369509599324066,
"learning_rate": 6.337641692646106e-06,
"loss": 2.163,
"step": 593
},
{
"epoch": 0.43215714805383776,
"grad_norm": 1.488317021603382,
"learning_rate": 6.326292110809258e-06,
"loss": 2.2305,
"step": 594
},
{
"epoch": 0.4328846853401237,
"grad_norm": 1.293463816991133,
"learning_rate": 6.314935173220524e-06,
"loss": 2.1733,
"step": 595
},
{
"epoch": 0.4336122226264096,
"grad_norm": 1.2329798588148075,
"learning_rate": 6.303570942866643e-06,
"loss": 2.2361,
"step": 596
},
{
"epoch": 0.4343397599126955,
"grad_norm": 1.274641665015929,
"learning_rate": 6.2921994827748e-06,
"loss": 2.1727,
"step": 597
},
{
"epoch": 0.43506729719898146,
"grad_norm": 1.52350376606858,
"learning_rate": 6.280820856012277e-06,
"loss": 2.1166,
"step": 598
},
{
"epoch": 0.43579483448526735,
"grad_norm": 1.3443923990170132,
"learning_rate": 6.269435125686105e-06,
"loss": 2.1592,
"step": 599
},
{
"epoch": 0.4365223717715533,
"grad_norm": 1.4532761222668074,
"learning_rate": 6.258042354942708e-06,
"loss": 2.1825,
"step": 600
},
{
"epoch": 0.4372499090578392,
"grad_norm": 1.4337559630256602,
"learning_rate": 6.2466426069675626e-06,
"loss": 2.1405,
"step": 601
},
{
"epoch": 0.4379774463441251,
"grad_norm": 1.3646159270348335,
"learning_rate": 6.235235944984835e-06,
"loss": 2.1662,
"step": 602
},
{
"epoch": 0.43870498363041105,
"grad_norm": 1.2228149723332558,
"learning_rate": 6.223822432257043e-06,
"loss": 2.2147,
"step": 603
},
{
"epoch": 0.439432520916697,
"grad_norm": 1.5215015742650733,
"learning_rate": 6.212402132084697e-06,
"loss": 2.1389,
"step": 604
},
{
"epoch": 0.44016005820298293,
"grad_norm": 1.3169917799704782,
"learning_rate": 6.200975107805951e-06,
"loss": 2.2103,
"step": 605
},
{
"epoch": 0.4408875954892688,
"grad_norm": 1.3932661877068893,
"learning_rate": 6.189541422796254e-06,
"loss": 2.215,
"step": 606
},
{
"epoch": 0.44161513277555475,
"grad_norm": 1.3096197602120236,
"learning_rate": 6.1781011404679905e-06,
"loss": 2.2127,
"step": 607
},
{
"epoch": 0.4423426700618407,
"grad_norm": 1.4934737646647973,
"learning_rate": 6.16665432427014e-06,
"loss": 2.2126,
"step": 608
},
{
"epoch": 0.4430702073481266,
"grad_norm": 1.2933269423981684,
"learning_rate": 6.155201037687917e-06,
"loss": 2.213,
"step": 609
},
{
"epoch": 0.4437977446344125,
"grad_norm": 1.2805748576266416,
"learning_rate": 6.1437413442424236e-06,
"loss": 2.2125,
"step": 610
},
{
"epoch": 0.44452528192069846,
"grad_norm": 1.2687373233631138,
"learning_rate": 6.132275307490291e-06,
"loss": 2.1634,
"step": 611
},
{
"epoch": 0.44525281920698434,
"grad_norm": 1.2334743124034346,
"learning_rate": 6.120802991023334e-06,
"loss": 2.1814,
"step": 612
},
{
"epoch": 0.4459803564932703,
"grad_norm": 1.3262631511688279,
"learning_rate": 6.109324458468198e-06,
"loss": 2.1997,
"step": 613
},
{
"epoch": 0.4467078937795562,
"grad_norm": 1.2608029031130938,
"learning_rate": 6.097839773485995e-06,
"loss": 2.2009,
"step": 614
},
{
"epoch": 0.4474354310658421,
"grad_norm": 1.600622551011594,
"learning_rate": 6.086348999771967e-06,
"loss": 2.1711,
"step": 615
},
{
"epoch": 0.44816296835212804,
"grad_norm": 1.3683377787237463,
"learning_rate": 6.074852201055121e-06,
"loss": 2.1955,
"step": 616
},
{
"epoch": 0.448890505638414,
"grad_norm": 8.048703163178985,
"learning_rate": 6.063349441097881e-06,
"loss": 2.1864,
"step": 617
},
{
"epoch": 0.44961804292469987,
"grad_norm": 1.5300459998198346,
"learning_rate": 6.051840783695731e-06,
"loss": 2.1785,
"step": 618
},
{
"epoch": 0.4503455802109858,
"grad_norm": 1.3019331521335327,
"learning_rate": 6.040326292676865e-06,
"loss": 2.188,
"step": 619
},
{
"epoch": 0.45107311749727175,
"grad_norm": 1.4083628372242818,
"learning_rate": 6.028806031901829e-06,
"loss": 2.1921,
"step": 620
},
{
"epoch": 0.45180065478355763,
"grad_norm": 1.3938607440843427,
"learning_rate": 6.0172800652631706e-06,
"loss": 2.1739,
"step": 621
},
{
"epoch": 0.45252819206984357,
"grad_norm": 1.3388550745415642,
"learning_rate": 6.005748456685077e-06,
"loss": 2.1935,
"step": 622
},
{
"epoch": 0.4532557293561295,
"grad_norm": 1.414747738164431,
"learning_rate": 5.994211270123034e-06,
"loss": 2.2062,
"step": 623
},
{
"epoch": 0.45398326664241545,
"grad_norm": 1.3971151162411044,
"learning_rate": 5.9826685695634575e-06,
"loss": 2.1685,
"step": 624
},
{
"epoch": 0.45471080392870133,
"grad_norm": 1.5318415273856,
"learning_rate": 5.971120419023349e-06,
"loss": 2.1522,
"step": 625
},
{
"epoch": 0.4554383412149873,
"grad_norm": 1.465485425902321,
"learning_rate": 5.959566882549936e-06,
"loss": 2.2035,
"step": 626
},
{
"epoch": 0.4561658785012732,
"grad_norm": 1.423172083365543,
"learning_rate": 5.948008024220311e-06,
"loss": 2.1863,
"step": 627
},
{
"epoch": 0.4568934157875591,
"grad_norm": 1.4240745852160854,
"learning_rate": 5.936443908141088e-06,
"loss": 2.1878,
"step": 628
},
{
"epoch": 0.45762095307384504,
"grad_norm": 1.3532825391471102,
"learning_rate": 5.924874598448038e-06,
"loss": 2.169,
"step": 629
},
{
"epoch": 0.458348490360131,
"grad_norm": 1.358186857880125,
"learning_rate": 5.913300159305741e-06,
"loss": 2.1414,
"step": 630
},
{
"epoch": 0.45907602764641686,
"grad_norm": 1.4951735213965345,
"learning_rate": 5.901720654907217e-06,
"loss": 2.1903,
"step": 631
},
{
"epoch": 0.4598035649327028,
"grad_norm": 1.5407272148019264,
"learning_rate": 5.8901361494735874e-06,
"loss": 2.2203,
"step": 632
},
{
"epoch": 0.46053110221898874,
"grad_norm": 1.3121352377647268,
"learning_rate": 5.878546707253704e-06,
"loss": 2.1472,
"step": 633
},
{
"epoch": 0.4612586395052746,
"grad_norm": 1.34642713273401,
"learning_rate": 5.8669523925238e-06,
"loss": 2.2139,
"step": 634
},
{
"epoch": 0.46198617679156057,
"grad_norm": 1.278017955157032,
"learning_rate": 5.855353269587134e-06,
"loss": 2.178,
"step": 635
},
{
"epoch": 0.4627137140778465,
"grad_norm": 1.4552647069125098,
"learning_rate": 5.843749402773629e-06,
"loss": 2.1409,
"step": 636
},
{
"epoch": 0.4634412513641324,
"grad_norm": 1.564092880214355,
"learning_rate": 5.8321408564395165e-06,
"loss": 2.1801,
"step": 637
},
{
"epoch": 0.46416878865041833,
"grad_norm": 1.2442285834349986,
"learning_rate": 5.820527694966988e-06,
"loss": 2.2191,
"step": 638
},
{
"epoch": 0.46489632593670427,
"grad_norm": 1.2375566820697201,
"learning_rate": 5.808909982763825e-06,
"loss": 2.1601,
"step": 639
},
{
"epoch": 0.46562386322299015,
"grad_norm": 1.3882042357040267,
"learning_rate": 5.797287784263047e-06,
"loss": 2.1376,
"step": 640
},
{
"epoch": 0.4663514005092761,
"grad_norm": 1.4087732429863524,
"learning_rate": 5.785661163922558e-06,
"loss": 2.2206,
"step": 641
},
{
"epoch": 0.46707893779556203,
"grad_norm": 1.4816673933482334,
"learning_rate": 5.774030186224786e-06,
"loss": 2.1835,
"step": 642
},
{
"epoch": 0.46780647508184797,
"grad_norm": 1.2407433002116413,
"learning_rate": 5.762394915676325e-06,
"loss": 2.1961,
"step": 643
},
{
"epoch": 0.46853401236813386,
"grad_norm": 1.273642310061063,
"learning_rate": 5.750755416807575e-06,
"loss": 2.1482,
"step": 644
},
{
"epoch": 0.4692615496544198,
"grad_norm": 1.4089871928103983,
"learning_rate": 5.7391117541723914e-06,
"loss": 2.1724,
"step": 645
},
{
"epoch": 0.46998908694070574,
"grad_norm": 1.5125196870593247,
"learning_rate": 5.727463992347719e-06,
"loss": 2.1689,
"step": 646
},
{
"epoch": 0.4707166242269916,
"grad_norm": 1.341889005253219,
"learning_rate": 5.715812195933238e-06,
"loss": 2.2176,
"step": 647
},
{
"epoch": 0.47144416151327756,
"grad_norm": 1.403686348651851,
"learning_rate": 5.704156429551004e-06,
"loss": 2.1759,
"step": 648
},
{
"epoch": 0.4721716987995635,
"grad_norm": 1.307605691657786,
"learning_rate": 5.692496757845092e-06,
"loss": 2.1926,
"step": 649
},
{
"epoch": 0.4728992360858494,
"grad_norm": 1.4323344759039234,
"learning_rate": 5.680833245481234e-06,
"loss": 2.2126,
"step": 650
},
{
"epoch": 0.4736267733721353,
"grad_norm": 1.5581057141318937,
"learning_rate": 5.6691659571464655e-06,
"loss": 2.2334,
"step": 651
},
{
"epoch": 0.47435431065842126,
"grad_norm": 1.4300789953117086,
"learning_rate": 5.657494957548761e-06,
"loss": 2.1843,
"step": 652
},
{
"epoch": 0.47508184794470715,
"grad_norm": 1.3970648098199812,
"learning_rate": 5.645820311416681e-06,
"loss": 2.2122,
"step": 653
},
{
"epoch": 0.4758093852309931,
"grad_norm": 1.3766394494849667,
"learning_rate": 5.63414208349901e-06,
"loss": 2.1505,
"step": 654
},
{
"epoch": 0.476536922517279,
"grad_norm": 1.2611851627806758,
"learning_rate": 5.622460338564393e-06,
"loss": 2.1835,
"step": 655
},
{
"epoch": 0.4772644598035649,
"grad_norm": 1.2689258379576362,
"learning_rate": 5.610775141400986e-06,
"loss": 2.1851,
"step": 656
},
{
"epoch": 0.47799199708985085,
"grad_norm": 1.2562703679535898,
"learning_rate": 5.599086556816089e-06,
"loss": 2.2069,
"step": 657
},
{
"epoch": 0.4787195343761368,
"grad_norm": 1.373649678079621,
"learning_rate": 5.587394649635789e-06,
"loss": 2.1818,
"step": 658
},
{
"epoch": 0.4794470716624227,
"grad_norm": 1.340447745306835,
"learning_rate": 5.575699484704599e-06,
"loss": 2.1518,
"step": 659
},
{
"epoch": 0.4801746089487086,
"grad_norm": 1.4409063267903557,
"learning_rate": 5.564001126885106e-06,
"loss": 2.2298,
"step": 660
},
{
"epoch": 0.48090214623499455,
"grad_norm": 1.4770825673856207,
"learning_rate": 5.552299641057596e-06,
"loss": 2.211,
"step": 661
},
{
"epoch": 0.48162968352128044,
"grad_norm": 1.451816328888107,
"learning_rate": 5.540595092119709e-06,
"loss": 2.2002,
"step": 662
},
{
"epoch": 0.4823572208075664,
"grad_norm": 1.237295472207834,
"learning_rate": 5.5288875449860745e-06,
"loss": 2.2193,
"step": 663
},
{
"epoch": 0.4830847580938523,
"grad_norm": 1.4771105112520675,
"learning_rate": 5.517177064587945e-06,
"loss": 2.1932,
"step": 664
},
{
"epoch": 0.48381229538013826,
"grad_norm": 1.3053846946139933,
"learning_rate": 5.505463715872846e-06,
"loss": 2.1545,
"step": 665
},
{
"epoch": 0.48453983266642414,
"grad_norm": 1.3508860777646363,
"learning_rate": 5.493747563804211e-06,
"loss": 2.1694,
"step": 666
},
{
"epoch": 0.4852673699527101,
"grad_norm": 1.2858010766706796,
"learning_rate": 5.482028673361015e-06,
"loss": 2.2014,
"step": 667
},
{
"epoch": 0.485994907238996,
"grad_norm": 1.3616832657950724,
"learning_rate": 5.470307109537427e-06,
"loss": 2.1389,
"step": 668
},
{
"epoch": 0.4867224445252819,
"grad_norm": 1.3973955367794273,
"learning_rate": 5.45858293734244e-06,
"loss": 2.1505,
"step": 669
},
{
"epoch": 0.48744998181156785,
"grad_norm": 1.2914653848603326,
"learning_rate": 5.446856221799515e-06,
"loss": 2.183,
"step": 670
},
{
"epoch": 0.4881775190978538,
"grad_norm": 1.2585865929216316,
"learning_rate": 5.435127027946215e-06,
"loss": 2.1943,
"step": 671
},
{
"epoch": 0.48890505638413967,
"grad_norm": 1.3052683558561404,
"learning_rate": 5.423395420833853e-06,
"loss": 2.176,
"step": 672
},
{
"epoch": 0.4896325936704256,
"grad_norm": 1.3873609358622778,
"learning_rate": 5.411661465527123e-06,
"loss": 2.1425,
"step": 673
},
{
"epoch": 0.49036013095671155,
"grad_norm": 1.2669745889132074,
"learning_rate": 5.39992522710374e-06,
"loss": 2.1792,
"step": 674
},
{
"epoch": 0.49108766824299743,
"grad_norm": 1.4194570179154562,
"learning_rate": 5.38818677065409e-06,
"loss": 2.2,
"step": 675
},
{
"epoch": 0.4918152055292834,
"grad_norm": 1.2404450036792771,
"learning_rate": 5.376446161280851e-06,
"loss": 2.2025,
"step": 676
},
{
"epoch": 0.4925427428155693,
"grad_norm": 1.3396645425218945,
"learning_rate": 5.364703464098645e-06,
"loss": 2.1413,
"step": 677
},
{
"epoch": 0.4932702801018552,
"grad_norm": 1.2987925328919214,
"learning_rate": 5.352958744233673e-06,
"loss": 2.1667,
"step": 678
},
{
"epoch": 0.49399781738814114,
"grad_norm": 1.7812213963044377,
"learning_rate": 5.341212066823356e-06,
"loss": 2.157,
"step": 679
},
{
"epoch": 0.4947253546744271,
"grad_norm": 1.292537524703904,
"learning_rate": 5.329463497015969e-06,
"loss": 2.201,
"step": 680
},
{
"epoch": 0.49545289196071296,
"grad_norm": 1.2361221322380314,
"learning_rate": 5.317713099970283e-06,
"loss": 2.1697,
"step": 681
},
{
"epoch": 0.4961804292469989,
"grad_norm": 1.2590867014183742,
"learning_rate": 5.305960940855205e-06,
"loss": 2.1641,
"step": 682
},
{
"epoch": 0.49690796653328484,
"grad_norm": 1.3424288308905323,
"learning_rate": 5.294207084849412e-06,
"loss": 2.1684,
"step": 683
},
{
"epoch": 0.4976355038195708,
"grad_norm": 1.294118009811616,
"learning_rate": 5.282451597140994e-06,
"loss": 2.213,
"step": 684
},
{
"epoch": 0.49836304110585666,
"grad_norm": 1.2748612300877948,
"learning_rate": 5.270694542927089e-06,
"loss": 2.1645,
"step": 685
},
{
"epoch": 0.4990905783921426,
"grad_norm": 1.4758373492064474,
"learning_rate": 5.258935987413524e-06,
"loss": 2.1795,
"step": 686
},
{
"epoch": 0.49981811567842854,
"grad_norm": 1.418081827629764,
"learning_rate": 5.247175995814452e-06,
"loss": 2.0981,
"step": 687
},
{
"epoch": 0.5005456529647144,
"grad_norm": 1.3395470503978169,
"learning_rate": 5.235414633351992e-06,
"loss": 2.1836,
"step": 688
},
{
"epoch": 0.5012731902510004,
"grad_norm": 1.2941659730123067,
"learning_rate": 5.223651965255864e-06,
"loss": 2.169,
"step": 689
},
{
"epoch": 0.5020007275372863,
"grad_norm": 1.575875089973837,
"learning_rate": 5.211888056763029e-06,
"loss": 2.1769,
"step": 690
},
{
"epoch": 0.5027282648235722,
"grad_norm": 1.3840535076365168,
"learning_rate": 5.20012297311733e-06,
"loss": 2.189,
"step": 691
},
{
"epoch": 0.5034558021098581,
"grad_norm": 1.4124848862463264,
"learning_rate": 5.188356779569125e-06,
"loss": 2.1797,
"step": 692
},
{
"epoch": 0.504183339396144,
"grad_norm": 1.3903900183285682,
"learning_rate": 5.176589541374929e-06,
"loss": 2.1851,
"step": 693
},
{
"epoch": 0.50491087668243,
"grad_norm": 1.346682971941151,
"learning_rate": 5.164821323797051e-06,
"loss": 2.1561,
"step": 694
},
{
"epoch": 0.5056384139687159,
"grad_norm": 1.2342634042885627,
"learning_rate": 5.1530521921032305e-06,
"loss": 2.1873,
"step": 695
},
{
"epoch": 0.5063659512550018,
"grad_norm": 1.289754702836016,
"learning_rate": 5.141282211566276e-06,
"loss": 2.2453,
"step": 696
},
{
"epoch": 0.5070934885412878,
"grad_norm": 1.2422208566945978,
"learning_rate": 5.129511447463705e-06,
"loss": 2.2103,
"step": 697
},
{
"epoch": 0.5078210258275737,
"grad_norm": 1.3065192995628705,
"learning_rate": 5.117739965077382e-06,
"loss": 2.2236,
"step": 698
},
{
"epoch": 0.5085485631138595,
"grad_norm": 1.3495238458160375,
"learning_rate": 5.105967829693155e-06,
"loss": 2.1476,
"step": 699
},
{
"epoch": 0.5092761004001455,
"grad_norm": 1.2325811406693374,
"learning_rate": 5.0941951066004906e-06,
"loss": 2.1687,
"step": 700
},
{
"epoch": 0.5100036376864314,
"grad_norm": 1.3631358419723048,
"learning_rate": 5.082421861092116e-06,
"loss": 2.1913,
"step": 701
},
{
"epoch": 0.5107311749727174,
"grad_norm": 1.270779013966519,
"learning_rate": 5.0706481584636605e-06,
"loss": 2.2431,
"step": 702
},
{
"epoch": 0.5114587122590033,
"grad_norm": 1.2036076608414459,
"learning_rate": 5.0588740640132805e-06,
"loss": 2.1895,
"step": 703
},
{
"epoch": 0.5121862495452892,
"grad_norm": 1.309299489371034,
"learning_rate": 5.047099643041312e-06,
"loss": 2.2308,
"step": 704
},
{
"epoch": 0.5129137868315751,
"grad_norm": 2.4163480824417887,
"learning_rate": 5.0353249608499e-06,
"loss": 2.2099,
"step": 705
},
{
"epoch": 0.513641324117861,
"grad_norm": 1.3384150940350374,
"learning_rate": 5.023550082742637e-06,
"loss": 2.1809,
"step": 706
},
{
"epoch": 0.514368861404147,
"grad_norm": 1.286056696964337,
"learning_rate": 5.011775074024202e-06,
"loss": 2.182,
"step": 707
},
{
"epoch": 0.5150963986904329,
"grad_norm": 1.3126296066456467,
"learning_rate": 5e-06,
"loss": 2.1946,
"step": 708
},
{
"epoch": 0.5158239359767188,
"grad_norm": 1.3270454866003318,
"learning_rate": 4.988224925975799e-06,
"loss": 2.1873,
"step": 709
},
{
"epoch": 0.5165514732630048,
"grad_norm": 1.2104803167234541,
"learning_rate": 4.976449917257365e-06,
"loss": 2.1754,
"step": 710
},
{
"epoch": 0.5172790105492906,
"grad_norm": 1.361700904797741,
"learning_rate": 4.964675039150102e-06,
"loss": 2.1479,
"step": 711
},
{
"epoch": 0.5180065478355765,
"grad_norm": 1.2525166720230223,
"learning_rate": 4.952900356958689e-06,
"loss": 2.1876,
"step": 712
},
{
"epoch": 0.5187340851218625,
"grad_norm": 1.3502833880969705,
"learning_rate": 4.941125935986721e-06,
"loss": 2.1657,
"step": 713
},
{
"epoch": 0.5194616224081484,
"grad_norm": 1.3396266138224724,
"learning_rate": 4.929351841536342e-06,
"loss": 2.1869,
"step": 714
},
{
"epoch": 0.5201891596944344,
"grad_norm": 1.2140015595334195,
"learning_rate": 4.917578138907884e-06,
"loss": 2.1803,
"step": 715
},
{
"epoch": 0.5209166969807203,
"grad_norm": 1.6002438838217814,
"learning_rate": 4.90580489339951e-06,
"loss": 2.1274,
"step": 716
},
{
"epoch": 0.5216442342670062,
"grad_norm": 1.3821865250885832,
"learning_rate": 4.894032170306846e-06,
"loss": 2.1731,
"step": 717
},
{
"epoch": 0.5223717715532921,
"grad_norm": 1.3845708527944702,
"learning_rate": 4.882260034922618e-06,
"loss": 2.1313,
"step": 718
},
{
"epoch": 0.523099308839578,
"grad_norm": 1.3576928213923576,
"learning_rate": 4.870488552536296e-06,
"loss": 2.1854,
"step": 719
},
{
"epoch": 0.5238268461258639,
"grad_norm": 1.3371555624306086,
"learning_rate": 4.858717788433725e-06,
"loss": 2.1803,
"step": 720
},
{
"epoch": 0.5245543834121499,
"grad_norm": 1.3113210170808254,
"learning_rate": 4.846947807896771e-06,
"loss": 2.1972,
"step": 721
},
{
"epoch": 0.5252819206984358,
"grad_norm": 1.2936893601356254,
"learning_rate": 4.83517867620295e-06,
"loss": 2.1742,
"step": 722
},
{
"epoch": 0.5260094579847218,
"grad_norm": 1.363329577036146,
"learning_rate": 4.823410458625072e-06,
"loss": 2.1583,
"step": 723
},
{
"epoch": 0.5267369952710076,
"grad_norm": 1.2816776945909583,
"learning_rate": 4.811643220430877e-06,
"loss": 2.2167,
"step": 724
},
{
"epoch": 0.5274645325572935,
"grad_norm": 1.4000479914079145,
"learning_rate": 4.7998770268826726e-06,
"loss": 2.1422,
"step": 725
},
{
"epoch": 0.5281920698435795,
"grad_norm": 1.2363482944769124,
"learning_rate": 4.788111943236973e-06,
"loss": 2.1708,
"step": 726
},
{
"epoch": 0.5289196071298654,
"grad_norm": 1.2634999042620405,
"learning_rate": 4.7763480347441395e-06,
"loss": 2.2168,
"step": 727
},
{
"epoch": 0.5296471444161513,
"grad_norm": 1.3655777462801062,
"learning_rate": 4.7645853666480104e-06,
"loss": 2.214,
"step": 728
},
{
"epoch": 0.5303746817024373,
"grad_norm": 1.3566078294042703,
"learning_rate": 4.752824004185548e-06,
"loss": 2.149,
"step": 729
},
{
"epoch": 0.5311022189887231,
"grad_norm": 1.4882940760804018,
"learning_rate": 4.7410640125864785e-06,
"loss": 2.1441,
"step": 730
},
{
"epoch": 0.5318297562750091,
"grad_norm": 1.4375750939214706,
"learning_rate": 4.729305457072913e-06,
"loss": 2.1757,
"step": 731
},
{
"epoch": 0.532557293561295,
"grad_norm": 1.3376613224457599,
"learning_rate": 4.717548402859008e-06,
"loss": 2.1585,
"step": 732
},
{
"epoch": 0.5332848308475809,
"grad_norm": 1.2605792646846594,
"learning_rate": 4.7057929151505895e-06,
"loss": 2.1889,
"step": 733
},
{
"epoch": 0.5340123681338669,
"grad_norm": 1.8650621006861559,
"learning_rate": 4.694039059144797e-06,
"loss": 2.1302,
"step": 734
},
{
"epoch": 0.5347399054201528,
"grad_norm": 1.4404601559351293,
"learning_rate": 4.6822869000297185e-06,
"loss": 2.1504,
"step": 735
},
{
"epoch": 0.5354674427064388,
"grad_norm": 1.3347908301393572,
"learning_rate": 4.670536502984033e-06,
"loss": 2.191,
"step": 736
},
{
"epoch": 0.5361949799927246,
"grad_norm": 1.50594842793546,
"learning_rate": 4.6587879331766465e-06,
"loss": 2.1525,
"step": 737
},
{
"epoch": 0.5369225172790105,
"grad_norm": 1.2677367459392634,
"learning_rate": 4.647041255766329e-06,
"loss": 2.1766,
"step": 738
},
{
"epoch": 0.5376500545652965,
"grad_norm": 1.2221286265080424,
"learning_rate": 4.6352965359013576e-06,
"loss": 2.171,
"step": 739
},
{
"epoch": 0.5383775918515824,
"grad_norm": 1.2878264260228098,
"learning_rate": 4.623553838719151e-06,
"loss": 2.1832,
"step": 740
},
{
"epoch": 0.5391051291378683,
"grad_norm": 1.2660416052678283,
"learning_rate": 4.611813229345911e-06,
"loss": 2.1712,
"step": 741
},
{
"epoch": 0.5398326664241543,
"grad_norm": 1.442141691261282,
"learning_rate": 4.6000747728962606e-06,
"loss": 2.1485,
"step": 742
},
{
"epoch": 0.5405602037104401,
"grad_norm": 1.2638028620122013,
"learning_rate": 4.588338534472878e-06,
"loss": 2.2101,
"step": 743
},
{
"epoch": 0.541287740996726,
"grad_norm": 1.3293217690788863,
"learning_rate": 4.576604579166147e-06,
"loss": 2.2138,
"step": 744
},
{
"epoch": 0.542015278283012,
"grad_norm": 1.3418992707760247,
"learning_rate": 4.564872972053786e-06,
"loss": 2.1954,
"step": 745
},
{
"epoch": 0.5427428155692979,
"grad_norm": 1.383674149920183,
"learning_rate": 4.553143778200486e-06,
"loss": 2.1756,
"step": 746
},
{
"epoch": 0.5434703528555839,
"grad_norm": 1.2522210322008165,
"learning_rate": 4.541417062657561e-06,
"loss": 2.1991,
"step": 747
},
{
"epoch": 0.5441978901418698,
"grad_norm": 1.1950219808865283,
"learning_rate": 4.529692890462574e-06,
"loss": 2.1462,
"step": 748
},
{
"epoch": 0.5449254274281556,
"grad_norm": 1.4821104806561844,
"learning_rate": 4.5179713266389866e-06,
"loss": 2.1622,
"step": 749
},
{
"epoch": 0.5456529647144416,
"grad_norm": 1.2743523641653294,
"learning_rate": 4.50625243619579e-06,
"loss": 2.2167,
"step": 750
},
{
"epoch": 0.5463805020007275,
"grad_norm": 1.3181368071817843,
"learning_rate": 4.494536284127155e-06,
"loss": 2.2139,
"step": 751
},
{
"epoch": 0.5471080392870135,
"grad_norm": 1.2418738111745269,
"learning_rate": 4.4828229354120565e-06,
"loss": 2.2264,
"step": 752
},
{
"epoch": 0.5478355765732994,
"grad_norm": 1.2885155053876325,
"learning_rate": 4.471112455013928e-06,
"loss": 2.2022,
"step": 753
},
{
"epoch": 0.5485631138595853,
"grad_norm": 1.3444450947561895,
"learning_rate": 4.459404907880293e-06,
"loss": 2.1748,
"step": 754
},
{
"epoch": 0.5492906511458713,
"grad_norm": 2.9905972047742733,
"learning_rate": 4.447700358942407e-06,
"loss": 2.1239,
"step": 755
},
{
"epoch": 0.5500181884321571,
"grad_norm": 1.473706001074463,
"learning_rate": 4.435998873114895e-06,
"loss": 2.1655,
"step": 756
},
{
"epoch": 0.550745725718443,
"grad_norm": 1.3501108667605346,
"learning_rate": 4.424300515295401e-06,
"loss": 2.1731,
"step": 757
},
{
"epoch": 0.551473263004729,
"grad_norm": 1.2964111110362575,
"learning_rate": 4.412605350364213e-06,
"loss": 2.1732,
"step": 758
},
{
"epoch": 0.5522008002910149,
"grad_norm": 1.3254861775621958,
"learning_rate": 4.400913443183913e-06,
"loss": 2.183,
"step": 759
},
{
"epoch": 0.5529283375773009,
"grad_norm": 1.449214814716554,
"learning_rate": 4.389224858599015e-06,
"loss": 2.1766,
"step": 760
},
{
"epoch": 0.5536558748635868,
"grad_norm": 1.3379961949853125,
"learning_rate": 4.377539661435608e-06,
"loss": 2.1253,
"step": 761
},
{
"epoch": 0.5543834121498726,
"grad_norm": 1.302585638735648,
"learning_rate": 4.365857916500991e-06,
"loss": 2.1778,
"step": 762
},
{
"epoch": 0.5551109494361586,
"grad_norm": 1.4822331492387226,
"learning_rate": 4.35417968858332e-06,
"loss": 2.1978,
"step": 763
},
{
"epoch": 0.5558384867224445,
"grad_norm": 1.5506151702217899,
"learning_rate": 4.3425050424512405e-06,
"loss": 2.1304,
"step": 764
},
{
"epoch": 0.5565660240087305,
"grad_norm": 1.3104067578648475,
"learning_rate": 4.330834042853537e-06,
"loss": 2.1714,
"step": 765
},
{
"epoch": 0.5572935612950164,
"grad_norm": 1.3051669299320896,
"learning_rate": 4.319166754518768e-06,
"loss": 2.1609,
"step": 766
},
{
"epoch": 0.5580210985813023,
"grad_norm": 1.229874722173343,
"learning_rate": 4.30750324215491e-06,
"loss": 2.2052,
"step": 767
},
{
"epoch": 0.5587486358675882,
"grad_norm": 1.3217497554258093,
"learning_rate": 4.295843570448998e-06,
"loss": 2.175,
"step": 768
},
{
"epoch": 0.5594761731538741,
"grad_norm": 1.4091315451317816,
"learning_rate": 4.284187804066764e-06,
"loss": 2.197,
"step": 769
},
{
"epoch": 0.56020371044016,
"grad_norm": 1.9162169406144054,
"learning_rate": 4.272536007652281e-06,
"loss": 2.1093,
"step": 770
},
{
"epoch": 0.560931247726446,
"grad_norm": 1.3754027757166483,
"learning_rate": 4.260888245827608e-06,
"loss": 2.2147,
"step": 771
},
{
"epoch": 0.5616587850127319,
"grad_norm": 1.3743512226246963,
"learning_rate": 4.249244583192425e-06,
"loss": 2.188,
"step": 772
},
{
"epoch": 0.5623863222990179,
"grad_norm": 1.5122563849193087,
"learning_rate": 4.237605084323676e-06,
"loss": 2.1581,
"step": 773
},
{
"epoch": 0.5631138595853038,
"grad_norm": 1.1836737555675112,
"learning_rate": 4.225969813775215e-06,
"loss": 2.1945,
"step": 774
},
{
"epoch": 0.5638413968715896,
"grad_norm": 1.4327467816072759,
"learning_rate": 4.214338836077444e-06,
"loss": 2.1444,
"step": 775
},
{
"epoch": 0.5645689341578756,
"grad_norm": 1.4050707246252088,
"learning_rate": 4.202712215736955e-06,
"loss": 2.1793,
"step": 776
},
{
"epoch": 0.5652964714441615,
"grad_norm": 1.239554329773933,
"learning_rate": 4.191090017236177e-06,
"loss": 2.2064,
"step": 777
},
{
"epoch": 0.5660240087304474,
"grad_norm": 1.2770561521182617,
"learning_rate": 4.1794723050330125e-06,
"loss": 2.2113,
"step": 778
},
{
"epoch": 0.5667515460167334,
"grad_norm": 1.3219438488411415,
"learning_rate": 4.167859143560484e-06,
"loss": 2.2279,
"step": 779
},
{
"epoch": 0.5674790833030193,
"grad_norm": 1.4154758042801283,
"learning_rate": 4.1562505972263735e-06,
"loss": 2.1746,
"step": 780
},
{
"epoch": 0.5682066205893052,
"grad_norm": 1.3740339531693826,
"learning_rate": 4.144646730412868e-06,
"loss": 2.1997,
"step": 781
},
{
"epoch": 0.5689341578755911,
"grad_norm": 3.777402142491512,
"learning_rate": 4.133047607476202e-06,
"loss": 2.1841,
"step": 782
},
{
"epoch": 0.569661695161877,
"grad_norm": 1.544999076264375,
"learning_rate": 4.121453292746297e-06,
"loss": 2.1451,
"step": 783
},
{
"epoch": 0.570389232448163,
"grad_norm": 1.3279022974010046,
"learning_rate": 4.109863850526413e-06,
"loss": 2.1742,
"step": 784
},
{
"epoch": 0.5711167697344489,
"grad_norm": 1.323190106700413,
"learning_rate": 4.098279345092783e-06,
"loss": 2.1796,
"step": 785
},
{
"epoch": 0.5718443070207349,
"grad_norm": 1.3152250504974496,
"learning_rate": 4.086699840694262e-06,
"loss": 2.1191,
"step": 786
},
{
"epoch": 0.5725718443070207,
"grad_norm": 1.5083030885832742,
"learning_rate": 4.075125401551963e-06,
"loss": 2.1534,
"step": 787
},
{
"epoch": 0.5732993815933066,
"grad_norm": 1.136113143067403,
"learning_rate": 4.063556091858914e-06,
"loss": 2.1837,
"step": 788
},
{
"epoch": 0.5740269188795926,
"grad_norm": 1.4698755700883925,
"learning_rate": 4.051991975779691e-06,
"loss": 2.1546,
"step": 789
},
{
"epoch": 0.5747544561658785,
"grad_norm": 1.3802123832715365,
"learning_rate": 4.040433117450066e-06,
"loss": 2.1632,
"step": 790
},
{
"epoch": 0.5754819934521644,
"grad_norm": 1.5354119147308578,
"learning_rate": 4.0288795809766516e-06,
"loss": 2.1918,
"step": 791
},
{
"epoch": 0.5762095307384504,
"grad_norm": 1.44019389860738,
"learning_rate": 4.017331430436543e-06,
"loss": 2.188,
"step": 792
},
{
"epoch": 0.5769370680247363,
"grad_norm": 1.3140332293999832,
"learning_rate": 4.005788729876968e-06,
"loss": 2.1619,
"step": 793
},
{
"epoch": 0.5776646053110222,
"grad_norm": 1.3595817834123527,
"learning_rate": 3.994251543314925e-06,
"loss": 2.1158,
"step": 794
},
{
"epoch": 0.5783921425973081,
"grad_norm": 1.3075801783310985,
"learning_rate": 3.982719934736832e-06,
"loss": 2.151,
"step": 795
},
{
"epoch": 0.579119679883594,
"grad_norm": 1.45652881963162,
"learning_rate": 3.971193968098172e-06,
"loss": 2.1715,
"step": 796
},
{
"epoch": 0.57984721716988,
"grad_norm": 1.1217645508627507,
"learning_rate": 3.959673707323135e-06,
"loss": 2.1998,
"step": 797
},
{
"epoch": 0.5805747544561659,
"grad_norm": 1.2189466042014208,
"learning_rate": 3.948159216304269e-06,
"loss": 2.1857,
"step": 798
},
{
"epoch": 0.5813022917424518,
"grad_norm": 1.3190108995893723,
"learning_rate": 3.93665055890212e-06,
"loss": 2.1988,
"step": 799
},
{
"epoch": 0.5820298290287377,
"grad_norm": 1.2178878026121687,
"learning_rate": 3.92514779894488e-06,
"loss": 2.1617,
"step": 800
},
{
"epoch": 0.5827573663150236,
"grad_norm": 1.3032425542197852,
"learning_rate": 3.9136510002280344e-06,
"loss": 2.2213,
"step": 801
},
{
"epoch": 0.5834849036013096,
"grad_norm": 1.5544389666656435,
"learning_rate": 3.902160226514007e-06,
"loss": 2.1795,
"step": 802
},
{
"epoch": 0.5842124408875955,
"grad_norm": 1.2429875863967428,
"learning_rate": 3.8906755415318045e-06,
"loss": 2.1962,
"step": 803
},
{
"epoch": 0.5849399781738814,
"grad_norm": 1.2878993650555384,
"learning_rate": 3.8791970089766665e-06,
"loss": 2.1809,
"step": 804
},
{
"epoch": 0.5856675154601674,
"grad_norm": 1.5981630336586339,
"learning_rate": 3.86772469250971e-06,
"loss": 2.1831,
"step": 805
},
{
"epoch": 0.5863950527464532,
"grad_norm": 1.3398972377634621,
"learning_rate": 3.856258655757578e-06,
"loss": 2.1683,
"step": 806
},
{
"epoch": 0.5871225900327391,
"grad_norm": 1.3902922273693485,
"learning_rate": 3.844798962312085e-06,
"loss": 2.1711,
"step": 807
},
{
"epoch": 0.5878501273190251,
"grad_norm": 1.5076390916392564,
"learning_rate": 3.833345675729863e-06,
"loss": 2.1873,
"step": 808
},
{
"epoch": 0.588577664605311,
"grad_norm": 1.3602095220019874,
"learning_rate": 3.821898859532013e-06,
"loss": 2.1572,
"step": 809
},
{
"epoch": 0.589305201891597,
"grad_norm": 1.3849483328488064,
"learning_rate": 3.8104585772037493e-06,
"loss": 2.1637,
"step": 810
},
{
"epoch": 0.5900327391778829,
"grad_norm": 1.3389495619269316,
"learning_rate": 3.7990248921940485e-06,
"loss": 2.1713,
"step": 811
},
{
"epoch": 0.5907602764641688,
"grad_norm": 1.3880194739199263,
"learning_rate": 3.787597867915303e-06,
"loss": 2.1837,
"step": 812
},
{
"epoch": 0.5914878137504547,
"grad_norm": 1.2861161541489807,
"learning_rate": 3.7761775677429567e-06,
"loss": 2.188,
"step": 813
},
{
"epoch": 0.5922153510367406,
"grad_norm": 1.2318992307646213,
"learning_rate": 3.7647640550151666e-06,
"loss": 2.1662,
"step": 814
},
{
"epoch": 0.5929428883230266,
"grad_norm": 1.1972675066127456,
"learning_rate": 3.7533573930324395e-06,
"loss": 2.2122,
"step": 815
},
{
"epoch": 0.5936704256093125,
"grad_norm": 1.341853588828462,
"learning_rate": 3.7419576450572924e-06,
"loss": 2.1221,
"step": 816
},
{
"epoch": 0.5943979628955984,
"grad_norm": 1.3121356522727252,
"learning_rate": 3.7305648743138966e-06,
"loss": 2.1702,
"step": 817
},
{
"epoch": 0.5951255001818844,
"grad_norm": 1.193623744711227,
"learning_rate": 3.7191791439877236e-06,
"loss": 2.1873,
"step": 818
},
{
"epoch": 0.5958530374681702,
"grad_norm": 2.1904797654328534,
"learning_rate": 3.7078005172252015e-06,
"loss": 2.2182,
"step": 819
},
{
"epoch": 0.5965805747544561,
"grad_norm": 1.2499680138753542,
"learning_rate": 3.6964290571333583e-06,
"loss": 2.1625,
"step": 820
},
{
"epoch": 0.5973081120407421,
"grad_norm": 1.452278370561498,
"learning_rate": 3.6850648267794776e-06,
"loss": 2.0981,
"step": 821
},
{
"epoch": 0.598035649327028,
"grad_norm": 1.3248797969544868,
"learning_rate": 3.673707889190744e-06,
"loss": 2.1693,
"step": 822
},
{
"epoch": 0.598763186613314,
"grad_norm": 1.3935529244008626,
"learning_rate": 3.662358307353897e-06,
"loss": 2.1635,
"step": 823
},
{
"epoch": 0.5994907238995999,
"grad_norm": 1.2660094629060932,
"learning_rate": 3.6510161442148783e-06,
"loss": 2.0612,
"step": 824
},
{
"epoch": 0.6002182611858857,
"grad_norm": 1.1899096410575125,
"learning_rate": 3.63968146267849e-06,
"loss": 2.1582,
"step": 825
},
{
"epoch": 0.6009457984721717,
"grad_norm": 1.4707524646381498,
"learning_rate": 3.6283543256080334e-06,
"loss": 2.1861,
"step": 826
},
{
"epoch": 0.6016733357584576,
"grad_norm": 2.2990139344746043,
"learning_rate": 3.6170347958249728e-06,
"loss": 2.1556,
"step": 827
},
{
"epoch": 0.6024008730447435,
"grad_norm": 1.2985950672345654,
"learning_rate": 3.605722936108579e-06,
"loss": 2.1848,
"step": 828
},
{
"epoch": 0.6031284103310295,
"grad_norm": 1.2578946342605386,
"learning_rate": 3.5944188091955843e-06,
"loss": 2.1714,
"step": 829
},
{
"epoch": 0.6038559476173154,
"grad_norm": 1.2748819645637657,
"learning_rate": 3.5831224777798346e-06,
"loss": 2.1336,
"step": 830
},
{
"epoch": 0.6045834849036014,
"grad_norm": 1.416638405798943,
"learning_rate": 3.5718340045119416e-06,
"loss": 2.1863,
"step": 831
},
{
"epoch": 0.6053110221898872,
"grad_norm": 1.2823866800966135,
"learning_rate": 3.5605534519989327e-06,
"loss": 2.1921,
"step": 832
},
{
"epoch": 0.6060385594761731,
"grad_norm": 1.261745846438069,
"learning_rate": 3.5492808828039083e-06,
"loss": 2.1546,
"step": 833
},
{
"epoch": 0.6067660967624591,
"grad_norm": 1.2333351495616436,
"learning_rate": 3.538016359445692e-06,
"loss": 2.0962,
"step": 834
},
{
"epoch": 0.607493634048745,
"grad_norm": 1.241100958102387,
"learning_rate": 3.5267599443984848e-06,
"loss": 2.1731,
"step": 835
},
{
"epoch": 0.608221171335031,
"grad_norm": 1.4719975452264942,
"learning_rate": 3.5155117000915153e-06,
"loss": 2.1652,
"step": 836
},
{
"epoch": 0.6089487086213169,
"grad_norm": 1.2062716693873263,
"learning_rate": 3.5042716889086998e-06,
"loss": 2.1553,
"step": 837
},
{
"epoch": 0.6096762459076027,
"grad_norm": 1.3613990068473611,
"learning_rate": 3.493039973188289e-06,
"loss": 2.1603,
"step": 838
},
{
"epoch": 0.6104037831938887,
"grad_norm": 1.2656018128816646,
"learning_rate": 3.481816615222533e-06,
"loss": 2.1789,
"step": 839
},
{
"epoch": 0.6111313204801746,
"grad_norm": 1.4240615172025373,
"learning_rate": 3.470601677257323e-06,
"loss": 2.1792,
"step": 840
},
{
"epoch": 0.6118588577664605,
"grad_norm": 1.4177889412892386,
"learning_rate": 3.459395221491853e-06,
"loss": 2.1296,
"step": 841
},
{
"epoch": 0.6125863950527465,
"grad_norm": 1.2845162857537746,
"learning_rate": 3.4481973100782756e-06,
"loss": 2.1477,
"step": 842
},
{
"epoch": 0.6133139323390324,
"grad_norm": 1.1999612596610867,
"learning_rate": 3.4370080051213527e-06,
"loss": 2.1877,
"step": 843
},
{
"epoch": 0.6140414696253182,
"grad_norm": 1.173270406358778,
"learning_rate": 3.4258273686781156e-06,
"loss": 2.1612,
"step": 844
},
{
"epoch": 0.6147690069116042,
"grad_norm": 1.3127766990292529,
"learning_rate": 3.4146554627575207e-06,
"loss": 2.1637,
"step": 845
},
{
"epoch": 0.6154965441978901,
"grad_norm": 1.3033434445301304,
"learning_rate": 3.403492349320101e-06,
"loss": 2.152,
"step": 846
},
{
"epoch": 0.6162240814841761,
"grad_norm": 1.2129577650821965,
"learning_rate": 3.392338090277628e-06,
"loss": 2.1982,
"step": 847
},
{
"epoch": 0.616951618770462,
"grad_norm": 1.2402068492200884,
"learning_rate": 3.3811927474927644e-06,
"loss": 2.2277,
"step": 848
},
{
"epoch": 0.617679156056748,
"grad_norm": 1.2610330081583359,
"learning_rate": 3.3700563827787224e-06,
"loss": 2.1312,
"step": 849
},
{
"epoch": 0.6184066933430338,
"grad_norm": 1.2221232125577248,
"learning_rate": 3.358929057898922e-06,
"loss": 2.1795,
"step": 850
},
{
"epoch": 0.6191342306293197,
"grad_norm": 1.367240054820803,
"learning_rate": 3.3478108345666456e-06,
"loss": 2.1599,
"step": 851
},
{
"epoch": 0.6198617679156057,
"grad_norm": 1.2533884095301935,
"learning_rate": 3.3367017744446995e-06,
"loss": 2.1798,
"step": 852
},
{
"epoch": 0.6205893052018916,
"grad_norm": 1.445873029531831,
"learning_rate": 3.3256019391450696e-06,
"loss": 2.189,
"step": 853
},
{
"epoch": 0.6213168424881775,
"grad_norm": 1.5308918859452214,
"learning_rate": 3.314511390228578e-06,
"loss": 2.1167,
"step": 854
},
{
"epoch": 0.6220443797744635,
"grad_norm": 1.1935640999794672,
"learning_rate": 3.303430189204545e-06,
"loss": 2.1835,
"step": 855
},
{
"epoch": 0.6227719170607494,
"grad_norm": 1.2758745534483429,
"learning_rate": 3.2923583975304474e-06,
"loss": 2.189,
"step": 856
},
{
"epoch": 0.6234994543470352,
"grad_norm": 1.2812582557317376,
"learning_rate": 3.2812960766115747e-06,
"loss": 2.1764,
"step": 857
},
{
"epoch": 0.6242269916333212,
"grad_norm": 1.3052844302185176,
"learning_rate": 3.270243287800693e-06,
"loss": 2.1596,
"step": 858
},
{
"epoch": 0.6249545289196071,
"grad_norm": 1.1735756944101055,
"learning_rate": 3.2592000923976997e-06,
"loss": 2.1914,
"step": 859
},
{
"epoch": 0.6256820662058931,
"grad_norm": 1.3820135946650651,
"learning_rate": 3.2481665516492876e-06,
"loss": 2.1671,
"step": 860
},
{
"epoch": 0.626409603492179,
"grad_norm": 1.2951883382314702,
"learning_rate": 3.2371427267486044e-06,
"loss": 2.1767,
"step": 861
},
{
"epoch": 0.6271371407784649,
"grad_norm": 1.6538268336475825,
"learning_rate": 3.2261286788349127e-06,
"loss": 2.1366,
"step": 862
},
{
"epoch": 0.6278646780647508,
"grad_norm": 1.2576633846796257,
"learning_rate": 3.2151244689932505e-06,
"loss": 2.1383,
"step": 863
},
{
"epoch": 0.6285922153510367,
"grad_norm": 1.2026846661371544,
"learning_rate": 3.2041301582540903e-06,
"loss": 2.1615,
"step": 864
},
{
"epoch": 0.6293197526373226,
"grad_norm": 1.9677840709401988,
"learning_rate": 3.1931458075930046e-06,
"loss": 2.1896,
"step": 865
},
{
"epoch": 0.6300472899236086,
"grad_norm": 1.3114255471930778,
"learning_rate": 3.182171477930332e-06,
"loss": 2.2036,
"step": 866
},
{
"epoch": 0.6307748272098945,
"grad_norm": 1.5993419978853645,
"learning_rate": 3.171207230130826e-06,
"loss": 2.175,
"step": 867
},
{
"epoch": 0.6315023644961805,
"grad_norm": 1.3379682301899738,
"learning_rate": 3.1602531250033286e-06,
"loss": 2.1345,
"step": 868
},
{
"epoch": 0.6322299017824663,
"grad_norm": 1.1605864226425355,
"learning_rate": 3.149309223300428e-06,
"loss": 2.1998,
"step": 869
},
{
"epoch": 0.6329574390687522,
"grad_norm": 1.3863307023900937,
"learning_rate": 3.1383755857181253e-06,
"loss": 2.1669,
"step": 870
},
{
"epoch": 0.6336849763550382,
"grad_norm": 1.253318814192694,
"learning_rate": 3.1274522728954928e-06,
"loss": 2.1333,
"step": 871
},
{
"epoch": 0.6344125136413241,
"grad_norm": 1.2058026824056487,
"learning_rate": 3.1165393454143423e-06,
"loss": 2.1651,
"step": 872
},
{
"epoch": 0.6351400509276101,
"grad_norm": 1.1644040700745641,
"learning_rate": 3.1056368637988876e-06,
"loss": 2.0699,
"step": 873
},
{
"epoch": 0.635867588213896,
"grad_norm": 1.20675303117915,
"learning_rate": 3.0947448885154085e-06,
"loss": 2.1484,
"step": 874
},
{
"epoch": 0.6365951255001819,
"grad_norm": 1.2391348218319218,
"learning_rate": 3.0838634799719157e-06,
"loss": 2.2138,
"step": 875
},
{
"epoch": 0.6373226627864678,
"grad_norm": 1.2114878066432457,
"learning_rate": 3.072992698517815e-06,
"loss": 2.2021,
"step": 876
},
{
"epoch": 0.6380502000727537,
"grad_norm": 1.3283135525126404,
"learning_rate": 3.0621326044435738e-06,
"loss": 2.1344,
"step": 877
},
{
"epoch": 0.6387777373590396,
"grad_norm": 1.3236221007892095,
"learning_rate": 3.0512832579803873e-06,
"loss": 2.1644,
"step": 878
},
{
"epoch": 0.6395052746453256,
"grad_norm": 1.1667265916431366,
"learning_rate": 3.0404447192998398e-06,
"loss": 2.182,
"step": 879
},
{
"epoch": 0.6402328119316115,
"grad_norm": 1.204222459042279,
"learning_rate": 3.029617048513579e-06,
"loss": 2.1453,
"step": 880
},
{
"epoch": 0.6409603492178975,
"grad_norm": 1.3308367139766337,
"learning_rate": 3.0188003056729752e-06,
"loss": 2.1893,
"step": 881
},
{
"epoch": 0.6416878865041833,
"grad_norm": 1.1960192121658078,
"learning_rate": 3.007994550768793e-06,
"loss": 2.1942,
"step": 882
},
{
"epoch": 0.6424154237904692,
"grad_norm": 1.145892797210482,
"learning_rate": 2.9971998437308546e-06,
"loss": 2.1857,
"step": 883
},
{
"epoch": 0.6431429610767552,
"grad_norm": 1.535073515408013,
"learning_rate": 2.9864162444277118e-06,
"loss": 2.154,
"step": 884
},
{
"epoch": 0.6438704983630411,
"grad_norm": 1.19226809686383,
"learning_rate": 2.97564381266631e-06,
"loss": 2.1522,
"step": 885
},
{
"epoch": 0.644598035649327,
"grad_norm": 1.283458540798743,
"learning_rate": 2.964882608191659e-06,
"loss": 2.1653,
"step": 886
},
{
"epoch": 0.645325572935613,
"grad_norm": 1.1444227382316488,
"learning_rate": 2.954132690686502e-06,
"loss": 2.1985,
"step": 887
},
{
"epoch": 0.6460531102218988,
"grad_norm": 1.3133662020784518,
"learning_rate": 2.9433941197709813e-06,
"loss": 2.0988,
"step": 888
},
{
"epoch": 0.6467806475081848,
"grad_norm": 9.045196136906558,
"learning_rate": 2.9326669550023124e-06,
"loss": 2.1569,
"step": 889
},
{
"epoch": 0.6475081847944707,
"grad_norm": 1.318633177919385,
"learning_rate": 2.921951255874449e-06,
"loss": 2.1826,
"step": 890
},
{
"epoch": 0.6482357220807566,
"grad_norm": 1.2574589759053059,
"learning_rate": 2.9112470818177563e-06,
"loss": 2.2294,
"step": 891
},
{
"epoch": 0.6489632593670426,
"grad_norm": 1.5320788443566247,
"learning_rate": 2.9005544921986774e-06,
"loss": 2.1152,
"step": 892
},
{
"epoch": 0.6496907966533285,
"grad_norm": 1.3393258246398236,
"learning_rate": 2.8898735463194128e-06,
"loss": 2.1454,
"step": 893
},
{
"epoch": 0.6504183339396145,
"grad_norm": 1.2099327172531733,
"learning_rate": 2.8792043034175817e-06,
"loss": 2.1981,
"step": 894
},
{
"epoch": 0.6511458712259003,
"grad_norm": 1.4423610907708257,
"learning_rate": 2.8685468226658974e-06,
"loss": 2.2454,
"step": 895
},
{
"epoch": 0.6518734085121862,
"grad_norm": 1.50482757871637,
"learning_rate": 2.85790116317184e-06,
"loss": 2.2105,
"step": 896
},
{
"epoch": 0.6526009457984722,
"grad_norm": 1.2444683591992163,
"learning_rate": 2.8472673839773267e-06,
"loss": 2.1572,
"step": 897
},
{
"epoch": 0.6533284830847581,
"grad_norm": 1.2427782247532162,
"learning_rate": 2.8366455440583874e-06,
"loss": 2.1902,
"step": 898
},
{
"epoch": 0.654056020371044,
"grad_norm": 1.162919929796855,
"learning_rate": 2.8260357023248323e-06,
"loss": 2.1376,
"step": 899
},
{
"epoch": 0.65478355765733,
"grad_norm": 1.2691029843654758,
"learning_rate": 2.815437917619932e-06,
"loss": 2.2438,
"step": 900
},
{
"epoch": 0.6555110949436158,
"grad_norm": 1.3335556247541136,
"learning_rate": 2.804852248720085e-06,
"loss": 2.179,
"step": 901
},
{
"epoch": 0.6562386322299018,
"grad_norm": 1.6825656470777264,
"learning_rate": 2.7942787543344957e-06,
"loss": 2.2047,
"step": 902
},
{
"epoch": 0.6569661695161877,
"grad_norm": 1.2506157526312551,
"learning_rate": 2.783717493104846e-06,
"loss": 2.1322,
"step": 903
},
{
"epoch": 0.6576937068024736,
"grad_norm": 1.2507417772023046,
"learning_rate": 2.7731685236049745e-06,
"loss": 2.2024,
"step": 904
},
{
"epoch": 0.6584212440887596,
"grad_norm": 1.2322326595801811,
"learning_rate": 2.762631904340546e-06,
"loss": 2.1625,
"step": 905
},
{
"epoch": 0.6591487813750455,
"grad_norm": 1.4118755276611434,
"learning_rate": 2.7521076937487248e-06,
"loss": 2.1307,
"step": 906
},
{
"epoch": 0.6598763186613313,
"grad_norm": 1.4424042343111918,
"learning_rate": 2.7415959501978674e-06,
"loss": 2.1514,
"step": 907
},
{
"epoch": 0.6606038559476173,
"grad_norm": 1.2750452237109262,
"learning_rate": 2.731096731987177e-06,
"loss": 2.1527,
"step": 908
},
{
"epoch": 0.6613313932339032,
"grad_norm": 1.1683736874051467,
"learning_rate": 2.7206100973463958e-06,
"loss": 2.1582,
"step": 909
},
{
"epoch": 0.6620589305201892,
"grad_norm": 1.2299439828061258,
"learning_rate": 2.71013610443547e-06,
"loss": 2.128,
"step": 910
},
{
"epoch": 0.6627864678064751,
"grad_norm": 1.6816303465805231,
"learning_rate": 2.6996748113442397e-06,
"loss": 2.1404,
"step": 911
},
{
"epoch": 0.663514005092761,
"grad_norm": 1.2618180205618854,
"learning_rate": 2.689226276092107e-06,
"loss": 2.1719,
"step": 912
},
{
"epoch": 0.664241542379047,
"grad_norm": 1.2228966712350884,
"learning_rate": 2.6787905566277185e-06,
"loss": 2.12,
"step": 913
},
{
"epoch": 0.6649690796653328,
"grad_norm": 1.2443352246586448,
"learning_rate": 2.6683677108286423e-06,
"loss": 2.1936,
"step": 914
},
{
"epoch": 0.6656966169516187,
"grad_norm": 1.3458440849116249,
"learning_rate": 2.65795779650105e-06,
"loss": 2.1508,
"step": 915
},
{
"epoch": 0.6664241542379047,
"grad_norm": 1.2361743789072361,
"learning_rate": 2.6475608713793923e-06,
"loss": 2.1627,
"step": 916
},
{
"epoch": 0.6671516915241906,
"grad_norm": 1.2303373889513625,
"learning_rate": 2.6371769931260806e-06,
"loss": 2.1704,
"step": 917
},
{
"epoch": 0.6678792288104766,
"grad_norm": 1.2708199715941912,
"learning_rate": 2.6268062193311672e-06,
"loss": 2.1186,
"step": 918
},
{
"epoch": 0.6686067660967625,
"grad_norm": 1.2294325949492098,
"learning_rate": 2.6164486075120245e-06,
"loss": 2.108,
"step": 919
},
{
"epoch": 0.6693343033830483,
"grad_norm": 1.1475392443431833,
"learning_rate": 2.606104215113033e-06,
"loss": 2.1766,
"step": 920
},
{
"epoch": 0.6700618406693343,
"grad_norm": 1.1625492019817114,
"learning_rate": 2.5957730995052477e-06,
"loss": 2.1743,
"step": 921
},
{
"epoch": 0.6707893779556202,
"grad_norm": 1.2367499609432473,
"learning_rate": 2.585455317986095e-06,
"loss": 2.1893,
"step": 922
},
{
"epoch": 0.6715169152419062,
"grad_norm": 1.2430383905991158,
"learning_rate": 2.5751509277790487e-06,
"loss": 2.2044,
"step": 923
},
{
"epoch": 0.6722444525281921,
"grad_norm": 1.5168046892041702,
"learning_rate": 2.5648599860333122e-06,
"loss": 2.1738,
"step": 924
},
{
"epoch": 0.672971989814478,
"grad_norm": 1.2063096926991026,
"learning_rate": 2.554582549823502e-06,
"loss": 2.1727,
"step": 925
},
{
"epoch": 0.6736995271007639,
"grad_norm": 1.2758063180302197,
"learning_rate": 2.5443186761493327e-06,
"loss": 2.1567,
"step": 926
},
{
"epoch": 0.6744270643870498,
"grad_norm": 1.1824740849643813,
"learning_rate": 2.5340684219352977e-06,
"loss": 2.1925,
"step": 927
},
{
"epoch": 0.6751546016733357,
"grad_norm": 1.1643463010411315,
"learning_rate": 2.523831844030358e-06,
"loss": 2.1241,
"step": 928
},
{
"epoch": 0.6758821389596217,
"grad_norm": 1.1286757655919661,
"learning_rate": 2.513608999207622e-06,
"loss": 2.1638,
"step": 929
},
{
"epoch": 0.6766096762459076,
"grad_norm": 1.5947207232710128,
"learning_rate": 2.503399944164035e-06,
"loss": 2.1467,
"step": 930
},
{
"epoch": 0.6773372135321936,
"grad_norm": 1.5508691613737382,
"learning_rate": 2.4932047355200613e-06,
"loss": 2.1744,
"step": 931
},
{
"epoch": 0.6780647508184795,
"grad_norm": 1.1974290750839893,
"learning_rate": 2.483023429819372e-06,
"loss": 2.1832,
"step": 932
},
{
"epoch": 0.6787922881047653,
"grad_norm": 1.2618835690498464,
"learning_rate": 2.472856083528531e-06,
"loss": 2.1972,
"step": 933
},
{
"epoch": 0.6795198253910513,
"grad_norm": 1.2782330367865011,
"learning_rate": 2.4627027530366836e-06,
"loss": 2.1451,
"step": 934
},
{
"epoch": 0.6802473626773372,
"grad_norm": 1.436706243744654,
"learning_rate": 2.4525634946552405e-06,
"loss": 2.2135,
"step": 935
},
{
"epoch": 0.6809748999636231,
"grad_norm": 1.411133966277898,
"learning_rate": 2.442438364617567e-06,
"loss": 2.1562,
"step": 936
},
{
"epoch": 0.6817024372499091,
"grad_norm": 1.2169700146486773,
"learning_rate": 2.4323274190786703e-06,
"loss": 2.2271,
"step": 937
},
{
"epoch": 0.682429974536195,
"grad_norm": 1.3255427285044914,
"learning_rate": 2.422230714114891e-06,
"loss": 2.0765,
"step": 938
},
{
"epoch": 0.6831575118224809,
"grad_norm": 1.2230803856218881,
"learning_rate": 2.4121483057235884e-06,
"loss": 2.1744,
"step": 939
},
{
"epoch": 0.6838850491087668,
"grad_norm": 1.1731216113786644,
"learning_rate": 2.4020802498228333e-06,
"loss": 2.1297,
"step": 940
},
{
"epoch": 0.6846125863950527,
"grad_norm": 1.2488772581044372,
"learning_rate": 2.392026602251093e-06,
"loss": 2.1954,
"step": 941
},
{
"epoch": 0.6853401236813387,
"grad_norm": 1.2800008196295536,
"learning_rate": 2.3819874187669266e-06,
"loss": 2.1727,
"step": 942
},
{
"epoch": 0.6860676609676246,
"grad_norm": 1.2834051636026926,
"learning_rate": 2.371962755048675e-06,
"loss": 2.1916,
"step": 943
},
{
"epoch": 0.6867951982539106,
"grad_norm": 1.1954226450289407,
"learning_rate": 2.36195266669415e-06,
"loss": 2.1212,
"step": 944
},
{
"epoch": 0.6875227355401964,
"grad_norm": 1.57141550563432,
"learning_rate": 2.351957209220326e-06,
"loss": 2.1853,
"step": 945
},
{
"epoch": 0.6882502728264823,
"grad_norm": 1.2182134837350904,
"learning_rate": 2.341976438063035e-06,
"loss": 2.146,
"step": 946
},
{
"epoch": 0.6889778101127683,
"grad_norm": 1.2384002744557308,
"learning_rate": 2.332010408576653e-06,
"loss": 2.1524,
"step": 947
},
{
"epoch": 0.6897053473990542,
"grad_norm": 1.1238339859507342,
"learning_rate": 2.3220591760338046e-06,
"loss": 2.1538,
"step": 948
},
{
"epoch": 0.6904328846853401,
"grad_norm": 1.287927410530698,
"learning_rate": 2.3121227956250435e-06,
"loss": 2.1496,
"step": 949
},
{
"epoch": 0.6911604219716261,
"grad_norm": 1.2592066618116187,
"learning_rate": 2.302201322458552e-06,
"loss": 2.121,
"step": 950
},
{
"epoch": 0.691887959257912,
"grad_norm": 1.2131741302198513,
"learning_rate": 2.292294811559837e-06,
"loss": 2.2045,
"step": 951
},
{
"epoch": 0.6926154965441979,
"grad_norm": 1.4467989027206576,
"learning_rate": 2.282403317871422e-06,
"loss": 2.1203,
"step": 952
},
{
"epoch": 0.6933430338304838,
"grad_norm": 1.4648976666718256,
"learning_rate": 2.2725268962525454e-06,
"loss": 2.1879,
"step": 953
},
{
"epoch": 0.6940705711167697,
"grad_norm": 1.501178435831737,
"learning_rate": 2.262665601478852e-06,
"loss": 2.2198,
"step": 954
},
{
"epoch": 0.6947981084030557,
"grad_norm": 1.2540967608340094,
"learning_rate": 2.252819488242093e-06,
"loss": 2.2109,
"step": 955
},
{
"epoch": 0.6955256456893416,
"grad_norm": 1.3412153228190293,
"learning_rate": 2.24298861114982e-06,
"loss": 2.1885,
"step": 956
},
{
"epoch": 0.6962531829756275,
"grad_norm": 1.1811258333483128,
"learning_rate": 2.2331730247250857e-06,
"loss": 2.1559,
"step": 957
},
{
"epoch": 0.6969807202619134,
"grad_norm": 1.3587367408455937,
"learning_rate": 2.223372783406137e-06,
"loss": 2.1716,
"step": 958
},
{
"epoch": 0.6977082575481993,
"grad_norm": 1.2034426964114804,
"learning_rate": 2.2135879415461152e-06,
"loss": 2.0977,
"step": 959
},
{
"epoch": 0.6984357948344853,
"grad_norm": 1.2127358120385474,
"learning_rate": 2.203818553412757e-06,
"loss": 2.1677,
"step": 960
},
{
"epoch": 0.6991633321207712,
"grad_norm": 1.0992839512542802,
"learning_rate": 2.1940646731880887e-06,
"loss": 2.1886,
"step": 961
},
{
"epoch": 0.6998908694070571,
"grad_norm": 1.2635019786683503,
"learning_rate": 2.1843263549681287e-06,
"loss": 2.177,
"step": 962
},
{
"epoch": 0.7006184066933431,
"grad_norm": 1.1771389092011315,
"learning_rate": 2.174603652762588e-06,
"loss": 2.1845,
"step": 963
},
{
"epoch": 0.7013459439796289,
"grad_norm": 1.232918716717663,
"learning_rate": 2.164896620494569e-06,
"loss": 2.1865,
"step": 964
},
{
"epoch": 0.7020734812659148,
"grad_norm": 1.2162200340200031,
"learning_rate": 2.1552053120002655e-06,
"loss": 2.2218,
"step": 965
},
{
"epoch": 0.7028010185522008,
"grad_norm": 1.210396704267297,
"learning_rate": 2.145529781028668e-06,
"loss": 2.2157,
"step": 966
},
{
"epoch": 0.7035285558384867,
"grad_norm": 1.174279238426189,
"learning_rate": 2.1358700812412625e-06,
"loss": 2.1556,
"step": 967
},
{
"epoch": 0.7042560931247727,
"grad_norm": 1.2262375990076664,
"learning_rate": 2.1262262662117327e-06,
"loss": 2.1585,
"step": 968
},
{
"epoch": 0.7049836304110586,
"grad_norm": 1.2564233079440728,
"learning_rate": 2.1165983894256647e-06,
"loss": 2.1624,
"step": 969
},
{
"epoch": 0.7057111676973445,
"grad_norm": 2.68545550948343,
"learning_rate": 2.1069865042802502e-06,
"loss": 2.1878,
"step": 970
},
{
"epoch": 0.7064387049836304,
"grad_norm": 1.586940340896605,
"learning_rate": 2.0973906640839867e-06,
"loss": 2.1582,
"step": 971
},
{
"epoch": 0.7071662422699163,
"grad_norm": 1.2373753331763884,
"learning_rate": 2.0878109220563884e-06,
"loss": 2.1438,
"step": 972
},
{
"epoch": 0.7078937795562023,
"grad_norm": 1.2072718942816432,
"learning_rate": 2.078247331327685e-06,
"loss": 2.1775,
"step": 973
},
{
"epoch": 0.7086213168424882,
"grad_norm": 2.0527719084538005,
"learning_rate": 2.0686999449385286e-06,
"loss": 2.1157,
"step": 974
},
{
"epoch": 0.7093488541287741,
"grad_norm": 1.4912010984015167,
"learning_rate": 2.0591688158397054e-06,
"loss": 2.1851,
"step": 975
},
{
"epoch": 0.7100763914150601,
"grad_norm": 1.1603288071139832,
"learning_rate": 2.0496539968918342e-06,
"loss": 2.2072,
"step": 976
},
{
"epoch": 0.7108039287013459,
"grad_norm": 1.2710873811707057,
"learning_rate": 2.0401555408650714e-06,
"loss": 2.1385,
"step": 977
},
{
"epoch": 0.7115314659876318,
"grad_norm": 1.1827296569891017,
"learning_rate": 2.030673500438828e-06,
"loss": 2.1932,
"step": 978
},
{
"epoch": 0.7122590032739178,
"grad_norm": 1.2118848171068402,
"learning_rate": 2.0212079282014725e-06,
"loss": 2.1849,
"step": 979
},
{
"epoch": 0.7129865405602037,
"grad_norm": 1.1819279466854862,
"learning_rate": 2.0117588766500375e-06,
"loss": 2.1489,
"step": 980
},
{
"epoch": 0.7137140778464897,
"grad_norm": 1.2566646302176254,
"learning_rate": 2.002326398189931e-06,
"loss": 2.1535,
"step": 981
},
{
"epoch": 0.7144416151327756,
"grad_norm": 1.1881989349766866,
"learning_rate": 1.9929105451346436e-06,
"loss": 2.1631,
"step": 982
},
{
"epoch": 0.7151691524190614,
"grad_norm": 1.1577242634231006,
"learning_rate": 1.983511369705462e-06,
"loss": 2.173,
"step": 983
},
{
"epoch": 0.7158966897053474,
"grad_norm": 1.4225822847082645,
"learning_rate": 1.9741289240311757e-06,
"loss": 2.1893,
"step": 984
},
{
"epoch": 0.7166242269916333,
"grad_norm": 1.2226780041724277,
"learning_rate": 1.9647632601477877e-06,
"loss": 2.1584,
"step": 985
},
{
"epoch": 0.7173517642779192,
"grad_norm": 1.2859995557359045,
"learning_rate": 1.9554144299982314e-06,
"loss": 2.1452,
"step": 986
},
{
"epoch": 0.7180793015642052,
"grad_norm": 1.2099058922003745,
"learning_rate": 1.9460824854320755e-06,
"loss": 2.1644,
"step": 987
},
{
"epoch": 0.7188068388504911,
"grad_norm": 1.1443095152375087,
"learning_rate": 1.9367674782052376e-06,
"loss": 2.1687,
"step": 988
},
{
"epoch": 0.7195343761367771,
"grad_norm": 1.2280151800014574,
"learning_rate": 1.9274694599797067e-06,
"loss": 2.204,
"step": 989
},
{
"epoch": 0.7202619134230629,
"grad_norm": 1.2141865430162755,
"learning_rate": 1.918188482323242e-06,
"loss": 2.1757,
"step": 990
},
{
"epoch": 0.7209894507093488,
"grad_norm": 1.382072714954584,
"learning_rate": 1.9089245967090952e-06,
"loss": 2.171,
"step": 991
},
{
"epoch": 0.7217169879956348,
"grad_norm": 1.2705212986077308,
"learning_rate": 1.8996778545157263e-06,
"loss": 2.137,
"step": 992
},
{
"epoch": 0.7224445252819207,
"grad_norm": 1.2381608303569365,
"learning_rate": 1.8904483070265133e-06,
"loss": 2.1668,
"step": 993
},
{
"epoch": 0.7231720625682067,
"grad_norm": 1.2340508189273616,
"learning_rate": 1.8812360054294725e-06,
"loss": 2.1137,
"step": 994
},
{
"epoch": 0.7238995998544926,
"grad_norm": 1.3012424535662288,
"learning_rate": 1.8720410008169727e-06,
"loss": 2.1833,
"step": 995
},
{
"epoch": 0.7246271371407784,
"grad_norm": 1.282191032046726,
"learning_rate": 1.8628633441854515e-06,
"loss": 2.1927,
"step": 996
},
{
"epoch": 0.7253546744270644,
"grad_norm": 1.1817189594536317,
"learning_rate": 1.8537030864351303e-06,
"loss": 2.2164,
"step": 997
},
{
"epoch": 0.7260822117133503,
"grad_norm": 1.3280972391871648,
"learning_rate": 1.8445602783697375e-06,
"loss": 2.1622,
"step": 998
},
{
"epoch": 0.7268097489996362,
"grad_norm": 1.6079264986426691,
"learning_rate": 1.8354349706962243e-06,
"loss": 2.1486,
"step": 999
},
{
"epoch": 0.7275372862859222,
"grad_norm": 1.2135689219158894,
"learning_rate": 1.8263272140244803e-06,
"loss": 2.1321,
"step": 1000
},
{
"epoch": 0.7282648235722081,
"grad_norm": 1.2135204905494617,
"learning_rate": 1.8172370588670563e-06,
"loss": 2.1563,
"step": 1001
},
{
"epoch": 0.728992360858494,
"grad_norm": 1.1202605701009725,
"learning_rate": 1.8081645556388866e-06,
"loss": 2.1528,
"step": 1002
},
{
"epoch": 0.7297198981447799,
"grad_norm": 1.191518625719572,
"learning_rate": 1.7991097546570018e-06,
"loss": 2.1803,
"step": 1003
},
{
"epoch": 0.7304474354310658,
"grad_norm": 1.2602921711662611,
"learning_rate": 1.7900727061402556e-06,
"loss": 2.1558,
"step": 1004
},
{
"epoch": 0.7311749727173518,
"grad_norm": 1.2059662767470918,
"learning_rate": 1.7810534602090445e-06,
"loss": 2.2084,
"step": 1005
},
{
"epoch": 0.7319025100036377,
"grad_norm": 1.3067306072967486,
"learning_rate": 1.77205206688503e-06,
"loss": 2.1738,
"step": 1006
},
{
"epoch": 0.7326300472899236,
"grad_norm": 1.1891951117979749,
"learning_rate": 1.7630685760908623e-06,
"loss": 2.193,
"step": 1007
},
{
"epoch": 0.7333575845762095,
"grad_norm": 1.192631934374658,
"learning_rate": 1.7541030376499002e-06,
"loss": 2.1612,
"step": 1008
},
{
"epoch": 0.7340851218624954,
"grad_norm": 1.3750072249969223,
"learning_rate": 1.745155501285939e-06,
"loss": 2.1168,
"step": 1009
},
{
"epoch": 0.7348126591487814,
"grad_norm": 1.2232130975728073,
"learning_rate": 1.736226016622931e-06,
"loss": 2.1883,
"step": 1010
},
{
"epoch": 0.7355401964350673,
"grad_norm": 1.1914477767091434,
"learning_rate": 1.727314633184714e-06,
"loss": 2.1502,
"step": 1011
},
{
"epoch": 0.7362677337213532,
"grad_norm": 1.210870912995155,
"learning_rate": 1.718421400394732e-06,
"loss": 2.2149,
"step": 1012
},
{
"epoch": 0.7369952710076392,
"grad_norm": 1.1574223183585124,
"learning_rate": 1.7095463675757656e-06,
"loss": 2.2031,
"step": 1013
},
{
"epoch": 0.7377228082939251,
"grad_norm": 1.1965731255050562,
"learning_rate": 1.7006895839496557e-06,
"loss": 2.1607,
"step": 1014
},
{
"epoch": 0.7384503455802109,
"grad_norm": 1.3292826460968283,
"learning_rate": 1.6918510986370312e-06,
"loss": 2.1709,
"step": 1015
},
{
"epoch": 0.7391778828664969,
"grad_norm": 1.2612651230081413,
"learning_rate": 1.6830309606570372e-06,
"loss": 2.1685,
"step": 1016
},
{
"epoch": 0.7399054201527828,
"grad_norm": 1.2382345176070437,
"learning_rate": 1.674229218927062e-06,
"loss": 2.169,
"step": 1017
},
{
"epoch": 0.7406329574390688,
"grad_norm": 1.1520915560133695,
"learning_rate": 1.665445922262467e-06,
"loss": 2.2348,
"step": 1018
},
{
"epoch": 0.7413604947253547,
"grad_norm": 1.336700749645762,
"learning_rate": 1.6566811193763149e-06,
"loss": 2.1717,
"step": 1019
},
{
"epoch": 0.7420880320116406,
"grad_norm": 1.1549274584397604,
"learning_rate": 1.6479348588791e-06,
"loss": 2.1904,
"step": 1020
},
{
"epoch": 0.7428155692979265,
"grad_norm": 1.1893843208664543,
"learning_rate": 1.6392071892784789e-06,
"loss": 2.144,
"step": 1021
},
{
"epoch": 0.7435431065842124,
"grad_norm": 1.2381224137576587,
"learning_rate": 1.6304981589790015e-06,
"loss": 2.1631,
"step": 1022
},
{
"epoch": 0.7442706438704983,
"grad_norm": 1.1142566545158097,
"learning_rate": 1.6218078162818418e-06,
"loss": 2.1496,
"step": 1023
},
{
"epoch": 0.7449981811567843,
"grad_norm": 1.1146629784867004,
"learning_rate": 1.6131362093845299e-06,
"loss": 2.107,
"step": 1024
},
{
"epoch": 0.7457257184430702,
"grad_norm": 1.1632211803162529,
"learning_rate": 1.6044833863806864e-06,
"loss": 2.2039,
"step": 1025
},
{
"epoch": 0.7464532557293562,
"grad_norm": 1.195542241551238,
"learning_rate": 1.5958493952597536e-06,
"loss": 2.1514,
"step": 1026
},
{
"epoch": 0.747180793015642,
"grad_norm": 1.21840951959182,
"learning_rate": 1.5872342839067305e-06,
"loss": 2.139,
"step": 1027
},
{
"epoch": 0.7479083303019279,
"grad_norm": 1.130530825309705,
"learning_rate": 1.5786381001019052e-06,
"loss": 2.2141,
"step": 1028
},
{
"epoch": 0.7486358675882139,
"grad_norm": 1.182720116319556,
"learning_rate": 1.5700608915205978e-06,
"loss": 2.178,
"step": 1029
},
{
"epoch": 0.7493634048744998,
"grad_norm": 1.295581677262678,
"learning_rate": 1.561502705732883e-06,
"loss": 2.1421,
"step": 1030
},
{
"epoch": 0.7500909421607858,
"grad_norm": 1.2466355218601581,
"learning_rate": 1.5529635902033358e-06,
"loss": 2.1752,
"step": 1031
},
{
"epoch": 0.7508184794470717,
"grad_norm": 1.2314124456003142,
"learning_rate": 1.5444435922907669e-06,
"loss": 2.148,
"step": 1032
},
{
"epoch": 0.7515460167333576,
"grad_norm": 1.2138658987017406,
"learning_rate": 1.5359427592479553e-06,
"loss": 2.1426,
"step": 1033
},
{
"epoch": 0.7522735540196435,
"grad_norm": 1.1764184047644508,
"learning_rate": 1.5274611382213922e-06,
"loss": 2.1451,
"step": 1034
},
{
"epoch": 0.7530010913059294,
"grad_norm": 1.1915391500641057,
"learning_rate": 1.5189987762510167e-06,
"loss": 2.239,
"step": 1035
},
{
"epoch": 0.7537286285922153,
"grad_norm": 1.3393307034725943,
"learning_rate": 1.510555720269955e-06,
"loss": 2.1776,
"step": 1036
},
{
"epoch": 0.7544561658785013,
"grad_norm": 1.1074497181939627,
"learning_rate": 1.5021320171042608e-06,
"loss": 2.1814,
"step": 1037
},
{
"epoch": 0.7551837031647872,
"grad_norm": 1.2294477772308599,
"learning_rate": 1.4937277134726542e-06,
"loss": 2.1771,
"step": 1038
},
{
"epoch": 0.7559112404510732,
"grad_norm": 1.3646434494534407,
"learning_rate": 1.4853428559862637e-06,
"loss": 2.1932,
"step": 1039
},
{
"epoch": 0.756638777737359,
"grad_norm": 1.3978038925374128,
"learning_rate": 1.4769774911483686e-06,
"loss": 2.1953,
"step": 1040
},
{
"epoch": 0.7573663150236449,
"grad_norm": 1.2943852265946576,
"learning_rate": 1.4686316653541377e-06,
"loss": 2.171,
"step": 1041
},
{
"epoch": 0.7580938523099309,
"grad_norm": 1.087441192480477,
"learning_rate": 1.4603054248903752e-06,
"loss": 2.1768,
"step": 1042
},
{
"epoch": 0.7588213895962168,
"grad_norm": 1.5143127314406655,
"learning_rate": 1.4519988159352665e-06,
"loss": 2.1381,
"step": 1043
},
{
"epoch": 0.7595489268825028,
"grad_norm": 1.2488966257426735,
"learning_rate": 1.4437118845581138e-06,
"loss": 2.1914,
"step": 1044
},
{
"epoch": 0.7602764641687887,
"grad_norm": 1.2273958325384626,
"learning_rate": 1.4354446767190873e-06,
"loss": 2.1348,
"step": 1045
},
{
"epoch": 0.7610040014550745,
"grad_norm": 1.150082229138482,
"learning_rate": 1.4271972382689685e-06,
"loss": 2.1541,
"step": 1046
},
{
"epoch": 0.7617315387413605,
"grad_norm": 1.1934279136181158,
"learning_rate": 1.4189696149488956e-06,
"loss": 2.1576,
"step": 1047
},
{
"epoch": 0.7624590760276464,
"grad_norm": 1.2353996087944772,
"learning_rate": 1.4107618523901101e-06,
"loss": 2.151,
"step": 1048
},
{
"epoch": 0.7631866133139323,
"grad_norm": 1.1608215953352758,
"learning_rate": 1.4025739961137043e-06,
"loss": 2.1742,
"step": 1049
},
{
"epoch": 0.7639141506002183,
"grad_norm": 1.2788688900480345,
"learning_rate": 1.394406091530367e-06,
"loss": 2.1227,
"step": 1050
},
{
"epoch": 0.7646416878865042,
"grad_norm": 1.1579121493761417,
"learning_rate": 1.3862581839401346e-06,
"loss": 2.1749,
"step": 1051
},
{
"epoch": 0.7653692251727902,
"grad_norm": 1.9174641245246467,
"learning_rate": 1.3781303185321377e-06,
"loss": 2.2115,
"step": 1052
},
{
"epoch": 0.766096762459076,
"grad_norm": 1.3522708249259594,
"learning_rate": 1.370022540384347e-06,
"loss": 2.1995,
"step": 1053
},
{
"epoch": 0.7668242997453619,
"grad_norm": 1.603800827528043,
"learning_rate": 1.3619348944633331e-06,
"loss": 2.1682,
"step": 1054
},
{
"epoch": 0.7675518370316479,
"grad_norm": 1.1171227555477896,
"learning_rate": 1.3538674256240087e-06,
"loss": 2.1833,
"step": 1055
},
{
"epoch": 0.7682793743179338,
"grad_norm": 1.1622054710202911,
"learning_rate": 1.3458201786093795e-06,
"loss": 2.174,
"step": 1056
},
{
"epoch": 0.7690069116042197,
"grad_norm": 1.2372871882629508,
"learning_rate": 1.3377931980503055e-06,
"loss": 2.1894,
"step": 1057
},
{
"epoch": 0.7697344488905057,
"grad_norm": 1.220747336103902,
"learning_rate": 1.3297865284652417e-06,
"loss": 2.1416,
"step": 1058
},
{
"epoch": 0.7704619861767915,
"grad_norm": 1.2380603823828646,
"learning_rate": 1.3218002142599973e-06,
"loss": 2.1695,
"step": 1059
},
{
"epoch": 0.7711895234630775,
"grad_norm": 1.3966519763419472,
"learning_rate": 1.3138342997274883e-06,
"loss": 2.1459,
"step": 1060
},
{
"epoch": 0.7719170607493634,
"grad_norm": 1.3223384039164157,
"learning_rate": 1.3058888290474937e-06,
"loss": 2.1687,
"step": 1061
},
{
"epoch": 0.7726445980356493,
"grad_norm": 1.3021294267966386,
"learning_rate": 1.2979638462864069e-06,
"loss": 2.1213,
"step": 1062
},
{
"epoch": 0.7733721353219353,
"grad_norm": 1.223581952451272,
"learning_rate": 1.2900593953969947e-06,
"loss": 2.1693,
"step": 1063
},
{
"epoch": 0.7740996726082212,
"grad_norm": 1.5875289558290289,
"learning_rate": 1.2821755202181503e-06,
"loss": 2.1336,
"step": 1064
},
{
"epoch": 0.774827209894507,
"grad_norm": 1.2649833712441083,
"learning_rate": 1.2743122644746536e-06,
"loss": 2.1759,
"step": 1065
},
{
"epoch": 0.775554747180793,
"grad_norm": 1.2123555937456751,
"learning_rate": 1.266469671776926e-06,
"loss": 2.1812,
"step": 1066
},
{
"epoch": 0.7762822844670789,
"grad_norm": 1.2276927993970268,
"learning_rate": 1.2586477856207902e-06,
"loss": 2.1468,
"step": 1067
},
{
"epoch": 0.7770098217533649,
"grad_norm": 1.447520161103287,
"learning_rate": 1.2508466493872273e-06,
"loss": 2.1762,
"step": 1068
},
{
"epoch": 0.7777373590396508,
"grad_norm": 1.1531459103202673,
"learning_rate": 1.2430663063421388e-06,
"loss": 2.1864,
"step": 1069
},
{
"epoch": 0.7784648963259367,
"grad_norm": 1.400241485477495,
"learning_rate": 1.2353067996361034e-06,
"loss": 2.1957,
"step": 1070
},
{
"epoch": 0.7791924336122227,
"grad_norm": 1.2819279866788011,
"learning_rate": 1.2275681723041406e-06,
"loss": 2.1548,
"step": 1071
},
{
"epoch": 0.7799199708985085,
"grad_norm": 1.2699916767435784,
"learning_rate": 1.2198504672654694e-06,
"loss": 2.167,
"step": 1072
},
{
"epoch": 0.7806475081847944,
"grad_norm": 1.316288970557494,
"learning_rate": 1.212153727323273e-06,
"loss": 2.2055,
"step": 1073
},
{
"epoch": 0.7813750454710804,
"grad_norm": 1.1597273100326448,
"learning_rate": 1.2044779951644586e-06,
"loss": 2.1858,
"step": 1074
},
{
"epoch": 0.7821025827573663,
"grad_norm": 1.1638383463111301,
"learning_rate": 1.1968233133594243e-06,
"loss": 2.1741,
"step": 1075
},
{
"epoch": 0.7828301200436523,
"grad_norm": 1.2672830420894228,
"learning_rate": 1.1891897243618184e-06,
"loss": 2.1857,
"step": 1076
},
{
"epoch": 0.7835576573299382,
"grad_norm": 1.1682442183180266,
"learning_rate": 1.1815772705083072e-06,
"loss": 2.1882,
"step": 1077
},
{
"epoch": 0.784285194616224,
"grad_norm": 1.1495622151549003,
"learning_rate": 1.17398599401834e-06,
"loss": 2.2104,
"step": 1078
},
{
"epoch": 0.78501273190251,
"grad_norm": 2.4743403838723483,
"learning_rate": 1.1664159369939137e-06,
"loss": 2.1847,
"step": 1079
},
{
"epoch": 0.7857402691887959,
"grad_norm": 1.1427902163296222,
"learning_rate": 1.1588671414193397e-06,
"loss": 2.1397,
"step": 1080
},
{
"epoch": 0.7864678064750819,
"grad_norm": 1.2419446136578443,
"learning_rate": 1.1513396491610113e-06,
"loss": 2.113,
"step": 1081
},
{
"epoch": 0.7871953437613678,
"grad_norm": 1.19356149405073,
"learning_rate": 1.1438335019671715e-06,
"loss": 2.1734,
"step": 1082
},
{
"epoch": 0.7879228810476537,
"grad_norm": 1.4092480507548693,
"learning_rate": 1.1363487414676805e-06,
"loss": 2.1451,
"step": 1083
},
{
"epoch": 0.7886504183339396,
"grad_norm": 1.33971569627605,
"learning_rate": 1.128885409173789e-06,
"loss": 2.1734,
"step": 1084
},
{
"epoch": 0.7893779556202255,
"grad_norm": 1.0948443491354019,
"learning_rate": 1.1214435464779006e-06,
"loss": 2.186,
"step": 1085
},
{
"epoch": 0.7901054929065114,
"grad_norm": 1.1451593409287064,
"learning_rate": 1.1140231946533486e-06,
"loss": 2.2262,
"step": 1086
},
{
"epoch": 0.7908330301927974,
"grad_norm": 1.1592036822391414,
"learning_rate": 1.1066243948541638e-06,
"loss": 2.1421,
"step": 1087
},
{
"epoch": 0.7915605674790833,
"grad_norm": 1.3931059781118835,
"learning_rate": 1.0992471881148497e-06,
"loss": 2.1791,
"step": 1088
},
{
"epoch": 0.7922881047653693,
"grad_norm": 1.1756390362792626,
"learning_rate": 1.091891615350147e-06,
"loss": 2.1748,
"step": 1089
},
{
"epoch": 0.7930156420516552,
"grad_norm": 1.5667835817360032,
"learning_rate": 1.0845577173548172e-06,
"loss": 2.1871,
"step": 1090
},
{
"epoch": 0.793743179337941,
"grad_norm": 1.1723123928342023,
"learning_rate": 1.07724553480341e-06,
"loss": 2.1973,
"step": 1091
},
{
"epoch": 0.794470716624227,
"grad_norm": 1.1741596660871914,
"learning_rate": 1.0699551082500387e-06,
"loss": 2.1675,
"step": 1092
},
{
"epoch": 0.7951982539105129,
"grad_norm": 1.5792380614214159,
"learning_rate": 1.0626864781281553e-06,
"loss": 2.1785,
"step": 1093
},
{
"epoch": 0.7959257911967988,
"grad_norm": 1.3496288299574075,
"learning_rate": 1.0554396847503272e-06,
"loss": 2.1754,
"step": 1094
},
{
"epoch": 0.7966533284830848,
"grad_norm": 1.209717075090295,
"learning_rate": 1.0482147683080125e-06,
"loss": 2.1536,
"step": 1095
},
{
"epoch": 0.7973808657693707,
"grad_norm": 1.1606881621010003,
"learning_rate": 1.0410117688713366e-06,
"loss": 2.1714,
"step": 1096
},
{
"epoch": 0.7981084030556566,
"grad_norm": 1.311535089956001,
"learning_rate": 1.0338307263888748e-06,
"loss": 2.2004,
"step": 1097
},
{
"epoch": 0.7988359403419425,
"grad_norm": 1.0940623937123624,
"learning_rate": 1.0266716806874227e-06,
"loss": 2.184,
"step": 1098
},
{
"epoch": 0.7995634776282284,
"grad_norm": 1.5075293827537422,
"learning_rate": 1.0195346714717813e-06,
"loss": 2.2194,
"step": 1099
},
{
"epoch": 0.8002910149145144,
"grad_norm": 1.219694046957049,
"learning_rate": 1.0124197383245344e-06,
"loss": 2.1548,
"step": 1100
},
{
"epoch": 0.8010185522008003,
"grad_norm": 1.2551974465446878,
"learning_rate": 1.0053269207058298e-06,
"loss": 2.151,
"step": 1101
},
{
"epoch": 0.8017460894870863,
"grad_norm": 1.237030767097611,
"learning_rate": 9.982562579531607e-07,
"loss": 2.1479,
"step": 1102
},
{
"epoch": 0.8024736267733721,
"grad_norm": 1.1308995295036655,
"learning_rate": 9.912077892811473e-07,
"loss": 2.2096,
"step": 1103
},
{
"epoch": 0.803201164059658,
"grad_norm": 1.1208141811054002,
"learning_rate": 9.841815537813177e-07,
"loss": 2.1537,
"step": 1104
},
{
"epoch": 0.803928701345944,
"grad_norm": 1.4560765495831982,
"learning_rate": 9.77177590421895e-07,
"loss": 2.1327,
"step": 1105
},
{
"epoch": 0.8046562386322299,
"grad_norm": 1.1404979922710892,
"learning_rate": 9.70195938047576e-07,
"loss": 2.1422,
"step": 1106
},
{
"epoch": 0.8053837759185158,
"grad_norm": 4.06610644983023,
"learning_rate": 9.63236635379321e-07,
"loss": 2.1989,
"step": 1107
},
{
"epoch": 0.8061113132048018,
"grad_norm": 1.1852343159799084,
"learning_rate": 9.562997210141355e-07,
"loss": 2.1391,
"step": 1108
},
{
"epoch": 0.8068388504910877,
"grad_norm": 1.0865192405071906,
"learning_rate": 9.49385233424856e-07,
"loss": 2.101,
"step": 1109
},
{
"epoch": 0.8075663877773736,
"grad_norm": 1.2112784868475805,
"learning_rate": 9.424932109599372e-07,
"loss": 2.1838,
"step": 1110
},
{
"epoch": 0.8082939250636595,
"grad_norm": 1.1375103883075794,
"learning_rate": 9.356236918432454e-07,
"loss": 2.1604,
"step": 1111
},
{
"epoch": 0.8090214623499454,
"grad_norm": 1.2333713888207174,
"learning_rate": 9.287767141738352e-07,
"loss": 2.1123,
"step": 1112
},
{
"epoch": 0.8097489996362314,
"grad_norm": 1.1611512866888798,
"learning_rate": 9.21952315925746e-07,
"loss": 2.1285,
"step": 1113
},
{
"epoch": 0.8104765369225173,
"grad_norm": 1.169065058306762,
"learning_rate": 9.151505349477901e-07,
"loss": 2.1505,
"step": 1114
},
{
"epoch": 0.8112040742088032,
"grad_norm": 1.1785587576365077,
"learning_rate": 9.08371408963341e-07,
"loss": 2.1385,
"step": 1115
},
{
"epoch": 0.8119316114950891,
"grad_norm": 1.1577165996672336,
"learning_rate": 9.016149755701259e-07,
"loss": 2.1415,
"step": 1116
},
{
"epoch": 0.812659148781375,
"grad_norm": 1.2593173534941327,
"learning_rate": 8.948812722400157e-07,
"loss": 2.1681,
"step": 1117
},
{
"epoch": 0.813386686067661,
"grad_norm": 1.219466614222021,
"learning_rate": 8.881703363188199e-07,
"loss": 2.1802,
"step": 1118
},
{
"epoch": 0.8141142233539469,
"grad_norm": 1.121801024097721,
"learning_rate": 8.814822050260758e-07,
"loss": 2.2099,
"step": 1119
},
{
"epoch": 0.8148417606402328,
"grad_norm": 1.167478440289016,
"learning_rate": 8.748169154548448e-07,
"loss": 2.1474,
"step": 1120
},
{
"epoch": 0.8155692979265188,
"grad_norm": 1.1164752843710233,
"learning_rate": 8.681745045715045e-07,
"loss": 2.168,
"step": 1121
},
{
"epoch": 0.8162968352128046,
"grad_norm": 1.1218689018085943,
"learning_rate": 8.615550092155478e-07,
"loss": 2.2039,
"step": 1122
},
{
"epoch": 0.8170243724990905,
"grad_norm": 1.1738239638703962,
"learning_rate": 8.549584660993726e-07,
"loss": 2.194,
"step": 1123
},
{
"epoch": 0.8177519097853765,
"grad_norm": 1.1883715644243047,
"learning_rate": 8.483849118080828e-07,
"loss": 2.1823,
"step": 1124
},
{
"epoch": 0.8184794470716624,
"grad_norm": 1.2887090928750042,
"learning_rate": 8.418343827992842e-07,
"loss": 2.1005,
"step": 1125
},
{
"epoch": 0.8192069843579484,
"grad_norm": 1.2560094931923382,
"learning_rate": 8.353069154028814e-07,
"loss": 2.1619,
"step": 1126
},
{
"epoch": 0.8199345216442343,
"grad_norm": 1.1674958296283056,
"learning_rate": 8.28802545820877e-07,
"loss": 2.162,
"step": 1127
},
{
"epoch": 0.8206620589305202,
"grad_norm": 1.124133546214429,
"learning_rate": 8.223213101271709e-07,
"loss": 2.1873,
"step": 1128
},
{
"epoch": 0.8213895962168061,
"grad_norm": 1.3855733066248885,
"learning_rate": 8.158632442673603e-07,
"loss": 2.1377,
"step": 1129
},
{
"epoch": 0.822117133503092,
"grad_norm": 1.1974445286333184,
"learning_rate": 8.094283840585398e-07,
"loss": 2.1868,
"step": 1130
},
{
"epoch": 0.822844670789378,
"grad_norm": 1.3903170573362245,
"learning_rate": 8.03016765189103e-07,
"loss": 2.2213,
"step": 1131
},
{
"epoch": 0.8235722080756639,
"grad_norm": 1.1331591969369272,
"learning_rate": 7.966284232185451e-07,
"loss": 2.242,
"step": 1132
},
{
"epoch": 0.8242997453619498,
"grad_norm": 1.1468355972644708,
"learning_rate": 7.902633935772647e-07,
"loss": 2.2077,
"step": 1133
},
{
"epoch": 0.8250272826482358,
"grad_norm": 1.1686608768216766,
"learning_rate": 7.839217115663683e-07,
"loss": 2.1383,
"step": 1134
},
{
"epoch": 0.8257548199345216,
"grad_norm": 1.3068539015030758,
"learning_rate": 7.776034123574738e-07,
"loss": 2.1846,
"step": 1135
},
{
"epoch": 0.8264823572208075,
"grad_norm": 1.523560986226689,
"learning_rate": 7.713085309925156e-07,
"loss": 2.1513,
"step": 1136
},
{
"epoch": 0.8272098945070935,
"grad_norm": 1.1007712070990663,
"learning_rate": 7.650371023835495e-07,
"loss": 2.227,
"step": 1137
},
{
"epoch": 0.8279374317933794,
"grad_norm": 1.1507259497696056,
"learning_rate": 7.587891613125631e-07,
"loss": 2.1392,
"step": 1138
},
{
"epoch": 0.8286649690796654,
"grad_norm": 1.1372034569945393,
"learning_rate": 7.525647424312766e-07,
"loss": 2.1327,
"step": 1139
},
{
"epoch": 0.8293925063659513,
"grad_norm": 1.2185998796300685,
"learning_rate": 7.46363880260954e-07,
"loss": 2.1443,
"step": 1140
},
{
"epoch": 0.8301200436522371,
"grad_norm": 1.431473939970321,
"learning_rate": 7.401866091922133e-07,
"loss": 2.1697,
"step": 1141
},
{
"epoch": 0.8308475809385231,
"grad_norm": 1.168251404055778,
"learning_rate": 7.340329634848309e-07,
"loss": 2.1866,
"step": 1142
},
{
"epoch": 0.831575118224809,
"grad_norm": 1.7243666262254045,
"learning_rate": 7.279029772675572e-07,
"loss": 2.2235,
"step": 1143
},
{
"epoch": 0.832302655511095,
"grad_norm": 1.1198817161954957,
"learning_rate": 7.217966845379243e-07,
"loss": 2.1741,
"step": 1144
},
{
"epoch": 0.8330301927973809,
"grad_norm": 1.1495553472618867,
"learning_rate": 7.157141191620548e-07,
"loss": 2.1228,
"step": 1145
},
{
"epoch": 0.8337577300836668,
"grad_norm": 1.2264294129806552,
"learning_rate": 7.096553148744806e-07,
"loss": 2.209,
"step": 1146
},
{
"epoch": 0.8344852673699527,
"grad_norm": 1.1189699810078966,
"learning_rate": 7.036203052779506e-07,
"loss": 2.1608,
"step": 1147
},
{
"epoch": 0.8352128046562386,
"grad_norm": 1.1167070479223586,
"learning_rate": 6.97609123843247e-07,
"loss": 2.1324,
"step": 1148
},
{
"epoch": 0.8359403419425245,
"grad_norm": 1.2407016203598813,
"learning_rate": 6.916218039089961e-07,
"loss": 2.2016,
"step": 1149
},
{
"epoch": 0.8366678792288105,
"grad_norm": 1.0891946755118704,
"learning_rate": 6.856583786814891e-07,
"loss": 2.1565,
"step": 1150
},
{
"epoch": 0.8373954165150964,
"grad_norm": 1.121834586640259,
"learning_rate": 6.797188812344907e-07,
"loss": 2.1688,
"step": 1151
},
{
"epoch": 0.8381229538013824,
"grad_norm": 1.1952722361624357,
"learning_rate": 6.738033445090653e-07,
"loss": 2.1958,
"step": 1152
},
{
"epoch": 0.8388504910876683,
"grad_norm": 1.256213852552062,
"learning_rate": 6.67911801313384e-07,
"loss": 2.1381,
"step": 1153
},
{
"epoch": 0.8395780283739541,
"grad_norm": 1.8449125450101216,
"learning_rate": 6.620442843225483e-07,
"loss": 2.1822,
"step": 1154
},
{
"epoch": 0.8403055656602401,
"grad_norm": 1.222232102615464,
"learning_rate": 6.562008260784092e-07,
"loss": 2.2062,
"step": 1155
},
{
"epoch": 0.841033102946526,
"grad_norm": 1.0938211772059208,
"learning_rate": 6.503814589893836e-07,
"loss": 2.1551,
"step": 1156
},
{
"epoch": 0.8417606402328119,
"grad_norm": 1.1976326698603619,
"learning_rate": 6.445862153302784e-07,
"loss": 2.198,
"step": 1157
},
{
"epoch": 0.8424881775190979,
"grad_norm": 1.145599285934117,
"learning_rate": 6.388151272421078e-07,
"loss": 2.152,
"step": 1158
},
{
"epoch": 0.8432157148053838,
"grad_norm": 1.1584030834020036,
"learning_rate": 6.330682267319177e-07,
"loss": 2.1492,
"step": 1159
},
{
"epoch": 0.8439432520916696,
"grad_norm": 1.1891679423413797,
"learning_rate": 6.273455456726074e-07,
"loss": 2.1861,
"step": 1160
},
{
"epoch": 0.8446707893779556,
"grad_norm": 1.240134539666397,
"learning_rate": 6.216471158027515e-07,
"loss": 2.2224,
"step": 1161
},
{
"epoch": 0.8453983266642415,
"grad_norm": 1.3441384078153296,
"learning_rate": 6.159729687264254e-07,
"loss": 2.1399,
"step": 1162
},
{
"epoch": 0.8461258639505275,
"grad_norm": 1.3621665509768819,
"learning_rate": 6.103231359130308e-07,
"loss": 2.1678,
"step": 1163
},
{
"epoch": 0.8468534012368134,
"grad_norm": 1.1509024526416052,
"learning_rate": 6.046976486971201e-07,
"loss": 2.2213,
"step": 1164
},
{
"epoch": 0.8475809385230993,
"grad_norm": 1.135048732397783,
"learning_rate": 5.990965382782177e-07,
"loss": 2.1534,
"step": 1165
},
{
"epoch": 0.8483084758093852,
"grad_norm": 1.150383037307535,
"learning_rate": 5.935198357206595e-07,
"loss": 2.2025,
"step": 1166
},
{
"epoch": 0.8490360130956711,
"grad_norm": 1.1090708990139237,
"learning_rate": 5.879675719534078e-07,
"loss": 2.1869,
"step": 1167
},
{
"epoch": 0.8497635503819571,
"grad_norm": 1.2017907661883667,
"learning_rate": 5.824397777698859e-07,
"loss": 2.1598,
"step": 1168
},
{
"epoch": 0.850491087668243,
"grad_norm": 1.2122822792948795,
"learning_rate": 5.769364838278063e-07,
"loss": 2.1669,
"step": 1169
},
{
"epoch": 0.8512186249545289,
"grad_norm": 1.2593026892043682,
"learning_rate": 5.714577206490018e-07,
"loss": 2.2036,
"step": 1170
},
{
"epoch": 0.8519461622408149,
"grad_norm": 1.1026371974965643,
"learning_rate": 5.660035186192531e-07,
"loss": 2.1641,
"step": 1171
},
{
"epoch": 0.8526736995271008,
"grad_norm": 1.261771709071015,
"learning_rate": 5.60573907988124e-07,
"loss": 2.1476,
"step": 1172
},
{
"epoch": 0.8534012368133866,
"grad_norm": 1.207021289017894,
"learning_rate": 5.551689188687909e-07,
"loss": 2.1635,
"step": 1173
},
{
"epoch": 0.8541287740996726,
"grad_norm": 1.2578218076463634,
"learning_rate": 5.497885812378772e-07,
"loss": 2.1802,
"step": 1174
},
{
"epoch": 0.8548563113859585,
"grad_norm": 1.1413457564662952,
"learning_rate": 5.444329249352859e-07,
"loss": 2.2131,
"step": 1175
},
{
"epoch": 0.8555838486722445,
"grad_norm": 1.292402549873486,
"learning_rate": 5.391019796640362e-07,
"loss": 2.1774,
"step": 1176
},
{
"epoch": 0.8563113859585304,
"grad_norm": 1.14093076222506,
"learning_rate": 5.337957749900958e-07,
"loss": 2.235,
"step": 1177
},
{
"epoch": 0.8570389232448163,
"grad_norm": 1.236479670217711,
"learning_rate": 5.285143403422188e-07,
"loss": 2.141,
"step": 1178
},
{
"epoch": 0.8577664605311022,
"grad_norm": 1.30310549862543,
"learning_rate": 5.23257705011786e-07,
"loss": 2.1661,
"step": 1179
},
{
"epoch": 0.8584939978173881,
"grad_norm": 1.2079208652252142,
"learning_rate": 5.18025898152631e-07,
"loss": 2.1353,
"step": 1180
},
{
"epoch": 0.859221535103674,
"grad_norm": 1.4069011415665924,
"learning_rate": 5.128189487808927e-07,
"loss": 2.1496,
"step": 1181
},
{
"epoch": 0.85994907238996,
"grad_norm": 1.161746521014043,
"learning_rate": 5.076368857748454e-07,
"loss": 2.1444,
"step": 1182
},
{
"epoch": 0.8606766096762459,
"grad_norm": 1.332538033508701,
"learning_rate": 5.024797378747414e-07,
"loss": 2.164,
"step": 1183
},
{
"epoch": 0.8614041469625319,
"grad_norm": 1.0993636240534148,
"learning_rate": 4.973475336826506e-07,
"loss": 2.1636,
"step": 1184
},
{
"epoch": 0.8621316842488177,
"grad_norm": 1.1356208007011255,
"learning_rate": 4.922403016623034e-07,
"loss": 2.1201,
"step": 1185
},
{
"epoch": 0.8628592215351036,
"grad_norm": 1.1305387251029828,
"learning_rate": 4.871580701389316e-07,
"loss": 2.2007,
"step": 1186
},
{
"epoch": 0.8635867588213896,
"grad_norm": 1.0574829907568835,
"learning_rate": 4.821008672991118e-07,
"loss": 2.1973,
"step": 1187
},
{
"epoch": 0.8643142961076755,
"grad_norm": 1.190650921826717,
"learning_rate": 4.770687211906089e-07,
"loss": 2.1696,
"step": 1188
},
{
"epoch": 0.8650418333939615,
"grad_norm": 1.1306022951349342,
"learning_rate": 4.720616597222205e-07,
"loss": 2.1633,
"step": 1189
},
{
"epoch": 0.8657693706802474,
"grad_norm": 1.196362267443237,
"learning_rate": 4.6707971066362324e-07,
"loss": 2.1598,
"step": 1190
},
{
"epoch": 0.8664969079665333,
"grad_norm": 1.1567776943227535,
"learning_rate": 4.6212290164521554e-07,
"loss": 2.1812,
"step": 1191
},
{
"epoch": 0.8672244452528192,
"grad_norm": 1.1723671166995326,
"learning_rate": 4.5719126015796757e-07,
"loss": 2.1705,
"step": 1192
},
{
"epoch": 0.8679519825391051,
"grad_norm": 1.0810496584272096,
"learning_rate": 4.522848135532698e-07,
"loss": 2.1261,
"step": 1193
},
{
"epoch": 0.868679519825391,
"grad_norm": 1.243126377593316,
"learning_rate": 4.474035890427769e-07,
"loss": 2.1473,
"step": 1194
},
{
"epoch": 0.869407057111677,
"grad_norm": 1.0895467499897105,
"learning_rate": 4.4254761369825984e-07,
"loss": 2.1511,
"step": 1195
},
{
"epoch": 0.8701345943979629,
"grad_norm": 1.1230163941665576,
"learning_rate": 4.377169144514554e-07,
"loss": 2.1951,
"step": 1196
},
{
"epoch": 0.8708621316842489,
"grad_norm": 1.1565737076422473,
"learning_rate": 4.329115180939164e-07,
"loss": 2.1575,
"step": 1197
},
{
"epoch": 0.8715896689705347,
"grad_norm": 1.0696169727753786,
"learning_rate": 4.281314512768625e-07,
"loss": 2.181,
"step": 1198
},
{
"epoch": 0.8723172062568206,
"grad_norm": 1.3615594210380526,
"learning_rate": 4.2337674051103504e-07,
"loss": 2.1395,
"step": 1199
},
{
"epoch": 0.8730447435431066,
"grad_norm": 1.2378958475002244,
"learning_rate": 4.186474121665468e-07,
"loss": 2.1419,
"step": 1200
},
{
"epoch": 0.8737722808293925,
"grad_norm": 1.1629911120291947,
"learning_rate": 4.139434924727359e-07,
"loss": 2.2328,
"step": 1201
},
{
"epoch": 0.8744998181156785,
"grad_norm": 1.2389585671969723,
"learning_rate": 4.092650075180232e-07,
"loss": 2.1682,
"step": 1202
},
{
"epoch": 0.8752273554019644,
"grad_norm": 1.3514109314913836,
"learning_rate": 4.046119832497658e-07,
"loss": 2.1164,
"step": 1203
},
{
"epoch": 0.8759548926882502,
"grad_norm": 1.1144370754598174,
"learning_rate": 3.9998444547411255e-07,
"loss": 2.2024,
"step": 1204
},
{
"epoch": 0.8766824299745362,
"grad_norm": 1.1147888063902611,
"learning_rate": 3.9538241985586144e-07,
"loss": 2.183,
"step": 1205
},
{
"epoch": 0.8774099672608221,
"grad_norm": 1.1141131177134063,
"learning_rate": 3.908059319183194e-07,
"loss": 2.1748,
"step": 1206
},
{
"epoch": 0.878137504547108,
"grad_norm": 1.1547417917481566,
"learning_rate": 3.8625500704315645e-07,
"loss": 2.2025,
"step": 1207
},
{
"epoch": 0.878865041833394,
"grad_norm": 1.1701183558695587,
"learning_rate": 3.8172967047026834e-07,
"loss": 2.131,
"step": 1208
},
{
"epoch": 0.8795925791196799,
"grad_norm": 1.0882106659044095,
"learning_rate": 3.7722994729763427e-07,
"loss": 2.192,
"step": 1209
},
{
"epoch": 0.8803201164059659,
"grad_norm": 1.0736286473742218,
"learning_rate": 3.7275586248118114e-07,
"loss": 2.1932,
"step": 1210
},
{
"epoch": 0.8810476536922517,
"grad_norm": 1.0875477501520778,
"learning_rate": 3.683074408346404e-07,
"loss": 2.1797,
"step": 1211
},
{
"epoch": 0.8817751909785376,
"grad_norm": 1.0912855242341808,
"learning_rate": 3.6388470702941436e-07,
"loss": 2.1876,
"step": 1212
},
{
"epoch": 0.8825027282648236,
"grad_norm": 1.1768993340036018,
"learning_rate": 3.594876855944385e-07,
"loss": 2.1852,
"step": 1213
},
{
"epoch": 0.8832302655511095,
"grad_norm": 1.151821081597769,
"learning_rate": 3.5511640091604293e-07,
"loss": 2.193,
"step": 1214
},
{
"epoch": 0.8839578028373954,
"grad_norm": 1.0904648278786118,
"learning_rate": 3.50770877237821e-07,
"loss": 2.1804,
"step": 1215
},
{
"epoch": 0.8846853401236814,
"grad_norm": 1.3744133620598458,
"learning_rate": 3.4645113866049187e-07,
"loss": 2.1703,
"step": 1216
},
{
"epoch": 0.8854128774099672,
"grad_norm": 1.2488307712877746,
"learning_rate": 3.42157209141768e-07,
"loss": 2.1049,
"step": 1217
},
{
"epoch": 0.8861404146962532,
"grad_norm": 1.3738389697166,
"learning_rate": 3.3788911249622194e-07,
"loss": 2.1679,
"step": 1218
},
{
"epoch": 0.8868679519825391,
"grad_norm": 1.0926436743467096,
"learning_rate": 3.336468723951558e-07,
"loss": 2.1589,
"step": 1219
},
{
"epoch": 0.887595489268825,
"grad_norm": 1.233863721599574,
"learning_rate": 3.294305123664665e-07,
"loss": 2.1621,
"step": 1220
},
{
"epoch": 0.888323026555111,
"grad_norm": 1.1963007037275617,
"learning_rate": 3.2524005579452014e-07,
"loss": 2.1802,
"step": 1221
},
{
"epoch": 0.8890505638413969,
"grad_norm": 1.1338121772814955,
"learning_rate": 3.2107552592001657e-07,
"loss": 2.1652,
"step": 1222
},
{
"epoch": 0.8897781011276827,
"grad_norm": 1.0673131775790383,
"learning_rate": 3.169369458398652e-07,
"loss": 2.1752,
"step": 1223
},
{
"epoch": 0.8905056384139687,
"grad_norm": 1.1289416481728582,
"learning_rate": 3.128243385070562e-07,
"loss": 2.1889,
"step": 1224
},
{
"epoch": 0.8912331757002546,
"grad_norm": 1.2096782575370533,
"learning_rate": 3.087377267305297e-07,
"loss": 2.1454,
"step": 1225
},
{
"epoch": 0.8919607129865406,
"grad_norm": 1.1219019564743515,
"learning_rate": 3.0467713317505363e-07,
"loss": 2.1647,
"step": 1226
},
{
"epoch": 0.8926882502728265,
"grad_norm": 1.1840262988207078,
"learning_rate": 3.006425803610963e-07,
"loss": 2.1608,
"step": 1227
},
{
"epoch": 0.8934157875591124,
"grad_norm": 1.1775780165121978,
"learning_rate": 2.9663409066470025e-07,
"loss": 2.1721,
"step": 1228
},
{
"epoch": 0.8941433248453984,
"grad_norm": 1.1423731089255424,
"learning_rate": 2.9265168631736005e-07,
"loss": 2.1577,
"step": 1229
},
{
"epoch": 0.8948708621316842,
"grad_norm": 1.832353369781614,
"learning_rate": 2.88695389405898e-07,
"loss": 2.1378,
"step": 1230
},
{
"epoch": 0.8955983994179701,
"grad_norm": 1.1648125598165715,
"learning_rate": 2.8476522187234177e-07,
"loss": 2.1383,
"step": 1231
},
{
"epoch": 0.8963259367042561,
"grad_norm": 1.1276493156309477,
"learning_rate": 2.808612055138038e-07,
"loss": 2.1563,
"step": 1232
},
{
"epoch": 0.897053473990542,
"grad_norm": 1.2036308042804933,
"learning_rate": 2.76983361982357e-07,
"loss": 2.1653,
"step": 1233
},
{
"epoch": 0.897781011276828,
"grad_norm": 1.1166716350605161,
"learning_rate": 2.731317127849209e-07,
"loss": 2.1954,
"step": 1234
},
{
"epoch": 0.8985085485631139,
"grad_norm": 1.0644548721799072,
"learning_rate": 2.693062792831358e-07,
"loss": 2.1614,
"step": 1235
},
{
"epoch": 0.8992360858493997,
"grad_norm": 1.0984178351424647,
"learning_rate": 2.655070826932471e-07,
"loss": 2.1572,
"step": 1236
},
{
"epoch": 0.8999636231356857,
"grad_norm": 1.101303527800766,
"learning_rate": 2.617341440859883e-07,
"loss": 2.138,
"step": 1237
},
{
"epoch": 0.9006911604219716,
"grad_norm": 1.1208887662712228,
"learning_rate": 2.5798748438646326e-07,
"loss": 2.1561,
"step": 1238
},
{
"epoch": 0.9014186977082576,
"grad_norm": 1.100598661105781,
"learning_rate": 2.5426712437403134e-07,
"loss": 2.1581,
"step": 1239
},
{
"epoch": 0.9021462349945435,
"grad_norm": 1.1703700133267052,
"learning_rate": 2.5057308468218913e-07,
"loss": 2.1473,
"step": 1240
},
{
"epoch": 0.9028737722808294,
"grad_norm": 1.1567399217761656,
"learning_rate": 2.4690538579845933e-07,
"loss": 2.1112,
"step": 1241
},
{
"epoch": 0.9036013095671153,
"grad_norm": 1.115099441313513,
"learning_rate": 2.432640480642756e-07,
"loss": 2.1854,
"step": 1242
},
{
"epoch": 0.9043288468534012,
"grad_norm": 1.3913584825821503,
"learning_rate": 2.396490916748706e-07,
"loss": 2.1817,
"step": 1243
},
{
"epoch": 0.9050563841396871,
"grad_norm": 1.1314527530267833,
"learning_rate": 2.360605366791624e-07,
"loss": 2.1848,
"step": 1244
},
{
"epoch": 0.9057839214259731,
"grad_norm": 1.1318209517298516,
"learning_rate": 2.32498402979644e-07,
"loss": 2.1345,
"step": 1245
},
{
"epoch": 0.906511458712259,
"grad_norm": 1.0985737775074131,
"learning_rate": 2.2896271033227392e-07,
"loss": 2.158,
"step": 1246
},
{
"epoch": 0.907238995998545,
"grad_norm": 1.2655865839503828,
"learning_rate": 2.2545347834636632e-07,
"loss": 2.1542,
"step": 1247
},
{
"epoch": 0.9079665332848309,
"grad_norm": 1.4823682609114033,
"learning_rate": 2.219707264844806e-07,
"loss": 2.1673,
"step": 1248
},
{
"epoch": 0.9086940705711167,
"grad_norm": 1.2066786328395243,
"learning_rate": 2.1851447406231573e-07,
"loss": 2.13,
"step": 1249
},
{
"epoch": 0.9094216078574027,
"grad_norm": 1.0771985903083936,
"learning_rate": 2.1508474024860171e-07,
"loss": 2.1457,
"step": 1250
},
{
"epoch": 0.9101491451436886,
"grad_norm": 1.1095865446255258,
"learning_rate": 2.1168154406499275e-07,
"loss": 2.2382,
"step": 1251
},
{
"epoch": 0.9108766824299745,
"grad_norm": 1.3103891317698702,
"learning_rate": 2.0830490438596418e-07,
"loss": 2.1959,
"step": 1252
},
{
"epoch": 0.9116042197162605,
"grad_norm": 1.2306939222199123,
"learning_rate": 2.0495483993870578e-07,
"loss": 2.1139,
"step": 1253
},
{
"epoch": 0.9123317570025464,
"grad_norm": 2.967994937097106,
"learning_rate": 2.0163136930301696e-07,
"loss": 2.0948,
"step": 1254
},
{
"epoch": 0.9130592942888323,
"grad_norm": 1.080126789220765,
"learning_rate": 1.9833451091120727e-07,
"loss": 2.1823,
"step": 1255
},
{
"epoch": 0.9137868315751182,
"grad_norm": 1.0892792590150484,
"learning_rate": 1.9506428304799095e-07,
"loss": 2.1579,
"step": 1256
},
{
"epoch": 0.9145143688614041,
"grad_norm": 1.1192972960003569,
"learning_rate": 1.9182070385038555e-07,
"loss": 2.2095,
"step": 1257
},
{
"epoch": 0.9152419061476901,
"grad_norm": 1.3083562274897766,
"learning_rate": 1.886037913076144e-07,
"loss": 2.1905,
"step": 1258
},
{
"epoch": 0.915969443433976,
"grad_norm": 1.105616909797537,
"learning_rate": 1.8541356326100436e-07,
"loss": 2.1699,
"step": 1259
},
{
"epoch": 0.916696980720262,
"grad_norm": 1.5218961639183424,
"learning_rate": 1.8225003740388546e-07,
"loss": 2.1513,
"step": 1260
},
{
"epoch": 0.9174245180065478,
"grad_norm": 1.0592820677755947,
"learning_rate": 1.791132312814975e-07,
"loss": 2.1357,
"step": 1261
},
{
"epoch": 0.9181520552928337,
"grad_norm": 1.240605202233977,
"learning_rate": 1.760031622908881e-07,
"loss": 2.2205,
"step": 1262
},
{
"epoch": 0.9188795925791197,
"grad_norm": 1.8555531064869182,
"learning_rate": 1.729198476808186e-07,
"loss": 2.2314,
"step": 1263
},
{
"epoch": 0.9196071298654056,
"grad_norm": 1.228934084019767,
"learning_rate": 1.6986330455166733e-07,
"loss": 2.1364,
"step": 1264
},
{
"epoch": 0.9203346671516915,
"grad_norm": 1.1859552322747644,
"learning_rate": 1.6683354985533583e-07,
"loss": 2.1482,
"step": 1265
},
{
"epoch": 0.9210622044379775,
"grad_norm": 1.6560873234086992,
"learning_rate": 1.6383060039515343e-07,
"loss": 2.1396,
"step": 1266
},
{
"epoch": 0.9217897417242634,
"grad_norm": 1.5232709345600017,
"learning_rate": 1.6085447282578548e-07,
"loss": 2.1474,
"step": 1267
},
{
"epoch": 0.9225172790105493,
"grad_norm": 1.0928289545918428,
"learning_rate": 1.579051836531409e-07,
"loss": 2.139,
"step": 1268
},
{
"epoch": 0.9232448162968352,
"grad_norm": 1.1214465184167164,
"learning_rate": 1.5498274923427925e-07,
"loss": 2.2277,
"step": 1269
},
{
"epoch": 0.9239723535831211,
"grad_norm": 1.1315512124069351,
"learning_rate": 1.5208718577732096e-07,
"loss": 2.1498,
"step": 1270
},
{
"epoch": 0.9246998908694071,
"grad_norm": 1.107067470667357,
"learning_rate": 1.4921850934135785e-07,
"loss": 2.178,
"step": 1271
},
{
"epoch": 0.925427428155693,
"grad_norm": 1.0921667144280345,
"learning_rate": 1.463767358363627e-07,
"loss": 2.1863,
"step": 1272
},
{
"epoch": 0.926154965441979,
"grad_norm": 1.1131197425175732,
"learning_rate": 1.4356188102310266e-07,
"loss": 2.1662,
"step": 1273
},
{
"epoch": 0.9268825027282648,
"grad_norm": 1.1701178383938489,
"learning_rate": 1.4077396051305093e-07,
"loss": 2.1882,
"step": 1274
},
{
"epoch": 0.9276100400145507,
"grad_norm": 1.1838245677231893,
"learning_rate": 1.3801298976830025e-07,
"loss": 2.1209,
"step": 1275
},
{
"epoch": 0.9283375773008367,
"grad_norm": 1.1933829830491318,
"learning_rate": 1.3527898410147677e-07,
"loss": 2.1649,
"step": 1276
},
{
"epoch": 0.9290651145871226,
"grad_norm": 1.145526098399752,
"learning_rate": 1.325719586756563e-07,
"loss": 2.1817,
"step": 1277
},
{
"epoch": 0.9297926518734085,
"grad_norm": 1.0749155606673366,
"learning_rate": 1.2989192850427933e-07,
"loss": 2.1557,
"step": 1278
},
{
"epoch": 0.9305201891596945,
"grad_norm": 1.3783023050622016,
"learning_rate": 1.2723890845106723e-07,
"loss": 2.1287,
"step": 1279
},
{
"epoch": 0.9312477264459803,
"grad_norm": 1.6006748713293288,
"learning_rate": 1.2461291322994118e-07,
"loss": 2.085,
"step": 1280
},
{
"epoch": 0.9319752637322662,
"grad_norm": 1.059269234572768,
"learning_rate": 1.2201395740493948e-07,
"loss": 2.1708,
"step": 1281
},
{
"epoch": 0.9327028010185522,
"grad_norm": 1.105289296728809,
"learning_rate": 1.1944205539013708e-07,
"loss": 2.1864,
"step": 1282
},
{
"epoch": 0.9334303383048381,
"grad_norm": 1.193801817088734,
"learning_rate": 1.1689722144956672e-07,
"loss": 2.1534,
"step": 1283
},
{
"epoch": 0.9341578755911241,
"grad_norm": 1.2624183501841544,
"learning_rate": 1.1437946969713731e-07,
"loss": 2.2187,
"step": 1284
},
{
"epoch": 0.93488541287741,
"grad_norm": 1.461535614887746,
"learning_rate": 1.1188881409655849e-07,
"loss": 2.1274,
"step": 1285
},
{
"epoch": 0.9356129501636959,
"grad_norm": 1.617278091319289,
"learning_rate": 1.0942526846126122e-07,
"loss": 2.1088,
"step": 1286
},
{
"epoch": 0.9363404874499818,
"grad_norm": 1.0932395005042113,
"learning_rate": 1.0698884645432117e-07,
"loss": 2.1984,
"step": 1287
},
{
"epoch": 0.9370680247362677,
"grad_norm": 1.1107685198802046,
"learning_rate": 1.0457956158838545e-07,
"loss": 2.1502,
"step": 1288
},
{
"epoch": 0.9377955620225537,
"grad_norm": 1.1432608826273079,
"learning_rate": 1.0219742722559433e-07,
"loss": 2.2091,
"step": 1289
},
{
"epoch": 0.9385230993088396,
"grad_norm": 1.889492536398189,
"learning_rate": 9.984245657750857e-08,
"loss": 2.1273,
"step": 1290
},
{
"epoch": 0.9392506365951255,
"grad_norm": 1.2709195270172968,
"learning_rate": 9.751466270503718e-08,
"loss": 2.1281,
"step": 1291
},
{
"epoch": 0.9399781738814115,
"grad_norm": 1.0799994138588318,
"learning_rate": 9.521405851836252e-08,
"loss": 2.2231,
"step": 1292
},
{
"epoch": 0.9407057111676973,
"grad_norm": 1.525855749830862,
"learning_rate": 9.294065677687202e-08,
"loss": 2.1466,
"step": 1293
},
{
"epoch": 0.9414332484539832,
"grad_norm": 1.1761906693535622,
"learning_rate": 9.069447008908383e-08,
"loss": 2.1912,
"step": 1294
},
{
"epoch": 0.9421607857402692,
"grad_norm": 1.1312463149407157,
"learning_rate": 8.847551091257956e-08,
"loss": 2.1097,
"step": 1295
},
{
"epoch": 0.9428883230265551,
"grad_norm": 1.6796709095897908,
"learning_rate": 8.62837915539344e-08,
"loss": 2.1561,
"step": 1296
},
{
"epoch": 0.9436158603128411,
"grad_norm": 1.0297370910963033,
"learning_rate": 8.411932416864832e-08,
"loss": 2.1631,
"step": 1297
},
{
"epoch": 0.944343397599127,
"grad_norm": 1.4116528470571312,
"learning_rate": 8.198212076107881e-08,
"loss": 2.1428,
"step": 1298
},
{
"epoch": 0.9450709348854128,
"grad_norm": 1.098962010036319,
"learning_rate": 7.987219318437489e-08,
"loss": 2.2117,
"step": 1299
},
{
"epoch": 0.9457984721716988,
"grad_norm": 1.2540943557186013,
"learning_rate": 7.778955314041103e-08,
"loss": 2.1657,
"step": 1300
},
{
"epoch": 0.9465260094579847,
"grad_norm": 1.2400620556554858,
"learning_rate": 7.573421217972222e-08,
"loss": 2.1536,
"step": 1301
},
{
"epoch": 0.9472535467442706,
"grad_norm": 1.151462311267039,
"learning_rate": 7.370618170144062e-08,
"loss": 2.1253,
"step": 1302
},
{
"epoch": 0.9479810840305566,
"grad_norm": 1.1976131413618134,
"learning_rate": 7.170547295323016e-08,
"loss": 2.087,
"step": 1303
},
{
"epoch": 0.9487086213168425,
"grad_norm": 1.4258177957311036,
"learning_rate": 6.973209703122652e-08,
"loss": 2.1011,
"step": 1304
},
{
"epoch": 0.9494361586031284,
"grad_norm": 1.3415584018629254,
"learning_rate": 6.778606487997496e-08,
"loss": 2.1651,
"step": 1305
},
{
"epoch": 0.9501636958894143,
"grad_norm": 1.099049032462197,
"learning_rate": 6.58673872923693e-08,
"loss": 2.1783,
"step": 1306
},
{
"epoch": 0.9508912331757002,
"grad_norm": 1.0740685526691962,
"learning_rate": 6.397607490959134e-08,
"loss": 2.1613,
"step": 1307
},
{
"epoch": 0.9516187704619862,
"grad_norm": 1.2432577438660946,
"learning_rate": 6.211213822105378e-08,
"loss": 2.1624,
"step": 1308
},
{
"epoch": 0.9523463077482721,
"grad_norm": 1.1213774439829132,
"learning_rate": 6.027558756434015e-08,
"loss": 2.1513,
"step": 1309
},
{
"epoch": 0.953073845034558,
"grad_norm": 1.1012965217884594,
"learning_rate": 5.846643312514888e-08,
"loss": 2.1826,
"step": 1310
},
{
"epoch": 0.953801382320844,
"grad_norm": 1.109684951398295,
"learning_rate": 5.668468493723489e-08,
"loss": 2.1705,
"step": 1311
},
{
"epoch": 0.9545289196071298,
"grad_norm": 1.0970545980570312,
"learning_rate": 5.4930352882357486e-08,
"loss": 2.1471,
"step": 1312
},
{
"epoch": 0.9552564568934158,
"grad_norm": 1.1020402645803815,
"learning_rate": 5.3203446690220374e-08,
"loss": 2.1642,
"step": 1313
},
{
"epoch": 0.9559839941797017,
"grad_norm": 1.2667183522454073,
"learning_rate": 5.1503975938422824e-08,
"loss": 2.1417,
"step": 1314
},
{
"epoch": 0.9567115314659876,
"grad_norm": 1.2915533164915878,
"learning_rate": 4.983195005240415e-08,
"loss": 2.1976,
"step": 1315
},
{
"epoch": 0.9574390687522736,
"grad_norm": 1.1076129486018869,
"learning_rate": 4.8187378305390994e-08,
"loss": 2.1808,
"step": 1316
},
{
"epoch": 0.9581666060385595,
"grad_norm": 1.1387016009937267,
"learning_rate": 4.657026981834623e-08,
"loss": 2.1806,
"step": 1317
},
{
"epoch": 0.9588941433248453,
"grad_norm": 1.1899678063694281,
"learning_rate": 4.498063355991955e-08,
"loss": 2.1713,
"step": 1318
},
{
"epoch": 0.9596216806111313,
"grad_norm": 1.24744884870978,
"learning_rate": 4.341847834639645e-08,
"loss": 2.1992,
"step": 1319
},
{
"epoch": 0.9603492178974172,
"grad_norm": 1.146131261626508,
"learning_rate": 4.188381284164933e-08,
"loss": 2.1161,
"step": 1320
},
{
"epoch": 0.9610767551837032,
"grad_norm": 1.1277234081016827,
"learning_rate": 4.0376645557090864e-08,
"loss": 2.2013,
"step": 1321
},
{
"epoch": 0.9618042924699891,
"grad_norm": 1.1559215314132825,
"learning_rate": 3.889698485162463e-08,
"loss": 2.16,
"step": 1322
},
{
"epoch": 0.962531829756275,
"grad_norm": 1.1381481425895759,
"learning_rate": 3.744483893160067e-08,
"loss": 2.2413,
"step": 1323
},
{
"epoch": 0.9632593670425609,
"grad_norm": 1.415309549759452,
"learning_rate": 3.602021585076942e-08,
"loss": 2.1665,
"step": 1324
},
{
"epoch": 0.9639869043288468,
"grad_norm": 1.4440616690572,
"learning_rate": 3.462312351023567e-08,
"loss": 2.1889,
"step": 1325
},
{
"epoch": 0.9647144416151328,
"grad_norm": 1.9538836520889915,
"learning_rate": 3.325356965841686e-08,
"loss": 2.1754,
"step": 1326
},
{
"epoch": 0.9654419789014187,
"grad_norm": 1.1248225603912345,
"learning_rate": 3.191156189099931e-08,
"loss": 2.1724,
"step": 1327
},
{
"epoch": 0.9661695161877046,
"grad_norm": 1.2614599283450048,
"learning_rate": 3.0597107650894855e-08,
"loss": 2.1464,
"step": 1328
},
{
"epoch": 0.9668970534739906,
"grad_norm": 1.2011934411088816,
"learning_rate": 2.9310214228202016e-08,
"loss": 2.1381,
"step": 1329
},
{
"epoch": 0.9676245907602765,
"grad_norm": 1.2998812947548053,
"learning_rate": 2.8050888760163265e-08,
"loss": 2.1581,
"step": 1330
},
{
"epoch": 0.9683521280465623,
"grad_norm": 1.1140672894015613,
"learning_rate": 2.6819138231126695e-08,
"loss": 2.1749,
"step": 1331
},
{
"epoch": 0.9690796653328483,
"grad_norm": 1.1216126535021602,
"learning_rate": 2.5614969472506634e-08,
"loss": 2.1792,
"step": 1332
},
{
"epoch": 0.9698072026191342,
"grad_norm": 1.25322513583671,
"learning_rate": 2.4438389162746434e-08,
"loss": 2.1696,
"step": 1333
},
{
"epoch": 0.9705347399054202,
"grad_norm": 1.1131994577649031,
"learning_rate": 2.3289403827281287e-08,
"loss": 2.1516,
"step": 1334
},
{
"epoch": 0.9712622771917061,
"grad_norm": 1.149788902199477,
"learning_rate": 2.2168019838501032e-08,
"loss": 2.1206,
"step": 1335
},
{
"epoch": 0.971989814477992,
"grad_norm": 1.2320932338097785,
"learning_rate": 2.1074243415716288e-08,
"loss": 2.1656,
"step": 1336
},
{
"epoch": 0.9727173517642779,
"grad_norm": 1.1657936554154817,
"learning_rate": 2.0008080625124048e-08,
"loss": 2.1708,
"step": 1337
},
{
"epoch": 0.9734448890505638,
"grad_norm": 1.1894175601351693,
"learning_rate": 1.896953737977103e-08,
"loss": 2.1455,
"step": 1338
},
{
"epoch": 0.9741724263368498,
"grad_norm": 1.2162504154036424,
"learning_rate": 1.7958619439524817e-08,
"loss": 2.153,
"step": 1339
},
{
"epoch": 0.9748999636231357,
"grad_norm": 1.0695427579088523,
"learning_rate": 1.6975332411040547e-08,
"loss": 2.1903,
"step": 1340
},
{
"epoch": 0.9756275009094216,
"grad_norm": 1.128006404235128,
"learning_rate": 1.601968174772761e-08,
"loss": 2.1733,
"step": 1341
},
{
"epoch": 0.9763550381957076,
"grad_norm": 1.078617608471699,
"learning_rate": 1.5091672749723564e-08,
"loss": 2.1835,
"step": 1342
},
{
"epoch": 0.9770825754819934,
"grad_norm": 1.870175223923746,
"learning_rate": 1.4191310563860806e-08,
"loss": 2.0813,
"step": 1343
},
{
"epoch": 0.9778101127682793,
"grad_norm": 1.6310544307639132,
"learning_rate": 1.331860018363995e-08,
"loss": 2.1312,
"step": 1344
},
{
"epoch": 0.9785376500545653,
"grad_norm": 1.1349574618732923,
"learning_rate": 1.2473546449203178e-08,
"loss": 2.2078,
"step": 1345
},
{
"epoch": 0.9792651873408512,
"grad_norm": 1.2159937263760292,
"learning_rate": 1.1656154047303691e-08,
"loss": 2.1796,
"step": 1346
},
{
"epoch": 0.9799927246271372,
"grad_norm": 1.14281219680189,
"learning_rate": 1.0866427511285194e-08,
"loss": 2.1934,
"step": 1347
},
{
"epoch": 0.9807202619134231,
"grad_norm": 1.1929972027287525,
"learning_rate": 1.0104371221050236e-08,
"loss": 2.0814,
"step": 1348
},
{
"epoch": 0.981447799199709,
"grad_norm": 1.0720512824500246,
"learning_rate": 9.369989403041347e-09,
"loss": 2.1718,
"step": 1349
},
{
"epoch": 0.9821753364859949,
"grad_norm": 1.1678936203643433,
"learning_rate": 8.663286130216608e-09,
"loss": 2.1682,
"step": 1350
},
{
"epoch": 0.9829028737722808,
"grad_norm": 1.245453145807709,
"learning_rate": 7.984265322023011e-09,
"loss": 2.158,
"step": 1351
},
{
"epoch": 0.9836304110585667,
"grad_norm": 1.1492193587944532,
"learning_rate": 7.332930744380906e-09,
"loss": 2.1758,
"step": 1352
},
{
"epoch": 0.9843579483448527,
"grad_norm": 1.2612149321084427,
"learning_rate": 6.709286009657368e-09,
"loss": 2.1567,
"step": 1353
},
{
"epoch": 0.9850854856311386,
"grad_norm": 1.3115967794340748,
"learning_rate": 6.1133345766511975e-09,
"loss": 2.1383,
"step": 1354
},
{
"epoch": 0.9858130229174246,
"grad_norm": 1.2352405406930702,
"learning_rate": 5.5450797505690605e-09,
"loss": 2.1914,
"step": 1355
},
{
"epoch": 0.9865405602037104,
"grad_norm": 1.2123024417780415,
"learning_rate": 5.004524683011048e-09,
"loss": 2.2086,
"step": 1356
},
{
"epoch": 0.9872680974899963,
"grad_norm": 1.0605306568096313,
"learning_rate": 4.491672371950695e-09,
"loss": 2.1677,
"step": 1357
},
{
"epoch": 0.9879956347762823,
"grad_norm": 1.159183588970115,
"learning_rate": 4.0065256617199954e-09,
"loss": 2.1543,
"step": 1358
},
{
"epoch": 0.9887231720625682,
"grad_norm": 1.1587983090969554,
"learning_rate": 3.5490872429910784e-09,
"loss": 2.1265,
"step": 1359
},
{
"epoch": 0.9894507093488542,
"grad_norm": 1.260819419569731,
"learning_rate": 3.119359652765108e-09,
"loss": 2.1756,
"step": 1360
},
{
"epoch": 0.9901782466351401,
"grad_norm": 1.1705707883099417,
"learning_rate": 2.7173452743550767e-09,
"loss": 2.1663,
"step": 1361
},
{
"epoch": 0.9909057839214259,
"grad_norm": 1.3422853083589237,
"learning_rate": 2.343046337374144e-09,
"loss": 2.1805,
"step": 1362
},
{
"epoch": 0.9916333212077119,
"grad_norm": 1.2168506130447845,
"learning_rate": 1.9964649177223184e-09,
"loss": 2.1607,
"step": 1363
},
{
"epoch": 0.9923608584939978,
"grad_norm": 1.1771922885619566,
"learning_rate": 1.6776029375759062e-09,
"loss": 2.1397,
"step": 1364
},
{
"epoch": 0.9930883957802837,
"grad_norm": 1.1042541645524246,
"learning_rate": 1.386462165375857e-09,
"loss": 2.1772,
"step": 1365
},
{
"epoch": 0.9938159330665697,
"grad_norm": 1.2703508857723977,
"learning_rate": 1.1230442158188804e-09,
"loss": 2.1632,
"step": 1366
},
{
"epoch": 0.9945434703528556,
"grad_norm": 1.108751259348474,
"learning_rate": 8.873505498474544e-10,
"loss": 2.1881,
"step": 1367
},
{
"epoch": 0.9952710076391416,
"grad_norm": 1.1228251721868356,
"learning_rate": 6.793824746437194e-10,
"loss": 2.175,
"step": 1368
},
{
"epoch": 0.9959985449254274,
"grad_norm": 1.1248446425450205,
"learning_rate": 4.991411436189308e-10,
"loss": 2.1946,
"step": 1369
},
{
"epoch": 0.9967260822117133,
"grad_norm": 1.0751502038617766,
"learning_rate": 3.466275564101285e-10,
"loss": 2.1715,
"step": 1370
},
{
"epoch": 0.9974536194979993,
"grad_norm": 1.1002050846433344,
"learning_rate": 2.2184255887403028e-10,
"loss": 2.1891,
"step": 1371
},
{
"epoch": 0.9981811567842852,
"grad_norm": 1.350470042755439,
"learning_rate": 1.2478684308037115e-10,
"loss": 2.1012,
"step": 1372
},
{
"epoch": 0.9989086940705711,
"grad_norm": 1.0908546681209645,
"learning_rate": 5.5460947310237435e-11,
"loss": 2.1487,
"step": 1373
},
{
"epoch": 0.9996362313568571,
"grad_norm": 1.1465345153912363,
"learning_rate": 1.3865256052181252e-11,
"loss": 2.202,
"step": 1374
}
],
"logging_steps": 1,
"max_steps": 1374,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 687,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4520916805419008e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}