{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 10, "global_step": 5300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 2.2099817351323793e-07, "loss": 4.0153, "step": 10 }, { "epoch": 0.02, "eval_loss": 3.2407805919647217, "eval_runtime": 51.2609, "eval_samples_per_second": 1322.879, "eval_steps_per_second": 2.595, "step": 10 }, { "epoch": 0.04, "learning_rate": 4.088952094525112e-07, "loss": 2.4309, "step": 20 }, { "epoch": 0.04, "eval_loss": 1.3295739889144897, "eval_runtime": 51.4605, "eval_samples_per_second": 1317.748, "eval_steps_per_second": 2.585, "step": 20 }, { "epoch": 0.06, "learning_rate": 4.998494650312849e-07, "loss": 1.2878, "step": 30 }, { "epoch": 0.06, "eval_loss": 1.2454885244369507, "eval_runtime": 51.5936, "eval_samples_per_second": 1314.349, "eval_steps_per_second": 2.578, "step": 30 }, { "epoch": 0.08, "learning_rate": 5.574009434295683e-07, "loss": 1.214, "step": 40 }, { "epoch": 0.08, "eval_loss": 1.176835060119629, "eval_runtime": 51.5326, "eval_samples_per_second": 1315.904, "eval_steps_per_second": 2.581, "step": 40 }, { "epoch": 0.09, "learning_rate": 5.99597299894211e-07, "loss": 1.1456, "step": 50 }, { "epoch": 0.09, "eval_loss": 1.135105013847351, "eval_runtime": 51.5256, "eval_samples_per_second": 1316.083, "eval_steps_per_second": 2.581, "step": 50 }, { "epoch": 0.11, "learning_rate": 6.329299792437913e-07, "loss": 1.1182, "step": 60 }, { "epoch": 0.11, "eval_loss": 1.1110774278640747, "eval_runtime": 51.7018, "eval_samples_per_second": 1311.599, "eval_steps_per_second": 2.572, "step": 60 }, { "epoch": 0.13, "learning_rate": 6.604839728354141e-07, "loss": 1.0933, "step": 70 }, { "epoch": 0.13, "eval_loss": 1.0951799154281616, "eval_runtime": 51.6151, "eval_samples_per_second": 1313.801, "eval_steps_per_second": 2.577, "step": 70 }, { "epoch": 0.15, "learning_rate": 6.839699609399337e-07, "loss": 1.0838, "step": 80 }, { "epoch": 0.15, "eval_loss": 1.0831515789031982, "eval_runtime": 51.5722, "eval_samples_per_second": 1314.893, "eval_steps_per_second": 2.579, "step": 80 }, { "epoch": 0.17, "learning_rate": 7.044360352012708e-07, "loss": 1.0585, "step": 90 }, { "epoch": 0.17, "eval_loss": 1.0732280015945435, "eval_runtime": 51.672, "eval_samples_per_second": 1312.355, "eval_steps_per_second": 2.574, "step": 90 }, { "epoch": 0.19, "learning_rate": 7.225710767839359e-07, "loss": 1.0496, "step": 100 }, { "epoch": 0.19, "eval_loss": 1.0647205114364624, "eval_runtime": 51.7752, "eval_samples_per_second": 1309.74, "eval_steps_per_second": 2.569, "step": 100 }, { "epoch": 0.21, "learning_rate": 7.388522018513969e-07, "loss": 1.0627, "step": 110 }, { "epoch": 0.21, "eval_loss": 1.0570907592773438, "eval_runtime": 51.8166, "eval_samples_per_second": 1308.691, "eval_steps_per_second": 2.567, "step": 110 }, { "epoch": 0.23, "learning_rate": 7.536235468635834e-07, "loss": 1.0518, "step": 120 }, { "epoch": 0.23, "eval_loss": 1.049896478652954, "eval_runtime": 51.7728, "eval_samples_per_second": 1309.799, "eval_steps_per_second": 2.569, "step": 120 }, { "epoch": 0.25, "learning_rate": 7.671415126345663e-07, "loss": 1.0649, "step": 130 }, { "epoch": 0.25, "eval_loss": 1.0434167385101318, "eval_runtime": 51.7035, "eval_samples_per_second": 1311.557, "eval_steps_per_second": 2.572, "step": 130 }, { "epoch": 0.26, "learning_rate": 7.796022659328959e-07, "loss": 1.045, "step": 140 }, { "epoch": 0.26, "eval_loss": 1.0371665954589844, "eval_runtime": 51.5545, "eval_samples_per_second": 1315.346, "eval_steps_per_second": 2.58, "step": 140 }, { "epoch": 0.28, "learning_rate": 7.911592440089047e-07, "loss": 1.032, "step": 150 }, { "epoch": 0.28, "eval_loss": 1.0311847925186157, "eval_runtime": 51.7951, "eval_samples_per_second": 1309.235, "eval_steps_per_second": 2.568, "step": 150 }, { "epoch": 0.3, "learning_rate": 8.019347288561271e-07, "loss": 1.0257, "step": 160 }, { "epoch": 0.3, "eval_loss": 1.0256140232086182, "eval_runtime": 51.7483, "eval_samples_per_second": 1310.42, "eval_steps_per_second": 2.57, "step": 160 }, { "epoch": 0.32, "learning_rate": 8.120277498651883e-07, "loss": 1.0171, "step": 170 }, { "epoch": 0.32, "eval_loss": 1.0200444459915161, "eval_runtime": 51.7312, "eval_samples_per_second": 1310.854, "eval_steps_per_second": 2.571, "step": 170 }, { "epoch": 0.34, "learning_rate": 8.215196295288388e-07, "loss": 1.0294, "step": 180 }, { "epoch": 0.34, "eval_loss": 1.014703392982483, "eval_runtime": 51.5856, "eval_samples_per_second": 1314.554, "eval_steps_per_second": 2.578, "step": 180 }, { "epoch": 0.36, "learning_rate": 8.304779684672557e-07, "loss": 1.0028, "step": 190 }, { "epoch": 0.36, "eval_loss": 1.0094585418701172, "eval_runtime": 51.7818, "eval_samples_per_second": 1309.571, "eval_steps_per_second": 2.568, "step": 190 }, { "epoch": 0.38, "learning_rate": 8.389595688569879e-07, "loss": 1.0127, "step": 200 }, { "epoch": 0.38, "eval_loss": 1.0043591260910034, "eval_runtime": 51.636, "eval_samples_per_second": 1313.269, "eval_steps_per_second": 2.576, "step": 200 }, { "epoch": 0.4, "learning_rate": 8.470126185316722e-07, "loss": 1.0185, "step": 210 }, { "epoch": 0.4, "eval_loss": 0.9993944764137268, "eval_runtime": 51.5783, "eval_samples_per_second": 1314.739, "eval_steps_per_second": 2.579, "step": 210 }, { "epoch": 0.42, "learning_rate": 8.546783493625903e-07, "loss": 1.0023, "step": 220 }, { "epoch": 0.42, "eval_loss": 0.9948075413703918, "eval_runtime": 51.744, "eval_samples_per_second": 1310.529, "eval_steps_per_second": 2.57, "step": 220 }, { "epoch": 0.43, "learning_rate": 8.61992314820994e-07, "loss": 0.9993, "step": 230 }, { "epoch": 0.43, "eval_loss": 0.9913944005966187, "eval_runtime": 51.6406, "eval_samples_per_second": 1313.153, "eval_steps_per_second": 2.575, "step": 230 }, { "epoch": 0.45, "learning_rate": 8.689853870683541e-07, "loss": 0.9826, "step": 240 }, { "epoch": 0.45, "eval_loss": 0.9888330698013306, "eval_runtime": 51.5703, "eval_samples_per_second": 1314.942, "eval_steps_per_second": 2.579, "step": 240 }, { "epoch": 0.47, "learning_rate": 8.75684544365874e-07, "loss": 0.984, "step": 250 }, { "epoch": 0.47, "eval_loss": 0.9866300821304321, "eval_runtime": 51.6072, "eval_samples_per_second": 1314.004, "eval_steps_per_second": 2.577, "step": 250 }, { "epoch": 0.49, "learning_rate": 8.821134995876785e-07, "loss": 0.9768, "step": 260 }, { "epoch": 0.49, "eval_loss": 0.9846345782279968, "eval_runtime": 51.7595, "eval_samples_per_second": 1310.136, "eval_steps_per_second": 2.57, "step": 260 }, { "epoch": 0.51, "learning_rate": 8.882932068272419e-07, "loss": 0.9652, "step": 270 }, { "epoch": 0.51, "eval_loss": 0.9827404022216797, "eval_runtime": 51.652, "eval_samples_per_second": 1312.864, "eval_steps_per_second": 2.575, "step": 270 }, { "epoch": 0.53, "learning_rate": 8.942422734147504e-07, "loss": 0.9891, "step": 280 }, { "epoch": 0.53, "eval_loss": 0.980950117111206, "eval_runtime": 51.6461, "eval_samples_per_second": 1313.013, "eval_steps_per_second": 2.575, "step": 280 }, { "epoch": 0.55, "learning_rate": 8.999772977776921e-07, "loss": 0.9842, "step": 290 }, { "epoch": 0.55, "eval_loss": 0.9792138934135437, "eval_runtime": 51.7586, "eval_samples_per_second": 1310.159, "eval_steps_per_second": 2.57, "step": 290 }, { "epoch": 0.57, "learning_rate": 9.05513148606202e-07, "loss": 0.9589, "step": 300 }, { "epoch": 0.57, "eval_loss": 0.9775672554969788, "eval_runtime": 51.5871, "eval_samples_per_second": 1314.513, "eval_steps_per_second": 2.578, "step": 300 }, { "epoch": 0.58, "learning_rate": 9.10863197149682e-07, "loss": 0.9691, "step": 310 }, { "epoch": 0.58, "eval_loss": 0.975963294506073, "eval_runtime": 51.6231, "eval_samples_per_second": 1313.598, "eval_steps_per_second": 2.576, "step": 310 }, { "epoch": 0.6, "learning_rate": 9.160395117812138e-07, "loss": 0.9865, "step": 320 }, { "epoch": 0.6, "eval_loss": 0.9743648171424866, "eval_runtime": 51.6267, "eval_samples_per_second": 1313.507, "eval_steps_per_second": 2.576, "step": 320 }, { "epoch": 0.62, "learning_rate": 9.21053021953609e-07, "loss": 0.9961, "step": 330 }, { "epoch": 0.62, "eval_loss": 0.9728517532348633, "eval_runtime": 51.7315, "eval_samples_per_second": 1310.844, "eval_steps_per_second": 2.571, "step": 330 }, { "epoch": 0.64, "learning_rate": 9.259136571495437e-07, "loss": 0.9619, "step": 340 }, { "epoch": 0.64, "eval_loss": 0.9713432192802429, "eval_runtime": 51.6068, "eval_samples_per_second": 1314.013, "eval_steps_per_second": 2.577, "step": 340 }, { "epoch": 0.66, "learning_rate": 9.306304652671935e-07, "loss": 0.9845, "step": 350 }, { "epoch": 0.66, "eval_loss": 0.9698730707168579, "eval_runtime": 51.5942, "eval_samples_per_second": 1314.334, "eval_steps_per_second": 2.578, "step": 350 }, { "epoch": 0.68, "learning_rate": 9.352117139888513e-07, "loss": 0.9585, "step": 360 }, { "epoch": 0.68, "eval_loss": 0.9684355854988098, "eval_runtime": 51.6809, "eval_samples_per_second": 1312.128, "eval_steps_per_second": 2.573, "step": 360 }, { "epoch": 0.7, "learning_rate": 9.396649779859618e-07, "loss": 0.9645, "step": 370 }, { "epoch": 0.7, "eval_loss": 0.9670720100402832, "eval_runtime": 51.5257, "eval_samples_per_second": 1316.082, "eval_steps_per_second": 2.581, "step": 370 }, { "epoch": 0.72, "learning_rate": 9.439972142709402e-07, "loss": 0.9727, "step": 380 }, { "epoch": 0.72, "eval_loss": 0.9656769633293152, "eval_runtime": 51.5615, "eval_samples_per_second": 1315.169, "eval_steps_per_second": 2.579, "step": 380 }, { "epoch": 0.74, "learning_rate": 9.48214827578075e-07, "loss": 0.9581, "step": 390 }, { "epoch": 0.74, "eval_loss": 0.9643924236297607, "eval_runtime": 51.4727, "eval_samples_per_second": 1317.436, "eval_steps_per_second": 2.584, "step": 390 }, { "epoch": 0.75, "learning_rate": 9.523237273160609e-07, "loss": 0.9786, "step": 400 }, { "epoch": 0.75, "eval_loss": 0.9630870223045349, "eval_runtime": 51.6713, "eval_samples_per_second": 1312.374, "eval_steps_per_second": 2.574, "step": 400 }, { "epoch": 0.77, "learning_rate": 9.563293773632721e-07, "loss": 0.9654, "step": 410 }, { "epoch": 0.77, "eval_loss": 0.9617934823036194, "eval_runtime": 51.6036, "eval_samples_per_second": 1314.093, "eval_steps_per_second": 2.577, "step": 410 }, { "epoch": 0.79, "learning_rate": 9.602368397587203e-07, "loss": 0.9526, "step": 420 }, { "epoch": 0.79, "eval_loss": 0.960529625415802, "eval_runtime": 51.5066, "eval_samples_per_second": 1316.569, "eval_steps_per_second": 2.582, "step": 420 }, { "epoch": 0.81, "learning_rate": 9.640508131652252e-07, "loss": 0.9611, "step": 430 }, { "epoch": 0.81, "eval_loss": 0.9593038558959961, "eval_runtime": 51.676, "eval_samples_per_second": 1312.254, "eval_steps_per_second": 2.574, "step": 430 }, { "epoch": 0.83, "learning_rate": 9.677756668379226e-07, "loss": 0.9744, "step": 440 }, { "epoch": 0.83, "eval_loss": 0.9580965638160706, "eval_runtime": 51.6776, "eval_samples_per_second": 1312.214, "eval_steps_per_second": 2.574, "step": 440 }, { "epoch": 0.85, "learning_rate": 9.714154707140102e-07, "loss": 0.9636, "step": 450 }, { "epoch": 0.85, "eval_loss": 0.9569092392921448, "eval_runtime": 51.5523, "eval_samples_per_second": 1315.402, "eval_steps_per_second": 2.58, "step": 450 }, { "epoch": 0.87, "learning_rate": 9.749740221433787e-07, "loss": 0.9291, "step": 460 }, { "epoch": 0.87, "eval_loss": 0.955761194229126, "eval_runtime": 51.6324, "eval_samples_per_second": 1313.362, "eval_steps_per_second": 2.576, "step": 460 }, { "epoch": 0.89, "learning_rate": 9.784548697003153e-07, "loss": 0.9581, "step": 470 }, { "epoch": 0.89, "eval_loss": 0.954668402671814, "eval_runtime": 51.5081, "eval_samples_per_second": 1316.531, "eval_steps_per_second": 2.582, "step": 470 }, { "epoch": 0.91, "learning_rate": 9.818613344506044e-07, "loss": 0.9298, "step": 480 }, { "epoch": 0.91, "eval_loss": 0.9535388946533203, "eval_runtime": 51.7629, "eval_samples_per_second": 1310.049, "eval_steps_per_second": 2.569, "step": 480 }, { "epoch": 0.92, "learning_rate": 9.851965289935241e-07, "loss": 0.9389, "step": 490 }, { "epoch": 0.92, "eval_loss": 0.9524605870246887, "eval_runtime": 51.5702, "eval_samples_per_second": 1314.945, "eval_steps_per_second": 2.579, "step": 490 }, { "epoch": 0.94, "learning_rate": 9.884633745523852e-07, "loss": 0.9664, "step": 500 }, { "epoch": 0.94, "eval_loss": 0.9513717889785767, "eval_runtime": 51.687, "eval_samples_per_second": 1311.975, "eval_steps_per_second": 2.573, "step": 500 }, { "epoch": 0.96, "learning_rate": 9.91664616348788e-07, "loss": 0.9609, "step": 510 }, { "epoch": 0.96, "eval_loss": 0.9503200054168701, "eval_runtime": 51.6496, "eval_samples_per_second": 1312.923, "eval_steps_per_second": 2.575, "step": 510 }, { "epoch": 0.98, "learning_rate": 9.948028374633558e-07, "loss": 0.9619, "step": 520 }, { "epoch": 0.98, "eval_loss": 0.9492778778076172, "eval_runtime": 51.8222, "eval_samples_per_second": 1308.552, "eval_steps_per_second": 2.566, "step": 520 }, { "epoch": 1.0, "learning_rate": 9.978804713582987e-07, "loss": 0.9344, "step": 530 }, { "epoch": 1.0, "eval_loss": 0.9482750296592712, "eval_runtime": 51.4982, "eval_samples_per_second": 1316.785, "eval_steps_per_second": 2.583, "step": 530 }, { "epoch": 1.02, "learning_rate": 1e-06, "loss": 0.9296, "step": 540 }, { "epoch": 1.02, "eval_loss": 0.947296142578125, "eval_runtime": 51.6009, "eval_samples_per_second": 1314.163, "eval_steps_per_second": 2.577, "step": 540 }, { "epoch": 1.04, "learning_rate": 1e-06, "loss": 0.9576, "step": 550 }, { "epoch": 1.04, "eval_loss": 0.9463191032409668, "eval_runtime": 51.5571, "eval_samples_per_second": 1315.281, "eval_steps_per_second": 2.58, "step": 550 }, { "epoch": 1.06, "learning_rate": 1e-06, "loss": 0.9506, "step": 560 }, { "epoch": 1.06, "eval_loss": 0.945310115814209, "eval_runtime": 51.6349, "eval_samples_per_second": 1313.297, "eval_steps_per_second": 2.576, "step": 560 }, { "epoch": 1.08, "learning_rate": 1e-06, "loss": 0.936, "step": 570 }, { "epoch": 1.08, "eval_loss": 0.9444094300270081, "eval_runtime": 51.5323, "eval_samples_per_second": 1315.912, "eval_steps_per_second": 2.581, "step": 570 }, { "epoch": 1.09, "learning_rate": 1e-06, "loss": 0.9286, "step": 580 }, { "epoch": 1.09, "eval_loss": 0.9434741139411926, "eval_runtime": 51.602, "eval_samples_per_second": 1314.134, "eval_steps_per_second": 2.577, "step": 580 }, { "epoch": 1.11, "learning_rate": 1e-06, "loss": 0.9528, "step": 590 }, { "epoch": 1.11, "eval_loss": 0.9425981044769287, "eval_runtime": 51.5723, "eval_samples_per_second": 1314.893, "eval_steps_per_second": 2.579, "step": 590 }, { "epoch": 1.13, "learning_rate": 1e-06, "loss": 0.9334, "step": 600 }, { "epoch": 1.13, "eval_loss": 0.9417036771774292, "eval_runtime": 51.6826, "eval_samples_per_second": 1312.086, "eval_steps_per_second": 2.573, "step": 600 }, { "epoch": 1.15, "learning_rate": 1e-06, "loss": 0.9337, "step": 610 }, { "epoch": 1.15, "eval_loss": 0.9408856630325317, "eval_runtime": 51.611, "eval_samples_per_second": 1313.905, "eval_steps_per_second": 2.577, "step": 610 }, { "epoch": 1.17, "learning_rate": 1e-06, "loss": 0.9417, "step": 620 }, { "epoch": 1.17, "eval_loss": 0.9400270581245422, "eval_runtime": 51.7034, "eval_samples_per_second": 1311.558, "eval_steps_per_second": 2.572, "step": 620 }, { "epoch": 1.19, "learning_rate": 1e-06, "loss": 0.9356, "step": 630 }, { "epoch": 1.19, "eval_loss": 0.9391804933547974, "eval_runtime": 51.5713, "eval_samples_per_second": 1314.918, "eval_steps_per_second": 2.579, "step": 630 }, { "epoch": 1.21, "learning_rate": 1e-06, "loss": 0.9523, "step": 640 }, { "epoch": 1.21, "eval_loss": 0.9383418560028076, "eval_runtime": 51.7082, "eval_samples_per_second": 1311.436, "eval_steps_per_second": 2.572, "step": 640 }, { "epoch": 1.23, "learning_rate": 1e-06, "loss": 0.9212, "step": 650 }, { "epoch": 1.23, "eval_loss": 0.9375633597373962, "eval_runtime": 51.6214, "eval_samples_per_second": 1313.64, "eval_steps_per_second": 2.576, "step": 650 }, { "epoch": 1.25, "learning_rate": 1e-06, "loss": 0.9317, "step": 660 }, { "epoch": 1.25, "eval_loss": 0.9367907047271729, "eval_runtime": 51.6435, "eval_samples_per_second": 1313.079, "eval_steps_per_second": 2.575, "step": 660 }, { "epoch": 1.26, "learning_rate": 1e-06, "loss": 0.9242, "step": 670 }, { "epoch": 1.26, "eval_loss": 0.9359526634216309, "eval_runtime": 51.6499, "eval_samples_per_second": 1312.916, "eval_steps_per_second": 2.575, "step": 670 }, { "epoch": 1.28, "learning_rate": 1e-06, "loss": 0.9316, "step": 680 }, { "epoch": 1.28, "eval_loss": 0.9352383017539978, "eval_runtime": 51.6428, "eval_samples_per_second": 1313.098, "eval_steps_per_second": 2.575, "step": 680 }, { "epoch": 1.3, "learning_rate": 1e-06, "loss": 0.9391, "step": 690 }, { "epoch": 1.3, "eval_loss": 0.9344274401664734, "eval_runtime": 51.5062, "eval_samples_per_second": 1316.58, "eval_steps_per_second": 2.582, "step": 690 }, { "epoch": 1.32, "learning_rate": 1e-06, "loss": 0.9437, "step": 700 }, { "epoch": 1.32, "eval_loss": 0.9337021708488464, "eval_runtime": 51.6761, "eval_samples_per_second": 1312.252, "eval_steps_per_second": 2.574, "step": 700 }, { "epoch": 1.34, "learning_rate": 1e-06, "loss": 0.9347, "step": 710 }, { "epoch": 1.34, "eval_loss": 0.9328891634941101, "eval_runtime": 51.7055, "eval_samples_per_second": 1311.504, "eval_steps_per_second": 2.572, "step": 710 }, { "epoch": 1.36, "learning_rate": 1e-06, "loss": 0.9344, "step": 720 }, { "epoch": 1.36, "eval_loss": 0.9321677088737488, "eval_runtime": 51.5434, "eval_samples_per_second": 1315.629, "eval_steps_per_second": 2.58, "step": 720 }, { "epoch": 1.38, "learning_rate": 1e-06, "loss": 0.9082, "step": 730 }, { "epoch": 1.38, "eval_loss": 0.9314926266670227, "eval_runtime": 51.5975, "eval_samples_per_second": 1314.249, "eval_steps_per_second": 2.578, "step": 730 }, { "epoch": 1.4, "learning_rate": 1e-06, "loss": 0.9152, "step": 740 }, { "epoch": 1.4, "eval_loss": 0.9307209253311157, "eval_runtime": 51.6257, "eval_samples_per_second": 1313.531, "eval_steps_per_second": 2.576, "step": 740 }, { "epoch": 1.42, "learning_rate": 1e-06, "loss": 0.9178, "step": 750 }, { "epoch": 1.42, "eval_loss": 0.9300168752670288, "eval_runtime": 51.5343, "eval_samples_per_second": 1315.862, "eval_steps_per_second": 2.581, "step": 750 }, { "epoch": 1.43, "learning_rate": 1e-06, "loss": 0.9321, "step": 760 }, { "epoch": 1.43, "eval_loss": 0.9293088912963867, "eval_runtime": 51.5854, "eval_samples_per_second": 1314.559, "eval_steps_per_second": 2.578, "step": 760 }, { "epoch": 1.45, "learning_rate": 1e-06, "loss": 0.9239, "step": 770 }, { "epoch": 1.45, "eval_loss": 0.9286373853683472, "eval_runtime": 51.6726, "eval_samples_per_second": 1312.34, "eval_steps_per_second": 2.574, "step": 770 }, { "epoch": 1.47, "learning_rate": 1e-06, "loss": 0.9347, "step": 780 }, { "epoch": 1.47, "eval_loss": 0.927922248840332, "eval_runtime": 51.653, "eval_samples_per_second": 1312.837, "eval_steps_per_second": 2.575, "step": 780 }, { "epoch": 1.49, "learning_rate": 1e-06, "loss": 0.9029, "step": 790 }, { "epoch": 1.49, "eval_loss": 0.9272500276565552, "eval_runtime": 51.8063, "eval_samples_per_second": 1308.953, "eval_steps_per_second": 2.567, "step": 790 }, { "epoch": 1.51, "learning_rate": 1e-06, "loss": 0.9222, "step": 800 }, { "epoch": 1.51, "eval_loss": 0.9265542030334473, "eval_runtime": 51.6947, "eval_samples_per_second": 1311.778, "eval_steps_per_second": 2.573, "step": 800 }, { "epoch": 1.53, "learning_rate": 1e-06, "loss": 0.9209, "step": 810 }, { "epoch": 1.53, "eval_loss": 0.9258936643600464, "eval_runtime": 51.8655, "eval_samples_per_second": 1307.459, "eval_steps_per_second": 2.564, "step": 810 }, { "epoch": 1.55, "learning_rate": 1e-06, "loss": 0.9287, "step": 820 }, { "epoch": 1.55, "eval_loss": 0.9251777529716492, "eval_runtime": 51.6265, "eval_samples_per_second": 1313.511, "eval_steps_per_second": 2.576, "step": 820 }, { "epoch": 1.57, "learning_rate": 1e-06, "loss": 0.9214, "step": 830 }, { "epoch": 1.57, "eval_loss": 0.9245129227638245, "eval_runtime": 51.6754, "eval_samples_per_second": 1312.268, "eval_steps_per_second": 2.574, "step": 830 }, { "epoch": 1.58, "learning_rate": 1e-06, "loss": 0.9029, "step": 840 }, { "epoch": 1.58, "eval_loss": 0.9238165020942688, "eval_runtime": 51.7762, "eval_samples_per_second": 1309.713, "eval_steps_per_second": 2.569, "step": 840 }, { "epoch": 1.6, "learning_rate": 1e-06, "loss": 0.9303, "step": 850 }, { "epoch": 1.6, "eval_loss": 0.9232094883918762, "eval_runtime": 51.813, "eval_samples_per_second": 1308.783, "eval_steps_per_second": 2.567, "step": 850 }, { "epoch": 1.62, "learning_rate": 1e-06, "loss": 0.9262, "step": 860 }, { "epoch": 1.62, "eval_loss": 0.9224840998649597, "eval_runtime": 51.6696, "eval_samples_per_second": 1312.415, "eval_steps_per_second": 2.574, "step": 860 }, { "epoch": 1.64, "learning_rate": 1e-06, "loss": 0.9187, "step": 870 }, { "epoch": 1.64, "eval_loss": 0.9218891859054565, "eval_runtime": 51.7308, "eval_samples_per_second": 1310.862, "eval_steps_per_second": 2.571, "step": 870 }, { "epoch": 1.66, "learning_rate": 1e-06, "loss": 0.9177, "step": 880 }, { "epoch": 1.66, "eval_loss": 0.9212586283683777, "eval_runtime": 51.8721, "eval_samples_per_second": 1307.291, "eval_steps_per_second": 2.564, "step": 880 }, { "epoch": 1.68, "learning_rate": 1e-06, "loss": 0.9138, "step": 890 }, { "epoch": 1.68, "eval_loss": 0.9205953478813171, "eval_runtime": 51.7717, "eval_samples_per_second": 1309.827, "eval_steps_per_second": 2.569, "step": 890 }, { "epoch": 1.7, "learning_rate": 1e-06, "loss": 0.9097, "step": 900 }, { "epoch": 1.7, "eval_loss": 0.9199565052986145, "eval_runtime": 51.8761, "eval_samples_per_second": 1307.192, "eval_steps_per_second": 2.564, "step": 900 }, { "epoch": 1.72, "learning_rate": 1e-06, "loss": 0.9063, "step": 910 }, { "epoch": 1.72, "eval_loss": 0.9193738698959351, "eval_runtime": 51.7714, "eval_samples_per_second": 1309.836, "eval_steps_per_second": 2.569, "step": 910 }, { "epoch": 1.74, "learning_rate": 1e-06, "loss": 0.92, "step": 920 }, { "epoch": 1.74, "eval_loss": 0.9187218546867371, "eval_runtime": 51.6954, "eval_samples_per_second": 1311.761, "eval_steps_per_second": 2.573, "step": 920 }, { "epoch": 1.75, "learning_rate": 1e-06, "loss": 0.9411, "step": 930 }, { "epoch": 1.75, "eval_loss": 0.9181123971939087, "eval_runtime": 52.3612, "eval_samples_per_second": 1295.082, "eval_steps_per_second": 2.54, "step": 930 }, { "epoch": 1.77, "learning_rate": 1e-06, "loss": 0.9235, "step": 940 }, { "epoch": 1.77, "eval_loss": 0.9175041317939758, "eval_runtime": 51.7288, "eval_samples_per_second": 1310.913, "eval_steps_per_second": 2.571, "step": 940 }, { "epoch": 1.79, "learning_rate": 1e-06, "loss": 0.9, "step": 950 }, { "epoch": 1.79, "eval_loss": 0.9168665409088135, "eval_runtime": 51.7948, "eval_samples_per_second": 1309.242, "eval_steps_per_second": 2.568, "step": 950 }, { "epoch": 1.81, "learning_rate": 1e-06, "loss": 0.9148, "step": 960 }, { "epoch": 1.81, "eval_loss": 0.9162449240684509, "eval_runtime": 51.6549, "eval_samples_per_second": 1312.789, "eval_steps_per_second": 2.575, "step": 960 }, { "epoch": 1.83, "learning_rate": 1e-06, "loss": 0.9064, "step": 970 }, { "epoch": 1.83, "eval_loss": 0.9156414270401001, "eval_runtime": 51.9765, "eval_samples_per_second": 1304.667, "eval_steps_per_second": 2.559, "step": 970 }, { "epoch": 1.85, "learning_rate": 1e-06, "loss": 0.9267, "step": 980 }, { "epoch": 1.85, "eval_loss": 0.9150117635726929, "eval_runtime": 51.8206, "eval_samples_per_second": 1308.593, "eval_steps_per_second": 2.567, "step": 980 }, { "epoch": 1.87, "learning_rate": 1e-06, "loss": 0.9125, "step": 990 }, { "epoch": 1.87, "eval_loss": 0.9144014716148376, "eval_runtime": 51.936, "eval_samples_per_second": 1305.684, "eval_steps_per_second": 2.561, "step": 990 }, { "epoch": 1.89, "learning_rate": 1e-06, "loss": 0.9263, "step": 1000 }, { "epoch": 1.89, "eval_loss": 0.9138918519020081, "eval_runtime": 51.687, "eval_samples_per_second": 1311.974, "eval_steps_per_second": 2.573, "step": 1000 }, { "epoch": 1.91, "learning_rate": 1e-06, "loss": 0.9124, "step": 1010 }, { "epoch": 1.91, "eval_loss": 0.9132450222969055, "eval_runtime": 51.6969, "eval_samples_per_second": 1311.722, "eval_steps_per_second": 2.573, "step": 1010 }, { "epoch": 1.92, "learning_rate": 1e-06, "loss": 0.9025, "step": 1020 }, { "epoch": 1.92, "eval_loss": 0.9127308130264282, "eval_runtime": 51.7421, "eval_samples_per_second": 1310.578, "eval_steps_per_second": 2.57, "step": 1020 }, { "epoch": 1.94, "learning_rate": 1e-06, "loss": 0.916, "step": 1030 }, { "epoch": 1.94, "eval_loss": 0.9121115803718567, "eval_runtime": 51.779, "eval_samples_per_second": 1309.642, "eval_steps_per_second": 2.569, "step": 1030 }, { "epoch": 1.96, "learning_rate": 1e-06, "loss": 0.8957, "step": 1040 }, { "epoch": 1.96, "eval_loss": 0.9115496873855591, "eval_runtime": 51.7016, "eval_samples_per_second": 1311.604, "eval_steps_per_second": 2.572, "step": 1040 }, { "epoch": 1.98, "learning_rate": 1e-06, "loss": 0.9107, "step": 1050 }, { "epoch": 1.98, "eval_loss": 0.9109433889389038, "eval_runtime": 51.7718, "eval_samples_per_second": 1309.826, "eval_steps_per_second": 2.569, "step": 1050 }, { "epoch": 2.0, "learning_rate": 1e-06, "loss": 0.9072, "step": 1060 }, { "epoch": 2.0, "eval_loss": 0.9104679226875305, "eval_runtime": 51.7958, "eval_samples_per_second": 1309.218, "eval_steps_per_second": 2.568, "step": 1060 }, { "epoch": 2.02, "learning_rate": 1e-06, "loss": 0.9009, "step": 1070 }, { "epoch": 2.02, "eval_loss": 0.9099251627922058, "eval_runtime": 51.7577, "eval_samples_per_second": 1310.181, "eval_steps_per_second": 2.57, "step": 1070 }, { "epoch": 2.04, "learning_rate": 1e-06, "loss": 0.9208, "step": 1080 }, { "epoch": 2.04, "eval_loss": 0.9094162583351135, "eval_runtime": 51.7631, "eval_samples_per_second": 1310.046, "eval_steps_per_second": 2.569, "step": 1080 }, { "epoch": 2.06, "learning_rate": 1e-06, "loss": 0.9114, "step": 1090 }, { "epoch": 2.06, "eval_loss": 0.908789873123169, "eval_runtime": 51.6814, "eval_samples_per_second": 1312.116, "eval_steps_per_second": 2.573, "step": 1090 }, { "epoch": 2.08, "learning_rate": 1e-06, "loss": 0.8926, "step": 1100 }, { "epoch": 2.08, "eval_loss": 0.9083024859428406, "eval_runtime": 51.7598, "eval_samples_per_second": 1310.128, "eval_steps_per_second": 2.57, "step": 1100 }, { "epoch": 2.09, "learning_rate": 1e-06, "loss": 0.9126, "step": 1110 }, { "epoch": 2.09, "eval_loss": 0.9077491760253906, "eval_runtime": 51.7667, "eval_samples_per_second": 1309.954, "eval_steps_per_second": 2.569, "step": 1110 }, { "epoch": 2.11, "learning_rate": 1e-06, "loss": 0.9111, "step": 1120 }, { "epoch": 2.11, "eval_loss": 0.9072958827018738, "eval_runtime": 51.7977, "eval_samples_per_second": 1309.17, "eval_steps_per_second": 2.568, "step": 1120 }, { "epoch": 2.13, "learning_rate": 1e-06, "loss": 0.8893, "step": 1130 }, { "epoch": 2.13, "eval_loss": 0.9067880511283875, "eval_runtime": 51.8164, "eval_samples_per_second": 1308.698, "eval_steps_per_second": 2.567, "step": 1130 }, { "epoch": 2.15, "learning_rate": 1e-06, "loss": 0.9091, "step": 1140 }, { "epoch": 2.15, "eval_loss": 0.9062515497207642, "eval_runtime": 51.7521, "eval_samples_per_second": 1310.323, "eval_steps_per_second": 2.57, "step": 1140 }, { "epoch": 2.17, "learning_rate": 1e-06, "loss": 0.88, "step": 1150 }, { "epoch": 2.17, "eval_loss": 0.9057653546333313, "eval_runtime": 51.8093, "eval_samples_per_second": 1308.876, "eval_steps_per_second": 2.567, "step": 1150 }, { "epoch": 2.19, "learning_rate": 1e-06, "loss": 0.8956, "step": 1160 }, { "epoch": 2.19, "eval_loss": 0.9052146077156067, "eval_runtime": 51.6703, "eval_samples_per_second": 1312.399, "eval_steps_per_second": 2.574, "step": 1160 }, { "epoch": 2.21, "learning_rate": 1e-06, "loss": 0.9016, "step": 1170 }, { "epoch": 2.21, "eval_loss": 0.904728353023529, "eval_runtime": 51.8128, "eval_samples_per_second": 1308.788, "eval_steps_per_second": 2.567, "step": 1170 }, { "epoch": 2.23, "learning_rate": 1e-06, "loss": 0.8985, "step": 1180 }, { "epoch": 2.23, "eval_loss": 0.9042342901229858, "eval_runtime": 51.7265, "eval_samples_per_second": 1310.973, "eval_steps_per_second": 2.571, "step": 1180 }, { "epoch": 2.25, "learning_rate": 1e-06, "loss": 0.888, "step": 1190 }, { "epoch": 2.25, "eval_loss": 0.9037690162658691, "eval_runtime": 51.6956, "eval_samples_per_second": 1311.755, "eval_steps_per_second": 2.573, "step": 1190 }, { "epoch": 2.26, "learning_rate": 1e-06, "loss": 0.887, "step": 1200 }, { "epoch": 2.26, "eval_loss": 0.9033035039901733, "eval_runtime": 51.861, "eval_samples_per_second": 1307.571, "eval_steps_per_second": 2.565, "step": 1200 }, { "epoch": 2.28, "learning_rate": 1e-06, "loss": 0.8888, "step": 1210 }, { "epoch": 2.28, "eval_loss": 0.9027730226516724, "eval_runtime": 51.6531, "eval_samples_per_second": 1312.834, "eval_steps_per_second": 2.575, "step": 1210 }, { "epoch": 2.3, "learning_rate": 1e-06, "loss": 0.8921, "step": 1220 }, { "epoch": 2.3, "eval_loss": 0.9023709893226624, "eval_runtime": 51.638, "eval_samples_per_second": 1313.219, "eval_steps_per_second": 2.576, "step": 1220 }, { "epoch": 2.32, "learning_rate": 1e-06, "loss": 0.8952, "step": 1230 }, { "epoch": 2.32, "eval_loss": 0.9018809795379639, "eval_runtime": 51.8323, "eval_samples_per_second": 1308.296, "eval_steps_per_second": 2.566, "step": 1230 }, { "epoch": 2.34, "learning_rate": 1e-06, "loss": 0.8897, "step": 1240 }, { "epoch": 2.34, "eval_loss": 0.90138179063797, "eval_runtime": 51.672, "eval_samples_per_second": 1312.354, "eval_steps_per_second": 2.574, "step": 1240 }, { "epoch": 2.36, "learning_rate": 1e-06, "loss": 0.897, "step": 1250 }, { "epoch": 2.36, "eval_loss": 0.9009556174278259, "eval_runtime": 51.8076, "eval_samples_per_second": 1308.921, "eval_steps_per_second": 2.567, "step": 1250 }, { "epoch": 2.38, "learning_rate": 1e-06, "loss": 0.8928, "step": 1260 }, { "epoch": 2.38, "eval_loss": 0.900455117225647, "eval_runtime": 51.7097, "eval_samples_per_second": 1311.397, "eval_steps_per_second": 2.572, "step": 1260 }, { "epoch": 2.4, "learning_rate": 1e-06, "loss": 0.9051, "step": 1270 }, { "epoch": 2.4, "eval_loss": 0.9000570774078369, "eval_runtime": 51.76, "eval_samples_per_second": 1310.123, "eval_steps_per_second": 2.57, "step": 1270 }, { "epoch": 2.42, "learning_rate": 1e-06, "loss": 0.8985, "step": 1280 }, { "epoch": 2.42, "eval_loss": 0.8996244072914124, "eval_runtime": 51.7876, "eval_samples_per_second": 1309.425, "eval_steps_per_second": 2.568, "step": 1280 }, { "epoch": 2.43, "learning_rate": 1e-06, "loss": 0.9074, "step": 1290 }, { "epoch": 2.43, "eval_loss": 0.899140477180481, "eval_runtime": 51.7059, "eval_samples_per_second": 1311.495, "eval_steps_per_second": 2.572, "step": 1290 }, { "epoch": 2.45, "learning_rate": 1e-06, "loss": 0.8832, "step": 1300 }, { "epoch": 2.45, "eval_loss": 0.8987085819244385, "eval_runtime": 51.8508, "eval_samples_per_second": 1307.829, "eval_steps_per_second": 2.565, "step": 1300 }, { "epoch": 2.47, "learning_rate": 1e-06, "loss": 0.8708, "step": 1310 }, { "epoch": 2.47, "eval_loss": 0.8981704711914062, "eval_runtime": 51.6881, "eval_samples_per_second": 1311.947, "eval_steps_per_second": 2.573, "step": 1310 }, { "epoch": 2.49, "learning_rate": 1e-06, "loss": 0.901, "step": 1320 }, { "epoch": 2.49, "eval_loss": 0.8977486491203308, "eval_runtime": 51.6963, "eval_samples_per_second": 1311.739, "eval_steps_per_second": 2.573, "step": 1320 }, { "epoch": 2.51, "learning_rate": 1e-06, "loss": 0.8835, "step": 1330 }, { "epoch": 2.51, "eval_loss": 0.8974390625953674, "eval_runtime": 51.6765, "eval_samples_per_second": 1312.241, "eval_steps_per_second": 2.574, "step": 1330 }, { "epoch": 2.53, "learning_rate": 1e-06, "loss": 0.8897, "step": 1340 }, { "epoch": 2.53, "eval_loss": 0.8968603014945984, "eval_runtime": 51.64, "eval_samples_per_second": 1313.168, "eval_steps_per_second": 2.576, "step": 1340 }, { "epoch": 2.55, "learning_rate": 1e-06, "loss": 0.8893, "step": 1350 }, { "epoch": 2.55, "eval_loss": 0.8964337706565857, "eval_runtime": 51.7386, "eval_samples_per_second": 1310.665, "eval_steps_per_second": 2.571, "step": 1350 }, { "epoch": 2.57, "learning_rate": 1e-06, "loss": 0.8809, "step": 1360 }, { "epoch": 2.57, "eval_loss": 0.8959853649139404, "eval_runtime": 51.7398, "eval_samples_per_second": 1310.634, "eval_steps_per_second": 2.571, "step": 1360 }, { "epoch": 2.58, "learning_rate": 1e-06, "loss": 0.8977, "step": 1370 }, { "epoch": 2.58, "eval_loss": 0.8955875039100647, "eval_runtime": 51.8409, "eval_samples_per_second": 1308.08, "eval_steps_per_second": 2.566, "step": 1370 }, { "epoch": 2.6, "learning_rate": 1e-06, "loss": 0.8948, "step": 1380 }, { "epoch": 2.6, "eval_loss": 0.895256519317627, "eval_runtime": 51.6804, "eval_samples_per_second": 1312.142, "eval_steps_per_second": 2.574, "step": 1380 }, { "epoch": 2.62, "learning_rate": 1e-06, "loss": 0.8851, "step": 1390 }, { "epoch": 2.62, "eval_loss": 0.8946732878684998, "eval_runtime": 51.6288, "eval_samples_per_second": 1313.454, "eval_steps_per_second": 2.576, "step": 1390 }, { "epoch": 2.64, "learning_rate": 1e-06, "loss": 0.8875, "step": 1400 }, { "epoch": 2.64, "eval_loss": 0.8942922353744507, "eval_runtime": 51.8893, "eval_samples_per_second": 1306.859, "eval_steps_per_second": 2.563, "step": 1400 }, { "epoch": 2.66, "learning_rate": 1e-06, "loss": 0.8953, "step": 1410 }, { "epoch": 2.66, "eval_loss": 0.8939051032066345, "eval_runtime": 51.8333, "eval_samples_per_second": 1308.272, "eval_steps_per_second": 2.566, "step": 1410 }, { "epoch": 2.68, "learning_rate": 1e-06, "loss": 0.8709, "step": 1420 }, { "epoch": 2.68, "eval_loss": 0.8934805393218994, "eval_runtime": 51.6417, "eval_samples_per_second": 1313.125, "eval_steps_per_second": 2.575, "step": 1420 }, { "epoch": 2.7, "learning_rate": 1e-06, "loss": 0.8821, "step": 1430 }, { "epoch": 2.7, "eval_loss": 0.8930464386940002, "eval_runtime": 51.8812, "eval_samples_per_second": 1307.063, "eval_steps_per_second": 2.564, "step": 1430 }, { "epoch": 2.72, "learning_rate": 1e-06, "loss": 0.8925, "step": 1440 }, { "epoch": 2.72, "eval_loss": 0.8925924301147461, "eval_runtime": 51.6652, "eval_samples_per_second": 1312.527, "eval_steps_per_second": 2.574, "step": 1440 }, { "epoch": 2.74, "learning_rate": 1e-06, "loss": 0.8939, "step": 1450 }, { "epoch": 2.74, "eval_loss": 0.8922019004821777, "eval_runtime": 51.7786, "eval_samples_per_second": 1309.654, "eval_steps_per_second": 2.569, "step": 1450 }, { "epoch": 2.75, "learning_rate": 1e-06, "loss": 0.8887, "step": 1460 }, { "epoch": 2.75, "eval_loss": 0.8918561339378357, "eval_runtime": 51.7061, "eval_samples_per_second": 1311.489, "eval_steps_per_second": 2.572, "step": 1460 }, { "epoch": 2.77, "learning_rate": 1e-06, "loss": 0.8842, "step": 1470 }, { "epoch": 2.77, "eval_loss": 0.8913816213607788, "eval_runtime": 51.9169, "eval_samples_per_second": 1306.164, "eval_steps_per_second": 2.562, "step": 1470 }, { "epoch": 2.79, "learning_rate": 1e-06, "loss": 0.8752, "step": 1480 }, { "epoch": 2.79, "eval_loss": 0.8909953236579895, "eval_runtime": 51.8089, "eval_samples_per_second": 1308.888, "eval_steps_per_second": 2.567, "step": 1480 }, { "epoch": 2.81, "learning_rate": 1e-06, "loss": 0.8755, "step": 1490 }, { "epoch": 2.81, "eval_loss": 0.8905987739562988, "eval_runtime": 51.6731, "eval_samples_per_second": 1312.328, "eval_steps_per_second": 2.574, "step": 1490 }, { "epoch": 2.83, "learning_rate": 1e-06, "loss": 0.8804, "step": 1500 }, { "epoch": 2.83, "eval_loss": 0.8901605606079102, "eval_runtime": 51.8047, "eval_samples_per_second": 1308.993, "eval_steps_per_second": 2.567, "step": 1500 }, { "epoch": 2.85, "learning_rate": 1e-06, "loss": 0.8881, "step": 1510 }, { "epoch": 2.85, "eval_loss": 0.8897448778152466, "eval_runtime": 51.7378, "eval_samples_per_second": 1310.687, "eval_steps_per_second": 2.571, "step": 1510 }, { "epoch": 2.87, "learning_rate": 1e-06, "loss": 0.8674, "step": 1520 }, { "epoch": 2.87, "eval_loss": 0.8893808126449585, "eval_runtime": 51.6928, "eval_samples_per_second": 1311.827, "eval_steps_per_second": 2.573, "step": 1520 }, { "epoch": 2.89, "learning_rate": 1e-06, "loss": 0.8718, "step": 1530 }, { "epoch": 2.89, "eval_loss": 0.8889936804771423, "eval_runtime": 51.6709, "eval_samples_per_second": 1312.382, "eval_steps_per_second": 2.574, "step": 1530 }, { "epoch": 2.91, "learning_rate": 1e-06, "loss": 0.8867, "step": 1540 }, { "epoch": 2.91, "eval_loss": 0.8885937929153442, "eval_runtime": 52.0485, "eval_samples_per_second": 1302.862, "eval_steps_per_second": 2.555, "step": 1540 }, { "epoch": 2.92, "learning_rate": 1e-06, "loss": 0.8983, "step": 1550 }, { "epoch": 2.92, "eval_loss": 0.8881903886795044, "eval_runtime": 51.779, "eval_samples_per_second": 1309.642, "eval_steps_per_second": 2.569, "step": 1550 }, { "epoch": 2.94, "learning_rate": 1e-06, "loss": 0.8769, "step": 1560 }, { "epoch": 2.94, "eval_loss": 0.8878265023231506, "eval_runtime": 51.6439, "eval_samples_per_second": 1313.07, "eval_steps_per_second": 2.575, "step": 1560 }, { "epoch": 2.96, "learning_rate": 1e-06, "loss": 0.8877, "step": 1570 }, { "epoch": 2.96, "eval_loss": 0.8874040246009827, "eval_runtime": 51.7344, "eval_samples_per_second": 1310.771, "eval_steps_per_second": 2.571, "step": 1570 }, { "epoch": 2.98, "learning_rate": 1e-06, "loss": 0.9046, "step": 1580 }, { "epoch": 2.98, "eval_loss": 0.8870102763175964, "eval_runtime": 51.7489, "eval_samples_per_second": 1310.404, "eval_steps_per_second": 2.57, "step": 1580 }, { "epoch": 3.0, "learning_rate": 1e-06, "loss": 0.8814, "step": 1590 }, { "epoch": 3.0, "eval_loss": 0.8866777420043945, "eval_runtime": 51.715, "eval_samples_per_second": 1311.264, "eval_steps_per_second": 2.572, "step": 1590 }, { "epoch": 3.02, "learning_rate": 1e-06, "loss": 0.8832, "step": 1600 }, { "epoch": 3.02, "eval_loss": 0.8863227963447571, "eval_runtime": 51.8519, "eval_samples_per_second": 1307.801, "eval_steps_per_second": 2.565, "step": 1600 }, { "epoch": 3.04, "learning_rate": 1e-06, "loss": 0.8814, "step": 1610 }, { "epoch": 3.04, "eval_loss": 0.8860304951667786, "eval_runtime": 51.7087, "eval_samples_per_second": 1311.424, "eval_steps_per_second": 2.572, "step": 1610 }, { "epoch": 3.06, "learning_rate": 1e-06, "loss": 0.8867, "step": 1620 }, { "epoch": 3.06, "eval_loss": 0.8856372237205505, "eval_runtime": 51.6572, "eval_samples_per_second": 1312.73, "eval_steps_per_second": 2.575, "step": 1620 }, { "epoch": 3.08, "learning_rate": 1e-06, "loss": 0.8873, "step": 1630 }, { "epoch": 3.08, "eval_loss": 0.8852881193161011, "eval_runtime": 51.7478, "eval_samples_per_second": 1310.433, "eval_steps_per_second": 2.57, "step": 1630 }, { "epoch": 3.09, "learning_rate": 1e-06, "loss": 0.8762, "step": 1640 }, { "epoch": 3.09, "eval_loss": 0.8849110007286072, "eval_runtime": 51.7028, "eval_samples_per_second": 1311.572, "eval_steps_per_second": 2.572, "step": 1640 }, { "epoch": 3.11, "learning_rate": 1e-06, "loss": 0.876, "step": 1650 }, { "epoch": 3.11, "eval_loss": 0.884550929069519, "eval_runtime": 51.7551, "eval_samples_per_second": 1310.248, "eval_steps_per_second": 2.57, "step": 1650 }, { "epoch": 3.13, "learning_rate": 1e-06, "loss": 0.8964, "step": 1660 }, { "epoch": 3.13, "eval_loss": 0.8842361569404602, "eval_runtime": 51.5961, "eval_samples_per_second": 1314.285, "eval_steps_per_second": 2.578, "step": 1660 }, { "epoch": 3.15, "learning_rate": 1e-06, "loss": 0.86, "step": 1670 }, { "epoch": 3.15, "eval_loss": 0.8838174343109131, "eval_runtime": 51.7753, "eval_samples_per_second": 1309.737, "eval_steps_per_second": 2.569, "step": 1670 }, { "epoch": 3.17, "learning_rate": 1e-06, "loss": 0.8848, "step": 1680 }, { "epoch": 3.17, "eval_loss": 0.883499026298523, "eval_runtime": 51.8229, "eval_samples_per_second": 1308.534, "eval_steps_per_second": 2.566, "step": 1680 }, { "epoch": 3.19, "learning_rate": 1e-06, "loss": 0.8529, "step": 1690 }, { "epoch": 3.19, "eval_loss": 0.8831055760383606, "eval_runtime": 51.8274, "eval_samples_per_second": 1308.42, "eval_steps_per_second": 2.566, "step": 1690 }, { "epoch": 3.21, "learning_rate": 1e-06, "loss": 0.8716, "step": 1700 }, { "epoch": 3.21, "eval_loss": 0.8827484250068665, "eval_runtime": 51.6637, "eval_samples_per_second": 1312.566, "eval_steps_per_second": 2.574, "step": 1700 }, { "epoch": 3.23, "learning_rate": 1e-06, "loss": 0.8513, "step": 1710 }, { "epoch": 3.23, "eval_loss": 0.8824192881584167, "eval_runtime": 51.7982, "eval_samples_per_second": 1309.156, "eval_steps_per_second": 2.568, "step": 1710 }, { "epoch": 3.25, "learning_rate": 1e-06, "loss": 0.8796, "step": 1720 }, { "epoch": 3.25, "eval_loss": 0.8820593357086182, "eval_runtime": 51.6857, "eval_samples_per_second": 1312.007, "eval_steps_per_second": 2.573, "step": 1720 }, { "epoch": 3.26, "learning_rate": 1e-06, "loss": 0.8789, "step": 1730 }, { "epoch": 3.26, "eval_loss": 0.8817575573921204, "eval_runtime": 51.6335, "eval_samples_per_second": 1313.333, "eval_steps_per_second": 2.576, "step": 1730 }, { "epoch": 3.28, "learning_rate": 1e-06, "loss": 0.8728, "step": 1740 }, { "epoch": 3.28, "eval_loss": 0.8813122510910034, "eval_runtime": 51.6387, "eval_samples_per_second": 1313.202, "eval_steps_per_second": 2.576, "step": 1740 }, { "epoch": 3.3, "learning_rate": 1e-06, "loss": 0.8735, "step": 1750 }, { "epoch": 3.3, "eval_loss": 0.8810012936592102, "eval_runtime": 51.7373, "eval_samples_per_second": 1310.7, "eval_steps_per_second": 2.571, "step": 1750 }, { "epoch": 3.32, "learning_rate": 1e-06, "loss": 0.8682, "step": 1760 }, { "epoch": 3.32, "eval_loss": 0.8805950284004211, "eval_runtime": 51.6928, "eval_samples_per_second": 1311.827, "eval_steps_per_second": 2.573, "step": 1760 }, { "epoch": 3.34, "learning_rate": 1e-06, "loss": 0.862, "step": 1770 }, { "epoch": 3.34, "eval_loss": 0.8802461624145508, "eval_runtime": 51.7119, "eval_samples_per_second": 1311.341, "eval_steps_per_second": 2.572, "step": 1770 }, { "epoch": 3.36, "learning_rate": 1e-06, "loss": 0.8673, "step": 1780 }, { "epoch": 3.36, "eval_loss": 0.8799049258232117, "eval_runtime": 51.8086, "eval_samples_per_second": 1308.894, "eval_steps_per_second": 2.567, "step": 1780 }, { "epoch": 3.38, "learning_rate": 1e-06, "loss": 0.8659, "step": 1790 }, { "epoch": 3.38, "eval_loss": 0.8796053528785706, "eval_runtime": 51.738, "eval_samples_per_second": 1310.682, "eval_steps_per_second": 2.571, "step": 1790 }, { "epoch": 3.4, "learning_rate": 1e-06, "loss": 0.8611, "step": 1800 }, { "epoch": 3.4, "eval_loss": 0.8792104125022888, "eval_runtime": 51.8854, "eval_samples_per_second": 1306.957, "eval_steps_per_second": 2.563, "step": 1800 }, { "epoch": 3.42, "learning_rate": 1e-06, "loss": 0.8703, "step": 1810 }, { "epoch": 3.42, "eval_loss": 0.878919780254364, "eval_runtime": 51.7353, "eval_samples_per_second": 1310.75, "eval_steps_per_second": 2.571, "step": 1810 }, { "epoch": 3.43, "learning_rate": 1e-06, "loss": 0.8589, "step": 1820 }, { "epoch": 3.43, "eval_loss": 0.8785597681999207, "eval_runtime": 51.7103, "eval_samples_per_second": 1311.383, "eval_steps_per_second": 2.572, "step": 1820 }, { "epoch": 3.45, "learning_rate": 1e-06, "loss": 0.8612, "step": 1830 }, { "epoch": 3.45, "eval_loss": 0.8782058954238892, "eval_runtime": 51.762, "eval_samples_per_second": 1310.072, "eval_steps_per_second": 2.569, "step": 1830 }, { "epoch": 3.47, "learning_rate": 1e-06, "loss": 0.8656, "step": 1840 }, { "epoch": 3.47, "eval_loss": 0.877843976020813, "eval_runtime": 51.6306, "eval_samples_per_second": 1313.407, "eval_steps_per_second": 2.576, "step": 1840 }, { "epoch": 3.49, "learning_rate": 1e-06, "loss": 0.8642, "step": 1850 }, { "epoch": 3.49, "eval_loss": 0.8775426745414734, "eval_runtime": 51.7629, "eval_samples_per_second": 1310.05, "eval_steps_per_second": 2.569, "step": 1850 }, { "epoch": 3.51, "learning_rate": 1e-06, "loss": 0.8794, "step": 1860 }, { "epoch": 3.51, "eval_loss": 0.8772388696670532, "eval_runtime": 51.7172, "eval_samples_per_second": 1311.207, "eval_steps_per_second": 2.572, "step": 1860 }, { "epoch": 3.53, "learning_rate": 1e-06, "loss": 0.8536, "step": 1870 }, { "epoch": 3.53, "eval_loss": 0.8769137859344482, "eval_runtime": 51.6913, "eval_samples_per_second": 1311.864, "eval_steps_per_second": 2.573, "step": 1870 }, { "epoch": 3.55, "learning_rate": 1e-06, "loss": 0.8522, "step": 1880 }, { "epoch": 3.55, "eval_loss": 0.876491367816925, "eval_runtime": 51.9434, "eval_samples_per_second": 1305.497, "eval_steps_per_second": 2.56, "step": 1880 }, { "epoch": 3.57, "learning_rate": 1e-06, "loss": 0.8699, "step": 1890 }, { "epoch": 3.57, "eval_loss": 0.8762040734291077, "eval_runtime": 51.7598, "eval_samples_per_second": 1310.129, "eval_steps_per_second": 2.57, "step": 1890 }, { "epoch": 3.58, "learning_rate": 1e-06, "loss": 0.8643, "step": 1900 }, { "epoch": 3.58, "eval_loss": 0.8758660554885864, "eval_runtime": 51.7094, "eval_samples_per_second": 1311.404, "eval_steps_per_second": 2.572, "step": 1900 }, { "epoch": 3.6, "learning_rate": 1e-06, "loss": 0.8778, "step": 1910 }, { "epoch": 3.6, "eval_loss": 0.8755151629447937, "eval_runtime": 51.938, "eval_samples_per_second": 1305.634, "eval_steps_per_second": 2.561, "step": 1910 }, { "epoch": 3.62, "learning_rate": 1e-06, "loss": 0.8712, "step": 1920 }, { "epoch": 3.62, "eval_loss": 0.8752315044403076, "eval_runtime": 52.1258, "eval_samples_per_second": 1300.929, "eval_steps_per_second": 2.552, "step": 1920 }, { "epoch": 3.64, "learning_rate": 1e-06, "loss": 0.864, "step": 1930 }, { "epoch": 3.64, "eval_loss": 0.8748995065689087, "eval_runtime": 51.7657, "eval_samples_per_second": 1309.979, "eval_steps_per_second": 2.569, "step": 1930 }, { "epoch": 3.66, "learning_rate": 1e-06, "loss": 0.8651, "step": 1940 }, { "epoch": 3.66, "eval_loss": 0.8745700716972351, "eval_runtime": 51.7789, "eval_samples_per_second": 1309.645, "eval_steps_per_second": 2.569, "step": 1940 }, { "epoch": 3.68, "learning_rate": 1e-06, "loss": 0.8406, "step": 1950 }, { "epoch": 3.68, "eval_loss": 0.8742328882217407, "eval_runtime": 51.6777, "eval_samples_per_second": 1312.21, "eval_steps_per_second": 2.574, "step": 1950 }, { "epoch": 3.7, "learning_rate": 1e-06, "loss": 0.8724, "step": 1960 }, { "epoch": 3.7, "eval_loss": 0.8738684058189392, "eval_runtime": 51.851, "eval_samples_per_second": 1307.826, "eval_steps_per_second": 2.565, "step": 1960 }, { "epoch": 3.72, "learning_rate": 1e-06, "loss": 0.861, "step": 1970 }, { "epoch": 3.72, "eval_loss": 0.8735198974609375, "eval_runtime": 51.7532, "eval_samples_per_second": 1310.296, "eval_steps_per_second": 2.57, "step": 1970 }, { "epoch": 3.74, "learning_rate": 1e-06, "loss": 0.8836, "step": 1980 }, { "epoch": 3.74, "eval_loss": 0.8732261657714844, "eval_runtime": 51.8862, "eval_samples_per_second": 1306.936, "eval_steps_per_second": 2.563, "step": 1980 }, { "epoch": 3.75, "learning_rate": 1e-06, "loss": 0.8668, "step": 1990 }, { "epoch": 3.75, "eval_loss": 0.8728705048561096, "eval_runtime": 51.8038, "eval_samples_per_second": 1309.015, "eval_steps_per_second": 2.567, "step": 1990 }, { "epoch": 3.77, "learning_rate": 1e-06, "loss": 0.8557, "step": 2000 }, { "epoch": 3.77, "eval_loss": 0.8725922107696533, "eval_runtime": 51.7963, "eval_samples_per_second": 1309.206, "eval_steps_per_second": 2.568, "step": 2000 }, { "epoch": 3.79, "learning_rate": 1e-06, "loss": 0.8662, "step": 2010 }, { "epoch": 3.79, "eval_loss": 0.872270941734314, "eval_runtime": 51.8085, "eval_samples_per_second": 1308.898, "eval_steps_per_second": 2.567, "step": 2010 }, { "epoch": 3.81, "learning_rate": 1e-06, "loss": 0.8672, "step": 2020 }, { "epoch": 3.81, "eval_loss": 0.8719350695610046, "eval_runtime": 51.9143, "eval_samples_per_second": 1306.231, "eval_steps_per_second": 2.562, "step": 2020 }, { "epoch": 3.83, "learning_rate": 1e-06, "loss": 0.8549, "step": 2030 }, { "epoch": 3.83, "eval_loss": 0.8716722130775452, "eval_runtime": 51.8011, "eval_samples_per_second": 1309.083, "eval_steps_per_second": 2.568, "step": 2030 }, { "epoch": 3.85, "learning_rate": 1e-06, "loss": 0.861, "step": 2040 }, { "epoch": 3.85, "eval_loss": 0.8712872862815857, "eval_runtime": 51.9365, "eval_samples_per_second": 1305.671, "eval_steps_per_second": 2.561, "step": 2040 }, { "epoch": 3.87, "learning_rate": 1e-06, "loss": 0.8668, "step": 2050 }, { "epoch": 3.87, "eval_loss": 0.8710207939147949, "eval_runtime": 51.7794, "eval_samples_per_second": 1309.632, "eval_steps_per_second": 2.569, "step": 2050 }, { "epoch": 3.89, "learning_rate": 1e-06, "loss": 0.8642, "step": 2060 }, { "epoch": 3.89, "eval_loss": 0.870637834072113, "eval_runtime": 51.8156, "eval_samples_per_second": 1308.717, "eval_steps_per_second": 2.567, "step": 2060 }, { "epoch": 3.91, "learning_rate": 1e-06, "loss": 0.8645, "step": 2070 }, { "epoch": 3.91, "eval_loss": 0.87038254737854, "eval_runtime": 51.9756, "eval_samples_per_second": 1304.688, "eval_steps_per_second": 2.559, "step": 2070 }, { "epoch": 3.92, "learning_rate": 1e-06, "loss": 0.853, "step": 2080 }, { "epoch": 3.92, "eval_loss": 0.8700686693191528, "eval_runtime": 51.8858, "eval_samples_per_second": 1306.946, "eval_steps_per_second": 2.563, "step": 2080 }, { "epoch": 3.94, "learning_rate": 1e-06, "loss": 0.8744, "step": 2090 }, { "epoch": 3.94, "eval_loss": 0.8697250485420227, "eval_runtime": 51.8937, "eval_samples_per_second": 1306.749, "eval_steps_per_second": 2.563, "step": 2090 }, { "epoch": 3.96, "learning_rate": 1e-06, "loss": 0.8485, "step": 2100 }, { "epoch": 3.96, "eval_loss": 0.8694667220115662, "eval_runtime": 51.8101, "eval_samples_per_second": 1308.858, "eval_steps_per_second": 2.567, "step": 2100 }, { "epoch": 3.98, "learning_rate": 1e-06, "loss": 0.8708, "step": 2110 }, { "epoch": 3.98, "eval_loss": 0.869096040725708, "eval_runtime": 51.7678, "eval_samples_per_second": 1309.925, "eval_steps_per_second": 2.569, "step": 2110 }, { "epoch": 4.0, "learning_rate": 1e-06, "loss": 0.8588, "step": 2120 }, { "epoch": 4.0, "eval_loss": 0.8688409328460693, "eval_runtime": 51.7659, "eval_samples_per_second": 1309.976, "eval_steps_per_second": 2.569, "step": 2120 }, { "epoch": 4.02, "learning_rate": 1e-06, "loss": 0.8497, "step": 2130 }, { "epoch": 4.02, "eval_loss": 0.8686378002166748, "eval_runtime": 51.9362, "eval_samples_per_second": 1305.68, "eval_steps_per_second": 2.561, "step": 2130 }, { "epoch": 4.04, "learning_rate": 1e-06, "loss": 0.8567, "step": 2140 }, { "epoch": 4.04, "eval_loss": 0.8682656288146973, "eval_runtime": 51.8823, "eval_samples_per_second": 1307.036, "eval_steps_per_second": 2.563, "step": 2140 }, { "epoch": 4.06, "learning_rate": 1e-06, "loss": 0.8727, "step": 2150 }, { "epoch": 4.06, "eval_loss": 0.8680951595306396, "eval_runtime": 51.8888, "eval_samples_per_second": 1306.872, "eval_steps_per_second": 2.563, "step": 2150 }, { "epoch": 4.08, "learning_rate": 1e-06, "loss": 0.8497, "step": 2160 }, { "epoch": 4.08, "eval_loss": 0.8676822185516357, "eval_runtime": 51.8218, "eval_samples_per_second": 1308.561, "eval_steps_per_second": 2.566, "step": 2160 }, { "epoch": 4.09, "learning_rate": 1e-06, "loss": 0.869, "step": 2170 }, { "epoch": 4.09, "eval_loss": 0.8674589395523071, "eval_runtime": 51.8359, "eval_samples_per_second": 1308.206, "eval_steps_per_second": 2.566, "step": 2170 }, { "epoch": 4.11, "learning_rate": 1e-06, "loss": 0.8496, "step": 2180 }, { "epoch": 4.11, "eval_loss": 0.8672137260437012, "eval_runtime": 51.8903, "eval_samples_per_second": 1306.834, "eval_steps_per_second": 2.563, "step": 2180 }, { "epoch": 4.13, "learning_rate": 1e-06, "loss": 0.8616, "step": 2190 }, { "epoch": 4.13, "eval_loss": 0.8669330477714539, "eval_runtime": 51.8142, "eval_samples_per_second": 1308.754, "eval_steps_per_second": 2.567, "step": 2190 }, { "epoch": 4.15, "learning_rate": 1e-06, "loss": 0.8529, "step": 2200 }, { "epoch": 4.15, "eval_loss": 0.8666622042655945, "eval_runtime": 51.9401, "eval_samples_per_second": 1305.58, "eval_steps_per_second": 2.561, "step": 2200 }, { "epoch": 4.17, "learning_rate": 1e-06, "loss": 0.8544, "step": 2210 }, { "epoch": 4.17, "eval_loss": 0.8662629127502441, "eval_runtime": 51.8488, "eval_samples_per_second": 1307.881, "eval_steps_per_second": 2.565, "step": 2210 }, { "epoch": 4.19, "learning_rate": 1e-06, "loss": 0.8557, "step": 2220 }, { "epoch": 4.19, "eval_loss": 0.8660885095596313, "eval_runtime": 51.9277, "eval_samples_per_second": 1305.892, "eval_steps_per_second": 2.561, "step": 2220 }, { "epoch": 4.21, "learning_rate": 1e-06, "loss": 0.8622, "step": 2230 }, { "epoch": 4.21, "eval_loss": 0.8657708764076233, "eval_runtime": 51.9531, "eval_samples_per_second": 1305.255, "eval_steps_per_second": 2.56, "step": 2230 }, { "epoch": 4.23, "learning_rate": 1e-06, "loss": 0.8579, "step": 2240 }, { "epoch": 4.23, "eval_loss": 0.8654367327690125, "eval_runtime": 51.8806, "eval_samples_per_second": 1307.079, "eval_steps_per_second": 2.564, "step": 2240 }, { "epoch": 4.25, "learning_rate": 1e-06, "loss": 0.8671, "step": 2250 }, { "epoch": 4.25, "eval_loss": 0.8651946783065796, "eval_runtime": 51.8436, "eval_samples_per_second": 1308.011, "eval_steps_per_second": 2.565, "step": 2250 }, { "epoch": 4.26, "learning_rate": 1e-06, "loss": 0.8518, "step": 2260 }, { "epoch": 4.26, "eval_loss": 0.8648258447647095, "eval_runtime": 51.8609, "eval_samples_per_second": 1307.574, "eval_steps_per_second": 2.565, "step": 2260 }, { "epoch": 4.28, "learning_rate": 1e-06, "loss": 0.8539, "step": 2270 }, { "epoch": 4.28, "eval_loss": 0.8645607233047485, "eval_runtime": 51.9475, "eval_samples_per_second": 1305.396, "eval_steps_per_second": 2.56, "step": 2270 }, { "epoch": 4.3, "learning_rate": 1e-06, "loss": 0.8477, "step": 2280 }, { "epoch": 4.3, "eval_loss": 0.8643002510070801, "eval_runtime": 51.8601, "eval_samples_per_second": 1307.596, "eval_steps_per_second": 2.565, "step": 2280 }, { "epoch": 4.32, "learning_rate": 1e-06, "loss": 0.8514, "step": 2290 }, { "epoch": 4.32, "eval_loss": 0.8639960885047913, "eval_runtime": 51.9219, "eval_samples_per_second": 1306.038, "eval_steps_per_second": 2.562, "step": 2290 }, { "epoch": 4.34, "learning_rate": 1e-06, "loss": 0.8541, "step": 2300 }, { "epoch": 4.34, "eval_loss": 0.8637537360191345, "eval_runtime": 51.9296, "eval_samples_per_second": 1305.844, "eval_steps_per_second": 2.561, "step": 2300 }, { "epoch": 4.36, "learning_rate": 1e-06, "loss": 0.8488, "step": 2310 }, { "epoch": 4.36, "eval_loss": 0.8634164333343506, "eval_runtime": 51.9575, "eval_samples_per_second": 1305.145, "eval_steps_per_second": 2.56, "step": 2310 }, { "epoch": 4.38, "learning_rate": 1e-06, "loss": 0.8477, "step": 2320 }, { "epoch": 4.38, "eval_loss": 0.8631356954574585, "eval_runtime": 51.9351, "eval_samples_per_second": 1305.706, "eval_steps_per_second": 2.561, "step": 2320 }, { "epoch": 4.4, "learning_rate": 1e-06, "loss": 0.8428, "step": 2330 }, { "epoch": 4.4, "eval_loss": 0.8628517389297485, "eval_runtime": 51.8839, "eval_samples_per_second": 1306.995, "eval_steps_per_second": 2.563, "step": 2330 }, { "epoch": 4.42, "learning_rate": 1e-06, "loss": 0.8488, "step": 2340 }, { "epoch": 4.42, "eval_loss": 0.8625032305717468, "eval_runtime": 51.8371, "eval_samples_per_second": 1308.174, "eval_steps_per_second": 2.566, "step": 2340 }, { "epoch": 4.43, "learning_rate": 1e-06, "loss": 0.8498, "step": 2350 }, { "epoch": 4.43, "eval_loss": 0.8623167276382446, "eval_runtime": 51.9502, "eval_samples_per_second": 1305.327, "eval_steps_per_second": 2.56, "step": 2350 }, { "epoch": 4.45, "learning_rate": 1e-06, "loss": 0.8593, "step": 2360 }, { "epoch": 4.45, "eval_loss": 0.8619263172149658, "eval_runtime": 51.8749, "eval_samples_per_second": 1307.222, "eval_steps_per_second": 2.564, "step": 2360 }, { "epoch": 4.47, "learning_rate": 1e-06, "loss": 0.8563, "step": 2370 }, { "epoch": 4.47, "eval_loss": 0.8616353869438171, "eval_runtime": 51.9072, "eval_samples_per_second": 1306.407, "eval_steps_per_second": 2.562, "step": 2370 }, { "epoch": 4.49, "learning_rate": 1e-06, "loss": 0.8589, "step": 2380 }, { "epoch": 4.49, "eval_loss": 0.8614597916603088, "eval_runtime": 51.9394, "eval_samples_per_second": 1305.597, "eval_steps_per_second": 2.561, "step": 2380 }, { "epoch": 4.51, "learning_rate": 1e-06, "loss": 0.8508, "step": 2390 }, { "epoch": 4.51, "eval_loss": 0.8611072301864624, "eval_runtime": 51.9103, "eval_samples_per_second": 1306.331, "eval_steps_per_second": 2.562, "step": 2390 }, { "epoch": 4.53, "learning_rate": 1e-06, "loss": 0.8266, "step": 2400 }, { "epoch": 4.53, "eval_loss": 0.8608320951461792, "eval_runtime": 51.9481, "eval_samples_per_second": 1305.38, "eval_steps_per_second": 2.56, "step": 2400 }, { "epoch": 4.55, "learning_rate": 1e-06, "loss": 0.8546, "step": 2410 }, { "epoch": 4.55, "eval_loss": 0.8605498671531677, "eval_runtime": 51.9432, "eval_samples_per_second": 1305.504, "eval_steps_per_second": 2.56, "step": 2410 }, { "epoch": 4.57, "learning_rate": 1e-06, "loss": 0.853, "step": 2420 }, { "epoch": 4.57, "eval_loss": 0.8603416085243225, "eval_runtime": 52.0432, "eval_samples_per_second": 1302.996, "eval_steps_per_second": 2.556, "step": 2420 }, { "epoch": 4.58, "learning_rate": 1e-06, "loss": 0.8484, "step": 2430 }, { "epoch": 4.58, "eval_loss": 0.8599910736083984, "eval_runtime": 51.8688, "eval_samples_per_second": 1307.375, "eval_steps_per_second": 2.564, "step": 2430 }, { "epoch": 4.6, "learning_rate": 1e-06, "loss": 0.8328, "step": 2440 }, { "epoch": 4.6, "eval_loss": 0.859768271446228, "eval_runtime": 51.8564, "eval_samples_per_second": 1307.689, "eval_steps_per_second": 2.565, "step": 2440 }, { "epoch": 4.62, "learning_rate": 1e-06, "loss": 0.834, "step": 2450 }, { "epoch": 4.62, "eval_loss": 0.8594483733177185, "eval_runtime": 51.951, "eval_samples_per_second": 1305.306, "eval_steps_per_second": 2.56, "step": 2450 }, { "epoch": 4.64, "learning_rate": 1e-06, "loss": 0.8383, "step": 2460 }, { "epoch": 4.64, "eval_loss": 0.8591568470001221, "eval_runtime": 51.8704, "eval_samples_per_second": 1307.335, "eval_steps_per_second": 2.564, "step": 2460 }, { "epoch": 4.66, "learning_rate": 1e-06, "loss": 0.841, "step": 2470 }, { "epoch": 4.66, "eval_loss": 0.8589540719985962, "eval_runtime": 51.9469, "eval_samples_per_second": 1305.409, "eval_steps_per_second": 2.56, "step": 2470 }, { "epoch": 4.68, "learning_rate": 1e-06, "loss": 0.8472, "step": 2480 }, { "epoch": 4.68, "eval_loss": 0.858716607093811, "eval_runtime": 51.8753, "eval_samples_per_second": 1307.213, "eval_steps_per_second": 2.564, "step": 2480 }, { "epoch": 4.7, "learning_rate": 1e-06, "loss": 0.856, "step": 2490 }, { "epoch": 4.7, "eval_loss": 0.8584380745887756, "eval_runtime": 51.7822, "eval_samples_per_second": 1309.561, "eval_steps_per_second": 2.568, "step": 2490 }, { "epoch": 4.72, "learning_rate": 1e-06, "loss": 0.8477, "step": 2500 }, { "epoch": 4.72, "eval_loss": 0.8581625819206238, "eval_runtime": 51.8356, "eval_samples_per_second": 1308.214, "eval_steps_per_second": 2.566, "step": 2500 }, { "epoch": 4.74, "learning_rate": 1e-06, "loss": 0.8391, "step": 2510 }, { "epoch": 4.74, "eval_loss": 0.85783851146698, "eval_runtime": 51.9944, "eval_samples_per_second": 1304.217, "eval_steps_per_second": 2.558, "step": 2510 }, { "epoch": 4.75, "learning_rate": 1e-06, "loss": 0.8428, "step": 2520 }, { "epoch": 4.75, "eval_loss": 0.8575791120529175, "eval_runtime": 51.7668, "eval_samples_per_second": 1309.951, "eval_steps_per_second": 2.569, "step": 2520 }, { "epoch": 4.77, "learning_rate": 1e-06, "loss": 0.8348, "step": 2530 }, { "epoch": 4.77, "eval_loss": 0.8571997284889221, "eval_runtime": 51.7677, "eval_samples_per_second": 1309.93, "eval_steps_per_second": 2.569, "step": 2530 }, { "epoch": 4.79, "learning_rate": 1e-06, "loss": 0.8387, "step": 2540 }, { "epoch": 4.79, "eval_loss": 0.857079803943634, "eval_runtime": 51.9747, "eval_samples_per_second": 1304.711, "eval_steps_per_second": 2.559, "step": 2540 }, { "epoch": 4.81, "learning_rate": 1e-06, "loss": 0.8382, "step": 2550 }, { "epoch": 4.81, "eval_loss": 0.8567344546318054, "eval_runtime": 51.8152, "eval_samples_per_second": 1308.727, "eval_steps_per_second": 2.567, "step": 2550 }, { "epoch": 4.83, "learning_rate": 1e-06, "loss": 0.8352, "step": 2560 }, { "epoch": 4.83, "eval_loss": 0.856507420539856, "eval_runtime": 51.8084, "eval_samples_per_second": 1308.899, "eval_steps_per_second": 2.567, "step": 2560 }, { "epoch": 4.85, "learning_rate": 1e-06, "loss": 0.8583, "step": 2570 }, { "epoch": 4.85, "eval_loss": 0.8562394380569458, "eval_runtime": 51.8605, "eval_samples_per_second": 1307.585, "eval_steps_per_second": 2.565, "step": 2570 }, { "epoch": 4.87, "learning_rate": 1e-06, "loss": 0.8576, "step": 2580 }, { "epoch": 4.87, "eval_loss": 0.8559291362762451, "eval_runtime": 51.9102, "eval_samples_per_second": 1306.334, "eval_steps_per_second": 2.562, "step": 2580 }, { "epoch": 4.89, "learning_rate": 1e-06, "loss": 0.8509, "step": 2590 }, { "epoch": 4.89, "eval_loss": 0.8556599617004395, "eval_runtime": 51.7511, "eval_samples_per_second": 1310.348, "eval_steps_per_second": 2.57, "step": 2590 }, { "epoch": 4.91, "learning_rate": 1e-06, "loss": 0.8348, "step": 2600 }, { "epoch": 4.91, "eval_loss": 0.8555102944374084, "eval_runtime": 51.9474, "eval_samples_per_second": 1305.398, "eval_steps_per_second": 2.56, "step": 2600 }, { "epoch": 4.92, "learning_rate": 1e-06, "loss": 0.8341, "step": 2610 }, { "epoch": 4.92, "eval_loss": 0.8551704287528992, "eval_runtime": 51.7687, "eval_samples_per_second": 1309.904, "eval_steps_per_second": 2.569, "step": 2610 }, { "epoch": 4.94, "learning_rate": 1e-06, "loss": 0.8402, "step": 2620 }, { "epoch": 4.94, "eval_loss": 0.8548431396484375, "eval_runtime": 51.924, "eval_samples_per_second": 1305.985, "eval_steps_per_second": 2.561, "step": 2620 }, { "epoch": 4.96, "learning_rate": 1e-06, "loss": 0.8266, "step": 2630 }, { "epoch": 4.96, "eval_loss": 0.8546814322471619, "eval_runtime": 51.8716, "eval_samples_per_second": 1307.306, "eval_steps_per_second": 2.564, "step": 2630 }, { "epoch": 4.98, "learning_rate": 1e-06, "loss": 0.8239, "step": 2640 }, { "epoch": 4.98, "eval_loss": 0.8543078303337097, "eval_runtime": 51.8685, "eval_samples_per_second": 1307.382, "eval_steps_per_second": 2.564, "step": 2640 }, { "epoch": 5.0, "learning_rate": 1e-06, "loss": 0.8442, "step": 2650 }, { "epoch": 5.0, "eval_loss": 0.8540557622909546, "eval_runtime": 51.7829, "eval_samples_per_second": 1309.543, "eval_steps_per_second": 2.568, "step": 2650 }, { "epoch": 5.02, "learning_rate": 1e-06, "loss": 0.8376, "step": 2660 }, { "epoch": 5.02, "eval_loss": 0.8539601564407349, "eval_runtime": 51.8171, "eval_samples_per_second": 1308.68, "eval_steps_per_second": 2.567, "step": 2660 }, { "epoch": 5.04, "learning_rate": 1e-06, "loss": 0.8451, "step": 2670 }, { "epoch": 5.04, "eval_loss": 0.8537167310714722, "eval_runtime": 51.8574, "eval_samples_per_second": 1307.663, "eval_steps_per_second": 2.565, "step": 2670 }, { "epoch": 5.06, "learning_rate": 1e-06, "loss": 0.832, "step": 2680 }, { "epoch": 5.06, "eval_loss": 0.8533909320831299, "eval_runtime": 51.8868, "eval_samples_per_second": 1306.922, "eval_steps_per_second": 2.563, "step": 2680 }, { "epoch": 5.08, "learning_rate": 1e-06, "loss": 0.8439, "step": 2690 }, { "epoch": 5.08, "eval_loss": 0.853160560131073, "eval_runtime": 51.8888, "eval_samples_per_second": 1306.873, "eval_steps_per_second": 2.563, "step": 2690 }, { "epoch": 5.09, "learning_rate": 1e-06, "loss": 0.8352, "step": 2700 }, { "epoch": 5.09, "eval_loss": 0.852979302406311, "eval_runtime": 51.9317, "eval_samples_per_second": 1305.793, "eval_steps_per_second": 2.561, "step": 2700 }, { "epoch": 5.11, "learning_rate": 1e-06, "loss": 0.8339, "step": 2710 }, { "epoch": 5.11, "eval_loss": 0.8527371287345886, "eval_runtime": 51.8547, "eval_samples_per_second": 1307.732, "eval_steps_per_second": 2.565, "step": 2710 }, { "epoch": 5.13, "learning_rate": 1e-06, "loss": 0.8342, "step": 2720 }, { "epoch": 5.13, "eval_loss": 0.8524041771888733, "eval_runtime": 51.8908, "eval_samples_per_second": 1306.82, "eval_steps_per_second": 2.563, "step": 2720 }, { "epoch": 5.15, "learning_rate": 1e-06, "loss": 0.8417, "step": 2730 }, { "epoch": 5.15, "eval_loss": 0.8523533940315247, "eval_runtime": 51.9792, "eval_samples_per_second": 1304.6, "eval_steps_per_second": 2.559, "step": 2730 }, { "epoch": 5.17, "learning_rate": 1e-06, "loss": 0.8407, "step": 2740 }, { "epoch": 5.17, "eval_loss": 0.8520050644874573, "eval_runtime": 51.9453, "eval_samples_per_second": 1305.45, "eval_steps_per_second": 2.56, "step": 2740 }, { "epoch": 5.19, "learning_rate": 1e-06, "loss": 0.8367, "step": 2750 }, { "epoch": 5.19, "eval_loss": 0.8518672585487366, "eval_runtime": 51.8852, "eval_samples_per_second": 1306.963, "eval_steps_per_second": 2.563, "step": 2750 }, { "epoch": 5.21, "learning_rate": 1e-06, "loss": 0.8201, "step": 2760 }, { "epoch": 5.21, "eval_loss": 0.8514959216117859, "eval_runtime": 51.8894, "eval_samples_per_second": 1306.855, "eval_steps_per_second": 2.563, "step": 2760 }, { "epoch": 5.23, "learning_rate": 1e-06, "loss": 0.8278, "step": 2770 }, { "epoch": 5.23, "eval_loss": 0.8512703776359558, "eval_runtime": 51.7905, "eval_samples_per_second": 1309.351, "eval_steps_per_second": 2.568, "step": 2770 }, { "epoch": 5.25, "learning_rate": 1e-06, "loss": 0.8302, "step": 2780 }, { "epoch": 5.25, "eval_loss": 0.8510229587554932, "eval_runtime": 51.8436, "eval_samples_per_second": 1308.01, "eval_steps_per_second": 2.565, "step": 2780 }, { "epoch": 5.26, "learning_rate": 1e-06, "loss": 0.8148, "step": 2790 }, { "epoch": 5.26, "eval_loss": 0.8507676720619202, "eval_runtime": 51.818, "eval_samples_per_second": 1308.657, "eval_steps_per_second": 2.567, "step": 2790 }, { "epoch": 5.28, "learning_rate": 1e-06, "loss": 0.84, "step": 2800 }, { "epoch": 5.28, "eval_loss": 0.8504866361618042, "eval_runtime": 51.7628, "eval_samples_per_second": 1310.052, "eval_steps_per_second": 2.569, "step": 2800 }, { "epoch": 5.3, "learning_rate": 1e-06, "loss": 0.8267, "step": 2810 }, { "epoch": 5.3, "eval_loss": 0.8502947688102722, "eval_runtime": 51.7712, "eval_samples_per_second": 1309.839, "eval_steps_per_second": 2.569, "step": 2810 }, { "epoch": 5.32, "learning_rate": 1e-06, "loss": 0.8356, "step": 2820 }, { "epoch": 5.32, "eval_loss": 0.8499984741210938, "eval_runtime": 51.8805, "eval_samples_per_second": 1307.08, "eval_steps_per_second": 2.564, "step": 2820 }, { "epoch": 5.34, "learning_rate": 1e-06, "loss": 0.8375, "step": 2830 }, { "epoch": 5.34, "eval_loss": 0.8497674465179443, "eval_runtime": 52.2672, "eval_samples_per_second": 1297.41, "eval_steps_per_second": 2.545, "step": 2830 }, { "epoch": 5.36, "learning_rate": 1e-06, "loss": 0.829, "step": 2840 }, { "epoch": 5.36, "eval_loss": 0.849518895149231, "eval_runtime": 51.816, "eval_samples_per_second": 1308.709, "eval_steps_per_second": 2.567, "step": 2840 }, { "epoch": 5.38, "learning_rate": 1e-06, "loss": 0.8212, "step": 2850 }, { "epoch": 5.38, "eval_loss": 0.8492794632911682, "eval_runtime": 51.9648, "eval_samples_per_second": 1304.961, "eval_steps_per_second": 2.559, "step": 2850 }, { "epoch": 5.4, "learning_rate": 1e-06, "loss": 0.8353, "step": 2860 }, { "epoch": 5.4, "eval_loss": 0.8490678071975708, "eval_runtime": 51.8767, "eval_samples_per_second": 1307.175, "eval_steps_per_second": 2.564, "step": 2860 }, { "epoch": 5.42, "learning_rate": 1e-06, "loss": 0.8322, "step": 2870 }, { "epoch": 5.42, "eval_loss": 0.8488078713417053, "eval_runtime": 51.8479, "eval_samples_per_second": 1307.902, "eval_steps_per_second": 2.565, "step": 2870 }, { "epoch": 5.43, "learning_rate": 1e-06, "loss": 0.826, "step": 2880 }, { "epoch": 5.43, "eval_loss": 0.848633885383606, "eval_runtime": 51.7082, "eval_samples_per_second": 1311.436, "eval_steps_per_second": 2.572, "step": 2880 }, { "epoch": 5.45, "learning_rate": 1e-06, "loss": 0.8237, "step": 2890 }, { "epoch": 5.45, "eval_loss": 0.8483902812004089, "eval_runtime": 52.0509, "eval_samples_per_second": 1302.801, "eval_steps_per_second": 2.555, "step": 2890 }, { "epoch": 5.47, "learning_rate": 1e-06, "loss": 0.8499, "step": 2900 }, { "epoch": 5.47, "eval_loss": 0.8481594920158386, "eval_runtime": 51.9139, "eval_samples_per_second": 1306.239, "eval_steps_per_second": 2.562, "step": 2900 }, { "epoch": 5.49, "learning_rate": 1e-06, "loss": 0.8291, "step": 2910 }, { "epoch": 5.49, "eval_loss": 0.8477580547332764, "eval_runtime": 51.9491, "eval_samples_per_second": 1305.354, "eval_steps_per_second": 2.56, "step": 2910 }, { "epoch": 5.51, "learning_rate": 1e-06, "loss": 0.8216, "step": 2920 }, { "epoch": 5.51, "eval_loss": 0.8476288914680481, "eval_runtime": 52.0178, "eval_samples_per_second": 1303.631, "eval_steps_per_second": 2.557, "step": 2920 }, { "epoch": 5.53, "learning_rate": 1e-06, "loss": 0.841, "step": 2930 }, { "epoch": 5.53, "eval_loss": 0.8473249673843384, "eval_runtime": 52.0044, "eval_samples_per_second": 1303.967, "eval_steps_per_second": 2.557, "step": 2930 }, { "epoch": 5.55, "learning_rate": 1e-06, "loss": 0.8265, "step": 2940 }, { "epoch": 5.55, "eval_loss": 0.8471129536628723, "eval_runtime": 51.799, "eval_samples_per_second": 1309.137, "eval_steps_per_second": 2.568, "step": 2940 }, { "epoch": 5.57, "learning_rate": 1e-06, "loss": 0.8329, "step": 2950 }, { "epoch": 5.57, "eval_loss": 0.8468539118766785, "eval_runtime": 51.8537, "eval_samples_per_second": 1307.756, "eval_steps_per_second": 2.565, "step": 2950 }, { "epoch": 5.58, "learning_rate": 1e-06, "loss": 0.8257, "step": 2960 }, { "epoch": 5.58, "eval_loss": 0.8465786576271057, "eval_runtime": 52.0735, "eval_samples_per_second": 1302.236, "eval_steps_per_second": 2.554, "step": 2960 }, { "epoch": 5.6, "learning_rate": 1e-06, "loss": 0.8332, "step": 2970 }, { "epoch": 5.6, "eval_loss": 0.8464725613594055, "eval_runtime": 51.7405, "eval_samples_per_second": 1310.618, "eval_steps_per_second": 2.571, "step": 2970 }, { "epoch": 5.62, "learning_rate": 1e-06, "loss": 0.8188, "step": 2980 }, { "epoch": 5.62, "eval_loss": 0.8461154699325562, "eval_runtime": 51.9517, "eval_samples_per_second": 1305.29, "eval_steps_per_second": 2.56, "step": 2980 }, { "epoch": 5.64, "learning_rate": 1e-06, "loss": 0.8336, "step": 2990 }, { "epoch": 5.64, "eval_loss": 0.8458889126777649, "eval_runtime": 51.9673, "eval_samples_per_second": 1304.896, "eval_steps_per_second": 2.559, "step": 2990 }, { "epoch": 5.66, "learning_rate": 1e-06, "loss": 0.8423, "step": 3000 }, { "epoch": 5.66, "eval_loss": 0.845611035823822, "eval_runtime": 51.8207, "eval_samples_per_second": 1308.59, "eval_steps_per_second": 2.567, "step": 3000 }, { "epoch": 5.68, "learning_rate": 1e-06, "loss": 0.8356, "step": 3010 }, { "epoch": 5.68, "eval_loss": 0.8453643918037415, "eval_runtime": 51.8234, "eval_samples_per_second": 1308.52, "eval_steps_per_second": 2.566, "step": 3010 }, { "epoch": 5.7, "learning_rate": 1e-06, "loss": 0.8237, "step": 3020 }, { "epoch": 5.7, "eval_loss": 0.845206081867218, "eval_runtime": 51.8105, "eval_samples_per_second": 1308.846, "eval_steps_per_second": 2.567, "step": 3020 }, { "epoch": 5.72, "learning_rate": 1e-06, "loss": 0.8414, "step": 3030 }, { "epoch": 5.72, "eval_loss": 0.8449116349220276, "eval_runtime": 51.9928, "eval_samples_per_second": 1304.256, "eval_steps_per_second": 2.558, "step": 3030 }, { "epoch": 5.74, "learning_rate": 1e-06, "loss": 0.8276, "step": 3040 }, { "epoch": 5.74, "eval_loss": 0.8447545170783997, "eval_runtime": 51.9148, "eval_samples_per_second": 1306.217, "eval_steps_per_second": 2.562, "step": 3040 }, { "epoch": 5.75, "learning_rate": 1e-06, "loss": 0.818, "step": 3050 }, { "epoch": 5.75, "eval_loss": 0.844507098197937, "eval_runtime": 52.0252, "eval_samples_per_second": 1303.446, "eval_steps_per_second": 2.556, "step": 3050 }, { "epoch": 5.77, "learning_rate": 1e-06, "loss": 0.8458, "step": 3060 }, { "epoch": 5.77, "eval_loss": 0.8442298173904419, "eval_runtime": 51.9337, "eval_samples_per_second": 1305.742, "eval_steps_per_second": 2.561, "step": 3060 }, { "epoch": 5.79, "learning_rate": 1e-06, "loss": 0.8323, "step": 3070 }, { "epoch": 5.79, "eval_loss": 0.843976616859436, "eval_runtime": 51.9921, "eval_samples_per_second": 1304.275, "eval_steps_per_second": 2.558, "step": 3070 }, { "epoch": 5.81, "learning_rate": 1e-06, "loss": 0.8312, "step": 3080 }, { "epoch": 5.81, "eval_loss": 0.843853771686554, "eval_runtime": 51.8976, "eval_samples_per_second": 1306.649, "eval_steps_per_second": 2.563, "step": 3080 }, { "epoch": 5.83, "learning_rate": 1e-06, "loss": 0.8272, "step": 3090 }, { "epoch": 5.83, "eval_loss": 0.8435570597648621, "eval_runtime": 51.9376, "eval_samples_per_second": 1305.643, "eval_steps_per_second": 2.561, "step": 3090 }, { "epoch": 5.85, "learning_rate": 1e-06, "loss": 0.8105, "step": 3100 }, { "epoch": 5.85, "eval_loss": 0.8433138132095337, "eval_runtime": 52.0207, "eval_samples_per_second": 1303.559, "eval_steps_per_second": 2.557, "step": 3100 }, { "epoch": 5.87, "learning_rate": 1e-06, "loss": 0.827, "step": 3110 }, { "epoch": 5.87, "eval_loss": 0.8431060314178467, "eval_runtime": 52.0407, "eval_samples_per_second": 1303.058, "eval_steps_per_second": 2.556, "step": 3110 }, { "epoch": 5.89, "learning_rate": 1e-06, "loss": 0.8338, "step": 3120 }, { "epoch": 5.89, "eval_loss": 0.8428162336349487, "eval_runtime": 52.0503, "eval_samples_per_second": 1302.817, "eval_steps_per_second": 2.555, "step": 3120 }, { "epoch": 5.91, "learning_rate": 1e-06, "loss": 0.8402, "step": 3130 }, { "epoch": 5.91, "eval_loss": 0.8426678776741028, "eval_runtime": 51.9479, "eval_samples_per_second": 1305.384, "eval_steps_per_second": 2.56, "step": 3130 }, { "epoch": 5.92, "learning_rate": 1e-06, "loss": 0.8249, "step": 3140 }, { "epoch": 5.92, "eval_loss": 0.8424062728881836, "eval_runtime": 52.0747, "eval_samples_per_second": 1302.206, "eval_steps_per_second": 2.554, "step": 3140 }, { "epoch": 5.94, "learning_rate": 1e-06, "loss": 0.8262, "step": 3150 }, { "epoch": 5.94, "eval_loss": 0.8421285152435303, "eval_runtime": 51.9477, "eval_samples_per_second": 1305.39, "eval_steps_per_second": 2.56, "step": 3150 }, { "epoch": 5.96, "learning_rate": 1e-06, "loss": 0.8375, "step": 3160 }, { "epoch": 5.96, "eval_loss": 0.8420241475105286, "eval_runtime": 51.9104, "eval_samples_per_second": 1306.329, "eval_steps_per_second": 2.562, "step": 3160 }, { "epoch": 5.98, "learning_rate": 1e-06, "loss": 0.8332, "step": 3170 }, { "epoch": 5.98, "eval_loss": 0.8416071534156799, "eval_runtime": 52.0768, "eval_samples_per_second": 1302.154, "eval_steps_per_second": 2.554, "step": 3170 }, { "epoch": 6.0, "learning_rate": 1e-06, "loss": 0.8165, "step": 3180 }, { "epoch": 6.0, "eval_loss": 0.8414864540100098, "eval_runtime": 52.0127, "eval_samples_per_second": 1303.76, "eval_steps_per_second": 2.557, "step": 3180 }, { "epoch": 6.02, "learning_rate": 1e-06, "loss": 0.8023, "step": 3190 }, { "epoch": 6.02, "eval_loss": 0.8415040969848633, "eval_runtime": 51.9192, "eval_samples_per_second": 1306.107, "eval_steps_per_second": 2.562, "step": 3190 }, { "epoch": 6.04, "learning_rate": 1e-06, "loss": 0.8223, "step": 3200 }, { "epoch": 6.04, "eval_loss": 0.8412625789642334, "eval_runtime": 52.0405, "eval_samples_per_second": 1303.063, "eval_steps_per_second": 2.556, "step": 3200 }, { "epoch": 6.06, "learning_rate": 1e-06, "loss": 0.8273, "step": 3210 }, { "epoch": 6.06, "eval_loss": 0.8409404158592224, "eval_runtime": 51.9332, "eval_samples_per_second": 1305.754, "eval_steps_per_second": 2.561, "step": 3210 }, { "epoch": 6.08, "learning_rate": 1e-06, "loss": 0.8175, "step": 3220 }, { "epoch": 6.08, "eval_loss": 0.840887725353241, "eval_runtime": 52.0538, "eval_samples_per_second": 1302.729, "eval_steps_per_second": 2.555, "step": 3220 }, { "epoch": 6.09, "learning_rate": 1e-06, "loss": 0.8332, "step": 3230 }, { "epoch": 6.09, "eval_loss": 0.8405251502990723, "eval_runtime": 51.9819, "eval_samples_per_second": 1304.53, "eval_steps_per_second": 2.559, "step": 3230 }, { "epoch": 6.11, "learning_rate": 1e-06, "loss": 0.825, "step": 3240 }, { "epoch": 6.11, "eval_loss": 0.8403586745262146, "eval_runtime": 51.9603, "eval_samples_per_second": 1305.073, "eval_steps_per_second": 2.56, "step": 3240 }, { "epoch": 6.13, "learning_rate": 1e-06, "loss": 0.8214, "step": 3250 }, { "epoch": 6.13, "eval_loss": 0.8403131365776062, "eval_runtime": 51.9874, "eval_samples_per_second": 1304.393, "eval_steps_per_second": 2.558, "step": 3250 }, { "epoch": 6.15, "learning_rate": 1e-06, "loss": 0.8158, "step": 3260 }, { "epoch": 6.15, "eval_loss": 0.8399011492729187, "eval_runtime": 51.9729, "eval_samples_per_second": 1304.757, "eval_steps_per_second": 2.559, "step": 3260 }, { "epoch": 6.17, "learning_rate": 1e-06, "loss": 0.8128, "step": 3270 }, { "epoch": 6.17, "eval_loss": 0.8397038578987122, "eval_runtime": 51.9517, "eval_samples_per_second": 1305.29, "eval_steps_per_second": 2.56, "step": 3270 }, { "epoch": 6.19, "learning_rate": 1e-06, "loss": 0.8209, "step": 3280 }, { "epoch": 6.19, "eval_loss": 0.8394789695739746, "eval_runtime": 51.9885, "eval_samples_per_second": 1304.366, "eval_steps_per_second": 2.558, "step": 3280 }, { "epoch": 6.21, "learning_rate": 1e-06, "loss": 0.807, "step": 3290 }, { "epoch": 6.21, "eval_loss": 0.8392835855484009, "eval_runtime": 52.0478, "eval_samples_per_second": 1302.879, "eval_steps_per_second": 2.555, "step": 3290 }, { "epoch": 6.23, "learning_rate": 1e-06, "loss": 0.8187, "step": 3300 }, { "epoch": 6.23, "eval_loss": 0.8390551209449768, "eval_runtime": 52.5827, "eval_samples_per_second": 1289.627, "eval_steps_per_second": 2.529, "step": 3300 }, { "epoch": 6.25, "learning_rate": 1e-06, "loss": 0.8299, "step": 3310 }, { "epoch": 6.25, "eval_loss": 0.8387653231620789, "eval_runtime": 52.0284, "eval_samples_per_second": 1303.366, "eval_steps_per_second": 2.556, "step": 3310 }, { "epoch": 6.26, "learning_rate": 1e-06, "loss": 0.816, "step": 3320 }, { "epoch": 6.26, "eval_loss": 0.8385831117630005, "eval_runtime": 52.1251, "eval_samples_per_second": 1300.947, "eval_steps_per_second": 2.552, "step": 3320 }, { "epoch": 6.28, "learning_rate": 1e-06, "loss": 0.7963, "step": 3330 }, { "epoch": 6.28, "eval_loss": 0.8384872674942017, "eval_runtime": 51.9641, "eval_samples_per_second": 1304.977, "eval_steps_per_second": 2.559, "step": 3330 }, { "epoch": 6.3, "learning_rate": 1e-06, "loss": 0.832, "step": 3340 }, { "epoch": 6.3, "eval_loss": 0.8381639719009399, "eval_runtime": 51.9857, "eval_samples_per_second": 1304.436, "eval_steps_per_second": 2.558, "step": 3340 }, { "epoch": 6.32, "learning_rate": 1e-06, "loss": 0.8122, "step": 3350 }, { "epoch": 6.32, "eval_loss": 0.8379948139190674, "eval_runtime": 52.0508, "eval_samples_per_second": 1302.805, "eval_steps_per_second": 2.555, "step": 3350 }, { "epoch": 6.34, "learning_rate": 1e-06, "loss": 0.8173, "step": 3360 }, { "epoch": 6.34, "eval_loss": 0.8377028107643127, "eval_runtime": 52.0007, "eval_samples_per_second": 1304.059, "eval_steps_per_second": 2.558, "step": 3360 }, { "epoch": 6.36, "learning_rate": 1e-06, "loss": 0.8254, "step": 3370 }, { "epoch": 6.36, "eval_loss": 0.8377081155776978, "eval_runtime": 52.0676, "eval_samples_per_second": 1302.383, "eval_steps_per_second": 2.554, "step": 3370 }, { "epoch": 6.38, "learning_rate": 1e-06, "loss": 0.82, "step": 3380 }, { "epoch": 6.38, "eval_loss": 0.8372619152069092, "eval_runtime": 52.0154, "eval_samples_per_second": 1303.692, "eval_steps_per_second": 2.557, "step": 3380 }, { "epoch": 6.4, "learning_rate": 1e-06, "loss": 0.8347, "step": 3390 }, { "epoch": 6.4, "eval_loss": 0.8370408415794373, "eval_runtime": 51.9385, "eval_samples_per_second": 1305.622, "eval_steps_per_second": 2.561, "step": 3390 }, { "epoch": 6.42, "learning_rate": 1e-06, "loss": 0.8195, "step": 3400 }, { "epoch": 6.42, "eval_loss": 0.8368457555770874, "eval_runtime": 52.0455, "eval_samples_per_second": 1302.937, "eval_steps_per_second": 2.555, "step": 3400 }, { "epoch": 6.43, "learning_rate": 1e-06, "loss": 0.8121, "step": 3410 }, { "epoch": 6.43, "eval_loss": 0.8366797566413879, "eval_runtime": 51.9623, "eval_samples_per_second": 1305.024, "eval_steps_per_second": 2.56, "step": 3410 }, { "epoch": 6.45, "learning_rate": 1e-06, "loss": 0.8226, "step": 3420 }, { "epoch": 6.45, "eval_loss": 0.8365644812583923, "eval_runtime": 52.0483, "eval_samples_per_second": 1302.866, "eval_steps_per_second": 2.555, "step": 3420 }, { "epoch": 6.47, "learning_rate": 1e-06, "loss": 0.8278, "step": 3430 }, { "epoch": 6.47, "eval_loss": 0.836248517036438, "eval_runtime": 51.9763, "eval_samples_per_second": 1304.671, "eval_steps_per_second": 2.559, "step": 3430 }, { "epoch": 6.49, "learning_rate": 1e-06, "loss": 0.7992, "step": 3440 }, { "epoch": 6.49, "eval_loss": 0.8359348177909851, "eval_runtime": 51.9796, "eval_samples_per_second": 1304.588, "eval_steps_per_second": 2.559, "step": 3440 }, { "epoch": 6.51, "learning_rate": 1e-06, "loss": 0.8262, "step": 3450 }, { "epoch": 6.51, "eval_loss": 0.8360700607299805, "eval_runtime": 51.8816, "eval_samples_per_second": 1307.052, "eval_steps_per_second": 2.564, "step": 3450 }, { "epoch": 6.53, "learning_rate": 1e-06, "loss": 0.8161, "step": 3460 }, { "epoch": 6.53, "eval_loss": 0.8355943560600281, "eval_runtime": 51.9822, "eval_samples_per_second": 1304.524, "eval_steps_per_second": 2.559, "step": 3460 }, { "epoch": 6.55, "learning_rate": 1e-06, "loss": 0.8063, "step": 3470 }, { "epoch": 6.55, "eval_loss": 0.8354068398475647, "eval_runtime": 51.8622, "eval_samples_per_second": 1307.541, "eval_steps_per_second": 2.564, "step": 3470 }, { "epoch": 6.57, "learning_rate": 1e-06, "loss": 0.8078, "step": 3480 }, { "epoch": 6.57, "eval_loss": 0.8352031111717224, "eval_runtime": 51.9774, "eval_samples_per_second": 1304.643, "eval_steps_per_second": 2.559, "step": 3480 }, { "epoch": 6.58, "learning_rate": 1e-06, "loss": 0.8024, "step": 3490 }, { "epoch": 6.58, "eval_loss": 0.8349164128303528, "eval_runtime": 51.9339, "eval_samples_per_second": 1305.738, "eval_steps_per_second": 2.561, "step": 3490 }, { "epoch": 6.6, "learning_rate": 1e-06, "loss": 0.808, "step": 3500 }, { "epoch": 6.6, "eval_loss": 0.8347747921943665, "eval_runtime": 52.0623, "eval_samples_per_second": 1302.516, "eval_steps_per_second": 2.555, "step": 3500 }, { "epoch": 6.62, "learning_rate": 1e-06, "loss": 0.8173, "step": 3510 }, { "epoch": 6.62, "eval_loss": 0.8345584869384766, "eval_runtime": 51.9186, "eval_samples_per_second": 1306.123, "eval_steps_per_second": 2.562, "step": 3510 }, { "epoch": 6.64, "learning_rate": 1e-06, "loss": 0.8211, "step": 3520 }, { "epoch": 6.64, "eval_loss": 0.834367573261261, "eval_runtime": 52.0978, "eval_samples_per_second": 1301.63, "eval_steps_per_second": 2.553, "step": 3520 }, { "epoch": 6.66, "learning_rate": 1e-06, "loss": 0.8349, "step": 3530 }, { "epoch": 6.66, "eval_loss": 0.8342083692550659, "eval_runtime": 52.0093, "eval_samples_per_second": 1303.844, "eval_steps_per_second": 2.557, "step": 3530 }, { "epoch": 6.68, "learning_rate": 1e-06, "loss": 0.8306, "step": 3540 }, { "epoch": 6.68, "eval_loss": 0.8338024616241455, "eval_runtime": 51.9465, "eval_samples_per_second": 1305.421, "eval_steps_per_second": 2.56, "step": 3540 }, { "epoch": 6.7, "learning_rate": 1e-06, "loss": 0.8193, "step": 3550 }, { "epoch": 6.7, "eval_loss": 0.8336243033409119, "eval_runtime": 51.9784, "eval_samples_per_second": 1304.619, "eval_steps_per_second": 2.559, "step": 3550 }, { "epoch": 6.72, "learning_rate": 1e-06, "loss": 0.8118, "step": 3560 }, { "epoch": 6.72, "eval_loss": 0.8336126804351807, "eval_runtime": 52.0238, "eval_samples_per_second": 1303.479, "eval_steps_per_second": 2.557, "step": 3560 }, { "epoch": 6.74, "learning_rate": 1e-06, "loss": 0.8263, "step": 3570 }, { "epoch": 6.74, "eval_loss": 0.8331900238990784, "eval_runtime": 51.9913, "eval_samples_per_second": 1304.296, "eval_steps_per_second": 2.558, "step": 3570 }, { "epoch": 6.75, "learning_rate": 1e-06, "loss": 0.811, "step": 3580 }, { "epoch": 6.75, "eval_loss": 0.8329805731773376, "eval_runtime": 52.0168, "eval_samples_per_second": 1303.656, "eval_steps_per_second": 2.557, "step": 3580 }, { "epoch": 6.77, "learning_rate": 1e-06, "loss": 0.8065, "step": 3590 }, { "epoch": 6.77, "eval_loss": 0.8329923152923584, "eval_runtime": 52.0099, "eval_samples_per_second": 1303.83, "eval_steps_per_second": 2.557, "step": 3590 }, { "epoch": 6.79, "learning_rate": 1e-06, "loss": 0.8204, "step": 3600 }, { "epoch": 6.79, "eval_loss": 0.8327195644378662, "eval_runtime": 52.1332, "eval_samples_per_second": 1300.746, "eval_steps_per_second": 2.551, "step": 3600 }, { "epoch": 6.81, "learning_rate": 1e-06, "loss": 0.818, "step": 3610 }, { "epoch": 6.81, "eval_loss": 0.8323770761489868, "eval_runtime": 52.001, "eval_samples_per_second": 1304.052, "eval_steps_per_second": 2.558, "step": 3610 }, { "epoch": 6.83, "learning_rate": 1e-06, "loss": 0.805, "step": 3620 }, { "epoch": 6.83, "eval_loss": 0.832315981388092, "eval_runtime": 51.8866, "eval_samples_per_second": 1306.928, "eval_steps_per_second": 2.563, "step": 3620 }, { "epoch": 6.85, "learning_rate": 1e-06, "loss": 0.8161, "step": 3630 }, { "epoch": 6.85, "eval_loss": 0.8319803476333618, "eval_runtime": 51.9838, "eval_samples_per_second": 1304.484, "eval_steps_per_second": 2.558, "step": 3630 }, { "epoch": 6.87, "learning_rate": 1e-06, "loss": 0.8269, "step": 3640 }, { "epoch": 6.87, "eval_loss": 0.831838846206665, "eval_runtime": 51.9964, "eval_samples_per_second": 1304.168, "eval_steps_per_second": 2.558, "step": 3640 }, { "epoch": 6.89, "learning_rate": 1e-06, "loss": 0.8244, "step": 3650 }, { "epoch": 6.89, "eval_loss": 0.8317265510559082, "eval_runtime": 51.8435, "eval_samples_per_second": 1308.014, "eval_steps_per_second": 2.565, "step": 3650 }, { "epoch": 6.91, "learning_rate": 1e-06, "loss": 0.8007, "step": 3660 }, { "epoch": 6.91, "eval_loss": 0.8313496112823486, "eval_runtime": 51.9846, "eval_samples_per_second": 1304.463, "eval_steps_per_second": 2.558, "step": 3660 }, { "epoch": 6.92, "learning_rate": 1e-06, "loss": 0.8021, "step": 3670 }, { "epoch": 6.92, "eval_loss": 0.8311917781829834, "eval_runtime": 51.9971, "eval_samples_per_second": 1304.15, "eval_steps_per_second": 2.558, "step": 3670 }, { "epoch": 6.94, "learning_rate": 1e-06, "loss": 0.8014, "step": 3680 }, { "epoch": 6.94, "eval_loss": 0.8310168981552124, "eval_runtime": 51.8745, "eval_samples_per_second": 1307.231, "eval_steps_per_second": 2.564, "step": 3680 }, { "epoch": 6.96, "learning_rate": 1e-06, "loss": 0.8108, "step": 3690 }, { "epoch": 6.96, "eval_loss": 0.8306861519813538, "eval_runtime": 51.9592, "eval_samples_per_second": 1305.101, "eval_steps_per_second": 2.56, "step": 3690 }, { "epoch": 6.98, "learning_rate": 1e-06, "loss": 0.8007, "step": 3700 }, { "epoch": 6.98, "eval_loss": 0.8306426405906677, "eval_runtime": 52.0214, "eval_samples_per_second": 1303.54, "eval_steps_per_second": 2.557, "step": 3700 }, { "epoch": 7.0, "learning_rate": 1e-06, "loss": 0.8008, "step": 3710 }, { "epoch": 7.0, "eval_loss": 0.8303616046905518, "eval_runtime": 52.106, "eval_samples_per_second": 1301.425, "eval_steps_per_second": 2.552, "step": 3710 }, { "epoch": 7.02, "learning_rate": 1e-06, "loss": 0.8087, "step": 3720 }, { "epoch": 7.02, "eval_loss": 0.8302958607673645, "eval_runtime": 51.8942, "eval_samples_per_second": 1306.735, "eval_steps_per_second": 2.563, "step": 3720 }, { "epoch": 7.04, "learning_rate": 1e-06, "loss": 0.7951, "step": 3730 }, { "epoch": 7.04, "eval_loss": 0.8302793502807617, "eval_runtime": 52.0585, "eval_samples_per_second": 1302.613, "eval_steps_per_second": 2.555, "step": 3730 }, { "epoch": 7.06, "learning_rate": 1e-06, "loss": 0.7948, "step": 3740 }, { "epoch": 7.06, "eval_loss": 0.8299869894981384, "eval_runtime": 51.889, "eval_samples_per_second": 1306.867, "eval_steps_per_second": 2.563, "step": 3740 }, { "epoch": 7.08, "learning_rate": 1e-06, "loss": 0.8202, "step": 3750 }, { "epoch": 7.08, "eval_loss": 0.8297907114028931, "eval_runtime": 51.8406, "eval_samples_per_second": 1308.086, "eval_steps_per_second": 2.566, "step": 3750 }, { "epoch": 7.09, "learning_rate": 1e-06, "loss": 0.8065, "step": 3760 }, { "epoch": 7.09, "eval_loss": 0.8296034932136536, "eval_runtime": 51.9961, "eval_samples_per_second": 1304.174, "eval_steps_per_second": 2.558, "step": 3760 }, { "epoch": 7.11, "learning_rate": 1e-06, "loss": 0.8044, "step": 3770 }, { "epoch": 7.11, "eval_loss": 0.829464316368103, "eval_runtime": 51.9978, "eval_samples_per_second": 1304.133, "eval_steps_per_second": 2.558, "step": 3770 }, { "epoch": 7.13, "learning_rate": 1e-06, "loss": 0.8093, "step": 3780 }, { "epoch": 7.13, "eval_loss": 0.8291881084442139, "eval_runtime": 52.0553, "eval_samples_per_second": 1302.693, "eval_steps_per_second": 2.555, "step": 3780 }, { "epoch": 7.15, "learning_rate": 1e-06, "loss": 0.823, "step": 3790 }, { "epoch": 7.15, "eval_loss": 0.8290471434593201, "eval_runtime": 51.9731, "eval_samples_per_second": 1304.751, "eval_steps_per_second": 2.559, "step": 3790 }, { "epoch": 7.17, "learning_rate": 1e-06, "loss": 0.7982, "step": 3800 }, { "epoch": 7.17, "eval_loss": 0.828815758228302, "eval_runtime": 52.0766, "eval_samples_per_second": 1302.159, "eval_steps_per_second": 2.554, "step": 3800 }, { "epoch": 7.19, "learning_rate": 1e-06, "loss": 0.8129, "step": 3810 }, { "epoch": 7.19, "eval_loss": 0.8286548256874084, "eval_runtime": 51.9203, "eval_samples_per_second": 1306.078, "eval_steps_per_second": 2.562, "step": 3810 }, { "epoch": 7.21, "learning_rate": 1e-06, "loss": 0.8017, "step": 3820 }, { "epoch": 7.21, "eval_loss": 0.8284364938735962, "eval_runtime": 52.0163, "eval_samples_per_second": 1303.668, "eval_steps_per_second": 2.557, "step": 3820 }, { "epoch": 7.23, "learning_rate": 1e-06, "loss": 0.8091, "step": 3830 }, { "epoch": 7.23, "eval_loss": 0.8282648921012878, "eval_runtime": 52.0506, "eval_samples_per_second": 1302.81, "eval_steps_per_second": 2.555, "step": 3830 }, { "epoch": 7.25, "learning_rate": 1e-06, "loss": 0.7956, "step": 3840 }, { "epoch": 7.25, "eval_loss": 0.8282151222229004, "eval_runtime": 52.0166, "eval_samples_per_second": 1303.66, "eval_steps_per_second": 2.557, "step": 3840 }, { "epoch": 7.26, "learning_rate": 1e-06, "loss": 0.8153, "step": 3850 }, { "epoch": 7.26, "eval_loss": 0.8278843760490417, "eval_runtime": 52.0038, "eval_samples_per_second": 1303.982, "eval_steps_per_second": 2.558, "step": 3850 }, { "epoch": 7.28, "learning_rate": 1e-06, "loss": 0.8066, "step": 3860 }, { "epoch": 7.28, "eval_loss": 0.8276271820068359, "eval_runtime": 52.0462, "eval_samples_per_second": 1302.919, "eval_steps_per_second": 2.555, "step": 3860 }, { "epoch": 7.3, "learning_rate": 1e-06, "loss": 0.8046, "step": 3870 }, { "epoch": 7.3, "eval_loss": 0.8274891376495361, "eval_runtime": 51.955, "eval_samples_per_second": 1305.208, "eval_steps_per_second": 2.56, "step": 3870 }, { "epoch": 7.32, "learning_rate": 1e-06, "loss": 0.7983, "step": 3880 }, { "epoch": 7.32, "eval_loss": 0.8273729085922241, "eval_runtime": 51.9905, "eval_samples_per_second": 1304.315, "eval_steps_per_second": 2.558, "step": 3880 }, { "epoch": 7.34, "learning_rate": 1e-06, "loss": 0.8028, "step": 3890 }, { "epoch": 7.34, "eval_loss": 0.8270576000213623, "eval_runtime": 52.0076, "eval_samples_per_second": 1303.886, "eval_steps_per_second": 2.557, "step": 3890 }, { "epoch": 7.36, "learning_rate": 1e-06, "loss": 0.7898, "step": 3900 }, { "epoch": 7.36, "eval_loss": 0.8267748355865479, "eval_runtime": 51.9676, "eval_samples_per_second": 1304.891, "eval_steps_per_second": 2.559, "step": 3900 }, { "epoch": 7.38, "learning_rate": 1e-06, "loss": 0.8076, "step": 3910 }, { "epoch": 7.38, "eval_loss": 0.82685387134552, "eval_runtime": 51.949, "eval_samples_per_second": 1305.357, "eval_steps_per_second": 2.56, "step": 3910 }, { "epoch": 7.4, "learning_rate": 1e-06, "loss": 0.8105, "step": 3920 }, { "epoch": 7.4, "eval_loss": 0.8265158534049988, "eval_runtime": 51.8551, "eval_samples_per_second": 1307.722, "eval_steps_per_second": 2.565, "step": 3920 }, { "epoch": 7.42, "learning_rate": 1e-06, "loss": 0.8111, "step": 3930 }, { "epoch": 7.42, "eval_loss": 0.8262932896614075, "eval_runtime": 51.982, "eval_samples_per_second": 1304.529, "eval_steps_per_second": 2.559, "step": 3930 }, { "epoch": 7.43, "learning_rate": 1e-06, "loss": 0.7957, "step": 3940 }, { "epoch": 7.43, "eval_loss": 0.8261959552764893, "eval_runtime": 52.0057, "eval_samples_per_second": 1303.933, "eval_steps_per_second": 2.557, "step": 3940 }, { "epoch": 7.45, "learning_rate": 1e-06, "loss": 0.7907, "step": 3950 }, { "epoch": 7.45, "eval_loss": 0.8257889747619629, "eval_runtime": 51.9024, "eval_samples_per_second": 1306.529, "eval_steps_per_second": 2.563, "step": 3950 }, { "epoch": 7.47, "learning_rate": 1e-06, "loss": 0.8043, "step": 3960 }, { "epoch": 7.47, "eval_loss": 0.8256938457489014, "eval_runtime": 51.8359, "eval_samples_per_second": 1308.204, "eval_steps_per_second": 2.566, "step": 3960 }, { "epoch": 7.49, "learning_rate": 1e-06, "loss": 0.7979, "step": 3970 }, { "epoch": 7.49, "eval_loss": 0.8256092071533203, "eval_runtime": 51.8834, "eval_samples_per_second": 1307.008, "eval_steps_per_second": 2.563, "step": 3970 }, { "epoch": 7.51, "learning_rate": 1e-06, "loss": 0.8023, "step": 3980 }, { "epoch": 7.51, "eval_loss": 0.8254022598266602, "eval_runtime": 52.5271, "eval_samples_per_second": 1290.99, "eval_steps_per_second": 2.532, "step": 3980 }, { "epoch": 7.53, "learning_rate": 1e-06, "loss": 0.8079, "step": 3990 }, { "epoch": 7.53, "eval_loss": 0.8251172304153442, "eval_runtime": 51.9409, "eval_samples_per_second": 1305.561, "eval_steps_per_second": 2.561, "step": 3990 }, { "epoch": 7.55, "learning_rate": 1e-06, "loss": 0.8058, "step": 4000 }, { "epoch": 7.55, "eval_loss": 0.8249055743217468, "eval_runtime": 51.9893, "eval_samples_per_second": 1304.345, "eval_steps_per_second": 2.558, "step": 4000 }, { "epoch": 7.57, "learning_rate": 1e-06, "loss": 0.8055, "step": 4010 }, { "epoch": 7.57, "eval_loss": 0.8246675729751587, "eval_runtime": 51.8262, "eval_samples_per_second": 1308.449, "eval_steps_per_second": 2.566, "step": 4010 }, { "epoch": 7.58, "learning_rate": 1e-06, "loss": 0.812, "step": 4020 }, { "epoch": 7.58, "eval_loss": 0.8246596455574036, "eval_runtime": 51.8921, "eval_samples_per_second": 1306.788, "eval_steps_per_second": 2.563, "step": 4020 }, { "epoch": 7.6, "learning_rate": 1e-06, "loss": 0.7859, "step": 4030 }, { "epoch": 7.6, "eval_loss": 0.8243203163146973, "eval_runtime": 51.8981, "eval_samples_per_second": 1306.637, "eval_steps_per_second": 2.563, "step": 4030 }, { "epoch": 7.62, "learning_rate": 1e-06, "loss": 0.8007, "step": 4040 }, { "epoch": 7.62, "eval_loss": 0.8241901397705078, "eval_runtime": 51.9544, "eval_samples_per_second": 1305.22, "eval_steps_per_second": 2.56, "step": 4040 }, { "epoch": 7.64, "learning_rate": 1e-06, "loss": 0.799, "step": 4050 }, { "epoch": 7.64, "eval_loss": 0.8239621520042419, "eval_runtime": 51.9119, "eval_samples_per_second": 1306.289, "eval_steps_per_second": 2.562, "step": 4050 }, { "epoch": 7.66, "learning_rate": 1e-06, "loss": 0.8007, "step": 4060 }, { "epoch": 7.66, "eval_loss": 0.8240479826927185, "eval_runtime": 52.0052, "eval_samples_per_second": 1303.947, "eval_steps_per_second": 2.557, "step": 4060 }, { "epoch": 7.68, "learning_rate": 1e-06, "loss": 0.7952, "step": 4070 }, { "epoch": 7.68, "eval_loss": 0.8235350847244263, "eval_runtime": 51.9417, "eval_samples_per_second": 1305.54, "eval_steps_per_second": 2.561, "step": 4070 }, { "epoch": 7.7, "learning_rate": 1e-06, "loss": 0.8089, "step": 4080 }, { "epoch": 7.7, "eval_loss": 0.8234580159187317, "eval_runtime": 51.9976, "eval_samples_per_second": 1304.138, "eval_steps_per_second": 2.558, "step": 4080 }, { "epoch": 7.72, "learning_rate": 1e-06, "loss": 0.8106, "step": 4090 }, { "epoch": 7.72, "eval_loss": 0.8232220411300659, "eval_runtime": 51.9783, "eval_samples_per_second": 1304.621, "eval_steps_per_second": 2.559, "step": 4090 }, { "epoch": 7.74, "learning_rate": 1e-06, "loss": 0.81, "step": 4100 }, { "epoch": 7.74, "eval_loss": 0.8230018615722656, "eval_runtime": 51.9098, "eval_samples_per_second": 1306.343, "eval_steps_per_second": 2.562, "step": 4100 }, { "epoch": 7.75, "learning_rate": 1e-06, "loss": 0.8003, "step": 4110 }, { "epoch": 7.75, "eval_loss": 0.8228157758712769, "eval_runtime": 51.9865, "eval_samples_per_second": 1304.415, "eval_steps_per_second": 2.558, "step": 4110 }, { "epoch": 7.77, "learning_rate": 1e-06, "loss": 0.7981, "step": 4120 }, { "epoch": 7.77, "eval_loss": 0.8227835893630981, "eval_runtime": 51.9696, "eval_samples_per_second": 1304.841, "eval_steps_per_second": 2.559, "step": 4120 }, { "epoch": 7.79, "learning_rate": 1e-06, "loss": 0.8073, "step": 4130 }, { "epoch": 7.79, "eval_loss": 0.8224825263023376, "eval_runtime": 52.0838, "eval_samples_per_second": 1301.978, "eval_steps_per_second": 2.554, "step": 4130 }, { "epoch": 7.81, "learning_rate": 1e-06, "loss": 0.8023, "step": 4140 }, { "epoch": 7.81, "eval_loss": 0.8221595883369446, "eval_runtime": 52.1232, "eval_samples_per_second": 1300.994, "eval_steps_per_second": 2.552, "step": 4140 }, { "epoch": 7.83, "learning_rate": 1e-06, "loss": 0.7924, "step": 4150 }, { "epoch": 7.83, "eval_loss": 0.8220430612564087, "eval_runtime": 52.0247, "eval_samples_per_second": 1303.458, "eval_steps_per_second": 2.556, "step": 4150 }, { "epoch": 7.85, "learning_rate": 1e-06, "loss": 0.793, "step": 4160 }, { "epoch": 7.85, "eval_loss": 0.821935772895813, "eval_runtime": 51.9774, "eval_samples_per_second": 1304.644, "eval_steps_per_second": 2.559, "step": 4160 }, { "epoch": 7.87, "learning_rate": 1e-06, "loss": 0.8021, "step": 4170 }, { "epoch": 7.87, "eval_loss": 0.8216782808303833, "eval_runtime": 52.0293, "eval_samples_per_second": 1303.343, "eval_steps_per_second": 2.556, "step": 4170 }, { "epoch": 7.89, "learning_rate": 1e-06, "loss": 0.8013, "step": 4180 }, { "epoch": 7.89, "eval_loss": 0.821456789970398, "eval_runtime": 51.9831, "eval_samples_per_second": 1304.502, "eval_steps_per_second": 2.559, "step": 4180 }, { "epoch": 7.91, "learning_rate": 1e-06, "loss": 0.799, "step": 4190 }, { "epoch": 7.91, "eval_loss": 0.8213809728622437, "eval_runtime": 51.9984, "eval_samples_per_second": 1304.118, "eval_steps_per_second": 2.558, "step": 4190 }, { "epoch": 7.92, "learning_rate": 1e-06, "loss": 0.8031, "step": 4200 }, { "epoch": 7.92, "eval_loss": 0.8211493492126465, "eval_runtime": 52.1677, "eval_samples_per_second": 1299.885, "eval_steps_per_second": 2.549, "step": 4200 }, { "epoch": 7.94, "learning_rate": 1e-06, "loss": 0.7866, "step": 4210 }, { "epoch": 7.94, "eval_loss": 0.8209183812141418, "eval_runtime": 52.0364, "eval_samples_per_second": 1303.164, "eval_steps_per_second": 2.556, "step": 4210 }, { "epoch": 7.96, "learning_rate": 1e-06, "loss": 0.7912, "step": 4220 }, { "epoch": 7.96, "eval_loss": 0.8207370042800903, "eval_runtime": 52.1686, "eval_samples_per_second": 1299.863, "eval_steps_per_second": 2.549, "step": 4220 }, { "epoch": 7.98, "learning_rate": 1e-06, "loss": 0.7894, "step": 4230 }, { "epoch": 7.98, "eval_loss": 0.82054603099823, "eval_runtime": 52.0068, "eval_samples_per_second": 1303.906, "eval_steps_per_second": 2.557, "step": 4230 }, { "epoch": 8.0, "learning_rate": 1e-06, "loss": 0.7986, "step": 4240 }, { "epoch": 8.0, "eval_loss": 0.8204940557479858, "eval_runtime": 52.0562, "eval_samples_per_second": 1302.668, "eval_steps_per_second": 2.555, "step": 4240 }, { "epoch": 8.02, "learning_rate": 1e-06, "loss": 0.7956, "step": 4250 }, { "epoch": 8.02, "eval_loss": 0.8204708099365234, "eval_runtime": 52.0158, "eval_samples_per_second": 1303.681, "eval_steps_per_second": 2.557, "step": 4250 }, { "epoch": 8.04, "learning_rate": 1e-06, "loss": 0.7977, "step": 4260 }, { "epoch": 8.04, "eval_loss": 0.8203235864639282, "eval_runtime": 51.9618, "eval_samples_per_second": 1305.036, "eval_steps_per_second": 2.56, "step": 4260 }, { "epoch": 8.06, "learning_rate": 1e-06, "loss": 0.7893, "step": 4270 }, { "epoch": 8.06, "eval_loss": 0.8200653195381165, "eval_runtime": 52.0065, "eval_samples_per_second": 1303.915, "eval_steps_per_second": 2.557, "step": 4270 }, { "epoch": 8.08, "learning_rate": 1e-06, "loss": 0.7886, "step": 4280 }, { "epoch": 8.08, "eval_loss": 0.8199793696403503, "eval_runtime": 52.0989, "eval_samples_per_second": 1301.601, "eval_steps_per_second": 2.553, "step": 4280 }, { "epoch": 8.09, "learning_rate": 1e-06, "loss": 0.8015, "step": 4290 }, { "epoch": 8.09, "eval_loss": 0.819825291633606, "eval_runtime": 52.0141, "eval_samples_per_second": 1303.723, "eval_steps_per_second": 2.557, "step": 4290 }, { "epoch": 8.11, "learning_rate": 1e-06, "loss": 0.7975, "step": 4300 }, { "epoch": 8.11, "eval_loss": 0.8197045922279358, "eval_runtime": 51.996, "eval_samples_per_second": 1304.176, "eval_steps_per_second": 2.558, "step": 4300 }, { "epoch": 8.13, "learning_rate": 1e-06, "loss": 0.7905, "step": 4310 }, { "epoch": 8.13, "eval_loss": 0.8193663954734802, "eval_runtime": 52.1388, "eval_samples_per_second": 1300.606, "eval_steps_per_second": 2.551, "step": 4310 }, { "epoch": 8.15, "learning_rate": 1e-06, "loss": 0.7859, "step": 4320 }, { "epoch": 8.15, "eval_loss": 0.8192564249038696, "eval_runtime": 52.064, "eval_samples_per_second": 1302.475, "eval_steps_per_second": 2.555, "step": 4320 }, { "epoch": 8.17, "learning_rate": 1e-06, "loss": 0.7936, "step": 4330 }, { "epoch": 8.17, "eval_loss": 0.8193119168281555, "eval_runtime": 52.2011, "eval_samples_per_second": 1299.053, "eval_steps_per_second": 2.548, "step": 4330 }, { "epoch": 8.19, "learning_rate": 1e-06, "loss": 0.8034, "step": 4340 }, { "epoch": 8.19, "eval_loss": 0.8189331293106079, "eval_runtime": 51.9711, "eval_samples_per_second": 1304.802, "eval_steps_per_second": 2.559, "step": 4340 }, { "epoch": 8.21, "learning_rate": 1e-06, "loss": 0.7855, "step": 4350 }, { "epoch": 8.21, "eval_loss": 0.8188046813011169, "eval_runtime": 52.0472, "eval_samples_per_second": 1302.895, "eval_steps_per_second": 2.555, "step": 4350 }, { "epoch": 8.23, "learning_rate": 1e-06, "loss": 0.7841, "step": 4360 }, { "epoch": 8.23, "eval_loss": 0.8185828924179077, "eval_runtime": 52.0856, "eval_samples_per_second": 1301.935, "eval_steps_per_second": 2.553, "step": 4360 }, { "epoch": 8.25, "learning_rate": 1e-06, "loss": 0.7798, "step": 4370 }, { "epoch": 8.25, "eval_loss": 0.8185929656028748, "eval_runtime": 52.0044, "eval_samples_per_second": 1303.968, "eval_steps_per_second": 2.557, "step": 4370 }, { "epoch": 8.26, "learning_rate": 1e-06, "loss": 0.7829, "step": 4380 }, { "epoch": 8.26, "eval_loss": 0.8181660771369934, "eval_runtime": 52.0905, "eval_samples_per_second": 1301.811, "eval_steps_per_second": 2.553, "step": 4380 }, { "epoch": 8.28, "learning_rate": 1e-06, "loss": 0.7891, "step": 4390 }, { "epoch": 8.28, "eval_loss": 0.8180428147315979, "eval_runtime": 52.0875, "eval_samples_per_second": 1301.887, "eval_steps_per_second": 2.553, "step": 4390 }, { "epoch": 8.3, "learning_rate": 1e-06, "loss": 0.7813, "step": 4400 }, { "epoch": 8.3, "eval_loss": 0.8179446458816528, "eval_runtime": 51.9869, "eval_samples_per_second": 1304.404, "eval_steps_per_second": 2.558, "step": 4400 }, { "epoch": 8.32, "learning_rate": 1e-06, "loss": 0.7976, "step": 4410 }, { "epoch": 8.32, "eval_loss": 0.8177404403686523, "eval_runtime": 52.004, "eval_samples_per_second": 1303.975, "eval_steps_per_second": 2.557, "step": 4410 }, { "epoch": 8.34, "learning_rate": 1e-06, "loss": 0.7898, "step": 4420 }, { "epoch": 8.34, "eval_loss": 0.8177515268325806, "eval_runtime": 52.0623, "eval_samples_per_second": 1302.515, "eval_steps_per_second": 2.555, "step": 4420 }, { "epoch": 8.36, "learning_rate": 1e-06, "loss": 0.8042, "step": 4430 }, { "epoch": 8.36, "eval_loss": 0.8172406554222107, "eval_runtime": 52.0842, "eval_samples_per_second": 1301.969, "eval_steps_per_second": 2.554, "step": 4430 }, { "epoch": 8.38, "learning_rate": 1e-06, "loss": 0.7951, "step": 4440 }, { "epoch": 8.38, "eval_loss": 0.8171444535255432, "eval_runtime": 52.1302, "eval_samples_per_second": 1300.821, "eval_steps_per_second": 2.551, "step": 4440 }, { "epoch": 8.4, "learning_rate": 1e-06, "loss": 0.7918, "step": 4450 }, { "epoch": 8.4, "eval_loss": 0.8172018527984619, "eval_runtime": 52.3027, "eval_samples_per_second": 1296.531, "eval_steps_per_second": 2.543, "step": 4450 }, { "epoch": 8.42, "learning_rate": 1e-06, "loss": 0.7979, "step": 4460 }, { "epoch": 8.42, "eval_loss": 0.8168366551399231, "eval_runtime": 52.1192, "eval_samples_per_second": 1301.094, "eval_steps_per_second": 2.552, "step": 4460 }, { "epoch": 8.43, "learning_rate": 1e-06, "loss": 0.7939, "step": 4470 }, { "epoch": 8.43, "eval_loss": 0.8166252970695496, "eval_runtime": 52.0263, "eval_samples_per_second": 1303.417, "eval_steps_per_second": 2.556, "step": 4470 }, { "epoch": 8.45, "learning_rate": 1e-06, "loss": 0.7914, "step": 4480 }, { "epoch": 8.45, "eval_loss": 0.8164658546447754, "eval_runtime": 52.148, "eval_samples_per_second": 1300.377, "eval_steps_per_second": 2.55, "step": 4480 }, { "epoch": 8.47, "learning_rate": 1e-06, "loss": 0.7915, "step": 4490 }, { "epoch": 8.47, "eval_loss": 0.8164568543434143, "eval_runtime": 52.2088, "eval_samples_per_second": 1298.861, "eval_steps_per_second": 2.547, "step": 4490 }, { "epoch": 8.49, "learning_rate": 1e-06, "loss": 0.7834, "step": 4500 }, { "epoch": 8.49, "eval_loss": 0.8161565065383911, "eval_runtime": 52.0345, "eval_samples_per_second": 1303.211, "eval_steps_per_second": 2.556, "step": 4500 }, { "epoch": 8.51, "learning_rate": 1e-06, "loss": 0.7859, "step": 4510 }, { "epoch": 8.51, "eval_loss": 0.8159014582633972, "eval_runtime": 52.2477, "eval_samples_per_second": 1297.895, "eval_steps_per_second": 2.546, "step": 4510 }, { "epoch": 8.53, "learning_rate": 1e-06, "loss": 0.8038, "step": 4520 }, { "epoch": 8.53, "eval_loss": 0.815627932548523, "eval_runtime": 51.918, "eval_samples_per_second": 1306.136, "eval_steps_per_second": 2.562, "step": 4520 }, { "epoch": 8.55, "learning_rate": 1e-06, "loss": 0.7982, "step": 4530 }, { "epoch": 8.55, "eval_loss": 0.8155547380447388, "eval_runtime": 52.0742, "eval_samples_per_second": 1302.218, "eval_steps_per_second": 2.554, "step": 4530 }, { "epoch": 8.57, "learning_rate": 1e-06, "loss": 0.7947, "step": 4540 }, { "epoch": 8.57, "eval_loss": 0.8155472278594971, "eval_runtime": 52.0767, "eval_samples_per_second": 1302.157, "eval_steps_per_second": 2.554, "step": 4540 }, { "epoch": 8.58, "learning_rate": 1e-06, "loss": 0.7756, "step": 4550 }, { "epoch": 8.58, "eval_loss": 0.8152143955230713, "eval_runtime": 51.9632, "eval_samples_per_second": 1305.0, "eval_steps_per_second": 2.56, "step": 4550 }, { "epoch": 8.6, "learning_rate": 1e-06, "loss": 0.7886, "step": 4560 }, { "epoch": 8.6, "eval_loss": 0.8149456977844238, "eval_runtime": 52.1021, "eval_samples_per_second": 1301.523, "eval_steps_per_second": 2.553, "step": 4560 }, { "epoch": 8.62, "learning_rate": 1e-06, "loss": 0.7979, "step": 4570 }, { "epoch": 8.62, "eval_loss": 0.8148903846740723, "eval_runtime": 52.1236, "eval_samples_per_second": 1300.986, "eval_steps_per_second": 2.552, "step": 4570 }, { "epoch": 8.64, "learning_rate": 1e-06, "loss": 0.7932, "step": 4580 }, { "epoch": 8.64, "eval_loss": 0.814667284488678, "eval_runtime": 52.1496, "eval_samples_per_second": 1300.337, "eval_steps_per_second": 2.55, "step": 4580 }, { "epoch": 8.66, "learning_rate": 1e-06, "loss": 0.7864, "step": 4590 }, { "epoch": 8.66, "eval_loss": 0.81462162733078, "eval_runtime": 52.1366, "eval_samples_per_second": 1300.66, "eval_steps_per_second": 2.551, "step": 4590 }, { "epoch": 8.68, "learning_rate": 1e-06, "loss": 0.7999, "step": 4600 }, { "epoch": 8.68, "eval_loss": 0.8142070174217224, "eval_runtime": 52.1471, "eval_samples_per_second": 1300.398, "eval_steps_per_second": 2.55, "step": 4600 }, { "epoch": 8.7, "learning_rate": 1e-06, "loss": 0.7952, "step": 4610 }, { "epoch": 8.7, "eval_loss": 0.8141977787017822, "eval_runtime": 52.1159, "eval_samples_per_second": 1301.176, "eval_steps_per_second": 2.552, "step": 4610 }, { "epoch": 8.72, "learning_rate": 1e-06, "loss": 0.7779, "step": 4620 }, { "epoch": 8.72, "eval_loss": 0.8140039443969727, "eval_runtime": 52.0607, "eval_samples_per_second": 1302.556, "eval_steps_per_second": 2.555, "step": 4620 }, { "epoch": 8.74, "learning_rate": 1e-06, "loss": 0.7838, "step": 4630 }, { "epoch": 8.74, "eval_loss": 0.8139258027076721, "eval_runtime": 51.9226, "eval_samples_per_second": 1306.02, "eval_steps_per_second": 2.562, "step": 4630 }, { "epoch": 8.75, "learning_rate": 1e-06, "loss": 0.7842, "step": 4640 }, { "epoch": 8.75, "eval_loss": 0.8135305047035217, "eval_runtime": 52.1327, "eval_samples_per_second": 1300.757, "eval_steps_per_second": 2.551, "step": 4640 }, { "epoch": 8.77, "learning_rate": 1e-06, "loss": 0.7916, "step": 4650 }, { "epoch": 8.77, "eval_loss": 0.8136957287788391, "eval_runtime": 52.1197, "eval_samples_per_second": 1301.083, "eval_steps_per_second": 2.552, "step": 4650 }, { "epoch": 8.79, "learning_rate": 1e-06, "loss": 0.7843, "step": 4660 }, { "epoch": 8.79, "eval_loss": 0.813274085521698, "eval_runtime": 52.1225, "eval_samples_per_second": 1301.011, "eval_steps_per_second": 2.552, "step": 4660 }, { "epoch": 8.81, "learning_rate": 1e-06, "loss": 0.7789, "step": 4670 }, { "epoch": 8.81, "eval_loss": 0.8131061792373657, "eval_runtime": 52.2467, "eval_samples_per_second": 1297.919, "eval_steps_per_second": 2.546, "step": 4670 }, { "epoch": 8.83, "learning_rate": 1e-06, "loss": 0.7848, "step": 4680 }, { "epoch": 8.83, "eval_loss": 0.812892735004425, "eval_runtime": 52.0469, "eval_samples_per_second": 1302.903, "eval_steps_per_second": 2.555, "step": 4680 }, { "epoch": 8.85, "learning_rate": 1e-06, "loss": 0.7941, "step": 4690 }, { "epoch": 8.85, "eval_loss": 0.8127599954605103, "eval_runtime": 52.14, "eval_samples_per_second": 1300.576, "eval_steps_per_second": 2.551, "step": 4690 }, { "epoch": 8.87, "learning_rate": 1e-06, "loss": 0.7715, "step": 4700 }, { "epoch": 8.87, "eval_loss": 0.8127383589744568, "eval_runtime": 52.1476, "eval_samples_per_second": 1300.386, "eval_steps_per_second": 2.55, "step": 4700 }, { "epoch": 8.89, "learning_rate": 1e-06, "loss": 0.7845, "step": 4710 }, { "epoch": 8.89, "eval_loss": 0.8124809861183167, "eval_runtime": 52.1147, "eval_samples_per_second": 1301.206, "eval_steps_per_second": 2.552, "step": 4710 }, { "epoch": 8.91, "learning_rate": 1e-06, "loss": 0.793, "step": 4720 }, { "epoch": 8.91, "eval_loss": 0.8122683167457581, "eval_runtime": 52.1513, "eval_samples_per_second": 1300.294, "eval_steps_per_second": 2.55, "step": 4720 }, { "epoch": 8.92, "learning_rate": 1e-06, "loss": 0.7831, "step": 4730 }, { "epoch": 8.92, "eval_loss": 0.8123452663421631, "eval_runtime": 52.0764, "eval_samples_per_second": 1302.165, "eval_steps_per_second": 2.554, "step": 4730 }, { "epoch": 8.94, "learning_rate": 1e-06, "loss": 0.7676, "step": 4740 }, { "epoch": 8.94, "eval_loss": 0.8120014071464539, "eval_runtime": 52.2681, "eval_samples_per_second": 1297.387, "eval_steps_per_second": 2.545, "step": 4740 }, { "epoch": 8.96, "learning_rate": 1e-06, "loss": 0.8054, "step": 4750 }, { "epoch": 8.96, "eval_loss": 0.8117390871047974, "eval_runtime": 52.1213, "eval_samples_per_second": 1301.043, "eval_steps_per_second": 2.552, "step": 4750 }, { "epoch": 8.98, "learning_rate": 1e-06, "loss": 0.7902, "step": 4760 }, { "epoch": 8.98, "eval_loss": 0.811499297618866, "eval_runtime": 52.1351, "eval_samples_per_second": 1300.698, "eval_steps_per_second": 2.551, "step": 4760 }, { "epoch": 9.0, "learning_rate": 1e-06, "loss": 0.7635, "step": 4770 }, { "epoch": 9.0, "eval_loss": 0.8113887310028076, "eval_runtime": 52.1292, "eval_samples_per_second": 1300.846, "eval_steps_per_second": 2.551, "step": 4770 }, { "epoch": 9.02, "learning_rate": 1e-06, "loss": 0.7836, "step": 4780 }, { "epoch": 9.02, "eval_loss": 0.8115408420562744, "eval_runtime": 52.2174, "eval_samples_per_second": 1298.648, "eval_steps_per_second": 2.547, "step": 4780 }, { "epoch": 9.04, "learning_rate": 1e-06, "loss": 0.7819, "step": 4790 }, { "epoch": 9.04, "eval_loss": 0.8114977478981018, "eval_runtime": 52.1485, "eval_samples_per_second": 1300.362, "eval_steps_per_second": 2.55, "step": 4790 }, { "epoch": 9.06, "learning_rate": 1e-06, "loss": 0.7802, "step": 4800 }, { "epoch": 9.06, "eval_loss": 0.8112668395042419, "eval_runtime": 52.4451, "eval_samples_per_second": 1293.009, "eval_steps_per_second": 2.536, "step": 4800 }, { "epoch": 9.08, "learning_rate": 1e-06, "loss": 0.7808, "step": 4810 }, { "epoch": 9.08, "eval_loss": 0.8110054135322571, "eval_runtime": 52.0098, "eval_samples_per_second": 1303.83, "eval_steps_per_second": 2.557, "step": 4810 }, { "epoch": 9.09, "learning_rate": 1e-06, "loss": 0.7942, "step": 4820 }, { "epoch": 9.09, "eval_loss": 0.8112225532531738, "eval_runtime": 52.1881, "eval_samples_per_second": 1299.377, "eval_steps_per_second": 2.548, "step": 4820 }, { "epoch": 9.11, "learning_rate": 1e-06, "loss": 0.7899, "step": 4830 }, { "epoch": 9.11, "eval_loss": 0.8108929991722107, "eval_runtime": 52.179, "eval_samples_per_second": 1299.604, "eval_steps_per_second": 2.549, "step": 4830 }, { "epoch": 9.13, "learning_rate": 1e-06, "loss": 0.7814, "step": 4840 }, { "epoch": 9.13, "eval_loss": 0.8106999397277832, "eval_runtime": 52.0828, "eval_samples_per_second": 1302.005, "eval_steps_per_second": 2.554, "step": 4840 }, { "epoch": 9.15, "learning_rate": 1e-06, "loss": 0.7886, "step": 4850 }, { "epoch": 9.15, "eval_loss": 0.810533881187439, "eval_runtime": 52.1124, "eval_samples_per_second": 1301.263, "eval_steps_per_second": 2.552, "step": 4850 }, { "epoch": 9.17, "learning_rate": 1e-06, "loss": 0.7697, "step": 4860 }, { "epoch": 9.17, "eval_loss": 0.8104182481765747, "eval_runtime": 52.0327, "eval_samples_per_second": 1303.256, "eval_steps_per_second": 2.556, "step": 4860 }, { "epoch": 9.19, "learning_rate": 1e-06, "loss": 0.7859, "step": 4870 }, { "epoch": 9.19, "eval_loss": 0.8101289868354797, "eval_runtime": 52.1542, "eval_samples_per_second": 1300.221, "eval_steps_per_second": 2.55, "step": 4870 }, { "epoch": 9.21, "learning_rate": 1e-06, "loss": 0.7718, "step": 4880 }, { "epoch": 9.21, "eval_loss": 0.8099629878997803, "eval_runtime": 52.087, "eval_samples_per_second": 1301.899, "eval_steps_per_second": 2.553, "step": 4880 }, { "epoch": 9.23, "learning_rate": 1e-06, "loss": 0.771, "step": 4890 }, { "epoch": 9.23, "eval_loss": 0.8100450038909912, "eval_runtime": 52.1185, "eval_samples_per_second": 1301.112, "eval_steps_per_second": 2.552, "step": 4890 }, { "epoch": 9.25, "learning_rate": 1e-06, "loss": 0.7852, "step": 4900 }, { "epoch": 9.25, "eval_loss": 0.8099735379219055, "eval_runtime": 52.051, "eval_samples_per_second": 1302.8, "eval_steps_per_second": 2.555, "step": 4900 }, { "epoch": 9.26, "learning_rate": 1e-06, "loss": 0.7722, "step": 4910 }, { "epoch": 9.26, "eval_loss": 0.8093700408935547, "eval_runtime": 51.9299, "eval_samples_per_second": 1305.836, "eval_steps_per_second": 2.561, "step": 4910 }, { "epoch": 9.28, "learning_rate": 1e-06, "loss": 0.7699, "step": 4920 }, { "epoch": 9.28, "eval_loss": 0.8094404935836792, "eval_runtime": 52.6564, "eval_samples_per_second": 1287.821, "eval_steps_per_second": 2.526, "step": 4920 }, { "epoch": 9.3, "learning_rate": 1e-06, "loss": 0.7847, "step": 4930 }, { "epoch": 9.3, "eval_loss": 0.8093037605285645, "eval_runtime": 52.1908, "eval_samples_per_second": 1299.309, "eval_steps_per_second": 2.548, "step": 4930 }, { "epoch": 9.32, "learning_rate": 1e-06, "loss": 0.776, "step": 4940 }, { "epoch": 9.32, "eval_loss": 0.8089527487754822, "eval_runtime": 52.1434, "eval_samples_per_second": 1300.49, "eval_steps_per_second": 2.551, "step": 4940 }, { "epoch": 9.34, "learning_rate": 1e-06, "loss": 0.7727, "step": 4950 }, { "epoch": 9.34, "eval_loss": 0.8089351058006287, "eval_runtime": 52.0839, "eval_samples_per_second": 1301.976, "eval_steps_per_second": 2.554, "step": 4950 }, { "epoch": 9.36, "learning_rate": 1e-06, "loss": 0.782, "step": 4960 }, { "epoch": 9.36, "eval_loss": 0.8087407350540161, "eval_runtime": 52.211, "eval_samples_per_second": 1298.808, "eval_steps_per_second": 2.547, "step": 4960 }, { "epoch": 9.38, "learning_rate": 1e-06, "loss": 0.7715, "step": 4970 }, { "epoch": 9.38, "eval_loss": 0.8086969256401062, "eval_runtime": 51.9535, "eval_samples_per_second": 1305.245, "eval_steps_per_second": 2.56, "step": 4970 }, { "epoch": 9.4, "learning_rate": 1e-06, "loss": 0.7812, "step": 4980 }, { "epoch": 9.4, "eval_loss": 0.8085045218467712, "eval_runtime": 52.0964, "eval_samples_per_second": 1301.664, "eval_steps_per_second": 2.553, "step": 4980 }, { "epoch": 9.42, "learning_rate": 1e-06, "loss": 0.7648, "step": 4990 }, { "epoch": 9.42, "eval_loss": 0.8081462383270264, "eval_runtime": 51.9735, "eval_samples_per_second": 1304.741, "eval_steps_per_second": 2.559, "step": 4990 }, { "epoch": 9.43, "learning_rate": 1e-06, "loss": 0.7694, "step": 5000 }, { "epoch": 9.43, "eval_loss": 0.8081194758415222, "eval_runtime": 52.1059, "eval_samples_per_second": 1301.426, "eval_steps_per_second": 2.552, "step": 5000 }, { "epoch": 9.45, "learning_rate": 1e-06, "loss": 0.7824, "step": 5010 }, { "epoch": 9.45, "eval_loss": 0.8081274032592773, "eval_runtime": 52.1903, "eval_samples_per_second": 1299.322, "eval_steps_per_second": 2.548, "step": 5010 }, { "epoch": 9.47, "learning_rate": 1e-06, "loss": 0.7822, "step": 5020 }, { "epoch": 9.47, "eval_loss": 0.8078827261924744, "eval_runtime": 52.0034, "eval_samples_per_second": 1303.991, "eval_steps_per_second": 2.558, "step": 5020 }, { "epoch": 9.49, "learning_rate": 1e-06, "loss": 0.7766, "step": 5030 }, { "epoch": 9.49, "eval_loss": 0.807519793510437, "eval_runtime": 52.2252, "eval_samples_per_second": 1298.453, "eval_steps_per_second": 2.547, "step": 5030 }, { "epoch": 9.51, "learning_rate": 1e-06, "loss": 0.7732, "step": 5040 }, { "epoch": 9.51, "eval_loss": 0.807347297668457, "eval_runtime": 52.0984, "eval_samples_per_second": 1301.614, "eval_steps_per_second": 2.553, "step": 5040 }, { "epoch": 9.53, "learning_rate": 1e-06, "loss": 0.7861, "step": 5050 }, { "epoch": 9.53, "eval_loss": 0.8072365522384644, "eval_runtime": 52.0257, "eval_samples_per_second": 1303.434, "eval_steps_per_second": 2.556, "step": 5050 }, { "epoch": 9.55, "learning_rate": 1e-06, "loss": 0.7828, "step": 5060 }, { "epoch": 9.55, "eval_loss": 0.8069567084312439, "eval_runtime": 52.0026, "eval_samples_per_second": 1304.011, "eval_steps_per_second": 2.558, "step": 5060 }, { "epoch": 9.57, "learning_rate": 1e-06, "loss": 0.79, "step": 5070 }, { "epoch": 9.57, "eval_loss": 0.807108998298645, "eval_runtime": 52.0409, "eval_samples_per_second": 1303.052, "eval_steps_per_second": 2.556, "step": 5070 }, { "epoch": 9.58, "learning_rate": 1e-06, "loss": 0.764, "step": 5080 }, { "epoch": 9.58, "eval_loss": 0.8068212270736694, "eval_runtime": 51.974, "eval_samples_per_second": 1304.729, "eval_steps_per_second": 2.559, "step": 5080 }, { "epoch": 9.6, "learning_rate": 1e-06, "loss": 0.7667, "step": 5090 }, { "epoch": 9.6, "eval_loss": 0.8066729307174683, "eval_runtime": 52.1256, "eval_samples_per_second": 1300.934, "eval_steps_per_second": 2.552, "step": 5090 }, { "epoch": 9.62, "learning_rate": 1e-06, "loss": 0.7745, "step": 5100 }, { "epoch": 9.62, "eval_loss": 0.8065735101699829, "eval_runtime": 52.0723, "eval_samples_per_second": 1302.266, "eval_steps_per_second": 2.554, "step": 5100 }, { "epoch": 9.64, "learning_rate": 1e-06, "loss": 0.774, "step": 5110 }, { "epoch": 9.64, "eval_loss": 0.8063581585884094, "eval_runtime": 51.992, "eval_samples_per_second": 1304.277, "eval_steps_per_second": 2.558, "step": 5110 }, { "epoch": 9.66, "learning_rate": 1e-06, "loss": 0.7775, "step": 5120 }, { "epoch": 9.66, "eval_loss": 0.8062164783477783, "eval_runtime": 52.2512, "eval_samples_per_second": 1297.809, "eval_steps_per_second": 2.545, "step": 5120 }, { "epoch": 9.68, "learning_rate": 1e-06, "loss": 0.7571, "step": 5130 }, { "epoch": 9.68, "eval_loss": 0.8059104084968567, "eval_runtime": 52.076, "eval_samples_per_second": 1302.173, "eval_steps_per_second": 2.554, "step": 5130 }, { "epoch": 9.7, "learning_rate": 1e-06, "loss": 0.7807, "step": 5140 }, { "epoch": 9.7, "eval_loss": 0.8059520125389099, "eval_runtime": 52.109, "eval_samples_per_second": 1301.349, "eval_steps_per_second": 2.552, "step": 5140 }, { "epoch": 9.72, "learning_rate": 1e-06, "loss": 0.7838, "step": 5150 }, { "epoch": 9.72, "eval_loss": 0.8056530952453613, "eval_runtime": 51.8958, "eval_samples_per_second": 1306.695, "eval_steps_per_second": 2.563, "step": 5150 }, { "epoch": 9.74, "learning_rate": 1e-06, "loss": 0.7773, "step": 5160 }, { "epoch": 9.74, "eval_loss": 0.8054617047309875, "eval_runtime": 52.2537, "eval_samples_per_second": 1297.746, "eval_steps_per_second": 2.545, "step": 5160 }, { "epoch": 9.75, "learning_rate": 1e-06, "loss": 0.7917, "step": 5170 }, { "epoch": 9.75, "eval_loss": 0.8053807616233826, "eval_runtime": 52.0793, "eval_samples_per_second": 1302.09, "eval_steps_per_second": 2.554, "step": 5170 }, { "epoch": 9.77, "learning_rate": 1e-06, "loss": 0.7904, "step": 5180 }, { "epoch": 9.77, "eval_loss": 0.8053616881370544, "eval_runtime": 51.9922, "eval_samples_per_second": 1304.272, "eval_steps_per_second": 2.558, "step": 5180 }, { "epoch": 9.79, "learning_rate": 1e-06, "loss": 0.781, "step": 5190 }, { "epoch": 9.79, "eval_loss": 0.8050292134284973, "eval_runtime": 52.2751, "eval_samples_per_second": 1297.214, "eval_steps_per_second": 2.544, "step": 5190 }, { "epoch": 9.81, "learning_rate": 1e-06, "loss": 0.7674, "step": 5200 }, { "epoch": 9.81, "eval_loss": 0.8047987222671509, "eval_runtime": 52.0988, "eval_samples_per_second": 1301.603, "eval_steps_per_second": 2.553, "step": 5200 }, { "epoch": 9.83, "learning_rate": 1e-06, "loss": 0.7704, "step": 5210 }, { "epoch": 9.83, "eval_loss": 0.8047385215759277, "eval_runtime": 52.0987, "eval_samples_per_second": 1301.607, "eval_steps_per_second": 2.553, "step": 5210 }, { "epoch": 9.85, "learning_rate": 1e-06, "loss": 0.7526, "step": 5220 }, { "epoch": 9.85, "eval_loss": 0.8044944405555725, "eval_runtime": 52.1435, "eval_samples_per_second": 1300.489, "eval_steps_per_second": 2.551, "step": 5220 }, { "epoch": 9.87, "learning_rate": 1e-06, "loss": 0.784, "step": 5230 }, { "epoch": 9.87, "eval_loss": 0.8044453859329224, "eval_runtime": 52.2575, "eval_samples_per_second": 1297.652, "eval_steps_per_second": 2.545, "step": 5230 }, { "epoch": 9.89, "learning_rate": 1e-06, "loss": 0.7644, "step": 5240 }, { "epoch": 9.89, "eval_loss": 0.8041695356369019, "eval_runtime": 52.2688, "eval_samples_per_second": 1297.372, "eval_steps_per_second": 2.545, "step": 5240 }, { "epoch": 9.91, "learning_rate": 1e-06, "loss": 0.7684, "step": 5250 }, { "epoch": 9.91, "eval_loss": 0.804131031036377, "eval_runtime": 52.1426, "eval_samples_per_second": 1300.51, "eval_steps_per_second": 2.551, "step": 5250 }, { "epoch": 9.92, "learning_rate": 1e-06, "loss": 0.7565, "step": 5260 }, { "epoch": 9.92, "eval_loss": 0.8040266036987305, "eval_runtime": 52.0192, "eval_samples_per_second": 1303.595, "eval_steps_per_second": 2.557, "step": 5260 }, { "epoch": 9.94, "learning_rate": 1e-06, "loss": 0.7873, "step": 5270 }, { "epoch": 9.94, "eval_loss": 0.8036627769470215, "eval_runtime": 52.3717, "eval_samples_per_second": 1294.821, "eval_steps_per_second": 2.54, "step": 5270 }, { "epoch": 9.96, "learning_rate": 1e-06, "loss": 0.7871, "step": 5280 }, { "epoch": 9.96, "eval_loss": 0.8036572337150574, "eval_runtime": 52.2188, "eval_samples_per_second": 1298.613, "eval_steps_per_second": 2.547, "step": 5280 }, { "epoch": 9.98, "learning_rate": 1e-06, "loss": 0.7719, "step": 5290 }, { "epoch": 9.98, "eval_loss": 0.8034039735794067, "eval_runtime": 51.9811, "eval_samples_per_second": 1304.551, "eval_steps_per_second": 2.559, "step": 5290 }, { "epoch": 10.0, "learning_rate": 1e-06, "loss": 0.7852, "step": 5300 }, { "epoch": 10.0, "eval_loss": 0.8034818172454834, "eval_runtime": 52.3185, "eval_samples_per_second": 1296.138, "eval_steps_per_second": 2.542, "step": 5300 } ], "logging_steps": 10, "max_steps": 5300, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10, "total_flos": 767247589048320.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }