[ { "entropy": 2.184988856315613, "epoch": 0.0625, "grad_norm": 6.174712657928467, "learning_rate": 0.00029531249999999995, "loss": 0.513, "mean_token_accuracy": 0.9921568632125854, "num_tokens": 4096.0, "step": 2 }, { "entropy": 1.7302125096321106, "epoch": 0.125, "grad_norm": 0.9681358933448792, "learning_rate": 0.00028593749999999995, "loss": 0.441, "mean_token_accuracy": 0.9870097935199738, "num_tokens": 8192.0, "step": 4 }, { "entropy": 2.0538085103034973, "epoch": 0.1875, "grad_norm": 0.5153102278709412, "learning_rate": 0.00027656249999999995, "loss": 0.4929, "mean_token_accuracy": 0.9980392158031464, "num_tokens": 12288.0, "step": 6 }, { "entropy": 1.7263505458831787, "epoch": 0.25, "grad_norm": 0.6404310464859009, "learning_rate": 0.00026718749999999996, "loss": 0.4014, "mean_token_accuracy": 0.9987744987010956, "num_tokens": 16384.0, "step": 8 }, { "entropy": 1.8347786664962769, "epoch": 0.3125, "grad_norm": 0.6209350228309631, "learning_rate": 0.00025781249999999996, "loss": 0.4365, "mean_token_accuracy": 1.0, "num_tokens": 20480.0, "step": 10 }, { "entropy": 1.4254534244537354, "epoch": 0.375, "grad_norm": 0.44427844882011414, "learning_rate": 0.00024843749999999996, "loss": 0.2791, "mean_token_accuracy": 1.0, "num_tokens": 24576.0, "step": 12 }, { "entropy": 1.8001930117607117, "epoch": 0.4375, "grad_norm": 0.3619579076766968, "learning_rate": 0.0002390625, "loss": 0.3962, "mean_token_accuracy": 0.9987744987010956, "num_tokens": 28672.0, "step": 14 }, { "entropy": 1.6178001761436462, "epoch": 0.5, "grad_norm": 0.38358834385871887, "learning_rate": 0.0002296875, "loss": 0.3492, "mean_token_accuracy": 1.0, "num_tokens": 32768.0, "step": 16 }, { "entropy": 1.661442220211029, "epoch": 0.5625, "grad_norm": 0.3749903440475464, "learning_rate": 0.00022031249999999997, "loss": 0.3582, "mean_token_accuracy": 1.0, "num_tokens": 36864.0, "step": 18 }, { "entropy": 1.5717861652374268, "epoch": 0.625, "grad_norm": 0.36388659477233887, "learning_rate": 0.00021093749999999997, "loss": 0.3126, "mean_token_accuracy": 1.0, "num_tokens": 40960.0, "step": 20 }, { "entropy": 1.5534449219703674, "epoch": 0.6875, "grad_norm": 0.40969353914260864, "learning_rate": 0.00020156249999999997, "loss": 0.3478, "mean_token_accuracy": 0.9997549057006836, "num_tokens": 45056.0, "step": 22 }, { "entropy": 1.5265448689460754, "epoch": 0.75, "grad_norm": 0.41839736700057983, "learning_rate": 0.00019218749999999998, "loss": 0.3601, "mean_token_accuracy": 0.9995098114013672, "num_tokens": 49152.0, "step": 24 }, { "entropy": 2.074858069419861, "epoch": 0.8125, "grad_norm": 0.44888272881507874, "learning_rate": 0.00018281249999999998, "loss": 0.4777, "mean_token_accuracy": 0.9997549057006836, "num_tokens": 53248.0, "step": 26 }, { "entropy": 1.6095194816589355, "epoch": 0.875, "grad_norm": 0.5080280900001526, "learning_rate": 0.00017343749999999998, "loss": 0.3859, "mean_token_accuracy": 0.9987744987010956, "num_tokens": 57344.0, "step": 28 }, { "entropy": 1.7032344341278076, "epoch": 0.9375, "grad_norm": 0.3624984622001648, "learning_rate": 0.00016406249999999998, "loss": 0.3576, "mean_token_accuracy": 1.0, "num_tokens": 61440.0, "step": 30 }, { "entropy": 1.8873920440673828, "epoch": 1.0, "grad_norm": 0.618506133556366, "learning_rate": 0.00015468749999999999, "loss": 0.446, "mean_token_accuracy": 0.9997549057006836, "num_tokens": 64256.0, "step": 32 }, { "entropy": 1.4586840271949768, "epoch": 1.0625, "grad_norm": 0.3723963797092438, "learning_rate": 0.0001453125, "loss": 0.2824, "mean_token_accuracy": 1.0, "num_tokens": 68352.0, "step": 34 }, { "entropy": 1.2832568883895874, "epoch": 1.125, "grad_norm": 0.3108985424041748, "learning_rate": 0.0001359375, "loss": 0.2191, "mean_token_accuracy": 1.0, "num_tokens": 72448.0, "step": 36 }, { "entropy": 1.523368000984192, "epoch": 1.1875, "grad_norm": 0.3509906232357025, "learning_rate": 0.0001265625, "loss": 0.3042, "mean_token_accuracy": 0.9995098114013672, "num_tokens": 76544.0, "step": 38 }, { "entropy": 1.2994396686553955, "epoch": 1.25, "grad_norm": 0.3014850616455078, "learning_rate": 0.0001171875, "loss": 0.2456, "mean_token_accuracy": 1.0, "num_tokens": 80640.0, "step": 40 }, { "entropy": 1.3811439871788025, "epoch": 1.3125, "grad_norm": 0.32755109667778015, "learning_rate": 0.00010781249999999998, "loss": 0.2521, "mean_token_accuracy": 1.0, "num_tokens": 84736.0, "step": 42 }, { "entropy": 1.1994215250015259, "epoch": 1.375, "grad_norm": 0.24541084468364716, "learning_rate": 9.843749999999999e-05, "loss": 0.2118, "mean_token_accuracy": 1.0, "num_tokens": 88832.0, "step": 44 }, { "entropy": 1.0519097447395325, "epoch": 1.4375, "grad_norm": 0.2063349187374115, "learning_rate": 8.906249999999999e-05, "loss": 0.1943, "mean_token_accuracy": 1.0, "num_tokens": 92928.0, "step": 46 }, { "entropy": 1.1152112483978271, "epoch": 1.5, "grad_norm": 0.31837204098701477, "learning_rate": 7.968749999999999e-05, "loss": 0.2132, "mean_token_accuracy": 1.0, "num_tokens": 97024.0, "step": 48 }, { "entropy": 1.2649919390678406, "epoch": 1.5625, "grad_norm": 0.289153516292572, "learning_rate": 7.03125e-05, "loss": 0.2158, "mean_token_accuracy": 1.0, "num_tokens": 101120.0, "step": 50 }, { "entropy": 0.9955946207046509, "epoch": 1.625, "grad_norm": 0.2607753276824951, "learning_rate": 6.09375e-05, "loss": 0.1913, "mean_token_accuracy": 0.9997549057006836, "num_tokens": 105216.0, "step": 52 }, { "entropy": 1.3506205081939697, "epoch": 1.6875, "grad_norm": 0.2850724458694458, "learning_rate": 5.156249999999999e-05, "loss": 0.2234, "mean_token_accuracy": 1.0, "num_tokens": 109312.0, "step": 54 }, { "entropy": 1.3460099697113037, "epoch": 1.75, "grad_norm": 0.23587484657764435, "learning_rate": 4.2187499999999995e-05, "loss": 0.2544, "mean_token_accuracy": 1.0, "num_tokens": 113408.0, "step": 56 }, { "entropy": 1.4365423321723938, "epoch": 1.8125, "grad_norm": 0.3239842653274536, "learning_rate": 3.28125e-05, "loss": 0.2958, "mean_token_accuracy": 1.0, "num_tokens": 117504.0, "step": 58 }, { "entropy": 1.567048728466034, "epoch": 1.875, "grad_norm": 0.34480002522468567, "learning_rate": 2.3437499999999997e-05, "loss": 0.3122, "mean_token_accuracy": 1.0, "num_tokens": 121600.0, "step": 60 }, { "entropy": 1.344693124294281, "epoch": 1.9375, "grad_norm": 0.25280237197875977, "learning_rate": 1.40625e-05, "loss": 0.2472, "mean_token_accuracy": 1.0, "num_tokens": 125696.0, "step": 62 }, { "entropy": 1.2893942594528198, "epoch": 2.0, "grad_norm": 0.5825140476226807, "learning_rate": 4.6875e-06, "loss": 0.2222, "mean_token_accuracy": 1.0, "num_tokens": 128512.0, "step": 64 }, { "epoch": 2.0, "step": 64, "total_flos": 59560143504384.0, "train_loss": 0.32000101869925857, "train_runtime": 720.302, "train_samples_per_second": 0.711, "train_steps_per_second": 0.089 } ]