| [ | |
| { | |
| "entropy": 2.184988856315613, | |
| "epoch": 0.0625, | |
| "grad_norm": 6.174712657928467, | |
| "learning_rate": 0.00029531249999999995, | |
| "loss": 0.513, | |
| "mean_token_accuracy": 0.9921568632125854, | |
| "num_tokens": 4096.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 1.7302125096321106, | |
| "epoch": 0.125, | |
| "grad_norm": 0.9681358933448792, | |
| "learning_rate": 0.00028593749999999995, | |
| "loss": 0.441, | |
| "mean_token_accuracy": 0.9870097935199738, | |
| "num_tokens": 8192.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 2.0538085103034973, | |
| "epoch": 0.1875, | |
| "grad_norm": 0.5153102278709412, | |
| "learning_rate": 0.00027656249999999995, | |
| "loss": 0.4929, | |
| "mean_token_accuracy": 0.9980392158031464, | |
| "num_tokens": 12288.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 1.7263505458831787, | |
| "epoch": 0.25, | |
| "grad_norm": 0.6404310464859009, | |
| "learning_rate": 0.00026718749999999996, | |
| "loss": 0.4014, | |
| "mean_token_accuracy": 0.9987744987010956, | |
| "num_tokens": 16384.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 1.8347786664962769, | |
| "epoch": 0.3125, | |
| "grad_norm": 0.6209350228309631, | |
| "learning_rate": 0.00025781249999999996, | |
| "loss": 0.4365, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 20480.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.4254534244537354, | |
| "epoch": 0.375, | |
| "grad_norm": 0.44427844882011414, | |
| "learning_rate": 0.00024843749999999996, | |
| "loss": 0.2791, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 24576.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 1.8001930117607117, | |
| "epoch": 0.4375, | |
| "grad_norm": 0.3619579076766968, | |
| "learning_rate": 0.0002390625, | |
| "loss": 0.3962, | |
| "mean_token_accuracy": 0.9987744987010956, | |
| "num_tokens": 28672.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 1.6178001761436462, | |
| "epoch": 0.5, | |
| "grad_norm": 0.38358834385871887, | |
| "learning_rate": 0.0002296875, | |
| "loss": 0.3492, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 32768.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 1.661442220211029, | |
| "epoch": 0.5625, | |
| "grad_norm": 0.3749903440475464, | |
| "learning_rate": 0.00022031249999999997, | |
| "loss": 0.3582, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 36864.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 1.5717861652374268, | |
| "epoch": 0.625, | |
| "grad_norm": 0.36388659477233887, | |
| "learning_rate": 0.00021093749999999997, | |
| "loss": 0.3126, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 40960.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.5534449219703674, | |
| "epoch": 0.6875, | |
| "grad_norm": 0.40969353914260864, | |
| "learning_rate": 0.00020156249999999997, | |
| "loss": 0.3478, | |
| "mean_token_accuracy": 0.9997549057006836, | |
| "num_tokens": 45056.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 1.5265448689460754, | |
| "epoch": 0.75, | |
| "grad_norm": 0.41839736700057983, | |
| "learning_rate": 0.00019218749999999998, | |
| "loss": 0.3601, | |
| "mean_token_accuracy": 0.9995098114013672, | |
| "num_tokens": 49152.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 2.074858069419861, | |
| "epoch": 0.8125, | |
| "grad_norm": 0.44888272881507874, | |
| "learning_rate": 0.00018281249999999998, | |
| "loss": 0.4777, | |
| "mean_token_accuracy": 0.9997549057006836, | |
| "num_tokens": 53248.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 1.6095194816589355, | |
| "epoch": 0.875, | |
| "grad_norm": 0.5080280900001526, | |
| "learning_rate": 0.00017343749999999998, | |
| "loss": 0.3859, | |
| "mean_token_accuracy": 0.9987744987010956, | |
| "num_tokens": 57344.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 1.7032344341278076, | |
| "epoch": 0.9375, | |
| "grad_norm": 0.3624984622001648, | |
| "learning_rate": 0.00016406249999999998, | |
| "loss": 0.3576, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 61440.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.8873920440673828, | |
| "epoch": 1.0, | |
| "grad_norm": 0.618506133556366, | |
| "learning_rate": 0.00015468749999999999, | |
| "loss": 0.446, | |
| "mean_token_accuracy": 0.9997549057006836, | |
| "num_tokens": 64256.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 1.4586840271949768, | |
| "epoch": 1.0625, | |
| "grad_norm": 0.3723963797092438, | |
| "learning_rate": 0.0001453125, | |
| "loss": 0.2824, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 68352.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 1.2832568883895874, | |
| "epoch": 1.125, | |
| "grad_norm": 0.3108985424041748, | |
| "learning_rate": 0.0001359375, | |
| "loss": 0.2191, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 72448.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 1.523368000984192, | |
| "epoch": 1.1875, | |
| "grad_norm": 0.3509906232357025, | |
| "learning_rate": 0.0001265625, | |
| "loss": 0.3042, | |
| "mean_token_accuracy": 0.9995098114013672, | |
| "num_tokens": 76544.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 1.2994396686553955, | |
| "epoch": 1.25, | |
| "grad_norm": 0.3014850616455078, | |
| "learning_rate": 0.0001171875, | |
| "loss": 0.2456, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 80640.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.3811439871788025, | |
| "epoch": 1.3125, | |
| "grad_norm": 0.32755109667778015, | |
| "learning_rate": 0.00010781249999999998, | |
| "loss": 0.2521, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 84736.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 1.1994215250015259, | |
| "epoch": 1.375, | |
| "grad_norm": 0.24541084468364716, | |
| "learning_rate": 9.843749999999999e-05, | |
| "loss": 0.2118, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 88832.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 1.0519097447395325, | |
| "epoch": 1.4375, | |
| "grad_norm": 0.2063349187374115, | |
| "learning_rate": 8.906249999999999e-05, | |
| "loss": 0.1943, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 92928.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 1.1152112483978271, | |
| "epoch": 1.5, | |
| "grad_norm": 0.31837204098701477, | |
| "learning_rate": 7.968749999999999e-05, | |
| "loss": 0.2132, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 97024.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 1.2649919390678406, | |
| "epoch": 1.5625, | |
| "grad_norm": 0.289153516292572, | |
| "learning_rate": 7.03125e-05, | |
| "loss": 0.2158, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 101120.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.9955946207046509, | |
| "epoch": 1.625, | |
| "grad_norm": 0.2607753276824951, | |
| "learning_rate": 6.09375e-05, | |
| "loss": 0.1913, | |
| "mean_token_accuracy": 0.9997549057006836, | |
| "num_tokens": 105216.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 1.3506205081939697, | |
| "epoch": 1.6875, | |
| "grad_norm": 0.2850724458694458, | |
| "learning_rate": 5.156249999999999e-05, | |
| "loss": 0.2234, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 109312.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 1.3460099697113037, | |
| "epoch": 1.75, | |
| "grad_norm": 0.23587484657764435, | |
| "learning_rate": 4.2187499999999995e-05, | |
| "loss": 0.2544, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 113408.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 1.4365423321723938, | |
| "epoch": 1.8125, | |
| "grad_norm": 0.3239842653274536, | |
| "learning_rate": 3.28125e-05, | |
| "loss": 0.2958, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 117504.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 1.567048728466034, | |
| "epoch": 1.875, | |
| "grad_norm": 0.34480002522468567, | |
| "learning_rate": 2.3437499999999997e-05, | |
| "loss": 0.3122, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 121600.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.344693124294281, | |
| "epoch": 1.9375, | |
| "grad_norm": 0.25280237197875977, | |
| "learning_rate": 1.40625e-05, | |
| "loss": 0.2472, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 125696.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 1.2893942594528198, | |
| "epoch": 2.0, | |
| "grad_norm": 0.5825140476226807, | |
| "learning_rate": 4.6875e-06, | |
| "loss": 0.2222, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 128512.0, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "step": 64, | |
| "total_flos": 59560143504384.0, | |
| "train_loss": 0.32000101869925857, | |
| "train_runtime": 720.302, | |
| "train_samples_per_second": 0.711, | |
| "train_steps_per_second": 0.089 | |
| } | |
| ] |