MoA-150M / MoA-150M_results.json
reaperdoesntknow's picture
Upload 2 files
66e9d41 verified
[
{
"entropy": 2.184988856315613,
"epoch": 0.0625,
"grad_norm": 6.174712657928467,
"learning_rate": 0.00029531249999999995,
"loss": 0.513,
"mean_token_accuracy": 0.9921568632125854,
"num_tokens": 4096.0,
"step": 2
},
{
"entropy": 1.7302125096321106,
"epoch": 0.125,
"grad_norm": 0.9681358933448792,
"learning_rate": 0.00028593749999999995,
"loss": 0.441,
"mean_token_accuracy": 0.9870097935199738,
"num_tokens": 8192.0,
"step": 4
},
{
"entropy": 2.0538085103034973,
"epoch": 0.1875,
"grad_norm": 0.5153102278709412,
"learning_rate": 0.00027656249999999995,
"loss": 0.4929,
"mean_token_accuracy": 0.9980392158031464,
"num_tokens": 12288.0,
"step": 6
},
{
"entropy": 1.7263505458831787,
"epoch": 0.25,
"grad_norm": 0.6404310464859009,
"learning_rate": 0.00026718749999999996,
"loss": 0.4014,
"mean_token_accuracy": 0.9987744987010956,
"num_tokens": 16384.0,
"step": 8
},
{
"entropy": 1.8347786664962769,
"epoch": 0.3125,
"grad_norm": 0.6209350228309631,
"learning_rate": 0.00025781249999999996,
"loss": 0.4365,
"mean_token_accuracy": 1.0,
"num_tokens": 20480.0,
"step": 10
},
{
"entropy": 1.4254534244537354,
"epoch": 0.375,
"grad_norm": 0.44427844882011414,
"learning_rate": 0.00024843749999999996,
"loss": 0.2791,
"mean_token_accuracy": 1.0,
"num_tokens": 24576.0,
"step": 12
},
{
"entropy": 1.8001930117607117,
"epoch": 0.4375,
"grad_norm": 0.3619579076766968,
"learning_rate": 0.0002390625,
"loss": 0.3962,
"mean_token_accuracy": 0.9987744987010956,
"num_tokens": 28672.0,
"step": 14
},
{
"entropy": 1.6178001761436462,
"epoch": 0.5,
"grad_norm": 0.38358834385871887,
"learning_rate": 0.0002296875,
"loss": 0.3492,
"mean_token_accuracy": 1.0,
"num_tokens": 32768.0,
"step": 16
},
{
"entropy": 1.661442220211029,
"epoch": 0.5625,
"grad_norm": 0.3749903440475464,
"learning_rate": 0.00022031249999999997,
"loss": 0.3582,
"mean_token_accuracy": 1.0,
"num_tokens": 36864.0,
"step": 18
},
{
"entropy": 1.5717861652374268,
"epoch": 0.625,
"grad_norm": 0.36388659477233887,
"learning_rate": 0.00021093749999999997,
"loss": 0.3126,
"mean_token_accuracy": 1.0,
"num_tokens": 40960.0,
"step": 20
},
{
"entropy": 1.5534449219703674,
"epoch": 0.6875,
"grad_norm": 0.40969353914260864,
"learning_rate": 0.00020156249999999997,
"loss": 0.3478,
"mean_token_accuracy": 0.9997549057006836,
"num_tokens": 45056.0,
"step": 22
},
{
"entropy": 1.5265448689460754,
"epoch": 0.75,
"grad_norm": 0.41839736700057983,
"learning_rate": 0.00019218749999999998,
"loss": 0.3601,
"mean_token_accuracy": 0.9995098114013672,
"num_tokens": 49152.0,
"step": 24
},
{
"entropy": 2.074858069419861,
"epoch": 0.8125,
"grad_norm": 0.44888272881507874,
"learning_rate": 0.00018281249999999998,
"loss": 0.4777,
"mean_token_accuracy": 0.9997549057006836,
"num_tokens": 53248.0,
"step": 26
},
{
"entropy": 1.6095194816589355,
"epoch": 0.875,
"grad_norm": 0.5080280900001526,
"learning_rate": 0.00017343749999999998,
"loss": 0.3859,
"mean_token_accuracy": 0.9987744987010956,
"num_tokens": 57344.0,
"step": 28
},
{
"entropy": 1.7032344341278076,
"epoch": 0.9375,
"grad_norm": 0.3624984622001648,
"learning_rate": 0.00016406249999999998,
"loss": 0.3576,
"mean_token_accuracy": 1.0,
"num_tokens": 61440.0,
"step": 30
},
{
"entropy": 1.8873920440673828,
"epoch": 1.0,
"grad_norm": 0.618506133556366,
"learning_rate": 0.00015468749999999999,
"loss": 0.446,
"mean_token_accuracy": 0.9997549057006836,
"num_tokens": 64256.0,
"step": 32
},
{
"entropy": 1.4586840271949768,
"epoch": 1.0625,
"grad_norm": 0.3723963797092438,
"learning_rate": 0.0001453125,
"loss": 0.2824,
"mean_token_accuracy": 1.0,
"num_tokens": 68352.0,
"step": 34
},
{
"entropy": 1.2832568883895874,
"epoch": 1.125,
"grad_norm": 0.3108985424041748,
"learning_rate": 0.0001359375,
"loss": 0.2191,
"mean_token_accuracy": 1.0,
"num_tokens": 72448.0,
"step": 36
},
{
"entropy": 1.523368000984192,
"epoch": 1.1875,
"grad_norm": 0.3509906232357025,
"learning_rate": 0.0001265625,
"loss": 0.3042,
"mean_token_accuracy": 0.9995098114013672,
"num_tokens": 76544.0,
"step": 38
},
{
"entropy": 1.2994396686553955,
"epoch": 1.25,
"grad_norm": 0.3014850616455078,
"learning_rate": 0.0001171875,
"loss": 0.2456,
"mean_token_accuracy": 1.0,
"num_tokens": 80640.0,
"step": 40
},
{
"entropy": 1.3811439871788025,
"epoch": 1.3125,
"grad_norm": 0.32755109667778015,
"learning_rate": 0.00010781249999999998,
"loss": 0.2521,
"mean_token_accuracy": 1.0,
"num_tokens": 84736.0,
"step": 42
},
{
"entropy": 1.1994215250015259,
"epoch": 1.375,
"grad_norm": 0.24541084468364716,
"learning_rate": 9.843749999999999e-05,
"loss": 0.2118,
"mean_token_accuracy": 1.0,
"num_tokens": 88832.0,
"step": 44
},
{
"entropy": 1.0519097447395325,
"epoch": 1.4375,
"grad_norm": 0.2063349187374115,
"learning_rate": 8.906249999999999e-05,
"loss": 0.1943,
"mean_token_accuracy": 1.0,
"num_tokens": 92928.0,
"step": 46
},
{
"entropy": 1.1152112483978271,
"epoch": 1.5,
"grad_norm": 0.31837204098701477,
"learning_rate": 7.968749999999999e-05,
"loss": 0.2132,
"mean_token_accuracy": 1.0,
"num_tokens": 97024.0,
"step": 48
},
{
"entropy": 1.2649919390678406,
"epoch": 1.5625,
"grad_norm": 0.289153516292572,
"learning_rate": 7.03125e-05,
"loss": 0.2158,
"mean_token_accuracy": 1.0,
"num_tokens": 101120.0,
"step": 50
},
{
"entropy": 0.9955946207046509,
"epoch": 1.625,
"grad_norm": 0.2607753276824951,
"learning_rate": 6.09375e-05,
"loss": 0.1913,
"mean_token_accuracy": 0.9997549057006836,
"num_tokens": 105216.0,
"step": 52
},
{
"entropy": 1.3506205081939697,
"epoch": 1.6875,
"grad_norm": 0.2850724458694458,
"learning_rate": 5.156249999999999e-05,
"loss": 0.2234,
"mean_token_accuracy": 1.0,
"num_tokens": 109312.0,
"step": 54
},
{
"entropy": 1.3460099697113037,
"epoch": 1.75,
"grad_norm": 0.23587484657764435,
"learning_rate": 4.2187499999999995e-05,
"loss": 0.2544,
"mean_token_accuracy": 1.0,
"num_tokens": 113408.0,
"step": 56
},
{
"entropy": 1.4365423321723938,
"epoch": 1.8125,
"grad_norm": 0.3239842653274536,
"learning_rate": 3.28125e-05,
"loss": 0.2958,
"mean_token_accuracy": 1.0,
"num_tokens": 117504.0,
"step": 58
},
{
"entropy": 1.567048728466034,
"epoch": 1.875,
"grad_norm": 0.34480002522468567,
"learning_rate": 2.3437499999999997e-05,
"loss": 0.3122,
"mean_token_accuracy": 1.0,
"num_tokens": 121600.0,
"step": 60
},
{
"entropy": 1.344693124294281,
"epoch": 1.9375,
"grad_norm": 0.25280237197875977,
"learning_rate": 1.40625e-05,
"loss": 0.2472,
"mean_token_accuracy": 1.0,
"num_tokens": 125696.0,
"step": 62
},
{
"entropy": 1.2893942594528198,
"epoch": 2.0,
"grad_norm": 0.5825140476226807,
"learning_rate": 4.6875e-06,
"loss": 0.2222,
"mean_token_accuracy": 1.0,
"num_tokens": 128512.0,
"step": 64
},
{
"epoch": 2.0,
"step": 64,
"total_flos": 59560143504384.0,
"train_loss": 0.32000101869925857,
"train_runtime": 720.302,
"train_samples_per_second": 0.711,
"train_steps_per_second": 0.089
}
]