| [ | |
| { | |
| "config": { | |
| "model_name": "ChatGPT-4o-latest (2024-09-03)", | |
| "organization": "OpenAI", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "2023/10" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 93.51557945652831, | |
| "Standard Deviation": 3.1900396436407785, | |
| "Rank": 4 | |
| }, | |
| "Geometry": { | |
| "Average Score": 81.8536937387725, | |
| "Standard Deviation": null, | |
| "Rank": 5 | |
| }, | |
| "Algebra": { | |
| "Average Score": 89.3642910524324, | |
| "Standard Deviation": null, | |
| "Rank": 3 | |
| }, | |
| "Probability": { | |
| "Average Score": 86.55761073510537, | |
| "Standard Deviation": null, | |
| "Rank": 4 | |
| }, | |
| "Logical": { | |
| "Average Score": 97.39734315785844, | |
| "Standard Deviation": null, | |
| "Rank": 2 | |
| }, | |
| "Social": { | |
| "Average Score": 91.03727530739368, | |
| "Standard Deviation": null, | |
| "Rank": 7 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 100.0, | |
| "Standard Deviation": null, | |
| "Rank": 1 | |
| }, | |
| "CPP": { | |
| "Average Score": 100.0, | |
| "Standard Deviation": null, | |
| "Rank": 1 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "gpt-4o-2024-08-06", | |
| "organization": "OpenAI", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "2023/10" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 79.7806321863411, | |
| "Standard Deviation": 0.8302330946013555, | |
| "Rank": 14 | |
| }, | |
| "Geometry": { | |
| "Average Score": 86.29041459755453, | |
| "Standard Deviation": null, | |
| "Rank": 2 | |
| }, | |
| "Algebra": { | |
| "Average Score": 88.53373721863113, | |
| "Standard Deviation": null, | |
| "Rank": 4 | |
| }, | |
| "Probability": { | |
| "Average Score": 78.694360721361, | |
| "Standard Deviation": null, | |
| "Rank": 7 | |
| }, | |
| "Logical": { | |
| "Average Score": 78.3116623496895, | |
| "Standard Deviation": null, | |
| "Rank": 12 | |
| }, | |
| "Social": { | |
| "Average Score": 79.90944696263446, | |
| "Standard Deviation": null, | |
| "Rank": 11 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 86.96011263543132, | |
| "Standard Deviation": null, | |
| "Rank": 7 | |
| }, | |
| "CPP": { | |
| "Average Score": 92.43090226400756, | |
| "Standard Deviation": null, | |
| "Rank": 2 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "gpt-4o-2024-05-13", | |
| "organization": "OpenAI", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "2023/10" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 86.40675398236253, | |
| "Standard Deviation": 6.473604235710212, | |
| "Rank": 9 | |
| }, | |
| "Geometry": { | |
| "Average Score": 82.42032988843268, | |
| "Standard Deviation": null, | |
| "Rank": 4 | |
| }, | |
| "Algebra": { | |
| "Average Score": 83.51580675782952, | |
| "Standard Deviation": null, | |
| "Rank": 9 | |
| }, | |
| "Probability": { | |
| "Average Score": 81.88434691830915, | |
| "Standard Deviation": null, | |
| "Rank": 5 | |
| }, | |
| "Logical": { | |
| "Average Score": 87.92744931984977, | |
| "Standard Deviation": null, | |
| "Rank": 9 | |
| }, | |
| "Social": { | |
| "Average Score": 76.12369632852445, | |
| "Standard Deviation": null, | |
| "Rank": 15 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 90.93459148149344, | |
| "Standard Deviation": null, | |
| "Rank": 4 | |
| }, | |
| "CPP": { | |
| "Average Score": 79.1592634699295, | |
| "Standard Deviation": null, | |
| "Rank": 6 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "gpt-4-turbo-2024-04-09", | |
| "organization": "OpenAI", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "2023/12" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 87.17581147282237, | |
| "Standard Deviation": 8.716963621850567, | |
| "Rank": 8 | |
| }, | |
| "Geometry": { | |
| "Average Score": 78.76635545274637, | |
| "Standard Deviation": null, | |
| "Rank": 7 | |
| }, | |
| "Algebra": { | |
| "Average Score": 79.96323615621023, | |
| "Standard Deviation": null, | |
| "Rank": 11 | |
| }, | |
| "Probability": { | |
| "Average Score": 77.65333799733705, | |
| "Standard Deviation": null, | |
| "Rank": 9 | |
| }, | |
| "Logical": { | |
| "Average Score": 89.33307138659873, | |
| "Standard Deviation": null, | |
| "Rank": 8 | |
| }, | |
| "Social": { | |
| "Average Score": 76.86597570996584, | |
| "Standard Deviation": null, | |
| "Rank": 14 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 84.02855687506661, | |
| "Standard Deviation": null, | |
| "Rank": 9 | |
| }, | |
| "CPP": { | |
| "Average Score": 70.73143363230263, | |
| "Standard Deviation": null, | |
| "Rank": 11 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "gemini-1.5-pro-001", | |
| "organization": "Google", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "2023/11" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 80.38345723548734, | |
| "Standard Deviation": 2.4635699815143584, | |
| "Rank": 13 | |
| }, | |
| "Geometry": { | |
| "Average Score": 84.30455076458965, | |
| "Standard Deviation": null, | |
| "Rank": 3 | |
| }, | |
| "Algebra": { | |
| "Average Score": 85.9212061409364, | |
| "Standard Deviation": null, | |
| "Rank": 6 | |
| }, | |
| "Probability": { | |
| "Average Score": 73.11806712394745, | |
| "Standard Deviation": null, | |
| "Rank": 13 | |
| }, | |
| "Logical": { | |
| "Average Score": 78.27369746632996, | |
| "Standard Deviation": null, | |
| "Rank": 12 | |
| }, | |
| "Social": { | |
| "Average Score": 79.57606824531047, | |
| "Standard Deviation": null, | |
| "Rank": 13 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "qwen2-72b-instruct", | |
| "organization": "Alibaba", | |
| "license": "Qianwen LICENSE", | |
| "knowledge_cutoff": "2024/09" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 74.44059692248071, | |
| "Standard Deviation": 2.3957041566666697, | |
| "Rank": 16 | |
| }, | |
| "Geometry": { | |
| "Average Score": 72.58490369919883, | |
| "Standard Deviation": null, | |
| "Rank": 11 | |
| }, | |
| "Algebra": { | |
| "Average Score": 88.53359632761772, | |
| "Standard Deviation": null, | |
| "Rank": 4 | |
| }, | |
| "Probability": { | |
| "Average Score": 80.19789976985243, | |
| "Standard Deviation": null, | |
| "Rank": 6 | |
| }, | |
| "Logical": { | |
| "Average Score": 72.76843081200641, | |
| "Standard Deviation": null, | |
| "Rank": 17 | |
| }, | |
| "Social": { | |
| "Average Score": 57.256064868444426, | |
| "Standard Deviation": null, | |
| "Rank": 19 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 75.47190401351077, | |
| "Standard Deviation": null, | |
| "Rank": 12 | |
| }, | |
| "CPP": { | |
| "Average Score": 73.54037778797029, | |
| "Standard Deviation": null, | |
| "Rank": 7 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "gpt-4o-mini-2024-07-18", | |
| "organization": "OpenAI", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "2023/10" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 82.82456893277315, | |
| "Standard Deviation": 7.714840109805867, | |
| "Rank": 12 | |
| }, | |
| "Geometry": { | |
| "Average Score": 78.89323869622943, | |
| "Standard Deviation": null, | |
| "Rank": 6 | |
| }, | |
| "Algebra": { | |
| "Average Score": 84.8722603687823, | |
| "Standard Deviation": null, | |
| "Rank": 8 | |
| }, | |
| "Probability": { | |
| "Average Score": 78.6942843346463, | |
| "Standard Deviation": null, | |
| "Rank": 7 | |
| }, | |
| "Logical": { | |
| "Average Score": 85.68921109829361, | |
| "Standard Deviation": null, | |
| "Rank": 10 | |
| }, | |
| "Social": { | |
| "Average Score": 81.79892848722542, | |
| "Standard Deviation": null, | |
| "Rank": 10 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 81.46805623180109, | |
| "Standard Deviation": null, | |
| "Rank": 10 | |
| }, | |
| "CPP": { | |
| "Average Score": 88.3877070580296, | |
| "Standard Deviation": null, | |
| "Rank": 3 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "claude-3.5-sonnet", | |
| "organization": "Anthropic", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "2024/04" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 88.43557924843628, | |
| "Standard Deviation": 5.680338106806327, | |
| "Rank": 7 | |
| }, | |
| "Geometry": { | |
| "Average Score": 76.26169400931595, | |
| "Standard Deviation": null, | |
| "Rank": 10 | |
| }, | |
| "Algebra": { | |
| "Average Score": 77.15040433072186, | |
| "Standard Deviation": null, | |
| "Rank": 13 | |
| }, | |
| "Probability": { | |
| "Average Score": 73.9942759783754, | |
| "Standard Deviation": null, | |
| "Rank": 11 | |
| }, | |
| "Logical": { | |
| "Average Score": 89.70827617930533, | |
| "Standard Deviation": null, | |
| "Rank": 7 | |
| }, | |
| "Social": { | |
| "Average Score": 97.3810636467068, | |
| "Standard Deviation": null, | |
| "Rank": 3 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 94.92819763202698, | |
| "Standard Deviation": null, | |
| "Rank": 3 | |
| }, | |
| "CPP": { | |
| "Average Score": 82.37734076815008, | |
| "Standard Deviation": null, | |
| "Rank": 5 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "o1-mini", | |
| "organization": "OpenAI", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "2023/10" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 96.12399889226096, | |
| "Standard Deviation": 0.5674965705992511, | |
| "Rank": 2 | |
| }, | |
| "Geometry": { | |
| "Average Score": 100.0, | |
| "Standard Deviation": null, | |
| "Rank": 1 | |
| }, | |
| "Algebra": { | |
| "Average Score": 100.0, | |
| "Standard Deviation": null, | |
| "Rank": 1 | |
| }, | |
| "Probability": { | |
| "Average Score": 100.0, | |
| "Standard Deviation": null, | |
| "Rank": 1 | |
| }, | |
| "Logical": { | |
| "Average Score": 96.52089445393929, | |
| "Standard Deviation": null, | |
| "Rank": 3 | |
| }, | |
| "Social": { | |
| "Average Score": 95.00695256918654, | |
| "Standard Deviation": null, | |
| "Rank": 5 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "o1-preview", | |
| "organization": "OpenAI", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "2023/10" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 91.08240629161766, | |
| "Standard Deviation": 4.83378135710071, | |
| "Rank": 5 | |
| }, | |
| "Geometry": { | |
| "Average Score": "N/A", | |
| "Standard Deviation": "N/A", | |
| "Rank": "N/A" | |
| }, | |
| "Algebra": { | |
| "Average Score": 98.1870991822192, | |
| "Standard Deviation": null, | |
| "Rank": 2 | |
| }, | |
| "Probability": { | |
| "Average Score": 94.12657646584134, | |
| "Standard Deviation": null, | |
| "Rank": 2 | |
| }, | |
| "Logical": { | |
| "Average Score": 100.0, | |
| "Standard Deviation": null, | |
| "Rank": 1 | |
| }, | |
| "Social": { | |
| "Average Score": 96.56802743955569, | |
| "Standard Deviation": null, | |
| "Rank": 4 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "gemini-1.5-flash-001", | |
| "organization": "Google", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "2023/11" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 66.25275609135964, | |
| "Standard Deviation": 2.5314573702881438, | |
| "Rank": 20 | |
| }, | |
| "Geometry": { | |
| "Average Score": 66.8010242138006, | |
| "Standard Deviation": null, | |
| "Rank": 13 | |
| }, | |
| "Algebra": { | |
| "Average Score": 78.24639082497596, | |
| "Standard Deviation": null, | |
| "Rank": 12 | |
| }, | |
| "Probability": { | |
| "Average Score": 67.84602916736804, | |
| "Standard Deviation": null, | |
| "Rank": 15 | |
| }, | |
| "Logical": { | |
| "Average Score": 72.76845749138818, | |
| "Standard Deviation": null, | |
| "Rank": 17 | |
| }, | |
| "Social": { | |
| "Average Score": 68.57728479711058, | |
| "Standard Deviation": null, | |
| "Rank": 16 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 75.47188329078935, | |
| "Standard Deviation": null, | |
| "Rank": 12 | |
| }, | |
| "CPP": { | |
| "Average Score": 72.1127762005651, | |
| "Standard Deviation": null, | |
| "Rank": 10 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "gpt4-1106", | |
| "organization": "OpenAI", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "2024/04" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 85.660054434658, | |
| "Standard Deviation": 7.392502344300497, | |
| "Rank": 10 | |
| }, | |
| "Geometry": { | |
| "Average Score": 63.36396165140893, | |
| "Standard Deviation": null, | |
| "Rank": 15 | |
| }, | |
| "Algebra": { | |
| "Average Score": 74.67191687355754, | |
| "Standard Deviation": null, | |
| "Rank": 15 | |
| }, | |
| "Probability": { | |
| "Average Score": 71.35141952665965, | |
| "Standard Deviation": null, | |
| "Rank": 14 | |
| }, | |
| "Logical": { | |
| "Average Score": 76.34506017196868, | |
| "Standard Deviation": null, | |
| "Rank": 15 | |
| }, | |
| "Social": { | |
| "Average Score": 46.00126575332808, | |
| "Standard Deviation": null, | |
| "Rank": 25 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 78.70156756289569, | |
| "Standard Deviation": null, | |
| "Rank": 11 | |
| }, | |
| "CPP": { | |
| "Average Score": 69.11824072252848, | |
| "Standard Deviation": null, | |
| "Rank": 12 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "gemma-2-27b-it", | |
| "organization": "Google", | |
| "license": "Gemma License", | |
| "knowledge_cutoff": "2024/06" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 70.82622192650408, | |
| "Standard Deviation": 0.18962869075029884, | |
| "Rank": 18 | |
| }, | |
| "Geometry": { | |
| "Average Score": 58.25724467150374, | |
| "Standard Deviation": null, | |
| "Rank": 16 | |
| }, | |
| "Algebra": { | |
| "Average Score": 73.71614711121721, | |
| "Standard Deviation": null, | |
| "Rank": 16 | |
| }, | |
| "Probability": { | |
| "Average Score": 66.08200742339983, | |
| "Standard Deviation": null, | |
| "Rank": 17 | |
| }, | |
| "Logical": { | |
| "Average Score": 72.76841354275011, | |
| "Standard Deviation": null, | |
| "Rank": 17 | |
| }, | |
| "Social": { | |
| "Average Score": 53.736358144621576, | |
| "Standard Deviation": null, | |
| "Rank": 21 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 68.1178055540124, | |
| "Standard Deviation": null, | |
| "Rank": 17 | |
| }, | |
| "CPP": { | |
| "Average Score": 63.28920072143611, | |
| "Standard Deviation": null, | |
| "Rank": 14 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "claude-3-opus", | |
| "organization": "Anthropic", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "2023/08" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 82.28903171580336, | |
| "Standard Deviation": 10.093273304495547, | |
| "Rank": 11 | |
| }, | |
| "Geometry": { | |
| "Average Score": 57.98602891013921, | |
| "Standard Deviation": null, | |
| "Rank": 17 | |
| }, | |
| "Algebra": { | |
| "Average Score": 73.54334730242743, | |
| "Standard Deviation": null, | |
| "Rank": 18 | |
| }, | |
| "Probability": { | |
| "Average Score": 67.8341594991468, | |
| "Standard Deviation": null, | |
| "Rank": 15 | |
| }, | |
| "Logical": { | |
| "Average Score": 78.31155849680502, | |
| "Standard Deviation": null, | |
| "Rank": 12 | |
| }, | |
| "Social": { | |
| "Average Score": 90.45833112761075, | |
| "Standard Deviation": null, | |
| "Rank": 8 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 85.97349470177741, | |
| "Standard Deviation": null, | |
| "Rank": 8 | |
| }, | |
| "CPP": { | |
| "Average Score": 73.5404403567132, | |
| "Standard Deviation": null, | |
| "Rank": 8 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "gemma-2-9b-it-simpo", | |
| "organization": "Google", | |
| "license": "Gemma License", | |
| "knowledge_cutoff": "2024/07" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": "N/A", | |
| "Standard Deviation": "N/A", | |
| "Rank": "N/A" | |
| }, | |
| "Geometry": { | |
| "Average Score": 52.80896798216458, | |
| "Standard Deviation": null, | |
| "Rank": 19 | |
| }, | |
| "Algebra": { | |
| "Average Score": 69.60260038105677, | |
| "Standard Deviation": null, | |
| "Rank": 19 | |
| }, | |
| "Probability": { | |
| "Average Score": 59.52630271491633, | |
| "Standard Deviation": null, | |
| "Rank": 21 | |
| }, | |
| "Logical": { | |
| "Average Score": 63.57920031465781, | |
| "Standard Deviation": null, | |
| "Rank": 23 | |
| }, | |
| "Social": { | |
| "Average Score": 79.90950201631269, | |
| "Standard Deviation": null, | |
| "Rank": 11 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 90.36508196626548, | |
| "Standard Deviation": null, | |
| "Rank": 5 | |
| }, | |
| "CPP": { | |
| "Average Score": 73.43757596214863, | |
| "Standard Deviation": null, | |
| "Rank": 9 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "qwen1.5-72b-chat", | |
| "organization": "Alibaba", | |
| "license": "Qianwen LICENSE", | |
| "knowledge_cutoff": "2024/03" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 65.26710370586439, | |
| "Standard Deviation": 9.198700753743012, | |
| "Rank": 19 | |
| }, | |
| "Geometry": { | |
| "Average Score": 48.52417714351894, | |
| "Standard Deviation": null, | |
| "Rank": 24 | |
| }, | |
| "Algebra": { | |
| "Average Score": 68.55765479604507, | |
| "Standard Deviation": null, | |
| "Rank": 20 | |
| }, | |
| "Probability": { | |
| "Average Score": 49.52382148131357, | |
| "Standard Deviation": null, | |
| "Rank": 26 | |
| }, | |
| "Logical": { | |
| "Average Score": 37.33563924001827, | |
| "Standard Deviation": null, | |
| "Rank": 35 | |
| }, | |
| "Social": { | |
| "Average Score": 46.00141195402727, | |
| "Standard Deviation": null, | |
| "Rank": 25 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 52.625823960166215, | |
| "Standard Deviation": null, | |
| "Rank": 23 | |
| }, | |
| "CPP": { | |
| "Average Score": 48.69302376665551, | |
| "Standard Deviation": null, | |
| "Rank": 20 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "qwen1.5-32b-chat", | |
| "organization": "Alibaba", | |
| "license": "Qianwen LICENSE", | |
| "knowledge_cutoff": "2024/03" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 46.74335731441104, | |
| "Standard Deviation": 4.096227849530709, | |
| "Rank": 28 | |
| }, | |
| "Geometry": { | |
| "Average Score": 44.96670224519297, | |
| "Standard Deviation": null, | |
| "Rank": 26 | |
| }, | |
| "Algebra": { | |
| "Average Score": 63.19715848628476, | |
| "Standard Deviation": null, | |
| "Rank": 23 | |
| }, | |
| "Probability": { | |
| "Average Score": 48.59873650270336, | |
| "Standard Deviation": null, | |
| "Rank": 27 | |
| }, | |
| "Logical": { | |
| "Average Score": 42.028753105249216, | |
| "Standard Deviation": null, | |
| "Rank": 33 | |
| }, | |
| "Social": { | |
| "Average Score": 43.183938768454986, | |
| "Standard Deviation": null, | |
| "Rank": 28 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 47.84488021045937, | |
| "Standard Deviation": null, | |
| "Rank": 26 | |
| }, | |
| "CPP": { | |
| "Average Score": 45.14284028264288, | |
| "Standard Deviation": null, | |
| "Rank": 24 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "google-gemma-2-9b-it", | |
| "organization": "Google", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "2024/06" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 60.71065949101693, | |
| "Standard Deviation": 0.12283018509137462, | |
| "Rank": 23 | |
| }, | |
| "Geometry": { | |
| "Average Score": 52.49270527783856, | |
| "Standard Deviation": null, | |
| "Rank": 20 | |
| }, | |
| "Algebra": { | |
| "Average Score": 63.446032975128176, | |
| "Standard Deviation": null, | |
| "Rank": 21 | |
| }, | |
| "Probability": { | |
| "Average Score": 63.95287475488081, | |
| "Standard Deviation": null, | |
| "Rank": 20 | |
| }, | |
| "Logical": { | |
| "Average Score": 70.18644584116615, | |
| "Standard Deviation": null, | |
| "Rank": 20 | |
| }, | |
| "Social": { | |
| "Average Score": 86.45401862572464, | |
| "Standard Deviation": null, | |
| "Rank": 9 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 57.56342217758078, | |
| "Standard Deviation": null, | |
| "Rank": 20 | |
| }, | |
| "CPP": { | |
| "Average Score": 54.03167523687635, | |
| "Standard Deviation": null, | |
| "Rank": 17 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "yi-1.5-34b-chat", | |
| "organization": "01 AI", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "2024/05" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 71.53811567931923, | |
| "Standard Deviation": 0.4838075734512934, | |
| "Rank": 17 | |
| }, | |
| "Geometry": { | |
| "Average Score": 53.98343904373819, | |
| "Standard Deviation": null, | |
| "Rank": 18 | |
| }, | |
| "Algebra": { | |
| "Average Score": 63.317896075817885, | |
| "Standard Deviation": null, | |
| "Rank": 22 | |
| }, | |
| "Probability": { | |
| "Average Score": 64.73492918491159, | |
| "Standard Deviation": null, | |
| "Rank": 19 | |
| }, | |
| "Logical": { | |
| "Average Score": 66.39420245024361, | |
| "Standard Deviation": null, | |
| "Rank": 21 | |
| }, | |
| "Social": { | |
| "Average Score": 53.73650350964252, | |
| "Standard Deviation": null, | |
| "Rank": 21 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 56.722360677914686, | |
| "Standard Deviation": null, | |
| "Rank": 21 | |
| }, | |
| "CPP": { | |
| "Average Score": 52.148798061768964, | |
| "Standard Deviation": null, | |
| "Rank": 18 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "meta-llama-3.1-70b-instruct", | |
| "organization": "Meta", | |
| "license": "Llama 3.1 Community", | |
| "knowledge_cutoff": "2023/12" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 74.01502078434305, | |
| "Standard Deviation": 0.24116839515156926, | |
| "Rank": 15 | |
| }, | |
| "Geometry": { | |
| "Average Score": 66.80097850274383, | |
| "Standard Deviation": null, | |
| "Rank": 13 | |
| }, | |
| "Algebra": { | |
| "Average Score": 74.7667367179752, | |
| "Standard Deviation": null, | |
| "Rank": 14 | |
| }, | |
| "Probability": { | |
| "Average Score": 66.0819470113051, | |
| "Standard Deviation": null, | |
| "Rank": 17 | |
| }, | |
| "Logical": { | |
| "Average Score": 73.68238947162197, | |
| "Standard Deviation": null, | |
| "Rank": 16 | |
| }, | |
| "Social": { | |
| "Average Score": 68.577541438994, | |
| "Standard Deviation": null, | |
| "Rank": 16 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 70.4019514562452, | |
| "Standard Deviation": null, | |
| "Rank": 15 | |
| }, | |
| "CPP": { | |
| "Average Score": 84.36815192532764, | |
| "Standard Deviation": null, | |
| "Rank": 4 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "meta-llama-3.1-8b-instruct", | |
| "organization": "Meta", | |
| "license": "Llama 3.1 Community", | |
| "knowledge_cutoff": "2023/12" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 55.268736955905695, | |
| "Standard Deviation": 7.060517225126177, | |
| "Rank": 26 | |
| }, | |
| "Geometry": { | |
| "Average Score": 42.44262022417502, | |
| "Standard Deviation": null, | |
| "Rank": 28 | |
| }, | |
| "Algebra": { | |
| "Average Score": 60.632347391080486, | |
| "Standard Deviation": null, | |
| "Rank": 25 | |
| }, | |
| "Probability": { | |
| "Average Score": 52.372362507453694, | |
| "Standard Deviation": null, | |
| "Rank": 24 | |
| }, | |
| "Logical": { | |
| "Average Score": 54.17571378414435, | |
| "Standard Deviation": null, | |
| "Rank": 28 | |
| }, | |
| "Social": { | |
| "Average Score": 39.07966801070027, | |
| "Standard Deviation": null, | |
| "Rank": 31 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 45.0170262190059, | |
| "Standard Deviation": null, | |
| "Rank": 29 | |
| }, | |
| "CPP": { | |
| "Average Score": 44.41846841004584, | |
| "Standard Deviation": null, | |
| "Rank": 26 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "gpt3.5-turbo-0125", | |
| "organization": "OpenAI", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "2021/09" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 29.17379433602279, | |
| "Standard Deviation": 2.6813415847393878, | |
| "Rank": 44 | |
| }, | |
| "Geometry": { | |
| "Average Score": 51.47279337094397, | |
| "Standard Deviation": null, | |
| "Rank": 21 | |
| }, | |
| "Algebra": { | |
| "Average Score": 59.03601450977881, | |
| "Standard Deviation": null, | |
| "Rank": 26 | |
| }, | |
| "Probability": { | |
| "Average Score": 46.71541304474977, | |
| "Standard Deviation": null, | |
| "Rank": 28 | |
| }, | |
| "Logical": { | |
| "Average Score": 20.82026871015984, | |
| "Standard Deviation": null, | |
| "Rank": 46 | |
| }, | |
| "Social": { | |
| "Average Score": 28.31096293069848, | |
| "Standard Deviation": null, | |
| "Rank": 41 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 42.899594571904004, | |
| "Standard Deviation": null, | |
| "Rank": 31 | |
| }, | |
| "CPP": { | |
| "Average Score": 40.46958736582551, | |
| "Standard Deviation": null, | |
| "Rank": 29 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "llama-3-70b-instruct", | |
| "organization": "Meta", | |
| "license": "Llama 3 Community", | |
| "knowledge_cutoff": "2023/12" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 65.90407336557487, | |
| "Standard Deviation": 66.63940143516267, | |
| "Rank": 24 | |
| }, | |
| "Geometry": { | |
| "Average Score": 46.40555349958932, | |
| "Standard Deviation": null, | |
| "Rank": 25 | |
| }, | |
| "Algebra": { | |
| "Average Score": 60.86276607976933, | |
| "Standard Deviation": null, | |
| "Rank": 24 | |
| }, | |
| "Probability": { | |
| "Average Score": 55.0233135868055, | |
| "Standard Deviation": null, | |
| "Rank": 22 | |
| }, | |
| "Logical": { | |
| "Average Score": 83.99546392889077, | |
| "Standard Deviation": null, | |
| "Rank": 11 | |
| }, | |
| "Social": { | |
| "Average Score": 47.90189246663785, | |
| "Standard Deviation": null, | |
| "Rank": 23 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 70.40198909396582, | |
| "Standard Deviation": null, | |
| "Rank": 15 | |
| }, | |
| "CPP": { | |
| "Average Score": 65.32140697218945, | |
| "Standard Deviation": null, | |
| "Rank": 13 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "claude-3-sonnet", | |
| "organization": "Anthropic", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "2023/08" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 64.4278622266347, | |
| "Standard Deviation": 3.089828107392469, | |
| "Rank": 21 | |
| }, | |
| "Geometry": { | |
| "Average Score": 51.4677627365698, | |
| "Standard Deviation": null, | |
| "Rank": 21 | |
| }, | |
| "Algebra": { | |
| "Average Score": 57.157810499255426, | |
| "Standard Deviation": null, | |
| "Rank": 27 | |
| }, | |
| "Probability": { | |
| "Average Score": 54.68761427070592, | |
| "Standard Deviation": null, | |
| "Rank": 23 | |
| }, | |
| "Logical": { | |
| "Average Score": 65.8346271849297, | |
| "Standard Deviation": null, | |
| "Rank": 22 | |
| }, | |
| "Social": { | |
| "Average Score": 62.842721798877186, | |
| "Standard Deviation": null, | |
| "Rank": 18 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 66.1914400411681, | |
| "Standard Deviation": null, | |
| "Rank": 18 | |
| }, | |
| "CPP": { | |
| "Average Score": 61.33538592327427, | |
| "Standard Deviation": null, | |
| "Rank": 15 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "qwen1.5-14b-chat", | |
| "organization": "Alibaba", | |
| "license": "Qianwen LICENSE", | |
| "knowledge_cutoff": "2024/02" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 44.920016997055804, | |
| "Standard Deviation": 0.3041914765974254, | |
| "Rank": 30 | |
| }, | |
| "Geometry": { | |
| "Average Score": 36.40735570120079, | |
| "Standard Deviation": null, | |
| "Rank": 30 | |
| }, | |
| "Algebra": { | |
| "Average Score": 56.004717588310726, | |
| "Standard Deviation": null, | |
| "Rank": 28 | |
| }, | |
| "Probability": { | |
| "Average Score": 39.24866255465088, | |
| "Standard Deviation": null, | |
| "Rank": 33 | |
| }, | |
| "Logical": { | |
| "Average Score": 35.15462916949486, | |
| "Standard Deviation": null, | |
| "Rank": 38 | |
| }, | |
| "Social": { | |
| "Average Score": 35.236185321936766, | |
| "Standard Deviation": null, | |
| "Rank": 34 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 40.803706763362605, | |
| "Standard Deviation": null, | |
| "Rank": 34 | |
| }, | |
| "CPP": { | |
| "Average Score": 38.552779976347026, | |
| "Standard Deviation": null, | |
| "Rank": 31 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "claude-3-haiku", | |
| "organization": "Anthropic", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "2023/08" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 53.46814061793852, | |
| "Standard Deviation": 10.143567097006747, | |
| "Rank": 25 | |
| }, | |
| "Geometry": { | |
| "Average Score": 42.87542087805953, | |
| "Standard Deviation": null, | |
| "Rank": 27 | |
| }, | |
| "Algebra": { | |
| "Average Score": 53.706856083803686, | |
| "Standard Deviation": null, | |
| "Rank": 30 | |
| }, | |
| "Probability": { | |
| "Average Score": 49.80372052799326, | |
| "Standard Deviation": null, | |
| "Rank": 25 | |
| }, | |
| "Logical": { | |
| "Average Score": 62.585349577709394, | |
| "Standard Deviation": null, | |
| "Rank": 24 | |
| }, | |
| "Social": { | |
| "Average Score": 57.25601125762336, | |
| "Standard Deviation": null, | |
| "Rank": 19 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 60.48921113945562, | |
| "Standard Deviation": null, | |
| "Rank": 19 | |
| }, | |
| "CPP": { | |
| "Average Score": 56.40200048817984, | |
| "Standard Deviation": null, | |
| "Rank": 16 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "claude-2.1", | |
| "organization": "Anthropic", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "Unknown" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 39.855928282633364, | |
| "Standard Deviation": 8.396129652430814, | |
| "Rank": 35 | |
| }, | |
| "Geometry": { | |
| "Average Score": 51.1749207092159, | |
| "Standard Deviation": null, | |
| "Rank": 23 | |
| }, | |
| "Algebra": { | |
| "Average Score": 53.05386216145516, | |
| "Standard Deviation": null, | |
| "Rank": 31 | |
| }, | |
| "Probability": { | |
| "Average Score": 44.42150447611455, | |
| "Standard Deviation": null, | |
| "Rank": 30 | |
| }, | |
| "Logical": { | |
| "Average Score": 60.51381867118053, | |
| "Standard Deviation": null, | |
| "Rank": 25 | |
| }, | |
| "Social": { | |
| "Average Score": 38.492280755756035, | |
| "Standard Deviation": null, | |
| "Rank": 32 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 50.66182745698702, | |
| "Standard Deviation": null, | |
| "Rank": 24 | |
| }, | |
| "CPP": { | |
| "Average Score": 47.23672563994903, | |
| "Standard Deviation": null, | |
| "Rank": 21 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "mistral-8x7b-instruct-v0.1", | |
| "organization": "Mistral", | |
| "license": "Apache 2.0", | |
| "knowledge_cutoff": "2023/12" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 42.70451051343715, | |
| "Standard Deviation": 9.965602920103015, | |
| "Rank": 31 | |
| }, | |
| "Geometry": { | |
| "Average Score": 33.473933494899164, | |
| "Standard Deviation": null, | |
| "Rank": 34 | |
| }, | |
| "Algebra": { | |
| "Average Score": 48.99207852115047, | |
| "Standard Deviation": null, | |
| "Rank": 34 | |
| }, | |
| "Probability": { | |
| "Average Score": 44.46936520340586, | |
| "Standard Deviation": null, | |
| "Rank": 30 | |
| }, | |
| "Logical": { | |
| "Average Score": 42.656238987207246, | |
| "Standard Deviation": null, | |
| "Rank": 31 | |
| }, | |
| "Social": { | |
| "Average Score": 30.32900110312259, | |
| "Standard Deviation": null, | |
| "Rank": 40 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 47.047104057571026, | |
| "Standard Deviation": null, | |
| "Rank": 27 | |
| }, | |
| "CPP": { | |
| "Average Score": 44.533118241976666, | |
| "Standard Deviation": null, | |
| "Rank": 25 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "claude-2.0", | |
| "organization": "Anthropic", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "Unknown" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 33.53990717968659, | |
| "Standard Deviation": 7.640386327990536, | |
| "Rank": 41 | |
| }, | |
| "Geometry": { | |
| "Average Score": 38.40953902052666, | |
| "Standard Deviation": null, | |
| "Rank": 29 | |
| }, | |
| "Algebra": { | |
| "Average Score": 49.07235259762855, | |
| "Standard Deviation": null, | |
| "Rank": 33 | |
| }, | |
| "Probability": { | |
| "Average Score": 46.71546649299419, | |
| "Standard Deviation": null, | |
| "Rank": 28 | |
| }, | |
| "Logical": { | |
| "Average Score": 56.26908965013192, | |
| "Standard Deviation": null, | |
| "Rank": 27 | |
| }, | |
| "Social": { | |
| "Average Score": 47.84034165469707, | |
| "Standard Deviation": null, | |
| "Rank": 23 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 55.20362543510563, | |
| "Standard Deviation": null, | |
| "Rank": 22 | |
| }, | |
| "CPP": { | |
| "Average Score": 50.773143448036464, | |
| "Standard Deviation": null, | |
| "Rank": 19 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "starling-lm-7b-beta", | |
| "organization": "Nexusflow", | |
| "license": "Apache-2.0", | |
| "knowledge_cutoff": "2024/03" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 50.90398580969381, | |
| "Standard Deviation": 0.2839403187065694, | |
| "Rank": 27 | |
| }, | |
| "Geometry": { | |
| "Average Score": 34.653904247826965, | |
| "Standard Deviation": null, | |
| "Rank": 33 | |
| }, | |
| "Algebra": { | |
| "Average Score": 49.66265150940668, | |
| "Standard Deviation": null, | |
| "Rank": 32 | |
| }, | |
| "Probability": { | |
| "Average Score": 40.04695085773174, | |
| "Standard Deviation": null, | |
| "Rank": 32 | |
| }, | |
| "Logical": { | |
| "Average Score": 48.02284849364292, | |
| "Standard Deviation": null, | |
| "Rank": 29 | |
| }, | |
| "Social": { | |
| "Average Score": 42.82322308642107, | |
| "Standard Deviation": null, | |
| "Rank": 29 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 40.54467030566931, | |
| "Standard Deviation": null, | |
| "Rank": 35 | |
| }, | |
| "CPP": { | |
| "Average Score": 38.27587102395908, | |
| "Standard Deviation": null, | |
| "Rank": 32 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "gemini-1.0-pro-001", | |
| "organization": "Google", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "2023/04" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 37.91102687366529, | |
| "Standard Deviation": 15.15111885239772, | |
| "Rank": 38 | |
| }, | |
| "Geometry": { | |
| "Average Score": 35.480853719259684, | |
| "Standard Deviation": null, | |
| "Rank": 32 | |
| }, | |
| "Algebra": { | |
| "Average Score": 48.08542847805497, | |
| "Standard Deviation": null, | |
| "Rank": 35 | |
| }, | |
| "Probability": { | |
| "Average Score": 29.862669786973395, | |
| "Standard Deviation": null, | |
| "Rank": 42 | |
| }, | |
| "Logical": { | |
| "Average Score": 24.141794297157134, | |
| "Standard Deviation": null, | |
| "Rank": 43 | |
| }, | |
| "Social": { | |
| "Average Score": 15.062345665891504, | |
| "Standard Deviation": null, | |
| "Rank": 51 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 46.52522766257804, | |
| "Standard Deviation": null, | |
| "Rank": 28 | |
| }, | |
| "CPP": { | |
| "Average Score": 45.22204471452975, | |
| "Standard Deviation": null, | |
| "Rank": 23 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "openchat-3.5-0106", | |
| "organization": "OpenChat", | |
| "license": "Apache-2.0", | |
| "knowledge_cutoff": "2024/01" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 41.34314082389491, | |
| "Standard Deviation": 4.394481877390224, | |
| "Rank": 32 | |
| }, | |
| "Geometry": { | |
| "Average Score": 29.859015723426758, | |
| "Standard Deviation": null, | |
| "Rank": 36 | |
| }, | |
| "Algebra": { | |
| "Average Score": 45.79428201943078, | |
| "Standard Deviation": null, | |
| "Rank": 36 | |
| }, | |
| "Probability": { | |
| "Average Score": 38.766888608782956, | |
| "Standard Deviation": null, | |
| "Rank": 34 | |
| }, | |
| "Logical": { | |
| "Average Score": 42.1345774485532, | |
| "Standard Deviation": null, | |
| "Rank": 32 | |
| }, | |
| "Social": { | |
| "Average Score": 32.07155544930587, | |
| "Standard Deviation": null, | |
| "Rank": 39 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 35.28601797606463, | |
| "Standard Deviation": null, | |
| "Rank": 37 | |
| }, | |
| "CPP": { | |
| "Average Score": 33.70639271807677, | |
| "Standard Deviation": null, | |
| "Rank": 33 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "openchat-3.5", | |
| "organization": "OpenChat", | |
| "license": "Apache-2.0", | |
| "knowledge_cutoff": "2023/11" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 39.60454188051808, | |
| "Standard Deviation": 0.8232501722386516, | |
| "Rank": 36 | |
| }, | |
| "Geometry": { | |
| "Average Score": 30.77657388742533, | |
| "Standard Deviation": null, | |
| "Rank": 35 | |
| }, | |
| "Algebra": { | |
| "Average Score": 42.13028451761782, | |
| "Standard Deviation": null, | |
| "Rank": 38 | |
| }, | |
| "Probability": { | |
| "Average Score": 34.817635171077754, | |
| "Standard Deviation": null, | |
| "Rank": 37 | |
| }, | |
| "Logical": { | |
| "Average Score": 36.21944706732088, | |
| "Standard Deviation": null, | |
| "Rank": 36 | |
| }, | |
| "Social": { | |
| "Average Score": 37.59265084241427, | |
| "Standard Deviation": null, | |
| "Rank": 33 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 37.21911183748652, | |
| "Standard Deviation": null, | |
| "Rank": 36 | |
| }, | |
| "CPP": { | |
| "Average Score": 33.020911255646965, | |
| "Standard Deviation": null, | |
| "Rank": 34 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "command-r-(08-2024)", | |
| "organization": "Cohere", | |
| "license": "CC-BY-NC-4.0", | |
| "knowledge_cutoff": "2024/08" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 45.84310421663912, | |
| "Standard Deviation": 0.14535750785421472, | |
| "Rank": 29 | |
| }, | |
| "Geometry": { | |
| "Average Score": 36.33550343578038, | |
| "Standard Deviation": null, | |
| "Rank": 31 | |
| }, | |
| "Algebra": { | |
| "Average Score": 41.87079446639028, | |
| "Standard Deviation": null, | |
| "Rank": 39 | |
| }, | |
| "Probability": { | |
| "Average Score": 36.87662939858684, | |
| "Standard Deviation": null, | |
| "Rank": 36 | |
| }, | |
| "Logical": { | |
| "Average Score": 26.22482921268266, | |
| "Standard Deviation": null, | |
| "Rank": 41 | |
| }, | |
| "Social": { | |
| "Average Score": 35.11019761697373, | |
| "Standard Deviation": null, | |
| "Rank": 35 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 41.81772722027254, | |
| "Standard Deviation": null, | |
| "Rank": 33 | |
| }, | |
| "CPP": { | |
| "Average Score": 39.61492485677676, | |
| "Standard Deviation": null, | |
| "Rank": 30 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "gemma-1.1-7b-it", | |
| "organization": "Google", | |
| "license": "Gemma License", | |
| "knowledge_cutoff": "2024/02" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 35.873210924652795, | |
| "Standard Deviation": 6.462625645064649, | |
| "Rank": 37 | |
| }, | |
| "Geometry": { | |
| "Average Score": 25.79207201693066, | |
| "Standard Deviation": null, | |
| "Rank": 40 | |
| }, | |
| "Algebra": { | |
| "Average Score": 40.58046616460041, | |
| "Standard Deviation": null, | |
| "Rank": 40 | |
| }, | |
| "Probability": { | |
| "Average Score": 29.581773053230897, | |
| "Standard Deviation": null, | |
| "Rank": 43 | |
| }, | |
| "Logical": { | |
| "Average Score": 41.99821650962693, | |
| "Standard Deviation": null, | |
| "Rank": 33 | |
| }, | |
| "Social": { | |
| "Average Score": 24.39015213949678, | |
| "Standard Deviation": null, | |
| "Rank": 43 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 45.01706482033765, | |
| "Standard Deviation": null, | |
| "Rank": 29 | |
| }, | |
| "CPP": { | |
| "Average Score": 42.666504105798204, | |
| "Standard Deviation": null, | |
| "Rank": 27 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "llama3-8b-instruct", | |
| "organization": "Meta", | |
| "license": "Llama 3 Community", | |
| "knowledge_cutoff": "2023/03" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 39.00917270775336, | |
| "Standard Deviation": 3.999506140299149, | |
| "Rank": 39 | |
| }, | |
| "Geometry": { | |
| "Average Score": 29.224089668837465, | |
| "Standard Deviation": null, | |
| "Rank": 38 | |
| }, | |
| "Algebra": { | |
| "Average Score": 42.90961619082775, | |
| "Standard Deviation": null, | |
| "Rank": 37 | |
| }, | |
| "Probability": { | |
| "Average Score": 34.15721355738147, | |
| "Standard Deviation": null, | |
| "Rank": 38 | |
| }, | |
| "Logical": { | |
| "Average Score": 58.39773915370141, | |
| "Standard Deviation": null, | |
| "Rank": 26 | |
| }, | |
| "Social": { | |
| "Average Score": 40.88535401371015, | |
| "Standard Deviation": null, | |
| "Rank": 30 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 49.70839372661025, | |
| "Standard Deviation": null, | |
| "Rank": 25 | |
| }, | |
| "CPP": { | |
| "Average Score": 45.35392139264795, | |
| "Standard Deviation": null, | |
| "Rank": 22 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "gemma-2-2b-it", | |
| "organization": "Google", | |
| "license": "Gemma License", | |
| "knowledge_cutoff": "2024/07" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 57.45780847204313, | |
| "Standard Deviation": 16.310023687014333, | |
| "Rank": 22 | |
| }, | |
| "Geometry": { | |
| "Average Score": 29.820233374501843, | |
| "Standard Deviation": null, | |
| "Rank": 36 | |
| }, | |
| "Algebra": { | |
| "Average Score": 39.873024674507214, | |
| "Standard Deviation": null, | |
| "Rank": 41 | |
| }, | |
| "Probability": { | |
| "Average Score": 31.85692359301203, | |
| "Standard Deviation": null, | |
| "Rank": 40 | |
| }, | |
| "Logical": { | |
| "Average Score": 43.93437465788311, | |
| "Standard Deviation": null, | |
| "Rank": 30 | |
| }, | |
| "Social": { | |
| "Average Score": 44.689420554662476, | |
| "Standard Deviation": null, | |
| "Rank": 27 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 32.05704364512495, | |
| "Standard Deviation": null, | |
| "Rank": 40 | |
| }, | |
| "CPP": { | |
| "Average Score": 30.53406933106768, | |
| "Standard Deviation": null, | |
| "Rank": 36 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "starling-lm-7b-alpha", | |
| "organization": "Nexusflow", | |
| "license": "Apache-2.0", | |
| "knowledge_cutoff": "2023/11" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 40.625443347641045, | |
| "Standard Deviation": 3.0544259540377268, | |
| "Rank": 34 | |
| }, | |
| "Geometry": { | |
| "Average Score": 26.171147508308422, | |
| "Standard Deviation": null, | |
| "Rank": 39 | |
| }, | |
| "Algebra": { | |
| "Average Score": 39.149463007523856, | |
| "Standard Deviation": null, | |
| "Rank": 42 | |
| }, | |
| "Probability": { | |
| "Average Score": 32.36862021879827, | |
| "Standard Deviation": null, | |
| "Rank": 39 | |
| }, | |
| "Logical": { | |
| "Average Score": 34.17344938419256, | |
| "Standard Deviation": null, | |
| "Rank": 39 | |
| }, | |
| "Social": { | |
| "Average Score": 35.06966333212518, | |
| "Standard Deviation": null, | |
| "Rank": 35 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 32.15932739848045, | |
| "Standard Deviation": null, | |
| "Rank": 39 | |
| }, | |
| "CPP": { | |
| "Average Score": 30.07926487356878, | |
| "Standard Deviation": null, | |
| "Rank": 37 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "qwen1.5-4b-chat", | |
| "organization": "Alibaba", | |
| "license": "Qianwen LICENSE", | |
| "knowledge_cutoff": "2024/02" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 11.723779019126527, | |
| "Standard Deviation": 0.856230353584155, | |
| "Rank": 53 | |
| }, | |
| "Geometry": { | |
| "Average Score": 16.072772563608115, | |
| "Standard Deviation": null, | |
| "Rank": 45 | |
| }, | |
| "Algebra": { | |
| "Average Score": 32.22626131587612, | |
| "Standard Deviation": null, | |
| "Rank": 44 | |
| }, | |
| "Probability": { | |
| "Average Score": 13.98282712349133, | |
| "Standard Deviation": null, | |
| "Rank": 48 | |
| }, | |
| "Logical": { | |
| "Average Score": 13.993097991375581, | |
| "Standard Deviation": null, | |
| "Rank": 51 | |
| }, | |
| "Social": { | |
| "Average Score": 22.955898106386442, | |
| "Standard Deviation": null, | |
| "Rank": 45 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 13.907481529463642, | |
| "Standard Deviation": null, | |
| "Rank": 51 | |
| }, | |
| "CPP": { | |
| "Average Score": 13.21208067122554, | |
| "Standard Deviation": null, | |
| "Rank": 47 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "command-r-(04-2024)", | |
| "organization": "Cohere", | |
| "license": "CC-BY-NC-4.0", | |
| "knowledge_cutoff": "2024/04" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 43.08187135994592, | |
| "Standard Deviation": 0.7654553730614279, | |
| "Rank": 33 | |
| }, | |
| "Geometry": { | |
| "Average Score": 24.037084801508428, | |
| "Standard Deviation": null, | |
| "Rank": 41 | |
| }, | |
| "Algebra": { | |
| "Average Score": 32.37474440275246, | |
| "Standard Deviation": null, | |
| "Rank": 43 | |
| }, | |
| "Probability": { | |
| "Average Score": 31.014039425232298, | |
| "Standard Deviation": null, | |
| "Rank": 41 | |
| }, | |
| "Logical": { | |
| "Average Score": 35.49507014348235, | |
| "Standard Deviation": null, | |
| "Rank": 37 | |
| }, | |
| "Social": { | |
| "Average Score": 34.782695172510856, | |
| "Standard Deviation": null, | |
| "Rank": 37 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 42.46395478814961, | |
| "Standard Deviation": null, | |
| "Rank": 32 | |
| }, | |
| "CPP": { | |
| "Average Score": 41.346336503003236, | |
| "Standard Deviation": null, | |
| "Rank": 28 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "vicuna-33b", | |
| "organization": "LMSYS", | |
| "license": "Non-commercial", | |
| "knowledge_cutoff": "2023/08" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 30.8582386682731, | |
| "Standard Deviation": 2.3851186735858945, | |
| "Rank": 42 | |
| }, | |
| "Geometry": { | |
| "Average Score": 17.058968577112452, | |
| "Standard Deviation": null, | |
| "Rank": 44 | |
| }, | |
| "Algebra": { | |
| "Average Score": 25.22004544023738, | |
| "Standard Deviation": null, | |
| "Rank": 45 | |
| }, | |
| "Probability": { | |
| "Average Score": 21.097169680647767, | |
| "Standard Deviation": null, | |
| "Rank": 46 | |
| }, | |
| "Logical": { | |
| "Average Score": 23.212667585279515, | |
| "Standard Deviation": null, | |
| "Rank": 45 | |
| }, | |
| "Social": { | |
| "Average Score": 32.357116321848025, | |
| "Standard Deviation": null, | |
| "Rank": 38 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 29.376389899632898, | |
| "Standard Deviation": null, | |
| "Rank": 42 | |
| }, | |
| "CPP": { | |
| "Average Score": 28.01838653090379, | |
| "Standard Deviation": null, | |
| "Rank": 38 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "gemma-7b-it", | |
| "organization": "Google", | |
| "license": "Gemma License", | |
| "knowledge_cutoff": "2024/02" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 27.609692676933715, | |
| "Standard Deviation": 5.8350892031427435, | |
| "Rank": 45 | |
| }, | |
| "Geometry": { | |
| "Average Score": 20.127802528542947, | |
| "Standard Deviation": null, | |
| "Rank": 42 | |
| }, | |
| "Algebra": { | |
| "Average Score": 23.46400816161807, | |
| "Standard Deviation": null, | |
| "Rank": 47 | |
| }, | |
| "Probability": { | |
| "Average Score": 17.139514453170445, | |
| "Standard Deviation": null, | |
| "Rank": 47 | |
| }, | |
| "Logical": { | |
| "Average Score": 24.625290351028372, | |
| "Standard Deviation": null, | |
| "Rank": 42 | |
| }, | |
| "Social": { | |
| "Average Score": 26.715025606557614, | |
| "Standard Deviation": null, | |
| "Rank": 42 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 29.383105099269972, | |
| "Standard Deviation": null, | |
| "Rank": 41 | |
| }, | |
| "CPP": { | |
| "Average Score": 28.014658234926813, | |
| "Standard Deviation": null, | |
| "Rank": 39 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "mistral-7b-instruct-2", | |
| "organization": "Mistral", | |
| "license": "Apache 2.0", | |
| "knowledge_cutoff": "2023/12" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 32.583755237895794, | |
| "Standard Deviation": 1.6860156811686553, | |
| "Rank": 40 | |
| }, | |
| "Geometry": { | |
| "Average Score": 17.27716649229315, | |
| "Standard Deviation": null, | |
| "Rank": 43 | |
| }, | |
| "Algebra": { | |
| "Average Score": 23.58916877939791, | |
| "Standard Deviation": null, | |
| "Rank": 46 | |
| }, | |
| "Probability": { | |
| "Average Score": 25.1012270940144, | |
| "Standard Deviation": null, | |
| "Rank": 44 | |
| }, | |
| "Logical": { | |
| "Average Score": 29.07002036532878, | |
| "Standard Deviation": null, | |
| "Rank": 40 | |
| }, | |
| "Social": { | |
| "Average Score": 24.39006275978174, | |
| "Standard Deviation": null, | |
| "Rank": 43 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 32.76096708662236, | |
| "Standard Deviation": null, | |
| "Rank": 38 | |
| }, | |
| "CPP": { | |
| "Average Score": 31.382959631870822, | |
| "Standard Deviation": null, | |
| "Rank": 35 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "mistral-7b-instruct-1", | |
| "organization": "Mistral", | |
| "license": "Apache 2.0", | |
| "knowledge_cutoff": "2023/12" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 22.167930858422395, | |
| "Standard Deviation": 3.328543828571604, | |
| "Rank": 50 | |
| }, | |
| "Geometry": { | |
| "Average Score": 11.300762460776488, | |
| "Standard Deviation": null, | |
| "Rank": 49 | |
| }, | |
| "Algebra": { | |
| "Average Score": 21.016466430115493, | |
| "Standard Deviation": null, | |
| "Rank": 48 | |
| }, | |
| "Probability": { | |
| "Average Score": 24.506863192031716, | |
| "Standard Deviation": null, | |
| "Rank": 45 | |
| }, | |
| "Logical": { | |
| "Average Score": 17.0066100312336, | |
| "Standard Deviation": null, | |
| "Rank": 49 | |
| }, | |
| "Social": { | |
| "Average Score": 14.049392081101905, | |
| "Standard Deviation": null, | |
| "Rank": 52 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 20.796521445473058, | |
| "Standard Deviation": null, | |
| "Rank": 45 | |
| }, | |
| "CPP": { | |
| "Average Score": 18.929093202755805, | |
| "Standard Deviation": null, | |
| "Rank": 42 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "vicuna-13b", | |
| "organization": "LMSYS", | |
| "license": "Non-commercial", | |
| "knowledge_cutoff": "2023/07" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 20.105123059326157, | |
| "Standard Deviation": 4.100609090750239, | |
| "Rank": 51 | |
| }, | |
| "Geometry": { | |
| "Average Score": 13.080654946737525, | |
| "Standard Deviation": null, | |
| "Rank": 48 | |
| }, | |
| "Algebra": { | |
| "Average Score": 20.125194674408167, | |
| "Standard Deviation": null, | |
| "Rank": 49 | |
| }, | |
| "Probability": { | |
| "Average Score": 13.125942598704368, | |
| "Standard Deviation": null, | |
| "Rank": 49 | |
| }, | |
| "Logical": { | |
| "Average Score": 17.182300978389822, | |
| "Standard Deviation": null, | |
| "Rank": 48 | |
| }, | |
| "Social": { | |
| "Average Score": 16.258399348520832, | |
| "Standard Deviation": null, | |
| "Rank": 50 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 23.79065696739089, | |
| "Standard Deviation": null, | |
| "Rank": 44 | |
| }, | |
| "CPP": { | |
| "Average Score": 21.840013221590294, | |
| "Standard Deviation": null, | |
| "Rank": 40 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "zephyr-7b-beta", | |
| "organization": "HuggingFace", | |
| "license": "MIT", | |
| "knowledge_cutoff": "2023/10" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 11.581258432641418, | |
| "Standard Deviation": 1.677081510212375, | |
| "Rank": 54 | |
| }, | |
| "Geometry": { | |
| "Average Score": 8.432624521698594, | |
| "Standard Deviation": null, | |
| "Rank": 50 | |
| }, | |
| "Algebra": { | |
| "Average Score": 12.912859660357217, | |
| "Standard Deviation": null, | |
| "Rank": 51 | |
| }, | |
| "Probability": { | |
| "Average Score": 7.643552619113196, | |
| "Standard Deviation": null, | |
| "Rank": 54 | |
| }, | |
| "Logical": { | |
| "Average Score": 7.444095116649809, | |
| "Standard Deviation": null, | |
| "Rank": 55 | |
| }, | |
| "Social": { | |
| "Average Score": 0.0, | |
| "Standard Deviation": null, | |
| "Rank": 57 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 16.150157007299235, | |
| "Standard Deviation": null, | |
| "Rank": 49 | |
| }, | |
| "CPP": { | |
| "Average Score": 18.92902220864132, | |
| "Standard Deviation": null, | |
| "Rank": 43 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "gemma-1.1-2b-it", | |
| "organization": "Google", | |
| "license": "Gemma License", | |
| "knowledge_cutoff": "2024/02" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 25.06653151900311, | |
| "Standard Deviation": 5.340973431345662, | |
| "Rank": 48 | |
| }, | |
| "Geometry": { | |
| "Average Score": 13.161686218568628, | |
| "Standard Deviation": null, | |
| "Rank": 47 | |
| }, | |
| "Algebra": { | |
| "Average Score": 15.592205919293873, | |
| "Standard Deviation": null, | |
| "Rank": 50 | |
| }, | |
| "Probability": { | |
| "Average Score": 8.305764696120711, | |
| "Standard Deviation": null, | |
| "Rank": 51 | |
| }, | |
| "Logical": { | |
| "Average Score": 10.940766703849592, | |
| "Standard Deviation": null, | |
| "Rank": 53 | |
| }, | |
| "Social": { | |
| "Average Score": 21.925546766366356, | |
| "Standard Deviation": null, | |
| "Rank": 46 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 18.700936936742952, | |
| "Standard Deviation": null, | |
| "Rank": 46 | |
| }, | |
| "CPP": { | |
| "Average Score": 20.724691953843916, | |
| "Standard Deviation": null, | |
| "Rank": 41 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "llama2-7b-chat", | |
| "organization": "Meta", | |
| "license": "Llama 2 Community", | |
| "knowledge_cutoff": "2023/07" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 25.633612357313762, | |
| "Standard Deviation": 2.805639153654191, | |
| "Rank": 46 | |
| }, | |
| "Geometry": { | |
| "Average Score": 5.825877827672446, | |
| "Standard Deviation": null, | |
| "Rank": 51 | |
| }, | |
| "Algebra": { | |
| "Average Score": 8.58657284915635, | |
| "Standard Deviation": null, | |
| "Rank": 53 | |
| }, | |
| "Probability": { | |
| "Average Score": 8.164826137672431, | |
| "Standard Deviation": null, | |
| "Rank": 53 | |
| }, | |
| "Logical": { | |
| "Average Score": 20.697630462723275, | |
| "Standard Deviation": null, | |
| "Rank": 47 | |
| }, | |
| "Social": { | |
| "Average Score": 18.13821609304045, | |
| "Standard Deviation": null, | |
| "Rank": 47 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 17.065363968846427, | |
| "Standard Deviation": null, | |
| "Rank": 47 | |
| }, | |
| "CPP": { | |
| "Average Score": 15.730513733660898, | |
| "Standard Deviation": null, | |
| "Rank": 45 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "gemma-2b-it", | |
| "organization": "Google", | |
| "license": "Gemma License", | |
| "knowledge_cutoff": "2024/02" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 22.935122315202772, | |
| "Standard Deviation": 1.9451357494738446, | |
| "Rank": 49 | |
| }, | |
| "Geometry": { | |
| "Average Score": 15.523844579555126, | |
| "Standard Deviation": null, | |
| "Rank": 46 | |
| }, | |
| "Algebra": { | |
| "Average Score": 8.997563653883809, | |
| "Standard Deviation": null, | |
| "Rank": 52 | |
| }, | |
| "Probability": { | |
| "Average Score": 6.750305898269558, | |
| "Standard Deviation": null, | |
| "Rank": 55 | |
| }, | |
| "Logical": { | |
| "Average Score": 5.354222904092569, | |
| "Standard Deviation": null, | |
| "Rank": 56 | |
| }, | |
| "Social": { | |
| "Average Score": 10.938132042877358, | |
| "Standard Deviation": null, | |
| "Rank": 54 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 17.06532733699507, | |
| "Standard Deviation": null, | |
| "Rank": 47 | |
| }, | |
| "CPP": { | |
| "Average Score": 17.2715657115764, | |
| "Standard Deviation": null, | |
| "Rank": 44 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "llama2-13b-chat", | |
| "organization": "Meta", | |
| "license": "Llama 2 Community", | |
| "knowledge_cutoff": "2023/07" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 25.828530292775856, | |
| "Standard Deviation": 3.2503558704879296, | |
| "Rank": 47 | |
| }, | |
| "Geometry": { | |
| "Average Score": 4.119943280135397, | |
| "Standard Deviation": null, | |
| "Rank": 53 | |
| }, | |
| "Algebra": { | |
| "Average Score": 6.355347828676415, | |
| "Standard Deviation": null, | |
| "Rank": 54 | |
| }, | |
| "Probability": { | |
| "Average Score": 11.5585998384148, | |
| "Standard Deviation": null, | |
| "Rank": 50 | |
| }, | |
| "Logical": { | |
| "Average Score": 24.172674067890938, | |
| "Standard Deviation": null, | |
| "Rank": 43 | |
| }, | |
| "Social": { | |
| "Average Score": 17.850287642446094, | |
| "Standard Deviation": null, | |
| "Rank": 49 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 13.887442704655687, | |
| "Standard Deviation": null, | |
| "Rank": 52 | |
| }, | |
| "CPP": { | |
| "Average Score": 13.17258252933903, | |
| "Standard Deviation": null, | |
| "Rank": 48 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "vicuna-7b", | |
| "organization": "LMSYS", | |
| "license": "Non-commercial", | |
| "knowledge_cutoff": "2023/07" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 19.78471384913738, | |
| "Standard Deviation": 3.7936645273402276, | |
| "Rank": 52 | |
| }, | |
| "Geometry": { | |
| "Average Score": 5.434763675792798, | |
| "Standard Deviation": null, | |
| "Rank": 52 | |
| }, | |
| "Algebra": { | |
| "Average Score": 5.925959137419872, | |
| "Standard Deviation": null, | |
| "Rank": 55 | |
| }, | |
| "Probability": { | |
| "Average Score": 8.30566475354697, | |
| "Standard Deviation": null, | |
| "Rank": 51 | |
| }, | |
| "Logical": { | |
| "Average Score": 11.881223740003346, | |
| "Standard Deviation": null, | |
| "Rank": 52 | |
| }, | |
| "Social": { | |
| "Average Score": 12.864677350128595, | |
| "Standard Deviation": null, | |
| "Rank": 53 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 14.187574975522333, | |
| "Standard Deviation": null, | |
| "Rank": 50 | |
| }, | |
| "CPP": { | |
| "Average Score": 14.255194156624162, | |
| "Standard Deviation": null, | |
| "Rank": 46 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "koala-13b", | |
| "organization": "UC Berkeley", | |
| "license": "Non-commercial", | |
| "knowledge_cutoff": "2023/04" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 10.216910767982592, | |
| "Standard Deviation": 2.0597606260293655, | |
| "Rank": 55 | |
| }, | |
| "Geometry": { | |
| "Average Score": 0.1600118163292883, | |
| "Standard Deviation": null, | |
| "Rank": 54 | |
| }, | |
| "Algebra": { | |
| "Average Score": 2.2219841274068948, | |
| "Standard Deviation": null, | |
| "Rank": 56 | |
| }, | |
| "Probability": { | |
| "Average Score": 3.353938470588142, | |
| "Standard Deviation": null, | |
| "Rank": 56 | |
| }, | |
| "Logical": { | |
| "Average Score": 8.24436273551765, | |
| "Standard Deviation": null, | |
| "Rank": 54 | |
| }, | |
| "Social": { | |
| "Average Score": 10.96000067573448, | |
| "Standard Deviation": null, | |
| "Rank": 54 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 6.272570799004611, | |
| "Standard Deviation": null, | |
| "Rank": 53 | |
| }, | |
| "CPP": { | |
| "Average Score": 6.36433272373514, | |
| "Standard Deviation": null, | |
| "Rank": 49 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "openassistant-pythia-12b", | |
| "organization": "OpenAssistant", | |
| "license": "Non-commercial", | |
| "knowledge_cutoff": "2023/04" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 0.0, | |
| "Standard Deviation": 0.0, | |
| "Rank": 56 | |
| }, | |
| "Geometry": { | |
| "Average Score": 0.0, | |
| "Standard Deviation": null, | |
| "Rank": 55 | |
| }, | |
| "Algebra": { | |
| "Average Score": 0.0, | |
| "Standard Deviation": null, | |
| "Rank": 57 | |
| }, | |
| "Probability": { | |
| "Average Score": 0.0, | |
| "Standard Deviation": null, | |
| "Rank": 57 | |
| }, | |
| "Logical": { | |
| "Average Score": 0.0, | |
| "Standard Deviation": null, | |
| "Rank": 57 | |
| }, | |
| "Social": { | |
| "Average Score": 1.859688217710296, | |
| "Standard Deviation": null, | |
| "Rank": 56 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 0.0, | |
| "Standard Deviation": null, | |
| "Rank": 54 | |
| }, | |
| "CPP": { | |
| "Average Score": 0.0, | |
| "Standard Deviation": null, | |
| "Rank": 50 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "nemotron-70b", | |
| "organization": "NVIDIA", | |
| "license": "Unknown", | |
| "knowledge_cutoff": "Unknown" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 100.0, | |
| "Standard Deviation": 0.0, | |
| "Rank": 1 | |
| }, | |
| "Geometry": { | |
| "Average Score": 68.72757963233221, | |
| "Standard Deviation": null, | |
| "Rank": 12 | |
| }, | |
| "Algebra": { | |
| "Average Score": 73.71625129267943, | |
| "Standard Deviation": null, | |
| "Rank": 16 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 72.48678626772566, | |
| "Standard Deviation": null, | |
| "Rank": 14 | |
| }, | |
| "Logical": { | |
| "Average Score": 92.57864400540329, | |
| "Standard Deviation": null, | |
| "Rank": 5 | |
| }, | |
| "Social": { | |
| "Average Score": 99.63342284899149, | |
| "Standard Deviation": null, | |
| "Rank": 2 | |
| }, | |
| "Probability": { | |
| "Average Score": 75.30735899300154, | |
| "Standard Deviation": null, | |
| "Rank": 10 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "llama-3.2-3b-it", | |
| "organization": "Meta", | |
| "license": "Llama 3 Community", | |
| "knowledge_cutoff": "Unknown" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 29.47099904114387, | |
| "Standard Deviation": 1.6836027650802912, | |
| "Rank": 43 | |
| }, | |
| "Geometry": { | |
| "Average Score": 0.0, | |
| "Standard Deviation": 0.0, | |
| "Rank": 50 | |
| }, | |
| "Algebra": { | |
| "Average Score": 55.31592410564261, | |
| "Standard Deviation": null, | |
| "Rank": 29 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 28.667640602193643, | |
| "Standard Deviation": null, | |
| "Rank": 43 | |
| }, | |
| "Logical": { | |
| "Average Score": 15.35430947415723, | |
| "Standard Deviation": null, | |
| "Rank": 49 | |
| }, | |
| "Social": { | |
| "Average Score": 18.087938295545133, | |
| "Standard Deviation": null, | |
| "Rank": 48 | |
| }, | |
| "Probability": { | |
| "Average Score": 37.84631410688676, | |
| "Standard Deviation": null, | |
| "Rank": 35 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "yi-lightning", | |
| "organization": "01 AI", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "Unknown" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 96.10303362688546, | |
| "Standard Deviation": 0.5365246195716372, | |
| "Rank": 3 | |
| }, | |
| "Geometry": { | |
| "Average Score": 77.09570683128703, | |
| "Standard Deviation": null, | |
| "Rank": 8 | |
| }, | |
| "Algebra": { | |
| "Average Score": 85.92132293392635, | |
| "Standard Deviation": null, | |
| "Rank": 6 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 95.7205664118507, | |
| "Standard Deviation": null, | |
| "Rank": 2 | |
| }, | |
| "Logical": { | |
| "Average Score": 94.60171867702756, | |
| "Standard Deviation": null, | |
| "Rank": 4 | |
| }, | |
| "Social": { | |
| "Average Score": 93.93680225135506, | |
| "Standard Deviation": null, | |
| "Rank": 6 | |
| }, | |
| "Probability": { | |
| "Average Score": 90.23858748317501, | |
| "Standard Deviation": null, | |
| "Rank": 3 | |
| } | |
| } | |
| }, | |
| { | |
| "config": { | |
| "model_name": "glm-4-plus", | |
| "organization": "Zhipu AI", | |
| "license": "Proprietary", | |
| "knowledge_cutoff": "Unknown" | |
| }, | |
| "results": { | |
| "OVERALL": { | |
| "Average Score": 90.50303579501356, | |
| "Standard Deviation": 5.202472970969946, | |
| "Rank": 6 | |
| }, | |
| "Geometry": { | |
| "Average Score": 76.37543021571776, | |
| "Standard Deviation": null, | |
| "Rank": 9 | |
| }, | |
| "Algebra": { | |
| "Average Score": 81.39859078752944, | |
| "Standard Deviation": null, | |
| "Rank": 10 | |
| }, | |
| "Chemistry": { | |
| "Average Score": 90.15506569759444, | |
| "Standard Deviation": null, | |
| "Rank": 6 | |
| }, | |
| "Logical": { | |
| "Average Score": 92.26403821208403, | |
| "Standard Deviation": null, | |
| "Rank": 6 | |
| }, | |
| "Social": { | |
| "Average Score": 100.0, | |
| "Standard Deviation": null, | |
| "Rank": 1 | |
| }, | |
| "Probability": { | |
| "Average Score": 73.99418447190348, | |
| "Standard Deviation": null, | |
| "Rank": 11 | |
| } | |
| } | |
| } | |
| ] |