style: code blacked
- app.py +4 -2
- src/common/data.py +3 -1
- src/eval/cli.py +30 -12
- src/eval/matchers.py +3 -1
- src/eval/metrics.py +1 -1
- src/generate/cli.py +27 -12
- src/generate/generators.py +2 -6
app.py
CHANGED

@@ -52,7 +52,9 @@ with gr.Blocks(
     ),
 ) as application:
     gr.Markdown("# 🥇 ROMB - Russian Olympiad Math Benchmark")
-    gr.Markdown(f"See ROMB-1.0 dataset there - [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}).")
+    gr.Markdown(
+        f"See ROMB-1.0 dataset there - [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME})."
+    )
     with gr.Tabs():
         with gr.Tab("Leaderboard"):
             gr.Markdown("In progress...")
@@ -77,7 +79,7 @@
             [22],
             [40],
             [230],
-        ]
+        ],
     )
 
 
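Both hunks are pure reflows: Black splits the long gr.Markdown f-string onto its own line and adds a magic trailing comma to the examples list; behavior is unchanged. A minimal runnable sketch of the surrounding layout, assuming DATASET_NAME is defined elsewhere in the app:

import gradio as gr

DATASET_NAME = "example/ROMB-1.0"  # assumption: the real constant lives in the app's config

with gr.Blocks() as application:
    gr.Markdown("# 🥇 ROMB - Russian Olympiad Math Benchmark")
    gr.Markdown(
        f"See ROMB-1.0 dataset there - [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME})."
    )
    with gr.Tabs():
        with gr.Tab("Leaderboard"):
            gr.Markdown("In progress...")

# application.launch() would serve the UI locally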
src/common/data.py
CHANGED

@@ -13,5 +13,7 @@ def load_dataset() -> pd.DataFrame:
     ds = datasets.load_dataset(DATASET_NAME, split="test")
     df = pd.DataFrame(ds)
 
-    df[DatasetSchema.correct_answer] = df[DatasetSchema.correct_answer].apply(json.loads)
+    df[DatasetSchema.correct_answer] = df[DatasetSchema.correct_answer].apply(
+        json.loads
+    )
     return df
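For context, the wrapped line decodes correct_answer values that the dataset stores as JSON strings. A self-contained sketch of the same json.loads-over-a-column pattern, with illustrative values:

import json

import pandas as pd

df = pd.DataFrame({"correct_answer": ["[1, 2]", "{\"x\": 3}"]})
df["correct_answer"] = df["correct_answer"].apply(json.loads)  # strings -> lists/dicts
print(df["correct_answer"].tolist())  # [[1, 2], {'x': 3}]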
src/eval/cli.py
CHANGED

@@ -41,7 +41,9 @@ def _evaluate_single_answer(
         )
     except Exception as e:
         print(e)
-        print(f"Error evaluating row with {row[DatasetSchema.check_type]} {row[DatasetSchema.id_]}: {y_true} vs {y_pred}")
+        print(
+            f"Error evaluating row with {row[DatasetSchema.check_type]} {row[DatasetSchema.id_]}: {y_true} vs {y_pred}"
+        )
         exit(1)
     return result
 
@@ -53,7 +55,9 @@ def _evaluate(
 ) -> pd.DataFrame:
     tqdm.pandas()
 
-    generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[GeneratedDatasetSchema.generated_answer].apply(
+    generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[
+        GeneratedDatasetSchema.generated_answer
+    ].apply(
         lambda x: GenerationAnswer.model_validate(deepcopy(x)) if x else None,
     )
     dataset_df = load_dataset()
@@ -67,10 +71,14 @@ def _evaluate(
         axis=1,
     )
 
-    predictions_df[DatasetEvalSchema.predicted_answer] = predictions_df[GeneratedDatasetSchema.generated_answer].apply(
+    predictions_df[DatasetEvalSchema.predicted_answer] = predictions_df[
+        GeneratedDatasetSchema.generated_answer
+    ].apply(
         lambda x: x.answer if not pd.isna(x) else None,
     )
-    predictions_df[DatasetEvalSchema.context] = predictions_df[GeneratedDatasetSchema.generated_answer].apply(
+    predictions_df[DatasetEvalSchema.context] = predictions_df[
+        GeneratedDatasetSchema.generated_answer
+    ].apply(
         lambda x: x.context if not pd.isna(x) else None,
     )
     predictions_df = predictions_df[list(DatasetEvalSchema._collect_fields().keys())]
@@ -91,7 +99,9 @@ def evaluate(
 
     df = pd.read_json(file, lines=True)
     evaluated_df = _evaluate(df)
-    evaluated_df.to_json(file.with_suffix(".eval.jsonl"), orient="records", lines=True, force_ascii=False)
+    evaluated_df.to_json(
+        file.with_suffix(".eval.jsonl"), orient="records", lines=True, force_ascii=False
+    )
 
 
 @pa.check_input(DatasetEvalSchema)
@@ -101,16 +111,24 @@ def _metrics(
     model_name: str,
     model_size: float,
     model_url: str,
-    model_config: str
+    model_config: str,
 ) -> pd.DataFrame:
     pass1 = df[DatasetEvalSchema.is_correct].mean()
 
     w = df[DatasetEvalSchema.grade].apply(grade_to_weight)
-    weighted_accuracy = (df[DatasetEvalSchema.is_correct].astype(int) * w).sum() / w.sum()
-
-    arith_pass1 = df[df[DatasetEvalSchema.task_type] == "arith"][DatasetEvalSchema.is_correct].mean()
-    geometry_pass1 = df[df[DatasetEvalSchema.task_type] == "geometry"][DatasetEvalSchema.is_correct].mean()
-    logic_pass1 = df[df[DatasetEvalSchema.task_type] == "logic"][DatasetEvalSchema.is_correct].mean()
+    weighted_accuracy = (
+        df[DatasetEvalSchema.is_correct].astype(int) * w
+    ).sum() / w.sum()
+
+    arith_pass1 = df[df[DatasetEvalSchema.task_type] == "arith"][
+        DatasetEvalSchema.is_correct
+    ].mean()
+    geometry_pass1 = df[df[DatasetEvalSchema.task_type] == "geometry"][
+        DatasetEvalSchema.is_correct
+    ].mean()
+    logic_pass1 = df[df[DatasetEvalSchema.task_type] == "logic"][
+        DatasetEvalSchema.is_correct
+    ].mean()
 
     result = {
         LeaderBoardSchema.model_name: model_name,
@@ -174,7 +192,7 @@ def metrics(
         model_name=model_name,
         model_size=model_size,
         model_url=model_url,
-        model_config=model_config or ""
+        model_config=model_config or "",
     )
     metrics = metrics_df.to_dict(orient="records")[0]
     print(f"Metrics for {model_name}:")
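The reflowed weighted_accuracy is a grade-weighted mean: per-row correctness (cast to 0/1) is multiplied by the row's grade weight, summed, and divided by the total weight, so tasks from higher grades count for more. A small numeric check of the formula with made-up values:

import pandas as pd

is_correct = pd.Series([True, False, True])
w = pd.Series([5.0, 6.0, 9.0])  # per-row weights, e.g. from grade_to_weight

weighted_accuracy = (is_correct.astype(int) * w).sum() / w.sum()
print(weighted_accuracy)  # (5 + 9) / 20 = 0.7, vs. an unweighted pass@1 of 2/3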
src/eval/matchers.py
CHANGED

@@ -32,7 +32,9 @@ def um(y_true: list, y_pred: list) -> bool:
         return False
     if len(y_true) == 0:
         return True
-    if (len(y_true) > 0 and type(y_true[0]) is dict) or (len(y_true) == 0 and type(y_pred[0]) is dict):
+    if (len(y_true) > 0 and type(y_true[0]) is dict) or (
+        len(y_true) == 0 and type(y_pred[0]) is dict
+    ):
         y_true = [_dict_to_tuple(item) for item in y_true]
         y_pred = [_dict_to_tuple(item) for item in y_pred]
     if type(y_true) != type(y_pred):
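The wrapped condition in um is unchanged: when the lists hold dicts, both sides are first mapped through _dict_to_tuple, since dicts are unhashable and cannot be compared order-insensitively as-is. (After the earlier len(y_true) == 0 early return, the second disjunct looks unreachable; the reformat keeps it verbatim.) A sketch of that normalization, with a hypothetical _dict_to_tuple:

def _dict_to_tuple(d: dict) -> tuple:
    # hypothetical helper: sort items so equal dicts map to equal, hashable tuples
    return tuple(sorted(d.items()))

y_true = [{"a": 1}, {"b": 2}]
y_pred = [{"b": 2}, {"a": 1}]
print(sorted(map(_dict_to_tuple, y_true)) == sorted(map(_dict_to_tuple, y_pred)))  # True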
src/eval/metrics.py
CHANGED

@@ -3,5 +3,5 @@ import numpy as np
 
 def grade_to_weight(g: str) -> float:
     """Convert a grade string to a weight value."""
-    parts = list(map(int, g.split('-')))
+    parts = list(map(int, g.split("-")))
     return np.mean(parts)
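grade_to_weight parses a grade or grade range and averages its endpoints, so a single grade maps to itself and a range to its midpoint. Using the function exactly as committed:

import numpy as np

def grade_to_weight(g: str) -> float:
    """Convert a grade string to a weight value."""
    parts = list(map(int, g.split("-")))
    return np.mean(parts)

print(grade_to_weight("7"))    # 7.0
print(grade_to_weight("5-9"))  # 7.0, the midpoint of grades 5..9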
src/generate/cli.py
CHANGED

@@ -41,7 +41,9 @@ def _generate_single_answer(
 ) -> GenerationAnswer:
     if temp_path and (temp_path / f"{row[DatasetSchema.id_]}.json").exists():
         return GenerationAnswer.model_validate(
-            json.load(open(temp_path / f"{row[DatasetSchema.id_]}.json", "r"))[GeneratedDatasetSchema.generated_answer]
+            json.load(open(temp_path / f"{row[DatasetSchema.id_]}.json", "r"))[
+                GeneratedDatasetSchema.generated_answer
+            ]
         )
     answer_type = make_root_model(row[DatasetSchema.answer_type])
     chain = build_chain(answer_type)
@@ -131,14 +133,16 @@ def generate(
         build_chain_function,
         llm_class=config.llm_class,
        structured_output_method=config.structured_output_method,
-        **config.kwargs
+        **config.kwargs,
     )
 
-    df = _generate_answers(df, build_chain_function, use_tqdm=use_tqdm, temp_path=temp_path)
-
-    df[GeneratedDatasetSchema.generated_answer] = df[GeneratedDatasetSchema.generated_answer].apply(
-        lambda x: x.model_dump()
+    df = _generate_answers(
+        df, build_chain_function, use_tqdm=use_tqdm, temp_path=temp_path
     )
+
+    df[GeneratedDatasetSchema.generated_answer] = df[
+        GeneratedDatasetSchema.generated_answer
+    ].apply(lambda x: x.model_dump())
     df.to_json(
         output_path,
         lines=True,
@@ -151,15 +155,19 @@ def generate(
 def _type_sanitycheck(
     generated_df: pd.DataFrame,
 ) -> tuple[bool, str]:
-    generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[GeneratedDatasetSchema.generated_answer].apply(
-        lambda x: GenerationAnswer.model_validate(deepcopy(x)) if not isinstance(x, GenerationAnswer) else x
+    generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[
+        GeneratedDatasetSchema.generated_answer
+    ].apply(
+        lambda x: GenerationAnswer.model_validate(deepcopy(x))
+        if not isinstance(x, GenerationAnswer)
+        else x
     )
 
     dataset_df = load_dataset()
     predicted_df = dataset_df.join(
         generated_df.set_index(GeneratedDatasetSchema.id_),
         on=DatasetSchema.id_,
-        rsuffix='_generated',
+        rsuffix="_generated",
     ).dropna(subset=[GeneratedDatasetSchema.generated_answer])
 
     if len(predicted_df) == 0:
@@ -170,13 +178,20 @@ def _type_sanitycheck(
         lambda row: matches_type(
             row[GeneratedDatasetSchema.generated_answer].answer,
             string_to_type(row[DatasetSchema.answer_type]),
-        ), axis=1,
+        ),
+        axis=1,
     )
 
     if not predicted_df[TYPE_MATCH].all():
-        return False, f"Type mismatch found for {predicted_df[~predicted_df[TYPE_MATCH]][DatasetSchema.id_].tolist()}."
+        return (
+            False,
+            f"Type mismatch found for {predicted_df[~predicted_df[TYPE_MATCH]][DatasetSchema.id_].tolist()}.",
+        )
 
-    return True, f"All matched. Predicted count: {len(predicted_df)} of {len(dataset_df)}"
+    return (
+        True,
+        f"All matched. Predicted count: {len(predicted_df)} of {len(dataset_df)}",
+    )
 
 
 @click.command()
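The first hunk reflows _generate_single_answer's cache lookup: a per-row JSON file under temp_path short-circuits generation, so interrupted runs can resume without re-querying the model. A minimal sketch of that resume pattern; the GenerationAnswer fields and the "generated_answer" key are assumptions read off the diff:

import json
from pathlib import Path
from typing import Any

from pydantic import BaseModel

class GenerationAnswer(BaseModel):  # simplified stand-in for the real model
    answer: Any = None
    context: dict = {}

def load_cached(temp_path: Path, row_id: str) -> GenerationAnswer | None:
    cached = temp_path / f"{row_id}.json"
    if not cached.exists():
        return None  # nothing cached yet; caller generates from scratch
    with cached.open("r") as f:
        return GenerationAnswer.model_validate(json.load(f)["generated_answer"])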
src/generate/generators.py
CHANGED

@@ -57,9 +57,7 @@ def build_singleturn_chain(
             context={},
         )
     )
-    chain = chain.with_retry(
-        retry_if_exception_type=(openai.PermissionDeniedError, )
-    )
+    chain = chain.with_retry(retry_if_exception_type=(openai.PermissionDeniedError,))
    return chain
 
 
@@ -126,9 +124,7 @@ def build_thinking_chain(
             )
         )
     )
-    chain = chain.with_retry(
-        retry_if_exception_type=(openai.PermissionDeniedError, )
-    )
+    chain = chain.with_retry(retry_if_exception_type=(openai.PermissionDeniedError,))
     return chain
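Collapsing with_retry onto one line leaves the retry policy intact: LangChain's Runnable.with_retry re-invokes the chain only when one of the listed exception types is raised. A minimal sketch against a RunnableLambda, with an illustrative flaky function standing in for the LLM call:

from langchain_core.runnables import RunnableLambda

attempts = {"n": 0}

def flaky(x: int) -> int:
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise ValueError("transient failure")  # stands in for openai.PermissionDeniedError
    return x * 2

chain = RunnableLambda(flaky).with_retry(
    retry_if_exception_type=(ValueError,),  # retry only on these exceptions
    stop_after_attempt=3,
)
print(chain.invoke(21))  # 42, after two retried failures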