Spaces:

pelcra
/

llmlagbench

Sleeping

App Files Files Community

fzarnecki committed on Nov 18

Commit

654cff7

1 Parent(s): 207bddc

Changed column header

Browse files

Files changed (2) hide show

app.py +3 -3
src/utils.py +48 -20

app.py CHANGED Viewed

@@ -54,7 +54,7 @@ def update_dashboard(graph_years, graph_model_filter):
         graph_years = cfg.get("years")
     # keep some necessary metadata columns in the specified order
-    metadata_cols = ["Model", "Overall Average", "Provider cutoff", "1st Detected cutoff", "2nd Detected cutoff", "Provider", "Release date", "Model cutoff", "trend_changepoints", "Parameters"]
     cols = metadata_cols.copy()
     yearly_df = df.copy()
@@ -63,7 +63,7 @@ def update_dashboard(graph_years, graph_model_filter):
     # TODO if >1 year - aggregate the values to be per year, not per month
     if len(table_years) > 1:
-        lb_cols = ["Model", "Overall Average", "Provider cutoff", "1st Detected cutoff", "2nd Detected cutoff", "Provider", "Release date", "Model cutoff", "Parameters"] + [y for y in cfg.get("aggregated_cols_year") if y in table_years]
         yearly_df = yearly_df[lb_cols]
     # Expand years into their YYYY_MM columns (for table)
@@ -101,7 +101,7 @@ def update_dashboard(graph_years, graph_model_filter):
     # Build tidy dataframe for gr.LinePlot with columns x, y, Model
     records = []
     # Exclude all metadata columns and yearly aggregates from x_labels - only keep monthly columns
-    excluded_cols = {"Model", "Overall Average", "Parameters", "1st Detected cutoff", "2nd Detected cutoff", "Provider", "Provider cutoff", "Release date", "Model cutoff", "trend_changepoints"}
     x_labels = [c for c in graph_cols if c not in excluded_cols and c not in graph_years]  # only months for the plot
     for _, row in graph_df.iterrows():
         for col in x_labels:

         graph_years = cfg.get("years")
     # keep some necessary metadata columns in the specified order
+    metadata_cols = ["Model", "Overall Average", "1st Detected cutoff", "2nd Detected cutoff", "Provider cutoff", "Provider", "Release date", "Self-declared cutoff", "trend_changepoints", "Parameters", "Evaluation period"]
     cols = metadata_cols.copy()
     yearly_df = df.copy()
     # TODO if >1 year - aggregate the values to be per year, not per month
     if len(table_years) > 1:
+        lb_cols = ["Model", "Overall Average", "1st Detected cutoff", "2nd Detected cutoff", "Provider cutoff", "Provider", "Release date", "Self-declared cutoff", "Parameters", "Evaluation period"] + [y for y in cfg.get("aggregated_cols_year") if y in table_years]
         yearly_df = yearly_df[lb_cols]
     # Expand years into their YYYY_MM columns (for table)
     # Build tidy dataframe for gr.LinePlot with columns x, y, Model
     records = []
     # Exclude all metadata columns and yearly aggregates from x_labels - only keep monthly columns
+    excluded_cols = {"Model", "Overall Average", "Parameters", "1st Detected cutoff", "2nd Detected cutoff", "Provider", "Provider cutoff", "Release date", "Self-declared cutoff", "trend_changepoints", "Evaluation period"}
     x_labels = [c for c in graph_cols if c not in excluded_cols and c not in graph_years]  # only months for the plot
     for _, row in graph_df.iterrows():
         for col in x_labels:

src/utils.py CHANGED Viewed

@@ -169,34 +169,50 @@ def build_year_column_mapping(years, months):
 def validate_equal_measurements(data):
     """
     Validate that all models have the same number of measurements.
     A model's count is the length of its 'dates' list; a missing or None
     'dates' entry counts as zero measurements.
     Args:
         data: Dictionary with model names as keys and per-model dicts as values
     Returns:
         tuple: (is_valid, measurement_count, error_message)
             - (False, 0, ...) when data contains no models
             - (True, count, ...) when every model has the same count
             - (False, None, ...) when counts differ; the message lists each
               model with its count, in ascending count order
     """
     # `or []` also covers an explicit None value, which .get()'s default does not.
     measurement_counts = {
         model_name: len(model_data.get('dates') or [])
         for model_name, model_data in data.items()
     }
     unique_counts = set(measurement_counts.values())
     if not unique_counts:
         return False, 0, "No models found in data"
     if len(unique_counts) == 1:
         count = next(iter(unique_counts))
         return True, count, f"All models have {count} measurements"
     # Models have different counts - list them from fewest to most measurements
     per_model_lines = [
         f"  {model}: {count}\n"
         for model, count in sorted(measurement_counts.items(), key=lambda x: x[1])
     ]
     error_msg = "Models have different measurement counts:\n" + "".join(per_model_lines)
     return False, None, error_msg
 def transform_leaderboard_data_to_dataframe(data, years, months, model_metadata=None):
@@ -212,12 +228,10 @@ def transform_leaderboard_data_to_dataframe(data, years, months, model_metadata=
     Returns:
         List of row dictionaries ready for DataFrame creation
     """
-    # Validate equal measurements
-    is_valid, count, message = validate_equal_measurements(data)
-    if not is_valid:
-        raise ValueError(f"Data validation failed: {message}")
-    print(f"✓ Validation passed: {message}")
     rows = []
@@ -258,7 +272,7 @@ def transform_leaderboard_data_to_dataframe(data, years, months, model_metadata=
             row["Parameters"] = metadata.get("Parameters", "")
             row["Provider cutoff"] = metadata.get("Provider cutoff", "")
             row["Release date"] = metadata.get("Release date", "")
-            row["Model cutoff"] = metadata.get("Model cutoff", "")
         else:
             # Set empty values if metadata not available
             # Fall back to extracting provider from model name if no metadata
@@ -266,19 +280,31 @@ def transform_leaderboard_data_to_dataframe(data, years, months, model_metadata=
             row["Parameters"] = ""
             row["Provider cutoff"] = ""
             row["Release date"] = ""
-            row["Model cutoff"] = ""
         # Aggregate faithfulness data to monthly averages
         dates = model_data.get("dates", [])
         faithfulness = model_data.get("faithfulness", [])
         monthly_averages = aggregate_weekly_to_monthly(dates, faithfulness)
         # Add monthly columns (e.g., "2021_01", "2021_02", ...)
         for month_key, avg_value in monthly_averages.items():
             row[month_key] = avg_value
         # Calculate yearly averages
-        all_years_values = []  # Collect all values for overall average
         for year in years:
             year_values = []
             for month in months:
@@ -293,6 +319,8 @@ def transform_leaderboard_data_to_dataframe(data, years, months, model_metadata=
             all_years_values.extend(year_values)
         # Calculate overall average across all years
         row["Overall Average"] = round(mean(all_years_values), 2) if all_years_values else None
         rows.append(row)

 def validate_equal_measurements(data):
     """
     Validate measurement counts across models and warn about discrepancies.
     A model's count is the length of its 'dates' list; a missing or None
     'dates' entry counts as zero measurements.
     Args:
         data: Dictionary with model names as keys and per-model dicts as values
     Returns:
         tuple: (is_valid, measurement_counts_dict, message)
             - is_valid: Always True (different counts are allowed)
             - measurement_counts_dict: Dict mapping model_name -> count
             - message: Info/warning message about the counts
     """
     # `or []` also covers an explicit None value, which .get()'s default does not.
     measurement_counts = {
         model_name: len(model_data.get('dates') or [])
         for model_name, model_data in data.items()
     }
     if not measurement_counts:
         return True, {}, "No models found in data"
     max_count = max(measurement_counts.values())
     min_count = min(measurement_counts.values())
     if max_count == min_count:
         # All models have same count
         return True, measurement_counts, f"All models have {max_count} measurements"
     # Counts differ, so at least one model is below the maximum: list those
     # first (fewest samples first), then the models that have the full set.
     models_with_fewer = [
         f"  {model}: {count} samples (missing {max_count - count})"
         for model, count in sorted(measurement_counts.items(), key=lambda x: x[1])
         if count < max_count
     ]
     models_at_max = [
         f"  {model}\n"
         for model, count in measurement_counts.items()
         if count == max_count
     ]
     warning_msg = (
         f"⚠️  Models have different measurement counts (range: {min_count}-{max_count}):\n"
         + "\n".join(models_with_fewer)
         + f"\n\nModels with maximum samples ({max_count}):\n"
         + "".join(models_at_max)
     )
     return True, measurement_counts, warning_msg
 def transform_leaderboard_data_to_dataframe(data, years, months, model_metadata=None):
     Returns:
         List of row dictionaries ready for DataFrame creation
     """
+    # Validate measurements and get counts per model
+    is_valid, measurement_counts, message = validate_equal_measurements(data)
+    print(message)
     rows = []
             row["Parameters"] = metadata.get("Parameters", "")
             row["Provider cutoff"] = metadata.get("Provider cutoff", "")
             row["Release date"] = metadata.get("Release date", "")
+            row["Self-declared cutoff"] = metadata.get("Model cutoff", "")
         else:
             # Set empty values if metadata not available
             # Fall back to extracting provider from model name if no metadata
             row["Parameters"] = ""
             row["Provider cutoff"] = ""
             row["Release date"] = ""
+            row["Self-declared cutoff"] = ""
         # Aggregate faithfulness data to monthly averages
         dates = model_data.get("dates", [])
         faithfulness = model_data.get("faithfulness", [])
         monthly_averages = aggregate_weekly_to_monthly(dates, faithfulness)
+        # Calculate evaluation period (min and max dates)
+        if dates:
+            try:
+                date_objects = [datetime.strptime(d, '%Y-%m-%d') for d in dates]
+                min_date = min(date_objects).strftime('%Y-%m-%d')
+                max_date = max(date_objects).strftime('%Y-%m-%d')
+                row["Evaluation period"] = f"{min_date} - {max_date}"
+            except Exception:
+                row["Evaluation period"] = ""
+        else:
+            row["Evaluation period"] = ""
         # Add monthly columns (e.g., "2021_01", "2021_02", ...)
         for month_key, avg_value in monthly_averages.items():
             row[month_key] = avg_value
         # Calculate yearly averages
+        all_years_values = []  # Collect all monthly values for overall average (specific to this model)
         for year in years:
             year_values = []
             for month in months:
             all_years_values.extend(year_values)
         # Calculate overall average across all years
+        # Note: This is calculated from the model's actual sample count
+        # Models with fewer samples will have their average based only on their available data
         row["Overall Average"] = round(mean(all_years_values), 2) if all_years_values else None
         rows.append(row)