import time

import numpy as np
import pandas as pd
import torch
from datasets import DatasetDict
from loguru import logger

import turing.config as config


def calculate_submission_score(avg_f1: float, avg_runtime: float, avg_flops: float) -> float:
    """
    Calculates the final competition score.

    The score is a weighted sum of F1 score, runtime, and GFLOPS.

    Weights:
        - F1 Score: 60%
        - Runtime: 20%
        - GFLOPS: 20%

    Args:
        avg_f1 (float): Average F1 score across all categories.
        avg_runtime (float): Average runtime in seconds.
        avg_flops (float): Average GFLOPS.

    Returns:
        float: Final submission score.
    """
    score_f1 = 0.6 * avg_f1

    runtime_ratio = (config.MAX_AVG_RUNTIME - avg_runtime) / config.MAX_AVG_RUNTIME
    score_runtime = 0.2 * max(runtime_ratio, 0)

    flops_ratio = (config.MAX_AVG_FLOPS - avg_flops) / config.MAX_AVG_FLOPS
    score_flops = 0.2 * max(flops_ratio, 0)

    total_score = score_f1 + score_runtime + score_flops

    logger.info(f"  F1 Score (60%): {score_f1:.4f} (avg_f1: {avg_f1:.4f})")
    logger.info(
        f"  Runtime Score (20%): {score_runtime:.4f} (avg_runtime: {avg_runtime:.4f}s / {config.MAX_AVG_RUNTIME}s)"
    )
    logger.info(
        f"  GFLOPS Score (20%): {score_flops:.4f} (avg_flops: {avg_flops:.4f} / {config.MAX_AVG_FLOPS})"
    )
    logger.info("  ====================")
    logger.info(f"  Final Score: {total_score:.4f}")

    return total_score
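
# Worked example of the scoring formula above. The numbers are illustrative only:
# the real config.MAX_AVG_RUNTIME and config.MAX_AVG_FLOPS values may differ.
# Assuming MAX_AVG_RUNTIME = 60 s and MAX_AVG_FLOPS = 200 GFLOPS:
#
#   calculate_submission_score(avg_f1=0.80, avg_runtime=30.0, avg_flops=50.0)
#     = 0.6 * 0.80 + 0.2 * (60 - 30) / 60 + 0.2 * (200 - 50) / 200
#     = 0.48 + 0.10 + 0.15
#     = 0.73
#
# Runtime and GFLOPS contributions are clipped at 0, so exceeding a budget never
# subtracts from the F1 portion of the score.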
def evaluate_models(models: dict, dataset: DatasetDict):
    """
    Evaluates the provided models on the test datasets for each language.

    Computes precision, recall, and F1 score for each category and language.
    Also measures average runtime and GFLOPS for model inference.

    Args:
        models (dict): A dictionary mapping language codes to their respective models.
        dataset (DatasetDict): A DatasetDict containing test datasets for each language.

    Returns:
        pd.DataFrame: DataFrame containing precision, recall, and F1 scores for each
            category and language.
        float: Final submission score calculated based on average F1, runtime, and GFLOPS.
    """
    total_flops = 0
    total_time = 0
    scores = []

    for lan in config.LANGS:
        logger.info(f"\n--- Evaluating Language: {lan.upper()} ---")
        model = models[lan]

        with torch.profiler.profile(with_flops=True) as p:
            test_data = dataset[f"{lan}_test"]
            x = test_data[config.INPUT_COLUMN]
            x = list(x) if hasattr(x, "tolist") else x  # Ensure a plain list (handles pandas/NumPy-backed columns)
            y_true = np.array(test_data[config.LABEL_COLUMN]).T

            begin = time.time()
            # Run inference 10 times to smooth out runtime variance.
            for _ in range(10):
                y_pred = model.predict(x)
            y_pred = np.asarray(y_pred).T
            total = time.time() - begin
            total_time = total_time + total

        # key_averages() is read after the profiler context exits; convert FLOPs to GFLOPs.
        total_flops = total_flops + (sum(k.flops for k in p.key_averages()) / 1e9)

        for i in range(len(y_pred)):
            assert len(y_pred[i]) == len(y_true[i])

            tp = sum([true == pred == 1 for (true, pred) in zip(y_true[i], y_pred[i])])
            # tn = sum([true == pred == 0 for (true, pred) in zip(y_true[i], y_pred[i])])
            fp = sum([true == 0 and pred == 1 for (true, pred) in zip(y_true[i], y_pred[i])])
            fn = sum([true == 1 and pred == 0 for (true, pred) in zip(y_true[i], y_pred[i])])

            # Guard against empty denominators (e.g. a category that is never predicted).
            precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
            f1 = (2 * tp) / (2 * tp + fp + fn) if (2 * tp + fp + fn) > 0 else 0.0

            scores.append({
                "lan": lan,
                "cat": config.LABELS_MAP[lan][i],
                "precision": precision,
                "recall": recall,
                "f1": f1,
            })

    # Totals are summed across languages and divided by the 10 repetitions.
    logger.info(f"Compute in GFLOPs: {total_flops / 10}")
    logger.info(f"Avg runtime in seconds: {total_time / 10}")

    scores = pd.DataFrame(scores)
    print(scores)

    avg_f1 = scores["f1"].mean()
    avg_runtime = total_time / 10
    avg_flops = total_flops / 10

    final_score = calculate_submission_score(avg_f1, avg_runtime, avg_flops)
    logger.info(f"Final Submission Score: {final_score:.4f}")

    return scores, final_score
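

# Minimal usage sketch, not part of the evaluation harness itself. It assumes a
# hypothetical turing.models.load_model(lan) helper and a saved DatasetDict whose
# splits follow the "{lan}_test" naming used above; adapt the import and the path
# to the actual project layout before running.
if __name__ == "__main__":
    from datasets import load_from_disk

    from turing.models import load_model  # hypothetical helper, not defined here

    dataset = load_from_disk("data/processed")  # assumed location of the DatasetDict
    models = {lan: load_model(lan) for lan in config.LANGS}

    scores, final_score = evaluate_models(models, dataset)
    scores.to_csv("evaluation_scores.csv", index=False)
    logger.info(f"Submission score: {final_score:.4f}")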