import json
import re

from transformers import AutoTokenizer

# --- Configuration ---
dataset_path = "all_dataset_train.jsonl"  # path to your data file
model_path = "/root/autodl-tmp/output_7B_FULL_cotSFT/v8-20250720-210226/checkpoint-58"  # model used to load the tokenizer
required_fields = ["input", "output"]  # mandatory fields
max_token_length = 8192  # maximum allowed number of tokens (adjust to your model)

# Load the tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Control-character check (tab, newline, and carriage return are excluded,
# since they are legitimate in free-form text fields)
def has_control_chars(text):
    return bool(re.search(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", text))

# Check the dataset line by line
print("Checking dataset...\n")
with open(dataset_path, "r", encoding="utf-8") as f:
    for idx, line in enumerate(f, 1):
        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"[Line {idx}] ❌ JSON decode error: {e}")
            continue

        # Check that required fields are present and non-empty
        for field in required_fields:
            if field not in data:
                print(f"[Line {idx}] ❌ Missing required field: '{field}'")
            elif not str(data[field]).strip():
                print(f"[Line {idx}] ❌ Field '{field}' is empty")

        # Control-character check
        input_text = data.get("input", "")
        output_text = data.get("output", "")
        if has_control_chars(input_text + output_text):
            print(f"[Line {idx}] ⚠️ Contains control characters")

        # Token-length check
        try:
            tokens = tokenizer(input_text + output_text, return_tensors="pt")
            token_len = tokens["input_ids"].shape[1]
            if token_len > max_token_length:
                print(f"[Line {idx}] ⚠️ Too many tokens: {token_len} > {max_token_length}")
        except Exception as e:
            print(f"[Line {idx}] ❌ Tokenization error: {e}")

print("\n✅ Dataset check complete.")
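
# Note on the expected data format: each line of the JSONL file should be a
# standalone JSON object containing at least the "input" and "output" fields
# checked above. A purely illustrative (hypothetical) example of a valid line:
#
#   {"input": "Translate 'hello' to French.", "output": "bonjour"}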