import json
import logging
import os

import numpy as np
import torchaudio
from torch.utils.data import Dataset

# Prompt pieces shared by the "think" and "no-think" variants. They are kept
# as separate constants so both variants are assembled from one copy of the
# task description and scoring rubric.
_PROMPT_TITLE = "# Dialogue Response Evaluation\n\n"

_PROMPT_TASK_AND_CRITERIA = (
    "Listen to the dialogue recording (two sentences, 1-second pause in between). "
    "Evaluate the quality of the **second sentence** as a response to the first, "
    "focusing on **text relevance** and the **appropriateness** of "
    "**Linguistic information (a range of paralinguistic information such as "
    "emotion/age/pitch/speed/volume)**.\n"
    "**Note:** Focus on evaluating the appropriateness of the second sentence "
    "relative to the first, even if the first sentence itself contains "
    "contradictory information.\n\n"
    "## Scoring Criteria\n\n"
    "**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n"
    "**3 points**: Text is relevant, but paralinguistic information is "
    "**inappropriate** for the context.(low emotional quotient)\n"
    "**5 points**: Text is relevant, and paralinguistic information is "
    "**appropriate** for the context, resulting in effective communication."
    "(High intelligence and emotional intelligence.)\n\n"
    "## Evaluation Requirements\n\n"
    "Response **MUST** follow this format:\n\n"
)

_PROMPT_RATING_LINE = "X (**X is 1, 3, or 5**)\n\n"


def _handle_wav(wav_path, target_rate=16000):
    """Load one wav file and return its first channel at ``target_rate``.

    Args:
        wav_path: Path of the audio file, passed to ``torchaudio.load``.
        target_rate: Desired sample rate in Hz. The waveform is resampled
            only when the file's own sample rate differs from this value.

    Returns:
        The first channel of the (possibly resampled) waveform, i.e.
        ``waveform[0]`` of what ``torchaudio.load`` returned. This is a torch
        tensor, not a numpy array; callers convert with ``.numpy()``.
    """
    waveform, sample_rate = torchaudio.load(wav_path)
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_rate
        )
        waveform = resampler(waveform)
    # Only channel 0 is kept; any further channels are discarded.
    return waveform[0]


def _build_prompt(is_think, think_max_len):
    """Assemble the evaluation prompt text for the requested variant."""
    # NOTE(review): the empty `` pairs and the bare "\n" lines around the
    # analysis sentence look like places where tag names may have been
    # stripped from the prompt - confirm against the upstream prompt text.
    # They are reproduced here exactly as found.
    if is_think:
        important = (
            "**IMPORTANT:** Evaluation must include `` analysis and `` rating.\n\n"
        )
        response_format = (
            "\n"
            "Analysing text relevance and paralinguistic information "
            "**Appropriateness** and reasons for scoring..."
            f"(less than {think_max_len} words)\n"
            "\n\n"
        ) + _PROMPT_RATING_LINE
    else:
        important = "**IMPORTANT:** Evaluation must include`` rating.\n\n"
        response_format = _PROMPT_RATING_LINE
    return _PROMPT_TITLE + important + _PROMPT_TASK_AND_CRITERIA + response_format


def _handle_qa(obj, is_think=True, think_max_len=50):
    """Turn one dialogue record into a prompt/solution sample.

    Args:
        obj: Mapping with the required keys ``"id"``, ``"merge_wav"`` and
            ``"gt_score"``; ``"audio"`` and ``"clean_dialogue"`` are optional.
        is_think: When true, the prompt asks for an analysis section before
            the rating; otherwise it asks for the rating only.
        think_max_len: Word limit quoted in the analysis instruction. Only
            used when ``is_think`` is true.

    Returns:
        A dict with the keys ``"id"``, ``"prompt"`` (a single user message
        holding an audio part and a text part), ``"solution"`` (the value of
        ``obj["gt_score"]``), ``"audio"`` and ``"clean_dialogue"`` (each
        ``None`` when absent from ``obj``).

    Raises:
        KeyError: If ``obj`` lacks ``"id"``, ``"merge_wav"`` or ``"gt_score"``.
    """
    prompt_template = _build_prompt(is_think, think_max_len)

    # Build the processed sample.
    user_message = {
        "role": "user",
        "content": [
            {"type": "audio", "audio": obj["merge_wav"]},
            {"type": "text", "text": prompt_template},
        ],
    }
    return {
        "id": obj["id"],
        "prompt": [user_message],
        "solution": obj["gt_score"],
        "audio": obj.get("audio", None),
        "clean_dialogue": obj.get("clean_dialogue", None),
    }


class AudioDataset(Dataset):
    """Dataset of dialogue records read from the ``*.json`` files of a directory.

    Only lightweight metadata is collected at construction time. Prompts are
    built, and audio is optionally loaded, per item in ``__getitem__``.
    """

    def __init__(self, data_dir, sample_rate=16000, is_think=True,
                 think_max_len=50, load_audio=False):
        """Index every dialogue found in the JSON files under ``data_dir``.

        Args:
            data_dir: Directory that is listed (non-recursively) for files
                whose name ends in ``.json``.
            sample_rate: Target sample rate handed to ``_handle_wav`` when
                audio is loaded.
            is_think: Forwarded to ``_handle_qa`` to select the prompt variant.
            think_max_len: Forwarded to ``_handle_qa``.
            load_audio: When true, ``__getitem__`` also loads the waveform of
                each item whose ``merge_wav`` path exists on disk.
        """
        super().__init__()
        self.sample_rate = sample_rate
        self.data_dir = data_dir
        self.is_think = is_think
        self.think_max_len = think_max_len
        self.load_audio = load_audio
        # Store only metadata instead of full data.
        self.metadata = []
        self._load_metadata()
        logging.info(
            "Loaded metadata for %s dialogues from %s",
            len(self.metadata), data_dir,
        )

    def _load_metadata(self):
        """Append one metadata dict per dialogue to ``self.metadata``.

        Files that cannot be parsed, files whose top level is not a JSON
        object, and entries that are not JSON objects are skipped with a
        warning so that a single bad file does not abort loading.
        """
        # os.listdir() returns names in arbitrary order; sorting keeps the
        # index -> sample mapping stable across runs and machines.
        for fname in sorted(os.listdir(self.data_dir)):
            if not fname.endswith('.json'):
                continue
            fpath = os.path.join(self.data_dir, fname)
            with open(fpath, 'r', encoding='utf8') as f:
                try:
                    json_obj = json.load(f)
                except Exception as e:
                    # Deliberately broad best-effort: log and move on.
                    logging.warning("Failed to load %s: %s", fpath, e)
                    continue

            if not isinstance(json_obj, dict):
                logging.warning(
                    "Skipping %s: expected a JSON object at top level, got %s",
                    fpath, type(json_obj).__name__,
                )
                continue

            for dialogue_id, obj in json_obj.items():
                if not isinstance(obj, dict):
                    logging.warning(
                        "Skipping dialogue %s in %s: expected a JSON object, got %s",
                        dialogue_id, fpath, type(obj).__name__,
                    )
                    continue
                # Store only essential metadata.
                self.metadata.append({
                    "id": dialogue_id,
                    "merge_wav": obj.get("merge_wav", None),
                    "gt_score": obj.get("gt_score", None),
                    "clean_dialogue": obj.get("clean_dialogue", None),
                    "json_path": fpath,
                })

    def __len__(self):
        """Return the number of indexed dialogues."""
        return len(self.metadata)

    def __getitem__(self, index):
        """Build the sample for ``index`` via ``_handle_qa``.

        When ``load_audio`` is set and the item's ``merge_wav`` path exists,
        the waveform is loaded and attached under ``"audio"`` as a numpy
        array; otherwise ``"audio"`` in the returned sample is ``None``.
        """
        metadata = self.metadata[index]

        # Assemble the full record expected by _handle_qa.
        item = {
            "id": metadata["id"],
            "merge_wav": metadata["merge_wav"],
            "gt_score": metadata["gt_score"],
            "clean_dialogue": metadata["clean_dialogue"],
        }

        # Load the audio only when requested and the file is actually there.
        wav_path = metadata["merge_wav"]
        if self.load_audio and wav_path and os.path.exists(wav_path):
            item["audio"] = _handle_wav(wav_path, self.sample_rate).numpy()

        # Convert the record into the prompt/solution sample.
        return _handle_qa(
            item,
            is_think=self.is_think,
            think_max_len=self.think_max_len,
        )