Delta-Vector committed on
Commit
2e11bb3
·
verified ·
1 Parent(s): 791b9ca

Upload dataset_converter.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. dataset_converter.py +203 -0
dataset_converter.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import argparse
4
+ from fuzzywuzzy import fuzz
5
+
6
class DatasetConverter:
    """Convert chat datasets in JSON/JSONL form into ShareGPT-style JSONL.

    Each output line has the shape
    ``{"conversations": [{"from": role, "value": text}, ...]}`` where roles
    are normalized to ``system`` / ``human`` / ``gpt``.
    """

    @staticmethod
    def load_data(input_path: str) -> list:
        """Load records from *input_path*, dispatching on the file extension.

        Raises:
            ValueError: for extensions other than ``.json`` / ``.jsonl``.
        """
        ext = os.path.splitext(input_path)[1].lower()
        if ext == '.json':
            return DatasetConverter.load_json_data(input_path)
        elif ext == '.jsonl':
            return DatasetConverter.load_jsonl_data(input_path)
        else:
            raise ValueError("Unsupported file format")

    @staticmethod
    def load_json_data(input_path: str) -> list:
        """Parse a ``.json`` file into a list of records.

        A single top-level object is wrapped in a list. If the whole file is
        not valid JSON, each line is parsed individually; lines that are not
        valid JSON either are routed through :meth:`fallback_parse_line`.
        Returns an empty list when the file is not valid UTF-8.
        """
        data = []
        try:
            with open(input_path, 'r', encoding='utf-8') as f:
                file_content = f.read()
                try:
                    data = json.loads(file_content)
                    if not isinstance(data, list):
                        data = [data]
                except json.JSONDecodeError:
                    print("JSON Decode Error. Attempting to process line by line.")
                    for line in file_content.splitlines():
                        line = line.strip()
                        if line:
                            try:
                                json_object = json.loads(line)
                                # Only dicts are usable records; other JSON
                                # scalars/arrays on a line are dropped.
                                if isinstance(json_object, dict):
                                    data.append(json_object)
                            except json.JSONDecodeError:
                                print(f"Skipping invalid JSON line: {line}")
                                data.extend(DatasetConverter.fallback_parse_line(line))
        except UnicodeDecodeError:
            print("Unicode Decode Error. Ensure file is encoded in UTF-8.")
        return data

    @staticmethod
    def load_jsonl_data(input_path: str) -> list:
        """Parse a ``.jsonl`` file, one JSON object per non-blank line.

        Invalid lines are routed through :meth:`fallback_parse_line` instead
        of being discarded. Returns an empty list when the file is not valid
        UTF-8.
        """
        data = []
        try:
            with open(input_path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        try:
                            data.append(json.loads(line))
                        except json.JSONDecodeError:
                            print(f"Skipping invalid JSON line: {line}")
                            data.extend(DatasetConverter.fallback_parse_line(line))
        except UnicodeDecodeError:
            print("Unicode Decode Error. Ensure file is encoded in UTF-8.")
        return data

    @staticmethod
    def extract_conversations(entry: dict) -> list:
        """Normalize one dataset record into a list of conversation turns.

        Three record shapes are supported, in priority order:
        a ShareGPT-style ``conversations`` list, a ``system``/``completion``
        pair, and an OpenAI-style ``messages`` list. Roles are mapped
        ``user`` -> ``human`` and ``assistant`` -> ``gpt``; optional
        ``prefix`` and ``loss`` keys are carried through unchanged.
        Returns a single placeholder turn when nothing could be extracted.
        """
        conversations = []
        if 'conversations' in entry:
            for message in entry['conversations']:
                role = message.get('from')
                if role == 'user':
                    role = 'human'
                conv_entry = {
                    "from": role if role != 'assistant' else 'gpt',
                    # A missing, empty, or explicit-None value becomes ''.
                    "value": message.get('value') or '',
                }
                if 'prefix' in message:
                    conv_entry['prefix'] = message['prefix']
                if 'loss' in message:
                    conv_entry['loss'] = message['loss']
                conversations.append(conv_entry)
        else:
            if 'system' in entry:
                conversations.append({"from": "system", "value": entry['system'].strip()})
            if 'completion' in entry:
                DatasetConverter.process_completion(entry['completion'], conversations)
            elif 'messages' in entry:
                for message in entry.get('messages', []):
                    if isinstance(message, dict):
                        # Same normalization as the completion path; the
                        # original duplicated this logic inline.
                        DatasetConverter.add_conversation(message, conversations)
        if not conversations:
            return [{"from": "system", "value": "No conversations found."}]
        return conversations

    @staticmethod
    def process_completion(completion: dict, conversations: list):
        """Append turns from a ``completion`` field.

        Accepts either a list of message dicts or a JSON string encoding one;
        any other shape (or an undecodable string) is silently ignored.
        """
        if isinstance(completion, list):
            for message in completion:
                DatasetConverter.add_conversation(message, conversations)
        elif isinstance(completion, str):
            try:
                completion_json = json.loads(completion)
                if isinstance(completion_json, list):
                    for message in completion_json:
                        DatasetConverter.add_conversation(message, conversations)
            except json.JSONDecodeError:
                pass

    @staticmethod
    def add_conversation(message: dict, conversations: list):
        """Append one normalized turn from an OpenAI-style message dict."""
        role = message.get('role')
        if role == 'user':
            role = 'human'
        elif role == 'assistant':
            role = 'gpt'
        conv_entry = {
            "from": role,
            # `content` may be absent or explicitly None; both become ''.
            # (The original called .strip() on a possible None -> AttributeError.)
            "value": (message.get('content') or '').strip(),
        }
        if 'prefix' in message:
            conv_entry['prefix'] = message['prefix']
        if 'loss' in message:
            conv_entry['loss'] = message['loss']
        conversations.append(conv_entry)

    @staticmethod
    def fallback_parse_line(line: str) -> list:
        """Best-effort recovery of conversation turns from a non-JSON line.

        First looks for explicit ``system:``/``user:``/``assistant:`` markers,
        then fuzzy-matches the whole line against the role names, and finally
        labels the line ``unknown``.
        """
        conversations = []
        # Map each marker directly to the normalized role so this path agrees
        # with extract_conversations (the original left 'user' unmapped,
        # emitting "user" where every other path emits "human").
        keywords = {'system': 'system:', 'human': 'user:', 'gpt': 'assistant:'}
        for role, keyword in keywords.items():
            if keyword in line:
                value = line.split(keyword, 1)[1].strip()
                conversations.append({"from": role, "value": value})
        if not conversations:
            # NOTE(review): fuzz comes from the third-party (and unmaintained)
            # fuzzywuzzy package imported at module level; difflib could
            # replace it if the dependency should be dropped.
            normalized = {'system': 'system', 'user': 'human', 'assistant': 'gpt'}
            for role in ('system', 'user', 'assistant'):
                if fuzz.ratio(line.lower(), role) > 70:
                    conversations.append({"from": normalized[role], "value": line.strip()})
                    break
        if not conversations:
            conversations.append({"from": "unknown", "value": line.strip()})
        return conversations

    @staticmethod
    def validate_jsonl(output_path: str):
        """Re-read *output_path* and raise ValueError on any invalid line."""
        with open(output_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f, 1):
                line = line.strip()
                if line:
                    try:
                        json.loads(line)
                    except json.JSONDecodeError:
                        print(f"Invalid JSON at line {i}: {line}")
                        raise ValueError(f"Invalid JSONL format detected at line {i}.")
        print("Validation completed: The output is proper JSONL.")

    @staticmethod
    def process_data(data: list, output_path: str) -> list:
        """Write normalized conversations to *output_path* as JSONL.

        Returns up to the first three formatted entries as a preview.
        NOTE(review): conversations_found flips True for every written entry,
        including placeholder 'No conversations found.' turns, so the status
        message only reports 'No conversations' for an empty dataset.
        """
        preview_entries = []
        conversations_found = False
        with open(output_path, 'w', encoding='utf-8') as f:
            for entry in data:
                conversations = DatasetConverter.extract_conversations(entry)
                formatted_entry = {"conversations": conversations}
                f.write(json.dumps(formatted_entry, ensure_ascii=False) + '\n')
                conversations_found = True
                if len(preview_entries) < 3:
                    preview_entries.append(formatted_entry)
        status_message = "Conversations completed successfully." if conversations_found else "No conversations found for this dataset."
        print(status_message)
        DatasetConverter.validate_jsonl(output_path)
        return preview_entries

    @staticmethod
    def process_multiple_files(input_paths: list, output_dir: str) -> dict:
        """Convert each input file into ``<output_dir>/<stem>.jsonl``.

        Returns a mapping of input basename -> preview entries.
        """
        preview_entries = {}
        for input_path in input_paths:
            filename = os.path.basename(input_path)
            output_path = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}.jsonl")
            # Fix: the original f-string had no placeholder and printed a
            # literal redaction token instead of the path being processed.
            print(f"Processing file: {input_path}")
            data = DatasetConverter.load_data(input_path)
            preview = DatasetConverter.process_data(data, output_path)
            preview_entries[filename] = preview
        return preview_entries
194
+
195
def _cli() -> None:
    """Command-line entry point: convert every input file into output_dir."""
    arg_parser = argparse.ArgumentParser(
        description="Convert JSON/JSONL datasets into structured conversation format."
    )
    arg_parser.add_argument("input", nargs="+", help="Input file paths (JSON/JSONL)")
    arg_parser.add_argument("output_dir", help="Output directory for the processed files")
    opts = arg_parser.parse_args()
    # Ensure the destination exists before any file is written.
    os.makedirs(opts.output_dir, exist_ok=True)
    results = DatasetConverter().process_multiple_files(opts.input, opts.output_dir)
    print("Preview of processed conversations:", json.dumps(results, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    _cli()