Student0809
/

interactSpeech

Model card Files Files and versions

interactSpeech / analyze_dialogue_lengths.py

Student0809's picture

Add files using upload-large-folder tool

fd421e2 verified 5 months ago

history blame contribute delete

3.69 kB

	import json
	from collections import Counter
	import numpy as np
	from typing import List, Dict
	import matplotlib.pyplot as plt

	# Inclusive upper bound -> label for each length bucket, in ascending order.
	# Lengths above the last bound fall into _OVERFLOW_BUCKET.
	_LENGTH_BUCKETS = (
	    (100, '0-100'),
	    (500, '101-500'),
	    (1000, '501-1000'),
	    (2000, '1001-2000'),
	    (3000, '2001-3000'),
	    (4000, '3001-4000'),
	    (5000, '4001-5000'),
	    (6000, '5001-6000'),
	)
	# NOTE: this bucket holds lengths strictly greater than 6000; the label is
	# kept unchanged because it is a key of the returned distribution dict.
	_OVERFLOW_BUCKET = '6000+'


	def _collect_assistant_lengths(file_path):
	    """Read a JSONL file and return (lengths, skipped).

	    ``lengths`` holds the character count of every assistant message whose
	    content is a string; ``skipped`` counts records/messages that were
	    structurally unusable (no ``messages`` list, non-string content).
	    """
	    lengths = []
	    skipped = 0
	    with open(file_path, 'r', encoding='utf-8') as f:
	        for line in f:
	            line = line.strip()
	            if not line:
	                # Blank lines are not records; don't report them as errors.
	                continue
	            try:
	                item = json.loads(line)
	            except json.JSONDecodeError as e:
	                print(f"Error parsing line: {e}")
	                continue

	            messages = item.get('messages') if isinstance(item, dict) else None
	            if not isinstance(messages, list):
	                skipped += 1
	                continue

	            for message in messages:
	                if not isinstance(message, dict):
	                    skipped += 1
	                    continue
	                if message.get('role') != 'assistant':
	                    continue
	                content = message.get('content')
	                if isinstance(content, str):
	                    lengths.append(len(content))
	                else:
	                    # len() of None would crash and len() of a list would
	                    # not be a character count, so such content is skipped.
	                    skipped += 1
	    return lengths, skipped


	def _bucket_lengths(lengths):
	    """Count how many lengths fall into each range of _LENGTH_BUCKETS."""
	    counts = {label: 0 for _, label in _LENGTH_BUCKETS}
	    counts[_OVERFLOW_BUCKET] = 0
	    for length in lengths:
	        label = next(
	            (name for upper, name in _LENGTH_BUCKETS if length <= upper),
	            _OVERFLOW_BUCKET,
	        )
	        counts[label] += 1
	    return counts


	def _save_plots(lengths, length_ranges, output_prefix):
	    """Write the histogram and the per-range bar chart as PNG files."""
	    # Histogram with many bins for a fine-grained view of the distribution.
	    fig = plt.figure(figsize=(12, 6))
	    try:
	        plt.hist(lengths, bins=100, edgecolor='black')
	        plt.title('Distribution of Assistant Response Lengths')
	        plt.xlabel('Length (characters)')
	        plt.ylabel('Frequency')
	        plt.savefig(f'{output_prefix}dialogue_length_distribution.png')
	    finally:
	        # Close even if saving fails so the figure is not leaked.
	        plt.close(fig)

	    # Bar chart of the coarse length ranges.
	    fig = plt.figure(figsize=(12, 6))
	    try:
	        plt.bar(list(length_ranges.keys()), list(length_ranges.values()))
	        plt.title('Distribution of Response Lengths by Range')
	        plt.xlabel('Length Range')
	        plt.ylabel('Count')
	        plt.xticks(rotation=45)
	        plt.tight_layout()
	        plt.savefig(f'{output_prefix}dialogue_length_ranges.png')
	    finally:
	        plt.close(fig)


	def analyze_dialogue_lengths(file_path: str, *, output_prefix: str = '',
	                             save_plots: bool = True) -> Dict:
	    """Summarise the character lengths of assistant messages in a JSONL file.

	    Each non-blank line of ``file_path`` is parsed as a JSON object with a
	    ``messages`` list; every message whose ``role`` is ``'assistant'`` and
	    whose ``content`` is a string contributes ``len(content)``. Lines that
	    are not valid JSON are reported and skipped; structurally malformed
	    records are skipped and summarised in one message.

	    Args:
	        file_path: Path to the JSONL file (read as UTF-8).
	        output_prefix: Text prepended to the two PNG file names, so several
	            datasets can be analysed without overwriting each other's
	            plots. The default keeps the original file names.
	        save_plots: When False, no figures are created or written.

	    Returns:
	        An empty dict when no assistant response was found, otherwise a dict
	        with ``total_responses``, ``max_length``, ``avg_length``,
	        ``median_length`` and ``distribution`` (range label -> percentage).

	    Raises:
	        OSError: If ``file_path`` cannot be opened.
	    """
	    lengths, skipped = _collect_assistant_lengths(file_path)
	    if skipped:
	        print(f"Skipped {skipped} malformed record(s) in {file_path}")

	    if not lengths:
	        print(f"No valid assistant responses found in {file_path}")
	        return {}

	    # Summary statistics.
	    total = len(lengths)
	    max_length = max(lengths)
	    avg_length = np.mean(lengths)
	    median_length = np.median(lengths)

	    # Per-range counts and their share of all responses.
	    length_ranges = _bucket_lengths(lengths)
	    percentages = {k: (v/total)*100 for k, v in length_ranges.items()}

	    # Print results
	    print(f"\nAnalysis Results for {file_path}:")
	    print(f"Total number of assistant responses: {total}")
	    print(f"Maximum length: {max_length} characters")
	    print(f"Average length: {avg_length:.2f} characters")
	    print(f"Median length: {median_length:.2f} characters")
	    print("\nLength Distribution:")
	    for range_name, percentage in percentages.items():
	        print(f"{range_name}: {percentage:.2f}%")

	    if save_plots:
	        _save_plots(lengths, length_ranges, output_prefix)

	    return {
	        'total_responses': total,
	        'max_length': max_length,
	        'avg_length': avg_length,
	        'median_length': median_length,
	        'distribution': percentages
	    }


	if __name__ == "__main__":
	    # Analyze both train and test datasets; distinct prefixes keep the test
	    # plots from overwriting the train plots.
	    train_results = analyze_dialogue_lengths('dataset_cotSFTtrain.json',
	                                             output_prefix='train_')
	    test_results = analyze_dialogue_lengths('dataset_cotSFTtest.json',
	                                            output_prefix='test_')