|
|
import os
import shutil
from pathlib import Path

import cv2
import imagehash
import numpy as np
from PIL import Image
|
|
|
|
|
|
def calculate_phash(image_path):
    """Compute the perceptual hash (pHash) of an image.

    :param image_path: path to the image file
    :return: perceptual hash (imagehash.ImageHash object), or None when
        the image cannot be opened or hashed
    """
    try:
        # Hash on the grayscale version; pHash only looks at luminance structure.
        grayscale = Image.open(image_path).convert("L")
        return imagehash.phash(grayscale, hash_size=8)
    except Exception as e:
        # Best-effort: report the failure and let the caller treat the
        # frame as "no hash available".
        print(f"计算哈希失败:{image_path},错误:{e}")
        return None
|
|
|
|
|
|
def calculate_clarity(image_path):
    """Score image sharpness with the variance-of-Laplacian method.

    :param image_path: path to the image file
    :return: sharpness score (variance of the Laplacian response);
        0 when the image cannot be read
    """
    frame = cv2.imread(image_path)
    if frame is None:
        # Unreadable / missing file: report the lowest possible sharpness.
        return 0
    # Edges dominate the Laplacian response, so a sharper image yields a
    # higher variance.
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    return np.var(cv2.Laplacian(gray, cv2.CV_64F))
|
|
|
|
|
|
def calculate_hamming_distance(hash1, hash2):
    """Return the Hamming distance between two perceptual hashes.

    :param hash1, hash2: perceptual hashes (imagehash.ImageHash objects)
    :return: Hamming distance (smaller means more similar); infinity when
        either hash is missing so the frames are never treated as similar
    """
    missing = hash1 is None or hash2 is None
    # imagehash.ImageHash implements __sub__ as the Hamming distance.
    return float("inf") if missing else hash1 - hash2
|
|
|
|
|
|
def process_duplicate_frames(input_dir, output_dir, similarity_threshold=10):
    """Deduplicate video frames, keeping the sharpest image of each group
    of visually similar frames.

    Frames are compared in filename order against a rolling "baseline"
    frame; a frame whose pHash Hamming distance to the baseline is at or
    below the threshold is considered a duplicate, and only the sharper
    of the two is kept.

    :param input_dir: folder containing the original frame images
    :param output_dir: folder that receives the deduplicated images
        (created if missing)
    :param similarity_threshold: Hamming-distance cutoff; distances <= this
        value mark two frames as similar
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    img_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp"}
    # Sort by basename so frames are processed in capture order
    # (frame filenames are assumed to sort chronologically).
    img_paths = sorted(
        (
            os.path.join(input_dir, f)
            for f in os.listdir(input_dir)
            if Path(f).suffix.lower() in img_extensions
        ),
        key=os.path.basename,
    )

    if not img_paths:
        print("文件夹中未找到图片!")
        return

    # Seed the baseline with the first frame.
    saved_img_path = img_paths[0]
    saved_phash = calculate_phash(saved_img_path)
    saved_clarity = calculate_clarity(saved_img_path)

    save_name = os.path.basename(saved_img_path)
    # copy2 keeps the original bytes (no lossy cv2 re-encode) and works
    # even when cv2 cannot decode the file.
    shutil.copy2(saved_img_path, os.path.join(output_dir, save_name))
    print(f"初始保存:{save_name},清晰度:{saved_clarity:.2f}")

    for current_img_path in img_paths[1:]:
        current_name = os.path.basename(current_img_path)
        current_phash = calculate_phash(current_img_path)
        current_clarity = calculate_clarity(current_img_path)

        hamming_dist = calculate_hamming_distance(saved_phash, current_phash)
        # os.path.basename is cross-platform; the previous '/'-split broke
        # on Windows paths.
        print(f"\n对比:{os.path.basename(saved_img_path)} vs {current_name}")
        print(f"汉明距离:{hamming_dist},当前图片清晰度:{current_clarity:.2f}")

        if hamming_dist <= similarity_threshold:
            # Similar frame: keep only the sharper of the two.
            if current_clarity > saved_clarity:
                # Current frame wins: swap it in for the stored baseline.
                os.remove(os.path.join(output_dir, os.path.basename(saved_img_path)))
                shutil.copy2(current_img_path, os.path.join(output_dir, current_name))
                print(f"替换:{current_name} 更清晰,已替换原基准图片")

                saved_img_path = current_img_path
                saved_phash = current_phash
                saved_clarity = current_clarity
            else:
                print(f"跳过:{current_name} 模糊,保留原基准图片")
        else:
            # Distinct frame: save it and make it the new baseline.
            shutil.copy2(current_img_path, os.path.join(output_dir, current_name))
            print(f"保存:{current_name} 为新基准,与原基准非相似帧")

            saved_img_path = current_img_path
            saved_phash = current_phash
            saved_clarity = current_clarity

    print(f"\n处理完成!去重后图片保存在:{output_dir}")
    print(f"原始图片数量:{len(img_paths)},去重后数量:{len(os.listdir(output_dir))}")
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Dataset folders to deduplicate; each one's output goes to a sibling
    # "<folder>_dedup" directory.
    input_dirs = [
        "/fi-lib/workspace/sjx/DiffSynth-Studio/dataset/no other choice",
        "/fi-lib/workspace/sjx/DiffSynth-Studio/dataset/the roses",
        "/fi-lib/workspace/sjx/DiffSynth-Studio/dataset/nouvelle",
        "/fi-lib/workspace/sjx/DiffSynth-Studio/dataset/legs",
        "/fi-lib/workspace/sjx/DiffSynth-Studio/dataset/frankenstein",
    ]

    for src_dir in input_dirs:
        # Hamming-distance cutoff: frames at or below it count as duplicates.
        SIMILARITY_THRESHOLD = 10
        process_duplicate_frames(src_dir, f"{src_dir}_dedup", SIMILARITY_THRESHOLD)