Spaces:
Paused
Paused
lanny xu
committed on
Commit
·
25e7f71
1
Parent(s):
e427a94
delete files
Browse files- entity_extractor.py +48 -6
entity_extractor.py
CHANGED
|
@@ -364,6 +364,35 @@ class EntityDeduplicator:
|
|
| 364 |
|
| 365 |
self.merge_chain = self.merge_prompt | self.llm | JsonOutputParser()
|
| 366 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
def deduplicate_entities(self, entities: List[Dict]) -> Dict:
|
| 368 |
"""
|
| 369 |
去重实体列表
|
|
@@ -384,7 +413,7 @@ class EntityDeduplicator:
|
|
| 384 |
|
| 385 |
print(f"🔄 开始去重 {len(entities)} 个实体...")
|
| 386 |
|
| 387 |
-
#
|
| 388 |
unique_entities = {}
|
| 389 |
entity_mapping = {} # 映射别名到标准名称
|
| 390 |
|
|
@@ -394,16 +423,29 @@ class EntityDeduplicator:
|
|
| 394 |
# 查找是否有相似实体
|
| 395 |
merged = False
|
| 396 |
for canonical_name, canonical_entity in unique_entities.items():
|
| 397 |
-
#
|
| 398 |
-
|
| 399 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
merged = True
|
| 401 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 402 |
|
| 403 |
if not merged:
|
| 404 |
unique_entities[name] = entity
|
| 405 |
-
entity_mapping[entity["name"]] = name
|
| 406 |
-
|
| 407 |
print(f"✅ 去重完成,剩余 {len(unique_entities)} 个唯一实体")
|
| 408 |
|
| 409 |
return {
|
|
|
|
| 364 |
|
| 365 |
self.merge_chain = self.merge_prompt | self.llm | JsonOutputParser()
|
| 366 |
|
| 367 |
+
def _is_same_entity(self, entity1: Dict, entity2: Dict) -> bool:
    """
    Ask the LLM merge chain whether two entities refer to the same object.

    Args:
        entity1: First entity dict; must contain "name", may contain
            "description".
        entity2: Second entity dict; same layout as ``entity1``.

    Returns:
        bool: True only when the chain's result carries an affirmative
        "is_same" value. Any failure (chain error, unexpected result
        type, missing "name" key) is reported on stdout and treated as
        "not the same", so a failed check never causes a merge.
    """
    try:
        # Build the prompt variables passed to the merge chain.
        input_data = {
            "entity1_name": entity1["name"],
            "entity1_desc": entity1.get("description", "无描述"),
            "entity2_name": entity2["name"],
            "entity2_desc": entity2.get("description", "无描述"),
        }

        # Invoke the LLM chain; a non-dict result makes .get() raise,
        # which is handled below as a failed check.
        result = self.merge_chain.invoke(input_data)
        verdict = result.get("is_same", False)

        # LLMs sometimes emit the flag as a JSON string. A plain truthiness
        # test would treat "false" as True, so normalise strings explicitly.
        if isinstance(verdict, str):
            return verdict.strip().lower() in ("true", "yes", "1")
        return bool(verdict)
    except Exception as e:
        # Best-effort boundary around the LLM call. Use .get() here so the
        # handler cannot raise a second KeyError when "name" is the missing
        # key that triggered the failure in the first place.
        left = entity1.get("name", "?") if isinstance(entity1, dict) else "?"
        right = entity2.get("name", "?") if isinstance(entity2, dict) else "?"
        print(f" ⚠️ LLM判重失败 ({left} vs {right}): {e}")
        return False
|
| 395 |
+
|
| 396 |
def deduplicate_entities(self, entities: List[Dict]) -> Dict:
|
| 397 |
"""
|
| 398 |
去重实体列表
|
|
|
|
| 413 |
|
| 414 |
print(f"🔄 开始去重 {len(entities)} 个实体...")
|
| 415 |
|
| 416 |
+
# 基于名称和LLM的智能去重
|
| 417 |
unique_entities = {}
|
| 418 |
entity_mapping = {} # 映射别名到标准名称
|
| 419 |
|
|
|
|
| 423 |
# 查找是否有相似实体
|
| 424 |
merged = False
|
| 425 |
for canonical_name, canonical_entity in unique_entities.items():
|
| 426 |
+
# 1. 简单的字符串匹配(作为预筛选)
|
| 427 |
+
# 如果名称完全相同,或者是子串关系,则考虑合并
|
| 428 |
+
is_substring = name in canonical_name or canonical_name in name
|
| 429 |
+
|
| 430 |
+
if name == canonical_name:
|
| 431 |
+
# 完全匹配(忽略大小写),直接合并
|
| 432 |
+
entity_mapping[entity["name"]] = canonical_entity["name"]
|
| 433 |
merged = True
|
| 434 |
break
|
| 435 |
+
elif is_substring:
|
| 436 |
+
# 子串匹配,使用LLM进行智能确认
|
| 437 |
+
# 例如:"Python" 和 "Python Programming Language" -> 合并
|
| 438 |
+
# "Java" 和 "JavaScript" -> 不合并
|
| 439 |
+
if self._is_same_entity(entity, canonical_entity):
|
| 440 |
+
print(f" ✨ 合并: {entity['name']} -> {canonical_entity['name']}")
|
| 441 |
+
entity_mapping[entity["name"]] = canonical_entity["name"]
|
| 442 |
+
merged = True
|
| 443 |
+
break
|
| 444 |
|
| 445 |
if not merged:
|
| 446 |
unique_entities[name] = entity
|
| 447 |
+
entity_mapping[entity["name"]] = entity["name"]
|
| 448 |
+
|
| 449 |
print(f"✅ 去重完成,剩余 {len(unique_entities)} 个唯一实体")
|
| 450 |
|
| 451 |
return {
|