lanny xu committed on
Commit
25e7f71
·
1 Parent(s): e427a94

delete files

Browse files
Files changed (1) hide show
  1. entity_extractor.py +48 -6
entity_extractor.py CHANGED
@@ -364,6 +364,35 @@ class EntityDeduplicator:
364
 
365
  self.merge_chain = self.merge_prompt | self.llm | JsonOutputParser()
366
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
367
  def deduplicate_entities(self, entities: List[Dict]) -> Dict:
368
  """
369
  去重实体列表
@@ -384,7 +413,7 @@ class EntityDeduplicator:
384
 
385
  print(f"🔄 开始去重 {len(entities)} 个实体...")
386
 
387
- # 简单的基于名称的去重
388
  unique_entities = {}
389
  entity_mapping = {} # 映射别名到标准名称
390
 
@@ -394,16 +423,29 @@ class EntityDeduplicator:
394
  # 查找是否有相似实体
395
  merged = False
396
  for canonical_name, canonical_entity in unique_entities.items():
397
- # 简单的字符串匹配(可以用LLM做更智能的判断)
398
- if name in canonical_name or canonical_name in name:
399
- entity_mapping[entity["name"]] = canonical_name
 
 
 
 
400
  merged = True
401
  break
 
 
 
 
 
 
 
 
 
402
 
403
  if not merged:
404
  unique_entities[name] = entity
405
- entity_mapping[entity["name"]] = name
406
-
407
  print(f"✅ 去重完成,剩余 {len(unique_entities)} 个唯一实体")
408
 
409
  return {
 
364
 
365
  self.merge_chain = self.merge_prompt | self.llm | JsonOutputParser()
366
 
367
+ def _is_same_entity(self, entity1: Dict, entity2: Dict) -> bool:
368
+ """
369
+ 使用LLM判断两个实体是否指向同一个对象
370
+
371
+ Args:
372
+ entity1: 实体1字典
373
+ entity2: 实体2字典
374
+
375
+ Returns:
376
+ bool: 是否相同
377
+ """
378
+ try:
379
+ # 准备输入
380
+ input_data = {
381
+ "entity1_name": entity1["name"],
382
+ "entity1_desc": entity1.get("description", "无描述"),
383
+ "entity2_name": entity2["name"],
384
+ "entity2_desc": entity2.get("description", "无描述")
385
+ }
386
+
387
+ # 调用LLM
388
+ result = self.merge_chain.invoke(input_data)
389
+
390
+ # 解析结果
391
+ return result.get("is_same", False)
392
+ except Exception as e:
393
+ print(f" ⚠️ LLM判重失败 ({entity1['name']} vs {entity2['name']}): {e}")
394
+ return False
395
+
396
  def deduplicate_entities(self, entities: List[Dict]) -> Dict:
397
  """
398
  去重实体列表
 
413
 
414
  print(f"🔄 开始去重 {len(entities)} 个实体...")
415
 
416
+ # 基于名称和LLM的智能去重
417
  unique_entities = {}
418
  entity_mapping = {} # 映射别名到标准名称
419
 
 
423
  # 查找是否有相似实体
424
  merged = False
425
  for canonical_name, canonical_entity in unique_entities.items():
426
+ # 1. 简单的字符串匹配(作为预筛选)
427
+ # 如果名称完全相同,或者是子串关系,则考虑合并
428
+ is_substring = name in canonical_name or canonical_name in name
429
+
430
+ if name == canonical_name:
431
+ # 完全匹配(忽略大小写),直接合并
432
+ entity_mapping[entity["name"]] = canonical_entity["name"]
433
  merged = True
434
  break
435
+ elif is_substring:
436
+ # 子串匹配,使用LLM进行智能确认
437
+ # 例如:"Python" 和 "Python Programming Language" -> 合并
438
+ # "Java" 和 "JavaScript" -> 不合并
439
+ if self._is_same_entity(entity, canonical_entity):
440
+ print(f" ✨ 合并: {entity['name']} -> {canonical_entity['name']}")
441
+ entity_mapping[entity["name"]] = canonical_entity["name"]
442
+ merged = True
443
+ break
444
 
445
  if not merged:
446
  unique_entities[name] = entity
447
+ entity_mapping[entity["name"]] = entity["name"]
448
+
449
  print(f"✅ 去重完成,剩余 {len(unique_entities)} 个唯一实体")
450
 
451
  return {