Spaces:

ahaahaaha
/

adaptive_rag

Paused

App Files Files Community

lanny xu committed on Oct 23

Commit

a576aa9

1 Parent(s): c33bb69

resolve conflict

Browse files

Files changed (3) hide show

entity_extractor.py +125 -3
graph_indexer.py +81 -22
requirements_graphrag.txt +3 -0

entity_extractor.py CHANGED Viewed

@@ -5,6 +5,9 @@
 from typing import List, Dict, Tuple
 import time
 try:
     from langchain_core.prompts import PromptTemplate
 except ImportError:
@@ -16,14 +19,15 @@ from config import LOCAL_LLM
 class EntityExtractor:
-    """实体提取器 - 使用LLM从文本中提取实体"""
-    def __init__(self, timeout: int = 60, max_retries: int = 3):
         """初始化实体提取器
         Args:
             timeout: LLM调用超时时间（秒）
             max_retries: 失败重试次数
         """
         self.llm = ChatOllama(
             model=LOCAL_LLM,
@@ -32,6 +36,8 @@ class EntityExtractor:
             timeout=timeout  # 添加超时设置
         )
         self.max_retries = max_retries
         # 实体提取提示模板
         self.entity_prompt = PromptTemplate(
@@ -175,9 +181,124 @@ class EntityExtractor:
                     return []
         return []
     def extract_from_document(self, document_text: str, doc_index: int = 0) -> Dict:
         """
-        从单个文档中提取实体和关系
         Args:
             document_text: 文档文本
@@ -186,6 +307,7 @@ class EntityExtractor:
         Returns:
             包含实体和关系的字典
         """
         print(f"\n🔍 文档 #{doc_index + 1}: 开始提取...")
         entities = self.extract_entities(document_text)

 from typing import List, Dict, Tuple
 import time
+import asyncio
+import aiohttp
+import json
 try:
     from langchain_core.prompts import PromptTemplate
 except ImportError:
 class EntityExtractor:
+    """实体提取器 - 使用LLM从文本中提取实体（支持异步批处理）"""
+    def __init__(self, timeout: int = 60, max_retries: int = 3, enable_async: bool = True):
         """初始化实体提取器
         Args:
             timeout: LLM调用超时时间（秒）
             max_retries: 失败重试次数
+            enable_async: 是否启用异步处理（默认启用）
         """
         self.llm = ChatOllama(
             model=LOCAL_LLM,
             timeout=timeout  # 添加超时设置
         )
         self.max_retries = max_retries
+        self.enable_async = enable_async
+        self.ollama_url = "http://localhost:11434/api/generate"
         # 实体提取提示模板
         self.entity_prompt = PromptTemplate(
                     return []
         return []
+    async def _async_llm_call(self, prompt: str, session: aiohttp.ClientSession, attempt: int = 0) -> Dict:
+        """Send one prompt to the Ollama generate endpoint and return the decoded JSON object.
+        Failed attempts are retried until attempt index ``self.max_retries - 1``
+        has been used; the error raised by the last attempt propagates unchanged.
+        Args:
+            prompt: Prompt text placed in the request's "prompt" field.
+            session: aiohttp session used to issue the POST request.
+            attempt: Zero-based index of the first attempt. A caller running its
+                own retry loop can pass its loop index to shorten the retries
+                performed here.
+        Returns:
+            The JSON object decoded from the "response" field of the reply.
+        Raises:
+            asyncio.TimeoutError: The last attempt exceeded the request timeout.
+            Exception: The last attempt failed for any other reason (non-200
+                status, undecodable or non-object JSON, transport error).
+        """
+        # Fall back to 60 seconds when the LLM wrapper has no timeout attribute or
+        # it is None, so a missing setting cannot become an unbounded request.
+        total_timeout = getattr(self.llm, 'timeout', None)
+        if total_timeout is None:
+            total_timeout = 60
+        payload = {
+            "model": LOCAL_LLM,
+            "prompt": prompt,
+            "format": "json",
+            "stream": False,
+            "options": {"temperature": 0}
+        }
+        # Retry iteratively: recursing from inside the except block chained every
+        # earlier failure onto the final traceback and needed two duplicate handlers.
+        while True:
+            try:
+                async with session.post(
+                    self.ollama_url,
+                    json=payload,
+                    timeout=aiohttp.ClientTimeout(total=total_timeout)
+                ) as response:
+                    if response.status != 200:
+                        raise Exception(f"API返回错误: {response.status}")
+                    result = await response.json()
+                parsed = json.loads(result.get('response', '{}'))
+                if not isinstance(parsed, dict):
+                    # Callers use .get() on the result, so anything but an object is a failed attempt.
+                    raise ValueError(f"响应不是JSON对象: {type(parsed).__name__}")
+                return parsed
+            except Exception as exc:
+                if attempt >= self.max_retries - 1:
+                    raise
+                # Timeouts back off linearly (2s, 4s, ...); any other failure waits one second.
+                delay = (attempt + 1) * 2 if isinstance(exc, asyncio.TimeoutError) else 1
+                await asyncio.sleep(delay)
+                attempt += 1
+    async def _extract_entities_async(self, text: str, doc_index: int, session: aiohttp.ClientSession) -> List[Dict]:
+        """Extract entities from one document through the asynchronous LLM call.
+        Args:
+            text: Document text; only its first 2000 characters go into the prompt.
+            doc_index: Zero-based document index, shown one-based in progress output.
+            session: aiohttp session forwarded to ``_async_llm_call``.
+        Returns:
+            The list stored under the "entities" key of the model reply, or an
+            empty list once every attempt has failed.
+        """
+        prompt = self.entity_prompt.format(text=text[:2000])
+        tag = f"[文档 #{doc_index + 1}]"
+        # NOTE(review): ``_async_llm_call`` also retries, starting at ``attempt``, so the
+        # number of HTTP requests can exceed ``max_retries`` — confirm this is intended.
+        for attempt in range(self.max_retries):
+            # Documents are processed concurrently, so every message is a complete,
+            # tagged line; half-lines printed with end="" interleave across tasks.
+            print(f"   {tag} 🔄 提取实体 (尝试 {attempt + 1}/{self.max_retries})...")
+            try:
+                result = await self._async_llm_call(prompt, session, attempt)
+                entities = result.get("entities", [])
+                if not isinstance(entities, list):
+                    # A non-list value would break relation extraction later; treat it as a failed attempt.
+                    raise TypeError(f"entities 不是列表: {type(entities).__name__}")
+            except Exception as e:
+                print(f"   {tag} ❌ {str(e)[:50]}")
+                continue
+            print(f"   {tag} ✅ {len(entities)} 个实体")
+            return entities
+        return []
+    async def _extract_relations_async(self, text: str, entities: List[Dict], doc_index: int, session: aiohttp.ClientSession) -> List[Dict]:
+        """Extract relations between already-extracted entities of one document.
+        Args:
+            text: Document text; only its first 2000 characters go into the prompt.
+            entities: Entity dicts whose "name" values are listed in the prompt.
+            doc_index: Zero-based document index, shown one-based in progress output.
+            session: aiohttp session forwarded to ``_async_llm_call``.
+        Returns:
+            The list stored under the "relations" key of the model reply, or an
+            empty list when there are no named entities or every attempt failed.
+        """
+        if not entities:
+            return []
+        # Skip malformed entities instead of raising: an exception here sits outside
+        # the retry loop and would propagate out of this coroutine.
+        entity_names = [
+            str(e["name"]) for e in entities
+            if isinstance(e, dict) and e.get("name")
+        ]
+        if not entity_names:
+            return []
+        prompt = self.relation_prompt.format(
+            text=text[:2000],
+            entities=", ".join(entity_names)
+        )
+        tag = f"[文档 #{doc_index + 1}]"
+        # NOTE(review): ``_async_llm_call`` also retries, starting at ``attempt``, so the
+        # number of HTTP requests can exceed ``max_retries`` — confirm this is intended.
+        for attempt in range(self.max_retries):
+            # Every message is a complete, tagged line so output from documents that
+            # run concurrently cannot interleave mid-line.
+            print(f"   {tag} 🔄 提取关系 (尝试 {attempt + 1}/{self.max_retries})...")
+            try:
+                result = await self._async_llm_call(prompt, session, attempt)
+                relations = result.get("relations", [])
+                if not isinstance(relations, list):
+                    raise TypeError(f"relations 不是列表: {type(relations).__name__}")
+            except Exception as e:
+                print(f"   {tag} ❌ {str(e)[:50]}")
+                continue
+            print(f"   {tag} ✅ {len(relations)} 个关系")
+            return relations
+        return []
+    async def _extract_from_document_async(self, document_text: str, doc_index: int, session: aiohttp.ClientSession) -> Dict:
+        """Run entity extraction and then relation extraction for one document.
+        Args:
+            document_text: Text of the document to process.
+            doc_index: Zero-based document index, shown one-based in progress output.
+            session: aiohttp session shared by both extraction steps.
+        Returns:
+            A dict with the keys "entities" and "relations".
+        """
+        print(f"\n🔍 [文档 #{doc_index + 1}] 开始异步提取...")
+        # The two steps run one after the other: relation extraction takes the
+        # entity list as input, so it cannot start before the entities are known.
+        entities = await self._extract_entities_async(document_text, doc_index, session)
+        relations = await self._extract_relations_async(
+            document_text, entities, doc_index, session
+        )
+        print(f"📊 [文档 #{doc_index + 1}] 完成: {len(entities)} 实体, {len(relations)} 关系")
+        extraction = dict(entities=entities, relations=relations)
+        return extraction
+    async def extract_batch_async(self, documents: List[Tuple[str, int]]) -> List[Dict]:
+        """Extract entities and relations for several documents concurrently.
+        Args:
+            documents: ``(document_text, doc_index)`` tuples.
+        Returns:
+            One result dict per input document, in input order. A document whose
+            task failed or was cancelled yields ``{"entities": [], "relations": []}``.
+        """
+        if not documents:
+            # Nothing to process; do not open an HTTP session for an empty batch.
+            return []
+        async with aiohttp.ClientSession() as session:
+            tasks = [
+                self._extract_from_document_async(doc_text, doc_idx, session)
+                for doc_text, doc_idx in documents
+            ]
+            # All documents of the batch run concurrently; failures come back as values.
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+        processed_results = []
+        for (_, doc_idx), result in zip(documents, results):
+            # Check BaseException, not Exception: a cancelled task is returned as
+            # CancelledError, which derives from BaseException on Python 3.8+ and
+            # would otherwise be appended as if it were a result dict.
+            if isinstance(result, BaseException):
+                print(f"⚠️ 文档 #{doc_idx + 1} 处理失败: {result}")
+                processed_results.append({"entities": [], "relations": []})
+            else:
+                processed_results.append(result)
+        return processed_results
     def extract_from_document(self, document_text: str, doc_index: int = 0) -> Dict:
         """
+        从单个文档中提取实体和关系（同步接口，保持向后兼容）
         Args:
             document_text: 文档文本
         Returns:
             包含实体和关系的字典
         """
+        # 同步方式调用（保持向后兼容）
         print(f"\n🔍 文档 #{doc_index + 1}: 开始提取...")
         entities = self.extract_entities(document_text)

graph_indexer.py CHANGED Viewed

@@ -4,6 +4,7 @@ GraphRAG索引器
 """
 from typing import List, Dict, Optional
 try:
     from langchain_core.documents import Document
 except ImportError:
@@ -16,17 +17,26 @@ from knowledge_graph import KnowledgeGraph, CommunitySummarizer
 class GraphRAGIndexer:
     """GraphRAG索引器 - 实现Microsoft GraphRAG的索引流程"""
-    def __init__(self):
         print("🚀 初始化GraphRAG索引器...")
-        self.entity_extractor = EntityExtractor()
         self.entity_deduplicator = EntityDeduplicator()
         self.knowledge_graph = KnowledgeGraph()
         self.community_summarizer = CommunitySummarizer()
         self.indexed = False
-        print("✅ GraphRAG索引器初始化完成")
     def index_documents(self, documents: List[Document],
                        batch_size: int = 10,
@@ -58,27 +68,35 @@ class GraphRAGIndexer:
         # 步骤1: 实体和关系提取
         print("📍 步骤 1/5: 实体和关系提取")
         extraction_results = []
-        total_batches = (len(documents) - 1) // batch_size + 1
-        for i in range(0, len(documents), batch_size):
-            batch = documents[i:i+batch_size]
-            batch_num = i // batch_size + 1
-            print(f"\n⚙️  === 批次 {batch_num}/{total_batches} (文档 {i+1}-{min(i+batch_size, len(documents))}) ===")
-            for idx, doc in enumerate(batch):
-                doc_global_index = i + idx
-                try:
-                    result = self.entity_extractor.extract_from_document(
-                        doc.page_content,
-                        doc_index=doc_global_index
-                    )
-                    extraction_results.append(result)
-                except Exception as e:
-                    print(f"   ❌ 文档 #{doc_global_index + 1} 处理失败: {e}")
-                    # 添加空结果以保持索引一致
-                    extraction_results.append({"entities": [], "relations": []})
-            print(f"✅ 批次 {batch_num}/{total_batches} 完成")
         # 步骤2: 实体去重
         print("\n📍 步骤 2/5: 实体去重和合并")
@@ -142,6 +160,47 @@ class GraphRAGIndexer:
         return self.knowledge_graph
     def get_graph(self) -> KnowledgeGraph:
         """获取知识图谱"""
         if not self.indexed:

 """
 from typing import List, Dict, Optional
+import asyncio
 try:
     from langchain_core.documents import Document
 except ImportError:
 class GraphRAGIndexer:
     """GraphRAG索引器 - 实现Microsoft GraphRAG的索引流程"""
+    def __init__(self, enable_async: bool = True, async_batch_size: int = 5):
+        """Create the indexer together with its extraction and graph components.
+        Args:
+            enable_async: Use the asynchronous extraction path when True (the default).
+            async_batch_size: Number of documents handled per asynchronous batch (default 5).
+        """
         print("🚀 初始化GraphRAG索引器...")
+        # Components are built in a fixed order; their constructors may produce output.
+        self.entity_extractor = EntityExtractor(enable_async=enable_async)
         self.entity_deduplicator = EntityDeduplicator()
         self.knowledge_graph = KnowledgeGraph()
         self.community_summarizer = CommunitySummarizer()
+        self.enable_async = enable_async
+        self.async_batch_size = async_batch_size
         self.indexed = False
+        if enable_async:
+            mode = "异步模式"
+        else:
+            mode = "同步模式"
+        print(f"✅ GraphRAG索引器初始化完成 ({mode}, 并发数={async_batch_size})")
     def index_documents(self, documents: List[Document],
                        batch_size: int = 10,
         # 步骤1: 实体和关系提取
         print("📍 步骤 1/5: 实体和关系提取")
         extraction_results = []
+        if self.enable_async:
+            # 异步批量处理模式
+            print(f"🚀 使用异步处理模式，并发数={self.async_batch_size}")
+            extraction_results = self._extract_async(documents)
+        else:
+            # 同步处理模式（原有逻辑）
+            print("🔄 使用同步处理模式")
+            total_batches = (len(documents) - 1) // batch_size + 1
+            for i in range(0, len(documents), batch_size):
+                batch = documents[i:i+batch_size]
+                batch_num = i // batch_size + 1
+                print(f"\n⚙️  === 批次 {batch_num}/{total_batches} (文档 {i+1}-{min(i+batch_size, len(documents))}) ===")
+                for idx, doc in enumerate(batch):
+                    doc_global_index = i + idx
+                    try:
+                        result = self.entity_extractor.extract_from_document(
+                            doc.page_content,
+                            doc_index=doc_global_index
+                        )
+                        extraction_results.append(result)
+                    except Exception as e:
+                        print(f"   ❌ 文档 #{doc_global_index + 1} 处理失败: {e}")
+                        # 添加空结果以保持索引一致
+                        extraction_results.append({"entities": [], "relations": []})
+                print(f"✅ 批次 {batch_num}/{total_batches} 完成")
         # 步骤2: 实体去重
         print("\n📍 步骤 2/5: 实体去重和合并")
         return self.knowledge_graph
+    def _extract_async(self, documents: List[Document]) -> List[Dict]:
+        """Extract entities and relations batch by batch using the async extractor.
+        Args:
+            documents: Documents whose ``page_content`` is sent to the extractor.
+        Returns:
+            One result dict per document, in document order; every document of a
+            failed batch gets ``{"entities": [], "relations": []}``.
+        """
+        from concurrent.futures import ThreadPoolExecutor
+        def run_batch(batch):
+            # asyncio.run() raises RuntimeError while an event loop is already running
+            # in this thread (notebooks, async web frameworks). In that case the batch
+            # runs on a fresh loop in a worker thread; this call still blocks until done.
+            try:
+                asyncio.get_running_loop()
+            except RuntimeError:
+                return asyncio.run(self.entity_extractor.extract_batch_async(batch))
+            with ThreadPoolExecutor(max_workers=1) as pool:
+                return pool.submit(
+                    lambda: asyncio.run(self.entity_extractor.extract_batch_async(batch))
+                ).result()
+        total_docs = len(documents)
+        # A step below 1 would make range() raise (0) or yield no batches (negative).
+        step = max(1, self.async_batch_size)
+        total_batches = (total_docs - 1) // step + 1
+        extraction_results = []
+        for batch_num, i in enumerate(range(0, total_docs, step), start=1):
+            batch_end = min(i + step, total_docs)
+            print(f"\n⚡ === 异步批次 {batch_num}/{total_batches} (文档 {i+1}-{batch_end}) ===")
+            # Pair each document's text with its global index for progress output.
+            async_batch = [
+                (documents[idx].page_content, idx)
+                for idx in range(i, batch_end)
+            ]
+            try:
+                batch_results = run_batch(async_batch)
+            except Exception as e:
+                print(f"❌ 异步批次 {batch_num} 失败: {e}")
+                # Pad the failed batch so results stay aligned with the documents.
+                batch_results = [{"entities": [], "relations": []} for _ in async_batch]
+            else:
+                print(f"✅ 异步批次 {batch_num}/{total_batches} 完成")
+            extraction_results.extend(batch_results)
+        return extraction_results
     def get_graph(self) -> KnowledgeGraph:
         """获取知识图谱"""
         if not self.indexed:

requirements_graphrag.txt CHANGED Viewed

@@ -35,3 +35,6 @@ plotly>=5.18.0
 # 缓存和性能优化
 diskcache>=5.6.0
 joblib>=1.3.0

 # 缓存和性能优化
 diskcache>=5.6.0
 joblib>=1.3.0
+# 异步HTTP请求（用于并发处理）
+aiohttp>=3.9.0