Bi Yoo committed
Commit 96ef181 · 1 Parent(s): 38bd823

update model, too slow

Files changed (1): config.py (+5 -5)
config.py CHANGED
@@ -15,11 +15,11 @@ HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY", "")
 HUGGINGFACE_MODEL = "google/gemma-2-2b-it"
 
 # Local model configuration (for quantized models hosted within the Space)
-LOCAL_MODEL_REPO = os.getenv("LOCAL_MODEL_REPO", "bartowski/Qwen_Qwen3-4B-Instruct-2507-GGUF")
-LOCAL_MODEL_FILENAME = os.getenv("LOCAL_MODEL_FILENAME", "Qwen_Qwen3-4B-Instruct-2507-Q4_K_M.gguf")
-LOCAL_MODEL_CONTEXT_LENGTH = int(os.getenv("LOCAL_MODEL_CONTEXT_LENGTH", "4096"))
+LOCAL_MODEL_REPO = os.getenv("LOCAL_MODEL_REPO", "bartowski/Qwen_Qwen3-1.7B-Instruct-GGUF")
+LOCAL_MODEL_FILENAME = os.getenv("LOCAL_MODEL_FILENAME", "Qwen_Qwen3-1.7B-Instruct-Q4_K_M.gguf")
+LOCAL_MODEL_CONTEXT_LENGTH = int(os.getenv("LOCAL_MODEL_CONTEXT_LENGTH", "2048"))
 LOCAL_MODEL_THREADS = int(os.getenv("LOCAL_MODEL_THREADS", str(os.cpu_count() or 4)))
-LOCAL_MODEL_BATCH_SIZE = int(os.getenv("LOCAL_MODEL_BATCH_SIZE", "512"))
+LOCAL_MODEL_BATCH_SIZE = int(os.getenv("LOCAL_MODEL_BATCH_SIZE", "256"))
 LOCAL_MODEL_MAX_OUTPUT_TOKENS = int(os.getenv("LOCAL_MODEL_MAX_OUTPUT_TOKENS", "200"))
 LOCAL_MODEL_HF_TOKEN = os.getenv("LOCAL_MODEL_HF_TOKEN", HUGGINGFACE_API_KEY or "")
 
@@ -39,7 +39,7 @@ SESSION_TOKEN_TTL_SECONDS = int(os.getenv("SESSION_TOKEN_TTL_SECONDS", "600"))
 EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Fast, lightweight
 CHUNK_SIZE = 300  # Characters per chunk (reduced for faster inference)
 CHUNK_OVERLAP = 30  # Overlap between chunks
-TOP_K_RESULTS = 2  # Number of relevant chunks to retrieve (reduced to minimize context)
+TOP_K_RESULTS = 1  # Fewer chunks lowers prompt size on small CPU tiers
 
 # System prompt for the chatbot
 SYSTEM_PROMPT = """Answer questions about Bi using the provided context. Keep answers short and direct. Always refer to Bi by name."""
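
For context, a minimal sketch of how these settings would typically be consumed if the Space loads the GGUF file with huggingface_hub and llama-cpp-python. Only the config.* names come from config.py; the loading code itself is illustrative, not taken from this repo:

# Hypothetical loader -- assumes the Space uses llama-cpp-python and
# huggingface_hub; only the config.* names are taken from config.py.
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

import config

# Download the quantized GGUF file this commit switches to (Qwen3-1.7B Q4_K_M).
model_path = hf_hub_download(
    repo_id=config.LOCAL_MODEL_REPO,
    filename=config.LOCAL_MODEL_FILENAME,
    token=config.LOCAL_MODEL_HF_TOKEN or None,
)

# A smaller n_ctx and n_batch reduce per-request prompt-processing work,
# which is the cost this commit trades away for speed on CPU.
llm = Llama(
    model_path=model_path,
    n_ctx=config.LOCAL_MODEL_CONTEXT_LENGTH,  # 2048 after this commit
    n_threads=config.LOCAL_MODEL_THREADS,
    n_batch=config.LOCAL_MODEL_BATCH_SIZE,    # 256 after this commit
)

reply = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": config.SYSTEM_PROMPT},
        {"role": "user", "content": "What does Bi work on?"},  # example query
    ],
    max_tokens=config.LOCAL_MODEL_MAX_OUTPUT_TOKENS,  # capped at 200
)
print(reply["choices"][0]["message"]["content"])

Taken together, swapping the 4B model for a 1.7B one, halving the context window and batch size, and retrieving a single chunk (TOP_K_RESULTS = 1) all shrink the prompt the model must process per request, which is the bottleneck the "update model, too slow" message points at on CPU-only hardware.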