import sys
import subprocess
import os
import ctypes

# --- CUDA DEPENDENCY FIX START ---
def ensure_cuda_runtime():
    """
    Ensures that the CUDA runtime libraries are available.
    If libcudart.so.12 or libcublas.so.12 is missing, installs the necessary packages
    and restarts the script with the correct LD_LIBRARY_PATH.
    """
    missing_libs = []
    lib_paths = []

    # 1. Check for libcudart (CUDA runtime)
    try:
        ctypes.CDLL("libcudart.so.12")
    except OSError:
        print("libcudart.so.12 not found.")
        missing_libs.append("nvidia-cuda-runtime-cu12")

    # 2. Check for libcublas (cuBLAS)
    try:
        ctypes.CDLL("libcublas.so.12")
    except OSError:
        print("libcublas.so.12 not found.")
        missing_libs.append("nvidia-cublas-cu12")

    # 3. Check for libcuda.so.1 (NVIDIA driver).
    # The driver CANNOT be installed via pip; it must be mounted from the host.
    # If it is missing, we fall back to CPU mode.
    driver_found = False
    try:
        ctypes.CDLL("libcuda.so.1")
        driver_found = True
    except OSError:
        print("libcuda.so.1 (NVIDIA driver) not found via dlopen.")
        # Search common locations
        search_paths = [
            "/usr/lib/x86_64-linux-gnu",
            "/usr/lib64",
            "/usr/lib",
            "/usr/local/cuda/lib64",
            "/usr/lib/wsl/lib",
        ]
        for path in search_paths:
            if os.path.exists(os.path.join(path, "libcuda.so.1")):
                print(f"Found libcuda.so.1 manually at {path}")
                lib_paths.append(path)
                driver_found = True
                break

    if not driver_found and not missing_libs:
        # The runtime libraries are present but the driver is not, so GPU inference cannot work.
        # Set an env var so the install step below falls back to the CPU-only wheel.
        print("CRITICAL: NVIDIA driver (libcuda.so.1) not found. GPU acceleration will fail.")
        print("Switching to CPU-only mode for this run.")
        os.environ["FORCE_CPU_MODE"] = "1"

    if not missing_libs and driver_found:
        return  # All libraries and the driver were found

    if missing_libs:
        print(f"Missing CUDA libraries. Installing: {', '.join(missing_libs)}...")
        # 4. Install the missing packages
        subprocess.check_call([sys.executable, "-m", "pip", "install"] + missing_libs)

        # 5. Find the library paths pip just created
        import site
        for sp in site.getsitepackages():
            # Runtime libs
            rt_path = os.path.join(sp, "nvidia", "cuda_runtime", "lib")
            if os.path.isdir(rt_path):
                lib_paths.append(rt_path)
            # cuBLAS libs
            cublas_path = os.path.join(sp, "nvidia", "cublas", "lib")
            if os.path.isdir(cublas_path):
                lib_paths.append(cublas_path)

    if not lib_paths and missing_libs:
        print("Warning: could not find nvidia lib paths after installation.")
        return

    # 6. Update LD_LIBRARY_PATH and restart
    current_ld = os.environ.get("LD_LIBRARY_PATH", "")
    # Add only paths that are not already present
    new_ld_parts = [p for p in lib_paths if p not in current_ld]
    if new_ld_parts:
        print(f"Adding {len(new_ld_parts)} paths to LD_LIBRARY_PATH and restarting...")
        new_ld = os.pathsep.join(new_ld_parts)
        if current_ld:
            new_ld = f"{new_ld}{os.pathsep}{current_ld}"
        os.environ["LD_LIBRARY_PATH"] = new_ld
        # The FORCE_CPU_MODE flag, if set above, is carried over in the copied environment
        env = os.environ.copy()
        # Re-execute the current script with the new environment
        os.execvpe(sys.executable, [sys.executable] + sys.argv, env)

ensure_cuda_runtime()
# --- CUDA DEPENDENCY FIX END ---
# --- INSTALLATION BLOCK ---
# We check if llama-cpp-python is installed, and if not, we force-install from the CUDA index.
# This avoids the "Building wheel" loop and the "libc.musl" error.
try:
    import llama_cpp
    # Trigger a load to verify it works (catches the RuntimeError if libs are missing)
    llama_cpp.Llama
    print("llama-cpp-python is correctly installed.")
except (ImportError, RuntimeError, OSError) as e:
    print(f"llama-cpp-python needs installation or repair: {e}")
    if os.environ.get("FORCE_CPU_MODE") == "1":
        print("Installing CPU-only llama-cpp-python (fallback)...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "llama-cpp-python",
            "--prefer-binary",
            "--force-reinstall",
        ])
    else:
        print("Installing llama-cpp-python from the pre-built wheel index (CUDA)...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "llama-cpp-python",
            "--extra-index-url", "https://abetlen.github.io/llama-cpp-python/whl/cu121",
            "--prefer-binary",
            "--force-reinstall",
        ])
    print("Installation complete.")
# --- IMPORTS AFTER INSTALL ---
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# 1. CONFIGURATION
REPO_ID = "unsloth/Llama-3.2-1B-Instruct-GGUF"
FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"

# 2. DOWNLOAD THE GGUF MODEL
# This downloads the file to a local cache in the container.
print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=FILENAME,
)
print(f"Model downloaded to: {model_path}")

# 3. LOAD MODEL
# n_ctx=2048 is the context window. Adjust it if your finetune supports more.
# n_threads=2 matches the free tier, which has 2 vCPUs.
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    verbose=False,
)
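
# GPU offload (an assumption, not in the original script): the CUDA fix above only makes the
# libraries loadable; llama-cpp-python still runs fully on CPU unless layers are offloaded
# via n_gpu_layers. A minimal sketch, reusing the FORCE_CPU_MODE flag set by ensure_cuda_runtime():
#
# llm = Llama(
#     model_path=model_path,
#     n_ctx=2048,
#     n_threads=2,
#     n_gpu_layers=0 if os.environ.get("FORCE_CPU_MODE") == "1" else -1,  # -1 offloads all layers
#     verbose=False,
# )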
# 4. INFERENCE FUNCTION
def chat_stream(message, history):
    # Build the message list instead of a manual prompt string.
    # Llama 3.2 Instruct expects its standard special tokens, and llama-cpp-python's
    # create_chat_completion applies the chat template stored in the GGUF metadata,
    # so the formatting is handled for us.
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    # Stream the response
    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        stream=True,
        temperature=0.7,
    )
    partial_message = ""
    for chunk in stream:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial_message += delta["content"]
            yield partial_message
# 5. UI SETUP
demo = gr.ChatInterface(
    fn=chat_stream,
    title="Mouse (Llama 3.2 1B Finetune)",
    description=f"Running {FILENAME} on CPU",
    examples=["Hello!", "Why is the sky blue?"],
)

if __name__ == "__main__":
    demo.launch()
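
# Quick local check (a sketch, not part of the original app; SMOKE_TEST is a hypothetical
# env var used only for illustration): exercise chat_stream directly, without the Gradio UI,
# to confirm the model loads and streams tokens.
#
# if os.environ.get("SMOKE_TEST") == "1":
#     last = ""
#     for last in chat_stream("Why is the sky blue?", history=[]):
#         pass
#     print(last)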