import sys
import subprocess
import os
import ctypes
# --- CUDA DEPENDENCY FIX START ---
def ensure_cuda_runtime():
    """
    Ensures that CUDA runtime libraries are available.
    If libcudart.so.12 or libcublas.so.12 is missing, installs the necessary packages
    and restarts the script with the correct LD_LIBRARY_PATH.
    """
    missing_libs = []
    lib_paths = []
    # 1. Check for libcudart (CUDA Runtime)
    try:
        ctypes.CDLL("libcudart.so.12")
    except OSError:
        print("libcudart.so.12 not found.")
        missing_libs.append("nvidia-cuda-runtime-cu12")
    # 2. Check for libcublas (CUDA BLAS)
    try:
        ctypes.CDLL("libcublas.so.12")
    except OSError:
        print("libcublas.so.12 not found.")
        missing_libs.append("nvidia-cublas-cu12")
    # 3. Check for libcuda.so.1 (NVIDIA Driver)
    # This CANNOT be installed via pip. It must be mounted from the host.
    # If missing, we must fall back to CPU mode.
    driver_found = False
    try:
        ctypes.CDLL("libcuda.so.1")
        driver_found = True
    except OSError:
        print("libcuda.so.1 (NVIDIA Driver) not found via dlopen.")
        # Search common paths
        search_paths = [
            "/usr/lib/x86_64-linux-gnu",
            "/usr/lib64",
            "/usr/lib",
            "/usr/local/cuda/lib64",
            "/usr/lib/wsl/lib"
        ]
        for path in search_paths:
            if os.path.exists(os.path.join(path, "libcuda.so.1")):
                print(f"Found libcuda.so.1 manually at {path}")
                lib_paths.append(path)
                driver_found = True
                break
    if not driver_found and not missing_libs:
        # If we have libs but no driver, we can't run GPU.
        # We will set an env var to force CPU install later.
        print("CRITICAL: NVIDIA Driver (libcuda.so.1) not found. GPU acceleration will fail.")
        print("Switching to CPU-only mode for this run.")
        os.environ["FORCE_CPU_MODE"] = "1"
    if not missing_libs and driver_found and not lib_paths:
        return  # Everything loads via dlopen already; nothing to fix
    if missing_libs:
        print(f"Missing CUDA libraries. Installing: {', '.join(missing_libs)}...")
        # 4. Install missing packages
        subprocess.check_call([sys.executable, "-m", "pip", "install"] + missing_libs)
        # 5. Find library paths
        import site
        for sp in site.getsitepackages():
            # Runtime libs
            rt_path = os.path.join(sp, "nvidia", "cuda_runtime", "lib")
            if os.path.isdir(rt_path):
                lib_paths.append(rt_path)
            # Cublas libs
            cublas_path = os.path.join(sp, "nvidia", "cublas", "lib")
            if os.path.isdir(cublas_path):
                lib_paths.append(cublas_path)
    if not lib_paths and missing_libs:
        print("Warning: Could not find nvidia lib paths after installation.")
        return
    # 6. Update LD_LIBRARY_PATH and restart
    current_ld = os.environ.get("LD_LIBRARY_PATH", "")
    new_ld_parts = []
    # Add only new paths
    for p in lib_paths:
        if p not in current_ld:
            new_ld_parts.append(p)
    if new_ld_parts:
        print(f"Adding {len(new_ld_parts)} paths to LD_LIBRARY_PATH and restarting...")
        new_ld = os.pathsep.join(new_ld_parts)
        if current_ld:
            new_ld = f"{new_ld}{os.pathsep}{current_ld}"
        os.environ["LD_LIBRARY_PATH"] = new_ld
        # Pass the FORCE_CPU_MODE flag to the restarted process if set
        env = os.environ.copy()
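        # The dynamic loader reads LD_LIBRARY_PATH only once, at process
        # startup, so mutating os.environ here does not affect dlopen() in the
        # current process. Re-executing the interpreter is what makes the new
        # search path take effect.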
        # Re-execute the current script with the new environment
        os.execvpe(sys.executable, [sys.executable] + sys.argv, env)
ensure_cuda_runtime()
# --- CUDA DEPENDENCY FIX END ---
# --- INSTALLATION BLOCK ---
# We check whether llama-cpp-python is installed and working; if not, we
# force-install a pre-built wheel from the CUDA index.
# This avoids the "Building wheel" loop and the "libc.musl" error.
try:
    import llama_cpp
    # Importing llama_cpp already loads the compiled shared library, which
    # raises if its CUDA dependencies are missing; the attribute access below
    # is a cheap sanity check that the module is intact.
    llama_cpp.Llama
    print("llama-cpp-python is correctly installed.")
except (ImportError, RuntimeError, OSError) as e:
    print(f"llama-cpp-python needs installation or repair: {e}")
    if os.environ.get("FORCE_CPU_MODE") == "1":
        print("Installing CPU-only llama-cpp-python (Fallback)...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "llama-cpp-python",
            "--prefer-binary",
            "--force-reinstall"
        ])
    else:
        print("Installing llama-cpp-python from pre-built wheel index (CUDA)...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "llama-cpp-python",
            "--extra-index-url", "https://abetlen.github.io/llama-cpp-python/whl/cu121",
            "--prefer-binary",
            "--force-reinstall"
        ])
    print("Installation complete.")
# --- IMPORTS AFTER INSTALL ---
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# 1. CONFIGURATION
REPO_ID = "unsloth/Llama-3.2-1B-Instruct-GGUF"
FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
# 2. DOWNLOAD THE GGUF MODEL
# This downloads the file to a local cache in the container
print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=FILENAME
)
print(f"Model downloaded to: {model_path}")
# 3. LOAD MODEL
# n_ctx=2048 is the context window. Adjust if your finetune supports more.
# n_threads=2 is good for the free tier (it has 2 vCPUs).
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    verbose=False
)
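# Note: this constructor runs inference on the CPU. If the CUDA wheel was
# installed above and a driver is present, layers can be offloaded to the GPU
# by additionally passing, e.g., n_gpu_layers=-1 (offload every layer).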
# 4. INFERENCE FUNCTION
def chat_stream(message, history):
    # Build an OpenAI-style message list. A manual Llama 3.2 prompt string is
    # not needed: llama-cpp-python's create_chat_completion handles the
    # formatting, provided the GGUF metadata carries the chat template.
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    # Stream response
    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        stream=True,
        temperature=0.7
    )
    partial_message = ""
    for chunk in stream:
        delta = chunk['choices'][0]['delta']
        if 'content' in delta:
            partial_message += delta['content']
            yield partial_message
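# The streamed chunks mirror the OpenAI format: the first delta typically
# carries only {"role": "assistant"} and no text, which is why chat_stream
# checks for a 'content' key before appending.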
# 5. UI SETUP
demo = gr.ChatInterface(
    fn=chat_stream,
    title="Mouse (Llama 3.2 1B Finetune)",
    description=f"Running {FILENAME} on CPU",
    examples=["Hello!", "Why is the sky blue?"],
)
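# Because chat_stream is a generator, gr.ChatInterface streams partial
# responses to the browser as tokens arrive rather than waiting for the full
# completion.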
if __name__ == "__main__":
    demo.launch()