# medve / app.py
import sys
import subprocess
import os
import ctypes
# --- CUDA DEPENDENCY FIX START ---
def ensure_cuda_runtime():
    """
    Ensures that the CUDA runtime libraries are available.
    If libcudart.so.12 or libcublas.so.12 is missing, installs the necessary packages
    and restarts the script with the correct LD_LIBRARY_PATH.
    """
    missing_libs = []
    lib_paths = []

    # 1. Check for libcudart (CUDA Runtime)
    try:
        ctypes.CDLL("libcudart.so.12")
    except OSError:
        print("libcudart.so.12 not found.")
        missing_libs.append("nvidia-cuda-runtime-cu12")

    # 2. Check for libcublas (CUDA BLAS)
    try:
        ctypes.CDLL("libcublas.so.12")
    except OSError:
        print("libcublas.so.12 not found.")
        missing_libs.append("nvidia-cublas-cu12")

    # 3. Check for libcuda.so.1 (NVIDIA driver)
    # This CANNOT be installed via pip; it must be mounted from the host.
    # If it is missing, we fall back to CPU mode.
    driver_found = False
    try:
        ctypes.CDLL("libcuda.so.1")
        driver_found = True
    except OSError:
        print("libcuda.so.1 (NVIDIA driver) not found via dlopen.")
        # Search common paths
        search_paths = [
            "/usr/lib/x86_64-linux-gnu",
            "/usr/lib64",
            "/usr/lib",
            "/usr/local/cuda/lib64",
            "/usr/lib/wsl/lib",
        ]
        for path in search_paths:
            if os.path.exists(os.path.join(path, "libcuda.so.1")):
                print(f"Found libcuda.so.1 manually at {path}")
                lib_paths.append(path)
                driver_found = True
                break

    if not driver_found and not missing_libs:
        # Runtime libraries are present but the driver is not, so GPU inference cannot work.
        # Set an env var so the install block below picks the CPU wheel.
        # (If libraries are also missing we install them first and restart; the restarted
        # process then lands in this branch and switches to CPU mode.)
        print("CRITICAL: NVIDIA driver (libcuda.so.1) not found. GPU acceleration will fail.")
        print("Switching to CPU-only mode for this run.")
        os.environ["FORCE_CPU_MODE"] = "1"

    if not missing_libs and driver_found and not lib_paths:
        return  # Everything loads from the default paths; nothing to install or add.

    if missing_libs:
        print(f"Missing CUDA libraries. Installing: {', '.join(missing_libs)}...")
        # 4. Install the missing packages
        subprocess.check_call([sys.executable, "-m", "pip", "install"] + missing_libs)

        # 5. Find the library directories inside the installed nvidia-* packages
        import site
        for sp in site.getsitepackages():
            # Runtime libs
            rt_path = os.path.join(sp, "nvidia", "cuda_runtime", "lib")
            if os.path.isdir(rt_path):
                lib_paths.append(rt_path)
            # cuBLAS libs
            cublas_path = os.path.join(sp, "nvidia", "cublas", "lib")
            if os.path.isdir(cublas_path):
                lib_paths.append(cublas_path)

    if not lib_paths and missing_libs:
        print("Warning: Could not find nvidia lib paths after installation.")
        return

    # 6. Update LD_LIBRARY_PATH and restart
    current_ld = os.environ.get("LD_LIBRARY_PATH", "")
    new_ld_parts = []
    # Add only paths that are not already present
    for p in lib_paths:
        if p not in current_ld:
            new_ld_parts.append(p)

    if new_ld_parts:
        print(f"Adding {len(new_ld_parts)} paths to LD_LIBRARY_PATH and restarting...")
        new_ld = os.pathsep.join(new_ld_parts)
        if current_ld:
            new_ld = f"{new_ld}{os.pathsep}{current_ld}"
        os.environ["LD_LIBRARY_PATH"] = new_ld
        # FORCE_CPU_MODE (if set above) is already in os.environ, so the copy below
        # carries it into the restarted process.
        env = os.environ.copy()
        # Re-execute the current script with the new environment
        os.execvpe(sys.executable, [sys.executable] + sys.argv, env)
ensure_cuda_runtime()
# --- CUDA DEPENDENCY FIX END ---
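# For reference, the restart performed above is roughly equivalent to relaunching
# the app yourself with the discovered directories prepended (illustrative paths
# only; the real ones are detected at runtime):
#   LD_LIBRARY_PATH=<site-packages>/nvidia/cuda_runtime/lib:<site-packages>/nvidia/cublas/lib python app.py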
# --- INSTALLATION BLOCK ---
# We check if installed, and if not, we force install from the CUDA index.
# This avoids the "Building wheel" loop and the "libc.musl" error.
try:
    import llama_cpp
    # Trigger a load to verify it works (catches the RuntimeError if libs are missing)
    llama_cpp.Llama
    print("llama-cpp-python is correctly installed.")
except (ImportError, RuntimeError, OSError) as e:
    print(f"llama-cpp-python needs installation or repair: {e}")
    if os.environ.get("FORCE_CPU_MODE") == "1":
        print("Installing CPU-only llama-cpp-python (Fallback)...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "llama-cpp-python",
            "--prefer-binary",
            "--force-reinstall",
        ])
    else:
        print("Installing llama-cpp-python from pre-built wheel index (CUDA)...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "llama-cpp-python",
            "--extra-index-url", "https://abetlen.github.io/llama-cpp-python/whl/cu121",
            "--prefer-binary",
            "--force-reinstall",
        ])
    print("Installation complete.")
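# The CUDA branch above corresponds roughly to this shell command:
#   pip install llama-cpp-python --prefer-binary --force-reinstall \
#       --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121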
# --- IMPORTS AFTER INSTALL ---
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# 1. CONFIGURATION
REPO_ID = "unsloth/Llama-3.2-1B-Instruct-GGUF"
FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
# 2. DOWNLOAD THE GGUF MODEL
# This downloads the file to a local cache in the container
print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=FILENAME,
)
print(f"Model downloaded to: {model_path}")
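# Note: hf_hub_download stores the file in the Hugging Face cache (typically
# ~/.cache/huggingface/hub unless HF_HOME or HF_HUB_CACHE is set), so repeated
# runs in the same container reuse the already-downloaded file.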
# 3. LOAD MODEL
# n_ctx=2048 is the context window. Adjust if your finetune supports more.
# n_threads=2 is good for the free tier (it has 2 vCPUs).
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    verbose=False,
)
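# Note: without an n_gpu_layers argument, llama.cpp keeps every layer on the CPU,
# so inference here is CPU-only even if the CUDA wheel was installed. If GPU offload
# is wanted (and the driver check above passed), passing e.g. n_gpu_layers=-1 to
# Llama() would offload all layers; treat that as an optional tweak, not something
# this app currently does.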
# 4. INFERENCE FUNCTION
def chat_stream(message, history):
    # Construct the prompt.
    # Llama 3.2 Instruct usually expects its standard special tokens.
    # We could build a manual prompt string if we did not want the built-in chat
    # handler, but llama-cpp-python's 'create_chat_completion' handles the
    # formatting for us IF the GGUF metadata contains the chat template.
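    # For reference, a manually built prompt would look roughly like this
    # (Llama 3.x Instruct header/eot tokens; shown only as an illustration,
    # not read from the GGUF metadata):
    #   <|start_header_id|>user<|end_header_id|>
    #
    #   {message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>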
    messages = []
    # history is expected as a list of (user, assistant) pairs (Gradio's classic tuples format)
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    # Stream response
    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        stream=True,
        temperature=0.7,
    )
    partial_message = ""
    for chunk in stream:
        delta = chunk['choices'][0]['delta']
        if 'content' in delta:
            partial_message += delta['content']
            yield partial_message
# 5. UI SETUP
demo = gr.ChatInterface(
    fn=chat_stream,
    title="Mouse (Llama 3.2 1B Finetune)",
    description=f"Running {FILENAME} on CPU",
    examples=["Hello!", "Why is the sky blue?"],
)
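# gr.ChatInterface accepts a generator function, so each value yielded by
# chat_stream replaces the partial assistant message in the UI as it streams.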
if __name__ == "__main__":
    demo.launch()