import sys
import subprocess
import os
import ctypes

# --- CUDA DEPENDENCY FIX START ---
def ensure_cuda_runtime():
    """
    Ensures that CUDA runtime libraries are available. 
    If libcudart.so.12 or libcublas.so.12 is missing, installs the necessary packages
    and restarts the script with the correct LD_LIBRARY_PATH.
    """
    missing_libs = []
    lib_paths = []
    
    # 1. Check for libcudart (CUDA Runtime)
    try:
        ctypes.CDLL("libcudart.so.12")
    except OSError:
        print("libcudart.so.12 not found.")
        missing_libs.append("nvidia-cuda-runtime-cu12")

    # 2. Check for libcublas (CUDA BLAS)
    try:
        ctypes.CDLL("libcublas.so.12")
    except OSError:
        print("libcublas.so.12 not found.")
        missing_libs.append("nvidia-cublas-cu12")

    # 3. Check for libcuda.so.1 (NVIDIA Driver)
    # This CANNOT be installed via pip. It must be mounted from the host.
    # If missing, we must fall back to CPU mode.
    driver_found = False
    try:
        ctypes.CDLL("libcuda.so.1")
        driver_found = True
    except OSError:
        print("libcuda.so.1 (NVIDIA Driver) not found via dlopen.")
        # Search common install locations; /usr/lib/wsl/lib covers WSL2,
        # where the host's driver is mounted into the guest.
        search_paths = [
            "/usr/lib/x86_64-linux-gnu",
            "/usr/lib64",
            "/usr/lib",
            "/usr/local/cuda/lib64",
            "/usr/lib/wsl/lib"
        ]
        for path in search_paths:
            if os.path.exists(os.path.join(path, "libcuda.so.1")):
                print(f"Found libcuda.so.1 manually at {path}")
                lib_paths.append(path)
                driver_found = True
                break
    
    if not driver_found:
        # Without the driver, GPU acceleration is impossible regardless of
        # which runtime libraries are installed, so flag CPU-only mode for
        # the installation block below and stop here.
        print("CRITICAL: NVIDIA Driver (libcuda.so.1) not found. GPU acceleration will fail.")
        print("Switching to CPU-only mode for this run.")
        os.environ["FORCE_CPU_MODE"] = "1"
        return

    if not missing_libs and not lib_paths:
        return  # Everything is already loadable; nothing to do.

    if missing_libs:
        print(f"Missing CUDA libraries. Installing: {', '.join(missing_libs)}...")
        # 4. Install the missing packages
        subprocess.check_call([sys.executable, "-m", "pip", "install"] + missing_libs)

    # 5. Find library paths
    import site
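    # The nvidia-* wheels ship their shared objects under
    # site-packages/nvidia/<component>/lib, which is not on the default
    # loader search path.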
    for sp in site.getsitepackages():
        # Runtime libs
        rt_path = os.path.join(sp, "nvidia", "cuda_runtime", "lib")
        if os.path.isdir(rt_path):
            lib_paths.append(rt_path)
        
        # Cublas libs
        cublas_path = os.path.join(sp, "nvidia", "cublas", "lib")
        if os.path.isdir(cublas_path):
            lib_paths.append(cublas_path)

    if not lib_paths and missing_libs:
        print("Warning: Could not find nvidia lib paths after installation.")
        return

    # 6. Update LD_LIBRARY_PATH and restart
    current_ld = os.environ.get("LD_LIBRARY_PATH", "")
    new_ld_parts = []
    
    # Add only paths not already present; this also prevents an infinite
    # exec loop, because the restarted process sees them in LD_LIBRARY_PATH.
    for p in lib_paths:
        if p not in current_ld:
            new_ld_parts.append(p)
            
    if new_ld_parts:
        print(f"Adding {len(new_ld_parts)} paths to LD_LIBRARY_PATH and restarting...")
        new_ld = os.pathsep.join(new_ld_parts)
        if current_ld:
            new_ld = f"{new_ld}{os.pathsep}{current_ld}"
            
        os.environ["LD_LIBRARY_PATH"] = new_ld
        
        # Pass the FORCE_CPU_MODE flag to the restarted process if set
        env = os.environ.copy()
        
        # Re-execute the current script with the new environment; the dynamic
        # linker only reads LD_LIBRARY_PATH at process startup, so a restart
        # is required for the new paths to take effect.
        os.execvpe(sys.executable, [sys.executable] + sys.argv, env)

ensure_cuda_runtime()
# --- CUDA DEPENDENCY FIX END ---

# --- INSTALLATION BLOCK ---
# Check whether llama-cpp-python is importable; if not, force-install a
# pre-built wheel. Pulling from the CUDA wheel index avoids the slow
# "Building wheel" source compile and the "libc.musl" wheel mismatch.
try:
    import llama_cpp
    # Importing llama_cpp already loads the native library; touching the
    # Llama class is a cheap extra check that the module is intact.
    llama_cpp.Llama
    print("llama-cpp-python is correctly installed.")
except (ImportError, RuntimeError, OSError) as e:
    print(f"llama-cpp-python needs installation or repair: {e}")
    
    if os.environ.get("FORCE_CPU_MODE") == "1":
        print("Installing CPU-only llama-cpp-python (Fallback)...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install", 
            "llama-cpp-python", 
            "--prefer-binary",
            "--force-reinstall"
        ])
    else:
        print("Installing llama-cpp-python from pre-built wheel index (CUDA)...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install", 
            "llama-cpp-python", 
            "--extra-index-url", "https://abetlen.github.io/llama-cpp-python/whl/cu121",
            "--prefer-binary",
            "--force-reinstall"
        ])
    print("Installation complete.")

# --- IMPORTS AFTER INSTALL ---
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# 1. CONFIGURATION
REPO_ID = "unsloth/Llama-3.2-1B-Instruct-GGUF"
FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"

# 2. DOWNLOAD THE GGUF MODEL
# This downloads the file to a local cache in the container
print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(
    repo_id=REPO_ID, 
    filename=FILENAME
)
print(f"Model downloaded to: {model_path}")

# 3. LOAD MODEL
# n_ctx=2048 is the context window. Adjust if your finetune supports more.
# n_threads=2 suits the free tier (it has 2 vCPUs).
# n_gpu_layers=-1 offloads all layers to the GPU when a CUDA build and driver
# are available; in CPU-only mode everything stays on the CPU.
n_gpu_layers = 0 if os.environ.get("FORCE_CPU_MODE") == "1" else -1
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    n_gpu_layers=n_gpu_layers,
    verbose=False
)
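
# Optional startup smoke test (a sketch; drop it if startup time matters):
# it surfaces load or chat-template problems in the logs instead of mid-chat.
_probe = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Say hi in one word."}],
    max_tokens=8,
)
print("Smoke test:", _probe["choices"][0]["message"]["content"])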

# 4. INFERENCE FUNCTION
def chat_stream(message, history):
    # create_chat_completion applies the chat template stored in the GGUF
    # metadata, so we can pass an OpenAI-style message list instead of
    # hand-building a prompt string with Llama 3.2's special tokens
    # (see the fallback sketch below the function).
    
    messages = []
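    # `history` arrives as (user, assistant) pairs (Gradio's tuple chat
    # format, the ChatInterface default in Gradio 4.x).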
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    # Stream response
    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        stream=True,
        temperature=0.7
    )
    
    partial_message = ""
    for chunk in stream:
        delta = chunk['choices'][0]['delta']
        if 'content' in delta:
            partial_message += delta['content']
            yield partial_message
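
# If the GGUF metadata lacked a chat template, a manual fallback could look
# like this sketch, based on the published Llama 3 Instruct prompt format
# (unused by default; create_chat_completion above already handles this):
def build_llama3_prompt(messages):
    """Render an OpenAI-style message list as a Llama 3 prompt string."""
    prompt = "<|begin_of_text|>"
    for m in messages:
        prompt += (
            f"<|start_header_id|>{m['role']}<|end_header_id|>\n\n"
            f"{m['content']}<|eot_id|>"
        )
    # Leave the assistant header open so the model generates the reply.
    return prompt + "<|start_header_id|>assistant<|end_header_id|>\n\n"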

# 5. UI SETUP
backend = "CPU" if n_gpu_layers == 0 else "GPU (if available)"
demo = gr.ChatInterface(
    fn=chat_stream,
    title="Mouse (Llama 3.2 1B Finetune)",
    description=f"Running {FILENAME} on {backend}",
    examples=["Hello!", "Why is the sky blue?"],
)

if __name__ == "__main__":
    demo.launch()