import sys
import subprocess
import os
import ctypes


# --- CUDA DEPENDENCY FIX START ---
def ensure_cuda_runtime():
    """
    Ensure that the CUDA runtime libraries are available.

    If libcudart.so.12 or libcublas.so.12 is missing, install the necessary
    packages and restart the script with the correct LD_LIBRARY_PATH.
    """
    missing_libs = []
    lib_paths = []

    # 1. Check for libcudart (CUDA Runtime)
    try:
        ctypes.CDLL("libcudart.so.12")
    except OSError:
        print("libcudart.so.12 not found.")
        missing_libs.append("nvidia-cuda-runtime-cu12")

    # 2. Check for libcublas (CUDA BLAS)
    try:
        ctypes.CDLL("libcublas.so.12")
    except OSError:
        print("libcublas.so.12 not found.")
        missing_libs.append("nvidia-cublas-cu12")

    # 3. Check for libcuda.so.1 (NVIDIA driver).
    # The driver CANNOT be installed via pip; it must be mounted from the host.
    # If it is missing entirely, we must fall back to CPU mode.
    driver_found = False
    try:
        ctypes.CDLL("libcuda.so.1")
        driver_found = True
    except OSError:
        print("libcuda.so.1 (NVIDIA driver) not found via dlopen.")
        # Search common paths
        search_paths = [
            "/usr/lib/x86_64-linux-gnu",
            "/usr/lib64",
            "/usr/lib",
            "/usr/local/cuda/lib64",
            "/usr/lib/wsl/lib",
        ]
        for path in search_paths:
            if os.path.exists(os.path.join(path, "libcuda.so.1")):
                print(f"Found libcuda.so.1 manually at {path}")
                lib_paths.append(path)
                driver_found = True
                break

    if not driver_found and not missing_libs:
        # The runtime libs are present but there is no driver, so the GPU
        # cannot be used. Set an env var to force the CPU install later.
        print("CRITICAL: NVIDIA driver (libcuda.so.1) not found. GPU acceleration will fail.")
        print("Switching to CPU-only mode for this run.")
        os.environ["FORCE_CPU_MODE"] = "1"

    if not missing_libs and driver_found and not lib_paths:
        return  # All libraries and the driver load cleanly; nothing to fix

    if missing_libs:
        print(f"Missing CUDA libraries. Installing: {', '.join(missing_libs)}...")
        # 4. Install the missing packages
        subprocess.check_call([sys.executable, "-m", "pip", "install"] + missing_libs)

        # 5. Locate the library directories inside the installed nvidia packages
        import site
        for sp in site.getsitepackages():
            # Runtime libs
            rt_path = os.path.join(sp, "nvidia", "cuda_runtime", "lib")
            if os.path.isdir(rt_path):
                lib_paths.append(rt_path)
            # cuBLAS libs
            cublas_path = os.path.join(sp, "nvidia", "cublas", "lib")
            if os.path.isdir(cublas_path):
                lib_paths.append(cublas_path)

    if not lib_paths and missing_libs:
        print("Warning: Could not find nvidia lib paths after installation.")
        return

    # 6. Update LD_LIBRARY_PATH and restart
    current_ld = os.environ.get("LD_LIBRARY_PATH", "")
    new_ld_parts = []
    # Add only paths that are not already present
    for p in lib_paths:
        if p not in current_ld:
            new_ld_parts.append(p)

    if new_ld_parts:
        print(f"Adding {len(new_ld_parts)} paths to LD_LIBRARY_PATH and restarting...")
        new_ld = os.pathsep.join(new_ld_parts)
        if current_ld:
            new_ld = f"{new_ld}{os.pathsep}{current_ld}"
        os.environ["LD_LIBRARY_PATH"] = new_ld

        # Pass the FORCE_CPU_MODE flag (if set) through to the restarted process
        env = os.environ.copy()
        # Re-execute the current script with the new environment
        os.execvpe(sys.executable, [sys.executable] + sys.argv, env)


ensure_cuda_runtime()
# --- CUDA DEPENDENCY FIX END ---

# --- INSTALLATION BLOCK ---
# Check whether llama-cpp-python is installed; if not, force-install it from the
# CUDA wheel index. This avoids the "Building wheel" loop and the "libc.musl" error.
try:
    import llama_cpp
    # Touching the Llama class verifies that the native library loaded
    # (catches the RuntimeError raised when the shared libs are missing).
    llama_cpp.Llama
    print("llama-cpp-python is correctly installed.")
except (ImportError, RuntimeError, OSError) as e:
    print(f"llama-cpp-python needs installation or repair: {e}")
    if os.environ.get("FORCE_CPU_MODE") == "1":
        print("Installing CPU-only llama-cpp-python (fallback)...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "llama-cpp-python",
            "--prefer-binary", "--force-reinstall",
        ])
    else:
        print("Installing llama-cpp-python from the pre-built CUDA wheel index...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "llama-cpp-python",
            "--extra-index-url", "https://abetlen.github.io/llama-cpp-python/whl/cu121",
            "--prefer-binary", "--force-reinstall",
        ])
    print("Installation complete.")

# --- IMPORTS AFTER INSTALL ---
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# 1. CONFIGURATION
REPO_ID = "unsloth/Llama-3.2-1B-Instruct-GGUF"
FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"

# 2. DOWNLOAD THE GGUF MODEL
# hf_hub_download fetches the file into the local cache inside the container.
print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=FILENAME,
)
print(f"Model downloaded to: {model_path}")

# 3. LOAD MODEL
# n_ctx=2048 is the context window; adjust it if your finetune supports more.
# n_threads=2 matches the free tier's 2 vCPUs.
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    verbose=False,
)

# 4. INFERENCE FUNCTION
def chat_stream(message, history):
    # Build the message list from Gradio's (user, assistant) tuple history.
    # We could construct the Llama 3.2 Instruct prompt string by hand, but
    # create_chat_completion applies the chat template stored in the GGUF
    # metadata, so explicit special tokens are not needed here.
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    # Stream the response
    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        stream=True,
        temperature=0.7,
    )

    partial_message = ""
    for chunk in stream:
        delta = chunk['choices'][0]['delta']
        if 'content' in delta:
            partial_message += delta['content']
            yield partial_message


# 5. UI SETUP
demo = gr.ChatInterface(
    fn=chat_stream,
    title="Mouse (Llama 3.2 1B Finetune)",
    description=f"Running {FILENAME} on CPU",
    examples=["Hello!", "Why is the sky blue?"],
)

if __name__ == "__main__":
    demo.launch()
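
# Note: the Llama() call above loads the model on CPU only, even when the CUDA
# dependency fix succeeds, because no n_gpu_layers value is passed. Below is a
# minimal sketch of how GPU offload could be wired in, assuming a CUDA-enabled
# llama-cpp-python wheel is installed and the GPU is visible; it is commented
# out so the script's behaviour is unchanged.
#
# n_gpu = 0 if os.environ.get("FORCE_CPU_MODE") == "1" else -1  # -1 = offload all layers
# llm = Llama(
#     model_path=model_path,
#     n_ctx=2048,
#     n_threads=2,
#     n_gpu_layers=n_gpu,
#     verbose=False,
# )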