import sys
import subprocess
import os
import ctypes

# --- CUDA DEPENDENCY FIX START ---
def ensure_cuda_runtime():
    """
    Ensures that the CUDA runtime libraries are available.
    If libcudart.so.12 or libcublas.so.12 is missing, installs the necessary packages
    and restarts the script with the correct LD_LIBRARY_PATH.
    """
    missing_libs = []
    lib_paths = []

    # 1. Check for libcudart (CUDA runtime)
    try:
        ctypes.CDLL("libcudart.so.12")
    except OSError:
        print("libcudart.so.12 not found.")
        missing_libs.append("nvidia-cuda-runtime-cu12")

    # 2. Check for libcublas (cuBLAS)
    try:
        ctypes.CDLL("libcublas.so.12")
    except OSError:
        print("libcublas.so.12 not found.")
        missing_libs.append("nvidia-cublas-cu12")

    # 3. Check for libcuda.so.1 (NVIDIA driver).
    # The driver CANNOT be installed via pip; it must be mounted from the host.
    # If it is missing, we fall back to CPU mode.
    driver_found = False
    try:
        ctypes.CDLL("libcuda.so.1")
        driver_found = True
    except OSError:
        print("libcuda.so.1 (NVIDIA driver) not found via dlopen.")
        # Search common locations
        search_paths = [
            "/usr/lib/x86_64-linux-gnu",
            "/usr/lib64",
            "/usr/lib",
            "/usr/local/cuda/lib64",
            "/usr/lib/wsl/lib",
        ]
        for path in search_paths:
            if os.path.exists(os.path.join(path, "libcuda.so.1")):
                print(f"Found libcuda.so.1 manually at {path}")
                lib_paths.append(path)
                driver_found = True
                break

    if not driver_found and not missing_libs:
        # The runtime libraries are present but the driver is not, so GPU inference cannot work.
        # Set an env var so the install step below falls back to the CPU-only wheel.
        print("CRITICAL: NVIDIA driver (libcuda.so.1) not found. GPU acceleration will fail.")
        print("Switching to CPU-only mode for this run.")
        os.environ["FORCE_CPU_MODE"] = "1"

    if not missing_libs and driver_found:
        return  # All libraries and the driver were found

    if missing_libs:
        print(f"Missing CUDA libraries. Installing: {', '.join(missing_libs)}...")
        # 4. Install the missing packages
        subprocess.check_call([sys.executable, "-m", "pip", "install"] + missing_libs)

        # 5. Find the library paths pip just created
        import site
        for sp in site.getsitepackages():
            # Runtime libs
            rt_path = os.path.join(sp, "nvidia", "cuda_runtime", "lib")
            if os.path.isdir(rt_path):
                lib_paths.append(rt_path)
            # cuBLAS libs
            cublas_path = os.path.join(sp, "nvidia", "cublas", "lib")
            if os.path.isdir(cublas_path):
                lib_paths.append(cublas_path)

    if not lib_paths and missing_libs:
        print("Warning: could not find nvidia lib paths after installation.")
        return

    # 6. Update LD_LIBRARY_PATH and restart
    current_ld = os.environ.get("LD_LIBRARY_PATH", "")
    # Add only paths that are not already present
    new_ld_parts = [p for p in lib_paths if p not in current_ld]
    if new_ld_parts:
        print(f"Adding {len(new_ld_parts)} paths to LD_LIBRARY_PATH and restarting...")
        new_ld = os.pathsep.join(new_ld_parts)
        if current_ld:
            new_ld = f"{new_ld}{os.pathsep}{current_ld}"
        os.environ["LD_LIBRARY_PATH"] = new_ld
        # The FORCE_CPU_MODE flag, if set above, is carried over in the copied environment
        env = os.environ.copy()
        # Re-execute the current script with the new environment
        os.execvpe(sys.executable, [sys.executable] + sys.argv, env)

ensure_cuda_runtime()
# --- CUDA DEPENDENCY FIX END ---
# --- INSTALLATION BLOCK ---
# We check if llama-cpp-python is installed, and if not, we force-install from the CUDA index.
# This avoids the "Building wheel" loop and the "libc.musl" error.
try:
    import llama_cpp
    # Trigger a load to verify it works (catches the RuntimeError if libs are missing)
    llama_cpp.Llama
    print("llama-cpp-python is correctly installed.")
except (ImportError, RuntimeError, OSError) as e:
    print(f"llama-cpp-python needs installation or repair: {e}")
    if os.environ.get("FORCE_CPU_MODE") == "1":
        print("Installing CPU-only llama-cpp-python (fallback)...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "llama-cpp-python",
            "--prefer-binary",
            "--force-reinstall",
        ])
    else:
        print("Installing llama-cpp-python from the pre-built wheel index (CUDA)...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "llama-cpp-python",
            "--extra-index-url", "https://abetlen.github.io/llama-cpp-python/whl/cu121",
            "--prefer-binary",
            "--force-reinstall",
        ])
    print("Installation complete.")
# --- IMPORTS AFTER INSTALL ---
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# 1. CONFIGURATION
REPO_ID = "unsloth/Llama-3.2-1B-Instruct-GGUF"
FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"

# 2. DOWNLOAD THE GGUF MODEL
# This downloads the file to a local cache in the container.
print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=FILENAME,
)
print(f"Model downloaded to: {model_path}")

# 3. LOAD MODEL
# n_ctx=2048 is the context window. Adjust it if your finetune supports more.
# n_threads=2 matches the free tier, which has 2 vCPUs.
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    verbose=False,
)
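
# GPU offload (an assumption, not in the original script): the CUDA fix above only makes the
# libraries loadable; llama-cpp-python still runs fully on CPU unless layers are offloaded
# via n_gpu_layers. A minimal sketch, reusing the FORCE_CPU_MODE flag set by ensure_cuda_runtime():
#
# llm = Llama(
#     model_path=model_path,
#     n_ctx=2048,
#     n_threads=2,
#     n_gpu_layers=0 if os.environ.get("FORCE_CPU_MODE") == "1" else -1,  # -1 offloads all layers
#     verbose=False,
# )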
# 4. INFERENCE FUNCTION
def chat_stream(message, history):
    # Build the message list instead of a manual prompt string.
    # Llama 3.2 Instruct expects its standard special tokens, and llama-cpp-python's
    # create_chat_completion applies the chat template stored in the GGUF metadata,
    # so the formatting is handled for us.
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    # Stream the response
    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        stream=True,
        temperature=0.7,
    )
    partial_message = ""
    for chunk in stream:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial_message += delta["content"]
            yield partial_message
# 5. UI SETUP
demo = gr.ChatInterface(
    fn=chat_stream,
    title="Mouse (Llama 3.2 1B Finetune)",
    description=f"Running {FILENAME} on CPU",
    examples=["Hello!", "Why is the sky blue?"],
)

if __name__ == "__main__":
    demo.launch()
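
# Quick local check (a sketch, not part of the original app; SMOKE_TEST is a hypothetical
# env var used only for illustration): exercise chat_stream directly, without the Gradio UI,
# to confirm the model loads and streams tokens.
#
# if os.environ.get("SMOKE_TEST") == "1":
#     last = ""
#     for last in chat_stream("Why is the sky blue?", history=[]):
#         pass
#     print(last)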