import sys
import subprocess
import os
import ctypes

# --- CUDA DEPENDENCY FIX START ---
def ensure_cuda_runtime():
    """
    Ensures that CUDA runtime libraries are available. 
    If libcudart.so.12 or libcublas.so.12 is missing, installs the necessary packages
    and restarts the script with the correct LD_LIBRARY_PATH.
    """
    missing_libs = []
    lib_paths = []
    
    # 1. Check for libcudart (CUDA Runtime)
    try:
        ctypes.CDLL("libcudart.so.12")
    except OSError:
        print("libcudart.so.12 not found.")
        missing_libs.append("nvidia-cuda-runtime-cu12")

    # 2. Check for libcublas (CUDA BLAS)
    try:
        ctypes.CDLL("libcublas.so.12")
    except OSError:
        print("libcublas.so.12 not found.")
        missing_libs.append("nvidia-cublas-cu12")

    # 3. Check for libcuda.so.1 (NVIDIA Driver)
    # This CANNOT be installed via pip. It must be mounted from the host.
    # If missing, we must fall back to CPU mode.
    driver_found = False
    try:
        ctypes.CDLL("libcuda.so.1")
        driver_found = True
    except OSError:
        print("libcuda.so.1 (NVIDIA Driver) not found via dlopen.")
        # Search common install locations; /usr/lib/wsl/lib covers WSL2,
        # where the host's driver is mounted into the guest.
        search_paths = [
            "/usr/lib/x86_64-linux-gnu",
            "/usr/lib64",
            "/usr/lib",
            "/usr/local/cuda/lib64",
            "/usr/lib/wsl/lib"
        ]
        for path in search_paths:
            if os.path.exists(os.path.join(path, "libcuda.so.1")):
                print(f"Found libcuda.so.1 manually at {path}")
                lib_paths.append(path)
                driver_found = True
                break
    
    if not driver_found:
        # Without the driver, GPU acceleration is impossible regardless of
        # which runtime libraries are installed, so flag CPU-only mode for
        # the installation block below and stop here.
        print("CRITICAL: NVIDIA Driver (libcuda.so.1) not found. GPU acceleration will fail.")
        print("Switching to CPU-only mode for this run.")
        os.environ["FORCE_CPU_MODE"] = "1"
        return

    if not missing_libs and not lib_paths:
        return  # Everything is already loadable; nothing to do.

    if missing_libs:
        print(f"Missing CUDA libraries. Installing: {', '.join(missing_libs)}...")
        # 4. Install the missing packages
        subprocess.check_call([sys.executable, "-m", "pip", "install"] + missing_libs)

    # 5. Find library paths
    import site
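    # The nvidia-* wheels ship their shared objects under
    # site-packages/nvidia/<component>/lib, which is not on the default
    # loader search path.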
    for sp in site.getsitepackages():
        # Runtime libs
        rt_path = os.path.join(sp, "nvidia", "cuda_runtime", "lib")
        if os.path.isdir(rt_path):
            lib_paths.append(rt_path)
        
        # Cublas libs
        cublas_path = os.path.join(sp, "nvidia", "cublas", "lib")
        if os.path.isdir(cublas_path):
            lib_paths.append(cublas_path)

    if not lib_paths and missing_libs:
        print("Warning: Could not find nvidia lib paths after installation.")
        return

    # 6. Update LD_LIBRARY_PATH and restart
    current_ld = os.environ.get("LD_LIBRARY_PATH", "")
    new_ld_parts = []
    
    # Add only paths not already present; this also prevents an infinite
    # exec loop, because the restarted process sees them in LD_LIBRARY_PATH.
    for p in lib_paths:
        if p not in current_ld:
            new_ld_parts.append(p)
            
    if new_ld_parts:
        print(f"Adding {len(new_ld_parts)} paths to LD_LIBRARY_PATH and restarting...")
        new_ld = os.pathsep.join(new_ld_parts)
        if current_ld:
            new_ld = f"{new_ld}{os.pathsep}{current_ld}"
            
        os.environ["LD_LIBRARY_PATH"] = new_ld
        
        # Pass the FORCE_CPU_MODE flag to the restarted process if set
        env = os.environ.copy()
        
        # Re-execute the current script with the new environment; the dynamic
        # linker only reads LD_LIBRARY_PATH at process startup, so a restart
        # is required for the new paths to take effect.
        os.execvpe(sys.executable, [sys.executable] + sys.argv, env)

ensure_cuda_runtime()
# --- CUDA DEPENDENCY FIX END ---

# --- INSTALLATION BLOCK ---
# Check whether llama-cpp-python is importable; if not, force-install a
# pre-built wheel. Pulling from the CUDA wheel index avoids the slow
# "Building wheel" source compile and the "libc.musl" wheel mismatch.
try:
    import llama_cpp
    # Importing llama_cpp already loads the native library; touching the
    # Llama class is a cheap extra check that the module is intact.
    llama_cpp.Llama
    print("llama-cpp-python is correctly installed.")
except (ImportError, RuntimeError, OSError) as e:
    print(f"llama-cpp-python needs installation or repair: {e}")
    
    if os.environ.get("FORCE_CPU_MODE") == "1":
        print("Installing CPU-only llama-cpp-python (Fallback)...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install", 
            "llama-cpp-python", 
            "--prefer-binary",
            "--force-reinstall"
        ])
    else:
        print("Installing llama-cpp-python from pre-built wheel index (CUDA)...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install", 
            "llama-cpp-python", 
            "--extra-index-url", "https://abetlen.github.io/llama-cpp-python/whl/cu121",
            "--prefer-binary",
            "--force-reinstall"
        ])
    print("Installation complete.")

# --- IMPORTS AFTER INSTALL ---
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# 1. CONFIGURATION
REPO_ID = "unsloth/Llama-3.2-1B-Instruct-GGUF"
FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"

# 2. DOWNLOAD THE GGUF MODEL
# This downloads the file to a local cache in the container
print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(
    repo_id=REPO_ID, 
    filename=FILENAME
)
print(f"Model downloaded to: {model_path}")

# 3. LOAD MODEL
# n_ctx=2048 is the context window. Adjust if your finetune supports more.
# n_threads=2 suits the free tier (it has 2 vCPUs).
# n_gpu_layers=-1 offloads all layers to the GPU when a CUDA build and driver
# are available; in CPU-only mode everything stays on the CPU.
n_gpu_layers = 0 if os.environ.get("FORCE_CPU_MODE") == "1" else -1
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    n_gpu_layers=n_gpu_layers,
    verbose=False
)
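
# Optional startup smoke test (a sketch; drop it if startup time matters):
# it surfaces load or chat-template problems in the logs instead of mid-chat.
_probe = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Say hi in one word."}],
    max_tokens=8,
)
print("Smoke test:", _probe["choices"][0]["message"]["content"])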

# 4. INFERENCE FUNCTION
def chat_stream(message, history):
    # create_chat_completion applies the chat template stored in the GGUF
    # metadata, so we can pass an OpenAI-style message list instead of
    # hand-building a prompt string with Llama 3.2's special tokens
    # (see the fallback sketch below the function).
    
    messages = []
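    # `history` arrives as (user, assistant) pairs (Gradio's tuple chat
    # format, the ChatInterface default in Gradio 4.x).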
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    # Stream response
    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        stream=True,
        temperature=0.7
    )
    
    partial_message = ""
    for chunk in stream:
        delta = chunk['choices'][0]['delta']
        if 'content' in delta:
            partial_message += delta['content']
            yield partial_message
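
# If the GGUF metadata lacked a chat template, a manual fallback could look
# like this sketch, based on the published Llama 3 Instruct prompt format
# (unused by default; create_chat_completion above already handles this):
def build_llama3_prompt(messages):
    """Render an OpenAI-style message list as a Llama 3 prompt string."""
    prompt = "<|begin_of_text|>"
    for m in messages:
        prompt += (
            f"<|start_header_id|>{m['role']}<|end_header_id|>\n\n"
            f"{m['content']}<|eot_id|>"
        )
    # Leave the assistant header open so the model generates the reply.
    return prompt + "<|start_header_id|>assistant<|end_header_id|>\n\n"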

# 5. UI SETUP
backend = "CPU" if n_gpu_layers == 0 else "GPU (if available)"
demo = gr.ChatInterface(
    fn=chat_stream,
    title="Mouse (Llama 3.2 1B Finetune)",
    description=f"Running {FILENAME} on {backend}",
    examples=["Hello!", "Why is the sky blue?"],
)

if __name__ == "__main__":
    demo.launch()