talk-llama : sync llama.cpp
Changed files:
- examples/talk-llama/llama.cpp   +339 -251
- examples/talk-llama/llama.h     +8 -1
- examples/talk-llama/unicode.cpp +11 -0

examples/talk-llama/llama.cpp CHANGED
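Context for the chat-template portion of the diff below: the raw string matching previously scattered through llama_chat_apply_template_internal is replaced by a named llm_chat_template enum, an LLM_CHAT_TEMPLATES name map, and a llama_chat_detect_template helper, so a template can be selected either by its Jinja source or by a registered short name such as "chatml" or "llama3". The following is only a usage sketch: the format_chat helper is hypothetical, and the exact llama_chat_apply_template signature (model pointer, template name or nullptr, messages, add_ass flag, output buffer) is assumed from llama.h at this sync point and should be checked against the header.

    // Hedged sketch: apply a named chat template to two messages.
    #include <string>
    #include <vector>
    #include "llama.h"

    static std::string format_chat(const llama_model * model) {
        std::vector<llama_chat_message> msgs = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!"                       },
        };
        std::vector<char> buf(4096);
        // "llama3" is one of the names registered in LLM_CHAT_TEMPLATES;
        // passing nullptr instead would use the template stored in the GGUF metadata.
        int32_t n = llama_chat_apply_template(model, "llama3", msgs.data(), msgs.size(),
                                              /*add_ass=*/true, buf.data(), (int32_t) buf.size());
        if (n < 0) {
            return ""; // unknown or unsupported template
        }
        if ((size_t) n > buf.size()) {
            // buffer too small: retry with the reported size
            buf.resize(n);
            n = llama_chat_apply_template(model, "llama3", msgs.data(), msgs.size(),
                                          true, buf.data(), (int32_t) buf.size());
        }
        return std::string(buf.data(), n);
    }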
|
@@ -179,7 +179,7 @@ enum llm_arch {
|
|
| 179 |
LLM_ARCH_COMMAND_R,
|
| 180 |
LLM_ARCH_DBRX,
|
| 181 |
LLM_ARCH_OLMO,
|
| 182 |
-
|
| 183 |
LLM_ARCH_OLMOE,
|
| 184 |
LLM_ARCH_OPENELM,
|
| 185 |
LLM_ARCH_ARCTIC,
|
|
@@ -233,7 +233,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
| 233 |
{ LLM_ARCH_COMMAND_R, "command-r" },
|
| 234 |
{ LLM_ARCH_DBRX, "dbrx" },
|
| 235 |
{ LLM_ARCH_OLMO, "olmo" },
|
| 236 |
-
{
|
| 237 |
{ LLM_ARCH_OLMOE, "olmoe" },
|
| 238 |
{ LLM_ARCH_OPENELM, "openelm" },
|
| 239 |
{ LLM_ARCH_ARCTIC, "arctic" },
|
|
@@ -1036,6 +1036,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|
| 1036 |
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
| 1037 |
{ LLM_TENSOR_OUTPUT, "output" },
|
| 1038 |
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
|
|
|
|
|
|
| 1039 |
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
| 1040 |
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
| 1041 |
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
@@ -1210,7 +1212,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|
| 1210 |
},
|
| 1211 |
},
|
| 1212 |
{
|
| 1213 |
-
|
| 1214 |
{
|
| 1215 |
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
| 1216 |
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
@@ -1549,6 +1551,67 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|
| 1549 |
},
|
| 1550 |
};
|
| 1551 |
|
|
|
| 1552 |
static llm_arch llm_arch_from_string(const std::string & name) {
|
| 1553 |
for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
|
| 1554 |
if (kv.second == name) {
|
|
@@ -1622,9 +1685,10 @@ struct LLM_TN {
|
|
| 1622 |
//
|
| 1623 |
|
| 1624 |
static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
|
| 1625 |
-
{ LLAMA_ROPE_SCALING_TYPE_NONE,
|
| 1626 |
-
{ LLAMA_ROPE_SCALING_TYPE_LINEAR,
|
| 1627 |
-
{ LLAMA_ROPE_SCALING_TYPE_YARN,
|
|
|
|
| 1628 |
};
|
| 1629 |
|
| 1630 |
static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
|
|
@@ -2341,6 +2405,7 @@ enum e_model {
|
|
| 2341 |
MODEL_16B,
|
| 2342 |
MODEL_20B,
|
| 2343 |
MODEL_30B,
|
|
|
|
| 2344 |
MODEL_34B,
|
| 2345 |
MODEL_35B,
|
| 2346 |
MODEL_40B,
|
|
@@ -4866,7 +4931,9 @@ struct llama_model_loader {
|
|
| 4866 |
mappings.reserve(files.size());
|
| 4867 |
mmaps_used.reserve(files.size());
|
| 4868 |
for (const auto & file : files) {
|
| 4869 |
-
|
|
|
|
|
|
|
| 4870 |
mmaps_used.emplace_back(mapping->size, 0);
|
| 4871 |
if (mlock_mmaps) {
|
| 4872 |
std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
|
|
@@ -5328,6 +5395,7 @@ static const char * llama_model_type_name(e_model type) {
|
|
| 5328 |
case MODEL_16B: return "16B";
|
| 5329 |
case MODEL_20B: return "20B";
|
| 5330 |
case MODEL_30B: return "30B";
|
|
|
|
| 5331 |
case MODEL_34B: return "34B";
|
| 5332 |
case MODEL_35B: return "35B";
|
| 5333 |
case MODEL_40B: return "40B";
|
|
@@ -5515,8 +5583,12 @@ static void llm_load_hparams(
|
|
| 5515 |
case LLM_ARCH_MINICPM:
|
| 5516 |
{
|
| 5517 |
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
|
|
|
|
|
|
|
|
| 5518 |
|
| 5519 |
switch (hparams.n_layer) {
|
|
|
|
| 5520 |
case 40: model.type = e_model::MODEL_2B; break;
|
| 5521 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 5522 |
}
|
|
@@ -5688,7 +5760,10 @@ static void llm_load_hparams(
|
|
| 5688 |
case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
|
| 5689 |
case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
|
| 5690 |
case 32: model.type = e_model::MODEL_7B; break;
|
|
|
|
| 5691 |
case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
|
|
|
|
|
|
|
| 5692 |
case 80: model.type = e_model::MODEL_70B; break;
|
| 5693 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 5694 |
}
|
|
@@ -5898,7 +5973,7 @@ static void llm_load_hparams(
|
|
| 5898 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 5899 |
}
|
| 5900 |
} break;
|
| 5901 |
-
case
|
| 5902 |
{
|
| 5903 |
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
| 5904 |
|
|
@@ -6997,7 +7072,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
| 6997 |
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
| 6998 |
}
|
| 6999 |
|
| 7000 |
-
if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
|
| 7001 |
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
|
| 7002 |
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
|
| 7003 |
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
|
|
@@ -7181,12 +7256,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
|
|
| 7181 |
} break;
|
| 7182 |
case GGML_OP_ADD:
|
| 7183 |
{
|
| 7184 |
-
ggml_tensor * a =
|
| 7185 |
op_tensor = ggml_add(ctx, a, w);
|
| 7186 |
} break;
|
| 7187 |
case GGML_OP_MUL:
|
| 7188 |
{
|
| 7189 |
-
ggml_tensor * a =
|
| 7190 |
op_tensor = ggml_mul(ctx, a, w);
|
| 7191 |
} break;
|
| 7192 |
case GGML_OP_DIV:
|
|
@@ -7622,7 +7697,13 @@ static bool llm_load_tensors(
|
|
| 7622 |
|
| 7623 |
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
| 7624 |
|
| 7625 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7626 |
|
| 7627 |
if (n_expert == 0) {
|
| 7628 |
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
@@ -8591,7 +8672,7 @@ static bool llm_load_tensors(
|
|
| 8591 |
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
| 8592 |
}
|
| 8593 |
} break;
|
| 8594 |
-
case
|
| 8595 |
{
|
| 8596 |
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
| 8597 |
|
|
@@ -9190,7 +9271,7 @@ static bool llm_load_tensors(
|
|
| 9190 |
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
| 9191 |
if (!dev) {
|
| 9192 |
// FIXME: workaround for CPU backend buft having a NULL device
|
| 9193 |
-
dev =
|
| 9194 |
}
|
| 9195 |
ggml_backend_dev_props props;
|
| 9196 |
ggml_backend_dev_get_props(dev, &props);
|
|
@@ -13429,153 +13510,6 @@ struct llm_build_context {
|
|
| 13429 |
return gf;
|
| 13430 |
}
|
| 13431 |
|
| 13432 |
-
// ref: https://arxiv.org/abs/2203.03466
|
| 13433 |
-
// https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
|
| 13434 |
-
// based on the original build_llama() function
|
| 13435 |
-
struct ggml_cgraph * build_minicpm() {
|
| 13436 |
-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
| 13437 |
-
|
| 13438 |
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
| 13439 |
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
| 13440 |
-
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
| 13441 |
-
|
| 13442 |
-
const int64_t n_embd = hparams.n_embd;
|
| 13443 |
-
//TODO: if the model varies, these parameters need to be read from the model
|
| 13444 |
-
const int64_t n_embd_base = 256;
|
| 13445 |
-
const float scale_embd = 12.0f;
|
| 13446 |
-
const float scale_depth = 1.4f;
|
| 13447 |
-
|
| 13448 |
-
struct ggml_tensor * cur;
|
| 13449 |
-
struct ggml_tensor * inpL;
|
| 13450 |
-
|
| 13451 |
-
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
| 13452 |
-
|
| 13453 |
-
// scale the input embeddings
|
| 13454 |
-
inpL = ggml_scale(ctx0, inpL, scale_embd);
|
| 13455 |
-
cb(inpL, "inp_scaled", -1);
|
| 13456 |
-
|
| 13457 |
-
// inp_pos - contains the positions
|
| 13458 |
-
struct ggml_tensor * inp_pos = build_inp_pos();
|
| 13459 |
-
|
| 13460 |
-
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 13461 |
-
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
| 13462 |
-
|
| 13463 |
-
for (int il = 0; il < n_layer; ++il) {
|
| 13464 |
-
struct ggml_tensor * inpSA = inpL;
|
| 13465 |
-
|
| 13466 |
-
// norm
|
| 13467 |
-
cur = llm_build_norm(ctx0, inpL, hparams,
|
| 13468 |
-
model.layers[il].attn_norm, NULL,
|
| 13469 |
-
LLM_NORM_RMS, cb, il);
|
| 13470 |
-
cb(cur, "attn_norm", il);
|
| 13471 |
-
|
| 13472 |
-
// self-attention
|
| 13473 |
-
{
|
| 13474 |
-
// compute Q and K and RoPE them
|
| 13475 |
-
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
| 13476 |
-
cb(Qcur, "Qcur", il);
|
| 13477 |
-
if (model.layers[il].bq) {
|
| 13478 |
-
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
| 13479 |
-
cb(Qcur, "Qcur", il);
|
| 13480 |
-
}
|
| 13481 |
-
|
| 13482 |
-
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
| 13483 |
-
cb(Kcur, "Kcur", il);
|
| 13484 |
-
if (model.layers[il].bk) {
|
| 13485 |
-
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
| 13486 |
-
cb(Kcur, "Kcur", il);
|
| 13487 |
-
}
|
| 13488 |
-
|
| 13489 |
-
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
| 13490 |
-
cb(Vcur, "Vcur", il);
|
| 13491 |
-
if (model.layers[il].bv) {
|
| 13492 |
-
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
| 13493 |
-
cb(Vcur, "Vcur", il);
|
| 13494 |
-
}
|
| 13495 |
-
|
| 13496 |
-
Qcur = ggml_rope_ext(
|
| 13497 |
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
| 13498 |
-
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
| 13499 |
-
ext_factor, attn_factor, beta_fast, beta_slow
|
| 13500 |
-
);
|
| 13501 |
-
cb(Qcur, "Qcur", il);
|
| 13502 |
-
|
| 13503 |
-
Kcur = ggml_rope_ext(
|
| 13504 |
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
| 13505 |
-
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
| 13506 |
-
ext_factor, attn_factor, beta_fast, beta_slow
|
| 13507 |
-
);
|
| 13508 |
-
cb(Kcur, "Kcur", il);
|
| 13509 |
-
|
| 13510 |
-
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
| 13511 |
-
model.layers[il].wo, model.layers[il].bo,
|
| 13512 |
-
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 13513 |
-
}
|
| 13514 |
-
|
| 13515 |
-
if (il == n_layer - 1) {
|
| 13516 |
-
// skip computing output for unused tokens
|
| 13517 |
-
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
| 13518 |
-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
| 13519 |
-
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
| 13520 |
-
}
|
| 13521 |
-
|
| 13522 |
-
// scale_res - scale the hidden states for residual connection
|
| 13523 |
-
const float scale_res = scale_depth/sqrtf(float(n_layer));
|
| 13524 |
-
cur = ggml_scale(ctx0, cur, scale_res);
|
| 13525 |
-
cb(cur, "hidden_scaled", -1);
|
| 13526 |
-
|
| 13527 |
-
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
| 13528 |
-
cb(ffn_inp, "ffn_inp", il);
|
| 13529 |
-
|
| 13530 |
-
// feed-forward network
|
| 13531 |
-
{
|
| 13532 |
-
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
| 13533 |
-
model.layers[il].ffn_norm, NULL,
|
| 13534 |
-
LLM_NORM_RMS, cb, il);
|
| 13535 |
-
cb(cur, "ffn_norm", il);
|
| 13536 |
-
|
| 13537 |
-
cur = llm_build_ffn(ctx0, lctx, cur,
|
| 13538 |
-
model.layers[il].ffn_up, NULL, NULL,
|
| 13539 |
-
model.layers[il].ffn_gate, NULL, NULL,
|
| 13540 |
-
model.layers[il].ffn_down, NULL, NULL,
|
| 13541 |
-
NULL,
|
| 13542 |
-
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
| 13543 |
-
cb(cur, "ffn_out", il);
|
| 13544 |
-
}
|
| 13545 |
-
|
| 13546 |
-
// scale the hidden states for residual connection
|
| 13547 |
-
cur = ggml_scale(ctx0, cur, scale_res);
|
| 13548 |
-
cb(cur, "hidden_scaled_ffn", -1);
|
| 13549 |
-
|
| 13550 |
-
cur = ggml_add(ctx0, cur, ffn_inp);
|
| 13551 |
-
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
| 13552 |
-
cb(cur, "l_out", il);
|
| 13553 |
-
|
| 13554 |
-
// input for next layer
|
| 13555 |
-
inpL = cur;
|
| 13556 |
-
}
|
| 13557 |
-
|
| 13558 |
-
cur = inpL;
|
| 13559 |
-
|
| 13560 |
-
cur = llm_build_norm(ctx0, cur, hparams,
|
| 13561 |
-
model.output_norm, NULL,
|
| 13562 |
-
LLM_NORM_RMS, cb, -1);
|
| 13563 |
-
cb(cur, "result_norm", -1);
|
| 13564 |
-
|
| 13565 |
-
// lm_head scaling
|
| 13566 |
-
const float scale_lmhead = float(n_embd_base)/float(n_embd);
|
| 13567 |
-
cur = ggml_scale(ctx0, cur, scale_lmhead);
|
| 13568 |
-
cb(cur, "lmhead_scaling", -1);
|
| 13569 |
-
|
| 13570 |
-
// lm_head
|
| 13571 |
-
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
| 13572 |
-
cb(cur, "result_output", -1);
|
| 13573 |
-
|
| 13574 |
-
ggml_build_forward_expand(gf, cur);
|
| 13575 |
-
|
| 13576 |
-
return gf;
|
| 13577 |
-
}
|
| 13578 |
-
|
| 13579 |
struct ggml_cgraph * build_minicpm3() {
|
| 13580 |
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
| 13581 |
|
|
@@ -14481,7 +14415,7 @@ struct llm_build_context {
|
|
| 14481 |
return gf;
|
| 14482 |
}
|
| 14483 |
|
| 14484 |
-
struct ggml_cgraph *
|
| 14485 |
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
| 14486 |
|
| 14487 |
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
|
@@ -16674,6 +16608,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|
| 16674 |
|
| 16675 |
switch (model.arch) {
|
| 16676 |
case LLM_ARCH_LLAMA:
|
|
|
|
| 16677 |
case LLM_ARCH_GRANITE:
|
| 16678 |
case LLM_ARCH_GRANITE_MOE:
|
| 16679 |
{
|
|
@@ -16757,10 +16692,6 @@ static struct ggml_cgraph * llama_build_graph(
|
|
| 16757 |
{
|
| 16758 |
result = llm.build_internlm2();
|
| 16759 |
} break;
|
| 16760 |
-
case LLM_ARCH_MINICPM:
|
| 16761 |
-
{
|
| 16762 |
-
result = llm.build_minicpm();
|
| 16763 |
-
} break;
|
| 16764 |
case LLM_ARCH_MINICPM3:
|
| 16765 |
{
|
| 16766 |
result = llm.build_minicpm3();
|
|
@@ -16797,9 +16728,9 @@ static struct ggml_cgraph * llama_build_graph(
|
|
| 16797 |
{
|
| 16798 |
result = llm.build_olmo();
|
| 16799 |
} break;
|
| 16800 |
-
case
|
| 16801 |
{
|
| 16802 |
-
result = llm.
|
| 16803 |
} break;
|
| 16804 |
case LLM_ARCH_OLMOE:
|
| 16805 |
{
|
|
@@ -17443,8 +17374,9 @@ static enum ggml_status llama_graph_compute(
|
|
| 17443 |
int n_threads,
|
| 17444 |
ggml_threadpool * threadpool) {
|
| 17445 |
if (lctx.backend_cpu != nullptr) {
|
| 17446 |
-
|
| 17447 |
-
|
|
|
|
| 17448 |
}
|
| 17449 |
|
| 17450 |
// set the number of threads for all the backends
|
|
@@ -18211,13 +18143,13 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
|
|
| 18211 |
static void llama_kv_cache_update_internal(struct llama_context & lctx) {
|
| 18212 |
bool need_reserve = false;
|
| 18213 |
|
| 18214 |
-
|
| 18215 |
-
if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
|
| 18216 |
if (!llama_kv_cache_can_shift(&lctx)) {
|
| 18217 |
-
GGML_ABORT("
|
| 18218 |
}
|
| 18219 |
|
| 18220 |
-
|
|
|
|
| 18221 |
ggml_backend_sched_reset(lctx.sched.get());
|
| 18222 |
|
| 18223 |
ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
|
|
@@ -19361,6 +19293,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
|
|
| 19361 |
//
|
| 19362 |
struct llama_model_params llama_model_default_params() {
|
| 19363 |
struct llama_model_params result = {
|
|
|
|
| 19364 |
/*.n_gpu_layers =*/ 0,
|
| 19365 |
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
| 19366 |
/*.main_gpu =*/ 0,
|
|
@@ -19478,7 +19411,11 @@ void llama_backend_init(void) {
|
|
| 19478 |
|
| 19479 |
void llama_numa_init(enum ggml_numa_strategy numa) {
|
| 19480 |
if (numa != GGML_NUMA_STRATEGY_DISABLED) {
|
| 19481 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19482 |
}
|
| 19483 |
}
|
| 19484 |
|
|
@@ -19569,19 +19506,24 @@ struct llama_model * llama_load_model_from_file(
|
|
| 19569 |
}
|
| 19570 |
|
| 19571 |
// create list of devices to use with this model
|
| 19572 |
-
|
| 19573 |
-
|
| 19574 |
-
|
| 19575 |
-
|
| 19576 |
-
|
| 19577 |
-
|
| 19578 |
-
|
| 19579 |
-
|
| 19580 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19581 |
|
| 19582 |
-
|
| 19583 |
-
|
| 19584 |
-
|
|
|
|
| 19585 |
}
|
| 19586 |
}
|
| 19587 |
|
|
@@ -19752,9 +19694,6 @@ struct llama_context * llama_new_context_with_model(
|
|
| 19752 |
__func__, n_ctx_per_seq, hparams.n_ctx_train);
|
| 19753 |
}
|
| 19754 |
|
| 19755 |
-
ctx->abort_callback = params.abort_callback;
|
| 19756 |
-
ctx->abort_callback_data = params.abort_callback_data;
|
| 19757 |
-
|
| 19758 |
ctx->logits_all = params.logits_all;
|
| 19759 |
|
| 19760 |
// build worst-case graph for encoder if a model contains encoder
|
|
@@ -19803,7 +19742,7 @@ struct llama_context * llama_new_context_with_model(
|
|
| 19803 |
}
|
| 19804 |
|
| 19805 |
// add CPU backend
|
| 19806 |
-
ctx->backend_cpu =
|
| 19807 |
if (ctx->backend_cpu == nullptr) {
|
| 19808 |
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
| 19809 |
llama_free(ctx);
|
|
@@ -19823,6 +19762,8 @@ struct llama_context * llama_new_context_with_model(
|
|
| 19823 |
}
|
| 19824 |
}
|
| 19825 |
|
|
|
|
|
|
|
| 19826 |
if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
|
| 19827 |
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
| 19828 |
llama_free(ctx);
|
|
@@ -19868,7 +19809,8 @@ struct llama_context * llama_new_context_with_model(
|
|
| 19868 |
std::vector<ggml_backend_t> backend_ptrs;
|
| 19869 |
for (auto & backend : ctx->backends) {
|
| 19870 |
auto * buft = ggml_backend_get_default_buffer_type(backend.get());
|
| 19871 |
-
|
|
|
|
| 19872 |
// use the host buffer of the first device CPU for faster transfer of the intermediate state
|
| 19873 |
auto * dev = model->devices[0];
|
| 19874 |
auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
|
|
@@ -19896,7 +19838,8 @@ struct llama_context * llama_new_context_with_model(
|
|
| 19896 |
// pipeline parallelism requires support for async compute and events in all devices
|
| 19897 |
if (pipeline_parallel) {
|
| 19898 |
for (auto & backend : ctx->backends) {
|
| 19899 |
-
|
|
|
|
| 19900 |
// ignore CPU backend
|
| 19901 |
continue;
|
| 19902 |
}
|
|
@@ -20070,7 +20013,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
| 20070 |
case LLM_ARCH_QWEN:
|
| 20071 |
case LLM_ARCH_QWEN2:
|
| 20072 |
case LLM_ARCH_QWEN2MOE:
|
| 20073 |
-
case
|
| 20074 |
case LLM_ARCH_OLMOE:
|
| 20075 |
case LLM_ARCH_PHI2:
|
| 20076 |
case LLM_ARCH_PHI3:
|
|
@@ -20463,7 +20406,7 @@ void llama_kv_cache_update(struct llama_context * ctx) {
|
|
| 20463 |
}
|
| 20464 |
|
| 20465 |
bool llama_kv_cache_can_shift(struct llama_context * ctx) {
|
| 20466 |
-
return ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
|
| 20467 |
}
|
| 20468 |
|
| 20469 |
// deprecated
|
|
@@ -21450,6 +21393,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) {
|
|
| 21450 |
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
|
| 21451 |
ctx->abort_callback = abort_callback;
|
| 21452 |
ctx->abort_callback_data = abort_callback_data;
|
|
|
|
|
|
|
| 21453 |
}
|
| 21454 |
|
| 21455 |
void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
|
|
@@ -21816,18 +21767,109 @@ int32_t llama_detokenize(
|
|
| 21816 |
// chat templates
|
| 21817 |
//
|
| 21818 |
|
|
|
|
|
|
|
|
|
| 21819 |
// Simple version of "llama_apply_chat_template" that only works with strings
|
| 21820 |
// This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
|
| 21821 |
static int32_t llama_chat_apply_template_internal(
|
| 21822 |
-
const
|
| 21823 |
const std::vector<const llama_chat_message *> & chat,
|
| 21824 |
std::string & dest, bool add_ass) {
|
| 21825 |
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
|
| 21826 |
std::stringstream ss;
|
| 21827 |
-
|
| 21828 |
-
return tmpl.find(haystack) != std::string::npos;
|
| 21829 |
-
};
|
| 21830 |
-
if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) {
|
| 21831 |
// chatml template
|
| 21832 |
for (auto message : chat) {
|
| 21833 |
ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
|
|
@@ -21835,16 +21877,59 @@ static int32_t llama_chat_apply_template_internal(
|
|
| 21835 |
if (add_ass) {
|
| 21836 |
ss << "<|im_start|>assistant\n";
|
| 21837 |
}
|
| 21838 |
-
} else if (tmpl ==
|
|
|
|
|
|
|
|
|
| 21839 |
// llama2 template and its variants
|
| 21840 |
// [variant] support system message
|
| 21841 |
-
|
| 21842 |
-
|
| 21843 |
-
bool space_around_response = tmpl_contains("' ' + eos_token");
|
| 21844 |
// [variant] add BOS inside history
|
| 21845 |
-
bool add_bos_inside_history =
|
| 21846 |
// [variant] trim spaces from the input message
|
| 21847 |
-
bool strip_message =
|
| 21848 |
// construct the prompt
|
| 21849 |
bool is_inside_turn = true; // skip BOS at the beginning
|
| 21850 |
ss << "[INST] ";
|
|
@@ -21865,12 +21950,11 @@ static int32_t llama_chat_apply_template_internal(
|
|
| 21865 |
} else if (role == "user") {
|
| 21866 |
ss << content << " [/INST]";
|
| 21867 |
} else {
|
| 21868 |
-
ss <<
|
| 21869 |
is_inside_turn = false;
|
| 21870 |
}
|
| 21871 |
}
|
| 21872 |
-
|
| 21873 |
-
} else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
|
| 21874 |
// Phi 3
|
| 21875 |
for (auto message : chat) {
|
| 21876 |
std::string role(message->role);
|
|
@@ -21879,7 +21963,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
| 21879 |
if (add_ass) {
|
| 21880 |
ss << "<|assistant|>\n";
|
| 21881 |
}
|
| 21882 |
-
} else if (tmpl ==
|
| 21883 |
// zephyr template
|
| 21884 |
for (auto message : chat) {
|
| 21885 |
ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
|
|
@@ -21887,7 +21971,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
| 21887 |
if (add_ass) {
|
| 21888 |
ss << "<|assistant|>\n";
|
| 21889 |
}
|
| 21890 |
-
} else if (tmpl ==
|
| 21891 |
// mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
|
| 21892 |
for (auto message : chat) {
|
| 21893 |
std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
|
|
@@ -21896,7 +21980,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
| 21896 |
if (add_ass) {
|
| 21897 |
ss << "<s>assistant\n";
|
| 21898 |
}
|
| 21899 |
-
} else if (tmpl ==
|
| 21900 |
// google/gemma-7b-it
|
| 21901 |
std::string system_prompt = "";
|
| 21902 |
for (auto message : chat) {
|
|
@@ -21918,7 +22002,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
| 21918 |
if (add_ass) {
|
| 21919 |
ss << "<start_of_turn>model\n";
|
| 21920 |
}
|
| 21921 |
-
} else if (tmpl ==
|
| 21922 |
// OrionStarAI/Orion-14B-Chat
|
| 21923 |
std::string system_prompt = "";
|
| 21924 |
for (auto message : chat) {
|
|
@@ -21938,7 +22022,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
| 21938 |
ss << message->content << "</s>";
|
| 21939 |
}
|
| 21940 |
}
|
| 21941 |
-
} else if (tmpl ==
|
| 21942 |
// openchat/openchat-3.5-0106,
|
| 21943 |
for (auto message : chat) {
|
| 21944 |
std::string role(message->role);
|
|
@@ -21952,13 +22036,13 @@ static int32_t llama_chat_apply_template_internal(
|
|
| 21952 |
if (add_ass) {
|
| 21953 |
ss << "GPT4 Correct Assistant:";
|
| 21954 |
}
|
| 21955 |
-
} else if (tmpl ==
|
| 21956 |
// eachadea/vicuna-13b-1.1 (and Orca variant)
|
| 21957 |
for (auto message : chat) {
|
| 21958 |
std::string role(message->role);
|
| 21959 |
if (role == "system") {
|
| 21960 |
// Orca-Vicuna variant uses a system prefix
|
| 21961 |
-
if (tmpl ==
|
| 21962 |
ss << "SYSTEM: " << message->content << "\n";
|
| 21963 |
} else {
|
| 21964 |
ss << message->content << "\n\n";
|
|
@@ -21972,7 +22056,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
| 21972 |
if (add_ass) {
|
| 21973 |
ss << "ASSISTANT:";
|
| 21974 |
}
|
| 21975 |
-
} else if (tmpl ==
|
| 21976 |
// deepseek-ai/deepseek-coder-33b-instruct
|
| 21977 |
for (auto message : chat) {
|
| 21978 |
std::string role(message->role);
|
|
@@ -21987,7 +22071,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
| 21987 |
if (add_ass) {
|
| 21988 |
ss << "### Response:\n";
|
| 21989 |
}
|
| 21990 |
-
} else if (tmpl ==
|
| 21991 |
// CohereForAI/c4ai-command-r-plus
|
| 21992 |
for (auto message : chat) {
|
| 21993 |
std::string role(message->role);
|
|
@@ -22002,7 +22086,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
| 22002 |
if (add_ass) {
|
| 22003 |
ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
|
| 22004 |
}
|
| 22005 |
-
} else if (tmpl ==
|
| 22006 |
// Llama 3
|
| 22007 |
for (auto message : chat) {
|
| 22008 |
std::string role(message->role);
|
|
@@ -22011,7 +22095,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
| 22011 |
if (add_ass) {
|
| 22012 |
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
|
| 22013 |
}
|
| 22014 |
-
} else if (tmpl ==
|
| 22015 |
// chatglm3-6b
|
| 22016 |
ss << "[gMASK]" << "sop";
|
| 22017 |
for (auto message : chat) {
|
|
@@ -22021,7 +22105,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
| 22021 |
if (add_ass) {
|
| 22022 |
ss << "<|assistant|>";
|
| 22023 |
}
|
| 22024 |
-
} else if (tmpl ==
|
| 22025 |
ss << "[gMASK]" << "<sop>";
|
| 22026 |
for (auto message : chat) {
|
| 22027 |
std::string role(message->role);
|
|
@@ -22030,7 +22114,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
| 22030 |
if (add_ass) {
|
| 22031 |
ss << "<|assistant|>";
|
| 22032 |
}
|
| 22033 |
-
} else if (tmpl ==
|
| 22034 |
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
| 22035 |
for (auto message : chat) {
|
| 22036 |
std::string role(message->role);
|
|
@@ -22042,7 +22126,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
| 22042 |
ss << trim(message->content);
|
| 22043 |
}
|
| 22044 |
}
|
| 22045 |
-
} else if (tmpl ==
|
| 22046 |
// DeepSeek-V2
|
| 22047 |
for (auto message : chat) {
|
| 22048 |
std::string role(message->role);
|
|
@@ -22057,7 +22141,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
| 22057 |
if (add_ass) {
|
| 22058 |
ss << "Assistant:";
|
| 22059 |
}
|
| 22060 |
-
} else if (tmpl ==
|
| 22061 |
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
|
| 22062 |
// EXAONE-3.0-7.8B-Instruct
|
| 22063 |
for (auto message : chat) {
|
|
@@ -22073,7 +22157,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
| 22073 |
if (add_ass) {
|
| 22074 |
ss << "[|assistant|]";
|
| 22075 |
}
|
| 22076 |
-
} else if (tmpl ==
|
| 22077 |
// this template requires the model to have "\n\n" as EOT token
|
| 22078 |
for (auto message : chat) {
|
| 22079 |
std::string role(message->role);
|
|
@@ -22083,7 +22167,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
| 22083 |
ss << message->content << "\n\n";
|
| 22084 |
}
|
| 22085 |
}
|
| 22086 |
-
} else if (tmpl ==
|
| 22087 |
// IBM Granite template
|
| 22088 |
for (const auto & message : chat) {
|
| 22089 |
std::string role(message->role);
|
|
@@ -22135,7 +22219,11 @@ int32_t llama_chat_apply_template(
|
|
| 22135 |
}
|
| 22136 |
|
| 22137 |
std::string formatted_chat;
|
| 22138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22139 |
if (res < 0) {
|
| 22140 |
return res;
|
| 22141 |
}
|
|
@@ -22145,6 +22233,15 @@ int32_t llama_chat_apply_template(
|
|
| 22145 |
return res;
|
| 22146 |
}
|
| 22147 |
|
|
|
|
|
|
| 22148 |
//
|
| 22149 |
// sampling
|
| 22150 |
//
|
|
@@ -22191,32 +22288,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
|
|
| 22191 |
}
|
| 22192 |
|
| 22193 |
const char * llama_print_system_info(void) {
|
| 22194 |
-
ggml_cpu_init(); // some ARM features are detected at runtime
|
| 22195 |
-
|
| 22196 |
static std::string s;
|
| 22197 |
|
| 22198 |
-
|
| 22199 |
-
|
| 22200 |
-
|
| 22201 |
-
|
| 22202 |
-
|
| 22203 |
-
|
| 22204 |
-
|
| 22205 |
-
|
| 22206 |
-
|
| 22207 |
-
|
| 22208 |
-
|
| 22209 |
-
|
| 22210 |
-
|
| 22211 |
-
|
| 22212 |
-
|
| 22213 |
-
s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | ";
|
| 22214 |
-
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
| 22215 |
-
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
| 22216 |
-
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
|
| 22217 |
-
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
| 22218 |
-
s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
|
| 22219 |
-
s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | ";
|
| 22220 |
|
| 22221 |
return s.c_str();
|
| 22222 |
}
|
|
|
|
| 179 |
LLM_ARCH_COMMAND_R,
|
| 180 |
LLM_ARCH_DBRX,
|
| 181 |
LLM_ARCH_OLMO,
|
| 182 |
+
LLM_ARCH_OLMO2,
|
| 183 |
LLM_ARCH_OLMOE,
|
| 184 |
LLM_ARCH_OPENELM,
|
| 185 |
LLM_ARCH_ARCTIC,
|
|
|
|
| 233 |
{ LLM_ARCH_COMMAND_R, "command-r" },
|
| 234 |
{ LLM_ARCH_DBRX, "dbrx" },
|
| 235 |
{ LLM_ARCH_OLMO, "olmo" },
|
| 236 |
+
{ LLM_ARCH_OLMO2, "olmo2" },
|
| 237 |
{ LLM_ARCH_OLMOE, "olmoe" },
|
| 238 |
{ LLM_ARCH_OPENELM, "openelm" },
|
| 239 |
{ LLM_ARCH_ARCTIC, "arctic" },
|
|
|
|
| 1036 |
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
| 1037 |
{ LLM_TENSOR_OUTPUT, "output" },
|
| 1038 |
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
| 1039 |
+
{ LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
|
| 1040 |
+
{ LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
|
| 1041 |
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
| 1042 |
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
| 1043 |
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
|
|
| 1212 |
},
|
| 1213 |
},
|
| 1214 |
{
|
| 1215 |
+
LLM_ARCH_OLMO2,
|
| 1216 |
{
|
| 1217 |
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
| 1218 |
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
|
|
| 1551 |
},
|
| 1552 |
};
|
| 1553 |
|
| 1554 |
+
enum llm_chat_template {
|
| 1555 |
+
LLM_CHAT_TEMPLATE_CHATML,
|
| 1556 |
+
LLM_CHAT_TEMPLATE_LLAMA_2,
|
| 1557 |
+
LLM_CHAT_TEMPLATE_LLAMA_2_SYS,
|
| 1558 |
+
LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS,
|
| 1559 |
+
LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP,
|
| 1560 |
+
LLM_CHAT_TEMPLATE_MISTRAL_V1,
|
| 1561 |
+
LLM_CHAT_TEMPLATE_MISTRAL_V3,
|
| 1562 |
+
LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
|
| 1563 |
+
LLM_CHAT_TEMPLATE_MISTRAL_V7,
|
| 1564 |
+
LLM_CHAT_TEMPLATE_PHI_3,
|
| 1565 |
+
LLM_CHAT_TEMPLATE_ZEPHYR,
|
| 1566 |
+
LLM_CHAT_TEMPLATE_MONARCH,
|
| 1567 |
+
LLM_CHAT_TEMPLATE_GEMMA,
|
| 1568 |
+
LLM_CHAT_TEMPLATE_ORION,
|
| 1569 |
+
LLM_CHAT_TEMPLATE_OPENCHAT,
|
| 1570 |
+
LLM_CHAT_TEMPLATE_VICUNA,
|
| 1571 |
+
LLM_CHAT_TEMPLATE_VICUNA_ORCA,
|
| 1572 |
+
LLM_CHAT_TEMPLATE_DEEPSEEK,
|
| 1573 |
+
LLM_CHAT_TEMPLATE_DEEPSEEK_2,
|
| 1574 |
+
LLM_CHAT_TEMPLATE_COMMAND_R,
|
| 1575 |
+
LLM_CHAT_TEMPLATE_LLAMA_3,
|
| 1576 |
+
LLM_CHAT_TEMPLATE_CHATGML_3,
|
| 1577 |
+
LLM_CHAT_TEMPLATE_CHATGML_4,
|
| 1578 |
+
LLM_CHAT_TEMPLATE_MINICPM,
|
| 1579 |
+
LLM_CHAT_TEMPLATE_EXAONE_3,
|
| 1580 |
+
LLM_CHAT_TEMPLATE_RWKV_WORLD,
|
| 1581 |
+
LLM_CHAT_TEMPLATE_GRANITE,
|
| 1582 |
+
LLM_CHAT_TEMPLATE_UNKNOWN,
|
| 1583 |
+
};
|
| 1584 |
+
|
| 1585 |
+
static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
| 1586 |
+
{ "chatml", LLM_CHAT_TEMPLATE_CHATML },
|
| 1587 |
+
{ "llama2", LLM_CHAT_TEMPLATE_LLAMA_2 },
|
| 1588 |
+
{ "llama2-sys", LLM_CHAT_TEMPLATE_LLAMA_2_SYS },
|
| 1589 |
+
{ "llama2-sys-bos", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS },
|
| 1590 |
+
{ "llama2-sys-strip", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP },
|
| 1591 |
+
{ "mistral-v1", LLM_CHAT_TEMPLATE_MISTRAL_V1 },
|
| 1592 |
+
{ "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
|
| 1593 |
+
{ "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
|
| 1594 |
+
{ "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
|
| 1595 |
+
{ "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
|
| 1596 |
+
{ "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
|
| 1597 |
+
{ "monarch", LLM_CHAT_TEMPLATE_MONARCH },
|
| 1598 |
+
{ "gemma", LLM_CHAT_TEMPLATE_GEMMA },
|
| 1599 |
+
{ "orion", LLM_CHAT_TEMPLATE_ORION },
|
| 1600 |
+
{ "openchat", LLM_CHAT_TEMPLATE_OPENCHAT },
|
| 1601 |
+
{ "vicuna", LLM_CHAT_TEMPLATE_VICUNA },
|
| 1602 |
+
{ "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA },
|
| 1603 |
+
{ "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
|
| 1604 |
+
{ "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
|
| 1605 |
+
{ "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
|
| 1606 |
+
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
|
| 1607 |
+
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
|
| 1608 |
+
{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
|
| 1609 |
+
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
|
| 1610 |
+
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
|
| 1611 |
+
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
|
| 1612 |
+
{ "granite", LLM_CHAT_TEMPLATE_GRANITE },
|
| 1613 |
+
};
|
| 1614 |
+
|
| 1615 |
static llm_arch llm_arch_from_string(const std::string & name) {
|
| 1616 |
for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
|
| 1617 |
if (kv.second == name) {
|
|
|
|
| 1685 |
//
|
| 1686 |
|
| 1687 |
static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
|
| 1688 |
+
{ LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
|
| 1689 |
+
{ LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
|
| 1690 |
+
{ LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
|
| 1691 |
+
{ LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
|
| 1692 |
};
|
| 1693 |
|
| 1694 |
static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
|
|
|
|
| 2405 |
MODEL_16B,
|
| 2406 |
MODEL_20B,
|
| 2407 |
MODEL_30B,
|
| 2408 |
+
MODEL_32B,
|
| 2409 |
MODEL_34B,
|
| 2410 |
MODEL_35B,
|
| 2411 |
MODEL_40B,
|
|
|
|
| 4931 |
mappings.reserve(files.size());
|
| 4932 |
mmaps_used.reserve(files.size());
|
| 4933 |
for (const auto & file : files) {
|
| 4934 |
+
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
|
| 4935 |
+
auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
|
| 4936 |
+
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
|
| 4937 |
mmaps_used.emplace_back(mapping->size, 0);
|
| 4938 |
if (mlock_mmaps) {
|
| 4939 |
std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
|
|
|
|
| 5395 |
case MODEL_16B: return "16B";
|
| 5396 |
case MODEL_20B: return "20B";
|
| 5397 |
case MODEL_30B: return "30B";
|
| 5398 |
+
case MODEL_32B: return "32B";
|
| 5399 |
case MODEL_34B: return "34B";
|
| 5400 |
case MODEL_35B: return "35B";
|
| 5401 |
case MODEL_40B: return "40B";
|
|
|
|
| 5583 |
case LLM_ARCH_MINICPM:
|
| 5584 |
{
|
| 5585 |
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
| 5586 |
+
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
|
| 5587 |
+
ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
|
| 5588 |
+
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
| 5589 |
|
| 5590 |
switch (hparams.n_layer) {
|
| 5591 |
+
case 52: model.type = e_model::MODEL_1B; break;
|
| 5592 |
case 40: model.type = e_model::MODEL_2B; break;
|
| 5593 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 5594 |
}
|
|
|
|
| 5760 |
case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
|
| 5761 |
case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
|
| 5762 |
case 32: model.type = e_model::MODEL_7B; break;
|
| 5763 |
+
case 36: model.type = e_model::MODEL_3B; break;
|
| 5764 |
case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
|
| 5765 |
+
case 48: model.type = e_model::MODEL_14B; break;
|
| 5766 |
+
case 64: model.type = e_model::MODEL_32B; break;
|
| 5767 |
case 80: model.type = e_model::MODEL_70B; break;
|
| 5768 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 5769 |
}
|
|
|
|
| 5973 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 5974 |
}
|
| 5975 |
} break;
|
| 5976 |
+
case LLM_ARCH_OLMO2:
|
| 5977 |
{
|
| 5978 |
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
| 5979 |
|
|
|
|
| 7072 |
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
| 7073 |
}
|
| 7074 |
|
| 7075 |
+
if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
|
| 7076 |
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
|
| 7077 |
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
|
| 7078 |
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
|
|
|
|
| 7256 |
} break;
|
| 7257 |
case GGML_OP_ADD:
|
| 7258 |
{
|
| 7259 |
+
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
|
| 7260 |
op_tensor = ggml_add(ctx, a, w);
|
| 7261 |
} break;
|
| 7262 |
case GGML_OP_MUL:
|
| 7263 |
{
|
| 7264 |
+
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
|
| 7265 |
op_tensor = ggml_mul(ctx, a, w);
|
| 7266 |
} break;
|
| 7267 |
case GGML_OP_DIV:
|
|
|
|
| 7697 |
|
| 7698 |
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
| 7699 |
|
| 7700 |
+
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
|
| 7701 |
+
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
| 7702 |
+
layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
| 7703 |
+
}
|
| 7704 |
+
else {
|
| 7705 |
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
| 7706 |
+
}
|
| 7707 |
|
| 7708 |
if (n_expert == 0) {
|
| 7709 |
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
|
|
| 8672 |
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
| 8673 |
}
|
| 8674 |
} break;
|
| 8675 |
+
case LLM_ARCH_OLMO2:
|
| 8676 |
{
|
| 8677 |
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
| 8678 |
|
|
|
|
| 9271 |
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
| 9272 |
if (!dev) {
|
| 9273 |
// FIXME: workaround for CPU backend buft having a NULL device
|
| 9274 |
+
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
| 9275 |
}
|
| 9276 |
ggml_backend_dev_props props;
|
| 9277 |
ggml_backend_dev_get_props(dev, &props);
|
|
|
|
| 13510 |
return gf;
|
| 13511 |
}
|
| 13512 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13513 |
struct ggml_cgraph * build_minicpm3() {
|
| 13514 |
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
| 13515 |
|
|
|
|
| 14415 |
return gf;
|
| 14416 |
}
|
| 14417 |
|
| 14418 |
+
struct ggml_cgraph * build_olmo2() {
|
| 14419 |
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
| 14420 |
|
| 14421 |
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
|
|
|
| 16608 |
|
| 16609 |
switch (model.arch) {
|
| 16610 |
case LLM_ARCH_LLAMA:
|
| 16611 |
+
case LLM_ARCH_MINICPM:
|
| 16612 |
case LLM_ARCH_GRANITE:
|
| 16613 |
case LLM_ARCH_GRANITE_MOE:
|
| 16614 |
{
|
|
|
|
| 16692 |
{
|
| 16693 |
result = llm.build_internlm2();
|
| 16694 |
} break;
|
|
|
|
|
| 16695 |
case LLM_ARCH_MINICPM3:
|
| 16696 |
{
|
| 16697 |
result = llm.build_minicpm3();
|
|
|
|
| 16728 |
{
|
| 16729 |
result = llm.build_olmo();
|
| 16730 |
} break;
|
| 16731 |
+
case LLM_ARCH_OLMO2:
|
| 16732 |
{
|
| 16733 |
+
result = llm.build_olmo2();
|
| 16734 |
} break;
|
| 16735 |
case LLM_ARCH_OLMOE:
|
| 16736 |
{
|
|
|
|
| 17374 |
int n_threads,
|
| 17375 |
ggml_threadpool * threadpool) {
|
| 17376 |
if (lctx.backend_cpu != nullptr) {
|
| 17377 |
+
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu));
|
| 17378 |
+
auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
|
| 17379 |
+
set_threadpool_fn(lctx.backend_cpu, threadpool);
|
| 17380 |
}
|
| 17381 |
|
| 17382 |
// set the number of threads for all the backends
|
|
|
|
| 18143 |
static void llama_kv_cache_update_internal(struct llama_context & lctx) {
|
| 18144 |
bool need_reserve = false;
|
| 18145 |
|
| 18146 |
+
if (lctx.kv_self.has_shift) {
|
|
|
|
| 18147 |
if (!llama_kv_cache_can_shift(&lctx)) {
|
| 18148 |
+
GGML_ABORT("The current context does not support K-shift");
|
| 18149 |
}
|
| 18150 |
|
| 18151 |
+
// apply K-shift if needed
|
| 18152 |
+
if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
|
| 18153 |
ggml_backend_sched_reset(lctx.sched.get());
|
| 18154 |
|
| 18155 |
ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
|
|
|
|
| 19293 |
//
|
| 19294 |
struct llama_model_params llama_model_default_params() {
|
| 19295 |
struct llama_model_params result = {
|
| 19296 |
+
/*.devices =*/ nullptr,
|
| 19297 |
/*.n_gpu_layers =*/ 0,
|
| 19298 |
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
| 19299 |
/*.main_gpu =*/ 0,
|
|
|
|
| 19411 |
|
| 19412 |
void llama_numa_init(enum ggml_numa_strategy numa) {
|
| 19413 |
if (numa != GGML_NUMA_STRATEGY_DISABLED) {
|
| 19414 |
+
auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
| 19415 |
+
GGML_ASSERT(dev && "CPU backend is not loaded");
|
| 19416 |
+
auto * reg = ggml_backend_dev_backend_reg(dev);
|
| 19417 |
+
auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
|
| 19418 |
+
numa_init_fn(numa);
|
| 19419 |
}
|
| 19420 |
}
|
| 19421 |
|
|
|
|
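The CPU-specific calls in this region (NUMA init here, the threadpool setter in llama_graph_compute, and the abort callback in llama_set_abort_callback) now go through the backend registry instead of calling the CPU backend symbols directly: the device is found with ggml_backend_dev_by_type, its registry entry with ggml_backend_dev_backend_reg, and the function pointer with ggml_backend_reg_get_proc_address. Below is a small sketch of that lookup pattern using only symbols that appear in this diff; the check_numa wrapper name is made up for illustration.

    // Hedged sketch of the registry lookup pattern used in this hunk.
    #include "ggml-backend.h"

    static bool check_numa(void) {
        ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (!dev) {
            return false; // CPU backend not loaded
        }
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        // resolve the CPU-only entry point at runtime instead of linking it directly
        auto * is_numa_fn = (bool (*)(void)) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
        return is_numa_fn ? is_numa_fn() : false;
    }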
| 19506 |
}
|
| 19507 |
|
| 19508 |
// create list of devices to use with this model
|
| 19509 |
+
if (params.devices) {
|
| 19510 |
+
for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
|
| 19511 |
+
model->devices.push_back(*dev);
|
| 19512 |
+
}
|
| 19513 |
+
} else {
|
| 19514 |
+
// use all available devices
|
| 19515 |
+
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
| 19516 |
+
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
| 19517 |
+
switch (ggml_backend_dev_type(dev)) {
|
| 19518 |
+
case GGML_BACKEND_DEVICE_TYPE_CPU:
|
| 19519 |
+
case GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
| 19520 |
+
// skip CPU backends since they are handled separately
|
| 19521 |
+
break;
|
| 19522 |
|
| 19523 |
+
case GGML_BACKEND_DEVICE_TYPE_GPU:
|
| 19524 |
+
model->devices.push_back(dev);
|
| 19525 |
+
break;
|
| 19526 |
+
}
|
| 19527 |
}
|
| 19528 |
}
|
| 19529 |
|
|
|
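llama_model_params gains a devices field, a null-terminated array of ggml_backend_dev_t; when it is left as nullptr the loop above falls back to every available GPU device. A short sketch of restricting a model to a single device follows, using only functions that appear in this diff; the load_on_first_gpu helper, the n_gpu_layers value, and the model path are illustrative assumptions.

    // Hedged sketch: pass an explicit device list when loading a model.
    #include <vector>
    #include "ggml-backend.h"
    #include "llama.h"

    static llama_model * load_on_first_gpu(const char * path /* e.g. a .gguf file */) {
        std::vector<ggml_backend_dev_t> devs;
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
                devs.push_back(dev);
                break; // keep only the first GPU found
            }
        }
        devs.push_back(nullptr); // the list is null-terminated

        llama_model_params mparams = llama_model_default_params();
        mparams.devices      = devs.data();
        mparams.n_gpu_layers = 999; // offload all layers to the selected device
        return llama_load_model_from_file(path, mparams);
    }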
|
| 19694 |
__func__, n_ctx_per_seq, hparams.n_ctx_train);
|
| 19695 |
}
|
| 19696 |
|
|
|
|
|
|
|
|
|
|
| 19697 |
ctx->logits_all = params.logits_all;
|
| 19698 |
|
| 19699 |
// build worst-case graph for encoder if a model contains encoder
|
|
|
|
| 19742 |
}
|
| 19743 |
|
| 19744 |
// add CPU backend
|
| 19745 |
+
ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
|
| 19746 |
if (ctx->backend_cpu == nullptr) {
|
| 19747 |
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
| 19748 |
llama_free(ctx);
|
|
|
|
| 19762 |
}
|
| 19763 |
}
|
| 19764 |
|
| 19765 |
+
llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
|
| 19766 |
+
|
| 19767 |
if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
|
| 19768 |
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
| 19769 |
llama_free(ctx);
|
|
|
|
| 19809 |
std::vector<ggml_backend_t> backend_ptrs;
|
| 19810 |
for (auto & backend : ctx->backends) {
|
| 19811 |
auto * buft = ggml_backend_get_default_buffer_type(backend.get());
|
| 19812 |
+
auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
|
| 19813 |
+
if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
|
| 19814 |
// use the host buffer of the first device CPU for faster transfer of the intermediate state
|
| 19815 |
auto * dev = model->devices[0];
|
| 19816 |
auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
|
|
|
|
| 19838 |
// pipeline parallelism requires support for async compute and events in all devices
|
| 19839 |
if (pipeline_parallel) {
|
| 19840 |
for (auto & backend : ctx->backends) {
|
| 19841 |
+
auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
|
| 19842 |
+
if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
| 19843 |
// ignore CPU backend
|
| 19844 |
continue;
|
| 19845 |
}
|
|
|
|
| 20013 |
case LLM_ARCH_QWEN:
|
| 20014 |
case LLM_ARCH_QWEN2:
|
| 20015 |
case LLM_ARCH_QWEN2MOE:
|
| 20016 |
+
case LLM_ARCH_OLMO2:
|
| 20017 |
case LLM_ARCH_OLMOE:
|
| 20018 |
case LLM_ARCH_PHI2:
|
| 20019 |
case LLM_ARCH_PHI3:
|
|
|
|
| 20406 |
}
|
| 20407 |
|
| 20408 |
bool llama_kv_cache_can_shift(struct llama_context * ctx) {
|
| 20409 |
+
return !ctx->kv_self.recurrent && ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
|
| 20410 |
}
|
| 20411 |
|
| 20412 |
// deprecated
|
|
|
|
| 21393 |
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
|
| 21394 |
ctx->abort_callback = abort_callback;
|
| 21395 |
ctx->abort_callback_data = abort_callback_data;
|
| 21396 |
+
|
| 21397 |
+
for (auto & backend : ctx->backends) {
|
| 21398 |
+
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
|
| 21399 |
+
auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
|
| 21400 |
+
if (set_abort_callback_fn) {
|
| 21401 |
+
set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data);
|
| 21402 |
+
}
|
| 21403 |
+
}
|
| 21404 |
}
|
| 21405 |
|
| 21406 |
void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
|
|
|
|
| 21767 |
// chat templates
|
| 21768 |
//
|
| 21769 |
|
| 21770 |
+
static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
|
| 21771 |
+
if (LLM_CHAT_TEMPLATES.find(tmpl) != LLM_CHAT_TEMPLATES.end()) {
|
| 21772 |
+
return LLM_CHAT_TEMPLATES.at(tmpl);
|
| 21773 |
+
}
|
| 21774 |
+
auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
|
| 21775 |
+
return tmpl.find(haystack) != std::string::npos;
|
| 21776 |
+
};
|
| 21777 |
+
if (tmpl_contains("<|im_start|>")) {
|
| 21778 |
+
return LLM_CHAT_TEMPLATE_CHATML;
|
| 21779 |
+
} else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
|
| 21780 |
+
if (tmpl_contains("[SYSTEM_PROMPT]")) {
|
| 21781 |
+
return LLM_CHAT_TEMPLATE_MISTRAL_V7;
|
| 21782 |
+
} else if (
|
| 21783 |
+
// catches official 'v1' template
|
| 21784 |
+
tmpl_contains("' [INST] ' + system_message")
|
| 21785 |
+
// catches official 'v3' and 'v3-tekken' templates
|
| 21786 |
+
|| tmpl_contains("[AVAILABLE_TOOLS]")
|
| 21787 |
+
) {
|
| 21788 |
+
// Official mistral 'v1', 'v3' and 'v3-tekken' templates
|
| 21789 |
+
// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
|
| 21790 |
+
// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
|
| 21791 |
+
if (tmpl_contains(" [INST]")) {
|
| 21792 |
+
return LLM_CHAT_TEMPLATE_MISTRAL_V1;
|
| 21793 |
+
} else if (tmpl_contains("\"[INST]\"")) {
|
| 21794 |
+
return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN;
|
| 21795 |
+
}
|
| 21796 |
+
return LLM_CHAT_TEMPLATE_MISTRAL_V3;
|
| 21797 |
+
} else {
|
| 21798 |
+
// llama2 template and its variants
|
| 21799 |
+
// [variant] support system message
|
| 21800 |
+
// See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
|
| 21801 |
+
bool support_system_message = tmpl_contains("<<SYS>>");
|
| 21802 |
+
bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
|
| 21803 |
+
bool strip_message = tmpl_contains("content.strip()");
|
| 21804 |
+
if (strip_message) {
|
| 21805 |
+
return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
|
| 21806 |
+
} else if (add_bos_inside_history) {
|
| 21807 |
+
return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
|
| 21808 |
+
} else if (support_system_message) {
|
| 21809 |
+
return LLM_CHAT_TEMPLATE_LLAMA_2_SYS;
|
| 21810 |
+
} else {
|
| 21811 |
+
return LLM_CHAT_TEMPLATE_LLAMA_2;
|
| 21812 |
+
}
|
| 21813 |
+
}
|
| 21814 |
+
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
|
| 21815 |
+
return LLM_CHAT_TEMPLATE_PHI_3;
|
| 21816 |
+
} else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
|
| 21817 |
+
return LLM_CHAT_TEMPLATE_ZEPHYR;
|
| 21818 |
+
} else if (tmpl_contains("bos_token + message['role']")) {
|
| 21819 |
+
return LLM_CHAT_TEMPLATE_MONARCH;
|
| 21820 |
+
} else if (tmpl_contains("<start_of_turn>")) {
|
| 21821 |
+
return LLM_CHAT_TEMPLATE_GEMMA;
|
| 21822 |
+
} else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
|
| 21823 |
+
// OrionStarAI/Orion-14B-Chat
|
| 21824 |
+
return LLM_CHAT_TEMPLATE_ORION;
|
| 21825 |
+
} else if (tmpl_contains("GPT4 Correct ")) {
|
| 21826 |
+
// openchat/openchat-3.5-0106
|
| 21827 |
+
return LLM_CHAT_TEMPLATE_OPENCHAT;
|
| 21828 |
+
} else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) {
|
| 21829 |
+
// eachadea/vicuna-13b-1.1 (and Orca variant)
|
| 21830 |
+
if (tmpl_contains("SYSTEM: ")) {
|
| 21831 |
+
return LLM_CHAT_TEMPLATE_VICUNA_ORCA;
|
| 21832 |
+
}
|
| 21833 |
+
return LLM_CHAT_TEMPLATE_VICUNA;
|
| 21834 |
+
} else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) {
|
| 21835 |
+
// deepseek-ai/deepseek-coder-33b-instruct
|
| 21836 |
+
return LLM_CHAT_TEMPLATE_DEEPSEEK;
|
| 21837 |
+
} else if (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) {
|
| 21838 |
+
// CohereForAI/c4ai-command-r-plus
|
| 21839 |
+
return LLM_CHAT_TEMPLATE_COMMAND_R;
|
| 21840 |
+
} else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) {
|
| 21841 |
+
return LLM_CHAT_TEMPLATE_LLAMA_3;
|
| 21842 |
+
} else if (tmpl_contains("[gMASK]sop")) {
|
| 21843 |
+
// chatglm3-6b
|
| 21844 |
+
return LLM_CHAT_TEMPLATE_CHATGML_3;
|
| 21845 |
+
} else if (tmpl_contains("[gMASK]<sop>")) {
|
| 21846 |
+
return LLM_CHAT_TEMPLATE_CHATGML_4;
|
| 21847 |
+
} else if (tmpl_contains(LU8("<用户>"))) {
|
| 21848 |
+
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
| 21849 |
+
return LLM_CHAT_TEMPLATE_MINICPM;
|
| 21850 |
+
} else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
|
| 21851 |
+
return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
|
| 21852 |
+
} else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
|
| 21853 |
+
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
|
| 21854 |
+
// EXAONE-3.0-7.8B-Instruct
|
| 21855 |
+
return LLM_CHAT_TEMPLATE_EXAONE_3;
|
| 21856 |
+
} else if (tmpl_contains("rwkv-world")) {
|
| 21857 |
+
return LLM_CHAT_TEMPLATE_RWKV_WORLD;
|
| 21858 |
+
} else if (tmpl_contains("<|start_of_role|>")) {
|
| 21859 |
+
return LLM_CHAT_TEMPLATE_GRANITE;
|
| 21860 |
+
}
|
| 21861 |
+
return LLM_CHAT_TEMPLATE_UNKNOWN;
|
| 21862 |
+
}
|
| 21863 |
+
|
| 21864 |
// Simple version of "llama_apply_chat_template" that only works with strings
|
| 21865 |
// This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
|
| 21866 |
static int32_t llama_chat_apply_template_internal(
|
| 21867 |
+
const llm_chat_template tmpl,
|
| 21868 |
const std::vector<const llama_chat_message *> & chat,
|
| 21869 |
std::string & dest, bool add_ass) {
|
| 21870 |
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
|
| 21871 |
std::stringstream ss;
|
| 21872 |
+
if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
|
|
|
|
|
| 21873 |
// chatml template
|
| 21874 |
for (auto message : chat) {
|
| 21875 |
ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
|
|
|
|
| 21877 |
if (add_ass) {
|
| 21878 |
ss << "<|im_start|>assistant\n";
|
| 21879 |
}
|
| 21880 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
|
| 21881 |
+
// Official mistral 'v7' template
|
| 21882 |
+
// See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
|
| 21883 |
+
for (auto message : chat) {
|
| 21884 |
+
std::string role(message->role);
|
| 21885 |
+
std::string content(message->content);
|
| 21886 |
+
if (role == "system") {
|
| 21887 |
+
ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
|
| 21888 |
+
} else if (role == "user") {
|
| 21889 |
+
ss << "[INST] " << content << "[/INST]";
|
| 21890 |
+
}
|
| 21891 |
+
else {
|
| 21892 |
+
ss << " " << content << "</s>";
|
| 21893 |
+
}
|
| 21894 |
+
}
|
| 21895 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
|
| 21896 |
+
|| tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3
|
| 21897 |
+
|| tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) {
|
| 21898 |
+
// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
|
| 21899 |
+
// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
|
| 21900 |
+
std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : "";
|
| 21901 |
+
std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " ";
|
| 21902 |
+
bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3;
|
| 21903 |
+
bool is_inside_turn = false;
|
| 21904 |
+
for (auto message : chat) {
|
| 21905 |
+
if (!is_inside_turn) {
|
| 21906 |
+
ss << leading_space << "[INST]" << trailing_space;
|
| 21907 |
+
is_inside_turn = true;
|
| 21908 |
+
}
|
| 21909 |
+
std::string role(message->role);
|
| 21910 |
+
std::string content(message->content);
|
| 21911 |
+
if (role == "system") {
|
| 21912 |
+
ss << content << "\n\n";
|
| 21913 |
+
} else if (role == "user") {
|
| 21914 |
+
ss << content << leading_space << "[/INST]";
|
| 21915 |
+
} else {
|
| 21916 |
+
ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
|
| 21917 |
+
is_inside_turn = false;
|
| 21918 |
+
}
|
| 21919 |
+
}
|
| 21920 |
+
} else if (
|
| 21921 |
+
tmpl == LLM_CHAT_TEMPLATE_LLAMA_2
|
| 21922 |
+
|| tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS
|
| 21923 |
+
|| tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS
|
| 21924 |
+
|| tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) {
|
| 21925 |
// llama2 template and its variants
|
| 21926 |
// [variant] support system message
|
| 21927 |
+
// See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
|
| 21928 |
+
bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2;
|
|
|
|
| 21929 |
// [variant] add BOS inside history
|
| 21930 |
+
bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
|
| 21931 |
// [variant] trim spaces from the input message
|
| 21932 |
+
bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
|
| 21933 |
// construct the prompt
|
| 21934 |
bool is_inside_turn = true; // skip BOS at the beginning
|
| 21935 |
ss << "[INST] ";
|
|
|
|
| 21950 |
} else if (role == "user") {
|
| 21951 |
ss << content << " [/INST]";
|
| 21952 |
} else {
|
| 21953 |
+
ss << content << "</s>";
|
| 21954 |
is_inside_turn = false;
|
| 21955 |
}
|
| 21956 |
}
|
| 21957 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) {
|
|
|
|
| 21958 |
// Phi 3
|
| 21959 |
for (auto message : chat) {
|
| 21960 |
std::string role(message->role);
|
|
|
|
| 21963 |
if (add_ass) {
|
| 21964 |
ss << "<|assistant|>\n";
|
| 21965 |
}
|
| 21966 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
|
| 21967 |
// zephyr template
|
| 21968 |
for (auto message : chat) {
|
| 21969 |
ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
|
|
|
|
| 21971 |
if (add_ass) {
|
| 21972 |
ss << "<|assistant|>\n";
|
| 21973 |
}
|
| 21974 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) {
|
| 21975 |
// mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
|
| 21976 |
for (auto message : chat) {
|
| 21977 |
std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
|
|
|
|
| 21980 |
if (add_ass) {
|
| 21981 |
ss << "<s>assistant\n";
|
| 21982 |
}
|
| 21983 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) {
|
| 21984 |
// google/gemma-7b-it
|
| 21985 |
std::string system_prompt = "";
|
| 21986 |
for (auto message : chat) {
|
|
|
|
| 22002 |
if (add_ass) {
|
| 22003 |
ss << "<start_of_turn>model\n";
|
| 22004 |
}
|
| 22005 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_ORION) {
|
| 22006 |
// OrionStarAI/Orion-14B-Chat
|
| 22007 |
std::string system_prompt = "";
|
| 22008 |
for (auto message : chat) {
|
|
|
|
| 22022 |
ss << message->content << "</s>";
|
| 22023 |
}
|
| 22024 |
}
|
| 22025 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) {
|
| 22026 |
// openchat/openchat-3.5-0106,
|
| 22027 |
for (auto message : chat) {
|
| 22028 |
std::string role(message->role);
|
|
|
|
| 22036 |
if (add_ass) {
|
| 22037 |
ss << "GPT4 Correct Assistant:";
|
| 22038 |
}
|
| 22039 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
|
| 22040 |
// eachadea/vicuna-13b-1.1 (and Orca variant)
|
| 22041 |
for (auto message : chat) {
|
| 22042 |
std::string role(message->role);
|
| 22043 |
if (role == "system") {
|
| 22044 |
// Orca-Vicuna variant uses a system prefix
|
| 22045 |
+
if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
|
| 22046 |
ss << "SYSTEM: " << message->content << "\n";
|
| 22047 |
} else {
|
| 22048 |
ss << message->content << "\n\n";
|
|
|
|
| 22056 |
if (add_ass) {
|
| 22057 |
ss << "ASSISTANT:";
|
| 22058 |
}
|
| 22059 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
|
| 22060 |
// deepseek-ai/deepseek-coder-33b-instruct
|
| 22061 |
for (auto message : chat) {
|
| 22062 |
std::string role(message->role);
|
|
|
|
| 22071 |
if (add_ass) {
|
| 22072 |
ss << "### Response:\n";
|
| 22073 |
}
|
| 22074 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) {
|
| 22075 |
// CohereForAI/c4ai-command-r-plus
|
| 22076 |
for (auto message : chat) {
|
| 22077 |
std::string role(message->role);
|
|
|
|
| 22086 |
if (add_ass) {
|
| 22087 |
ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
|
| 22088 |
}
|
| 22089 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) {
|
| 22090 |
// Llama 3
|
| 22091 |
for (auto message : chat) {
|
| 22092 |
std::string role(message->role);
|
|
|
|
| 22095 |
if (add_ass) {
|
| 22096 |
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
|
| 22097 |
}
|
| 22098 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
|
| 22099 |
// chatglm3-6b
|
| 22100 |
ss << "[gMASK]" << "sop";
|
| 22101 |
for (auto message : chat) {
|
|
|
|
| 22105 |
if (add_ass) {
|
| 22106 |
ss << "<|assistant|>";
|
| 22107 |
}
|
| 22108 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
|
| 22109 |
ss << "[gMASK]" << "<sop>";
|
| 22110 |
for (auto message : chat) {
|
| 22111 |
std::string role(message->role);
|
|
|
|
| 22114 |
if (add_ass) {
|
| 22115 |
ss << "<|assistant|>";
|
| 22116 |
}
|
| 22117 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
|
| 22118 |
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
| 22119 |
for (auto message : chat) {
|
| 22120 |
std::string role(message->role);
|
|
|
|
| 22126 |
ss << trim(message->content);
|
| 22127 |
}
|
| 22128 |
}
|
| 22129 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) {
|
| 22130 |
// DeepSeek-V2
|
| 22131 |
for (auto message : chat) {
|
| 22132 |
std::string role(message->role);
|
|
|
|
| 22141 |
if (add_ass) {
|
| 22142 |
ss << "Assistant:";
|
| 22143 |
}
|
| 22144 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
|
| 22145 |
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
|
| 22146 |
// EXAONE-3.0-7.8B-Instruct
|
| 22147 |
for (auto message : chat) {
|
|
|
|
| 22157 |
if (add_ass) {
|
| 22158 |
ss << "[|assistant|]";
|
| 22159 |
}
|
| 22160 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
|
| 22161 |
// this template requires the model to have "\n\n" as EOT token
|
| 22162 |
for (auto message : chat) {
|
| 22163 |
std::string role(message->role);
|
|
|
|
| 22167 |
ss << message->content << "\n\n";
|
| 22168 |
}
|
| 22169 |
}
|
| 22170 |
+
} else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
|
| 22171 |
// IBM Granite template
|
| 22172 |
for (const auto & message : chat) {
|
| 22173 |
std::string role(message->role);
|
|
|
|
| 22219 |
}
|
| 22220 |
|
| 22221 |
std::string formatted_chat;
|
| 22222 |
+
llm_chat_template detected_tmpl = llama_chat_detect_template(curr_tmpl);
|
| 22223 |
+
if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
|
| 22224 |
+
return -1;
|
| 22225 |
+
}
|
| 22226 |
+
int32_t res = llama_chat_apply_template_internal(detected_tmpl, chat_vec, formatted_chat, add_ass);
|
| 22227 |
if (res < 0) {
|
| 22228 |
return res;
|
| 22229 |
}
|
|
|
|
| 22233 |
return res;
|
| 22234 |
}
|
| 22235 |
|
| 22236 |
+
int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
|
| 22237 |
+
auto it = LLM_CHAT_TEMPLATES.begin();
|
| 22238 |
+
for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
|
| 22239 |
+
output[i] = it->first.c_str();
|
| 22240 |
+
std::advance(it, 1);
|
| 22241 |
+
}
|
| 22242 |
+
return (int32_t) LLM_CHAT_TEMPLATES.size();
|
| 22243 |
+
}
|
| 22244 |
+
|
| 22245 |
//
|
| 22246 |
// sampling
|
| 22247 |
//
|
|
|
|
| 22288 |
}
|
| 22289 |
|
| 22290 |
const char * llama_print_system_info(void) {
|
|
|
|
|
|
|
| 22291 |
static std::string s;
|
| 22292 |
|
| 22293 |
+
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
| 22294 |
+
auto * reg = ggml_backend_reg_get(i);
|
| 22295 |
+
auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
|
| 22296 |
+
if (get_features_fn) {
|
| 22297 |
+
ggml_backend_feature * features = get_features_fn(reg);
|
| 22298 |
+
s += ggml_backend_reg_name(reg);
|
| 22299 |
+
s += " : ";
|
| 22300 |
+
for (; features->name; features++) {
|
| 22301 |
+
s += features->name;
|
| 22302 |
+
s += " = ";
|
| 22303 |
+
s += features->value;
|
| 22304 |
+
s += " | ";
|
| 22305 |
+
}
|
| 22306 |
+
}
|
| 22307 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22308 |
|
| 22309 |
return s.c_str();
|
| 22310 |
}
|
examples/talk-llama/llama.h
CHANGED

@@ -185,7 +185,8 @@ extern "C" {
         LLAMA_ROPE_SCALING_TYPE_NONE   = 0,
         LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
         LLAMA_ROPE_SCALING_TYPE_YARN   = 2,
-        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
+        LLAMA_ROPE_SCALING_TYPE_LONGROPE  = 3,
+        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE,
     };
 
     enum llama_pooling_type {

@@ -272,6 +273,9 @@
     };
 
     struct llama_model_params {
+        // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
+        ggml_backend_dev_t * devices;
+
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 

@@ -987,6 +991,9 @@
                                     char * buf,
                                    int32_t length);
 
+    // Get list of built-in chat templates
+    LLAMA_API int32_t llama_chat_builtin_templates(const char ** output, size_t len);
+
     //
     // Sampling API
     //
examples/talk-llama/unicode.cpp
CHANGED

@@ -201,7 +201,18 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
 }
 
 static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
+#if defined(__clang__)
+    // disable C++17 deprecation warning for std::codecvt_utf8
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
     std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+
     return conv.from_bytes(s);
 }
 
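std::wstring_convert and std::codecvt_utf8 are deprecated since C++17, so this helper now silences clang's -Wdeprecated-declarations only around the declaration that needs it, leaving the warning active for the rest of the translation unit. For reference, a self-contained restatement of the same pattern with the headers the helper relies on spelled out (the function name widen_utf8 is illustrative):

#include <codecvt>
#include <locale>
#include <string>

static std::wstring widen_utf8(const std::string & s) {
#if defined(__clang__)
    // scoped suppression: push the diagnostic state, ignore the deprecation, pop afterwards
#    pragma clang diagnostic push
#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
    std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
#if defined(__clang__)
#    pragma clang diagnostic pop
#endif
    return conv.from_bytes(s);
}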