ggerganov committed on
Commit 5908a19 · 1 parent: 00d464f

talk-llama : sync llama.cpp

examples/talk-llama/llama.cpp CHANGED
@@ -179,7 +179,7 @@ enum llm_arch {
179
  LLM_ARCH_COMMAND_R,
180
  LLM_ARCH_DBRX,
181
  LLM_ARCH_OLMO,
182
- LLM_ARCH_OLMO_1124,
183
  LLM_ARCH_OLMOE,
184
  LLM_ARCH_OPENELM,
185
  LLM_ARCH_ARCTIC,
@@ -233,7 +233,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
233
  { LLM_ARCH_COMMAND_R, "command-r" },
234
  { LLM_ARCH_DBRX, "dbrx" },
235
  { LLM_ARCH_OLMO, "olmo" },
236
- { LLM_ARCH_OLMO_1124, "olmo_1124" },
237
  { LLM_ARCH_OLMOE, "olmoe" },
238
  { LLM_ARCH_OPENELM, "openelm" },
239
  { LLM_ARCH_ARCTIC, "arctic" },
@@ -1036,6 +1036,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
1036
  { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1037
  { LLM_TENSOR_OUTPUT, "output" },
1038
  { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
 
 
1039
  { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1040
  { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1041
  { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
@@ -1210,7 +1212,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
1210
  },
1211
  },
1212
  {
1213
- LLM_ARCH_OLMO_1124,
1214
  {
1215
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1216
  { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
@@ -1549,6 +1551,67 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
1549
  },
1550
  };
1551
 
1552
  static llm_arch llm_arch_from_string(const std::string & name) {
1553
  for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
1554
  if (kv.second == name) {
@@ -1622,9 +1685,10 @@ struct LLM_TN {
1622
  //
1623
 
1624
  static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
1625
- { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
1626
- { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
1627
- { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
 
1628
  };
1629
 
1630
  static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
@@ -2341,6 +2405,7 @@ enum e_model {
2341
  MODEL_16B,
2342
  MODEL_20B,
2343
  MODEL_30B,
 
2344
  MODEL_34B,
2345
  MODEL_35B,
2346
  MODEL_40B,
@@ -4866,7 +4931,9 @@ struct llama_model_loader {
4866
  mappings.reserve(files.size());
4867
  mmaps_used.reserve(files.size());
4868
  for (const auto & file : files) {
4869
- std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
 
 
4870
  mmaps_used.emplace_back(mapping->size, 0);
4871
  if (mlock_mmaps) {
4872
  std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
@@ -5328,6 +5395,7 @@ static const char * llama_model_type_name(e_model type) {
5328
  case MODEL_16B: return "16B";
5329
  case MODEL_20B: return "20B";
5330
  case MODEL_30B: return "30B";
 
5331
  case MODEL_34B: return "34B";
5332
  case MODEL_35B: return "35B";
5333
  case MODEL_40B: return "40B";
@@ -5515,8 +5583,12 @@ static void llm_load_hparams(
5515
  case LLM_ARCH_MINICPM:
5516
  {
5517
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
5518
 
5519
  switch (hparams.n_layer) {
 
5520
  case 40: model.type = e_model::MODEL_2B; break;
5521
  default: model.type = e_model::MODEL_UNKNOWN;
5522
  }
@@ -5688,7 +5760,10 @@ static void llm_load_hparams(
5688
  case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
5689
  case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
5690
  case 32: model.type = e_model::MODEL_7B; break;
 
5691
  case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
 
 
5692
  case 80: model.type = e_model::MODEL_70B; break;
5693
  default: model.type = e_model::MODEL_UNKNOWN;
5694
  }
@@ -5898,7 +5973,7 @@ static void llm_load_hparams(
5898
  default: model.type = e_model::MODEL_UNKNOWN;
5899
  }
5900
  } break;
5901
- case LLM_ARCH_OLMO_1124:
5902
  {
5903
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
5904
 
@@ -6997,7 +7072,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
6997
  LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
6998
  }
6999
 
7000
- if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
7001
  LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
7002
  LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
7003
  LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -7181,12 +7256,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
7181
  } break;
7182
  case GGML_OP_ADD:
7183
  {
7184
- ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512);
7185
  op_tensor = ggml_add(ctx, a, w);
7186
  } break;
7187
  case GGML_OP_MUL:
7188
  {
7189
- ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512);
7190
  op_tensor = ggml_mul(ctx, a, w);
7191
  } break;
7192
  case GGML_OP_DIV:
@@ -7622,7 +7697,13 @@ static bool llm_load_tensors(
7622
 
7623
  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7624
 
7625
- layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
7626
 
7627
  if (n_expert == 0) {
7628
  layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
@@ -8591,7 +8672,7 @@ static bool llm_load_tensors(
8591
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
8592
  }
8593
  } break;
8594
- case LLM_ARCH_OLMO_1124:
8595
  {
8596
  model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
8597
 
@@ -9190,7 +9271,7 @@ static bool llm_load_tensors(
9190
  ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
9191
  if (!dev) {
9192
  // FIXME: workaround for CPU backend buft having a NULL device
9193
- dev = ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0);
9194
  }
9195
  ggml_backend_dev_props props;
9196
  ggml_backend_dev_get_props(dev, &props);
@@ -13429,153 +13510,6 @@ struct llm_build_context {
13429
  return gf;
13430
  }
13431
 
13432
- // ref: https://arxiv.org/abs/2203.03466
13433
- // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
13434
- // based on the original build_llama() function
13435
- struct ggml_cgraph * build_minicpm() {
13436
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
13437
-
13438
- const int64_t n_embd_head = hparams.n_embd_head_v;
13439
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13440
- GGML_ASSERT(n_embd_head == hparams.n_rot);
13441
-
13442
- const int64_t n_embd = hparams.n_embd;
13443
- //TODO: if the model varies, these parameters need to be read from the model
13444
- const int64_t n_embd_base = 256;
13445
- const float scale_embd = 12.0f;
13446
- const float scale_depth = 1.4f;
13447
-
13448
- struct ggml_tensor * cur;
13449
- struct ggml_tensor * inpL;
13450
-
13451
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
13452
-
13453
- // scale the input embeddings
13454
- inpL = ggml_scale(ctx0, inpL, scale_embd);
13455
- cb(inpL, "inp_scaled", -1);
13456
-
13457
- // inp_pos - contains the positions
13458
- struct ggml_tensor * inp_pos = build_inp_pos();
13459
-
13460
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
13461
- struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
13462
-
13463
- for (int il = 0; il < n_layer; ++il) {
13464
- struct ggml_tensor * inpSA = inpL;
13465
-
13466
- // norm
13467
- cur = llm_build_norm(ctx0, inpL, hparams,
13468
- model.layers[il].attn_norm, NULL,
13469
- LLM_NORM_RMS, cb, il);
13470
- cb(cur, "attn_norm", il);
13471
-
13472
- // self-attention
13473
- {
13474
- // compute Q and K and RoPE them
13475
- struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
13476
- cb(Qcur, "Qcur", il);
13477
- if (model.layers[il].bq) {
13478
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
13479
- cb(Qcur, "Qcur", il);
13480
- }
13481
-
13482
- struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
13483
- cb(Kcur, "Kcur", il);
13484
- if (model.layers[il].bk) {
13485
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
13486
- cb(Kcur, "Kcur", il);
13487
- }
13488
-
13489
- struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
13490
- cb(Vcur, "Vcur", il);
13491
- if (model.layers[il].bv) {
13492
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
13493
- cb(Vcur, "Vcur", il);
13494
- }
13495
-
13496
- Qcur = ggml_rope_ext(
13497
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
13498
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13499
- ext_factor, attn_factor, beta_fast, beta_slow
13500
- );
13501
- cb(Qcur, "Qcur", il);
13502
-
13503
- Kcur = ggml_rope_ext(
13504
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
13505
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13506
- ext_factor, attn_factor, beta_fast, beta_slow
13507
- );
13508
- cb(Kcur, "Kcur", il);
13509
-
13510
- cur = llm_build_kv(ctx0, lctx, kv_self, gf,
13511
- model.layers[il].wo, model.layers[il].bo,
13512
- Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
13513
- }
13514
-
13515
- if (il == n_layer - 1) {
13516
- // skip computing output for unused tokens
13517
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
13518
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13519
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13520
- }
13521
-
13522
- // scale_res - scale the hidden states for residual connection
13523
- const float scale_res = scale_depth/sqrtf(float(n_layer));
13524
- cur = ggml_scale(ctx0, cur, scale_res);
13525
- cb(cur, "hidden_scaled", -1);
13526
-
13527
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
13528
- cb(ffn_inp, "ffn_inp", il);
13529
-
13530
- // feed-forward network
13531
- {
13532
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
13533
- model.layers[il].ffn_norm, NULL,
13534
- LLM_NORM_RMS, cb, il);
13535
- cb(cur, "ffn_norm", il);
13536
-
13537
- cur = llm_build_ffn(ctx0, lctx, cur,
13538
- model.layers[il].ffn_up, NULL, NULL,
13539
- model.layers[il].ffn_gate, NULL, NULL,
13540
- model.layers[il].ffn_down, NULL, NULL,
13541
- NULL,
13542
- LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
13543
- cb(cur, "ffn_out", il);
13544
- }
13545
-
13546
- // scale the hidden states for residual connection
13547
- cur = ggml_scale(ctx0, cur, scale_res);
13548
- cb(cur, "hidden_scaled_ffn", -1);
13549
-
13550
- cur = ggml_add(ctx0, cur, ffn_inp);
13551
- cur = lctx.cvec.apply_to(ctx0, cur, il);
13552
- cb(cur, "l_out", il);
13553
-
13554
- // input for next layer
13555
- inpL = cur;
13556
- }
13557
-
13558
- cur = inpL;
13559
-
13560
- cur = llm_build_norm(ctx0, cur, hparams,
13561
- model.output_norm, NULL,
13562
- LLM_NORM_RMS, cb, -1);
13563
- cb(cur, "result_norm", -1);
13564
-
13565
- // lm_head scaling
13566
- const float scale_lmhead = float(n_embd_base)/float(n_embd);
13567
- cur = ggml_scale(ctx0, cur, scale_lmhead);
13568
- cb(cur, "lmhead_scaling", -1);
13569
-
13570
- // lm_head
13571
- cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
13572
- cb(cur, "result_output", -1);
13573
-
13574
- ggml_build_forward_expand(gf, cur);
13575
-
13576
- return gf;
13577
- }
13578
-
13579
  struct ggml_cgraph * build_minicpm3() {
13580
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
13581
 
@@ -14481,7 +14415,7 @@ struct llm_build_context {
14481
  return gf;
14482
  }
14483
 
14484
- struct ggml_cgraph * build_olmo_1124() {
14485
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
14486
 
14487
  // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -16674,6 +16608,7 @@ static struct ggml_cgraph * llama_build_graph(
16674
 
16675
  switch (model.arch) {
16676
  case LLM_ARCH_LLAMA:
 
16677
  case LLM_ARCH_GRANITE:
16678
  case LLM_ARCH_GRANITE_MOE:
16679
  {
@@ -16757,10 +16692,6 @@ static struct ggml_cgraph * llama_build_graph(
16757
  {
16758
  result = llm.build_internlm2();
16759
  } break;
16760
- case LLM_ARCH_MINICPM:
16761
- {
16762
- result = llm.build_minicpm();
16763
- } break;
16764
  case LLM_ARCH_MINICPM3:
16765
  {
16766
  result = llm.build_minicpm3();
@@ -16797,9 +16728,9 @@ static struct ggml_cgraph * llama_build_graph(
16797
  {
16798
  result = llm.build_olmo();
16799
  } break;
16800
- case LLM_ARCH_OLMO_1124:
16801
  {
16802
- result = llm.build_olmo_1124();
16803
  } break;
16804
  case LLM_ARCH_OLMOE:
16805
  {
@@ -17443,8 +17374,9 @@ static enum ggml_status llama_graph_compute(
17443
  int n_threads,
17444
  ggml_threadpool * threadpool) {
17445
  if (lctx.backend_cpu != nullptr) {
17446
- ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
17447
- ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
 
17448
  }
17449
 
17450
  // set the number of threads for all the backends
@@ -18211,13 +18143,13 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
18211
  static void llama_kv_cache_update_internal(struct llama_context & lctx) {
18212
  bool need_reserve = false;
18213
 
18214
- // apply K-shift if needed
18215
- if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
18216
  if (!llama_kv_cache_can_shift(&lctx)) {
18217
- GGML_ABORT("Deepseek2 does not support K-shift");
18218
  }
18219
 
18220
- {
 
18221
  ggml_backend_sched_reset(lctx.sched.get());
18222
 
18223
  ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
@@ -19361,6 +19293,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
19361
  //
19362
  struct llama_model_params llama_model_default_params() {
19363
  struct llama_model_params result = {
 
19364
  /*.n_gpu_layers =*/ 0,
19365
  /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
19366
  /*.main_gpu =*/ 0,
@@ -19478,7 +19411,11 @@ void llama_backend_init(void) {
19478
 
19479
  void llama_numa_init(enum ggml_numa_strategy numa) {
19480
  if (numa != GGML_NUMA_STRATEGY_DISABLED) {
19481
- ggml_numa_init(numa);
19482
  }
19483
  }
19484
 
@@ -19569,19 +19506,24 @@ struct llama_model * llama_load_model_from_file(
19569
  }
19570
 
19571
  // create list of devices to use with this model
19572
- // currently, we use all available devices
19573
- // TODO: rework API to give user more control over device selection
19574
- for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
19575
- ggml_backend_dev_t dev = ggml_backend_dev_get(i);
19576
- switch (ggml_backend_dev_type(dev)) {
19577
- case GGML_BACKEND_DEVICE_TYPE_CPU:
19578
- case GGML_BACKEND_DEVICE_TYPE_ACCEL:
19579
- // skip CPU backends since they are handled separately
19580
- break;
19581
 
19582
- case GGML_BACKEND_DEVICE_TYPE_GPU:
19583
- model->devices.push_back(dev);
19584
- break;
 
19585
  }
19586
  }
19587
 
@@ -19752,9 +19694,6 @@ struct llama_context * llama_new_context_with_model(
19752
  __func__, n_ctx_per_seq, hparams.n_ctx_train);
19753
  }
19754
 
19755
- ctx->abort_callback = params.abort_callback;
19756
- ctx->abort_callback_data = params.abort_callback_data;
19757
-
19758
  ctx->logits_all = params.logits_all;
19759
 
19760
  // build worst-case graph for encoder if a model contains encoder
@@ -19803,7 +19742,7 @@ struct llama_context * llama_new_context_with_model(
19803
  }
19804
 
19805
  // add CPU backend
19806
- ctx->backend_cpu = ggml_backend_cpu_init();
19807
  if (ctx->backend_cpu == nullptr) {
19808
  LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
19809
  llama_free(ctx);
@@ -19823,6 +19762,8 @@ struct llama_context * llama_new_context_with_model(
19823
  }
19824
  }
19825
 
 
 
19826
  if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
19827
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
19828
  llama_free(ctx);
@@ -19868,7 +19809,8 @@ struct llama_context * llama_new_context_with_model(
19868
  std::vector<ggml_backend_t> backend_ptrs;
19869
  for (auto & backend : ctx->backends) {
19870
  auto * buft = ggml_backend_get_default_buffer_type(backend.get());
19871
- if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) {
 
19872
  // use the host buffer of the first device CPU for faster transfer of the intermediate state
19873
  auto * dev = model->devices[0];
19874
  auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
@@ -19896,7 +19838,8 @@ struct llama_context * llama_new_context_with_model(
19896
  // pipeline parallelism requires support for async compute and events in all devices
19897
  if (pipeline_parallel) {
19898
  for (auto & backend : ctx->backends) {
19899
- if (ggml_backend_is_cpu(backend.get())) {
 
19900
  // ignore CPU backend
19901
  continue;
19902
  }
@@ -20070,7 +20013,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
20070
  case LLM_ARCH_QWEN:
20071
  case LLM_ARCH_QWEN2:
20072
  case LLM_ARCH_QWEN2MOE:
20073
- case LLM_ARCH_OLMO_1124:
20074
  case LLM_ARCH_OLMOE:
20075
  case LLM_ARCH_PHI2:
20076
  case LLM_ARCH_PHI3:
@@ -20463,7 +20406,7 @@ void llama_kv_cache_update(struct llama_context * ctx) {
20463
  }
20464
 
20465
  bool llama_kv_cache_can_shift(struct llama_context * ctx) {
20466
- return ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
20467
  }
20468
 
20469
  // deprecated
@@ -21450,6 +21393,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) {
21450
  void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
21451
  ctx->abort_callback = abort_callback;
21452
  ctx->abort_callback_data = abort_callback_data;
21453
  }
21454
 
21455
  void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
@@ -21816,18 +21767,109 @@ int32_t llama_detokenize(
21816
  // chat templates
21817
  //
21818
 
21819
  // Simple version of "llama_apply_chat_template" that only works with strings
21820
  // This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
21821
  static int32_t llama_chat_apply_template_internal(
21822
- const std::string & tmpl,
21823
  const std::vector<const llama_chat_message *> & chat,
21824
  std::string & dest, bool add_ass) {
21825
  // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
21826
  std::stringstream ss;
21827
- auto tmpl_contains = [&tmpl](std::string haystack) -> bool {
21828
- return tmpl.find(haystack) != std::string::npos;
21829
- };
21830
- if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) {
21831
  // chatml template
21832
  for (auto message : chat) {
21833
  ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -21835,16 +21877,59 @@ static int32_t llama_chat_apply_template_internal(
21835
  if (add_ass) {
21836
  ss << "<|im_start|>assistant\n";
21837
  }
21838
- } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl_contains("[INST]")) {
21839
  // llama2 template and its variants
21840
  // [variant] support system message
21841
- bool support_system_message = tmpl_contains("<<SYS>>") || tmpl == "mistral";
21842
- // [variant] space before + after response
21843
- bool space_around_response = tmpl_contains("' ' + eos_token");
21844
  // [variant] add BOS inside history
21845
- bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
21846
  // [variant] trim spaces from the input message
21847
- bool strip_message = tmpl_contains("content.strip()");
21848
  // construct the prompt
21849
  bool is_inside_turn = true; // skip BOS at the beginning
21850
  ss << "[INST] ";
@@ -21865,12 +21950,11 @@ static int32_t llama_chat_apply_template_internal(
21865
  } else if (role == "user") {
21866
  ss << content << " [/INST]";
21867
  } else {
21868
- ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
21869
  is_inside_turn = false;
21870
  }
21871
  }
21872
- // llama2 templates seem to not care about "add_generation_prompt"
21873
- } else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
21874
  // Phi 3
21875
  for (auto message : chat) {
21876
  std::string role(message->role);
@@ -21879,7 +21963,7 @@ static int32_t llama_chat_apply_template_internal(
21879
  if (add_ass) {
21880
  ss << "<|assistant|>\n";
21881
  }
21882
- } else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) {
21883
  // zephyr template
21884
  for (auto message : chat) {
21885
  ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -21887,7 +21971,7 @@ static int32_t llama_chat_apply_template_internal(
21887
  if (add_ass) {
21888
  ss << "<|assistant|>\n";
21889
  }
21890
- } else if (tmpl == "monarch" || tmpl_contains("bos_token + message['role']")) {
21891
  // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
21892
  for (auto message : chat) {
21893
  std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -21896,7 +21980,7 @@ static int32_t llama_chat_apply_template_internal(
21896
  if (add_ass) {
21897
  ss << "<s>assistant\n";
21898
  }
21899
- } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl_contains("<start_of_turn>")) {
21900
  // google/gemma-7b-it
21901
  std::string system_prompt = "";
21902
  for (auto message : chat) {
@@ -21918,7 +22002,7 @@ static int32_t llama_chat_apply_template_internal(
21918
  if (add_ass) {
21919
  ss << "<start_of_turn>model\n";
21920
  }
21921
- } else if (tmpl == "orion" || tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
21922
  // OrionStarAI/Orion-14B-Chat
21923
  std::string system_prompt = "";
21924
  for (auto message : chat) {
@@ -21938,7 +22022,7 @@ static int32_t llama_chat_apply_template_internal(
21938
  ss << message->content << "</s>";
21939
  }
21940
  }
21941
- } else if (tmpl == "openchat" || tmpl_contains("GPT4 Correct ")) {
21942
  // openchat/openchat-3.5-0106,
21943
  for (auto message : chat) {
21944
  std::string role(message->role);
@@ -21952,13 +22036,13 @@ static int32_t llama_chat_apply_template_internal(
21952
  if (add_ass) {
21953
  ss << "GPT4 Correct Assistant:";
21954
  }
21955
- } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: "))) {
21956
  // eachadea/vicuna-13b-1.1 (and Orca variant)
21957
  for (auto message : chat) {
21958
  std::string role(message->role);
21959
  if (role == "system") {
21960
  // Orca-Vicuna variant uses a system prefix
21961
- if (tmpl == "vicuna-orca" || tmpl_contains("SYSTEM: ")) {
21962
  ss << "SYSTEM: " << message->content << "\n";
21963
  } else {
21964
  ss << message->content << "\n\n";
@@ -21972,7 +22056,7 @@ static int32_t llama_chat_apply_template_internal(
21972
  if (add_ass) {
21973
  ss << "ASSISTANT:";
21974
  }
21975
- } else if (tmpl == "deepseek" || (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>"))) {
21976
  // deepseek-ai/deepseek-coder-33b-instruct
21977
  for (auto message : chat) {
21978
  std::string role(message->role);
@@ -21987,7 +22071,7 @@ static int32_t llama_chat_apply_template_internal(
21987
  if (add_ass) {
21988
  ss << "### Response:\n";
21989
  }
21990
- } else if (tmpl == "command-r" || (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>"))) {
21991
  // CohereForAI/c4ai-command-r-plus
21992
  for (auto message : chat) {
21993
  std::string role(message->role);
@@ -22002,7 +22086,7 @@ static int32_t llama_chat_apply_template_internal(
22002
  if (add_ass) {
22003
  ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
22004
  }
22005
- } else if (tmpl == "llama3" || (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>"))) {
22006
  // Llama 3
22007
  for (auto message : chat) {
22008
  std::string role(message->role);
@@ -22011,7 +22095,7 @@ static int32_t llama_chat_apply_template_internal(
22011
  if (add_ass) {
22012
  ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
22013
  }
22014
- } else if (tmpl == "chatglm3" || tmpl_contains("[gMASK]sop")) {
22015
  // chatglm3-6b
22016
  ss << "[gMASK]" << "sop";
22017
  for (auto message : chat) {
@@ -22021,7 +22105,7 @@ static int32_t llama_chat_apply_template_internal(
22021
  if (add_ass) {
22022
  ss << "<|assistant|>";
22023
  }
22024
- } else if (tmpl == "chatglm4" || tmpl_contains("[gMASK]<sop>")) {
22025
  ss << "[gMASK]" << "<sop>";
22026
  for (auto message : chat) {
22027
  std::string role(message->role);
@@ -22030,7 +22114,7 @@ static int32_t llama_chat_apply_template_internal(
22030
  if (add_ass) {
22031
  ss << "<|assistant|>";
22032
  }
22033
- } else if (tmpl == "minicpm" || tmpl_contains(LU8("<用户>"))) {
22034
  // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
22035
  for (auto message : chat) {
22036
  std::string role(message->role);
@@ -22042,7 +22126,7 @@ static int32_t llama_chat_apply_template_internal(
22042
  ss << trim(message->content);
22043
  }
22044
  }
22045
- } else if (tmpl == "deepseek2" || tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
22046
  // DeepSeek-V2
22047
  for (auto message : chat) {
22048
  std::string role(message->role);
@@ -22057,7 +22141,7 @@ static int32_t llama_chat_apply_template_internal(
22057
  if (add_ass) {
22058
  ss << "Assistant:";
22059
  }
22060
- } else if (tmpl == "exaone3" || (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]"))) {
22061
  // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
22062
  // EXAONE-3.0-7.8B-Instruct
22063
  for (auto message : chat) {
@@ -22073,7 +22157,7 @@ static int32_t llama_chat_apply_template_internal(
22073
  if (add_ass) {
22074
  ss << "[|assistant|]";
22075
  }
22076
- } else if (tmpl == "rwkv-world" || tmpl_contains("rwkv-world")) {
22077
  // this template requires the model to have "\n\n" as EOT token
22078
  for (auto message : chat) {
22079
  std::string role(message->role);
@@ -22083,7 +22167,7 @@ static int32_t llama_chat_apply_template_internal(
22083
  ss << message->content << "\n\n";
22084
  }
22085
  }
22086
- } else if (tmpl == "granite" || tmpl_contains("<|start_of_role|>")) {
22087
  // IBM Granite template
22088
  for (const auto & message : chat) {
22089
  std::string role(message->role);
@@ -22135,7 +22219,11 @@ int32_t llama_chat_apply_template(
22135
  }
22136
 
22137
  std::string formatted_chat;
22138
- int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
22139
  if (res < 0) {
22140
  return res;
22141
  }
@@ -22145,6 +22233,15 @@ int32_t llama_chat_apply_template(
22145
  return res;
22146
  }
22147
 
 
22148
  //
22149
  // sampling
22150
  //
@@ -22191,32 +22288,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
22191
  }
22192
 
22193
  const char * llama_print_system_info(void) {
22194
- ggml_cpu_init(); // some ARM features are detected at runtime
22195
-
22196
  static std::string s;
22197
 
22198
- s = "";
22199
- s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
22200
- s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
22201
- s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
22202
- s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
22203
- s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
22204
- s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
22205
- s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
22206
- s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | ";
22207
- s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
22208
- s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
22209
- s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
22210
- s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
22211
- s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
22212
- s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
22213
- s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | ";
22214
- s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
22215
- s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
22216
- s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
22217
- s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
22218
- s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
22219
- s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | ";
22220
 
22221
  return s.c_str();
22222
  }
 
179
  LLM_ARCH_COMMAND_R,
180
  LLM_ARCH_DBRX,
181
  LLM_ARCH_OLMO,
182
+ LLM_ARCH_OLMO2,
183
  LLM_ARCH_OLMOE,
184
  LLM_ARCH_OPENELM,
185
  LLM_ARCH_ARCTIC,
 
233
  { LLM_ARCH_COMMAND_R, "command-r" },
234
  { LLM_ARCH_DBRX, "dbrx" },
235
  { LLM_ARCH_OLMO, "olmo" },
236
+ { LLM_ARCH_OLMO2, "olmo2" },
237
  { LLM_ARCH_OLMOE, "olmoe" },
238
  { LLM_ARCH_OPENELM, "openelm" },
239
  { LLM_ARCH_ARCTIC, "arctic" },
 
1036
  { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1037
  { LLM_TENSOR_OUTPUT, "output" },
1038
  { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
1039
+ { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
1040
+ { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
1041
  { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1042
  { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1043
  { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
 
1212
  },
1213
  },
1214
  {
1215
+ LLM_ARCH_OLMO2,
1216
  {
1217
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1218
  { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
 
1551
  },
1552
  };
1553
 
1554
+ enum llm_chat_template {
1555
+ LLM_CHAT_TEMPLATE_CHATML,
1556
+ LLM_CHAT_TEMPLATE_LLAMA_2,
1557
+ LLM_CHAT_TEMPLATE_LLAMA_2_SYS,
1558
+ LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS,
1559
+ LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP,
1560
+ LLM_CHAT_TEMPLATE_MISTRAL_V1,
1561
+ LLM_CHAT_TEMPLATE_MISTRAL_V3,
1562
+ LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
1563
+ LLM_CHAT_TEMPLATE_MISTRAL_V7,
1564
+ LLM_CHAT_TEMPLATE_PHI_3,
1565
+ LLM_CHAT_TEMPLATE_ZEPHYR,
1566
+ LLM_CHAT_TEMPLATE_MONARCH,
1567
+ LLM_CHAT_TEMPLATE_GEMMA,
1568
+ LLM_CHAT_TEMPLATE_ORION,
1569
+ LLM_CHAT_TEMPLATE_OPENCHAT,
1570
+ LLM_CHAT_TEMPLATE_VICUNA,
1571
+ LLM_CHAT_TEMPLATE_VICUNA_ORCA,
1572
+ LLM_CHAT_TEMPLATE_DEEPSEEK,
1573
+ LLM_CHAT_TEMPLATE_DEEPSEEK_2,
1574
+ LLM_CHAT_TEMPLATE_COMMAND_R,
1575
+ LLM_CHAT_TEMPLATE_LLAMA_3,
1576
+ LLM_CHAT_TEMPLATE_CHATGML_3,
1577
+ LLM_CHAT_TEMPLATE_CHATGML_4,
1578
+ LLM_CHAT_TEMPLATE_MINICPM,
1579
+ LLM_CHAT_TEMPLATE_EXAONE_3,
1580
+ LLM_CHAT_TEMPLATE_RWKV_WORLD,
1581
+ LLM_CHAT_TEMPLATE_GRANITE,
1582
+ LLM_CHAT_TEMPLATE_UNKNOWN,
1583
+ };
1584
+
1585
+ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
1586
+ { "chatml", LLM_CHAT_TEMPLATE_CHATML },
1587
+ { "llama2", LLM_CHAT_TEMPLATE_LLAMA_2 },
1588
+ { "llama2-sys", LLM_CHAT_TEMPLATE_LLAMA_2_SYS },
1589
+ { "llama2-sys-bos", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS },
1590
+ { "llama2-sys-strip", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP },
1591
+ { "mistral-v1", LLM_CHAT_TEMPLATE_MISTRAL_V1 },
1592
+ { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
1593
+ { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
1594
+ { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
1595
+ { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
1596
+ { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
1597
+ { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
1598
+ { "gemma", LLM_CHAT_TEMPLATE_GEMMA },
1599
+ { "orion", LLM_CHAT_TEMPLATE_ORION },
1600
+ { "openchat", LLM_CHAT_TEMPLATE_OPENCHAT },
1601
+ { "vicuna", LLM_CHAT_TEMPLATE_VICUNA },
1602
+ { "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA },
1603
+ { "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
1604
+ { "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
1605
+ { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
1606
+ { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
1607
+ { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
1608
+ { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
1609
+ { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
1610
+ { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
1611
+ { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
1612
+ { "granite", LLM_CHAT_TEMPLATE_GRANITE },
1613
+ };
1614
+
1615
  static llm_arch llm_arch_from_string(const std::string & name) {
1616
  for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
1617
  if (kv.second == name) {
 
1685
  //
1686
 
1687
  static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
1688
+ { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
1689
+ { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
1690
+ { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
1691
+ { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
1692
  };
1693
 
1694
  static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
 
2405
  MODEL_16B,
2406
  MODEL_20B,
2407
  MODEL_30B,
2408
+ MODEL_32B,
2409
  MODEL_34B,
2410
  MODEL_35B,
2411
  MODEL_40B,
 
4931
  mappings.reserve(files.size());
4932
  mmaps_used.reserve(files.size());
4933
  for (const auto & file : files) {
4934
+ auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
4935
+ auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
4936
+ std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
4937
  mmaps_used.emplace_back(mapping->size, 0);
4938
  if (mlock_mmaps) {
4939
  std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
 
5395
  case MODEL_16B: return "16B";
5396
  case MODEL_20B: return "20B";
5397
  case MODEL_30B: return "30B";
5398
+ case MODEL_32B: return "32B";
5399
  case MODEL_34B: return "34B";
5400
  case MODEL_35B: return "35B";
5401
  case MODEL_40B: return "40B";
 
5583
  case LLM_ARCH_MINICPM:
5584
  {
5585
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
5586
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
5587
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
5588
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
5589
 
5590
  switch (hparams.n_layer) {
5591
+ case 52: model.type = e_model::MODEL_1B; break;
5592
  case 40: model.type = e_model::MODEL_2B; break;
5593
  default: model.type = e_model::MODEL_UNKNOWN;
5594
  }
 
5760
  case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
5761
  case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
5762
  case 32: model.type = e_model::MODEL_7B; break;
5763
+ case 36: model.type = e_model::MODEL_3B; break;
5764
  case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
5765
+ case 48: model.type = e_model::MODEL_14B; break;
5766
+ case 64: model.type = e_model::MODEL_32B; break;
5767
  case 80: model.type = e_model::MODEL_70B; break;
5768
  default: model.type = e_model::MODEL_UNKNOWN;
5769
  }
 
5973
  default: model.type = e_model::MODEL_UNKNOWN;
5974
  }
5975
  } break;
5976
+ case LLM_ARCH_OLMO2:
5977
  {
5978
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
5979
 
 
7072
  LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7073
  }
7074
 
7075
+ if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
7076
  LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
7077
  LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
7078
  LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
 
7256
  } break;
7257
  case GGML_OP_ADD:
7258
  {
7259
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
7260
  op_tensor = ggml_add(ctx, a, w);
7261
  } break;
7262
  case GGML_OP_MUL:
7263
  {
7264
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
7265
  op_tensor = ggml_mul(ctx, a, w);
7266
  } break;
7267
  case GGML_OP_DIV:
 
7697
 
7698
  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7699
 
7700
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
7701
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
7702
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
7703
+ }
7704
+ else {
7705
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
7706
+ }
7707
 
7708
  if (n_expert == 0) {
7709
  layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
 
8672
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
8673
  }
8674
  } break;
8675
+ case LLM_ARCH_OLMO2:
8676
  {
8677
  model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
8678
 
 
9271
  ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
9272
  if (!dev) {
9273
  // FIXME: workaround for CPU backend buft having a NULL device
9274
+ dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
9275
  }
9276
  ggml_backend_dev_props props;
9277
  ggml_backend_dev_get_props(dev, &props);
 
13510
  return gf;
13511
  }
13512
 
13513
  struct ggml_cgraph * build_minicpm3() {
13514
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
13515
 
 
14415
  return gf;
14416
  }
14417
 
14418
+ struct ggml_cgraph * build_olmo2() {
14419
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
14420
 
14421
  // mutable variable, needed during the last layer of the computation to skip unused tokens
 
16608
 
16609
  switch (model.arch) {
16610
  case LLM_ARCH_LLAMA:
16611
+ case LLM_ARCH_MINICPM:
16612
  case LLM_ARCH_GRANITE:
16613
  case LLM_ARCH_GRANITE_MOE:
16614
  {
 
16692
  {
16693
  result = llm.build_internlm2();
16694
  } break;
16695
  case LLM_ARCH_MINICPM3:
16696
  {
16697
  result = llm.build_minicpm3();
 
16728
  {
16729
  result = llm.build_olmo();
16730
  } break;
16731
+ case LLM_ARCH_OLMO2:
16732
  {
16733
+ result = llm.build_olmo2();
16734
  } break;
16735
  case LLM_ARCH_OLMOE:
16736
  {
 
17374
  int n_threads,
17375
  ggml_threadpool * threadpool) {
17376
  if (lctx.backend_cpu != nullptr) {
17377
+ auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu));
17378
+ auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
17379
+ set_threadpool_fn(lctx.backend_cpu, threadpool);
17380
  }
17381
 
17382
  // set the number of threads for all the backends
 
18143
  static void llama_kv_cache_update_internal(struct llama_context & lctx) {
18144
  bool need_reserve = false;
18145
 
18146
+ if (lctx.kv_self.has_shift) {
 
18147
  if (!llama_kv_cache_can_shift(&lctx)) {
18148
+ GGML_ABORT("The current context does not support K-shift");
18149
  }
18150
 
18151
+ // apply K-shift if needed
18152
+ if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
18153
  ggml_backend_sched_reset(lctx.sched.get());
18154
 
18155
  ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
 
19293
  //
19294
  struct llama_model_params llama_model_default_params() {
19295
  struct llama_model_params result = {
19296
+ /*.devices =*/ nullptr,
19297
  /*.n_gpu_layers =*/ 0,
19298
  /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
19299
  /*.main_gpu =*/ 0,
 
19411
 
19412
  void llama_numa_init(enum ggml_numa_strategy numa) {
19413
  if (numa != GGML_NUMA_STRATEGY_DISABLED) {
19414
+ auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
19415
+ GGML_ASSERT(dev && "CPU backend is not loaded");
19416
+ auto * reg = ggml_backend_dev_backend_reg(dev);
19417
+ auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
19418
+ numa_init_fn(numa);
19419
  }
19420
  }
19421
 
 
19506
  }
19507
 
19508
  // create list of devices to use with this model
19509
+ if (params.devices) {
19510
+ for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
19511
+ model->devices.push_back(*dev);
19512
+ }
19513
+ } else {
19514
+ // use all available devices
19515
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
19516
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
19517
+ switch (ggml_backend_dev_type(dev)) {
19518
+ case GGML_BACKEND_DEVICE_TYPE_CPU:
19519
+ case GGML_BACKEND_DEVICE_TYPE_ACCEL:
19520
+ // skip CPU backends since they are handled separately
19521
+ break;
19522
 
19523
+ case GGML_BACKEND_DEVICE_TYPE_GPU:
19524
+ model->devices.push_back(dev);
19525
+ break;
19526
+ }
19527
  }
19528
  }
19529
 
 
19694
  __func__, n_ctx_per_seq, hparams.n_ctx_train);
19695
  }
19696
 
19697
  ctx->logits_all = params.logits_all;
19698
 
19699
  // build worst-case graph for encoder if a model contains encoder
 
19742
  }
19743
 
19744
  // add CPU backend
19745
+ ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
19746
  if (ctx->backend_cpu == nullptr) {
19747
  LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
19748
  llama_free(ctx);
 
19762
  }
19763
  }
19764
 
19765
+ llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
19766
+
19767
  if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
19768
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
19769
  llama_free(ctx);
 
19809
  std::vector<ggml_backend_t> backend_ptrs;
19810
  for (auto & backend : ctx->backends) {
19811
  auto * buft = ggml_backend_get_default_buffer_type(backend.get());
19812
+ auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
19813
+ if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
19814
  // use the host buffer of the first device CPU for faster transfer of the intermediate state
19815
  auto * dev = model->devices[0];
19816
  auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
 
19838
  // pipeline parallelism requires support for async compute and events in all devices
19839
  if (pipeline_parallel) {
19840
  for (auto & backend : ctx->backends) {
19841
+ auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
19842
+ if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
19843
  // ignore CPU backend
19844
  continue;
19845
  }
 
20013
  case LLM_ARCH_QWEN:
20014
  case LLM_ARCH_QWEN2:
20015
  case LLM_ARCH_QWEN2MOE:
20016
+ case LLM_ARCH_OLMO2:
20017
  case LLM_ARCH_OLMOE:
20018
  case LLM_ARCH_PHI2:
20019
  case LLM_ARCH_PHI3:
 
20406
  }
20407
 
20408
  bool llama_kv_cache_can_shift(struct llama_context * ctx) {
20409
+ return !ctx->kv_self.recurrent && ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
20410
  }
20411
 
20412
  // deprecated
 
21393
  void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
21394
  ctx->abort_callback = abort_callback;
21395
  ctx->abort_callback_data = abort_callback_data;
21396
+
21397
+ for (auto & backend : ctx->backends) {
21398
+ auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
21399
+ auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
21400
+ if (set_abort_callback_fn) {
21401
+ set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data);
21402
+ }
21403
+ }
21404
  }
21405
 
21406
  void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
 
21767
  // chat templates
21768
  //
21769
 
21770
+ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
21771
+ if (LLM_CHAT_TEMPLATES.find(tmpl) != LLM_CHAT_TEMPLATES.end()) {
21772
+ return LLM_CHAT_TEMPLATES.at(tmpl);
21773
+ }
21774
+ auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
21775
+ return tmpl.find(haystack) != std::string::npos;
21776
+ };
21777
+ if (tmpl_contains("<|im_start|>")) {
21778
+ return LLM_CHAT_TEMPLATE_CHATML;
21779
+ } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
21780
+ if (tmpl_contains("[SYSTEM_PROMPT]")) {
21781
+ return LLM_CHAT_TEMPLATE_MISTRAL_V7;
21782
+ } else if (
21783
+ // catches official 'v1' template
21784
+ tmpl_contains("' [INST] ' + system_message")
21785
+ // catches official 'v3' and 'v3-tekken' templates
21786
+ || tmpl_contains("[AVAILABLE_TOOLS]")
21787
+ ) {
21788
+ // Official mistral 'v1', 'v3' and 'v3-tekken' templates
21789
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
21790
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
21791
+ if (tmpl_contains(" [INST]")) {
21792
+ return LLM_CHAT_TEMPLATE_MISTRAL_V1;
21793
+ } else if (tmpl_contains("\"[INST]\"")) {
21794
+ return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN;
21795
+ }
21796
+ return LLM_CHAT_TEMPLATE_MISTRAL_V3;
21797
+ } else {
21798
+ // llama2 template and its variants
21799
+ // [variant] support system message
21800
+ // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
21801
+ bool support_system_message = tmpl_contains("<<SYS>>");
21802
+ bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
21803
+ bool strip_message = tmpl_contains("content.strip()");
21804
+ if (strip_message) {
21805
+ return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
21806
+ } else if (add_bos_inside_history) {
21807
+ return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
21808
+ } else if (support_system_message) {
21809
+ return LLM_CHAT_TEMPLATE_LLAMA_2_SYS;
21810
+ } else {
21811
+ return LLM_CHAT_TEMPLATE_LLAMA_2;
21812
+ }
21813
+ }
21814
+ } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
21815
+ return LLM_CHAT_TEMPLATE_PHI_3;
21816
+ } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
21817
+ return LLM_CHAT_TEMPLATE_ZEPHYR;
21818
+ } else if (tmpl_contains("bos_token + message['role']")) {
21819
+ return LLM_CHAT_TEMPLATE_MONARCH;
21820
+ } else if (tmpl_contains("<start_of_turn>")) {
21821
+ return LLM_CHAT_TEMPLATE_GEMMA;
21822
+ } else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
21823
+ // OrionStarAI/Orion-14B-Chat
21824
+ return LLM_CHAT_TEMPLATE_ORION;
21825
+ } else if (tmpl_contains("GPT4 Correct ")) {
21826
+ // openchat/openchat-3.5-0106
21827
+ return LLM_CHAT_TEMPLATE_OPENCHAT;
21828
+ } else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) {
21829
+ // eachadea/vicuna-13b-1.1 (and Orca variant)
21830
+ if (tmpl_contains("SYSTEM: ")) {
21831
+ return LLM_CHAT_TEMPLATE_VICUNA_ORCA;
21832
+ }
21833
+ return LLM_CHAT_TEMPLATE_VICUNA;
21834
+ } else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) {
21835
+ // deepseek-ai/deepseek-coder-33b-instruct
21836
+ return LLM_CHAT_TEMPLATE_DEEPSEEK;
21837
+ } else if (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) {
21838
+ // CohereForAI/c4ai-command-r-plus
21839
+ return LLM_CHAT_TEMPLATE_COMMAND_R;
21840
+ } else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) {
21841
+ return LLM_CHAT_TEMPLATE_LLAMA_3;
21842
+ } else if (tmpl_contains("[gMASK]sop")) {
21843
+ // chatglm3-6b
21844
+ return LLM_CHAT_TEMPLATE_CHATGML_3;
21845
+ } else if (tmpl_contains("[gMASK]<sop>")) {
21846
+ return LLM_CHAT_TEMPLATE_CHATGML_4;
21847
+ } else if (tmpl_contains(LU8("<用户>"))) {
21848
+ // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
21849
+ return LLM_CHAT_TEMPLATE_MINICPM;
21850
+ } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
21851
+ return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
21852
+ } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
21853
+ // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
21854
+ // EXAONE-3.0-7.8B-Instruct
21855
+ return LLM_CHAT_TEMPLATE_EXAONE_3;
21856
+ } else if (tmpl_contains("rwkv-world")) {
21857
+ return LLM_CHAT_TEMPLATE_RWKV_WORLD;
21858
+ } else if (tmpl_contains("<|start_of_role|>")) {
21859
+ return LLM_CHAT_TEMPLATE_GRANITE;
21860
+ }
21861
+ return LLM_CHAT_TEMPLATE_UNKNOWN;
21862
+ }
21863
+
21864
  // Simple version of "llama_apply_chat_template" that only works with strings
21865
  // This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
21866
  static int32_t llama_chat_apply_template_internal(
21867
+ const llm_chat_template tmpl,
21868
  const std::vector<const llama_chat_message *> & chat,
21869
  std::string & dest, bool add_ass) {
21870
  // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
21871
  std::stringstream ss;
21872
+ if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
21873
  // chatml template
21874
  for (auto message : chat) {
21875
  ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
 
21877
  if (add_ass) {
21878
  ss << "<|im_start|>assistant\n";
21879
  }
21880
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
21881
+ // Official mistral 'v7' template
21882
+ // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
21883
+ for (auto message : chat) {
21884
+ std::string role(message->role);
21885
+ std::string content(message->content);
21886
+ if (role == "system") {
21887
+ ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
21888
+ } else if (role == "user") {
21889
+ ss << "[INST] " << content << "[/INST]";
21890
+ }
21891
+ else {
21892
+ ss << " " << content << "</s>";
21893
+ }
21894
+ }
21895
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
21896
+ || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3
21897
+ || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) {
21898
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
21899
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
21900
+ std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : "";
21901
+ std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " ";
21902
+ bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3;
21903
+ bool is_inside_turn = false;
21904
+ for (auto message : chat) {
21905
+ if (!is_inside_turn) {
21906
+ ss << leading_space << "[INST]" << trailing_space;
21907
+ is_inside_turn = true;
21908
+ }
21909
+ std::string role(message->role);
21910
+ std::string content(message->content);
21911
+ if (role == "system") {
21912
+ ss << content << "\n\n";
21913
+ } else if (role == "user") {
21914
+ ss << content << leading_space << "[/INST]";
21915
+ } else {
21916
+ ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
21917
+ is_inside_turn = false;
21918
+ }
21919
+ }
21920
+ } else if (
21921
+ tmpl == LLM_CHAT_TEMPLATE_LLAMA_2
21922
+ || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS
21923
+ || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS
21924
+ || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) {
21925
  // llama2 template and its variants
21926
  // [variant] support system message
21927
+ // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
21928
+ bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2;
 
21929
  // [variant] add BOS inside history
21930
+ bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
21931
  // [variant] trim spaces from the input message
21932
+ bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
21933
  // construct the prompt
21934
  bool is_inside_turn = true; // skip BOS at the beginning
21935
  ss << "[INST] ";
 
21950
  } else if (role == "user") {
21951
  ss << content << " [/INST]";
21952
  } else {
21953
+ ss << content << "</s>";
21954
  is_inside_turn = false;
21955
  }
21956
  }
21957
+ } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) {
 
21958
  // Phi 3
21959
  for (auto message : chat) {
21960
  std::string role(message->role);
 
21963
  if (add_ass) {
21964
  ss << "<|assistant|>\n";
21965
  }
21966
+ } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
21967
  // zephyr template
21968
  for (auto message : chat) {
21969
  ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
 
21971
  if (add_ass) {
21972
  ss << "<|assistant|>\n";
21973
  }
21974
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) {
21975
  // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
21976
  for (auto message : chat) {
21977
  std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
 
21980
  if (add_ass) {
21981
  ss << "<s>assistant\n";
21982
  }
21983
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) {
21984
  // google/gemma-7b-it
21985
  std::string system_prompt = "";
21986
  for (auto message : chat) {
 
22002
  if (add_ass) {
22003
  ss << "<start_of_turn>model\n";
22004
  }
22005
+ } else if (tmpl == LLM_CHAT_TEMPLATE_ORION) {
22006
  // OrionStarAI/Orion-14B-Chat
22007
  std::string system_prompt = "";
22008
  for (auto message : chat) {
 
22022
  ss << message->content << "</s>";
22023
  }
22024
  }
22025
+ } else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) {
22026
  // openchat/openchat-3.5-0106,
22027
  for (auto message : chat) {
22028
  std::string role(message->role);
 
22036
  if (add_ass) {
22037
  ss << "GPT4 Correct Assistant:";
22038
  }
22039
+ } else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
22040
  // eachadea/vicuna-13b-1.1 (and Orca variant)
22041
  for (auto message : chat) {
22042
  std::string role(message->role);
22043
  if (role == "system") {
22044
  // Orca-Vicuna variant uses a system prefix
22045
+ if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
22046
  ss << "SYSTEM: " << message->content << "\n";
22047
  } else {
22048
  ss << message->content << "\n\n";
 
22056
  if (add_ass) {
22057
  ss << "ASSISTANT:";
22058
  }
22059
+ } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
22060
  // deepseek-ai/deepseek-coder-33b-instruct
22061
  for (auto message : chat) {
22062
  std::string role(message->role);
 
22071
  if (add_ass) {
22072
  ss << "### Response:\n";
22073
  }
22074
+ } else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) {
22075
  // CohereForAI/c4ai-command-r-plus
22076
  for (auto message : chat) {
22077
  std::string role(message->role);
 
22086
  if (add_ass) {
22087
  ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
22088
  }
22089
+ } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) {
22090
  // Llama 3
22091
  for (auto message : chat) {
22092
  std::string role(message->role);
 
22095
  if (add_ass) {
22096
  ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
22097
  }
22098
+ } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
22099
  // chatglm3-6b
22100
  ss << "[gMASK]" << "sop";
22101
  for (auto message : chat) {
 
22105
  if (add_ass) {
22106
  ss << "<|assistant|>";
22107
  }
22108
+ } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
22109
  ss << "[gMASK]" << "<sop>";
22110
  for (auto message : chat) {
22111
  std::string role(message->role);
 
22114
  if (add_ass) {
22115
  ss << "<|assistant|>";
22116
  }
22117
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
22118
  // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
22119
  for (auto message : chat) {
22120
  std::string role(message->role);
 
22126
  ss << trim(message->content);
22127
  }
22128
  }
22129
+ } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) {
22130
  // DeepSeek-V2
22131
  for (auto message : chat) {
22132
  std::string role(message->role);
 
22141
  if (add_ass) {
22142
  ss << "Assistant:";
22143
  }
22144
+ } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
22145
  // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
22146
  // EXAONE-3.0-7.8B-Instruct
22147
  for (auto message : chat) {
 
22157
  if (add_ass) {
22158
  ss << "[|assistant|]";
22159
  }
22160
+ } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
22161
  // this template requires the model to have "\n\n" as EOT token
22162
  for (auto message : chat) {
22163
  std::string role(message->role);
 
22167
  ss << message->content << "\n\n";
22168
  }
22169
  }
22170
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
22171
  // IBM Granite template
22172
  for (const auto & message : chat) {
22173
  std::string role(message->role);
 
22219
  }
22220
 
22221
  std::string formatted_chat;
22222
+ llm_chat_template detected_tmpl = llama_chat_detect_template(curr_tmpl);
22223
+ if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
22224
+ return -1;
22225
+ }
22226
+ int32_t res = llama_chat_apply_template_internal(detected_tmpl, chat_vec, formatted_chat, add_ass);
22227
  if (res < 0) {
22228
  return res;
22229
  }
 
22233
  return res;
22234
  }
22235
 
22236
+ int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
22237
+ auto it = LLM_CHAT_TEMPLATES.begin();
22238
+ for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
22239
+ output[i] = it->first.c_str();
22240
+ std::advance(it, 1);
22241
+ }
22242
+ return (int32_t) LLM_CHAT_TEMPLATES.size();
22243
+ }
22244
+
22245
  //
22246
  // sampling
22247
  //
 
22288
  }
22289
 
22290
  const char * llama_print_system_info(void) {
 
 
22291
  static std::string s;
22292
 
22293
+ for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
22294
+ auto * reg = ggml_backend_reg_get(i);
22295
+ auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
22296
+ if (get_features_fn) {
22297
+ ggml_backend_feature * features = get_features_fn(reg);
22298
+ s += ggml_backend_reg_name(reg);
22299
+ s += " : ";
22300
+ for (; features->name; features++) {
22301
+ s += features->name;
22302
+ s += " = ";
22303
+ s += features->value;
22304
+ s += " | ";
22305
+ }
22306
+ }
22307
+ }
22308
 
22309
  return s.c_str();
22310
  }
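
Note on the chat-template rework above: template detection is now a separate step (llama_chat_detect_template maps the template string to an llm_chat_template enum), and an unrecognized template makes llama_chat_apply_template return -1 instead of guessing. A minimal sketch of driving it with one of the named templates, assuming the public declaration in llama.h keeps the (model, tmpl, chat, n_msg, add_ass, buf, length) shape; the buffer-resizing pattern is illustrative only:

// sketch: apply the reworked chat-template path with a named template ("chatml")
// assumes llama.h declares llama_chat_apply_template(model, tmpl, chat, n_msg, add_ass, buf, length)
#include "llama.h"

#include <cstdio>
#include <vector>

int main() {
    const llama_chat_message chat[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!"                       },
    };

    std::vector<char> buf(1024);

    // passing a null model uses the explicit template string;
    // "chatml" is one of the keys in LLM_CHAT_TEMPLATES above
    int32_t n = llama_chat_apply_template(nullptr, "chatml", chat, 2, /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n < 0) {
        fprintf(stderr, "unknown template\n"); // unrecognized templates now yield a negative result
        return 1;
    }
    if ((size_t) n > buf.size()) { // the return value is the required length
        buf.resize(n);
        n = llama_chat_apply_template(nullptr, "chatml", chat, 2, true, buf.data(), (int32_t) buf.size());
    }
    printf("%.*s\n", n, buf.data());
    return 0;
}
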
examples/talk-llama/llama.h CHANGED
@@ -185,7 +185,8 @@ extern "C" {
185
  LLAMA_ROPE_SCALING_TYPE_NONE = 0,
186
  LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
187
  LLAMA_ROPE_SCALING_TYPE_YARN = 2,
188
- LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
 
189
  };
190
 
191
  enum llama_pooling_type {
@@ -272,6 +273,9 @@ extern "C" {
272
  };
273
 
274
  struct llama_model_params {
275
  int32_t n_gpu_layers; // number of layers to store in VRAM
276
  enum llama_split_mode split_mode; // how to split the model across multiple GPUs
277
 
@@ -987,6 +991,9 @@ extern "C" {
987
  char * buf,
988
  int32_t length);
989
 
990
  //
991
  // Sampling API
992
  //
 
185
  LLAMA_ROPE_SCALING_TYPE_NONE = 0,
186
  LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
187
  LLAMA_ROPE_SCALING_TYPE_YARN = 2,
188
+ LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3,
189
+ LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE,
190
  };
191
 
192
  enum llama_pooling_type {
 
273
  };
274
 
275
  struct llama_model_params {
276
+ // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
277
+ ggml_backend_dev_t * devices;
278
+
279
  int32_t n_gpu_layers; // number of layers to store in VRAM
280
  enum llama_split_mode split_mode; // how to split the model across multiple GPUs
281
 
 
991
  char * buf,
992
  int32_t length);
993
 
994
+ // Get list of built-in chat templates
995
+ LLAMA_API int32_t llama_chat_builtin_templates(const char ** output, size_t len);
996
+
997
  //
998
  // Sampling API
999
  //
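
The two llama.h additions above can be exercised together: llama_chat_builtin_templates enumerates the supported template names (a call with len == 0 just returns the total count, as the implementation above shows), and the new devices field lets the caller hand llama_load_model_from_file an explicit, NULL-terminated device list instead of every detected GPU. A rough sketch, assuming the ggml-backend device enumeration used earlier in this diff; "model.gguf" is a placeholder path:

// sketch: list built-in chat templates and restrict offloading to selected devices
#include "llama.h"
#include "ggml-backend.h"

#include <cstdio>
#include <vector>

int main() {
    llama_backend_init();

    // first call with len == 0 to get the count, then fetch the names
    int32_t n_tmpl = llama_chat_builtin_templates(nullptr, 0);
    std::vector<const char *> names(n_tmpl);
    llama_chat_builtin_templates(names.data(), names.size());
    for (const char * name : names) {
        printf("built-in template: %s\n", name);
    }

    // build a NULL-terminated device list containing only GPUs
    std::vector<ggml_backend_dev_t> devs;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
            devs.push_back(dev);
        }
    }
    devs.push_back(nullptr); // the list must be NULL-terminated

    llama_model_params mparams = llama_model_default_params();
    mparams.devices      = devs.data();
    mparams.n_gpu_layers = 99;

    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (model) {
        llama_free_model(model);
    }

    llama_backend_free();
    return 0;
}
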
examples/talk-llama/unicode.cpp CHANGED
@@ -201,7 +201,18 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
201
  }
202
 
203
  static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
204
  std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
205
  return conv.from_bytes(s);
206
  }
207
 
 
201
  }
202
 
203
  static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
204
+ #if defined(__clang__)
205
+ // disable C++17 deprecation warning for std::codecvt_utf8
206
+ # pragma clang diagnostic push
207
+ # pragma clang diagnostic ignored "-Wdeprecated-declarations"
208
+ #endif
209
+
210
  std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
211
+
212
+ #if defined(__clang__)
213
+ # pragma clang diagnostic pop
214
+ #endif
215
+
216
  return conv.from_bytes(s);
217
  }
218