evmar, ggerganov committed
Commit fc5e318 · unverified · 1 Parent(s): cde9ea7

whisper : expose API to let user control log output (#1060)


* expose api to let user control log output

  Add whisper_set_log_callback() that lets the user set a callback for log messages.

  Change all the fprintf(stderr, ...) calls to go through the new callback.

* whisper : add <cstdarg>

---------

Co-authored-by: Georgi Gerganov <[email protected]>
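
For illustration, a minimal sketch of how a client might use the new API. whisper_set_log_callback() and whisper_log_callback are exactly as declared in whisper.h below; the log_to_file name and the log-file path are hypothetical:

    #include <stdio.h>
    #include "whisper.h"

    // Hypothetical callback: append each formatted log line to a file
    // instead of printing it to stderr (the default behavior).
    static void log_to_file(const char * line) {
        FILE * f = fopen("whisper.log", "a");
        if (f) {
            fputs(line, f);
            fclose(f);
        }
    }

    int main(void) {
        whisper_set_log_callback(log_to_file);
        // ... subsequent whisper_init_from_file() / whisper_full() calls
        // now report through log_to_file instead of stderr
        return 0;
    }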

Files changed (2):
  1. whisper.cpp +113 -92
  2. whisper.h +5 -0
whisper.cpp CHANGED
@@ -14,6 +14,7 @@
 #define _USE_MATH_DEFINES
 #include <cmath>
 #include <cstdio>
+#include <cstdarg>
 #include <cstring>
 #include <fstream>
 #include <map>
@@ -92,7 +93,7 @@ static void byteswap_tensor(ggml_tensor * tensor) {
 #define WHISPER_ASSERT(x) \
     do { \
         if (!(x)) { \
-            fprintf(stderr, "WHISPER_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            log("WHISPER_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
             abort(); \
         } \
     } while (0)
@@ -723,6 +724,21 @@ struct whisper_context {
     std::string path_model; // populated by whisper_init_from_file()
 };
 
+static void whisper_default_log(const char * text) {
+    fprintf(stderr, "%s", text);
+}
+
+static whisper_log_callback whisper_log = whisper_default_log;
+
+static void log(const char * fmt, ...) {
+    if (!whisper_log) return;
+    char buf[1024];
+    va_list args;
+    va_start(args, fmt);
+    vsnprintf(buf, sizeof(buf), fmt, args);
+    whisper_log(buf);
+}
+
 template<typename T>
 static void read_safe(whisper_model_loader * loader, T & dest) {
     loader->read(loader->context, &dest, sizeof(T));
@@ -746,7 +762,7 @@ static bool kv_cache_init(
     cache.ctx = ggml_init(params);
 
     if (!cache.ctx) {
-        fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
+        log("%s: failed to allocate memory for kv cache\n", __func__);
         return false;
     }
 
@@ -782,7 +798,7 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
     cache.ctx = ggml_init(params);
 
     if (!cache.ctx) {
-        fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
+        log("%s: failed to allocate memory for kv cache\n", __func__);
         return false;
     }
 
@@ -811,7 +827,7 @@ static void kv_cache_free(struct whisper_kv_cache & cache) {
 // see the convert-pt-to-ggml.py script for details
 //
 static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
-    fprintf(stderr, "%s: loading model\n", __func__);
+    log("%s: loading model\n", __func__);
 
     const int64_t t_start_us = ggml_time_us();
 
@@ -825,7 +841,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
         uint32_t magic;
         read_safe(loader, magic);
         if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model data (bad magic)\n", __func__);
+            log("%s: invalid model data (bad magic)\n", __func__);
             return false;
         }
     }
@@ -876,25 +892,25 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
         // in order to save memory and also to speed up the computation
         wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
         if (wctx.wtype == GGML_TYPE_COUNT) {
-            fprintf(stderr, "%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
+            log("%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
             return false;
         }
 
         const size_t scale = model.hparams.ftype ? 1 : 2;
 
-        fprintf(stderr, "%s: n_vocab       = %d\n", __func__, hparams.n_vocab);
-        fprintf(stderr, "%s: n_audio_ctx   = %d\n", __func__, hparams.n_audio_ctx);
-        fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
-        fprintf(stderr, "%s: n_audio_head  = %d\n", __func__, hparams.n_audio_head);
-        fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
-        fprintf(stderr, "%s: n_text_ctx    = %d\n", __func__, hparams.n_text_ctx);
-        fprintf(stderr, "%s: n_text_state  = %d\n", __func__, hparams.n_text_state);
-        fprintf(stderr, "%s: n_text_head   = %d\n", __func__, hparams.n_text_head);
-        fprintf(stderr, "%s: n_text_layer  = %d\n", __func__, hparams.n_text_layer);
-        fprintf(stderr, "%s: n_mels        = %d\n", __func__, hparams.n_mels);
-        fprintf(stderr, "%s: ftype         = %d\n", __func__, model.hparams.ftype);
-        fprintf(stderr, "%s: qntvr         = %d\n", __func__, qntvr);
-        fprintf(stderr, "%s: type          = %d\n", __func__, model.type);
+        log("%s: n_vocab       = %d\n", __func__, hparams.n_vocab);
+        log("%s: n_audio_ctx   = %d\n", __func__, hparams.n_audio_ctx);
+        log("%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
+        log("%s: n_audio_head  = %d\n", __func__, hparams.n_audio_head);
+        log("%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
+        log("%s: n_text_ctx    = %d\n", __func__, hparams.n_text_ctx);
+        log("%s: n_text_state  = %d\n", __func__, hparams.n_text_state);
+        log("%s: n_text_head   = %d\n", __func__, hparams.n_text_head);
+        log("%s: n_text_layer  = %d\n", __func__, hparams.n_text_layer);
+        log("%s: n_mels        = %d\n", __func__, hparams.n_mels);
+        log("%s: ftype         = %d\n", __func__, model.hparams.ftype);
+        log("%s: qntvr         = %d\n", __func__, qntvr);
+        log("%s: type          = %d\n", __func__, model.type);
 
         // print memory requirements
         {
@@ -912,7 +928,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
             const size_t mem_required_decoder =
                 scale*MEM_REQ_KV_SELF.at(model.type);
 
-            fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per decoder)\n", __func__,
+            log("%s: mem required  = %7.2f MB (+ %7.2f MB per decoder)\n", __func__,
                     mem_required / 1024.0 / 1024.0, mem_required_decoder / 1024.0 / 1024.0);
         }
 
@@ -944,7 +960,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
         read_safe(loader, n_vocab);
 
         //if (n_vocab != model.hparams.n_vocab) {
-        //    fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
+        //    log("%s: invalid model file '%s' (bad vocab size %d != %d)\n",
        //            __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
         //    return false;
         //}
@@ -964,7 +980,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
                 word.assign(&tmp[0], tmp.size());
             } else {
                 // seems like we have an empty-string token in multi-language models (i = 50256)
-                //fprintf(stderr, "%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
+                //log("%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
                 word = "";
             }
 
@@ -988,7 +1004,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
         }
 
         if (n_vocab < model.hparams.n_vocab) {
-            fprintf(stderr, "%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
+            log("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
             for (int i = n_vocab; i < model.hparams.n_vocab; i++) {
                 if (i > vocab.token_beg) {
                     word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
@@ -1127,7 +1143,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
 
         ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead
 
-        fprintf(stderr, "%s: model ctx     = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        log("%s: model ctx     = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
 
     // create the ggml context
@@ -1140,7 +1156,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
 
         model.ctx = ggml_init(params);
         if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+            log("%s: ggml_init() failed\n", __func__);
             return false;
         }
     }
@@ -1373,20 +1389,20 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
             name.assign(&tmp[0], tmp.size());
 
             if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+                log("%s: unknown tensor '%s' in model file\n", __func__, name.data());
                 return false;
             }
 
             auto tensor = model.tensors[name.data()];
             if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                fprintf(stderr, "%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n",
+                log("%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+                log("%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n",
                         __func__, ne[0], ne[1], ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]);
                 return false;
             }
 
             if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
+                log("%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
                         __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], ne[0], ne[1], ne[2]);
                 return false;
             }
@@ -1394,7 +1410,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
             const size_t bpe = ggml_type_size(ggml_type(ttype));
 
             if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+                log("%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                         __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
                 return false;
             }
@@ -1407,12 +1423,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
             model.n_loaded++;
         }
 
-        fprintf(stderr, "%s: model size    = %7.2f MB\n", __func__, total_size/1024.0/1024.0);
+        log("%s: model size    = %7.2f MB\n", __func__, total_size/1024.0/1024.0);
 
         if (model.n_loaded == 0) {
-            fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
+            log("%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
         } else if (model.n_loaded != (int) model.tensors.size()) {
-            fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
+            log("%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
             return false;
         }
     }
@@ -2616,7 +2632,7 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, const std::string & text) {
                 --j;
             }
             if (!found) {
-                fprintf(stderr, "unknown token \n");
+                log("unknown token\n");
                 ++i;
             }
         }
@@ -2683,41 +2699,41 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     const size_t scale = ctx->model.hparams.ftype ? 1 : 2;
 
     if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) {
-        fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
+        log("%s: kv_cache_init() failed for self-attention cache\n", __func__);
         delete state;
         return nullptr;
     }
 
     {
         const size_t memory_size = ggml_nbytes(state->decoders[0].kv_self.k) + ggml_nbytes(state->decoders[0].kv_self.v);
-        fprintf(stderr, "%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+        log("%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }
 
     if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
-        fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
+        log("%s: kv_cache_init() failed for cross-attention cache\n", __func__);
         delete state;
         return nullptr;
     }
 
     {
         const size_t memory_size = ggml_nbytes(state->kv_cross.k) + ggml_nbytes(state->kv_cross.v);
-        fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+        log("%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }
 
 #ifdef WHISPER_USE_COREML
     const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model);
 
-    fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
-    fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
+    log("%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
+    log("%s: first run on a device may take a while ...\n", __func__);
 
     state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
     if (!state->ctx_coreml) {
-        fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
+        log("%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
 #ifndef WHISPER_COREML_ALLOW_FALLBACK
         return nullptr;
 #endif
     } else {
-        fprintf(stderr, "%s: Core ML model loaded\n", __func__);
+        log("%s: Core ML model loaded\n", __func__);
     }
 #endif
 
@@ -2757,7 +2773,7 @@ int whisper_ctx_init_openvino_encoder(
     return 1;
 #else
     if (!model_path && ctx->path_model.empty()) {
-        fprintf(stderr, "%s: model_path is nullptr, and ctx has no model_path set.\n", __func__);
+        log("%s: model_path is nullptr, and ctx has no model_path set.\n", __func__);
         return 1;
     }
 
@@ -2777,15 +2793,15 @@ int whisper_ctx_init_openvino_encoder(
         path_cache = cache_dir;
     }
 
-    fprintf(stderr, "%s: loading OpenVINO model from '%s'\n", __func__, path_encoder.c_str());
-    fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
+    log("%s: loading OpenVINO model from '%s'\n", __func__, path_encoder.c_str());
+    log("%s: first run on a device may take a while ...\n", __func__);
 
     ctx->state->ctx_openvino = whisper_openvino_init(path_encoder.c_str(), device, path_cache.c_str());
     if (!ctx->state->ctx_openvino) {
-        fprintf(stderr, "%s: failed to init OpenVINO encoder from '%s'\n", __func__, path_encoder.c_str());
+        log("%s: failed to init OpenVINO encoder from '%s'\n", __func__, path_encoder.c_str());
         return 1;
     } else {
-        fprintf(stderr, "%s: OpenVINO model loaded\n", __func__);
+        log("%s: OpenVINO model loaded\n", __func__);
     }
 
     return 0;
@@ -2794,11 +2810,11 @@ int whisper_ctx_init_openvino_encoder(
 
 struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
 
-    fprintf(stderr, "%s: loading model from '%s'\n", __func__, path_model);
+    log("%s: loading model from '%s'\n", __func__, path_model);
 
     auto fin = std::ifstream(path_model, std::ios::binary);
     if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_model);
+        log("%s: failed to open '%s'\n", __func__, path_model);
         return nullptr;
     }
 
@@ -2840,7 +2856,7 @@ struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size) {
 
     buf_context ctx = { reinterpret_cast<uint8_t*>(buffer), buffer_size, 0 };
 
-    fprintf(stderr, "%s: loading model from buffer\n", __func__);
+    log("%s: loading model from buffer\n", __func__);
 
     whisper_model_loader loader = {};
 
@@ -2875,7 +2891,7 @@ struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader) {
 
     if (!whisper_model_load(loader, *ctx)) {
        loader->close(loader->context);
-        fprintf(stderr, "%s: failed to load model\n", __func__);
+        log("%s: failed to load model\n", __func__);
         delete ctx;
         return nullptr;
     }
@@ -2980,7 +2996,7 @@ void whisper_free_params(struct whisper_full_params * params) {
 
 int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
     if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
-        fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
+        log("%s: failed to compute mel spectrogram\n", __func__);
         return -1;
     }
 
@@ -2994,7 +3010,7 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
 // same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2
 int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
     if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, true, state->mel)) {
-        fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
+        log("%s: failed to compute mel spectrogram\n", __func__);
         return -1;
     }
 
@@ -3013,7 +3029,7 @@ int whisper_set_mel_with_state(
         int n_len,
         int n_mel) {
     if (n_mel != WHISPER_N_MEL) {
-        fprintf(stderr, "%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
+        log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
         return -1;
     }
 
@@ -3037,7 +3053,7 @@ int whisper_set_mel(
 
 int whisper_encode_with_state(struct whisper_context * ctx, struct whisper_state * state, int offset, int n_threads) {
     if (!whisper_encode_internal(*ctx, *state, offset, n_threads)) {
-        fprintf(stderr, "%s: failed to eval\n", __func__);
+        log("%s: failed to eval\n", __func__);
         return -1;
     }
 
@@ -3046,7 +3062,7 @@ int whisper_encode_with_state(struct whisper_context * ctx, struct whisper_state * state, int offset, int n_threads) {
 
 int whisper_encode(struct whisper_context * ctx, int offset, int n_threads) {
     if (!whisper_encode_internal(*ctx, *ctx->state, offset, n_threads)) {
-        fprintf(stderr, "%s: failed to eval\n", __func__);
+        log("%s: failed to eval\n", __func__);
         return -1;
     }
 
@@ -3057,7 +3073,7 @@ int whisper_decode_with_state(struct whisper_context * ctx, struct whisper_state * state, const whisper_token * tokens, int n_tokens, int n_past, int n_threads) {
     const int selected_decoder_id = 0;
 
     if (!whisper_decode_internal(*ctx, *state, state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
-        fprintf(stderr, "%s: failed to eval\n", __func__);
+        log("%s: failed to eval\n", __func__);
         return 1;
     }
 
@@ -3069,13 +3085,13 @@ int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, int n_tokens, int n_past, int n_threads) {
     const int selected_decoder_id = 0;
 
     if (ctx->state == nullptr) {
-        fprintf(stderr, "%s: ERROR state was not loaded.\n", __func__);
+        log("%s: ERROR state was not loaded.\n", __func__);
         return false;
     }
 
 
     if (!whisper_decode_internal(*ctx, *ctx->state, ctx->state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
-        fprintf(stderr, "%s: failed to eval\n", __func__);
+        log("%s: failed to eval\n", __func__);
         return 1;
     }
 
@@ -3086,7 +3102,7 @@ int whisper_tokenize(struct whisper_context * ctx, const char * text, whisper_token * tokens, int n_max_tokens) {
     const auto res = tokenize(ctx->vocab, text);
 
     if (n_max_tokens < (int) res.size()) {
-        fprintf(stderr, "%s: too many resulting tokens: %d (max %d)\n", __func__, (int) res.size(), n_max_tokens);
+        log("%s: too many resulting tokens: %d (max %d)\n", __func__, (int) res.size(), n_max_tokens);
         return -1;
    }
 
@@ -3114,7 +3130,7 @@ int whisper_lang_id(const char * lang) {
             }
         }
 
-        fprintf(stderr, "%s: unknown language '%s'\n", __func__, lang);
+        log("%s: unknown language '%s'\n", __func__, lang);
         return -1;
     }
     return g_lang.at(lang).first;
@@ -3127,7 +3143,7 @@ const char * whisper_lang_str(int id) {
         }
     }
 
-    fprintf(stderr, "%s: unknown language id %d\n", __func__, id);
+    log("%s: unknown language id %d\n", __func__, id);
     return nullptr;
 }
 
@@ -3140,25 +3156,25 @@ int whisper_lang_auto_detect_with_state(
     const int seek = offset_ms/10;
 
     if (seek < 0) {
-        fprintf(stderr, "%s: offset %dms is before the start of the audio\n", __func__, offset_ms);
+        log("%s: offset %dms is before the start of the audio\n", __func__, offset_ms);
         return -1;
     }
 
     if (seek >= state->mel.n_len_org) {
-        fprintf(stderr, "%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len_org*10);
+        log("%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len_org*10);
         return -2;
     }
 
     // run the encoder
     if (whisper_encode_with_state(ctx, state, seek, n_threads) != 0) {
-        fprintf(stderr, "%s: failed to encode\n", __func__);
+        log("%s: failed to encode\n", __func__);
         return -6;
     }
 
     const std::vector<whisper_token> prompt = { whisper_token_sot(ctx) };
 
     if (whisper_decode_with_state(ctx, state, prompt.data(), prompt.size(), 0, n_threads) != 0) {
-        fprintf(stderr, "%s: failed to decode\n", __func__);
+        log("%s: failed to decode\n", __func__);
         return -7;
     }
 
@@ -3359,21 +3375,21 @@ whisper_token whisper_token_transcribe(struct whisper_context * ctx) {
 void whisper_print_timings(struct whisper_context * ctx) {
     const int64_t t_end_us = ggml_time_us();
 
-    fprintf(stderr, "\n");
-    fprintf(stderr, "%s:     load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
+    log("\n");
+    log("%s:     load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
     if (ctx->state != nullptr) {
 
         const int32_t n_sample = std::max(1, ctx->state->n_sample);
         const int32_t n_encode = std::max(1, ctx->state->n_encode);
         const int32_t n_decode = std::max(1, ctx->state->n_decode);
 
-        fprintf(stderr, "%s:     fallbacks = %3d p / %3d h\n", __func__, ctx->state->n_fail_p, ctx->state->n_fail_h);
-        fprintf(stderr, "%s:      mel time = %8.2f ms\n", __func__, ctx->state->t_mel_us / 1000.0f);
-        fprintf(stderr, "%s:   sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_sample_us, n_sample, 1e-3f * ctx->state->t_sample_us / n_sample);
-        fprintf(stderr, "%s:   encode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_encode_us, n_encode, 1e-3f * ctx->state->t_encode_us / n_encode);
-        fprintf(stderr, "%s:   decode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_decode_us, n_decode, 1e-3f * ctx->state->t_decode_us / n_decode);
+        log("%s:     fallbacks = %3d p / %3d h\n", __func__, ctx->state->n_fail_p, ctx->state->n_fail_h);
+        log("%s:      mel time = %8.2f ms\n", __func__, ctx->state->t_mel_us / 1000.0f);
+        log("%s:   sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_sample_us, n_sample, 1e-3f * ctx->state->t_sample_us / n_sample);
+        log("%s:   encode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_encode_us, n_encode, 1e-3f * ctx->state->t_encode_us / n_encode);
+        log("%s:   decode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_decode_us, n_decode, 1e-3f * ctx->state->t_decode_us / n_decode);
     }
-    fprintf(stderr, "%s:    total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
+    log("%s:    total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
 }
 
 void whisper_reset_timings(struct whisper_context * ctx) {
@@ -3697,7 +3713,7 @@ static void whisper_process_logits(
     const bool last_was_timestamp = tokens_cur.size() > 0 && tokens_cur.back().id >= vocab.token_beg;
     const bool penultimate_was_timestamp = tokens_cur.size() < 2 || tokens_cur[tokens_cur.size() - 2].id >= vocab.token_beg;
 
-    //fprintf(stderr, "last_was_timestamp=%d penultimate_was_timestamp=%d\n", last_was_timestamp, penultimate_was_timestamp);
+    //log("last_was_timestamp=%d penultimate_was_timestamp=%d\n", last_was_timestamp, penultimate_was_timestamp);
 
     if (last_was_timestamp) {
         if (penultimate_was_timestamp) {
@@ -3773,7 +3789,7 @@ static void whisper_process_logits(
 
         const float max_text_token_logprob = *std::max_element(logprobs.begin(), logprobs.begin() + vocab.token_beg);
 
-        //fprintf(stderr, "timestamp_logprob=%f max_text_token_logprob=%f\n", timestamp_logprob, max_text_token_logprob);
+        //log("timestamp_logprob=%f max_text_token_logprob=%f\n", timestamp_logprob, max_text_token_logprob);
 
         if (timestamp_logprob > max_text_token_logprob) {
             for (int i = 0; i < vocab.token_beg; ++i) {
@@ -4022,12 +4038,12 @@ int whisper_full_with_state(
    // compute log mel spectrogram
    if (params.speed_up) {
        if (whisper_pcm_to_mel_phase_vocoder_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
-            fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
+            log("%s: failed to compute log mel spectrogram\n", __func__);
            return -1;
        }
    } else {
        if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
-            fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
+            log("%s: failed to compute log mel spectrogram\n", __func__);
            return -2;
        }
    }
@@ -4038,13 +4054,13 @@ int whisper_full_with_state(
 
        const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data());
        if (lang_id < 0) {
-            fprintf(stderr, "%s: failed to auto-detect language\n", __func__);
+            log("%s: failed to auto-detect language\n", __func__);
            return -3;
        }
        state->lang_id = lang_id;
        params.language = whisper_lang_str(lang_id);
 
-        fprintf(stderr, "%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
+        log("%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
        if (params.detect_language) {
            return 0;
        }
@@ -4101,7 +4117,7 @@ int whisper_full_with_state(
            if (decoder.kv_self.ctx == nullptr) {
                decoder.kv_self = state->decoders[0].kv_self;
                if (!kv_cache_reinit(decoder.kv_self)) {
-                    fprintf(stderr, "%s: kv_cache_reinit() failed for self-attention, decoder %d\n", __func__, j);
+                    log("%s: kv_cache_reinit() failed for self-attention, decoder %d\n", __func__, j);
                    return -4;
                }
 
@@ -4145,7 +4161,7 @@ int whisper_full_with_state(
 
    // overwrite audio_ctx, max allowed is hparams.n_audio_ctx
    if (params.audio_ctx > whisper_n_audio_ctx(ctx)) {
-        fprintf(stderr, "%s: audio_ctx is larger than the maximum allowed (%d > %d)\n", __func__, params.audio_ctx, whisper_n_audio_ctx(ctx));
+        log("%s: audio_ctx is larger than the maximum allowed (%d > %d)\n", __func__, params.audio_ctx, whisper_n_audio_ctx(ctx));
        return -5;
    }
    state->exp_n_audio_ctx = params.audio_ctx;
@@ -4189,8 +4205,9 @@ int whisper_full_with_state(
 
    // main loop
    while (true) {
-        const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
        if (params.progress_callback) {
+            const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
+
            params.progress_callback(
                ctx, ctx->state, progress_cur, params.progress_callback_user_data);
        }
@@ -4202,14 +4219,14 @@ int whisper_full_with_state(
 
        if (params.encoder_begin_callback) {
            if (params.encoder_begin_callback(ctx, state, params.encoder_begin_callback_user_data) == false) {
-                fprintf(stderr, "%s: encoder_begin_callback returned false - aborting\n", __func__);
+                log("%s: encoder_begin_callback returned false - aborting\n", __func__);
                break;
            }
        }
 
        // encode audio features starting at offset seek
        if (!whisper_encode_internal(*ctx, *state, seek, params.n_threads)) {
-            fprintf(stderr, "%s: failed to encode\n", __func__);
+            log("%s: failed to encode\n", __func__);
            return -6;
        }
 
@@ -4292,7 +4309,7 @@ int whisper_full_with_state(
            WHISPER_PRINT_DEBUG("\n\n");
 
            if (!whisper_decode_internal(*ctx, *state, state->decoders[0], prompt.data(), prompt.size(), 0, params.n_threads)) {
-                fprintf(stderr, "%s: failed to decode\n", __func__);
+                log("%s: failed to decode\n", __func__);
                return -7;
            }
 
@@ -4530,7 +4547,7 @@ int whisper_full_with_state(
                    //WHISPER_PRINT_DEBUG("%s: decoder %d: token %d, kv_self.n %d, seek_delta %d\n", __func__, j, decoder.tokens_tmp[0], decoder.kv_self.n, decoder.seek_delta);
 
                    if (!whisper_decode_internal(*ctx, *state, decoder, decoder.tokens_tmp.data(), decoder.tokens_tmp.size(), decoder.kv_self.n, params.n_threads)) {
-                        fprintf(stderr, "%s: failed to decode\n", __func__);
+                        log("%s: failed to decode\n", __func__);
                        return -8;
                    }
 
@@ -4852,12 +4869,12 @@ int whisper_full_parallel(
    ctx->state->t_decode_us /= n_processors;
 
    // print information about the audio boundaries
-    fprintf(stderr, "\n");
-    fprintf(stderr, "%s: the audio has been split into %d chunks at the following times:\n", __func__, n_processors);
+    log("\n");
+    log("%s: the audio has been split into %d chunks at the following times:\n", __func__, n_processors);
    for (int i = 0; i < n_processors - 1; ++i) {
-        fprintf(stderr, "%s: split %d - %s\n", __func__, (i + 1), to_timestamp(100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t).c_str());
+        log("%s: split %d - %s\n", __func__, (i + 1), to_timestamp(100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t).c_str());
    }
-    fprintf(stderr, "%s: the transcription quality may be degraded near these boundaries\n", __func__);
+    log("%s: the transcription quality may be degraded near these boundaries\n", __func__);
 
    return ret;
 }
@@ -5217,7 +5234,7 @@ static void whisper_exp_compute_token_level_timestamps(
    const int n_samples = state.energy.size();
 
    if (n_samples == 0) {
-        fprintf(stderr, "%s: no signal data available\n", __func__);
+        log("%s: no signal data available\n", __func__);
        return;
    }
 
@@ -5437,3 +5454,7 @@ static void whisper_exp_compute_token_level_timestamps(
    //    }
    //}
 }
+
+void whisper_set_log_callback(whisper_log_callback callback) {
+    whisper_log = callback;
+}
whisper.h CHANGED
@@ -517,6 +517,11 @@ extern "C" {
     WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads);
     WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
 
+    // Control logging output; default behavior is to print to stderr
+
+    typedef void (*whisper_log_callback)(const char * line);
+    WHISPER_API void whisper_set_log_callback(whisper_log_callback callback);
+
 #ifdef __cplusplus
 }
 #endif
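
Note: judging from the log() implementation added above (the "if (!whisper_log) return;" check), passing a null callback appears to silence all library output; this is an inference from the shown code, not documented behavior:

    whisper_set_log_callback(NULL); // assumption: disables logging, based on the null check in log()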