whisper : expose API to let user control log output (#1060)
* expose API to let the user control log output

  Add whisper_set_log_callback(), which lets the user set a callback for
  log messages, and change all of the library's fprintf(stderr, ...) calls
  to go through it.

* whisper : add <cstdarg>

---------

Co-authored-by: Georgi Gerganov <[email protected]>
- whisper.cpp +113 -92
- whisper.h +5 -0
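For illustration, a minimal sketch of how a client could use the new API (this example is not part of the PR; the callback name, log file name, and model path are hypothetical):

    #include "whisper.h"

    #include <cstdio>

    // Hypothetical callback: forward every log line the library emits to a file.
    static FILE * g_log_file = nullptr;

    static void log_to_file(const char * line) {
        if (g_log_file) {
            fputs(line, g_log_file);
        }
    }

    int main() {
        g_log_file = fopen("whisper.log", "w");

        // From here on, the library prints nothing to stderr on its own.
        whisper_set_log_callback(log_to_file);

        struct whisper_context * wctx = whisper_init_from_file("models/ggml-base.en.bin");
        if (wctx) {
            whisper_free(wctx);
        }

        if (g_log_file) {
            fclose(g_log_file);
        }
        return 0;
    }

Passing nullptr instead silences the library entirely, since the internal log() helper added below returns early when the callback is null.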
whisper.cpp
CHANGED
@@ -14,6 +14,7 @@
 #define _USE_MATH_DEFINES
 #include <cmath>
 #include <cstdio>
+#include <cstdarg>
 #include <cstring>
 #include <fstream>
 #include <map>
@@ -92,7 +93,7 @@ static void byteswap_tensor(ggml_tensor * tensor) {
 #define WHISPER_ASSERT(x) \
     do { \
         if (!(x)) { \
-            fprintf(stderr, "WHISPER_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            log("WHISPER_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
             abort(); \
         } \
     } while (0)
@@ -723,6 +724,21 @@ struct whisper_context {
     std::string path_model; // populated by whisper_init_from_file()
 };
 
+static void whisper_default_log(const char * text) {
+    fprintf(stderr, "%s", text);
+}
+
+static whisper_log_callback whisper_log = whisper_default_log;
+
+static void log(const char * fmt, ...) {
+    if (!whisper_log) return;
+    char buf[1024];
+    va_list args;
+    va_start(args, fmt);
+    vsnprintf(buf, sizeof(buf), fmt, args);
+    whisper_log(buf);
+}
+
 template<typename T>
 static void read_safe(whisper_model_loader * loader, T & dest) {
     loader->read(loader->context, &dest, sizeof(T));
@@ -746,7 +762,7 @@ static bool kv_cache_init(
     cache.ctx = ggml_init(params);
 
     if (!cache.ctx) {
-        fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
+        log("%s: failed to allocate memory for kv cache\n", __func__);
         return false;
     }
 
@@ -782,7 +798,7 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
     cache.ctx = ggml_init(params);
 
     if (!cache.ctx) {
-        fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
+        log("%s: failed to allocate memory for kv cache\n", __func__);
         return false;
     }
 
@@ -811,7 +827,7 @@ static void kv_cache_free(struct whisper_kv_cache & cache) {
 // see the convert-pt-to-ggml.py script for details
 //
 static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
-    fprintf(stderr, "%s: loading model\n", __func__);
+    log("%s: loading model\n", __func__);
 
     const int64_t t_start_us = ggml_time_us();
 
@@ -825,7 +841,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         uint32_t magic;
         read_safe(loader, magic);
         if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model data (bad magic)\n", __func__);
+            log("%s: invalid model data (bad magic)\n", __func__);
             return false;
         }
     }
@@ -876,25 +892,25 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         // in order to save memory and also to speed up the computation
         wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
         if (wctx.wtype == GGML_TYPE_COUNT) {
-            fprintf(stderr, "%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
+            log("%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
             return false;
         }
 
         const size_t scale = model.hparams.ftype ? 1 : 2;
 
-        fprintf(stderr, "%s: n_vocab       = %d\n", __func__, hparams.n_vocab);
-        fprintf(stderr, "%s: n_audio_ctx   = %d\n", __func__, hparams.n_audio_ctx);
-        fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
-        fprintf(stderr, "%s: n_audio_head  = %d\n", __func__, hparams.n_audio_head);
-        fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
-        fprintf(stderr, "%s: n_text_ctx    = %d\n", __func__, hparams.n_text_ctx);
-        fprintf(stderr, "%s: n_text_state  = %d\n", __func__, hparams.n_text_state);
-        fprintf(stderr, "%s: n_text_head   = %d\n", __func__, hparams.n_text_head);
-        fprintf(stderr, "%s: n_text_layer  = %d\n", __func__, hparams.n_text_layer);
-        fprintf(stderr, "%s: n_mels        = %d\n", __func__, hparams.n_mels);
-        fprintf(stderr, "%s: ftype         = %d\n", __func__, model.hparams.ftype);
-        fprintf(stderr, "%s: qntvr         = %d\n", __func__, qntvr);
-        fprintf(stderr, "%s: type          = %d\n", __func__, model.type);
+        log("%s: n_vocab       = %d\n", __func__, hparams.n_vocab);
+        log("%s: n_audio_ctx   = %d\n", __func__, hparams.n_audio_ctx);
+        log("%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
+        log("%s: n_audio_head  = %d\n", __func__, hparams.n_audio_head);
+        log("%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
+        log("%s: n_text_ctx    = %d\n", __func__, hparams.n_text_ctx);
+        log("%s: n_text_state  = %d\n", __func__, hparams.n_text_state);
+        log("%s: n_text_head   = %d\n", __func__, hparams.n_text_head);
+        log("%s: n_text_layer  = %d\n", __func__, hparams.n_text_layer);
+        log("%s: n_mels        = %d\n", __func__, hparams.n_mels);
+        log("%s: ftype         = %d\n", __func__, model.hparams.ftype);
+        log("%s: qntvr         = %d\n", __func__, qntvr);
+        log("%s: type          = %d\n", __func__, model.type);
 
         // print memory requirements
         {
@@ -912,7 +928,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             const size_t mem_required_decoder =
                 scale*MEM_REQ_KV_SELF.at(model.type);
 
-            fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per decoder)\n", __func__,
+            log("%s: mem required  = %7.2f MB (+ %7.2f MB per decoder)\n", __func__,
                     mem_required / 1024.0 / 1024.0, mem_required_decoder / 1024.0 / 1024.0);
         }
 
@@ -944,7 +960,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         read_safe(loader, n_vocab);
 
         //if (n_vocab != model.hparams.n_vocab) {
-        //    fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
+        //    log("%s: invalid model file '%s' (bad vocab size %d != %d)\n",
         //            __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
         //    return false;
         //}
@@ -964,7 +980,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                 word.assign(&tmp[0], tmp.size());
             } else {
                 // seems like we have an empty-string token in multi-language models (i = 50256)
-                //fprintf(stderr, "%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
+                //log("%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
                 word = "";
             }
 
@@ -988,7 +1004,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         }
 
         if (n_vocab < model.hparams.n_vocab) {
-            fprintf(stderr, "%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
+            log("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
             for (int i = n_vocab; i < model.hparams.n_vocab; i++) {
                 if (i > vocab.token_beg) {
                     word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
@@ -1127,7 +1143,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
         ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead
 
-        fprintf(stderr, "%s: model ctx     = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        log("%s: model ctx     = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
 
     // create the ggml context
@@ -1140,7 +1156,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
         model.ctx = ggml_init(params);
         if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+            log("%s: ggml_init() failed\n", __func__);
             return false;
         }
     }
@@ -1373,20 +1389,20 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             name.assign(&tmp[0], tmp.size());
 
             if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+                log("%s: unknown tensor '%s' in model file\n", __func__, name.data());
                 return false;
             }
 
             auto tensor = model.tensors[name.data()];
             if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                fprintf(stderr, "%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n",
+                log("%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+                log("%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n",
                         __func__, ne[0], ne[1], ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]);
                 return false;
             }
 
             if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
+                log("%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
                         __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], ne[0], ne[1], ne[2]);
                 return false;
             }
@@ -1394,7 +1410,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             const size_t bpe = ggml_type_size(ggml_type(ttype));
 
             if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+                log("%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                         __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
                 return false;
             }
@@ -1407,12 +1423,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             model.n_loaded++;
         }
 
-        fprintf(stderr, "%s: model size    = %7.2f MB\n", __func__, total_size/1024.0/1024.0);
+        log("%s: model size    = %7.2f MB\n", __func__, total_size/1024.0/1024.0);
 
         if (model.n_loaded == 0) {
-            fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
+            log("%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
         } else if (model.n_loaded != (int) model.tensors.size()) {
-            fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
+            log("%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
             return false;
         }
     }
@@ -2616,7 +2632,7 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
                 --j;
             }
             if (!found) {
-                fprintf(stderr, "unknown token\n");
+                log("unknown token\n");
                 ++i;
             }
         }
@@ -2683,41 +2699,41 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     const size_t scale = ctx->model.hparams.ftype ? 1 : 2;
 
     if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) {
-        fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
+        log("%s: kv_cache_init() failed for self-attention cache\n", __func__);
         delete state;
         return nullptr;
     }
 
     {
         const size_t memory_size = ggml_nbytes(state->decoders[0].kv_self.k) + ggml_nbytes(state->decoders[0].kv_self.v);
-        fprintf(stderr, "%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+        log("%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }
 
     if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
-        fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
+        log("%s: kv_cache_init() failed for cross-attention cache\n", __func__);
         delete state;
         return nullptr;
     }
 
     {
         const size_t memory_size = ggml_nbytes(state->kv_cross.k) + ggml_nbytes(state->kv_cross.v);
-        fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+        log("%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }
 
 #ifdef WHISPER_USE_COREML
     const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model);
 
-    fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
-    fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
+    log("%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
+    log("%s: first run on a device may take a while ...\n", __func__);
 
     state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
     if (!state->ctx_coreml) {
-        fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
+        log("%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
 #ifndef WHISPER_COREML_ALLOW_FALLBACK
         return nullptr;
 #endif
     } else {
-        fprintf(stderr, "%s: Core ML model loaded\n", __func__);
+        log("%s: Core ML model loaded\n", __func__);
     }
 #endif
 
@@ -2757,7 +2773,7 @@ int whisper_ctx_init_openvino_encoder(
     return 1;
 #else
     if (!model_path && ctx->path_model.empty()) {
-        fprintf(stderr, "%s: model_path is nullptr, and ctx has no model_path set.\n", __func__);
+        log("%s: model_path is nullptr, and ctx has no model_path set.\n", __func__);
         return 1;
     }
 
@@ -2777,15 +2793,15 @@ int whisper_ctx_init_openvino_encoder(
         path_cache = cache_dir;
     }
 
-    fprintf(stderr, "%s: loading OpenVINO model from '%s'\n", __func__, path_encoder.c_str());
-    fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
+    log("%s: loading OpenVINO model from '%s'\n", __func__, path_encoder.c_str());
+    log("%s: first run on a device may take a while ...\n", __func__);
 
     ctx->state->ctx_openvino = whisper_openvino_init(path_encoder.c_str(), device, path_cache.c_str());
     if (!ctx->state->ctx_openvino) {
-        fprintf(stderr, "%s: failed to init OpenVINO encoder from '%s'\n", __func__, path_encoder.c_str());
+        log("%s: failed to init OpenVINO encoder from '%s'\n", __func__, path_encoder.c_str());
         return 1;
     } else {
-        fprintf(stderr, "%s: OpenVINO model loaded\n", __func__);
+        log("%s: OpenVINO model loaded\n", __func__);
     }
 
     return 0;
@@ -2794,11 +2810,11 @@ int whisper_ctx_init_openvino_encoder(
 
 struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
 
-    fprintf(stderr, "%s: loading model from '%s'\n", __func__, path_model);
+    log("%s: loading model from '%s'\n", __func__, path_model);
 
     auto fin = std::ifstream(path_model, std::ios::binary);
     if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_model);
+        log("%s: failed to open '%s'\n", __func__, path_model);
         return nullptr;
     }
 
@@ -2840,7 +2856,7 @@ struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t
 
     buf_context ctx = { reinterpret_cast<uint8_t*>(buffer), buffer_size, 0 };
 
-    fprintf(stderr, "%s: loading model from buffer\n", __func__);
+    log("%s: loading model from buffer\n", __func__);
 
     whisper_model_loader loader = {};
 
@@ -2875,7 +2891,7 @@ struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loa
 
     if (!whisper_model_load(loader, *ctx)) {
         loader->close(loader->context);
-        fprintf(stderr, "%s: failed to load model\n", __func__);
+        log("%s: failed to load model\n", __func__);
         delete ctx;
         return nullptr;
     }
@@ -2980,7 +2996,7 @@ void whisper_free_params(struct whisper_full_params * params) {
 
 int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
     if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
-        fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
+        log("%s: failed to compute mel spectrogram\n", __func__);
         return -1;
     }
 
@@ -2994,7 +3010,7 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int
 // same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2
 int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
     if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, true, state->mel)) {
-        fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
+        log("%s: failed to compute mel spectrogram\n", __func__);
         return -1;
     }
 
@@ -3013,7 +3029,7 @@ int whisper_set_mel_with_state(
         int n_len,
         int n_mel) {
     if (n_mel != WHISPER_N_MEL) {
-        fprintf(stderr, "%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
+        log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
         return -1;
     }
 
@@ -3037,7 +3053,7 @@ int whisper_set_mel(
 
 int whisper_encode_with_state(struct whisper_context * ctx, struct whisper_state * state, int offset, int n_threads) {
     if (!whisper_encode_internal(*ctx, *state, offset, n_threads)) {
-        fprintf(stderr, "%s: failed to eval\n", __func__);
+        log("%s: failed to eval\n", __func__);
         return -1;
     }
 
@@ -3046,7 +3062,7 @@ int whisper_encode_with_state(struct whisper_context * ctx, struct whisper_state
 
 int whisper_encode(struct whisper_context * ctx, int offset, int n_threads) {
     if (!whisper_encode_internal(*ctx, *ctx->state, offset, n_threads)) {
-        fprintf(stderr, "%s: failed to eval\n", __func__);
+        log("%s: failed to eval\n", __func__);
         return -1;
     }
 
@@ -3057,7 +3073,7 @@ int whisper_decode_with_state(struct whisper_context * ctx, struct whisper_state
     const int selected_decoder_id = 0;
 
     if (!whisper_decode_internal(*ctx, *state, state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
-        fprintf(stderr, "%s: failed to eval\n", __func__);
+        log("%s: failed to eval\n", __func__);
         return 1;
     }
 
@@ -3069,13 +3085,13 @@ int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, i
     const int selected_decoder_id = 0;
 
     if (ctx->state == nullptr) {
-        fprintf(stderr, "%s: ERROR state was not loaded.\n", __func__);
+        log("%s: ERROR state was not loaded.\n", __func__);
         return false;
     }
 
 
     if (!whisper_decode_internal(*ctx, *ctx->state, ctx->state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
-        fprintf(stderr, "%s: failed to eval\n", __func__);
+        log("%s: failed to eval\n", __func__);
         return 1;
     }
 
@@ -3086,7 +3102,7 @@ int whisper_tokenize(struct whisper_context * ctx, const char * text, whisper_to
     const auto res = tokenize(ctx->vocab, text);
 
     if (n_max_tokens < (int) res.size()) {
-        fprintf(stderr, "%s: too many resulting tokens: %d (max %d)\n", __func__, (int) res.size(), n_max_tokens);
+        log("%s: too many resulting tokens: %d (max %d)\n", __func__, (int) res.size(), n_max_tokens);
         return -1;
     }
 
@@ -3114,7 +3130,7 @@ int whisper_lang_id(const char * lang) {
             }
         }
 
-        fprintf(stderr, "%s: unknown language '%s'\n", __func__, lang);
+        log("%s: unknown language '%s'\n", __func__, lang);
         return -1;
     }
     return g_lang.at(lang).first;
@@ -3127,7 +3143,7 @@ const char * whisper_lang_str(int id) {
         }
     }
 
-    fprintf(stderr, "%s: unknown language id %d\n", __func__, id);
+    log("%s: unknown language id %d\n", __func__, id);
     return nullptr;
 }
 
@@ -3140,25 +3156,25 @@ int whisper_lang_auto_detect_with_state(
     const int seek = offset_ms/10;
 
     if (seek < 0) {
-        fprintf(stderr, "%s: offset %dms is before the start of the audio\n", __func__, offset_ms);
+        log("%s: offset %dms is before the start of the audio\n", __func__, offset_ms);
         return -1;
    }
 
     if (seek >= state->mel.n_len_org) {
-        fprintf(stderr, "%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len_org*10);
+        log("%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len_org*10);
         return -2;
     }
 
     // run the encoder
     if (whisper_encode_with_state(ctx, state, seek, n_threads) != 0) {
-        fprintf(stderr, "%s: failed to encode\n", __func__);
+        log("%s: failed to encode\n", __func__);
         return -6;
     }
 
     const std::vector<whisper_token> prompt = { whisper_token_sot(ctx) };
 
     if (whisper_decode_with_state(ctx, state, prompt.data(), prompt.size(), 0, n_threads) != 0) {
-        fprintf(stderr, "%s: failed to decode\n", __func__);
+        log("%s: failed to decode\n", __func__);
         return -7;
     }
 
@@ -3359,21 +3375,21 @@ whisper_token whisper_token_transcribe(struct whisper_context * ctx) {
 void whisper_print_timings(struct whisper_context * ctx) {
     const int64_t t_end_us = ggml_time_us();
 
-    fprintf(stderr, "\n");
-    fprintf(stderr, "%s:     load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
+    log("\n");
+    log("%s:     load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
     if (ctx->state != nullptr) {
 
         const int32_t n_sample = std::max(1, ctx->state->n_sample);
         const int32_t n_encode = std::max(1, ctx->state->n_encode);
         const int32_t n_decode = std::max(1, ctx->state->n_decode);
 
-        fprintf(stderr, "%s:     fallbacks = %3d p / %3d h\n", __func__, ctx->state->n_fail_p, ctx->state->n_fail_h);
-        fprintf(stderr, "%s:      mel time = %8.2f ms\n", __func__, ctx->state->t_mel_us / 1000.0f);
-        fprintf(stderr, "%s:   sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_sample_us, n_sample, 1e-3f * ctx->state->t_sample_us / n_sample);
-        fprintf(stderr, "%s:   encode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_encode_us, n_encode, 1e-3f * ctx->state->t_encode_us / n_encode);
-        fprintf(stderr, "%s:   decode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_decode_us, n_decode, 1e-3f * ctx->state->t_decode_us / n_decode);
+        log("%s:     fallbacks = %3d p / %3d h\n", __func__, ctx->state->n_fail_p, ctx->state->n_fail_h);
+        log("%s:      mel time = %8.2f ms\n", __func__, ctx->state->t_mel_us / 1000.0f);
+        log("%s:   sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_sample_us, n_sample, 1e-3f * ctx->state->t_sample_us / n_sample);
+        log("%s:   encode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_encode_us, n_encode, 1e-3f * ctx->state->t_encode_us / n_encode);
+        log("%s:   decode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_decode_us, n_decode, 1e-3f * ctx->state->t_decode_us / n_decode);
     }
-    fprintf(stderr, "%s:    total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
+    log("%s:    total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
 }
 
 void whisper_reset_timings(struct whisper_context * ctx) {
@@ -3697,7 +3713,7 @@ static void whisper_process_logits(
     const bool last_was_timestamp        = tokens_cur.size() > 0 && tokens_cur.back().id >= vocab.token_beg;
     const bool penultimate_was_timestamp = tokens_cur.size() < 2 || tokens_cur[tokens_cur.size() - 2].id >= vocab.token_beg;
 
-    //fprintf(stderr, "last_was_timestamp=%d penultimate_was_timestamp=%d\n", last_was_timestamp, penultimate_was_timestamp);
+    //log("last_was_timestamp=%d penultimate_was_timestamp=%d\n", last_was_timestamp, penultimate_was_timestamp);
 
     if (last_was_timestamp) {
         if (penultimate_was_timestamp) {
@@ -3773,7 +3789,7 @@ static void whisper_process_logits(
 
     const float max_text_token_logprob = *std::max_element(logprobs.begin(), logprobs.begin() + vocab.token_beg);
 
-    //fprintf(stderr, "timestamp_logprob=%f max_text_token_logprob=%f\n", timestamp_logprob, max_text_token_logprob);
+    //log("timestamp_logprob=%f max_text_token_logprob=%f\n", timestamp_logprob, max_text_token_logprob);
 
     if (timestamp_logprob > max_text_token_logprob) {
         for (int i = 0; i < vocab.token_beg; ++i) {
@@ -4022,12 +4038,12 @@ int whisper_full_with_state(
     // compute log mel spectrogram
     if (params.speed_up) {
         if (whisper_pcm_to_mel_phase_vocoder_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
-            fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
+            log("%s: failed to compute log mel spectrogram\n", __func__);
             return -1;
        }
     } else {
         if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
-            fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
+            log("%s: failed to compute log mel spectrogram\n", __func__);
             return -2;
         }
     }
@@ -4038,13 +4054,13 @@ int whisper_full_with_state(
 
         const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data());
         if (lang_id < 0) {
-            fprintf(stderr, "%s: failed to auto-detect language\n", __func__);
+            log("%s: failed to auto-detect language\n", __func__);
            return -3;
         }
         state->lang_id = lang_id;
         params.language = whisper_lang_str(lang_id);
 
-        fprintf(stderr, "%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
+        log("%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
         if (params.detect_language) {
             return 0;
         }
@@ -4101,7 +4117,7 @@ int whisper_full_with_state(
         if (decoder.kv_self.ctx == nullptr) {
             decoder.kv_self = state->decoders[0].kv_self;
             if (!kv_cache_reinit(decoder.kv_self)) {
-                fprintf(stderr, "%s: kv_cache_reinit() failed for self-attention, decoder %d\n", __func__, j);
+                log("%s: kv_cache_reinit() failed for self-attention, decoder %d\n", __func__, j);
                 return -4;
             }
 
@@ -4145,7 +4161,7 @@ int whisper_full_with_state(
 
     // overwrite audio_ctx, max allowed is hparams.n_audio_ctx
     if (params.audio_ctx > whisper_n_audio_ctx(ctx)) {
-        fprintf(stderr, "%s: audio_ctx is larger than the maximum allowed (%d > %d)\n", __func__, params.audio_ctx, whisper_n_audio_ctx(ctx));
+        log("%s: audio_ctx is larger than the maximum allowed (%d > %d)\n", __func__, params.audio_ctx, whisper_n_audio_ctx(ctx));
         return -5;
     }
     state->exp_n_audio_ctx = params.audio_ctx;
@@ -4189,8 +4205,9 @@ int whisper_full_with_state(
 
     // main loop
     while (true) {
-        const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
         if (params.progress_callback) {
+            const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
+
             params.progress_callback(
                 ctx, ctx->state, progress_cur, params.progress_callback_user_data);
         }
@@ -4202,14 +4219,14 @@ int whisper_full_with_state(
 
         if (params.encoder_begin_callback) {
             if (params.encoder_begin_callback(ctx, state, params.encoder_begin_callback_user_data) == false) {
-                fprintf(stderr, "%s: encoder_begin_callback returned false - aborting\n", __func__);
+                log("%s: encoder_begin_callback returned false - aborting\n", __func__);
                 break;
             }
         }
 
         // encode audio features starting at offset seek
         if (!whisper_encode_internal(*ctx, *state, seek, params.n_threads)) {
-            fprintf(stderr, "%s: failed to encode\n", __func__);
+            log("%s: failed to encode\n", __func__);
             return -6;
         }
 
@@ -4292,7 +4309,7 @@ int whisper_full_with_state(
         WHISPER_PRINT_DEBUG("\n\n");
 
         if (!whisper_decode_internal(*ctx, *state, state->decoders[0], prompt.data(), prompt.size(), 0, params.n_threads)) {
-            fprintf(stderr, "%s: failed to decode\n", __func__);
+            log("%s: failed to decode\n", __func__);
             return -7;
         }
 
@@ -4530,7 +4547,7 @@ int whisper_full_with_state(
                     //WHISPER_PRINT_DEBUG("%s: decoder %d: token %d, kv_self.n %d, seek_delta %d\n", __func__, j, decoder.tokens_tmp[0], decoder.kv_self.n, decoder.seek_delta);
 
                     if (!whisper_decode_internal(*ctx, *state, decoder, decoder.tokens_tmp.data(), decoder.tokens_tmp.size(), decoder.kv_self.n, params.n_threads)) {
-                        fprintf(stderr, "%s: failed to decode\n", __func__);
+                        log("%s: failed to decode\n", __func__);
                         return -8;
                     }
 
@@ -4852,12 +4869,12 @@ int whisper_full_parallel(
     ctx->state->t_decode_us /= n_processors;
 
     // print information about the audio boundaries
-    fprintf(stderr, "\n");
-    fprintf(stderr, "%s: the audio has been split into %d chunks at the following times:\n", __func__, n_processors);
+    log("\n");
+    log("%s: the audio has been split into %d chunks at the following times:\n", __func__, n_processors);
     for (int i = 0; i < n_processors - 1; ++i) {
-        fprintf(stderr, "%s: split %d - %s\n", __func__, (i + 1), to_timestamp(100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t).c_str());
+        log("%s: split %d - %s\n", __func__, (i + 1), to_timestamp(100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t).c_str());
     }
-    fprintf(stderr, "%s: the transcription quality may be degraded near these boundaries\n", __func__);
+    log("%s: the transcription quality may be degraded near these boundaries\n", __func__);
 
     return ret;
 }
@@ -5217,7 +5234,7 @@ static void whisper_exp_compute_token_level_timestamps(
     const int n_samples = state.energy.size();
 
     if (n_samples == 0) {
-        fprintf(stderr, "%s: no signal data available\n", __func__);
+        log("%s: no signal data available\n", __func__);
         return;
     }
 
@@ -5437,3 +5454,7 @@ static void whisper_exp_compute_token_level_timestamps(
 //        }
 //    }
 }
+
+void whisper_set_log_callback(whisper_log_callback callback) {
+    whisper_log = callback;
+}
whisper.h
CHANGED
@@ -517,6 +517,11 @@ extern "C" {
     WHISPER_API int whisper_bench_ggml_mul_mat     (int n_threads);
     WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
 
+    // Control logging output; default behavior is to print to stderr
+
+    typedef void (*whisper_log_callback)(const char * line);
+    WHISPER_API void whisper_set_log_callback(whisper_log_callback callback);
+
 #ifdef __cplusplus
 }
 #endif
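
A usage note on the header API above: the internal log() helper formats each message into a fixed 1024-byte buffer before invoking the callback, so each invocation receives one complete (possibly truncated) formatted line. A sketch of capturing output in a test harness (the helper below is hypothetical, not part of the PR):

    #include "whisper.h"

    #include <string>

    // Hypothetical test helper: accumulate everything the library logs.
    static std::string g_captured;

    static void capture_log(const char * line) {
        g_captured += line;
    }

    void run_with_captured_logs() {
        whisper_set_log_callback(capture_log);
        // ... exercise whisper_* calls here, then assert on g_captured ...
        whisper_set_log_callback(nullptr); // or silence logging entirely
    }

Note that the default printer (whisper_default_log) is internal to whisper.cpp, so a client that wants to restore stderr output after overriding it has to install its own fprintf-based callback.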