From 6f5c781e6fe2a2c3f9496fd5756e08e0fb739841 Mon Sep 17 00:00:00 2001 From: Sacha Arbonel Date: Mon, 14 Apr 2025 10:19:55 +0200 Subject: [PATCH 1/3] feat: expose language detection probabilities to server.cpp --- examples/server/server.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f709225bd81..965ba242c9a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -919,6 +919,11 @@ int main(int argc, char ** argv) { } else if (params.response_format == vjson_format) { /* try to match openai/whisper's Python format */ std::string results = output_str(ctx, params, pcmf32s); + + // Get language probabilities + std::vector lang_probs(whisper_lang_max_id() + 1, 0.0f); + const auto detected_lang_id = whisper_lang_auto_detect(ctx, 0, params.n_threads, lang_probs.data()); + json jres = json{ {"task", params.translate ? "translate" : "transcribe"}, {"language", whisper_lang_str_full(whisper_full_lang_id(ctx))}, @@ -926,6 +931,22 @@ int main(int argc, char ** argv) { {"text", results}, {"segments", json::array()} }; + + // Always include language detection info + json lang_info = json::object(); + // Include the probability of the detected language + lang_info["probability"] = lang_probs[detected_lang_id]; + + // Add all language probabilities + json all_lang_probs = json::object(); + for (int i = 0; i <= whisper_lang_max_id(); ++i) { + if (lang_probs[i] > 0.001f) { // Only include non-negligible probabilities + all_lang_probs[whisper_lang_str(i)] = lang_probs[i]; + } + } + lang_info["language_probabilities"] = all_lang_probs; + jres["language_detection"] = lang_info; + const int n_segments = whisper_full_n_segments(ctx); for (int i = 0; i < n_segments; ++i) { From 46021af74b383ce1af1fd22323b4bfdcd4270e0d Mon Sep 17 00:00:00 2001 From: Sacha Arbonel Date: Mon, 28 Apr 2025 12:35:47 +0200 Subject: [PATCH 2/3] feat: enhance language detection output in server.cpp --- examples/server/server.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 965ba242c9a..665cfc4e00a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -929,23 +929,18 @@ int main(int argc, char ** argv) { {"language", whisper_lang_str_full(whisper_full_lang_id(ctx))}, {"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE}, {"text", results}, - {"segments", json::array()} + {"segments", json::array()}, + {"detected_language", whisper_lang_str_full(detected_lang_id)}, + {"detected_language_probability", lang_probs[detected_lang_id]}, + {"language_probabilities", json::object()} }; - // Always include language detection info - json lang_info = json::object(); - // Include the probability of the detected language - lang_info["probability"] = lang_probs[detected_lang_id]; - // Add all language probabilities - json all_lang_probs = json::object(); for (int i = 0; i <= whisper_lang_max_id(); ++i) { if (lang_probs[i] > 0.001f) { // Only include non-negligible probabilities - all_lang_probs[whisper_lang_str(i)] = lang_probs[i]; + jres["language_probabilities"][whisper_lang_str(i)] = lang_probs[i]; } } - lang_info["language_probabilities"] = all_lang_probs; - jres["language_detection"] = lang_info; const int n_segments = whisper_full_n_segments(ctx); for (int i = 0; i < n_segments; ++i) From c59b8b53ec35ab5ccb2a630bc8f5822502871afe Mon Sep 17 00:00:00 2001 From: Sacha Arbonel Date: Mon, 28 Apr 2025 12:39:25 +0200 Subject: [PATCH 3/3] Remove empty spaces. --- examples/server/server.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 665cfc4e00a..3a629ac9f99 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -918,12 +918,10 @@ int main(int argc, char ** argv) { res.set_content(ss.str(), "text/vtt"); } else if (params.response_format == vjson_format) { /* try to match openai/whisper's Python format */ - std::string results = output_str(ctx, params, pcmf32s); - + std::string results = output_str(ctx, params, pcmf32s); // Get language probabilities std::vector lang_probs(whisper_lang_max_id() + 1, 0.0f); const auto detected_lang_id = whisper_lang_auto_detect(ctx, 0, params.n_threads, lang_probs.data()); - json jres = json{ {"task", params.translate ? "translate" : "transcribe"}, {"language", whisper_lang_str_full(whisper_full_lang_id(ctx))}, @@ -934,14 +932,12 @@ int main(int argc, char ** argv) { {"detected_language_probability", lang_probs[detected_lang_id]}, {"language_probabilities", json::object()} }; - // Add all language probabilities for (int i = 0; i <= whisper_lang_max_id(); ++i) { if (lang_probs[i] > 0.001f) { // Only include non-negligible probabilities jres["language_probabilities"][whisper_lang_str(i)] = lang_probs[i]; } } - const int n_segments = whisper_full_n_segments(ctx); for (int i = 0; i < n_segments; ++i) {