Bump whisper.cpp. Simple settings mode (#60)

* bump whispercpp, simple settings mode * lint
locaal-ai · Dec 21, 2023 · b45b235 · b45b235
1 parent 8c02e0c
commit b45b235
Show file tree

Hide file tree

Showing 7 changed files with 82 additions and 44 deletions.
diff --git a/buildspec.json b/buildspec.json
@@ -45,9 +45,9 @@
         }
     },
     "name": "obs-localvocal",
-    "version": "0.0.7",
+    "version": "0.0.8",
     "author": "Roy Shilkrot",
-    "website": "https://github.com/obs-ai/obs-localvocal",
+    "website": "https://github.com/occ-ai/obs-localvocal",
     "email": "[email protected]",
     "uuids": {
         "macosPackage": "CB66E5DF-FF45-4BEA-B38B-7AD3705860C9",

diff --git a/cmake/BuildWhispercpp.cmake b/cmake/BuildWhispercpp.cmake
@@ -2,7 +2,7 @@ include(ExternalProject)
 
 set(CMAKE_OSX_ARCHITECTURES_ "arm64$<SEMICOLON>x86_64")
 
-set(Whispercpp_Build_GIT_TAG "ec7a6f04f9c32adec2e6b0995b8c728c5bf56f35")
+set(Whispercpp_Build_GIT_TAG "8986690c2a7b81b2b5d79cdc186b5aa672311740")
 
 if(${CMAKE_BUILD_TYPE} STREQUAL Release OR ${CMAKE_BUILD_TYPE} STREQUAL RelWithDebInfo)
   set(Whispercpp_BUILD_TYPE Release)

diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini
@@ -1,18 +1,18 @@
 LocalVocalPlugin="LocalVocal Plugin"
 transcription_filterAudioFilter="LocalVocal Transcription"
 vad_enabled="VAD Enabled"
-log_level="Log Level"
-log_words="Log Output Words"
+log_level="Internal Log Level"
+log_words="Log Output to Console"
 caption_to_stream="Stream Captions"
-step_by_step_processing="Step-by-step processing (⚠️ processing will increase)"
+step_by_step_processing="Step-by-step processing (⚠️ increased processing)"
 step_size_msec="Step size (ms)"
 subtitle_sources="Subtitles Output"
 none_no_output="None / No output"
 text_file_output="Text File output"
 output_filename="Output filename"
 whisper_model="Whisper Model"
 external_model_file="External model file"
-whisper_parameters="Whisper Parameters"
+whisper_parameters="Advanced Settings"
 language="Language"
 whisper_sampling_method="Whisper Sampling Method"
 n_threads="Number of threads"
@@ -41,4 +41,5 @@ save_srt="Save in SRT format (no file truncation)"
 only_while_recording="Write output only while recording"
 process_while_muted="Process speech while source is muted"
 rename_file_to_match_recording="Rename file to match recording"
-min_sub_duration="Minimal subtitle duration (msec)"
+min_sub_duration="Min. sub duration (ms)"
+advanced_settings="Advanced Settings"
diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
@@ -60,11 +60,11 @@ struct transcription_filter_data {
 	struct circlebuf input_buffers[MAX_PREPROC_CHANNELS];
 
 	/* Resampler */
-	audio_resampler_t *resampler = nullptr;
+	audio_resampler_t *resampler;
 
 	/* whisper */
-	char *whisper_model_path = nullptr;
-	struct whisper_context *whisper_context = nullptr;
+	char *whisper_model_path;
+	struct whisper_context *whisper_context;
 	whisper_full_params whisper_params;
 
 	float filler_p_threshold;
@@ -81,21 +81,41 @@ struct transcription_filter_data {
 	bool rename_file_to_match_recording = false;
 
 	// Text source to output the subtitles
-	obs_weak_source_t *text_source = nullptr;
-	char *text_source_name = nullptr;
-	std::mutex *text_source_mutex = nullptr;
+	obs_weak_source_t *text_source;
+	char *text_source_name;
+	std::mutex *text_source_mutex;
 	// Callback to set the text in the output text source (subtitles)
 	std::function<void(const DetectionResultWithText &result)> setTextCallback;
 	// Output file path to write the subtitles
-	std::string output_file_path = "";
-	std::string whisper_model_file_currently_loaded = "";
+	std::string output_file_path;
+	std::string whisper_model_file_currently_loaded;
 
 	// Use std for thread and mutex
 	std::thread whisper_thread;
 
-	std::mutex *whisper_buf_mutex = nullptr;
-	std::mutex *whisper_ctx_mutex = nullptr;
-	std::condition_variable *wshiper_thread_cv = nullptr;
+	std::mutex *whisper_buf_mutex;
+	std::mutex *whisper_ctx_mutex;
+	std::condition_variable *wshiper_thread_cv;
+
+	// Default ctor: null every raw pointer member so cleanup code can test them safely
+	transcription_filter_data()
+	{
+		// members not assigned here stay indeterminate, so every raw pointer is listed
+		for (size_t i = 0; i < MAX_PREPROC_CHANNELS; i++) {
+			copy_buffers[i] = nullptr;
+		}
+		context = nullptr;
+		resampler = nullptr;
+		whisper_model_path = nullptr;
+		whisper_context = nullptr;
+		text_source = nullptr;
+		text_source_name = nullptr; // destroy bfree()s this when it is non-null
+		text_source_mutex = nullptr;
+		whisper_buf_mutex = nullptr;
+		whisper_ctx_mutex = nullptr;
+		wshiper_thread_cv = nullptr;
+		// std::string members default-construct empty, so they need no assignment
+	}
 };
 
 // Audio packet info

diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
@@ -18,6 +18,8 @@
 #include <Windows.h>
 #endif
 
+#include <QString>
+
 inline enum speaker_layout convert_speaker_layout(uint8_t channels)
 {
 	switch (channels) {
@@ -125,19 +127,7 @@ void transcription_filter_destroy(void *data)
 		static_cast<struct transcription_filter_data *>(data);
 
 	obs_log(gf->log_level, "transcription_filter_destroy");
-	{
-		std::lock_guard<std::mutex> lock(*gf->whisper_ctx_mutex);
-		if (gf->whisper_context != nullptr) {
-			whisper_free(gf->whisper_context);
-			gf->whisper_context = nullptr;
-			gf->wshiper_thread_cv->notify_all();
-		}
-	}
-
-	// join the thread
-	if (gf->whisper_thread.joinable()) {
-		gf->whisper_thread.join();
-	}
+	shutdown_whisper_thread(gf);
 
 	if (gf->text_source_name) {
 		bfree(gf->text_source_name);
@@ -448,14 +438,14 @@ void transcription_filter_update(void *data, obs_data_t *s)
 		obs_weak_source_release(old_weak_text_source);
 	}
 
-	obs_log(gf->log_level, "transcription_filter: update whisper model");
-	update_whsiper_model_path(gf, s);
-
-	if (!gf->whisper_ctx_mutex) {
+	if (gf->whisper_ctx_mutex == nullptr) {
 		obs_log(LOG_ERROR, "whisper_ctx_mutex is null");
 		return;
 	}
 
+	obs_log(gf->log_level, "transcription_filter: update whisper model");
+	update_whsiper_model_path(gf, s);
+
 	obs_log(gf->log_level, "transcription_filter: update whisper params");
 	std::lock_guard<std::mutex> lock(*gf->whisper_ctx_mutex);
 
@@ -492,7 +482,7 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 {
 	obs_log(LOG_INFO, "transcription filter create");
 
-	struct transcription_filter_data *gf = new transcription_filter_data;
+	struct transcription_filter_data *gf = new transcription_filter_data();
 
 	// Get the number of channels for the input source
 	gf->channels = audio_output_get_channels(obs_get_audio());
@@ -648,6 +638,7 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_bool(s, "rename_file_to_match_recording", true);
 	obs_data_set_default_int(s, "step_size_msec", 1000);
 	obs_data_set_default_int(s, "min_sub_duration", 3000);
+	obs_data_set_default_bool(s, "advanced_settings", false);
 
 	// Whisper parameters
 	obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH);
@@ -684,12 +675,6 @@ obs_properties_t *transcription_filter_properties(void *data)
 
 	obs_properties_t *ppts = obs_properties_create();
 
-	obs_properties_add_bool(ppts, "vad_enabled", MT_("vad_enabled"));
-	obs_property_t *list = obs_properties_add_list(ppts, "log_level", MT_("log_level"),
-						       OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT);
-	obs_property_list_add_int(list, "DEBUG", LOG_DEBUG);
-	obs_property_list_add_int(list, "INFO", LOG_INFO);
-	obs_property_list_add_int(list, "WARNING", LOG_WARNING);
 	obs_properties_add_bool(ppts, "log_words", MT_("log_words"));
 	obs_properties_add_bool(ppts, "caption_to_stream", MT_("caption_to_stream"));
 	obs_property_t *step_by_step_processing = obs_properties_add_bool(
@@ -799,10 +784,31 @@ obs_properties_t *transcription_filter_properties(void *data)
 		return true;
 	});
 
+	obs_property_t *advanced_settings_prop =
+		obs_properties_add_bool(ppts, "advanced_settings", MT_("advanced_settings"));
+	obs_property_set_modified_callback(advanced_settings_prop, [](obs_properties_t *props,
+								      obs_property_t *property,
+								      obs_data_t *settings) {
+		UNUSED_PARAMETER(property);
+		// If advanced settings is enabled, show the advanced settings group
+		const bool show_hide = obs_data_get_bool(settings, "advanced_settings");
+		obs_property_set_visible(obs_properties_get(props, "whisper_params_group"),
+					 show_hide);
+		return true;
+	});
+
 	obs_properties_t *whisper_params_group = obs_properties_create();
 	obs_properties_add_group(ppts, "whisper_params_group", MT_("whisper_parameters"),
 				 OBS_GROUP_NORMAL, whisper_params_group);
 
+	obs_properties_add_bool(whisper_params_group, "vad_enabled", MT_("vad_enabled"));
+	obs_property_t *list = obs_properties_add_list(whisper_params_group, "log_level",
+						       MT_("log_level"), OBS_COMBO_TYPE_LIST,
+						       OBS_COMBO_FORMAT_INT);
+	obs_property_list_add_int(list, "DEBUG", LOG_DEBUG);
+	obs_property_list_add_int(list, "INFO", LOG_INFO);
+	obs_property_list_add_int(list, "WARNING", LOG_WARNING);
+
 	// Add language selector
 	obs_property_t *whisper_language_select_list = obs_properties_add_list(
 		whisper_params_group, "whisper_language_select", MT_("language"),
@@ -885,6 +891,12 @@ obs_properties_t *transcription_filter_properties(void *data)
 	obs_properties_add_float_slider(whisper_params_group, "length_penalty",
 					MT_("length_penalty"), -1.0f, 1.0f, 0.1f);
 
+	// Add an informative text about the plugin
+	obs_properties_add_text(
+		ppts, "info",
+		QString(PLUGIN_INFO_TEMPLATE).arg(PLUGIN_VERSION).toStdString().c_str(),
+		OBS_TEXT_INFO);
+
 	UNUSED_PARAMETER(data);
 	return ppts;
 }
diff --git a/src/transcription-filter.h b/src/transcription-filter.h
@@ -14,6 +14,11 @@ void transcription_filter_deactivate(void *data);
 void transcription_filter_defaults(obs_data_t *s);
 obs_properties_t *transcription_filter_properties(void *data);
 
+const char *const PLUGIN_INFO_TEMPLATE =
+	"<a href=\"https://github.com/occ-ai/obs-localvocal/\">LocalVocal</a> (%1) by "
+	"<a href=\"https://github.com/occ-ai\">OCC AI</a> ❤️ "
+	"<a href=\"https://www.patreon.com/RoyShilkrot\">Support & Follow</a>";
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/whisper-utils/whisper-utils.cpp b/src/whisper-utils/whisper-utils.cpp
@@ -94,7 +94,7 @@ void shutdown_whisper_thread(struct transcription_filter_data *gf)
 void start_whisper_thread_with_path(struct transcription_filter_data *gf, const std::string &path)
 {
 	obs_log(gf->log_level, "start_whisper_thread_with_path: %s", path.c_str());
-	if (!gf->whisper_ctx_mutex) {
+	if (gf->whisper_ctx_mutex == nullptr) {
 		obs_log(LOG_ERROR, "cannot init whisper: whisper_ctx_mutex is null");
 		return;
 	}