refactor: Update timestamp variable name in transcription-filter-data.h (#109)
locaal-ai · Jun 11, 2024 · 91c2842 · 91c2842
1 parent 845c1a8
commit 91c2842
Show file tree

Hide file tree

Showing 6 changed files with 61 additions and 45 deletions.
diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp
@@ -465,9 +465,10 @@ int wmain(int argc, wchar_t *argv[])
 				struct transcription_filter_audio_info info = {0};
 				info.frames = frames; // number of frames in this packet
 				// make a timestamp from the current position in the audio buffer
-				info.timestamp = start_time + (int64_t)(((float)frames_count /
-									 (float)gf->sample_rate) *
-									1e9);
+				info.timestamp_offset_ns =
+					start_time +
+					(int64_t)(((float)frames_count / (float)gf->sample_rate) *
+						  1e9);
 				circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
 			}
 			frames_count += frames;
@@ -489,7 +490,7 @@ int wmain(int argc, wchar_t *argv[])
 		struct transcription_filter_audio_info info = {0};
 		info.frames = frames; // number of frames in this packet
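 		// make a timestamp from the current frame count. NOTE(review): frames * 1000 / sample_rate looks like milliseconds, but the field is named *_ns (the branch above scales by 1e9) - confirm units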
-		info.timestamp = frames_count * 1000 / gf->sample_rate;
+		info.timestamp_offset_ns = frames_count * 1000 / gf->sample_rate;
 		circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
 	}
 

diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
@@ -127,7 +127,7 @@ struct transcription_filter_data {
 // Audio packet info
 struct transcription_filter_audio_info {
 	uint32_t frames; // number of frames in this packet
-	uint64_t timestamp; // absolute (since epoch) timestamp in ns
+	uint64_t timestamp_offset_ns; // timestamp in ns, as an offset from the start of processing (not since epoch)
 };
 
 // Callback sent when the transcription has a new result

diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
@@ -102,8 +102,14 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
 		}
 		// push audio packet info (timestamp/frame count) to info circlebuf
 		struct transcription_filter_audio_info info = {0};
-		info.frames = audio->frames;       // number of frames in this packet
-		info.timestamp = audio->timestamp; // timestamp of this packet
+		info.frames = audio->frames; // number of frames in this packet
+		// check if the timestamp is a false "negative" value for uint64_t
+		if (audio->timestamp > (std::numeric_limits<uint64_t>::max() - 100000000)) {
+			// no usable timestamp: fall back to a zero offset
+			info.timestamp_offset_ns = 0;
+		} else {
+			info.timestamp_offset_ns = audio->timestamp; // timestamp of this packet
+		}
 		circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
 	}
 

diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp
@@ -22,24 +22,10 @@
 
 struct vad_state { // VAD state passed into and returned from vad_based_segmentation
 	bool vad_on; // set to true when a speech segment has not reached its end yet, false once it is sent to inference
-	uint64_t start_timestamp;
-	uint64_t end_timestamp;
+	uint64_t start_ts_offest_ms; // segment start as an offset in ms; NOTE: "offest" is a misspelling of "offset" that all users of this field share
+	uint64_t end_ts_offset_ms; // segment end as an offset in ms; reset to 0 after a segment end is handled
 };
 
-// Taken from https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp
-std::string to_timestamp(uint64_t t)
-{
-	uint64_t sec = t / 1000;
-	uint64_t msec = t - sec * 1000;
-	uint64_t min = sec / 60;
-	sec = sec - min * 60;
-
-	char buf[32];
-	snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int)min, (int)sec, (int)msec);
-
-	return std::string(buf);
-}
-
 struct whisper_context *init_whisper_context(const std::string &model_path_in,
 					     struct transcription_filter_data *gf)
 {
@@ -314,8 +300,8 @@ void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_o
 vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state)
 {
 	uint32_t num_frames_from_infos = 0;
-	uint64_t start_timestamp = 0;
-	uint64_t end_timestamp = 0;
+	uint64_t start_timestamp_offset_ns = 0;
+	uint64_t end_timestamp_offset_ns = 0;
 	size_t overlap_size = 0;
 
 	for (size_t c = 0; c < gf->channels; c++) {
@@ -342,8 +328,8 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
 		while (gf->info_buffer.size >= size_of_audio_info) {
 			circlebuf_pop_front(&gf->info_buffer, &info_from_buf, size_of_audio_info);
 			num_frames_from_infos += info_from_buf.frames;
-			if (start_timestamp == 0) {
-				start_timestamp = info_from_buf.timestamp;
+			if (start_timestamp_offset_ns == 0) {
+				start_timestamp_offset_ns = info_from_buf.timestamp_offset_ns;
 			}
 			// Check if we're within the needed segment length
 			if (num_frames_from_infos > max_num_frames) {
@@ -354,7 +340,7 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
 				break;
 			}
 		}
-		end_timestamp = info_from_buf.timestamp;
+		end_timestamp_offset_ns = info_from_buf.timestamp_offset_ns;
 
 		/* Pop from input circlebuf */
 		for (size_t c = 0; c < gf->channels; c++) {
@@ -386,28 +372,29 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
 				     resampled_16khz[0] + resampled_16khz_frames);
 	gf->vad->process(vad_input, false);
 
-	const uint64_t start_offset_ms = start_timestamp / 1000000 - gf->start_timestamp_ms;
-	const uint64_t end_offset_ms = end_timestamp / 1000000 - gf->start_timestamp_ms;
+	const uint64_t start_ts_offset_ms = start_timestamp_offset_ns / 1000000;
+	const uint64_t end_ts_offset_ms = end_timestamp_offset_ns / 1000000;
 
-	vad_state current_vad_state = {false, start_offset_ms, end_offset_ms};
+	vad_state current_vad_state = {false, start_ts_offset_ms, end_ts_offset_ms};
 
 	std::vector<timestamp_t> stamps = gf->vad->get_speech_timestamps();
 	if (stamps.size() == 0) {
 		obs_log(gf->log_level, "VAD detected no speech in %d frames",
 			resampled_16khz_frames);
 		if (last_vad_state.vad_on) {
 			obs_log(gf->log_level, "Last VAD was ON: segment end -> send to inference");
-			run_inference_and_callbacks(gf, last_vad_state.start_timestamp,
-						    last_vad_state.end_timestamp, VAD_STATE_WAS_ON);
+			run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms,
+						    last_vad_state.end_ts_offset_ms,
+						    VAD_STATE_WAS_ON);
 		}
 
 		if (gf->enable_audio_chunks_callback) {
 			audio_chunk_callback(gf, resampled_16khz[0], resampled_16khz_frames,
 					     VAD_STATE_IS_OFF,
 					     {DETECTION_RESULT_SILENCE,
 					      "[silence]",
-					      current_vad_state.start_timestamp,
-					      current_vad_state.end_timestamp,
+					      current_vad_state.start_ts_offest_ms,
+					      current_vad_state.end_ts_offset_ms,
 					      {}});
 		}
 	} else {
@@ -447,29 +434,30 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
 				obs_log(gf->log_level, "VAD segment end -> send to inference");
 				// find the end timestamp of the segment
 				const uint64_t segment_end_ts =
-					start_offset_ms + end_frame * 1000 / WHISPER_SAMPLE_RATE;
-				run_inference_and_callbacks(gf, last_vad_state.start_timestamp,
+					start_ts_offset_ms + end_frame * 1000 / WHISPER_SAMPLE_RATE;
+				run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms,
 							    segment_end_ts,
 							    last_vad_state.vad_on
 								    ? VAD_STATE_WAS_ON
 								    : VAD_STATE_WAS_OFF);
 				current_vad_state.vad_on = false;
-				current_vad_state.start_timestamp = current_vad_state.end_timestamp;
-				current_vad_state.end_timestamp = 0;
+				current_vad_state.start_ts_offest_ms =
+					current_vad_state.end_ts_offset_ms;
+				current_vad_state.end_ts_offset_ms = 0;
 			} else {
 				current_vad_state.vad_on = true;
 				if (last_vad_state.vad_on) {
-					current_vad_state.start_timestamp =
-						last_vad_state.start_timestamp;
+					current_vad_state.start_ts_offest_ms =
+						last_vad_state.start_ts_offest_ms;
 				} else {
-					current_vad_state.start_timestamp =
-						start_offset_ms +
+					current_vad_state.start_ts_offest_ms =
+						start_ts_offset_ms +
 						start_frame * 1000 / WHISPER_SAMPLE_RATE;
 				}
 				obs_log(gf->log_level,
 					"end not reached. vad state: start ts: %llu, end ts: %llu",
-					current_vad_state.start_timestamp,
-					current_vad_state.end_timestamp);
+					current_vad_state.start_ts_offest_ms,
+					current_vad_state.end_ts_offset_ms);
 			}
 			last_vad_state = current_vad_state;
 		}

diff --git a/src/whisper-utils/whisper-utils.cpp b/src/whisper-utils/whisper-utils.cpp
@@ -152,3 +152,16 @@ std::vector<whisper_token_data> reconstructSentence(const std::vector<whisper_to
 
 	return reconstructed;
 }
+
+std::string to_timestamp(uint64_t t_ms_offset) // formats a millisecond offset as "MM:SS.mmm"
+{
+	uint64_t sec = t_ms_offset / 1000; // whole seconds in the offset
+	uint64_t msec = t_ms_offset - sec * 1000; // leftover milliseconds (0..999)
+	uint64_t min = sec / 60; // whole minutes; not wrapped at 60, so long offsets print more than two digits
+	sec = sec - min * 60; // seconds within the current minute (0..59)
+
+	char buf[32]; // snprintf truncates to this size if the minutes field grows very large
+	snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int)min, (int)sec, (int)msec); // values are narrowed to int to match the %d conversions
+
+	return std::string(buf);
+}
diff --git a/src/whisper-utils/whisper-utils.h b/src/whisper-utils/whisper-utils.h
@@ -14,4 +14,12 @@ std::pair<int, int> findStartOfOverlap(const std::vector<whisper_token_data> &se
 std::vector<whisper_token_data> reconstructSentence(const std::vector<whisper_token_data> &seq1,
 						    const std::vector<whisper_token_data> &seq2);
 
+/**
+ * @brief Convert a timestamp in milliseconds to a string in the format "MM:SS.sss".
+ * Taken from https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp
+ * @param t_ms_offset Timestamp in milliseconds (offset from the beginning of the stream)
+ * @return std::string Timestamp in the format "MM:SS.sss" (minutes are not wrapped into hours)
+ */
+std::string to_timestamp(uint64_t t_ms_offset);
+
 #endif /* WHISPER_UTILS_H */