Skip to content

Commit

Permalink
refactor: Update timestamp variable name in transcription-filter-data…
Browse files Browse the repository at this point in the history
….h (#109)
  • Loading branch information
royshil authored Jun 11, 2024
1 parent 845c1a8 commit 91c2842
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 45 deletions.
9 changes: 5 additions & 4 deletions src/tests/localvocal-offline-test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -465,9 +465,10 @@ int wmain(int argc, wchar_t *argv[])
struct transcription_filter_audio_info info = {0};
info.frames = frames; // number of frames in this packet
// make a timestamp from the current position in the audio buffer
info.timestamp = start_time + (int64_t)(((float)frames_count /
(float)gf->sample_rate) *
1e9);
info.timestamp_offset_ns =
start_time +
(int64_t)(((float)frames_count / (float)gf->sample_rate) *
1e9);
circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
}
frames_count += frames;
Expand All @@ -489,7 +490,7 @@ int wmain(int argc, wchar_t *argv[])
struct transcription_filter_audio_info info = {0};
info.frames = frames; // number of frames in this packet
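// make a timestamp from the current frame count
// NOTE(review): frames * 1000 / sample_rate is in milliseconds, while the field
// is named *_ns and is later divided by 1000000 - confirm the intended unit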
info.timestamp = frames_count * 1000 / gf->sample_rate;
info.timestamp_offset_ns = frames_count * 1000 / gf->sample_rate;
circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
}

Expand Down
2 changes: 1 addition & 1 deletion src/transcription-filter-data.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ struct transcription_filter_data {
// Audio packet info: one record is pushed to the info circlebuf for every
// audio packet and popped again by the segmentation code.
struct transcription_filter_audio_info {
// number of frames in this packet
uint32_t frames;
uint64_t timestamp; // absolute (since epoch) timestamp in ns
uint64_t timestamp_offset_ns; // offset (since start of processing) timestamp in ns
};

// Callback sent when the transcription has a new result
Expand Down
10 changes: 8 additions & 2 deletions src/transcription-filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,14 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
}
// push audio packet info (timestamp/frame count) to info circlebuf
struct transcription_filter_audio_info info = {0};
info.frames = audio->frames; // number of frames in this packet
info.timestamp = audio->timestamp; // timestamp of this packet
info.frames = audio->frames; // number of frames in this packet
// check if the timestamp is a false "negative" value for uint64_t
if (audio->timestamp > (std::numeric_limits<uint64_t>::max() - 100000000)) {
// no usable timestamp in this packet: fall back to a zero offset
info.timestamp_offset_ns = 0;
} else {
info.timestamp_offset_ns = audio->timestamp; // timestamp of this packet
}
circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
}

Expand Down
64 changes: 26 additions & 38 deletions src/whisper-utils/whisper-processing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,24 +22,10 @@

// VAD state that vad_based_segmentation() receives from its previous call
// and returns for the next one.
struct vad_state {
// set to true when a speech segment was detected but its end has not been
// reached yet; false otherwise
bool vad_on;
// NOTE(review): start_timestamp / end_timestamp look like the names used
// before the *_offset_ms members were introduced - confirm they are still needed
uint64_t start_timestamp;
uint64_t end_timestamp;
// start of the current segment, in milliseconds
// NOTE(review): "offest" is a misspelling of "offset"; renaming it means
// updating every user of this member
uint64_t start_ts_offest_ms;
// end of the current segment, in milliseconds
uint64_t end_ts_offset_ms;
};

// Taken from https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp
//
// Formats a millisecond count `t` as "MM:SS.sss". Minutes are not folded into
// hours, so the minutes field simply gets wider from 100 minutes onwards.
std::string to_timestamp(uint64_t t)
{
	const uint64_t whole_seconds = t / 1000;
	const uint64_t msec = t % 1000;
	const uint64_t min = whole_seconds / 60;
	const uint64_t sec = whole_seconds % 60;

	// Format as unsigned 64-bit values: narrowing the minutes to int would
	// print truncated or negative numbers for very large inputs.
	// Worst case is 15 + 1 + 2 + 1 + 3 characters, well within the buffer.
	char buf[32];
	snprintf(buf, sizeof(buf), "%02llu:%02llu.%03llu", static_cast<unsigned long long>(min),
		 static_cast<unsigned long long>(sec), static_cast<unsigned long long>(msec));

	return std::string(buf);
}

struct whisper_context *init_whisper_context(const std::string &model_path_in,
struct transcription_filter_data *gf)
{
Expand Down Expand Up @@ -314,8 +300,8 @@ void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_o
vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state)
{
uint32_t num_frames_from_infos = 0;
uint64_t start_timestamp = 0;
uint64_t end_timestamp = 0;
uint64_t start_timestamp_offset_ns = 0;
uint64_t end_timestamp_offset_ns = 0;
size_t overlap_size = 0;

for (size_t c = 0; c < gf->channels; c++) {
Expand All @@ -342,8 +328,8 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
while (gf->info_buffer.size >= size_of_audio_info) {
circlebuf_pop_front(&gf->info_buffer, &info_from_buf, size_of_audio_info);
num_frames_from_infos += info_from_buf.frames;
if (start_timestamp == 0) {
start_timestamp = info_from_buf.timestamp;
if (start_timestamp_offset_ns == 0) {
start_timestamp_offset_ns = info_from_buf.timestamp_offset_ns;
}
// Check if we're within the needed segment length
if (num_frames_from_infos > max_num_frames) {
Expand All @@ -354,7 +340,7 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
break;
}
}
end_timestamp = info_from_buf.timestamp;
end_timestamp_offset_ns = info_from_buf.timestamp_offset_ns;

/* Pop from input circlebuf */
for (size_t c = 0; c < gf->channels; c++) {
Expand Down Expand Up @@ -386,28 +372,29 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
resampled_16khz[0] + resampled_16khz_frames);
gf->vad->process(vad_input, false);

const uint64_t start_offset_ms = start_timestamp / 1000000 - gf->start_timestamp_ms;
const uint64_t end_offset_ms = end_timestamp / 1000000 - gf->start_timestamp_ms;
const uint64_t start_ts_offset_ms = start_timestamp_offset_ns / 1000000;
const uint64_t end_ts_offset_ms = end_timestamp_offset_ns / 1000000;

vad_state current_vad_state = {false, start_offset_ms, end_offset_ms};
vad_state current_vad_state = {false, start_ts_offset_ms, end_ts_offset_ms};

std::vector<timestamp_t> stamps = gf->vad->get_speech_timestamps();
if (stamps.size() == 0) {
obs_log(gf->log_level, "VAD detected no speech in %d frames",
resampled_16khz_frames);
if (last_vad_state.vad_on) {
obs_log(gf->log_level, "Last VAD was ON: segment end -> send to inference");
run_inference_and_callbacks(gf, last_vad_state.start_timestamp,
last_vad_state.end_timestamp, VAD_STATE_WAS_ON);
run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms,
last_vad_state.end_ts_offset_ms,
VAD_STATE_WAS_ON);
}

if (gf->enable_audio_chunks_callback) {
audio_chunk_callback(gf, resampled_16khz[0], resampled_16khz_frames,
VAD_STATE_IS_OFF,
{DETECTION_RESULT_SILENCE,
"[silence]",
current_vad_state.start_timestamp,
current_vad_state.end_timestamp,
current_vad_state.start_ts_offest_ms,
current_vad_state.end_ts_offset_ms,
{}});
}
} else {
Expand Down Expand Up @@ -447,29 +434,30 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
obs_log(gf->log_level, "VAD segment end -> send to inference");
// find the end timestamp of the segment
const uint64_t segment_end_ts =
start_offset_ms + end_frame * 1000 / WHISPER_SAMPLE_RATE;
run_inference_and_callbacks(gf, last_vad_state.start_timestamp,
start_ts_offset_ms + end_frame * 1000 / WHISPER_SAMPLE_RATE;
run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms,
segment_end_ts,
last_vad_state.vad_on
? VAD_STATE_WAS_ON
: VAD_STATE_WAS_OFF);
current_vad_state.vad_on = false;
current_vad_state.start_timestamp = current_vad_state.end_timestamp;
current_vad_state.end_timestamp = 0;
current_vad_state.start_ts_offest_ms =
current_vad_state.end_ts_offset_ms;
current_vad_state.end_ts_offset_ms = 0;
} else {
current_vad_state.vad_on = true;
if (last_vad_state.vad_on) {
current_vad_state.start_timestamp =
last_vad_state.start_timestamp;
current_vad_state.start_ts_offest_ms =
last_vad_state.start_ts_offest_ms;
} else {
current_vad_state.start_timestamp =
start_offset_ms +
current_vad_state.start_ts_offest_ms =
start_ts_offset_ms +
start_frame * 1000 / WHISPER_SAMPLE_RATE;
}
obs_log(gf->log_level,
"end not reached. vad state: start ts: %llu, end ts: %llu",
current_vad_state.start_timestamp,
current_vad_state.end_timestamp);
current_vad_state.start_ts_offest_ms,
current_vad_state.end_ts_offset_ms);
}
last_vad_state = current_vad_state;
}
Expand Down
13 changes: 13 additions & 0 deletions src/whisper-utils/whisper-utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,16 @@ std::vector<whisper_token_data> reconstructSentence(const std::vector<whisper_to

return reconstructed;
}

// Formats a millisecond offset as "MM:SS.sss".
// Minutes are not folded into hours; offsets of 100 minutes or more simply
// widen the minutes field.
std::string to_timestamp(uint64_t t_ms_offset)
{
	const uint64_t total_sec = t_ms_offset / 1000;
	const uint64_t msec = t_ms_offset % 1000;
	const uint64_t min = total_sec / 60;
	const uint64_t sec = total_sec % 60;

	// Keep the fields unsigned and 64 bits wide when printing: casting the
	// minutes to int would truncate (or turn negative) for very large offsets.
	// The longest possible result is 22 characters, so 32 bytes is enough.
	char buf[32];
	snprintf(buf, sizeof(buf), "%02llu:%02llu.%03llu", static_cast<unsigned long long>(min),
		 static_cast<unsigned long long>(sec), static_cast<unsigned long long>(msec));

	return std::string(buf);
}
8 changes: 8 additions & 0 deletions src/whisper-utils/whisper-utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,12 @@ std::pair<int, int> findStartOfOverlap(const std::vector<whisper_token_data> &se
std::vector<whisper_token_data> reconstructSentence(const std::vector<whisper_token_data> &seq1,
const std::vector<whisper_token_data> &seq2);

/**
 * @brief Convert a timestamp in milliseconds to a string in the format "MM:SS.sss".
 * Taken from https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp
 *
 * Minutes are not folded into hours: an offset of 100 minutes or more yields a
 * minutes field wider than two digits.
 *
 * @param t_ms_offset Timestamp in milliseconds (offset from the beginning of the stream)
 * @return std::string Timestamp in the format "MM:SS.sss"
 */
std::string to_timestamp(uint64_t t_ms_offset);

#endif /* WHISPER_UTILS_H */

0 comments on commit 91c2842

Please sign in to comment.