Merge pull request #428 from jhj0517/fix/faster-whisper
Install `faster-whisper` directly from repository
jhj0517 authored Dec 18, 2024
2 parents edc67ab + bdc4855 commit cd2c897
Showing 6 changed files with 36 additions and 38 deletions.
14 changes: 1 addition & 13 deletions backend/requirements-backend.txt
@@ -1,17 +1,5 @@
 # Whisper-WebUI dependencies
---extra-index-url https://download.pytorch.org/whl/cu124
-torch
-torchaudio
-git+https://github.com/jhj0517/jhj0517-whisper.git
-faster-whisper==1.0.3
-transformers
-gradio
-gradio-i18n
-pytubefix
-ruamel.yaml==0.18.6
-pyannote.audio==3.3.2
-git+https://github.com/jhj0517/ultimatevocalremover_api.git
-git+https://github.com/jhj0517/pyrubberband.git
+-r ./../requirements.txt
 
 # Backend dependencies
 python-dotenv
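Pip resolves a `-r` include relative to the requirements file that contains it, so `./../requirements.txt` climbs from `backend/` back to the repository root; the backend file now only lists its own extras on top of the shared dependencies (installed with, e.g., `pip install -r backend/requirements-backend.txt`).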
2 changes: 1 addition & 1 deletion configs/default_parameters.yaml
@@ -28,7 +28,7 @@ whisper:
   max_new_tokens: null
   hallucination_silence_threshold: null
   hotwords: null
-  language_detection_threshold: null
+  language_detection_threshold: 0.5
   language_detection_segments: 1
   add_timestamp: true
 
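Both options are forwarded to faster-whisper's `WhisperModel.transcribe()`, which accepts them in recent versions; 0.5 mirrors the upstream default. A minimal sketch of where the values land (model size and audio path are placeholders, not values from this repo):

```python
# Hedged sketch, assuming the faster-whisper API installed by this PR.
from faster_whisper import WhisperModel

model = WhisperModel("large-v2")          # placeholder model size
segments, info = model.transcribe(
    "audio.wav",                          # placeholder input
    language_detection_threshold=0.5,     # stop scanning once confidence exceeds 0.5
    language_detection_segments=1,        # number of 30 s segments sampled for detection
)
print(info.language, info.language_probability)
```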
52 changes: 31 additions & 21 deletions modules/vad/silero_vad.py
@@ -4,11 +4,13 @@
 import numpy as np
 from typing import BinaryIO, Union, List, Optional, Tuple
 import warnings
+import bisect
 import faster_whisper
-from modules.whisper.data_classes import *
 from faster_whisper.transcribe import SpeechTimestampsMap
 import gradio as gr
 
+from modules.whisper.data_classes import *
+
 
 class SileroVAD:
     def __init__(self):
@@ -58,6 +60,7 @@ def run(self,
                 vad_options=vad_parameters,
                 progress=progress
             )
+
             audio = self.collect_chunks(audio, speech_chunks)
             duration_after_vad = audio.shape[0] / sampling_rate
 
@@ -94,35 +97,27 @@ def get_speech_timestamps(
         min_silence_duration_ms = vad_options.min_silence_duration_ms
         window_size_samples = self.window_size_samples
         speech_pad_ms = vad_options.speech_pad_ms
-        sampling_rate = 16000
-        min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
-        speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+        min_speech_samples = self.sampling_rate * min_speech_duration_ms / 1000
+        speech_pad_samples = self.sampling_rate * speech_pad_ms / 1000
         max_speech_samples = (
-            sampling_rate * max_speech_duration_s
+            self.sampling_rate * max_speech_duration_s
             - window_size_samples
             - 2 * speech_pad_samples
         )
-        min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
-        min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
+        min_silence_samples = self.sampling_rate * min_silence_duration_ms / 1000
+        min_silence_samples_at_max_speech = self.sampling_rate * 98 / 1000
 
         audio_length_samples = len(audio)
 
-        state, context = self.model.get_initial_states(batch_size=1)
-
-        speech_probs = []
-        for current_start_sample in range(0, audio_length_samples, window_size_samples):
-            progress(current_start_sample/audio_length_samples, desc="Detecting speeches only using VAD...")
-
-            chunk = audio[current_start_sample: current_start_sample + window_size_samples]
-            if len(chunk) < window_size_samples:
-                chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
-            speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
-            speech_probs.append(speech_prob)
+        padded_audio = np.pad(
+            audio, (0, window_size_samples - audio.shape[0] % window_size_samples)
+        )
+        speech_probs = self.model(padded_audio.reshape(1, -1)).squeeze(0)
 
         triggered = False
         speeches = []
         current_speech = {}
-        neg_threshold = threshold - 0.15
+        neg_threshold = vad_options.neg_threshold
 
         # to save potential segment end (and tolerate some silence)
         temp_end = 0
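The old stateful loop fed one window at a time through the model (carrying `state`/`context` between calls) and reported progress per window; the new code pads the audio to a whole number of windows and runs a single batched forward pass, and `neg_threshold` is now taken from `VadOptions` instead of being hard-coded as `threshold - 0.15`. A runnable sketch of the padding arithmetic, with the model faked by random per-window probabilities so no Silero weights are needed:

```python
import numpy as np

window_size_samples = 512                   # Silero's window at 16 kHz
audio = np.zeros(12_345, dtype=np.float32)  # placeholder audio

# Pad to a whole number of windows. Note the edge case mirrored from the
# diff: a length that is already a multiple gains one full extra window.
pad = window_size_samples - audio.shape[0] % window_size_samples
padded_audio = np.pad(audio, (0, pad))
batch = padded_audio.reshape(1, -1)         # the shape the real model consumes

num_windows = padded_audio.shape[0] // window_size_samples
speech_probs = np.random.rand(num_windows)  # stand-in for self.model(batch).squeeze(0)
print(num_windows, speech_probs.shape)      # 25 windows for this input
```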
@@ -258,8 +253,23 @@ def restore_speech_timestamps(
         ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
 
         for segment in segments:
-            segment.start = ts_map.get_original_time(segment.start)
-            segment.end = ts_map.get_original_time(segment.end)
+            if segment.words:
+                words = []
+                for word in segment.words:
+                    # Ensure the word start and end times are resolved to the same chunk.
+                    middle = (word.start + word.end) / 2
+                    chunk_index = ts_map.get_chunk_index(middle)
+                    word.start = ts_map.get_original_time(word.start, chunk_index)
+                    word.end = ts_map.get_original_time(word.end, chunk_index)
+                    words.append(word)
+
+                segment.start = words[0].start
+                segment.end = words[-1].end
+                segment.words = words
+
+            else:
+                segment.start = ts_map.get_original_time(segment.start)
+                segment.end = ts_map.get_original_time(segment.end)
 
         return segments
 
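The word-level branch pins each word's start and end to the chunk containing the word's midpoint before mapping back to original time. A self-contained example of why that matters, using faster-whisper's `SpeechTimestampsMap` with fabricated chunk boundaries (0-2 s and 10-12 s of speech kept): a word straddling a VAD-time chunk edge would otherwise have its two ends remapped through different chunks and absorb the silence VAD removed.

```python
from faster_whisper.transcribe import SpeechTimestampsMap

sampling_rate = 16000
speech_chunks = [  # fabricated: sample offsets of the speech kept by VAD
    {"start": 0, "end": 2 * sampling_rate},
    {"start": 10 * sampling_rate, "end": 12 * sampling_rate},
]
ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)

word_start, word_end = 1.95, 2.05  # VAD-time word straddling the chunk edge
naive = (ts_map.get_original_time(word_start),
         ts_map.get_original_time(word_end))
chunk_index = ts_map.get_chunk_index((word_start + word_end) / 2)
pinned = (ts_map.get_original_time(word_start, chunk_index),
          ts_map.get_original_time(word_end, chunk_index))
print(naive)   # (1.95, 10.05) -- an eight-second "word"
print(pinned)  # (9.95, 10.05) -- both ends resolved through the same chunk
```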
2 changes: 1 addition & 1 deletion modules/whisper/data_classes.py
@@ -319,7 +319,7 @@ class WhisperParams(BaseParams):
     )
     hotwords: Optional[str] = Field(default=None, description="Hotwords/hint phrases for the model")
     language_detection_threshold: Optional[float] = Field(
-        default=None,
+        default=0.5,
         description="Threshold for language detection probability"
     )
     language_detection_segments: int = Field(
2 changes: 1 addition & 1 deletion notebook/whisper-webui.ipynb
@@ -53,7 +53,7 @@
 "!git clone https://github.com/jhj0517/Whisper-WebUI.git\n",
 "%cd Whisper-WebUI\n",
 "!pip install git+https://github.com/jhj0517/jhj0517-whisper.git\n",
-"!pip install faster-whisper==1.0.3\n",
+"!pip install git+https://github.com/SYSTRAN/faster-whisper.git\n",
 "!pip install ctranslate2==4.4.0\n",
 "!pip install gradio\n",
 "!pip install gradio-i18n\n",
2 changes: 1 addition & 1 deletion requirements.txt
@@ -8,7 +8,7 @@
 torch
 torchaudio
 git+https://github.com/jhj0517/jhj0517-whisper.git
-faster-whisper==1.0.3
+git+https://github.com/SYSTRAN/faster-whisper.git
 transformers
 gradio
 gradio-i18n
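Note that `git+https://github.com/SYSTRAN/faster-whisper.git` tracks the tip of the default branch, so installs are not pinned; pip also accepts a fixed ref of the form `git+https://github.com/SYSTRAN/faster-whisper.git@<ref>` (with `<ref>` a placeholder for a tag or commit hash) should reproducibility be needed later.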
