Skip to content

Commit

Permalink
Refactor and minor fixes (#181)
Browse files Browse the repository at this point in the history
  • Loading branch information
ricky0123 authored Jan 19, 2025
1 parent 4f3f2d7 commit e03c998
Show file tree
Hide file tree
Showing 12 changed files with 147 additions and 133 deletions.
4 changes: 4 additions & 0 deletions changelogs/react-changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# @ricky0123/vad-react Changelog

## 0.0.29

* Upgrade to vad-web 0.0.23

## 0.0.27

* Upgrade to vad-web 0.0.21
Expand Down
3 changes: 3 additions & 0 deletions changelogs/web-changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## 0.0.23

* Fix types in non-real-time VAD
* Use cdn package version "latest" by default for asset path
* Reduce re-renders [#184](https://github.com/ricky0123/vad/pull/184)
* Add support for changing VAD parameters dynamically [#137](https://github.com/ricky0123/vad/issues/173)
* Add onRealSpeechStart callback, a "not a misfire" event ([#67](https://github.com/ricky0123/vad/issues/67))

Expand Down
4 changes: 2 additions & 2 deletions packages/react/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@
"react"
],
"homepage": "https://github.com/ricky0123/vad",
"version": "0.0.28",
"version": "0.0.29",
"license": "ISC",
"main": "dist/index.js",
"devDependencies": {
"@types/react": "18.0.28"
},
"dependencies": {
"onnxruntime-web": "1.14.0",
"@ricky0123/vad-web": "0.0.22"
"@ricky0123/vad-web": "0.0.23"
},
"peerDependencies": {
"react": "18",
Expand Down
2 changes: 1 addition & 1 deletion packages/web/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Prompt your user for microphone permissions and run callbacks on segments of aud
Quick start:
```html
<script src="https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/ort.js"></script>
<script src="https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.22/dist/bundle.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@latest/dist/bundle.min.js"></script>
<script>
async function main() {
const myvad = await vad.MicVAD.new({
Expand Down
2 changes: 1 addition & 1 deletion packages/web/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"offline-speech-recognition"
],
"homepage": "https://github.com/ricky0123/vad",
"version": "0.0.22",
"version": "0.0.23",
"license": "ISC",
"main": "dist/index.js",
"unpkg": "dist/bundle.min.js",
Expand Down
78 changes: 49 additions & 29 deletions packages/web/src/frame-processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,14 @@ export function validateOptions(options: FrameProcessorOptions) {

export interface FrameProcessorInterface {
resume: () => void
process: (arr: Float32Array) => Promise<{
probs?: SpeechProbabilities
process: (
arr: Float32Array,
handleEvent: (event: FrameProcessorEvent) => any
) => Promise<any>
endSegment: (handleEvent: (event: FrameProcessorEvent) => any) => {
msg?: Message
audio?: Float32Array
}>
endSegment: () => { msg?: Message; audio?: Float32Array }
}
}

const concatArrays = (arrays: Float32Array[]): Float32Array => {
Expand All @@ -127,6 +129,7 @@ export class FrameProcessor implements FrameProcessorInterface {
redemptionCounter = 0
speechFrameCount = 0
active = false
speechRealStartFired = false

constructor(
public modelProcessFunc: (
Expand All @@ -147,21 +150,20 @@ export class FrameProcessor implements FrameProcessorInterface {
this.speechFrameCount = 0
}

pause = () => {
pause = (handleEvent: (event: FrameProcessorEvent) => any) => {
this.active = false
if (this.options.submitUserSpeechOnPause) {
return this.endSegment()
this.endSegment(handleEvent)
} else {
this.reset()
return {}
}
}

/**
 * Marks the processor as active. While `active` is false, `process`
 * returns immediately without running the model on the frame.
 */
resume = () => {
this.active = true
}

endSegment = () => {
endSegment = (handleEvent: (event: FrameProcessorEvent) => any) => {
const audioBuffer = this.audioBuffer
this.audioBuffer = []
const speaking = this.speaking
Expand All @@ -174,51 +176,49 @@ export class FrameProcessor implements FrameProcessorInterface {
if (speaking) {
if (speechFrameCount >= this.options.minSpeechFrames) {
const audio = concatArrays(audioBuffer.map((item) => item.frame))
return { msg: Message.SpeechEnd, audio }
handleEvent({ msg: Message.SpeechEnd, audio })
} else {
return { msg: Message.VADMisfire }
handleEvent({ msg: Message.VADMisfire })
}
}
return {}
}

process = async (frame: Float32Array) => {
process = async (
frame: Float32Array,
handleEvent: (event: FrameProcessorEvent) => any
) => {
if (!this.active) {
return {}
return
}

const probs = await this.modelProcessFunc(frame)
const isSpeech = probs.isSpeech >= this.options.positiveSpeechThreshold

handleEvent({ probs, msg: Message.FrameProcessed, frame })

this.audioBuffer.push({
frame,
isSpeech,
})

if (isSpeech) {
this.speechFrameCount++
}

if (
probs.isSpeech >= this.options.positiveSpeechThreshold &&
this.redemptionCounter
) {
this.redemptionCounter = 0
}

if (
probs.isSpeech >= this.options.positiveSpeechThreshold &&
!this.speaking
) {
if (isSpeech && !this.speaking) {
this.speaking = true
return { probs, msg: Message.SpeechStart, frame }
handleEvent({ msg: Message.SpeechStart })
}

if (
this.speaking &&
this.speechFrameCount === this.options.minSpeechFrames
this.speechFrameCount === this.options.minSpeechFrames &&
!this.speechRealStartFired
) {
return { probs, msg: Message.SpeechRealStart, frame }
this.speechRealStartFired = true
handleEvent({ msg: Message.SpeechRealStart })
}

if (
Expand All @@ -227,8 +227,9 @@ export class FrameProcessor implements FrameProcessorInterface {
++this.redemptionCounter >= this.options.redemptionFrames
) {
this.redemptionCounter = 0
this.speechFrameCount = 0
this.speaking = false

this.speechRealStartFired = false
const audioBuffer = this.audioBuffer
this.audioBuffer = []

Expand All @@ -238,9 +239,9 @@ export class FrameProcessor implements FrameProcessorInterface {

if (speechFrameCount >= this.options.minSpeechFrames) {
const audio = concatArrays(audioBuffer.map((item) => item.frame))
return { probs, msg: Message.SpeechEnd, audio, frame }
handleEvent({ msg: Message.SpeechEnd, audio })
} else {
return { probs, msg: Message.VADMisfire, frame }
handleEvent({ msg: Message.VADMisfire })
}
}

Expand All @@ -250,6 +251,25 @@ export class FrameProcessor implements FrameProcessorInterface {
}
this.speechFrameCount = 0
}
return { probs, frame }
}
}

/**
 * Events a FrameProcessor reports through the `handleEvent` callback passed
 * to `process` and `endSegment`. `msg` is the discriminant: narrow on it
 * before reading the variant-specific payload fields.
 */
export type FrameProcessorEvent =
  // A speech segment ended with fewer speech frames than `minSpeechFrames`.
  | { msg: Message.VADMisfire }
  // A frame reached the positive speech threshold while not already speaking.
  | { msg: Message.SpeechStart }
  // The current segment's speech frame count reached `minSpeechFrames`.
  | { msg: Message.SpeechRealStart }
  // A segment with at least `minSpeechFrames` speech frames ended; `audio`
  // is the concatenation of the frames buffered for that segment.
  | { msg: Message.SpeechEnd; audio: Float32Array }
  // Emitted for each frame once the model has produced its probabilities.
  | {
      msg: Message.FrameProcessed
      probs: SpeechProbabilities
      frame: Float32Array
    }
40 changes: 7 additions & 33 deletions packages/web/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,41 +1,17 @@
import * as ort from "onnxruntime-web"
import { baseAssetPath } from "./asset-path"
import { defaultModelFetcher } from "./default-model-fetcher"
import { FrameProcessor, FrameProcessorOptions } from "./frame-processor"
import { Message } from "./messages"
import {
NonRealTimeVADOptions,
PlatformAgnosticNonRealTimeVAD,
} from "./non-real-time-vad"
export { baseAssetPath } from "./asset-path"
export { defaultModelFetcher } from "./default-model-fetcher"
export { FrameProcessor } from "./frame-processor"
export type { FrameProcessorOptions } from "./frame-processor"
export { Message } from "./messages"
export { NonRealTimeVAD } from "./non-real-time-vad"
export type { NonRealTimeVADOptions } from "./non-real-time-vad"
import {
arrayBufferToBase64,
audioFileToArray,
encodeWAV,
minFramesForTargetMS,
} from "./utils"

export interface NonRealTimeVADOptionsWeb extends NonRealTimeVADOptions {
modelURL: string
modelFetcher: (path: string) => Promise<ArrayBuffer>
}

export const defaultNonRealTimeVADOptions = {
modelURL: baseAssetPath + "silero_vad_legacy.onnx",
modelFetcher: defaultModelFetcher,
}

class NonRealTimeVAD extends PlatformAgnosticNonRealTimeVAD {
static async new(
options: Partial<NonRealTimeVADOptionsWeb> = {}
): Promise<NonRealTimeVAD> {
const { modelURL, modelFetcher } = {
...defaultNonRealTimeVADOptions,
...options,
}
return await this._new(() => modelFetcher(modelURL), ort, options)
}
}

export const utils = {
audioFileToArray,
minFramesForTargetMS,
Expand All @@ -50,5 +26,3 @@ export {
getDefaultRealTimeVADOptions,
} from "./real-time-vad"
export type { RealTimeVADOptions } from "./real-time-vad"
export { FrameProcessor, Message, NonRealTimeVAD }
export type { FrameProcessorOptions, NonRealTimeVADOptions }
1 change: 1 addition & 0 deletions packages/web/src/messages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ export enum Message {
SpeechEnd = "SPEECH_END",
SpeechStop = "SPEECH_STOP",
SpeechRealStart = "SPEECH_REAL_START",
FrameProcessed = "FRAME_PROCESSED",
}
Loading

0 comments on commit e03c998

Please sign in to comment.