Refactor and minor fixes (#181)

ricky0123 · Jan 19, 2025 · e03c998 · e03c998
1 parent 4f3f2d7
commit e03c998
Show file tree

Hide file tree

Showing 12 changed files with 147 additions and 133 deletions.
diff --git a/changelogs/react-changelog.md b/changelogs/react-changelog.md
@@ -1,5 +1,9 @@
 # @ricky0123/vad-react Changelog
 
+## 0.0.29
+
+* Upgrade to vad-web 0.0.23
+
 ## 0.0.27
 
 * Upgrade to vad-web 0.0.21

diff --git a/changelogs/web-changelog.md b/changelogs/web-changelog.md
@@ -2,6 +2,9 @@
 
 ## 0.0.23
 
+* Fix types in non-real-time VAD
+* Use CDN package version "latest" by default for asset path
+* Reduce re-renders [#184](https://github.com/ricky0123/vad/pull/184)
 * add support to change VAD parameters dynamically [#137] https://github.com/ricky0123/vad/issues/173
 * add onRealSpeechStart callback - "not a misfire" event (https://github.com/ricky0123/vad/issues/67)
 

diff --git a/packages/react/package.json b/packages/react/package.json
@@ -13,15 +13,15 @@
     "react"
   ],
   "homepage": "https://github.com/ricky0123/vad",
-  "version": "0.0.28",
+  "version": "0.0.29",
   "license": "ISC",
   "main": "dist/index.js",
   "devDependencies": {
     "@types/react": "18.0.28"
   },
   "dependencies": {
     "onnxruntime-web": "1.14.0",
-    "@ricky0123/vad-web": "0.0.22"
+    "@ricky0123/vad-web": "0.0.23"
   },
   "peerDependencies": {
     "react": "18",

diff --git a/packages/web/README.md b/packages/web/README.md
@@ -5,7 +5,7 @@ Prompt your user for microphone permissions and run callbacks on segments of aud
 Quick start:
 ```html
 <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/ort.js"></script>
-<script src="https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.22/dist/bundle.min.js"></script>
+<script src="https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@latest/dist/bundle.min.js"></script>
 <script>
   async function main() {
     const myvad = await vad.MicVAD.new({

diff --git a/packages/web/package.json b/packages/web/package.json
@@ -12,7 +12,7 @@
     "offline-speech-recognition"
   ],
   "homepage": "https://github.com/ricky0123/vad",
-  "version": "0.0.22",
+  "version": "0.0.23",
   "license": "ISC",
   "main": "dist/index.js",
   "unpkg": "dist/bundle.min.js",

diff --git a/packages/web/src/frame-processor.ts b/packages/web/src/frame-processor.ts
@@ -97,12 +97,14 @@ export function validateOptions(options: FrameProcessorOptions) {
 
 export interface FrameProcessorInterface {
   resume: () => void
-  process: (arr: Float32Array) => Promise<{
-    probs?: SpeechProbabilities
+  process: (
+    arr: Float32Array,
+    handleEvent: (event: FrameProcessorEvent) => any
+  ) => Promise<any>
+  endSegment: (handleEvent: (event: FrameProcessorEvent) => any) => {
     msg?: Message
     audio?: Float32Array
-  }>
-  endSegment: () => { msg?: Message; audio?: Float32Array }
+  }
 }
 
 const concatArrays = (arrays: Float32Array[]): Float32Array => {
@@ -127,6 +129,7 @@ export class FrameProcessor implements FrameProcessorInterface {
   redemptionCounter = 0
   speechFrameCount = 0
   active = false
+  speechRealStartFired = false
 
   constructor(
     public modelProcessFunc: (
@@ -147,21 +150,20 @@ export class FrameProcessor implements FrameProcessorInterface {
     this.speechFrameCount = 0
   }
 
-  pause = () => {
+  pause = (handleEvent: (event: FrameProcessorEvent) => any) => {
     this.active = false
     if (this.options.submitUserSpeechOnPause) {
-      return this.endSegment()
+      this.endSegment(handleEvent)
     } else {
       this.reset()
-      return {}
     }
   }
 
   resume = () => {
     this.active = true
   }
 
-  endSegment = () => {
+  endSegment = (handleEvent: (event: FrameProcessorEvent) => any) => {
     const audioBuffer = this.audioBuffer
     this.audioBuffer = []
     const speaking = this.speaking
@@ -174,51 +176,49 @@ export class FrameProcessor implements FrameProcessorInterface {
     if (speaking) {
       if (speechFrameCount >= this.options.minSpeechFrames) {
         const audio = concatArrays(audioBuffer.map((item) => item.frame))
-        return { msg: Message.SpeechEnd, audio }
+        handleEvent({ msg: Message.SpeechEnd, audio })
       } else {
-        return { msg: Message.VADMisfire }
+        handleEvent({ msg: Message.VADMisfire })
       }
     }
     return {}
   }
 
-  process = async (frame: Float32Array) => {
+  process = async (
+    frame: Float32Array,
+    handleEvent: (event: FrameProcessorEvent) => any
+  ) => {
     if (!this.active) {
-      return {}
+      return
     }
 
     const probs = await this.modelProcessFunc(frame)
     const isSpeech = probs.isSpeech >= this.options.positiveSpeechThreshold
 
+    handleEvent({ probs, msg: Message.FrameProcessed, frame })
+
     this.audioBuffer.push({
       frame,
       isSpeech,
     })
 
     if (isSpeech) {
       this.speechFrameCount++
-    }
-
-    if (
-      probs.isSpeech >= this.options.positiveSpeechThreshold &&
-      this.redemptionCounter
-    ) {
       this.redemptionCounter = 0
     }
 
-    if (
-      probs.isSpeech >= this.options.positiveSpeechThreshold &&
-      !this.speaking
-    ) {
+    if (isSpeech && !this.speaking) {
       this.speaking = true
-      return { probs, msg: Message.SpeechStart, frame }
+      handleEvent({ msg: Message.SpeechStart })
     }
 
     if (
       this.speaking &&
-      this.speechFrameCount === this.options.minSpeechFrames
+      this.speechFrameCount === this.options.minSpeechFrames &&
+      !this.speechRealStartFired
     ) {
-      return { probs, msg: Message.SpeechRealStart, frame }
+      this.speechRealStartFired = true
+      handleEvent({ msg: Message.SpeechRealStart })
     }
 
     if (
@@ -227,8 +227,9 @@ export class FrameProcessor implements FrameProcessorInterface {
       ++this.redemptionCounter >= this.options.redemptionFrames
     ) {
       this.redemptionCounter = 0
+      this.speechFrameCount = 0
       this.speaking = false
-
+      this.speechRealStartFired = false
       const audioBuffer = this.audioBuffer
       this.audioBuffer = []
 
@@ -238,9 +239,9 @@ export class FrameProcessor implements FrameProcessorInterface {
 
       if (speechFrameCount >= this.options.minSpeechFrames) {
         const audio = concatArrays(audioBuffer.map((item) => item.frame))
-        return { probs, msg: Message.SpeechEnd, audio, frame }
+        handleEvent({ msg: Message.SpeechEnd, audio })
       } else {
-        return { probs, msg: Message.VADMisfire, frame }
+        handleEvent({ msg: Message.VADMisfire })
       }
     }
 
@@ -250,6 +251,25 @@ export class FrameProcessor implements FrameProcessorInterface {
       }
       this.speechFrameCount = 0
     }
-    return { probs, frame }
   }
 }
+
+export type FrameProcessorEvent =
+  | {
+      msg: Message.VADMisfire
+    }
+  | {
+      msg: Message.SpeechStart
+    }
+  | {
+      msg: Message.SpeechRealStart
+    }
+  | {
+      msg: Message.SpeechEnd
+      audio: Float32Array
+    }
+  | {
+      msg: Message.FrameProcessed
+      probs: SpeechProbabilities
+      frame: Float32Array
+    }
diff --git a/packages/web/src/index.ts b/packages/web/src/index.ts
@@ -1,41 +1,17 @@
-import * as ort from "onnxruntime-web"
-import { baseAssetPath } from "./asset-path"
-import { defaultModelFetcher } from "./default-model-fetcher"
-import { FrameProcessor, FrameProcessorOptions } from "./frame-processor"
-import { Message } from "./messages"
-import {
-  NonRealTimeVADOptions,
-  PlatformAgnosticNonRealTimeVAD,
-} from "./non-real-time-vad"
+export { baseAssetPath } from "./asset-path"
+export { defaultModelFetcher } from "./default-model-fetcher"
+export { FrameProcessor } from "./frame-processor"
+export type { FrameProcessorOptions } from "./frame-processor"
+export { Message } from "./messages"
+export { NonRealTimeVAD } from "./non-real-time-vad"
+export type { NonRealTimeVADOptions } from "./non-real-time-vad"
 import {
   arrayBufferToBase64,
   audioFileToArray,
   encodeWAV,
   minFramesForTargetMS,
 } from "./utils"
 
-export interface NonRealTimeVADOptionsWeb extends NonRealTimeVADOptions {
-  modelURL: string
-  modelFetcher: (path: string) => Promise<ArrayBuffer>
-}
-
-export const defaultNonRealTimeVADOptions = {
-  modelURL: baseAssetPath + "silero_vad_legacy.onnx",
-  modelFetcher: defaultModelFetcher,
-}
-
-class NonRealTimeVAD extends PlatformAgnosticNonRealTimeVAD {
-  static async new(
-    options: Partial<NonRealTimeVADOptionsWeb> = {}
-  ): Promise<NonRealTimeVAD> {
-    const { modelURL, modelFetcher } = {
-      ...defaultNonRealTimeVADOptions,
-      ...options,
-    }
-    return await this._new(() => modelFetcher(modelURL), ort, options)
-  }
-}
-
 export const utils = {
   audioFileToArray,
   minFramesForTargetMS,
@@ -50,5 +26,3 @@ export {
   getDefaultRealTimeVADOptions,
 } from "./real-time-vad"
 export type { RealTimeVADOptions } from "./real-time-vad"
-export { FrameProcessor, Message, NonRealTimeVAD }
-export type { FrameProcessorOptions, NonRealTimeVADOptions }
diff --git a/packages/web/src/messages.ts b/packages/web/src/messages.ts
@@ -5,4 +5,5 @@ export enum Message {
   SpeechEnd = "SPEECH_END",
   SpeechStop = "SPEECH_STOP",
   SpeechRealStart = "SPEECH_REAL_START",
+  FrameProcessed = "FRAME_PROCESSED",
 }