update and fix docs (#177)

* update and fix docs * fix typo
ricky0123 · Jan 7, 2025 · ba977e2 · ba977e2
1 parent 1d6d8b2
commit ba977e2
Show file tree

Hide file tree

Showing 7 changed files with 17 additions and 15 deletions.
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
@@ -0,0 +1 @@
+site_name: My Docs
diff --git a/docs/user-guide/api.md b/docs/user-guide/api.md
@@ -27,18 +27,18 @@ New instances of `MicVAD` are created by calling the async static method `MicVAD
 
 | Option                        | Type                                                          | Description                                                                                                                                                                                                       |
 | ----------------------------- | ------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `additionalAudioConstraints`  |                                                               | [constraints](https://developer.mozilla.org/en-US/docs/Web/API/MediaTrackConstraints) to pass to [getUserMedia](https://developer.mozilla.org/en-US/docs/Web/API/MediaDevices/getUserMedia) via the `audio` field |
-| `onFrameProcessed`            | `(probabilities: {isSpeech: float; notSpeech: float}) => any` | Callback to run after each frame.                                                                                                                                                                                 |
+| `additionalAudioConstraints`  | `Partial<MediaTrackConstraints>`                              | Additional [constraints](https://developer.mozilla.org/en-US/docs/Web/API/MediaTrackConstraints) to pass to [getUserMedia](https://developer.mozilla.org/en-US/docs/Web/API/MediaDevices/getUserMedia) via the `audio` field. Note that some constraints (channelCount, echoCancellation, autoGainControl, noiseSuppression) are set by default. |
+| `onFrameProcessed`            | `(probabilities: {isSpeech: float; notSpeech: float}, frame: Float32Array) => any` | Callback to run after each frame. The frame parameter contains the raw audio data for that frame.                                                                                                                    |
 | `onVADMisfire`                | `() => any`                                                   | Callback to run if speech start was detected but `onSpeechEnd` will not be run because the audio segment is smaller than `minSpeechFrames`                                                                        |
 | `onSpeechStart`               | `() => any`                                                   | Callback to run when speech start is detected                                                                                                                                                                     |
 | `onSpeechRealStart`           | `() => any`                                                   | Callback to run when the number of speech-positive frames exceeds the `minSpeechFrames` threshold                                                                                                                 |
 | `onSpeechEnd`                 | `(audio: Float32Array) => any`                                | Callback to run when speech end is detected. Takes as arg a Float32Array of audio samples between -1 and 1, sample rate 16000. This will not run if the audio segment is smaller than `minSpeechFrames`           |
 | `positiveSpeechThreshold`     | `number`                                                      | [see algorithm configuration](algorithm.md#configuration)                                                                                                                                                         |
 | `negativeSpeechThreshold`     | `number`                                                      | [see algorithm configuration](algorithm.md#configuration)                                                                                                                                                         |
 | `redemptionFrames`            | `number`                                                      | [see algorithm configuration](algorithm.md#configuration)                                                                                                                                                         |
-| `frameSamples`                | `number`                                                      | [see algorithm configuration](algorithm.md#configuration)                                                                                                                                                          |
+| `frameSamples`                | `number`                                                      | [see algorithm configuration](algorithm.md#configuration)                                                                                                                                                         |
 | `preSpeechPadFrames`          | `number`                                                      | [see algorithm configuration](algorithm.md#configuration)                                                                                                                                                         |
-| `minSpeechFrames`             | `number`                                                      | [see algorithm configuration](algorithm.md#configuration)                                                                                                                                                          |
+| `minSpeechFrames`             | `number`                                                      | [see algorithm configuration](algorithm.md#configuration)                                                                                                                                                         |
 | `model` | `"v5" or "legacy"` (default `"legacy"`) | whether to use the new Silero model or not |
 | `baseAssetPath` | `string` | URL or path relative to webroot where `vad.worklet.bundle.min.js`, `silero_vad_legacy.onnx`, and `silero_vad_v5.onnx` will be loaded from |
 | `onnxWASMBasePath` | `string` | URL or path relative to webroot where wasm files for onnxruntime-web will be loaded from |
@@ -123,17 +123,17 @@ The `useMicVAD` hook takes an options object with the following fields (all opti
 | Option                        | Type                                                          | Description                                                                                                                                                                                                       |
 | ----------------------------- | ------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `startOnLoad`                 | `boolean`                                                     | Should the VAD start listening to mic input when it finishes loading?                                                                                                                                             |
-| `additionalAudioConstraints`  |                                                               | [constraints](https://developer.mozilla.org/en-US/docs/Web/API/MediaTrackConstraints) to pass to [getUserMedia](https://developer.mozilla.org/en-US/docs/Web/API/MediaDevices/getUserMedia) via the `audio` field |
-| `onFrameProcessed`            | `(probabilities: {isSpeech: float; notSpeech: float}) => any` | Callback to run after each frame.                                                                                                                                                                                 |
+| `additionalAudioConstraints`  | `Partial<MediaTrackConstraints>`                              | Additional [constraints](https://developer.mozilla.org/en-US/docs/Web/API/MediaTrackConstraints) to pass to [getUserMedia](https://developer.mozilla.org/en-US/docs/Web/API/MediaDevices/getUserMedia) via the `audio` field. Note that some constraints (channelCount, echoCancellation, autoGainControl, noiseSuppression) are set by default. |
+| `onFrameProcessed`            | `(probabilities: {isSpeech: float; notSpeech: float}, frame: Float32Array) => any` | Callback to run after each frame. The frame parameter contains the raw audio data for that frame.                                                                                                                    |
 | `onVADMisfire`                | `() => any`                                                   | Callback to run if speech start was detected but `onSpeechEnd` will not be run because the audio segment is smaller than `minSpeechFrames`                                                                        |
 | `onSpeechStart`               | `() => any`                                                   | Callback to run when speech start is detected                                                                                                                                                                     |
 | `onSpeechEnd`                 | `(audio: Float32Array) => any`                                | Callback to run when speech end is detected. Takes as arg a Float32Array of audio samples between -1 and 1, sample rate 16000. This will not run if the audio segment is smaller than `minSpeechFrames`           |
 | `positiveSpeechThreshold`     | `number`                                                      | [see algorithm configuration](algorithm.md#configuration)                                                                                                                                                         |
 | `negativeSpeechThreshold`     | `number`                                                      | [see algorithm configuration](algorithm.md#configuration)                                                                                                                                                         |
 | `redemptionFrames`            | `number`                                                      | [see algorithm configuration](algorithm.md#configuration)                                                                                                                                                         |
-| `frameSamples`                | `number`                                                      | [see algorithm configuration](algorithm.md#configuration)                                                                                                                                                          |
+| `frameSamples`                | `number`                                                      | [see algorithm configuration](algorithm.md#configuration)                                                                                                                                                         |
 | `preSpeechPadFrames`          | `number`                                                      | [see algorithm configuration](algorithm.md#configuration)                                                                                                                                                         |
-| `minSpeechFrames`             | `number`                                                      | [see algorithm configuration](algorithm.md#configuration)                                                                                                                                                          |
+| `minSpeechFrames`             | `number`                                                      | [see algorithm configuration](algorithm.md#configuration)                                                                                                                                                         |
 
 ### Returns
 | Attributes     | Type                            | Description                                  |

diff --git a/docs/user-guide/react.md b/docs/user-guide/react.md
@@ -6,16 +6,13 @@
     npm i @ricky0123/vad-react
     ```
 
-2. Follow the [bundling instructions](browser.md#bundling) for `@ricky0123/vad-web`. To recap, you need to serve the worklet and onnx files that come distributed with `@ricky0123/vad-web` and the wasm files from `onnxruntime-web`, which will both be pulled in as dependencies.
-
-3. Use the `useMicVAD` hook to start the voice activity detector:
+2. Use the `useMicVAD` hook to start the voice activity detector:
 
     ```js linenums="1"
     import { useMicVAD } from "@ricky0123/vad-react"
 
     const MyComponent = () => {
     const vad = useMicVAD({
-        startOnLoad: true,
         onSpeechEnd: (audio) => {
         console.log("User stopped talking")
         },
@@ -24,3 +21,5 @@
     }
     ```
     See the docs for [useMicVAD](api.md#usemicvad) for details.
+
+3. The package will work out of the box with default CDN settings. For advanced configuration or if you want to serve files locally, you can refer to the [bundling documentation](browser.md#bundling).
diff --git a/examples/bundler/src/index.js b/examples/bundler/src/index.js
@@ -36,7 +36,7 @@ async function main() {
       negativeSpeechThreshold: 0.4,
       minSpeechFrames: 15,
       preSpeechPadFrames: 30,
-      onFrameProcessed: (probs) => {
+      onFrameProcessed: (probs, frame) => {
         const indicatorColor = interpolateInferno(probs.isSpeech / 2)
         document.body.style.setProperty("--indicator-color", indicatorColor)
       },

diff --git a/examples/script-tags/index.html b/examples/script-tags/index.html
@@ -27,7 +27,7 @@
           negativeSpeechThreshold: 0.4,
           minSpeechFrames: 15,
           preSpeechPadFrames: 30,
-          onFrameProcessed: (probs) => {
+          onFrameProcessed: (probs, frame) => {
             const indicatorColor = interpolateInferno(probs.isSpeech / 2)
             document.body.style.setProperty("--indicator-color", indicatorColor)
           },

diff --git a/packages/web/src/real-time-vad.ts b/packages/web/src/real-time-vad.ts
@@ -104,7 +104,7 @@ export const getDefaultRealTimeVADOptions: (
       : defaultLegacyFrameProcessorOptions
   return {
     ...frameProcessorOptions,
-    onFrameProcessed: (probabilities) => {},
+    onFrameProcessed: (probabilities, frame) => {},
     onVADMisfire: () => {
       log.debug("VAD misfire")
     },

diff --git a/test-site/src/index.tsx b/test-site/src/index.tsx
@@ -146,6 +146,8 @@ function VADDemo({ initializationParameters }) {
     onVADMisfire: () => {
       console.log("Vad misfire")
     },
+    onFrameProcessed: (probabilities, frame) => {
+    },
     onSpeechStart: () => {
       console.log("Speech start")
     },