🚀 feat: Optimize chunk-size according to the latest research

📝 docs(README.md): update command descriptions and examples for clarity and consistency
🔧 chore(setContextConfigCommand.ts): update default number of context documents to retrieve to 4
🔧 chore(index.ts): update default number of context documents to retrieve to 4, add chunkSize and chunkOverlap configuration options
🔧 chore(global.d.ts): add chunkSize and chunkOverlap properties to Config interface
🔧 chore(contextManager.ts): update text splitter usage to use defaultRecursiveCharacterTextSplitter for all file types, remove unused MarkdownTextSplitter
gmickel · Oct 9, 2023 · 6971e10 · 6971e10
1 parent 0916d4a
commit 6971e10
Show file tree

Hide file tree

Showing 5 changed files with 40 additions and 34 deletions.
diff --git a/README.md b/README.md
@@ -131,52 +131,45 @@ After starting the chatbot, simply type your questions or messages and press Ent
 ### Commands
 
 <!-- COMMANDS_START -->
-
 - `/add-docs` (/docs) - Adds new documents from your configured docs directory to the context vector store.
 
-  Usage: /add-docs example.txt example.md
-
-  Supports the following file types: .txt, .md, .pdf, .docx, .csv, .epub
+    Usage: /add-docs example.txt example.md
 
+    Supports the following file types: .txt, .md, .pdf, .docx, .csv, .epub
 - `/add-url` (/url) - Scrapes the content from a url and adds it to the context vector store.
 
-  Arguments: `url`, `selector to extract` (Default: body), `Maximum number of links to follow` (Default: 20), `Ignore pages with less than n characters` (Default: 200)
+    Arguments: `url`, `selector to extract` (Default: body), `Maximum number of links to follow` (Default: 20), `Ignore pages with less than n characters` (Default: 200)
 
-  Example: /add-url https://dociq.io main 10 500
+    Example: /add-url https://dociq.io main 10 500
 
-  This operation may try to generate a large number of embeddings depending on the structure of the web pages and may lead to rate-limiting.
-
-  To avoid this, you can try to target a specific selector such as `.main`
+    This operation may try to generate a large number of embeddings depending on the structure of the web pages and may lead to rate-limiting.
 
+    To avoid this, you can try to target a specific selector such as `.main`
 - `/add-youtube` (/yt) - Adds the transcript from a youtube video and adds it to the context vector store.
 
-  Arguments: `youtube url` or `youtube videoid`
-
-  Example: /add-url https://www.youtube.com/watch?v=VMj-3S1tku0
+    Arguments: `youtube url` or `youtube videoid`
 
+    Example: /add-youtube https://www.youtube.com/watch?v=VMj-3S1tku0
 - `/help` (/h, /?) - Show the list of available commands
 - `/list-context-stores` (/lcs) - Lists all available context vector stores and their details.
 
 - `/quit` (/q) - Terminates the script
 - `/reset` - Resets the chat and starts a new conversation - This clears the memory vector store and the buffer window memory.
 - `/context-config` (/cc) - Sets the number of relevant documents to return from the context vector store.
 
-  Arguments: `number of documents` (Default: 6)
-
-  Example: `/context-config 10`
+    Arguments: `number of documents` (Default: 4)
 
+    Example: `/context-config 10`
 - `/memory-config` (/mc) - Sets the number of relevant documents to return from the memory vector store.
 
-  Arguments: `number of documents` (Default: 4)
-
-  Example: /memory-config 10
+    Arguments: `number of documents` (Default: 4)
 
+    Example: /memory-config 10
 - `/change-context-store` (/ccs) - Loads an existing or creates a new empty context vector store as a subdirectory of the db directory.
 
-  Arguments: `subdirectory`
-
-  Example: /change-context-store newcontext
+    Arguments: `subdirectory`
 
+    Example: /change-context-store newcontext
 - `/toggle-window-memory` (/wm) - Toggles the window buffer memory (MemoryBot's short-term transient memory) on or off.
 <!-- COMMANDS_END -->
 

diff --git a/src/commands/setContextConfigCommand.ts b/src/commands/setContextConfigCommand.ts
@@ -6,7 +6,7 @@ const setContextConfigCommand = createCommand(
   'context-config',
   ['cc'],
   `Sets the number of relevant documents to return from the context vector store.\n
-    Arguments: \`number of documents\` (Default: 6)\n
+    Arguments: \`number of documents\` (Default: 4)\n
     Example: \`/context-config 10\``,
   async (args, output) => {
     if (!args || args.length !== 1) {

diff --git a/src/config/index.ts b/src/config/index.ts
@@ -19,9 +19,11 @@ export function getDefaultOraOptions(output: Writable): Options {
 
 const defaultConfig: Config = {
   currentVectorStoreDatabasePath: path.join(getProjectRoot(), process.env.VECTOR_STORE_DIR || 'db/default'),
-  numContextDocumentsToRetrieve: 6,
+  numContextDocumentsToRetrieve: 4,
   numMemoryDocumentsToRetrieve: 4,
   useWindowMemory: true,
+  chunkSize: 700,
+  chunkOverlap: 50,
 };
 
 let config: Config = { ...defaultConfig };

diff --git a/src/global.d.ts b/src/global.d.ts
@@ -21,6 +21,8 @@ interface Config {
   numContextDocumentsToRetrieve: number;
   numMemoryDocumentsToRetrieve: number;
   useWindowMemory: boolean;
+  chunkSize: number;
+  chunkOverlap: number;
 }
 
 interface FileInfo {

diff --git a/src/lib/contextManager.ts b/src/lib/contextManager.ts
@@ -9,7 +9,7 @@ import { DocxLoader } from 'langchain/document_loaders/fs/docx';
 import { EPubLoader } from 'langchain/document_loaders/fs/epub';
 import { CSVLoader } from 'langchain/document_loaders/fs/csv';
 import ora from 'ora';
-import { MarkdownTextSplitter, RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
 import { Document } from 'langchain/document';
 import path from 'path';
 import { YoutubeTranscript } from 'youtube-transcript';
@@ -23,6 +23,16 @@ const projectRootDir = getProjectRoot();
 
 const defaultOraOptions = getDefaultOraOptions(output);
 
+const defaultRecursiveCharacterTextSplitter = new RecursiveCharacterTextSplitter({
+  chunkSize: getConfig().chunkSize,
+  chunkOverlap: getConfig().chunkOverlap,
+});
+
+const markdownRecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter.fromLanguage('markdown', {
+  chunkSize: getConfig().chunkSize,
+  chunkOverlap: getConfig().chunkOverlap,
+});
+
 /**
  * This function loads and splits a file based on its extension using different loaders and text
  * splitters.
@@ -40,31 +50,31 @@ async function loadAndSplitFile(filePath: string): Promise<Document<Record<strin
   switch (fileExtension) {
     case '.json':
       loader = new JSONLoader(filePath);
-      documents = await loader.loadAndSplit(new RecursiveCharacterTextSplitter());
+      documents = await loader.loadAndSplit(defaultRecursiveCharacterTextSplitter);
       break;
     case '.txt':
       loader = new TextLoader(filePath);
-      documents = await loader.loadAndSplit(new RecursiveCharacterTextSplitter());
+      documents = await loader.loadAndSplit(defaultRecursiveCharacterTextSplitter);
       break;
     case '.md':
       loader = new TextLoader(filePath);
-      documents = await loader.loadAndSplit(new MarkdownTextSplitter());
+      documents = await loader.loadAndSplit(markdownRecursiveCharacterTextSplitter);
       break;
     case '.pdf':
       loader = new PDFLoader(filePath, { splitPages: false });
-      documents = await loader.loadAndSplit(new RecursiveCharacterTextSplitter());
+      documents = await loader.loadAndSplit(defaultRecursiveCharacterTextSplitter);
       break;
     case '.docx':
       loader = new DocxLoader(filePath);
-      documents = await loader.loadAndSplit(new RecursiveCharacterTextSplitter());
+      documents = await loader.loadAndSplit(defaultRecursiveCharacterTextSplitter);
       break;
     case '.csv':
       loader = new CSVLoader(filePath);
-      documents = await loader.loadAndSplit(new RecursiveCharacterTextSplitter());
+      documents = await loader.loadAndSplit(defaultRecursiveCharacterTextSplitter);
       break;
     case '.epub':
       loader = new EPubLoader(filePath, { splitChapters: false });
-      documents = await loader.loadAndSplit(new RecursiveCharacterTextSplitter());
+      documents = await loader.loadAndSplit(defaultRecursiveCharacterTextSplitter);
       break;
     default:
       throw new Error(`Unsupported file extension: ${fileExtension}`);
@@ -195,8 +205,7 @@ async function addYouTube(URLOrVideoID: string) {
     }).start();
     const transcript = await YoutubeTranscript.fetchTranscript(URLOrVideoID);
     const text = transcript.map((part) => part.text).join(' ');
-    const splitter = new RecursiveCharacterTextSplitter();
-    const videoDocs = await splitter.splitDocuments([
+    const videoDocs = await defaultRecursiveCharacterTextSplitter.splitDocuments([
       new Document({
         pageContent: text,
       }),
@@ -244,7 +253,7 @@ async function addURL(URL: string, selector: string, maxPages: number, numberOfC
 
     documents = await Promise.all(
       pages.map((row) => {
-        const splitter = new RecursiveCharacterTextSplitter();
+        const splitter = defaultRecursiveCharacterTextSplitter;
 
         const webDocs = splitter.splitDocuments([
           new Document({