langchain-ai · ptgoetz · May 26, 2024 · May 26, 2024 · May 26, 2024
diff --git a/backend/app/parsing.py b/backend/app/parsing.py
@@ -1,12 +1,105 @@
 """Module contains logic for parsing binary blobs into text."""
-from langchain_community.document_loaders.parsers import BS4HTMLParser, PDFMinerParser
-from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser
+from typing import Iterator, Mapping, Optional
+
+from langchain.text_splitter import MarkdownHeaderTextSplitter
+from langchain_community.document_loaders.parsers import (
+    BS4HTMLParser,
+    PDFMinerParser,
+)
 from langchain_community.document_loaders.parsers.msword import MsWordParser
 from langchain_community.document_loaders.parsers.txt import TextParser
+from langchain_core.document_loaders import BaseBlobParser
+from langchain_core.document_loaders.blob_loaders import Blob
+from langchain_core.documents import Document
+
+
+class MimeTypeParser(BaseBlobParser):
+    """Parser that uses `mime`-types to parse a blob.
+
+    This parser is useful for simple pipelines where the mime-type is sufficient
+    to determine how to parse a blob.
+
+    To use, configure handlers based on mime-types and pass them to the initializer.
+
+    Example:
+
+        .. code-block:: python
+
+            from app.parsing import MimeTypeParser
+
+            parser = MimeTypeParser(
+                handlers={
+                    "application/pdf": ...,
+                },
+                fallback_parser=...,
+            )
+    """  # noqa: E501
+
+    def __init__(
+        self,
+        handlers: Mapping[str, BaseBlobParser],
+        *,
+        fallback_parser: Optional[BaseBlobParser] = None,
+    ) -> None:
+        """Define a parser that uses mime-types to determine how to parse a blob.
+
+        Args:
+            handlers: A mapping from mime-types to the parsers that handle them.
+            fallback_parser: Parser used for every mime-type not found in
+                             the handlers. If not provided, a ValueError is
+                             raised for mime-types missing from the handlers.
+        """
+        self.handlers = handlers
+        self.fallback_parser = fallback_parser
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily parse a blob with the handler registered for its mime-type.
+
+        Raises:
+            ValueError: If the mime-type cannot be determined, or if it has no
+                handler and no fallback parser is configured.
+        """
+        mimetype = blob.mimetype
+
+        if mimetype is None:
+            # `blob.path` may be None or a plain str; do not assume `.name` exists.
+            if blob.path is not None and str(blob.path).endswith(".md"):
+                mimetype = "text/markdown"
+            else:
+                raise ValueError(f"{blob} does not have a mimetype.")
+
+        if mimetype in self.handlers:
+            yield from self.handlers[mimetype].lazy_parse(blob)
+        elif self.fallback_parser is not None:
+            yield from self.fallback_parser.lazy_parse(blob)
+        else:
+            raise ValueError(f"Unsupported mime type: {mimetype}")
+
+
+class MarkdownParser(BaseBlobParser):
+    """Parser for Markdown blobs that splits the text on its headers."""
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
+        """Yield the documents the header splitter produces for the blob."""
+        # One entry per Markdown header level (1-6 leading `#` characters).
+        headers_to_split_on = [
+            ("#", "Header 1"),
+            ("##", "Header 2"),
+            ("###", "Header 3"),
+            ("####", "Header 4"),
+            ("#####", "Header 5"),
+            ("######", "Header 6"),
+        ]
+        splitter = MarkdownHeaderTextSplitter(
+            headers_to_split_on, return_each_line=True
+        )
+        yield from splitter.split_text(blob.as_string())
+
 
 HANDLERS = {
     "application/pdf": PDFMinerParser(),
     "text/plain": TextParser(),
+    "text/markdown": MarkdownParser(),
     "text/html": BS4HTMLParser(),
     "application/msword": MsWordParser(),
     "application/vnd.openxmlformats-officedocument.wordprocessingml.document": (
@@ -18,7 +111,7 @@
 
 # PUBLIC API
 
-MIMETYPE_BASED_PARSER = MimeTypeBasedParser(
+MIMETYPE_BASED_PARSER = MimeTypeParser(
     handlers=HANDLERS,
     fallback_parser=None,
 )
diff --git a/backend/app/upload.py b/backend/app/upload.py
@@ -32,8 +32,12 @@
 def _guess_mimetype(file_name: str, file_bytes: bytes) -> str:
     """Guess the mime-type of a file based on its name or bytes."""
     # Guess based on the file extension
+
     mime_type, _ = mimetypes.guess_type(file_name)
 
+    if file_name.endswith(".md"):
+        return "text/markdown"
+
     # Return detected mime type from mimetypes guess, unless it's None
     if mime_type:
         return mime_type

diff --git a/backend/tests/unit_tests/agent_executor/test_parsing.py b/backend/tests/unit_tests/agent_executor/test_parsing.py
@@ -14,6 +14,7 @@ def test_list_of_supported_mimetypes() -> None:
         "application/pdf",
         "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
         "text/html",
+        "text/markdown",
         "text/plain",
     ]
 
@@ -23,17 +24,22 @@ def test_attempt_to_parse_each_fixture() -> None:
     seen_mimetypes = set()
     for path in get_sample_paths():
         type_, _ = mimetypes.guess_type(path)
+        if path.name.endswith(".md"):
+            type_ = "text/markdown"
         if type_ not in SUPPORTED_MIMETYPES:
             continue
         seen_mimetypes.add(type_)
         blob = Blob.from_path(path)
         documents = MIMETYPE_BASED_PARSER.parse(blob)
         try:
-            assert len(documents) == 1
-            doc = documents[0]
-            assert "source" in doc.metadata
-            assert doc.metadata["source"] == str(path)
-            assert "🦜" in doc.page_content
+            if type_ == "text/markdown":
+                assert len(documents) >= 1
+            else:
+                assert len(documents) == 1
+                doc = documents[0]
+                assert "source" in doc.metadata
+                assert doc.metadata["source"] == str(path)
+                assert "🦜" in doc.page_content
         except Exception as e:
             raise AssertionError(f"Failed to parse {path}") from e
 

diff --git a/backend/tests/unit_tests/agent_executor/test_upload.py b/backend/tests/unit_tests/agent_executor/test_upload.py
@@ -1,7 +1,8 @@
 from io import BytesIO
 
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 from fastapi import UploadFile
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
 from app.upload import IngestRunnable, _guess_mimetype, convert_ingestion_input_to_blob
 from tests.unit_tests.fixtures import get_sample_paths
 from tests.unit_tests.utils import InMemoryVectorStore
@@ -46,4 +47,5 @@ def test_mimetype_guessing() -> None:
         "sample.pdf": "application/pdf",
         "sample.rtf": "application/rtf",
         "sample.txt": "text/plain",
+        "sample.md": "text/markdown",
     } == name_to_mime
diff --git a/backend/tests/unit_tests/fixtures/sample.md b/backend/tests/unit_tests/fixtures/sample.md
@@ -0,0 +1,13 @@
+# 🦜️ LangChain
+
+## Heading 2
+
+Some text for heading 2.
+
+### Heading 3
+
+Some text for heading 3.
+
+#### Heading 4
+
+Some text for heading 4.
diff --git a/frontend/src/constants.ts b/frontend/src/constants.ts
@@ -27,7 +27,7 @@ export type TYPE_NAME = (typeof TYPES)[keyof typeof TYPES]["id"];
 export const DROPZONE_CONFIG = {
   multiple: true,
   accept: {
-    "text/*": [".txt", ".htm", ".html"],
+    "text/*": [".txt", ".htm", ".html", ".md"],
     "application/pdf": [".pdf"],
     "application/vnd.openxmlformats-officedocument.wordprocessingml.document": [
       ".docx",