From b77ee89238c3cbe01060683f382852e9bb5cc8cc Mon Sep 17 00:00:00 2001 From: "P. Taylor Goetz" Date: Sat, 25 May 2024 23:52:59 -0400 Subject: [PATCH 1/3] Allow Markdown file upload for RAG --- backend/app/parsing.py | 95 ++++++++++++++++++- backend/app/upload.py | 4 + .../unit_tests/agent_executor/test_parsing.py | 16 +++- .../unit_tests/agent_executor/test_upload.py | 1 + backend/tests/unit_tests/fixtures/sample.md | 13 +++ frontend/src/constants.ts | 2 +- 6 files changed, 122 insertions(+), 9 deletions(-) create mode 100644 backend/tests/unit_tests/fixtures/sample.md diff --git a/backend/app/parsing.py b/backend/app/parsing.py index 7f719a3d..a4d07b4a 100644 --- a/backend/app/parsing.py +++ b/backend/app/parsing.py @@ -1,12 +1,101 @@ """Module contains logic for parsing binary blobs into text.""" -from langchain_community.document_loaders.parsers import BS4HTMLParser, PDFMinerParser -from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser +from typing import Iterator, Mapping, Optional + +from langchain.text_splitter import MarkdownHeaderTextSplitter +from langchain_community.document_loaders.parsers import ( + BS4HTMLParser, + PDFMinerParser, +) from langchain_community.document_loaders.parsers.msword import MsWordParser from langchain_community.document_loaders.parsers.txt import TextParser +from langchain_core.document_loaders import BaseBlobParser +from langchain_core.document_loaders.blob_loaders import Blob +from langchain_core.documents import Document + +class MimeTypeBasedParser(BaseBlobParser): + """Parser that uses `mime`-types to parse a blob. + + This parser is useful for simple pipelines where the mime-type is sufficient + to determine how to parse a blob. + + To use, configure handlers based on mime-types and pass them to the initializer. + + Example: + + .. 
code-block:: python + + from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser + + parser = MimeTypeBasedParser( + handlers={ + "application/pdf": ..., + }, + fallback_parser=..., + ) + """ # noqa: E501 + + def __init__( + self, + handlers: Mapping[str, BaseBlobParser], + *, + fallback_parser: Optional[BaseBlobParser] = None, + ) -> None: + """Define a parser that uses mime-types to determine how to parse a blob. + + Args: + handlers: A mapping from mime-types to functions that take a blob, parse it + and return a document. + fallback_parser: A fallback parser to use if the mime-type is not + found in the handlers. If provided, this parser will be + used to parse blobs with all mime-types not found in + the handlers. + If not provided, a ValueError will be raised if the + mime-type is not found in the handlers. + """ + self.handlers = handlers + self.fallback_parser = fallback_parser + + def lazy_parse(self, blob: Blob) -> Iterator[Document]: + """Load documents from a blob.""" + mimetype = blob.mimetype + + + if mimetype is None: + if blob.path is not None and str(blob.path).endswith(".md"): + mimetype = "text/markdown" + else: + raise ValueError(f"{blob} does not have a mimetype.") + + if mimetype in self.handlers: + handler = self.handlers[mimetype] + yield from handler.lazy_parse(blob) + else: + if self.fallback_parser is not None: + yield from self.fallback_parser.lazy_parse(blob) + else: + raise ValueError(f"Unsupported mime type: {mimetype}") +class MarkdownParser(BaseBlobParser): + """Parser for Markdown blobs.""" + + def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] + """Lazily parse the blob.""" + headers_to_split_on = [ + ("#", "Header 1"), + ("##", "Header 2"), + ("###", "Header 3"), + ("####", "Header 4"), + ("#####", "Header 5"), + ("######", "Header 6"), + ] + splitter = MarkdownHeaderTextSplitter(headers_to_split_on) + for doc in splitter.split_text(blob.as_string()): + yield doc + HANDLERS = { 
"application/pdf": PDFMinerParser(), "text/plain": TextParser(), + "text/markdown": MarkdownParser(), "text/html": BS4HTMLParser(), "application/msword": MsWordParser(), "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ( @@ -21,4 +110,4 @@ MIMETYPE_BASED_PARSER = MimeTypeBasedParser( handlers=HANDLERS, fallback_parser=None, -) +) \ No newline at end of file diff --git a/backend/app/upload.py b/backend/app/upload.py index e2dac7e9..1009fa45 100644 --- a/backend/app/upload.py +++ b/backend/app/upload.py @@ -32,8 +32,12 @@ def _guess_mimetype(file_name: str, file_bytes: bytes) -> str: """Guess the mime-type of a file based on its name or bytes.""" # Guess based on the file extension + mime_type, _ = mimetypes.guess_type(file_name) + if file_name.endswith(".md"): + return "text/markdown" + # Return detected mime type from mimetypes guess, unless it's None if mime_type: return mime_type diff --git a/backend/tests/unit_tests/agent_executor/test_parsing.py b/backend/tests/unit_tests/agent_executor/test_parsing.py index b4a9ee5a..3067eb73 100644 --- a/backend/tests/unit_tests/agent_executor/test_parsing.py +++ b/backend/tests/unit_tests/agent_executor/test_parsing.py @@ -14,6 +14,7 @@ def test_list_of_supported_mimetypes() -> None: "application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "text/html", + "text/markdown", "text/plain", ] @@ -23,17 +24,22 @@ def test_attempt_to_parse_each_fixture() -> None: seen_mimetypes = set() for path in get_sample_paths(): type_, _ = mimetypes.guess_type(path) + if path.name.endswith(".md"): + type_ = "text/markdown" if type_ not in SUPPORTED_MIMETYPES: continue seen_mimetypes.add(type_) blob = Blob.from_path(path) documents = MIMETYPE_BASED_PARSER.parse(blob) try: - assert len(documents) == 1 - doc = documents[0] - assert "source" in doc.metadata - assert doc.metadata["source"] == str(path) - assert "🦜" in doc.page_content + if type_ == "text/markdown": + assert 
len(documents) >= 1 + else: + assert len(documents) == 1 + doc = documents[0] + assert "source" in doc.metadata + assert doc.metadata["source"] == str(path) + assert "🦜" in doc.page_content except Exception as e: raise AssertionError(f"Failed to parse {path}") from e diff --git a/backend/tests/unit_tests/agent_executor/test_upload.py b/backend/tests/unit_tests/agent_executor/test_upload.py index e239ef02..e238c3f8 100644 --- a/backend/tests/unit_tests/agent_executor/test_upload.py +++ b/backend/tests/unit_tests/agent_executor/test_upload.py @@ -46,4 +46,5 @@ def test_mimetype_guessing() -> None: "sample.pdf": "application/pdf", "sample.rtf": "application/rtf", "sample.txt": "text/plain", + "sample.md": "text/markdown", } == name_to_mime diff --git a/backend/tests/unit_tests/fixtures/sample.md b/backend/tests/unit_tests/fixtures/sample.md new file mode 100644 index 00000000..884d1821 --- /dev/null +++ b/backend/tests/unit_tests/fixtures/sample.md @@ -0,0 +1,13 @@ +# 🦜️ LangChain + +## Heading 2 + +Some text for heading 2. + +### Heading 3 + +Some text for heading 3. + +#### Heading 4 + +Some text for heading 4. \ No newline at end of file diff --git a/frontend/src/constants.ts b/frontend/src/constants.ts index 9349f7a4..dfdcb8eb 100644 --- a/frontend/src/constants.ts +++ b/frontend/src/constants.ts @@ -27,7 +27,7 @@ export type TYPE_NAME = (typeof TYPES)[keyof typeof TYPES]["id"]; export const DROPZONE_CONFIG = { multiple: true, accept: { - "text/*": [".txt", ".htm", ".html"], + "text/*": [".txt", ".htm", ".html", ".md"], "application/pdf": [".pdf"], "application/vnd.openxmlformats-officedocument.wordprocessingml.document": [ ".docx", From 96d9b09d0eab81f2bcc467902ae73b043af664ea Mon Sep 17 00:00:00 2001 From: "P. 
Taylor Goetz" Date: Sun, 26 May 2024 01:00:29 -0400 Subject: [PATCH 2/3] rename Markdown class --- backend/app/parsing.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/backend/app/parsing.py b/backend/app/parsing.py index a4d07b4a..ac549dc9 100644 --- a/backend/app/parsing.py +++ b/backend/app/parsing.py @@ -12,7 +12,8 @@ from langchain_core.document_loaders.blob_loaders import Blob from langchain_core.documents import Document -class MimeTypeBasedParser(BaseBlobParser): + +class MimeTypeParser(BaseBlobParser): """Parser that uses `mime`-types to parse a blob. This parser is useful for simple pipelines where the mime-type is sufficient @@ -87,7 +88,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty ("#####", "Header 5"), ("######", "Header 6"), ] - splitter = MarkdownHeaderTextSplitter(headers_to_split_on) + splitter = MarkdownHeaderTextSplitter(headers_to_split_on, return_each_line=True) for doc in splitter.split_text(blob.as_string()): yield doc @@ -107,7 +108,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty # PUBLIC API -MIMETYPE_BASED_PARSER = MimeTypeBasedParser( +MIMETYPE_BASED_PARSER = MimeTypeParser( handlers=HANDLERS, fallback_parser=None, ) \ No newline at end of file From 15e2a55f1f8fd0668642ef7b332bd3e87c92b8e3 Mon Sep 17 00:00:00 2001 From: "P. 
Taylor Goetz" Date: Sun, 26 May 2024 01:05:48 -0400 Subject: [PATCH 3/3] fix linting --- backend/app/parsing.py | 9 ++++++--- backend/tests/unit_tests/agent_executor/test_upload.py | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/backend/app/parsing.py b/backend/app/parsing.py index ac549dc9..210cb5f4 100644 --- a/backend/app/parsing.py +++ b/backend/app/parsing.py @@ -60,7 +60,6 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: """Load documents from a blob.""" mimetype = blob.mimetype - if mimetype is None: if blob.path is not None and str(blob.path).endswith(".md"): mimetype = "text/markdown" @@ -75,6 +74,8 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: yield from self.fallback_parser.lazy_parse(blob) else: raise ValueError(f"Unsupported mime type: {mimetype}") + + class MarkdownParser(BaseBlobParser): """Parser for Markdown blobs.""" @@ -88,7 +89,9 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty ("#####", "Header 5"), ("######", "Header 6"), ] - splitter = MarkdownHeaderTextSplitter(headers_to_split_on, return_each_line=True) + splitter = MarkdownHeaderTextSplitter( + headers_to_split_on, return_each_line=True + ) for doc in splitter.split_text(blob.as_string()): yield doc @@ -111,4 +114,4 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty MIMETYPE_BASED_PARSER = MimeTypeParser( handlers=HANDLERS, fallback_parser=None, -) \ No newline at end of file +) diff --git a/backend/tests/unit_tests/agent_executor/test_upload.py b/backend/tests/unit_tests/agent_executor/test_upload.py index e238c3f8..5c4e2992 100644 --- a/backend/tests/unit_tests/agent_executor/test_upload.py +++ b/backend/tests/unit_tests/agent_executor/test_upload.py @@ -1,7 +1,8 @@ from io import BytesIO -from langchain.text_splitter import RecursiveCharacterTextSplitter from fastapi import UploadFile +from langchain.text_splitter import RecursiveCharacterTextSplitter + from app.upload import 
IngestRunnable, 
_guess_mimetype, convert_ingestion_input_to_blob from tests.unit_tests.fixtures import get_sample_paths from tests.unit_tests.utils import InMemoryVectorStore