Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Markdown as a RAG file format #347

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 96 additions & 3 deletions backend/app/parsing.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,105 @@
"""Module contains logic for parsing binary blobs into text."""
from langchain_community.document_loaders.parsers import BS4HTMLParser, PDFMinerParser
from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser
from typing import Iterator, Mapping, Optional

from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_community.document_loaders.parsers import (
BS4HTMLParser,
PDFMinerParser,
)
from langchain_community.document_loaders.parsers.msword import MsWordParser
from langchain_community.document_loaders.parsers.txt import TextParser
from langchain_core.document_loaders import BaseBlobParser
from langchain_core.document_loaders.blob_loaders import Blob
from langchain_core.documents import Document


class MimeTypeParser(BaseBlobParser):
    """Parser that uses `mime`-types to parse a blob.

    This parser is useful for simple pipelines where the mime-type is sufficient
    to determine how to parse a blob.

    To use, configure handlers based on mime-types and pass them to the initializer.

    A blob that carries no mime-type but whose path ends in ``.md`` is treated
    as ``text/markdown``.

    Example:

        .. code-block:: python

            from app.parsing import MimeTypeParser

            parser = MimeTypeParser(
                handlers={
                    "application/pdf": ...,
                },
                fallback_parser=...,
            )
    """

    def __init__(
        self,
        handlers: Mapping[str, BaseBlobParser],
        *,
        fallback_parser: Optional[BaseBlobParser] = None,
    ) -> None:
        """Define a parser that uses mime-types to determine how to parse a blob.

        Args:
            handlers: A mapping from mime-types to the parser that should
                      handle blobs of that mime-type.
            fallback_parser: A fallback parser to use if the mime-type is not
                             found in the handlers. If provided, this parser
                             will be used to parse blobs with all mime-types
                             not found in the handlers.
                             If not provided, a ValueError will be raised if
                             the mime-type is not found in the handlers.
        """
        self.handlers = handlers
        self.fallback_parser = fallback_parser

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Load documents from a blob.

        Args:
            blob: The blob to parse. Its mime-type selects the handler.

        Yields:
            The documents produced by the selected handler, or by the
            fallback parser when no handler is registered for the mime-type.

        Raises:
            ValueError: If the blob has no mime-type and its path does not end
                in ``.md``, or if the mime-type has no handler and no fallback
                parser was configured.
        """
        mimetype = blob.mimetype

        if mimetype is None:
            # `blob.path` is optional (in-memory blobs have none) and may be a
            # plain `str`, so test its string form instead of relying on a
            # `.name` attribute that only path objects provide.
            if blob.path is not None and str(blob.path).endswith(".md"):
                mimetype = "text/markdown"
            else:
                raise ValueError(f"{blob} does not have a mimetype.")

        if mimetype in self.handlers:
            yield from self.handlers[mimetype].lazy_parse(blob)
        elif self.fallback_parser is not None:
            yield from self.fallback_parser.lazy_parse(blob)
        else:
            raise ValueError(f"Unsupported mime type: {mimetype}")


class MarkdownParser(BaseBlobParser):
    """Parser for Markdown blobs.

    The blob's text is split on Markdown headers of levels 1 through 6 using
    `MarkdownHeaderTextSplitter`.
    """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob.

        Args:
            blob: The Markdown blob; its content is read with `as_string()`.

        Yields:
            The documents returned by the header splitter, in order.
        """
        # One entry per Markdown header level: (marker, metadata key).
        headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
            ("####", "Header 4"),
            ("#####", "Header 5"),
            ("######", "Header 6"),
        ]
        splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on, return_each_line=True
        )
        # NOTE(review): the splitter's documents appear not to carry a
        # "source" metadata entry (the tests skip that check for markdown) —
        # confirm whether downstream consumers need one.
        yield from splitter.split_text(blob.as_string())


HANDLERS = {
"application/pdf": PDFMinerParser(),
"text/plain": TextParser(),
"text/markdown": MarkdownParser(),
"text/html": BS4HTMLParser(),
"application/msword": MsWordParser(),
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": (
Expand All @@ -18,7 +111,7 @@

# PUBLIC API

MIMETYPE_BASED_PARSER = MimeTypeBasedParser(
MIMETYPE_BASED_PARSER = MimeTypeParser(
handlers=HANDLERS,
fallback_parser=None,
)
4 changes: 4 additions & 0 deletions backend/app/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,12 @@
def _guess_mimetype(file_name: str, file_bytes: bytes) -> str:
"""Guess the mime-type of a file based on its name or bytes."""
# Guess based on the file extension

mime_type, _ = mimetypes.guess_type(file_name)

if file_name.endswith(".md"):
return "text/markdown"

# Return detected mime type from mimetypes guess, unless it's None
if mime_type:
return mime_type
Expand Down
16 changes: 11 additions & 5 deletions backend/tests/unit_tests/agent_executor/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def test_list_of_supported_mimetypes() -> None:
"application/pdf",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"text/html",
"text/markdown",
"text/plain",
]

Expand All @@ -23,17 +24,22 @@ def test_attempt_to_parse_each_fixture() -> None:
seen_mimetypes = set()
for path in get_sample_paths():
type_, _ = mimetypes.guess_type(path)
if path.name.endswith(".md"):
type_ = "text/markdown"
if type_ not in SUPPORTED_MIMETYPES:
continue
seen_mimetypes.add(type_)
blob = Blob.from_path(path)
documents = MIMETYPE_BASED_PARSER.parse(blob)
try:
assert len(documents) == 1
doc = documents[0]
assert "source" in doc.metadata
assert doc.metadata["source"] == str(path)
assert "🦜" in doc.page_content
if type_ == "text/markdown":
assert len(documents) >= 1
else:
assert len(documents) == 1
doc = documents[0]
assert "source" in doc.metadata
assert doc.metadata["source"] == str(path)
assert "🦜" in doc.page_content
except Exception as e:
raise AssertionError(f"Failed to parse {path}") from e

Expand Down
4 changes: 3 additions & 1 deletion backend/tests/unit_tests/agent_executor/test_upload.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from io import BytesIO

from langchain.text_splitter import RecursiveCharacterTextSplitter
from fastapi import UploadFile
from langchain.text_splitter import RecursiveCharacterTextSplitter

from app.upload import IngestRunnable, _guess_mimetype, convert_ingestion_input_to_blob
from tests.unit_tests.fixtures import get_sample_paths
from tests.unit_tests.utils import InMemoryVectorStore
Expand Down Expand Up @@ -46,4 +47,5 @@ def test_mimetype_guessing() -> None:
"sample.pdf": "application/pdf",
"sample.rtf": "application/rtf",
"sample.txt": "text/plain",
"sample.md": "text/markdown",
} == name_to_mime
13 changes: 13 additions & 0 deletions backend/tests/unit_tests/fixtures/sample.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# 🦜️ LangChain

## Heading 2

Some text for heading 2.

### Heading 3

Some text for heading 3.

#### Heading 4

Some text for heading 4.
2 changes: 1 addition & 1 deletion frontend/src/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ export type TYPE_NAME = (typeof TYPES)[keyof typeof TYPES]["id"];
export const DROPZONE_CONFIG = {
multiple: true,
accept: {
"text/*": [".txt", ".htm", ".html"],
"text/*": [".txt", ".htm", ".html", ".md"],
"application/pdf": [".pdf"],
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": [
".docx",
Expand Down
Loading