Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add file parsing based on mimetype #14

Merged
merged 11 commits into from
Nov 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/app/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from fastapi import FastAPI, Form, Request, UploadFile
from fastapi.staticfiles import StaticFiles
from gizmo_agent import agent, ingest_runnable
from langserve import add_routes
from langchain.schema.runnable import RunnableConfig
from langserve import add_routes

from app.storage import (
get_thread_messages,
Expand Down
51 changes: 51 additions & 0 deletions backend/packages/agent-executor/agent_executor/ingest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""Code to ingest blob into a vectorstore.

Code is responsible for taking binary data, parsing it and then indexing it
into a vector store.

This code should be agnostic to how the blob got generated; i.e., it does not
know about server/uploading etc.
"""
from typing import List

from langchain.document_loaders import Blob
from langchain.document_loaders.base import BaseBlobParser
from langchain.schema import Document
from langchain.schema.vectorstore import VectorStore
from langchain.text_splitter import TextSplitter


def _update_document_metadata(document: Document, namespace: str) -> None:
"""Mutation in place that adds a namespace to the document metadata."""
document.metadata["namespace"] = namespace


# PUBLIC API


def ingest_blob(
    blob: Blob,
    parser: BaseBlobParser,
    text_splitter: TextSplitter,
    vectorstore: VectorStore,
    namespace: str,
    *,
    batch_size: int = 100,
) -> List[str]:
    """Parse *blob*, split it into chunks, and index them into *vectorstore*.

    Every chunk is tagged with *namespace* in its metadata. Writes happen in
    batches: a flush is triggered whenever at least ``batch_size`` chunks have
    accumulated, plus one final flush for any remainder.

    Returns the ids of all documents added to the vectorstore.
    """
    indexed_ids: List[str] = []
    pending = []
    for parsed_doc in parser.lazy_parse(blob):
        for chunk in text_splitter.split_documents([parsed_doc]):
            # Tag the chunk in place with the namespace it belongs to.
            chunk.metadata["namespace"] = namespace
            pending.append(chunk)

        # Flush after each parsed document once enough chunks accumulated.
        if len(pending) >= batch_size:
            indexed_ids.extend(vectorstore.add_documents(pending))
            pending = []

    # Final flush for whatever is left after the last parsed document.
    if pending:
        indexed_ids.extend(vectorstore.add_documents(pending))

    return indexed_ids
24 changes: 24 additions & 0 deletions backend/packages/agent-executor/agent_executor/parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Module contains logic for parsing binary blobs into text."""
from langchain.document_loaders.parsers import BS4HTMLParser, PDFMinerParser
from langchain.document_loaders.parsers.generic import MimeTypeBasedParser
from langchain.document_loaders.parsers.msword import MsWordParser
from langchain.document_loaders.parsers.txt import TextParser

# Mapping of mime-type -> parser instance used to turn binary blobs into text.
# Both classic .doc ("application/msword") and OOXML .docx are routed to
# MsWordParser.
HANDLERS = {
    "application/pdf": PDFMinerParser(),
    "text/plain": TextParser(),
    "text/html": BS4HTMLParser(),
    "application/msword": MsWordParser(),
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": (
        MsWordParser()
    ),
}

# Sorted list of the mime-types this module can parse; sorted so tests and
# error messages see a stable order.
SUPPORTED_MIMETYPES = sorted(HANDLERS.keys())

# PUBLIC API

# Parser that dispatches each blob to the handler matching its mime-type.
# NOTE(review): with fallback_parser=None, unsupported mime-types presumably
# raise rather than being parsed best-effort — confirm against the
# MimeTypeBasedParser documentation.
MIMETYPE_BASED_PARSER = MimeTypeBasedParser(
    handlers=HANDLERS,
    fallback_parser=None,
)
60 changes: 54 additions & 6 deletions backend/packages/agent-executor/agent_executor/upload.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,49 @@
"""API to deal with file uploads via a runnable.

For now this code assumes that the content is a base64 encoded string.

The details here might change in the future.

For the time being, upload and ingestion are coupled
"""
from __future__ import annotations

from typing import Any, BinaryIO, List, Optional

from langchain.document_loaders.blob_loaders.schema import Blob
from langchain.schema.runnable import RunnableConfig, RunnableSerializable
from langchain.schema.vectorstore import VectorStore
from langchain.text_splitter import TextSplitter

from agent_executor.ingest import ingest_blob
from agent_executor.parsing import MIMETYPE_BASED_PARSER


def _guess_mimetype(file_bytes: bytes) -> str:
"""Guess the mime-type of a file."""
try:
import magic
except ImportError:
raise ImportError(
"magic package not found, please install it with `pip install python-magic`"
)

mime = magic.Magic(mime=True)
mime_type = mime.from_buffer(file_bytes)
return mime_type


def _convert_ingestion_input_to_blob(data: BinaryIO) -> Blob:
    """Read the whole stream and wrap it in a mime-typed Blob.

    NOTE(review): assumes the stream exposes a ``name`` attribute (true for
    uploaded files and named file objects) — confirm for all callers.
    """
    payload = data.read()
    return Blob.from_data(
        data=payload,
        path=data.name,
        mime_type=_guess_mimetype(payload),
    )


class IngestRunnable(RunnableSerializable[BinaryIO, List[str]]):
text_splitter: TextSplitter
Expand Down Expand Up @@ -33,9 +73,17 @@ def batch(
return_exceptions: bool = False,
**kwargs: Any | None,
) -> List:
docs = self.text_splitter.create_documents(
# TODO change this line to accept binary formats
[part.read().decode() for part in inputs],
[{"namespace": self.namespace}],
)
return self.vectorstore.add_documents(docs)
"""Ingest a batch of files into the vectorstore."""
ids = []
for data in inputs:
blob = _convert_ingestion_input_to_blob(data)
ids.extend(
ingest_blob(
blob,
MIMETYPE_BASED_PARSER,
self.text_splitter,
self.vectorstore,
self.namespace,
)
)
return ids
1 change: 0 additions & 1 deletion backend/packages/gizmo-agent/gizmo_agent/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
ingest_runnable = IngestRunnable(
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200),
vectorstore=vstore,
input_key="file_contents",
).configurable_fields(
assistant_id=ConfigurableField(
id="assistant_id",
Expand Down
4 changes: 4 additions & 0 deletions backend/packages/gizmo-agent/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ openai = ">=0.5.0,<1.0"
anthropic = "^0.3.11"
langchain-experimental = "^0.0.37"
duckduckgo-search = "^3.9.4"
python-magic = "^0.4.27"
bs4 = "^0.0.1"
unstructured = {extras = ["doc", "docx"], version = "^0.10.29"}
pdfminer-six = "^20221105"

[tool.poetry.group.dev.dependencies]
langchain-cli = ">=0.0.15"
Expand Down
18 changes: 0 additions & 18 deletions backend/tests/unit_tests/agent_executor/test_ingestion.py

This file was deleted.

41 changes: 41 additions & 0 deletions backend/tests/unit_tests/agent_executor/test_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Test parsing logic."""
import mimetypes

from agent_executor.parsing import MIMETYPE_BASED_PARSER, SUPPORTED_MIMETYPES
from langchain.document_loaders import Blob

from tests.unit_tests.fixtures import get_sample_paths


def test_list_of_supported_mimetypes() -> None:
    """Pin the exact supported mime-type list to catch typos; it should grow over time."""
    expected = [
        "application/msword",
        "application/pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "text/html",
        "text/plain",
    ]
    assert SUPPORTED_MIMETYPES == expected


def test_attempt_to_parse_each_fixture() -> None:
    """Parse every fixture whose mime-type is supported and check the output."""
    covered_mimetypes = set()
    for path in get_sample_paths():
        mime, _ = mimetypes.guess_type(path)
        if mime not in SUPPORTED_MIMETYPES:
            continue
        covered_mimetypes.add(mime)
        parsed = MIMETYPE_BASED_PARSER.parse(Blob.from_path(path))
        try:
            # Each fixture should yield exactly one document whose source is
            # the fixture path and whose text contains the parrot marker.
            assert len(parsed) == 1
            document = parsed[0]
            assert "source" in document.metadata
            assert document.metadata["source"] == str(path)
            assert "🦜" in document.page_content
        except Exception as e:
            raise AssertionError(f"Failed to parse {path}") from e

    # No .doc fixture exists yet, so application/msword is expected to be absent.
    known_missing = {"application/msword"}
    assert set(SUPPORTED_MIMETYPES) - known_missing == covered_mimetypes
43 changes: 43 additions & 0 deletions backend/tests/unit_tests/agent_executor/test_upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from io import BytesIO

from agent_executor.upload import IngestRunnable, _guess_mimetype
from langchain.text_splitter import RecursiveCharacterTextSplitter

from tests.unit_tests.fixtures import get_sample_paths
from tests.unit_tests.utils import InMemoryVectorStore


def test_ingestion_runnable() -> None:
    """A single uploaded file should be ingested and yield exactly one id.

    Fix: drop the stale ``input_key="file_contents"`` argument — this PR
    removes ``input_key`` from ``IngestRunnable`` construction elsewhere
    (see gizmo_agent/ingest.py), so the test should not keep passing it.
    """
    vectorstore = InMemoryVectorStore()
    splitter = RecursiveCharacterTextSplitter()
    runnable = IngestRunnable(
        text_splitter=splitter,
        vectorstore=vectorstore,
        assistant_id="TheParrot",
    )
    # b"test" is sniffed as text/plain, which is a supported mime-type.
    data = BytesIO(b"test")
    data.name = "filename"
    ids = runnable.invoke(data)
    assert len(ids) == 1


def test_mimetype_guessing() -> None:
    """Verify mimetype guessing for all fixtures."""
    name_to_mime = {
        path.name: _guess_mimetype(path.read_bytes())
        for path in sorted(get_sample_paths())
    }

    expected = {
        "sample.docx": (
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ),
        "sample.epub": "application/epub+zip",
        "sample.html": "text/html",
        "sample.odt": "application/vnd.oasis.opendocument.text",
        "sample.pdf": "application/pdf",
        "sample.rtf": "text/rtf",
        "sample.txt": "text/plain",
    }
    assert expected == name_to_mime
11 changes: 11 additions & 0 deletions backend/tests/unit_tests/fixtures/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from pathlib import Path
from typing import List

# Directory that holds this package's fixture files.
HERE = Path(__file__).parent

# PUBLIC API


def get_sample_paths() -> List[Path]:
    """Return the path of every ``sample.*`` fixture bundled with the tests."""
    return [*HERE.glob("sample.*")]
Binary file added backend/tests/unit_tests/fixtures/sample.docx
Binary file not shown.
Binary file added backend/tests/unit_tests/fixtures/sample.epub
Binary file not shown.
1 change: 1 addition & 0 deletions backend/tests/unit_tests/fixtures/sample.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<html><head><meta content="text/html; charset=UTF-8" http-equiv="content-type"><style type="text/css">.lst-kix_n6n0tzfwn8i8-5>li:before{content:"\0025a0 "}.lst-kix_n6n0tzfwn8i8-6>li:before{content:"\0025cf "}ul.lst-kix_n6n0tzfwn8i8-8{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-7{list-style-type:none}.lst-kix_n6n0tzfwn8i8-3>li:before{content:"\0025cf "}.lst-kix_n6n0tzfwn8i8-4>li:before{content:"\0025cb "}.lst-kix_n6n0tzfwn8i8-7>li:before{content:"\0025cb "}.lst-kix_n6n0tzfwn8i8-8>li:before{content:"\0025a0 "}.lst-kix_n6n0tzfwn8i8-1>li:before{content:"\0025cb "}.lst-kix_n6n0tzfwn8i8-2>li:before{content:"\0025a0 "}li.li-bullet-0:before{margin-left:-18pt;white-space:nowrap;display:inline-block;min-width:18pt}.lst-kix_n6n0tzfwn8i8-0>li:before{content:"\0025cf "}ul.lst-kix_n6n0tzfwn8i8-2{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-1{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-0{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-6{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-5{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-4{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-3{list-style-type:none}ol{margin:0;padding:0}table td,table th{padding:0}.c6{border-right-style:solid;padding:5pt 5pt 5pt 
5pt;border-bottom-color:#000000;border-top-width:1pt;border-right-width:1pt;border-left-color:#000000;vertical-align:top;border-right-color:#000000;border-left-width:1pt;border-top-style:solid;border-left-style:solid;border-bottom-width:1pt;width:156pt;border-top-color:#000000;border-bottom-style:solid}.c0{-webkit-text-decoration-skip:none;color:#000000;font-weight:400;text-decoration:underline;vertical-align:baseline;text-decoration-skip-ink:none;font-size:11pt;font-family:"Arial";font-style:normal}.c4{padding-top:0pt;padding-bottom:0pt;line-height:1.0;orphans:2;widows:2;text-align:left;height:11pt}.c11{color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:"Arial";font-style:italic}.c3{color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:"Arial";font-style:normal}.c12{color:#000000;font-weight:700;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:"Arial";font-style:normal}.c7{padding-top:0pt;padding-bottom:0pt;line-height:1.0;orphans:2;widows:2;text-align:left}.c1{padding-top:0pt;padding-bottom:0pt;line-height:1.15;orphans:2;widows:2;text-align:left}.c8{text-decoration-skip-ink:none;-webkit-text-decoration-skip:none;color:#1155cc;text-decoration:underline}.c14{border-spacing:0;border-collapse:collapse;margin-right:auto}.c13{background-color:#ffffff;max-width:468pt;padding:72pt 72pt 72pt 
72pt}.c15{padding:0;margin:0}.c10{margin-left:36pt;padding-left:0pt}.c5{color:inherit;text-decoration:inherit}.c9{height:11pt}.c2{height:0pt}.title{padding-top:0pt;color:#000000;font-size:26pt;padding-bottom:3pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}.subtitle{padding-top:0pt;color:#666666;font-size:15pt;padding-bottom:16pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}li{color:#000000;font-size:11pt;font-family:"Arial"}p{margin:0;color:#000000;font-size:11pt;font-family:"Arial"}h1{padding-top:20pt;color:#000000;font-size:20pt;padding-bottom:6pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h2{padding-top:18pt;color:#000000;font-size:16pt;padding-bottom:6pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h3{padding-top:16pt;color:#434343;font-size:14pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h4{padding-top:14pt;color:#666666;font-size:12pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h5{padding-top:12pt;color:#666666;font-size:11pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h6{padding-top:12pt;color:#666666;font-size:11pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;font-style:italic;orphans:2;widows:2;text-align:left}</style></head><body class="c13 doc-content"><p class="c1"><span class="c3">🦜️ LangChain</span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1"><span class="c0">Underline</span></p><p class="c1 c9"><span class="c0"></span></p><p class="c1"><span class="c12">Bold</span></p><p class="c1 c9"><span class="c12"></span></p><p class="c1"><span 
class="c11">Italics</span></p><p class="c1 c9"><span class="c11"></span></p><p class="c1 c9"><span class="c11"></span></p><a id="t.e89270b97fc18eabe5c666cba79cd82cff5b5c3d"></a><a id="t.0"></a><table class="c14"><tbody><tr class="c2"><td class="c6" colspan="1" rowspan="1"><p class="c4"><span class="c12"></span></p></td><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c12">Col 1</span></p></td><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c12">Col 2</span></p></td></tr><tr class="c2"><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c12">Row 1</span></p></td><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c3">1</span></p></td><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c3">2</span></p></td></tr><tr class="c2"><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c12">Row 2</span></p></td><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c3">3</span></p></td><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c3">4</span></p></td></tr></tbody></table><p class="c1 c9"><span class="c3"></span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1"><span>Link: </span><span class="c8"><a class="c5" href="https://www.google.com/url?q=https://www.langchain.com/&amp;sa=D&amp;source=editors&amp;ust=1699572948600868&amp;usg=AOvVaw2T4jvAmPuMvcyed6PrEjq1">https://www.langchain.com/</a></span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1 c9"><span class="c3"></span></p><ul class="c15 lst-kix_n6n0tzfwn8i8-0 start"><li class="c1 c10 li-bullet-0"><span class="c3">Item 1</span></li><li class="c1 c10 li-bullet-0"><span class="c3">Item 2</span></li><li class="c1 c10 li-bullet-0"><span class="c3">Item 3</span></li><li class="c1 c10 li-bullet-0"><span class="c3">We also love cats 🐱</span></li></ul><p class="c1 c9"><span class="c3"></span></p><p class="c1"><span class="c3">Image</span></p><p class="c1 c9"><span 
class="c3"></span></p><p class="c1"><span style="overflow: hidden; display: inline-block; margin: 0.00px 0.00px; border: 0.00px solid #000000; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px); width: 624.00px; height: 132.00px;"><img alt="" src="sample_files/image1.png" style="width: 624.00px; height: 132.00px; margin-left: 0.00px; margin-top: 0.00px; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px);" title=""></span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1 c9"><span class="c3"></span></p></body></html>
Binary file added backend/tests/unit_tests/fixtures/sample.odt
Binary file not shown.
Binary file added backend/tests/unit_tests/fixtures/sample.pdf
Binary file not shown.
Loading