Skip to content

Commit

Permalink
lint
Browse files (browse the repository at this point in the history)
  • Loading branch information
mkorpela committed May 3, 2024
1 parent 8d56788 commit 22dce38
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 19 deletions.
2 changes: 1 addition & 1 deletion backend/app/server.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -11,7 +11,7 @@
from app.api import router as api_router
from app.auth.handlers import AuthedUser
from app.lifespan import lifespan
from app.upload import ingest_runnable, convert_ingestion_input_to_blob
from app.upload import convert_ingestion_input_to_blob, ingest_runnable

logger = logging.getLogger(__name__)

Expand Down
38 changes: 20 additions & 18 deletions backend/app/upload.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -11,8 +11,8 @@

import mimetypes
import os

from typing import BinaryIO, List, Optional

from fastapi import UploadFile
from langchain_community.vectorstores.pgvector import PGVector
from langchain_core.document_loaders.blob_loaders import Blob
Expand All @@ -39,26 +39,30 @@ def _guess_mimetype(file_name: str, file_bytes: bytes) -> str:
return mime_type

# Signature-based detection for common types
if file_bytes.startswith(b'%PDF'):
return 'application/pdf'
elif file_bytes.startswith((b'\x50\x4B\x03\x04', b'\x50\x4B\x05\x06', b'\x50\x4B\x07\x08')):
return 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
elif file_bytes.startswith(b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'):
return 'application/msword'
elif file_bytes.startswith(b'\x09\x00\xff\x00\x06\x00'):
return 'application/vnd.ms-excel'
if file_bytes.startswith(b"%PDF"):
return "application/pdf"
elif file_bytes.startswith(
(b"\x50\x4B\x03\x04", b"\x50\x4B\x05\x06", b"\x50\x4B\x07\x08")
):
return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
elif file_bytes.startswith(b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"):
return "application/msword"
elif file_bytes.startswith(b"\x09\x00\xff\x00\x06\x00"):
return "application/vnd.ms-excel"

# Check for CSV-like plain text content (commas, tabs, newlines)
try:
decoded = file_bytes[:1024].decode('utf-8', errors='ignore')
if all(char in decoded for char in (',', '\n')) or all(char in decoded for char in ('\t', '\n')):
return 'text/csv'
elif decoded.isprintable() or decoded == '':
return 'text/plain'
decoded = file_bytes[:1024].decode("utf-8", errors="ignore")
if all(char in decoded for char in (",", "\n")) or all(
char in decoded for char in ("\t", "\n")
):
return "text/csv"
elif decoded.isprintable() or decoded == "":
return "text/plain"
except UnicodeDecodeError:
pass

return 'application/octet-stream'
return "application/octet-stream"


def convert_ingestion_input_to_blob(file: UploadFile) -> Blob:
Expand Down Expand Up @@ -129,9 +133,7 @@ def namespace(self) -> str:
)
return self.assistant_id if self.assistant_id is not None else self.thread_id

def invoke(
self, blob: Blob, config: Optional[RunnableConfig] = None
) -> List[str]:
def invoke(self, blob: Blob, config: Optional[RunnableConfig] = None) -> List[str]:
out = ingest_blob(
blob,
MIMETYPE_BASED_PARSER,
Expand Down

0 comments on commit 22dce38

Please sign in to comment.