Skip to content

Commit

Permalink
Merge simjak changes
Browse files (browse the repository at this point in the history)
  • Loading branch information
homanp committed Feb 12, 2024
1 parent d7bfafc commit b1817cc
Show file tree
Hide file tree
Showing 13 changed files with 285 additions and 115 deletions.
10 changes: 7 additions & 3 deletions api/delete.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,18 @@

from models.delete import RequestPayload, ResponsePayload
from service.vector_database import VectorService, get_vector_service
from service.embedding import get_encoder

router = APIRouter()


@router.delete("/delete", response_model=ResponsePayload)
async def delete(payload: RequestPayload):
encoder = get_encoder(encoder_type=payload.encoder)
vector_service: VectorService = get_vector_service(
index_name=payload.index_name, credentials=payload.vector_database
index_name=payload.index_name,
credentials=payload.vector_database,
encoder=encoder,
)
await vector_service.delete(file_url=payload.file_url)
return {"success": True}
data = await vector_service.delete(file_url=payload.file_url)
return {"success": True, "data": data}
6 changes: 3 additions & 3 deletions api/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@ async def ingest(payload: RequestPayload) -> Dict:
index_name=payload.index_name,
vector_credentials=payload.vector_database,
)
documents = await embedding_service.generate_documents()
chunks = await embedding_service.generate_chunks()
encoder = get_encoder(encoder_type=payload.encoder)
summary_documents = await embedding_service.generate_summary_documents(
documents=documents
documents=chunks
)

await asyncio.gather(
embedding_service.generate_embeddings(
documents=documents, encoder=encoder, index_name=payload.index_name
documents=chunks, encoder=encoder, index_name=payload.index_name
),
embedding_service.generate_embeddings(
documents=summary_documents,
Expand Down
12 changes: 9 additions & 3 deletions api/query.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
from fastapi import APIRouter

from models.query import RequestPayload, ResponsePayload
from models.query import RequestPayload, ResponseData, ResponsePayload
from service.router import query as _query

router = APIRouter()


@router.post("/query", response_model=ResponsePayload)
async def query(payload: RequestPayload):
output = await _query(payload=payload)
return {"success": True, "data": output}
chunks = await _query(payload=payload)
response_data = [
ResponseData(
content=chunk.content, doc_url=chunk.doc_url, page_label=chunk.page_number
)
for chunk in chunks
]
return {"success": True, "data": response_data}
12 changes: 6 additions & 6 deletions dev/embedding.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"\n",
"file = File(\n",
" type=FileType.pdf,\n",
" url=\"https://arxiv.org/pdf/2402.05131.pdf\"\n",
" url=\"https://arxiv.org/pdf/2210.03629.pdf\"\n",
")\n",
"vector_credentials = {\n",
" \"type\": \"pinecone\",\n",
Expand All @@ -40,7 +40,7 @@
"metadata": {},
"outputs": [],
"source": [
"docs = await embedding_service.generate_documents()"
"docs = await embedding_service.generate_chunks()"
]
},
{
Expand All @@ -49,7 +49,7 @@
"metadata": {},
"outputs": [],
"source": [
"chunks = await embedding_service.generate_chunks(docs)"
"texts = [doc.content for doc in docs]"
]
},
{
Expand All @@ -62,11 +62,11 @@
"\n",
"concatenated_document = \"\"\n",
"\n",
"for i, chunk in enumerate(chunks):\n",
"for i, chunk in enumerate(texts):\n",
" color = colors[i % len(colors)]\n",
" colored_text = colored(chunk.text, color)\n",
" colored_text = colored(chunk, color)\n",
" print(colored_text)\n",
" concatenated_document += chunk.text + \" \"\n",
" concatenated_document += chunk + \" \"\n",
"\n",
"print(\"\\nConcatenated Document:\\n\", concatenated_document)"
]
Expand Down
23 changes: 17 additions & 6 deletions dev/walkthrough.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
" \"files\": [\n",
" {\n",
" \"type\": \"PDF\",\n",
" \"url\": \"https://arxiv.org/pdf/2402.05131.pdf\"\n",
" \"url\": \"https://arxiv.org/pdf/2210.03629.pdf\"\n",
" }\n",
" ],\n",
" \"vector_database\": {\n",
Expand All @@ -46,7 +46,7 @@
" }\n",
" },\n",
" \"index_name\": PINECONE_INDEX,\n",
" \"encoder\": \"openai\",\n",
" \"encoder\": \"cohere\",\n",
"}\n",
"\n",
"response = requests.post(url, json=payload)\n",
Expand All @@ -64,7 +64,7 @@
"query_url = f\"{API_URL}/api/v1/query\"\n",
"\n",
"query_payload = {\n",
" \"input\": \"What is the best chunk strategy?\",\n",
" \"input\": \"What is CoT?\",\n",
" \"vector_database\": {\n",
" \"type\": \"pinecone\",\n",
" \"config\": {\n",
Expand All @@ -73,12 +73,22 @@
" }\n",
" },\n",
" \"index_name\": PINECONE_INDEX,\n",
" \"encoder\": \"openai\",\n",
" \"encoder\": \"cohere\",\n",
"}\n",
"\n",
"query_response = requests.post(query_url, json=query_payload)\n",
"\n",
"print(query_response.json())\n"
"print(query_response.json())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = query_response.json().get('data', [])\n",
"data"
]
},
{
Expand All @@ -91,7 +101,7 @@
"query_url = f\"{API_URL}/api/v1/delete\"\n",
"\n",
"delete_payload = {\n",
" \"file_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"file_url\": \"https://arxiv.org/pdf/2210.03629.pdf\",\n",
" \"vector_database\": {\n",
" \"type\": \"pinecone\",\n",
" \"config\": {\n",
Expand All @@ -100,6 +110,7 @@
" }\n",
" },\n",
" \"index_name\": PINECONE_INDEX,\n",
" \"encoder\": \"cohere\",\n",
"}\n",
"\n",
"delete_response = requests.delete(query_url, json=delete_payload)\n",
Expand Down
1 change: 1 addition & 0 deletions encoders/cohere.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
class CohereEncoder(BaseEncoder):
client: Optional[cohere.Client] = None
type: str = "cohere"
dimension: int = 1024 # https://docs.cohere.com/reference/embed

def __init__(
self,
Expand Down
3 changes: 3 additions & 0 deletions models/delete.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pydantic import BaseModel
from models.ingest import EncoderEnum

from models.vector_database import VectorDatabase

Expand All @@ -7,7 +8,9 @@ class RequestPayload(BaseModel):
index_name: str
file_url: str
vector_database: VectorDatabase
encoder: EncoderEnum


class ResponsePayload(BaseModel):
success: bool
data: dict = {}
12 changes: 9 additions & 3 deletions models/document.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
from pydantic import BaseModel


class Document(BaseModel):
class BaseDocument(BaseModel):
id: str
text: str
file_url: str
content: str
doc_url: str
metadata: dict | None = None


class BaseDocumentChunk(BaseDocument):
document_id: str
page_number: str = ""
dense_embedding: list[float] | None = None
2 changes: 1 addition & 1 deletion models/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class RequestPayload(BaseModel):

class ResponseData(BaseModel):
content: str
file_url: str
doc_url: str
page_label: Optional[str]


Expand Down
Loading

0 comments on commit b1817cc

Please sign in to comment.