From 0accea1ec3d876a7126c7e007ae621b4fe6f1943 Mon Sep 17 00:00:00 2001
From: Shamsuddin Ahmed
Date: Tue, 27 Feb 2024 00:43:15 +0600
Subject: [PATCH 1/5] add httpx proxy client for embedded model

---
 backend/app/upload.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/backend/app/upload.py b/backend/app/upload.py
index 87168003..ce1dd5e4 100644
--- a/backend/app/upload.py
+++ b/backend/app/upload.py
@@ -9,6 +9,7 @@
 from __future__ import annotations
 
 import os
+import httpx
 from typing import Any, BinaryIO, List, Optional
 
 from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
@@ -106,10 +107,17 @@ def batch(
 index_schema = {
     "tag": [{"name": "namespace"}],
 }
+
+proxy_url = os.getenv("PROXY_URL")
+if proxy_url is not None and proxy_url != "":
+    http_client = httpx.Client(proxies=proxy_url)
+else:
+    http_client = None
+
 vstore = Redis(
     redis_url=os.environ["REDIS_URL"],
     index_name="opengpts",
-    embedding=OpenAIEmbeddings(),
+    embedding=OpenAIEmbeddings(http_client=http_client),
     index_schema=index_schema,
 )

From 735f6e17ad7182d504dbd540f2b327ebc19f75ee Mon Sep 17 00:00:00 2001
From: Shamsuddin Ahmed
Date: Thu, 29 Feb 2024 12:56:43 +0600
Subject: [PATCH 2/5] fix proxy validation

---
 backend/app/upload.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/backend/app/upload.py b/backend/app/upload.py
index ce1dd5e4..277a0e94 100644
--- a/backend/app/upload.py
+++ b/backend/app/upload.py
@@ -10,6 +10,8 @@
 
 import os
 import httpx
+import logging
+
 from typing import Any, BinaryIO, List, Optional
 
 from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
@@ -22,10 +24,14 @@
 )
 from langchain_core.vectorstores import VectorStore
 from langchain_openai import OpenAIEmbeddings
+from urllib.parse import urlparse
+
 
 from app.ingest import ingest_blob
 from app.parsing import MIMETYPE_BASED_PARSER
 
+logger = logging.getLogger(__name__)
+
 
 def _guess_mimetype(file_bytes: bytes) -> str:
     """Guess the mime-type of a file."""
@@ -110,7 +116,12 @@
 
 proxy_url = os.getenv("PROXY_URL")
 if proxy_url is not None and proxy_url != "":
-    http_client = httpx.Client(proxies=proxy_url)
+    parsed_url = urlparse(proxy_url)
+    if parsed_url.scheme and parsed_url.netloc:
+        http_client = httpx.Client(proxies=proxy_url)
+    else:
+        http_client = None
+        logger.warn("Invalid proxy URL provided. Proceeding without proxy.")
 else:
     http_client = None
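
The validation added in PATCH 2/5 only checks that the value of PROXY_URL parses into a URL scheme plus a network location; anything else falls back to running without a proxy. A standalone sketch of what that check accepts and rejects, using made-up placeholder addresses and a helper name that exists only in this example:

    from urllib.parse import urlparse

    def looks_like_proxy_url(value: str) -> bool:
        # Same test as the patch: both a scheme and a host part must be present.
        parsed = urlparse(value)
        return bool(parsed.scheme and parsed.netloc)

    print(looks_like_proxy_url("http://proxy.local:3128"))           # True  -> proxy is used
    print(looks_like_proxy_url("http://user:pass@proxy.local:443"))  # True  -> credentials are allowed
    print(looks_like_proxy_url("proxy.local:3128"))                  # False -> missing "//", so netloc is empty
    print(looks_like_proxy_url(""))                                  # False -> empty value
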
Proceeding without proxy.") else: http_client = None From a0da3bccdc0a7b5766bdf63c1254440e8cd8f218 Mon Sep 17 00:00:00 2001 From: Shamsuddin Ahmed Date: Thu, 21 Mar 2024 13:21:18 +0600 Subject: [PATCH 3/5] fix proxy in embedded http client resolve conflict with #241 --- backend/app/upload.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/backend/app/upload.py b/backend/app/upload.py index ce32f74f..3e53b2a7 100644 --- a/backend/app/upload.py +++ b/backend/app/upload.py @@ -9,6 +9,8 @@ from __future__ import annotations import os +import httpx +import logging from typing import Any, BinaryIO, List, Optional from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter @@ -21,10 +23,13 @@ ) from langchain_core.vectorstores import VectorStore from langchain_openai import OpenAIEmbeddings +from urllib.parse import urlparse from app.ingest import ingest_blob from app.parsing import MIMETYPE_BASED_PARSER +logger = logging.getLogger(__name__) + def _guess_mimetype(file_bytes: bytes) -> str: """Guess the mime-type of a file.""" @@ -52,6 +57,27 @@ def _convert_ingestion_input_to_blob(data: BinaryIO) -> Blob: ) +def _get_http_client() -> httpx.Client: + """Create and return a httpx.Client instance, configured with a proxy if available and valid. + + The method checks for a PROXY_URL environment variable. If a valid proxy URL is found, + the client is configured to use this proxy. Otherwise, a default client is returned. + + Returns: + An httpx.Client instance configured with or without a proxy based on the environment configuration. + """ + proxy_url = os.getenv("PROXY_URL") + if proxy_url: + parsed_url = urlparse(proxy_url) + if parsed_url.scheme and parsed_url.netloc: + return httpx.Client(proxies=proxy_url) + else: + logger.warning("Invalid proxy URL provided. 
Proceeding without proxy.") + + # Return a default client if no valid proxy URL is set + return httpx.Client() + + class IngestRunnable(RunnableSerializable[BinaryIO, List[str]]): """Runnable for ingesting files into a vectorstore.""" @@ -118,7 +144,7 @@ def batch( ) vstore = PGVector( connection_string=PG_CONNECTION_STRING, - embedding_function=OpenAIEmbeddings(), + embedding_function=OpenAIEmbeddings(http_client=_get_http_client()), ) From 21a5126c63ea866f0f2031de56b0a169e0898184 Mon Sep 17 00:00:00 2001 From: Shamsuddin Ahmed Date: Thu, 21 Mar 2024 16:39:40 +0600 Subject: [PATCH 4/5] fix conflict with #242 --- backend/app/upload.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/backend/app/upload.py b/backend/app/upload.py index e4adc814..572b909c 100644 --- a/backend/app/upload.py +++ b/backend/app/upload.py @@ -9,7 +9,10 @@ from __future__ import annotations import os +import httpx +import logging from typing import Any, BinaryIO, List, Optional +from urllib.parse import urlparse from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter from langchain_community.document_loaders.blob_loaders.schema import Blob @@ -22,9 +25,12 @@ from langchain_core.vectorstores import VectorStore from langchain_openai import OpenAIEmbeddings + from app.ingest import ingest_blob from app.parsing import MIMETYPE_BASED_PARSER +logger = logging.getLogger(__name__) + def _guess_mimetype(file_bytes: bytes) -> str: """Guess the mime-type of a file.""" @@ -52,6 +58,27 @@ def _convert_ingestion_input_to_blob(data: BinaryIO) -> Blob: ) +def _get_http_client() -> httpx.Client: + """Create and return a httpx.Client instance, configured with a proxy if available and valid. + + The method checks for a PROXY_URL environment variable. If a valid proxy URL is found, + the client is configured to use this proxy. Otherwise, a default client is returned. + + Returns: + An httpx.Client instance configured with or without a proxy based on the environment configuration. + """ + proxy_url = os.getenv("PROXY_URL") + if proxy_url: + parsed_url = urlparse(proxy_url) + if parsed_url.scheme and parsed_url.netloc: + return httpx.Client(proxies=proxy_url) + else: + logger.warning("Invalid proxy URL provided. 
Proceeding without proxy.") + + # Return a default client if no valid proxy URL is set + return httpx.Client() + + class IngestRunnable(RunnableSerializable[BinaryIO, List[str]]): """Runnable for ingesting files into a vectorstore.""" @@ -118,7 +145,7 @@ def batch( ) vstore = PGVector( connection_string=PG_CONNECTION_STRING, - embedding_function=OpenAIEmbeddings(), + embedding_function=OpenAIEmbeddings(http_client=_get_http_client()), use_jsonb=True, ) From b76d7746052596e2c41a8c4e5daa9c9c46633447 Mon Sep 17 00:00:00 2001 From: Shamsuddin Ahmed Date: Thu, 21 Mar 2024 21:57:14 +0600 Subject: [PATCH 5/5] error don't marge, pydantic.error_wrappers.ValidationError: 1 validation error for OpenAIEmbeddings --- backend/app/upload.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/backend/app/upload.py b/backend/app/upload.py index 572b909c..0acae02e 100644 --- a/backend/app/upload.py +++ b/backend/app/upload.py @@ -11,7 +11,7 @@ import os import httpx import logging -from typing import Any, BinaryIO, List, Optional +from typing import Any, BinaryIO, List, Optional, Union from urllib.parse import urlparse from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter @@ -25,7 +25,6 @@ from langchain_core.vectorstores import VectorStore from langchain_openai import OpenAIEmbeddings - from app.ingest import ingest_blob from app.parsing import MIMETYPE_BASED_PARSER @@ -58,25 +57,32 @@ def _convert_ingestion_input_to_blob(data: BinaryIO) -> Blob: ) -def _get_http_client() -> httpx.Client: - """Create and return a httpx.Client instance, configured with a proxy if available and valid. +def _get_http_client(use_async: bool = False) -> Union[httpx.Client, httpx.AsyncClient]: + """ + Create and return a httpx.Client or httpx.AsyncClient instance, configured with a proxy if available and valid. The method checks for a PROXY_URL environment variable. If a valid proxy URL is found, the client is configured to use this proxy. Otherwise, a default client is returned. + Args: + use_async (bool): Flag indicating whether to return an asynchronous HTTP client. + Returns: - An httpx.Client instance configured with or without a proxy based on the environment configuration. + An instance of httpx.Client or httpx.AsyncClient configured with or without a proxy based on the environment configuration. """ proxy_url = os.getenv("PROXY_URL") + client_kwargs = {} if proxy_url: parsed_url = urlparse(proxy_url) if parsed_url.scheme and parsed_url.netloc: - return httpx.Client(proxies=proxy_url) + client_kwargs["proxies"] = proxy_url else: logger.warning("Invalid proxy URL provided. 
Proceeding without proxy.") - # Return a default client if no valid proxy URL is set - return httpx.Client() + if use_async: + return httpx.AsyncClient(**client_kwargs) + else: + return httpx.Client(**client_kwargs) class IngestRunnable(RunnableSerializable[BinaryIO, List[str]]): @@ -99,7 +105,7 @@ class Config: @property def namespace(self) -> str: if (self.assistant_id is None and self.thread_id is None) or ( - self.assistant_id is not None and self.thread_id is not None + self.assistant_id is not None and self.thread_id is not None ): raise ValueError( "Exactly one of assistant_id or thread_id must be provided" @@ -107,17 +113,17 @@ def namespace(self) -> str: return self.assistant_id if self.assistant_id is not None else self.thread_id def invoke( - self, input: BinaryIO, config: Optional[RunnableConfig] = None + self, input: BinaryIO, config: Optional[RunnableConfig] = None ) -> List[str]: return self.batch([input], config) def batch( - self, - inputs: List[BinaryIO], - config: RunnableConfig | List[RunnableConfig] | None = None, - *, - return_exceptions: bool = False, - **kwargs: Any | None, + self, + inputs: List[BinaryIO], + config: RunnableConfig | List[RunnableConfig] | None = None, + *, + return_exceptions: bool = False, + **kwargs: Any | None, ) -> List: """Ingest a batch of files into the vectorstore.""" ids = [] @@ -149,7 +155,6 @@ def batch( use_jsonb=True, ) - ingest_runnable = IngestRunnable( text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200), vectorstore=vstore,