From b5e590c6832926c3c2f3291f181f0e7fe4c06fea Mon Sep 17 00:00:00 2001 From: klaudialemiec Date: Tue, 21 May 2024 22:53:39 +0000 Subject: [PATCH 1/4] Chroma docstrings update --- .../chroma/langchain_chroma/vectorstores.py | 114 +++++++++++++++--- 1 file changed, 95 insertions(+), 19 deletions(-) diff --git a/libs/partners/chroma/langchain_chroma/vectorstores.py b/libs/partners/chroma/langchain_chroma/vectorstores.py index 221820173be13..f7425c432dcfe 100644 --- a/libs/partners/chroma/langchain_chroma/vectorstores.py +++ b/libs/partners/chroma/langchain_chroma/vectorstores.py @@ -52,7 +52,11 @@ def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]: def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray: - """Row-wise cosine similarity between two equal-width matrices.""" + """Row-wise cosine similarity between two equal-width matrices. + + Raises: + ValueError: If the number of columns in X and Y are not the same. + """ if len(X) == 0 or len(Y) == 0: return np.array([]) @@ -80,7 +84,21 @@ def maximal_marginal_relevance( lambda_mult: float = 0.5, k: int = 4, ) -> List[int]: - """Calculate maximal marginal relevance.""" + """Calculate maximal marginal relevance. + + Args: + query_embedding (np.ndarray): Query embedding. + embedding_list (list): List of embeddings to select from. + lambda_mult (float): Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + k (int): Number of Documents to return. Defaults to 4. + + Returns: + List[int]: List of indices of embeddings selected by maximal marginal relevance. 
+ """ + if min(k, len(embedding_list)) <= 0: return [] if query_embedding.ndim == 1: @@ -136,8 +154,22 @@ def __init__( relevance_score_fn: Optional[Callable[[float], float]] = None, create_collection_if_not_exists: Optional[bool] = True, ) -> None: - """Initialize with a Chroma client.""" + """Initialize with a Chroma client. + Args: + collection_name (str): Name of the collection to create. + embedding_function (Optional[Embeddings]): Embedding class object. Used to embed texts. + persist_director (Optional[str]): Directory to persist the collection. + client_settings (Optional[chromadb.config.Settings]): Chroma client settings + collection_metadata (Optional[Dict]): Collection configurations. + client (Optional[chromadb.ClientAPI]): Chroma client. + Documentation: https://docs.trychroma.com/reference/js-client#class:-chromaclient + relevance_score_fn (Optional[Callable[[float], float]]): + Fuction to calculate relevance score from distance. + Used only in `similarity_search_with_relevance_scores` + create_collection_if_not_exists (Optional[bool]): + Whether to create collection if it doesn't exist. Defaults to True. + """ if client is not None: self._client_settings = client_settings self._client = client @@ -204,7 +236,22 @@ def __query_collection( where_document: Optional[Dict[str, str]] = None, **kwargs: Any, ) -> Union[List[Document], chromadb.QueryResult]: - """Query the chroma collection.""" + """Query the chroma collection. + + Args: + query_texts (Optional[List[str]]): List of query texts. + query_embeddings (Optional[List[List[float]]]): List of query embeddings. + n_results (int): Number of results to return. Defaults to 4. + where (Optional[Dict[str, str]]): dict used to filter results by + e.g. {"color" : "red", "price": 4.20}. + where_document (Optional[Dict[str, str]]): dict used to filter by the documents. + E.g. {$contains: {"text": "hello"}}. 
+ + Returns: + List of `n_results` nearest neighbor embeddings for provided query_embeddings or query_texts. + + See more: https://docs.trychroma.com/reference/py-collection#query + """ return self._collection.query( query_texts=query_texts, query_embeddings=query_embeddings, # type: ignore @@ -229,12 +276,16 @@ def add_images( """Run more images through the embeddings and add to the vectorstore. Args: - uris List[str]: File path to the image. + uris (List[str]): File path to the image. metadatas (Optional[List[dict]], optional): Optional list of metadatas. + When querying, you can filter on this metadata. ids (Optional[List[str]], optional): Optional list of IDs. Returns: List[str]: List of IDs of the added images. + + Raises: + ValueError: When matadata is incorrect. """ # Map from uris to b64 encoded strings b64_texts = [self.encode_image(uri=uri) for uri in uris] @@ -314,12 +365,16 @@ def add_texts( Args: texts (Iterable[str]): Texts to add to the vectorstore. metadatas (Optional[List[dict]], optional): Optional list of metadatas. + When querying, you can filter on this metadata. ids (Optional[List[str]], optional): Optional list of IDs. Returns: List[str]: List of IDs of the added texts. + + Raises: + ValueError: When matadata is incorrect. """ - # TODO: Handle the case where the user doesn't provide ids on the Collection + if ids is None: ids = [str(uuid.uuid4()) for _ in texts] embeddings = None @@ -412,10 +467,14 @@ def similarity_search_by_vector( **kwargs: Any, ) -> List[Document]: """Return docs most similar to embedding vector. + Args: embedding (List[float]): Embedding to look up documents similar to. k (int): Number of Documents to return. Defaults to 4. filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + where_document (Optional[Dict[str, str]]): dict used to filter by the documents. + E.g. {$contains: {"text": "hello"}}. + Returns: List of Documents most similar to the query vector. 
""" @@ -443,6 +502,8 @@ def similarity_search_by_vector_with_relevance_scores( embedding (List[float]): Embedding to look up documents similar to. k (int): Number of Documents to return. Defaults to 4. filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + where_document (Optional[Dict[str, str]]): dict used to filter by the documents. + E.g. {$contains: {"text": "hello"}}. Returns: List[Tuple[Document, float]]: List of documents most similar to @@ -472,10 +533,12 @@ def similarity_search_with_score( query (str): Query text to search for. k (int): Number of results to return. Defaults to 4. filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + where_document (Optional[Dict[str, str]]): dict used to filter by the documents. + E.g. {$contains: {"text": "hello"}}. Returns: List[Tuple[Document, float]]: List of documents most similar to - the query text and cosine distance in float for each. + the query text and distance in float for each. Lower score represents more similarity. """ if self._embedding_function is None: @@ -500,13 +563,20 @@ def similarity_search_with_score( def _select_relevance_score_fn(self) -> Callable[[float], float]: """ - The 'correct' relevance function - may differ depending on a few things, including: - - the distance / similarity metric used by the VectorStore - - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) - - embedding dimensionality - - etc. + Select the relevance score function based on the distance metric used by the VectorStore. + The most similar documents will have the lowest relevance score. + Default relevance score function is euclidean distance. + Distance metric must be provided in `collection_metadata` during initizalition of Chroma object . + Example: collection_metadata={"hnsw:space": "cosine"} + Available distance metrics are: 'cosine', 'l2' and 'ip'. + + Returns: + Callable[[float], float]: The relevance score function. 
+ + Raises: + ValueError: If the distance metric is not supported. """ + if self.override_relevance_score_fn: return self.override_relevance_score_fn @@ -545,10 +615,10 @@ def max_marginal_relevance_search_by_vector( among selected documents. Args: - embedding: Embedding to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - fetch_k: Number of Documents to fetch to pass to MMR algorithm. - lambda_mult: Number between 0 and 1 that determines the degree + embedding (List[float]): Embedding to look up documents similar to. + k (int): Number of Documents to return. Defaults to 4. + fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. Defaults to 20. + lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. @@ -601,9 +671,13 @@ def max_marginal_relevance_search( to maximum diversity and 1 to minimum diversity. Defaults to 0.5. filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + where_document (Optional[Dict[str, str]]): dict used to filter by the documents. E.g. {$contains: {"text": "hello"}}. Returns: List of Documents selected by maximal marginal relevance. + + Raises: + ValueError: If the embedding function is not provided. """ if self._embedding_function is None: raise ValueError( @@ -611,7 +685,7 @@ def max_marginal_relevance_search( ) embedding = self._embedding_function.embed_query(query) - docs = self.max_marginal_relevance_search_by_vector( + return self.max_marginal_relevance_search_by_vector( embedding, k, fetch_k, @@ -619,7 +693,6 @@ def max_marginal_relevance_search( filter=filter, where_document=where_document, ) - return docs def delete_collection(self) -> None: """Delete the collection.""" @@ -686,6 +759,9 @@ def update_documents(self, ids: List[str], documents: List[Document]) -> None: Args: ids (List[str]): List of ids of the document to update. 
documents (List[Document]): List of documents to update. + + Raises: + ValueError: If the embedding function is not provided. """ text = [document.page_content for document in documents] metadata = [document.metadata for document in documents] From 7cc99a470a8f157b5703c6d6f06a530e561d8f41 Mon Sep 17 00:00:00 2001 From: klaudialemiec Date: Wed, 22 May 2024 10:42:44 +0000 Subject: [PATCH 2/4] Update of docstrings --- .../chroma/langchain_chroma/vectorstores.py | 161 +++++++++--------- 1 file changed, 81 insertions(+), 80 deletions(-) diff --git a/libs/partners/chroma/langchain_chroma/vectorstores.py b/libs/partners/chroma/langchain_chroma/vectorstores.py index f7425c432dcfe..5570c72e6b245 100644 --- a/libs/partners/chroma/langchain_chroma/vectorstores.py +++ b/libs/partners/chroma/langchain_chroma/vectorstores.py @@ -87,16 +87,16 @@ def maximal_marginal_relevance( """Calculate maximal marginal relevance. Args: - query_embedding (np.ndarray): Query embedding. - embedding_list (list): List of embeddings to select from. - lambda_mult (float): Number between 0 and 1 that determines the degree + query_embedding: Query embedding. + embedding_list: List of embeddings to select from. + lambda_mult: Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. - k (int): Number of Documents to return. Defaults to 4. + k: Number of Documents to return. Defaults to 4. Returns: - List[int]: List of indices of embeddings selected by maximal marginal relevance. + List of indices of embeddings selected by maximal marginal relevance. """ if min(k, len(embedding_list)) <= 0: @@ -157,18 +157,17 @@ def __init__( """Initialize with a Chroma client. Args: - collection_name (str): Name of the collection to create. - embedding_function (Optional[Embeddings]): Embedding class object. Used to embed texts. - persist_director (Optional[str]): Directory to persist the collection. 
- client_settings (Optional[chromadb.config.Settings]): Chroma client settings - collection_metadata (Optional[Dict]): Collection configurations. - client (Optional[chromadb.ClientAPI]): Chroma client. - Documentation: https://docs.trychroma.com/reference/js-client#class:-chromaclient - relevance_score_fn (Optional[Callable[[float], float]]): - Fuction to calculate relevance score from distance. + collection_name: Name of the collection to create. + embedding_function: Embedding class object. Used to embed texts. + persist_director: Directory to persist the collection. + client_settings: Chroma client settings + collection_metadata: Collection configurations. + client: Chroma client. Documentation: + https://docs.trychroma.com/reference/js-client#class:-chromaclient + relevance_score_fn: Fuction to calculate relevance score from distance. Used only in `similarity_search_with_relevance_scores` - create_collection_if_not_exists (Optional[bool]): - Whether to create collection if it doesn't exist. Defaults to True. + create_collection_if_not_exists: Whether to create collection + if it doesn't exist. Defaults to True. """ if client is not None: self._client_settings = client_settings @@ -239,12 +238,12 @@ def __query_collection( """Query the chroma collection. Args: - query_texts (Optional[List[str]]): List of query texts. - query_embeddings (Optional[List[List[float]]]): List of query embeddings. - n_results (int): Number of results to return. Defaults to 4. - where (Optional[Dict[str, str]]): dict used to filter results by + query_texts: List of query texts. + query_embeddings: List of query embeddings. + n_results: Number of results to return. Defaults to 4. + where: dict used to filter results by e.g. {"color" : "red", "price": 4.20}. - where_document (Optional[Dict[str, str]]): dict used to filter by the documents. + where_document: dict used to filter by the documents. E.g. {$contains: {"text": "hello"}}. 
Returns: @@ -276,13 +275,13 @@ def add_images( """Run more images through the embeddings and add to the vectorstore. Args: - uris (List[str]): File path to the image. - metadatas (Optional[List[dict]], optional): Optional list of metadatas. + uris: File path to the image. + metadatas: Optional list of metadatas. When querying, you can filter on this metadata. - ids (Optional[List[str]], optional): Optional list of IDs. + ids: Optional list of IDs. Returns: - List[str]: List of IDs of the added images. + List of IDs of the added images. Raises: ValueError: When matadata is incorrect. @@ -363,13 +362,13 @@ def add_texts( """Run more texts through the embeddings and add to the vectorstore. Args: - texts (Iterable[str]): Texts to add to the vectorstore. - metadatas (Optional[List[dict]], optional): Optional list of metadatas. + texts: Texts to add to the vectorstore. + metadatas: Optional list of metadatas. When querying, you can filter on this metadata. - ids (Optional[List[str]], optional): Optional list of IDs. + ids: Optional list of IDs. Returns: - List[str]: List of IDs of the added texts. + List of IDs of the added texts. Raises: ValueError: When matadata is incorrect. @@ -446,12 +445,12 @@ def similarity_search( """Run similarity search with Chroma. Args: - query (str): Query text to search for. - k (int): Number of results to return. Defaults to 4. - filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + query: Query text to search for. + k: Number of results to return. Defaults to 4. + filter: Filter by metadata. Defaults to None. Returns: - List[Document]: List of documents most similar to the query text. + List of documents most similar to the query text. """ docs_and_scores = self.similarity_search_with_score( query, k, filter=filter, **kwargs @@ -469,10 +468,10 @@ def similarity_search_by_vector( """Return docs most similar to embedding vector. Args: - embedding (List[float]): Embedding to look up documents similar to. 
- k (int): Number of Documents to return. Defaults to 4. - filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. - where_document (Optional[Dict[str, str]]): dict used to filter by the documents. + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter: Filter by metadata. Defaults to None. + where_document: dict used to filter by the documents. E.g. {$contains: {"text": "hello"}}. Returns: @@ -500,15 +499,14 @@ def similarity_search_by_vector_with_relevance_scores( Args: embedding (List[float]): Embedding to look up documents similar to. - k (int): Number of Documents to return. Defaults to 4. - filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. - where_document (Optional[Dict[str, str]]): dict used to filter by the documents. + k: Number of Documents to return. Defaults to 4. + filter: Filter by metadata. Defaults to None. + where_document: dict used to filter by the documents. E.g. {$contains: {"text": "hello"}}. Returns: - List[Tuple[Document, float]]: List of documents most similar to - the query text and cosine distance in float for each. - Lower score represents more similarity. + List of documents most similar to the query text and relevance score + in float for each. Lower score represents more similarity. """ results = self.__query_collection( query_embeddings=embedding, @@ -530,16 +528,15 @@ def similarity_search_with_score( """Run similarity search with Chroma with distance. Args: - query (str): Query text to search for. - k (int): Number of results to return. Defaults to 4. - filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. - where_document (Optional[Dict[str, str]]): dict used to filter by the documents. + query: Query text to search for. + k: Number of results to return. Defaults to 4. + filter: Filter by metadata. Defaults to None. + where_document: dict used to filter by the documents. E.g. {$contains: {"text": "hello"}}. 
Returns: - List[Tuple[Document, float]]: List of documents most similar to - the query text and distance in float for each. - Lower score represents more similarity. + List of documents most similar to the query text and + distance in float for each. Lower score represents more similarity. """ if self._embedding_function is None: results = self.__query_collection( @@ -562,16 +559,16 @@ def similarity_search_with_score( return _results_to_docs_and_scores(results) def _select_relevance_score_fn(self) -> Callable[[float], float]: - """ - Select the relevance score function based on the distance metric used by the VectorStore. + """Select the relevance score function based on the distance metric used by the VectorStore. + The most similar documents will have the lowest relevance score. Default relevance score function is euclidean distance. - Distance metric must be provided in `collection_metadata` during initizalition of Chroma object . + Distance metric must be provided in `collection_metadata` during initizalition of Chroma object. Example: collection_metadata={"hnsw:space": "cosine"} Available distance metrics are: 'cosine', 'l2' and 'ip'. Returns: - Callable[[float], float]: The relevance score function. + The relevance score function. Raises: ValueError: If the distance metric is not supported. @@ -615,14 +612,14 @@ def max_marginal_relevance_search_by_vector( among selected documents. Args: - embedding (List[float]): Embedding to look up documents similar to. - k (int): Number of Documents to return. Defaults to 4. - fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. Defaults to 20. - lambda_mult (float): Number between 0 and 1 that determines the degree + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. Defaults to 20. 
+ lambda_mult: Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. - filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + filter: Filter by metadata. Defaults to None. Returns: List of Documents selected by maximal marginal relevance. @@ -670,8 +667,9 @@ def max_marginal_relevance_search( of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. - filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. - where_document (Optional[Dict[str, str]]): dict used to filter by the documents. E.g. {$contains: {"text": "hello"}}. + filter: Filter by metadata. Defaults to None. + where_document: dict used to filter by the documents. + E.g. {$contains: {"text": "hello"}}. Returns: List of Documents selected by maximal marginal relevance. @@ -729,6 +727,9 @@ def get( Can contain `"embeddings"`, `"metadatas"`, `"documents"`. Ids are always included. Defaults to `["metadatas", "documents"]`. Optional. + + Returns: + A dict with the keys `"ids"`, `"embeddings"`, `"metadatas"`, `"documents"`. """ kwargs = { "ids": ids, @@ -747,8 +748,8 @@ def update_document(self, document_id: str, document: Document) -> None: """Update a document in the collection. Args: - document_id (str): ID of the document to update. - document (Document): Document to update. + document_id: ID of the document to update. + document: Document to update. """ return self.update_documents([document_id], [document]) @@ -757,8 +758,8 @@ def update_documents(self, ids: List[str], documents: List[Document]) -> None: """Update a document in the collection. Args: - ids (List[str]): List of ids of the document to update. - documents (List[Document]): List of documents to update. + ids: List of ids of the document to update. + documents: List of documents to update. 
Raises: ValueError: If the embedding function is not provided. @@ -817,14 +818,14 @@ def from_texts( Otherwise, the data will be ephemeral in-memory. Args: - texts (List[str]): List of texts to add to the collection. - collection_name (str): Name of the collection to create. - persist_directory (Optional[str]): Directory to persist the collection. - embedding (Optional[Embeddings]): Embedding function. Defaults to None. - metadatas (Optional[List[dict]]): List of metadatas. Defaults to None. - ids (Optional[List[str]]): List of document IDs. Defaults to None. - client_settings (Optional[chromadb.config.Settings]): Chroma client settings - collection_metadata (Optional[Dict]): Collection configurations. + texts: List of texts to add to the collection. + collection_name: Name of the collection to create. + persist_directory: Directory to persist the collection. + embedding: Embedding function. Defaults to None. + metadatas: List of metadatas. Defaults to None. + ids: List of document IDs. Defaults to None. + client_settings: Chroma client settings + collection_metadata: Collection configurations. Defaults to None. Returns: @@ -880,13 +881,13 @@ def from_documents( Otherwise, the data will be ephemeral in-memory. Args: - collection_name (str): Name of the collection to create. - persist_directory (Optional[str]): Directory to persist the collection. - ids (Optional[List[str]]): List of document IDs. Defaults to None. - documents (List[Document]): List of documents to add to the vectorstore. - embedding (Optional[Embeddings]): Embedding function. Defaults to None. - client_settings (Optional[chromadb.config.Settings]): Chroma client settings - collection_metadata (Optional[Dict]): Collection configurations. + collection_name: Name of the collection to create. + persist_directory: Directory to persist the collection. + ids: List of document IDs. Defaults to None. + documents: List of documents to add to the vectorstore. + embedding: Embedding function. 
Defaults to None. + client_settings: Chroma client settings + collection_metadata: Collection configurations. Defaults to None. Returns: From 225c36e625dccc90151ad661e58e3418392874da Mon Sep 17 00:00:00 2001 From: Bagatur Date: Wed, 22 May 2024 14:42:14 -0700 Subject: [PATCH 3/4] fmt --- .../chroma/langchain_chroma/vectorstores.py | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/libs/partners/chroma/langchain_chroma/vectorstores.py b/libs/partners/chroma/langchain_chroma/vectorstores.py index 5570c72e6b245..985a6c44b9163 100644 --- a/libs/partners/chroma/langchain_chroma/vectorstores.py +++ b/libs/partners/chroma/langchain_chroma/vectorstores.py @@ -162,11 +162,11 @@ def __init__( persist_director: Directory to persist the collection. client_settings: Chroma client settings collection_metadata: Collection configurations. - client: Chroma client. Documentation: + client: Chroma client. Documentation: https://docs.trychroma.com/reference/js-client#class:-chromaclient - relevance_score_fn: Fuction to calculate relevance score from distance. + relevance_score_fn: Fuction to calculate relevance score from distance. Used only in `similarity_search_with_relevance_scores` - create_collection_if_not_exists: Whether to create collection + create_collection_if_not_exists: Whether to create collection if it doesn't exist. Defaults to True. """ if client is not None: @@ -241,13 +241,14 @@ def __query_collection( query_texts: List of query texts. query_embeddings: List of query embeddings. n_results: Number of results to return. Defaults to 4. - where: dict used to filter results by + where: dict used to filter results by e.g. {"color" : "red", "price": 4.20}. - where_document: dict used to filter by the documents. + where_document: dict used to filter by the documents. E.g. {$contains: {"text": "hello"}}. Returns: - List of `n_results` nearest neighbor embeddings for provided query_embeddings or query_texts. 
+ List of `n_results` nearest neighbor embeddings for provided + query_embeddings or query_texts. See more: https://docs.trychroma.com/reference/py-collection#query """ @@ -471,7 +472,7 @@ def similarity_search_by_vector( embedding: Embedding to look up documents similar to. k: Number of Documents to return. Defaults to 4. filter: Filter by metadata. Defaults to None. - where_document: dict used to filter by the documents. + where_document: dict used to filter by the documents. E.g. {$contains: {"text": "hello"}}. Returns: @@ -501,11 +502,11 @@ def similarity_search_by_vector_with_relevance_scores( embedding (List[float]): Embedding to look up documents similar to. k: Number of Documents to return. Defaults to 4. filter: Filter by metadata. Defaults to None. - where_document: dict used to filter by the documents. + where_document: dict used to filter by the documents. E.g. {$contains: {"text": "hello"}}. Returns: - List of documents most similar to the query text and relevance score + List of documents most similar to the query text and relevance score in float for each. Lower score represents more similarity. """ results = self.__query_collection( @@ -531,11 +532,11 @@ def similarity_search_with_score( query: Query text to search for. k: Number of results to return. Defaults to 4. filter: Filter by metadata. Defaults to None. - where_document: dict used to filter by the documents. + where_document: dict used to filter by the documents. E.g. {$contains: {"text": "hello"}}. Returns: - List of documents most similar to the query text and + List of documents most similar to the query text and distance in float for each. Lower score represents more similarity. """ if self._embedding_function is None: @@ -559,13 +560,13 @@ def similarity_search_with_score( return _results_to_docs_and_scores(results) def _select_relevance_score_fn(self) -> Callable[[float], float]: - """Select the relevance score function based on the distance metric used by the VectorStore. 
+ """Select the relevance score function based on collection's distance metric. - The most similar documents will have the lowest relevance score. - Default relevance score function is euclidean distance. - Distance metric must be provided in `collection_metadata` during initizalition of Chroma object. - Example: collection_metadata={"hnsw:space": "cosine"} - Available distance metrics are: 'cosine', 'l2' and 'ip'. + The most similar documents will have the lowest relevance score. Default + relevance score function is euclidean distance. Distance metric must be + provided in `collection_metadata` during initialization of Chroma object. + Example: collection_metadata={"hnsw:space": "cosine"}. Available distance + metrics are: 'cosine', 'l2' and 'ip'. Returns: The relevance score function. @@ -608,17 +609,19 @@ def max_marginal_relevance_search_by_vector( **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance. + Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents. Args: embedding: Embedding to look up documents similar to. k: Number of Documents to return. Defaults to 4. - fetch_k: Number of Documents to fetch to pass to MMR algorithm. Defaults to 20. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. Defaults to + 20. lambda_mult: Number between 0 and 1 that determines the degree - of diversity among the results with 0 corresponding - to maximum diversity and 1 to minimum diversity. - Defaults to 0.5. + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. filter: Filter by metadata. Defaults to None. Returns: @@ -668,7 +671,7 @@ def max_marginal_relevance_search( to maximum diversity and 1 to minimum diversity. Defaults to 0.5. filter: Filter by metadata. Defaults to None. - where_document: dict used to filter by the documents. + where_document: dict used to filter by the documents. E.g. 
{$contains: {"text": "hello"}}. Returns: From a93107fc0796643aa2542f295a8ed24025a287e7 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Wed, 22 May 2024 14:42:52 -0700 Subject: [PATCH 4/4] fmt --- libs/partners/chroma/langchain_chroma/vectorstores.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/partners/chroma/langchain_chroma/vectorstores.py b/libs/partners/chroma/langchain_chroma/vectorstores.py index 985a6c44b9163..e4e1819d752bb 100644 --- a/libs/partners/chroma/langchain_chroma/vectorstores.py +++ b/libs/partners/chroma/langchain_chroma/vectorstores.py @@ -164,7 +164,7 @@ def __init__( collection_metadata: Collection configurations. client: Chroma client. Documentation: https://docs.trychroma.com/reference/js-client#class:-chromaclient - relevance_score_fn: Fuction to calculate relevance score from distance. + relevance_score_fn: Function to calculate relevance score from distance. Used only in `similarity_search_with_relevance_scores` create_collection_if_not_exists: Whether to create collection if it doesn't exist. Defaults to True. @@ -285,7 +285,7 @@ def add_images( List of IDs of the added images. Raises: - ValueError: When matadata is incorrect. + ValueError: When metadata is incorrect. """ # Map from uris to b64 encoded strings b64_texts = [self.encode_image(uri=uri) for uri in uris] @@ -372,7 +372,7 @@ def add_texts( List of IDs of the added texts. Raises: - ValueError: When matadata is incorrect. + ValueError: When metadata is incorrect. """ if ids is None: