From 1db85a60c66b7a6d858d758470e86595ea55b0b8 Mon Sep 17 00:00:00 2001 From: Stefano Lottini Date: Fri, 31 Jan 2025 16:42:55 +0100 Subject: [PATCH 1/6] improved handling of multidoc insertion errors for vector store --- .../astradb/langchain_astradb/vectorstores.py | 30 +++++++++++++++++-- .../test_vectorstore_ddl_tests.py | 5 ++-- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/libs/astradb/langchain_astradb/vectorstores.py b/libs/astradb/langchain_astradb/vectorstores.py index bfb03da..fd73e59 100644 --- a/libs/astradb/langchain_astradb/vectorstores.py +++ b/libs/astradb/langchain_astradb/vectorstores.py @@ -1062,7 +1062,20 @@ def add_texts( if document["_id"] not in inserted_ids_set ] else: - raise + # Raise a non-astrapy error with a message covering all errors + # except the DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE, which this + # method can handle (if they were the only error) + all_err_descs = "; ".join( + edesc.message + for edesc in err.error_descriptors + if edesc.error_code != DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE + if edesc.message + ) + full_err_message = ( + "Cannot insert documents. The Data API returned the " + f"following error(s): {all_err_descs}" + ) + raise ValueError(full_err_message) from err # if necessary, replace docs for the non-inserted ids if ids_to_replace: @@ -1191,7 +1204,20 @@ async def aadd_texts( if document["_id"] not in inserted_ids_set ] else: - raise + # Raise a non-astrapy error with a message covering all errors + # except the DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE, which this + # method can handle (if they were the only error) + all_err_descs = "; ".join( + edesc.message + for edesc in err.error_descriptors + if edesc.error_code != DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE + if edesc.message + ) + full_err_message = ( + "Cannot insert documents. The Data API returned the " + f"following error(s): {all_err_descs}" + ) + raise ValueError(full_err_message) from err # if necessary, replace docs for the non-inserted ids if ids_to_replace: diff --git a/libs/astradb/tests/integration_tests/test_vectorstore_ddl_tests.py b/libs/astradb/tests/integration_tests/test_vectorstore_ddl_tests.py index 8218ff5..c719bf1 100644 --- a/libs/astradb/tests/integration_tests/test_vectorstore_ddl_tests.py +++ b/libs/astradb/tests/integration_tests/test_vectorstore_ddl_tests.py @@ -11,7 +11,6 @@ import pytest from astrapy.authentication import EmbeddingAPIKeyHeaderProvider, StaticTokenProvider -from astrapy.exceptions import InsertManyException from langchain_astradb.utils.astradb import SetupMode from langchain_astradb.vectorstores import AstraDBVectorStore @@ -510,7 +509,7 @@ def test_astradb_vectorstore_vectorize_headers_precedence_stringheader( ) # More specific messages are provider-specific, such as OpenAI returning: # "... Incorrect API key provided: verywrong ..." - with pytest.raises(InsertManyException, match="Embedding Provider returned"): + with pytest.raises(ValueError, match="Embedding Provider returned"): v_store.add_texts(["Failing"]) @pytest.mark.skipif( @@ -538,5 +537,5 @@ def test_astradb_vectorstore_vectorize_headers_precedence_headerprovider( ) # More specific messages are provider-specific, such as OpenAI returning: # "... Incorrect API key provided: verywrong ..." - with pytest.raises(InsertManyException, match="Embedding Provider returned"): + with pytest.raises(ValueError, match="Embedding Provider returned"): v_store.add_texts(["Failing"]) From b9bc4839c35b59855e44e81d15c8dd2ca846b41c Mon Sep 17 00:00:00 2001 From: Stefano Lottini Date: Fri, 31 Jan 2025 18:25:37 +0100 Subject: [PATCH 2/6] Capped number of shown errors --- .../astradb/langchain_astradb/vectorstores.py | 38 ++++++++++++++++--- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/libs/astradb/langchain_astradb/vectorstores.py b/libs/astradb/langchain_astradb/vectorstores.py index fd73e59..7f27049 100644 --- a/libs/astradb/langchain_astradb/vectorstores.py +++ b/libs/astradb/langchain_astradb/vectorstores.py @@ -67,6 +67,8 @@ DEFAULT_INDEXING_OPTIONS = {"allow": ["metadata"]} # error code to check for during bulk insertions DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE = "DOCUMENT_ALREADY_EXISTS" +# max number of errors shown in full insertion error messages +MAX_SHOWN_INSERTION_ERRORS = 8 logger = logging.getLogger(__name__) @@ -1065,15 +1067,27 @@ def add_texts( # Raise a non-astrapy error with a message covering all errors # except the DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE, which this # method can handle (if they were the only error) - all_err_descs = "; ".join( - edesc.message + filtered_error_descs = [ + edesc for edesc in err.error_descriptors if edesc.error_code != DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE if edesc.message + ] + all_err_descs = "; ".join( + edesc.message or "" + for edesc in filtered_error_descs[:MAX_SHOWN_INSERTION_ERRORS] ) + there_s_more: str + if len(filtered_error_descs) > MAX_SHOWN_INSERTION_ERRORS: + num_residual = ( + len(filtered_error_descs) - MAX_SHOWN_INSERTION_ERRORS + ) + there_s_more = f". (Note: {num_residual} further errors omitted.)" + else: + there_s_more = "" full_err_message = ( "Cannot insert documents. The Data API returned the " - f"following error(s): {all_err_descs}" + f"following error(s): {all_err_descs}{there_s_more}" ) raise ValueError(full_err_message) from err @@ -1207,15 +1221,27 @@ async def aadd_texts( # Raise a non-astrapy error with a message covering all errors # except the DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE, which this # method can handle (if they were the only error) - all_err_descs = "; ".join( - edesc.message + filtered_error_descs = [ + edesc for edesc in err.error_descriptors if edesc.error_code != DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE if edesc.message + ] + all_err_descs = "; ".join( + edesc.message or "" + for edesc in filtered_error_descs[:MAX_SHOWN_INSERTION_ERRORS] ) + there_s_more: str + if len(filtered_error_descs) > MAX_SHOWN_INSERTION_ERRORS: + num_residual = ( + len(filtered_error_descs) - MAX_SHOWN_INSERTION_ERRORS + ) + there_s_more = f". (Note: {num_residual} further errors omitted.)" + else: + there_s_more = "" full_err_message = ( "Cannot insert documents. The Data API returned the " - f"following error(s): {all_err_descs}" + f"following error(s): {all_err_descs}{there_s_more}" ) raise ValueError(full_err_message) from err From 91df23803cabd751b2bd3b69ffe21075c7eb5cd5 Mon Sep 17 00:00:00 2001 From: Stefano Lottini Date: Sun, 2 Feb 2025 23:34:16 +0100 Subject: [PATCH 3/6] explicit messaging on exc.chaining for full insert troubles --- libs/astradb/langchain_astradb/vectorstores.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/libs/astradb/langchain_astradb/vectorstores.py b/libs/astradb/langchain_astradb/vectorstores.py index 7f27049..71988f8 100644 --- a/libs/astradb/langchain_astradb/vectorstores.py +++ b/libs/astradb/langchain_astradb/vectorstores.py @@ -1085,9 +1085,14 @@ def add_texts( there_s_more = f". (Note: {num_residual} further errors omitted.)" else: there_s_more = "" + original_err_note = ( + " (Full API error in '.__cause__.error_descriptors'" + f": ignore '{DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE}'.)" + ) full_err_message = ( "Cannot insert documents. The Data API returned the " - f"following error(s): {all_err_descs}{there_s_more}" + f"following error(s): {all_err_descs}" + f"{there_s_more}{original_err_note}" ) raise ValueError(full_err_message) from err @@ -1239,9 +1244,14 @@ async def aadd_texts( there_s_more = f". (Note: {num_residual} further errors omitted.)" else: there_s_more = "" + original_err_note = ( + " (Full API error in '.__cause__.error_descriptors'" + f": ignore '{DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE}'.)" + ) full_err_message = ( "Cannot insert documents. The Data API returned the " - f"following error(s): {all_err_descs}{there_s_more}" + f"following error(s): {all_err_descs}" + f"{there_s_more}{original_err_note}" ) raise ValueError(full_err_message) from err From fb46712410390b2fe1311e85951ec03e9905fe19 Mon Sep 17 00:00:00 2001 From: Stefano Lottini Date: Tue, 4 Feb 2025 12:05:47 +0100 Subject: [PATCH 4/6] err message preparation: code refactoring --- .../astradb/langchain_astradb/vectorstores.py | 94 +++++++------------ 1 file changed, 34 insertions(+), 60 deletions(-) diff --git a/libs/astradb/langchain_astradb/vectorstores.py b/libs/astradb/langchain_astradb/vectorstores.py index 71988f8..7eebfad 100644 --- a/libs/astradb/langchain_astradb/vectorstores.py +++ b/libs/astradb/langchain_astradb/vectorstores.py @@ -148,6 +148,38 @@ def _validate_autodetect_init_params( raise ValueError(msg) +def _insertmany_error_message(err: InsertManyException) -> str: + """Format an astrapy insert exception into an error message. + + This utility prepares a detailed message from an astrapy InsertManyException, + to be used in raising an exception within a vectorstore multiple insertion. + + This operation must filter out duplicate-id specific errors + (which the vector store could actually handle, if they were the only ondes). + """ + err_msg = "Cannot insert documents. The Data API returned the following error(s): " + + filtered_error_descs = [ + edesc + for edesc in err.error_descriptors + if edesc.error_code != DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE + if edesc.message + ] + err_msg += "; ".join( + edesc.message or "" + for edesc in filtered_error_descs[:MAX_SHOWN_INSERTION_ERRORS] + ) + + if num_residual := max(0, len(filtered_error_descs) - MAX_SHOWN_INSERTION_ERRORS): + err_msg += f". (Note: {num_residual} further errors omitted.)" + + err_msg += ( + " (Full API error in '.__cause__.error_descriptors'" + f": ignore '{DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE}'.)" + ) + return err_msg + + class AstraDBVectorStore(VectorStore): """AstraDB vector store integration. @@ -1064,36 +1096,7 @@ def add_texts( if document["_id"] not in inserted_ids_set ] else: - # Raise a non-astrapy error with a message covering all errors - # except the DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE, which this - # method can handle (if they were the only error) - filtered_error_descs = [ - edesc - for edesc in err.error_descriptors - if edesc.error_code != DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE - if edesc.message - ] - all_err_descs = "; ".join( - edesc.message or "" - for edesc in filtered_error_descs[:MAX_SHOWN_INSERTION_ERRORS] - ) - there_s_more: str - if len(filtered_error_descs) > MAX_SHOWN_INSERTION_ERRORS: - num_residual = ( - len(filtered_error_descs) - MAX_SHOWN_INSERTION_ERRORS - ) - there_s_more = f". (Note: {num_residual} further errors omitted.)" - else: - there_s_more = "" - original_err_note = ( - " (Full API error in '.__cause__.error_descriptors'" - f": ignore '{DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE}'.)" - ) - full_err_message = ( - "Cannot insert documents. The Data API returned the " - f"following error(s): {all_err_descs}" - f"{there_s_more}{original_err_note}" - ) + full_err_message = _insertmany_error_message(err) raise ValueError(full_err_message) from err # if necessary, replace docs for the non-inserted ids @@ -1223,36 +1226,7 @@ async def aadd_texts( if document["_id"] not in inserted_ids_set ] else: - # Raise a non-astrapy error with a message covering all errors - # except the DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE, which this - # method can handle (if they were the only error) - filtered_error_descs = [ - edesc - for edesc in err.error_descriptors - if edesc.error_code != DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE - if edesc.message - ] - all_err_descs = "; ".join( - edesc.message or "" - for edesc in filtered_error_descs[:MAX_SHOWN_INSERTION_ERRORS] - ) - there_s_more: str - if len(filtered_error_descs) > MAX_SHOWN_INSERTION_ERRORS: - num_residual = ( - len(filtered_error_descs) - MAX_SHOWN_INSERTION_ERRORS - ) - there_s_more = f". (Note: {num_residual} further errors omitted.)" - else: - there_s_more = "" - original_err_note = ( - " (Full API error in '.__cause__.error_descriptors'" - f": ignore '{DOCUMENT_ALREADY_EXISTS_API_ERROR_CODE}'.)" - ) - full_err_message = ( - "Cannot insert documents. The Data API returned the " - f"following error(s): {all_err_descs}" - f"{there_s_more}{original_err_note}" - ) + full_err_message = _insertmany_error_message(err) raise ValueError(full_err_message) from err # if necessary, replace docs for the non-inserted ids From f2bfec09cf61b331efdb3d0cc2e87a0f2fcc5dba Mon Sep 17 00:00:00 2001 From: Stefano Lottini Date: Tue, 4 Feb 2025 16:24:58 +0100 Subject: [PATCH 5/6] introduction of AstraDBVectorStoreError --- libs/astradb/langchain_astradb/__init__.py | 3 +- .../astradb/langchain_astradb/vectorstores.py | 31 +++++++++++++------ .../test_vectorstore_ddl_tests.py | 6 ++-- libs/astradb/tests/unit_tests/test_imports.py | 1 + 4 files changed, 27 insertions(+), 14 deletions(-) diff --git a/libs/astradb/langchain_astradb/__init__.py b/libs/astradb/langchain_astradb/__init__.py index 3e027cc..c462ac3 100644 --- a/libs/astradb/langchain_astradb/__init__.py +++ b/libs/astradb/langchain_astradb/__init__.py @@ -7,7 +7,7 @@ from langchain_astradb.document_loaders import AstraDBLoader from langchain_astradb.graph_vectorstores import AstraDBGraphVectorStore from langchain_astradb.storage import AstraDBByteStore, AstraDBStore -from langchain_astradb.vectorstores import AstraDBVectorStore +from langchain_astradb.vectorstores import AstraDBVectorStore, AstraDBVectorStoreError __all__ = [ "AstraDBByteStore", @@ -18,5 +18,6 @@ "AstraDBSemanticCache", "AstraDBStore", "AstraDBVectorStore", + "AstraDBVectorStoreError", "CollectionVectorServiceOptions", ] diff --git a/libs/astradb/langchain_astradb/vectorstores.py b/libs/astradb/langchain_astradb/vectorstores.py index 7eebfad..c19c986 100644 --- a/libs/astradb/langchain_astradb/vectorstores.py +++ b/libs/astradb/langchain_astradb/vectorstores.py @@ -170,8 +170,11 @@ def _insertmany_error_message(err: InsertManyException) -> str: for edesc in filtered_error_descs[:MAX_SHOWN_INSERTION_ERRORS] ) - if num_residual := max(0, len(filtered_error_descs) - MAX_SHOWN_INSERTION_ERRORS): - err_msg += f". (Note: {num_residual} further errors omitted.)" + if len(filtered_error_descs) > MAX_SHOWN_INSERTION_ERRORS: + err_msg += ( + f". (Note: {len(filtered_error_descs) - MAX_SHOWN_INSERTION_ERRORS}" + " further errors omitted.)" + ) err_msg += ( " (Full API error in '.__cause__.error_descriptors'" @@ -180,6 +183,14 @@ def _insertmany_error_message(err: InsertManyException) -> str: return err_msg +class AstraDBVectorStoreError(Exception): + """An exception during vector-store activities. + + This exception represents any operational exception occurring while + performing an action within an AstraDBVectorStore. + """ + + class AstraDBVectorStore(VectorStore): """AstraDB vector store integration. @@ -989,7 +1000,7 @@ def _get_missing_from_batch( ) -> tuple[list[str], list[DocDict]]: if "status" not in insert_result: msg = f"API Exception while running bulk insertion: {insert_result}" - raise ValueError(msg) + raise AstraDBVectorStoreError(msg) batch_inserted = insert_result["status"]["insertedIds"] # estimation of the preexisting documents that failed missed_inserted_ids = {document["_id"] for document in document_batch} - set( @@ -1003,7 +1014,7 @@ def _get_missing_from_batch( ) if num_errors != len(missed_inserted_ids) or unexpected_errors: msg = f"API Exception while running bulk insertion: {errors}" - raise ValueError(msg) + raise AstraDBVectorStoreError(msg) # deal with the missing insertions as upserts missing_from_batch = [ document @@ -1097,7 +1108,7 @@ def add_texts( ] else: full_err_message = _insertmany_error_message(err) - raise ValueError(full_err_message) from err + raise AstraDBVectorStoreError(full_err_message) from err # if necessary, replace docs for the non-inserted ids if ids_to_replace: @@ -1137,7 +1148,7 @@ def _replace_document( "AstraDBVectorStore.add_texts could not insert all requested " f"documents ({missing} failed replace_one calls)" ) - raise ValueError(msg) + raise AstraDBVectorStoreError(msg) return inserted_ids @override @@ -1227,7 +1238,7 @@ async def aadd_texts( ] else: full_err_message = _insertmany_error_message(err) - raise ValueError(full_err_message) from err + raise AstraDBVectorStoreError(full_err_message) from err # if necessary, replace docs for the non-inserted ids if ids_to_replace: @@ -1268,7 +1279,7 @@ async def _replace_document( "AstraDBVectorStore.add_texts could not insert all requested " f"documents ({missing} failed replace_one calls)" ) - raise ValueError(msg) + raise AstraDBVectorStoreError(msg) return inserted_ids def update_metadata( @@ -1955,7 +1966,7 @@ async def _asimilarity_search_with_embedding_by_sort( sort_vector = await async_cursor.get_sort_vector() if sort_vector is None: msg = "Unable to retrieve the server-side embedding of the query." - raise ValueError(msg) + raise AstraDBVectorStoreError(msg) query_embedding = sort_vector return ( @@ -1995,7 +2006,7 @@ def _similarity_search_with_embedding_by_sort( sort_vector = cursor.get_sort_vector() if sort_vector is None: msg = "Unable to retrieve the server-side embedding of the query." - raise ValueError(msg) + raise AstraDBVectorStoreError(msg) query_embedding = sort_vector return ( diff --git a/libs/astradb/tests/integration_tests/test_vectorstore_ddl_tests.py b/libs/astradb/tests/integration_tests/test_vectorstore_ddl_tests.py index c719bf1..6edd76d 100644 --- a/libs/astradb/tests/integration_tests/test_vectorstore_ddl_tests.py +++ b/libs/astradb/tests/integration_tests/test_vectorstore_ddl_tests.py @@ -13,7 +13,7 @@ from astrapy.authentication import EmbeddingAPIKeyHeaderProvider, StaticTokenProvider from langchain_astradb.utils.astradb import SetupMode -from langchain_astradb.vectorstores import AstraDBVectorStore +from langchain_astradb.vectorstores import AstraDBVectorStore, AstraDBVectorStoreError from .conftest import ( EPHEMERAL_CUSTOM_IDX_NAME_D2, @@ -509,7 +509,7 @@ def test_astradb_vectorstore_vectorize_headers_precedence_stringheader( ) # More specific messages are provider-specific, such as OpenAI returning: # "... Incorrect API key provided: verywrong ..." - with pytest.raises(ValueError, match="Embedding Provider returned"): + with pytest.raises(AstraDBVectorStoreError, match="verywrong"): v_store.add_texts(["Failing"]) @pytest.mark.skipif( @@ -537,5 +537,5 @@ def test_astradb_vectorstore_vectorize_headers_precedence_headerprovider( ) # More specific messages are provider-specific, such as OpenAI returning: # "... Incorrect API key provided: verywrong ..." - with pytest.raises(ValueError, match="Embedding Provider returned"): + with pytest.raises(AstraDBVectorStoreError, match="verywrong"): v_store.add_texts(["Failing"]) diff --git a/libs/astradb/tests/unit_tests/test_imports.py b/libs/astradb/tests/unit_tests/test_imports.py index 878f4f7..6644743 100644 --- a/libs/astradb/tests/unit_tests/test_imports.py +++ b/libs/astradb/tests/unit_tests/test_imports.py @@ -9,6 +9,7 @@ "AstraDBGraphVectorStore", "AstraDBLoader", "AstraDBVectorStore", + "AstraDBVectorStoreError", "CollectionVectorServiceOptions", ] From 364bea473276fda17ac1cbcca142bf98e24d2569 Mon Sep 17 00:00:00 2001 From: Stefano Lottini Date: Tue, 4 Feb 2025 19:47:20 +0100 Subject: [PATCH 6/6] return of the walrus --- libs/astradb/langchain_astradb/vectorstores.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/libs/astradb/langchain_astradb/vectorstores.py b/libs/astradb/langchain_astradb/vectorstores.py index c19c986..539c494 100644 --- a/libs/astradb/langchain_astradb/vectorstores.py +++ b/libs/astradb/langchain_astradb/vectorstores.py @@ -170,11 +170,8 @@ def _insertmany_error_message(err: InsertManyException) -> str: for edesc in filtered_error_descs[:MAX_SHOWN_INSERTION_ERRORS] ) - if len(filtered_error_descs) > MAX_SHOWN_INSERTION_ERRORS: - err_msg += ( - f". (Note: {len(filtered_error_descs) - MAX_SHOWN_INSERTION_ERRORS}" - " further errors omitted.)" - ) + if (num_residual := len(filtered_error_descs) - MAX_SHOWN_INSERTION_ERRORS) > 0: + err_msg += f". (Note: {num_residual} further errors omitted.)" err_msg += ( " (Full API error in '.__cause__.error_descriptors'"