From ceb7914a18460e7936e5d8e3139e98248dd67646 Mon Sep 17 00:00:00 2001 From: brightertiger Date: Mon, 23 Dec 2024 12:37:13 +0530 Subject: [PATCH 1/4] Update pdf.py --- py_zerox/pyzerox/processor/pdf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/py_zerox/pyzerox/processor/pdf.py b/py_zerox/pyzerox/processor/pdf.py index af36629..5f6890e 100644 --- a/py_zerox/pyzerox/processor/pdf.py +++ b/py_zerox/pyzerox/processor/pdf.py @@ -11,7 +11,7 @@ from ..models import litellmmodel -async def convert_pdf_to_images(image_density: int, image_height: tuple[Optional[int], int], local_path: str, temp_dir: str) -> List[str]: +async def convert_pdf_to_images(image_density: int, image_height: tuple[Optional[int], int], local_path: str, temp_dir: str, poppler_path: str) -> List[str]: """Converts a PDF file to a series of images in the temp_dir. Returns a list of image paths in page order.""" options = { "pdf_path": local_path, @@ -22,6 +22,7 @@ async def convert_pdf_to_images(image_density: int, image_height: tuple[Optional "thread_count": PDFConversionDefaultOptions.THREAD_COUNT, "use_pdftocairo": PDFConversionDefaultOptions.USE_PDFTOCAIRO, "paths_only": True, + "poppler_path": poppler_path } try: From 256ab4b649e11958c8a520c1618376e8877e02bf Mon Sep 17 00:00:00 2001 From: brightertiger Date: Mon, 23 Dec 2024 12:38:08 +0530 Subject: [PATCH 2/4] Update zerox.py --- py_zerox/pyzerox/core/zerox.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/py_zerox/pyzerox/core/zerox.py b/py_zerox/pyzerox/core/zerox.py index 6b7c8dc..ae0cc61 100644 --- a/py_zerox/pyzerox/core/zerox.py +++ b/py_zerox/pyzerox/core/zerox.py @@ -133,7 +133,7 @@ async def zerox( **subset_pdf_create_kwargs) # Convert the file to a series of images, below function returns a list of image paths in page order - images = await convert_pdf_to_images(image_density=image_density, image_height=image_height, local_path=local_path, temp_dir=temp_directory) + images = await convert_pdf_to_images(image_density=image_density, image_height=image_height, local_path=local_path, temp_dir=temp_directory, poppler_path=poppler_path) if maintain_format: for image in images: @@ -199,4 +199,4 @@ async def zerox( input_tokens=input_token_count, output_tokens=output_token_count, pages=formatted_pages, - ) \ No newline at end of file + ) From 22c9a3e68a2cdc3229050fdea33165519dcd5b03 Mon Sep 17 00:00:00 2001 From: brightertiger Date: Mon, 23 Dec 2024 12:39:33 +0530 Subject: [PATCH 3/4] Update pdf.py --- py_zerox/pyzerox/processor/pdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py_zerox/pyzerox/processor/pdf.py b/py_zerox/pyzerox/processor/pdf.py index 5f6890e..b0dc4e8 100644 --- a/py_zerox/pyzerox/processor/pdf.py +++ b/py_zerox/pyzerox/processor/pdf.py @@ -11,7 +11,7 @@ from ..models import litellmmodel -async def convert_pdf_to_images(image_density: int, image_height: tuple[Optional[int], int], local_path: str, temp_dir: str, poppler_path: str) -> List[str]: +async def convert_pdf_to_images(image_density: int, image_height: tuple[Optional[int], int], local_path: str, temp_dir: str, poppler_path: str = None) -> List[str]: """Converts a PDF file to a series of images in the temp_dir. Returns a list of image paths in page order.""" options = { "pdf_path": local_path, From bc933bee06ff7eb5c38cbd48b787873ba59dd000 Mon Sep 17 00:00:00 2001 From: brightertiger Date: Mon, 23 Dec 2024 12:43:31 +0530 Subject: [PATCH 4/4] Update zerox.py --- py_zerox/pyzerox/core/zerox.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/py_zerox/pyzerox/core/zerox.py b/py_zerox/pyzerox/core/zerox.py index ae0cc61..bbce062 100644 --- a/py_zerox/pyzerox/core/zerox.py +++ b/py_zerox/pyzerox/core/zerox.py @@ -131,6 +131,12 @@ async def zerox( "save_directory":temp_directory, "suffix":"_selected_pages"} local_path = await asyncio.to_thread(create_selected_pages_pdf, **subset_pdf_create_kwargs) + + # explicitly pass poppler path via kwargs + if "poppler_path" in kwargs: + poppler_path = kwargs["poppler_path"] + else: + poppler_path = None # Convert the file to a series of images, below function returns a list of image paths in page order images = await convert_pdf_to_images(image_density=image_density, image_height=image_height, local_path=local_path, temp_dir=temp_directory, poppler_path=poppler_path)