diff --git a/py_zerox/pyzerox/core/zerox.py b/py_zerox/pyzerox/core/zerox.py index 6b7c8dc..bbce062 100644 --- a/py_zerox/pyzerox/core/zerox.py +++ b/py_zerox/pyzerox/core/zerox.py @@ -131,9 +131,15 @@ async def zerox( "save_directory":temp_directory, "suffix":"_selected_pages"} local_path = await asyncio.to_thread(create_selected_pages_pdf, **subset_pdf_create_kwargs) + + # explicitly pass poppler path via kwargs + if "poppler_path" in kwargs: + poppler_path = kwargs["poppler_path"] + else: + poppler_path = None # Convert the file to a series of images, below function returns a list of image paths in page order - images = await convert_pdf_to_images(image_density=image_density, image_height=image_height, local_path=local_path, temp_dir=temp_directory) + images = await convert_pdf_to_images(image_density=image_density, image_height=image_height, local_path=local_path, temp_dir=temp_directory, poppler_path=poppler_path) if maintain_format: for image in images: @@ -199,4 +205,4 @@ async def zerox( input_tokens=input_token_count, output_tokens=output_token_count, pages=formatted_pages, - ) \ No newline at end of file + ) diff --git a/py_zerox/pyzerox/processor/pdf.py b/py_zerox/pyzerox/processor/pdf.py index af36629..b0dc4e8 100644 --- a/py_zerox/pyzerox/processor/pdf.py +++ b/py_zerox/pyzerox/processor/pdf.py @@ -11,7 +11,7 @@ from ..models import litellmmodel -async def convert_pdf_to_images(image_density: int, image_height: tuple[Optional[int], int], local_path: str, temp_dir: str) -> List[str]: +async def convert_pdf_to_images(image_density: int, image_height: tuple[Optional[int], int], local_path: str, temp_dir: str, poppler_path: str = None) -> List[str]: """Converts a PDF file to a series of images in the temp_dir. Returns a list of image paths in page order.""" options = { "pdf_path": local_path, @@ -22,6 +22,7 @@ async def convert_pdf_to_images(image_density: int, image_height: tuple[Optional "thread_count": PDFConversionDefaultOptions.THREAD_COUNT, "use_pdftocairo": PDFConversionDefaultOptions.USE_PDFTOCAIRO, "paths_only": True, + "poppler_path": poppler_path } try: