Skip to content

Commit

Permalink
Fix strip existing ocr bug
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Mar 4, 2025
1 parent b586f78 commit e84534c
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 5 deletions.
4 changes: 2 additions & 2 deletions marker/providers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ def check_page(self, page_id: int, doc: PdfDocument) -> bool:

# we also skip pages without embedded fonts and fonts without names
non_embedded_fonts.append(pdfium_c.FPDFFont_GetIsEmbedded(font) == 0)
- empty_fonts.append(not font_name or font_name == "GlyphLessFont")
+ empty_fonts.append("glyphless" in font_name.lower()) # Add font name check back in when we bump pypdfium2
if font_name not in font_map:
font_map[font_name or 'Unknown'] = font

Expand Down Expand Up @@ -363,7 +363,7 @@ def _get_fontname(font) -> str:
font_name_buffer = ctypes.create_string_buffer(length)
pdfium_c.FPDFFont_GetBaseFontName(font, font_name_buffer, length)
font_name = font_name_buffer.value.decode("utf-8")
- except:
+ except Exception as e:
pass

return font_name
4 changes: 2 additions & 2 deletions marker/scripts/streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
]

with open(
- os.path.join(os.path.dirname(__file__), "streamlit_app_blocks_viz.html")
+ os.path.join(os.path.dirname(__file__), "streamlit_app_blocks_viz.html"), encoding="utf-8"
) as f:
BLOCKS_VIZ_TMPL = string.Template(f.read())

Expand Down Expand Up @@ -179,7 +179,7 @@ def block_display(image: Image, blocks: dict | None = None, dpi=96):
Find the project [here](https://github.com/VikParuchuri/marker).
""")

- in_file: UploadedFile = st.sidebar.file_uploader("PDF or image file:", type=["pdf", "png", "jpg", "jpeg", "gif"])
+ in_file: UploadedFile = st.sidebar.file_uploader("PDF, document, or image file:", type=["pdf", "png", "jpg", "jpeg", "gif", "pptx", "docx", "xlsx", "html", "epub"])

if in_file is None:
st.stop()
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
- version = "1.6.0"
+ version = "1.6.1"
description = "Convert documents to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
Expand Down

0 comments on commit e84534c

Please sign in to comment.