Skip to content

Commit

Permalink
Fix superscripts
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Feb 26, 2025
1 parent f8a3c8c commit e2a2861
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 3 deletions.
8 changes: 6 additions & 2 deletions marker/providers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,10 +205,14 @@ def pdftext_extraction(self, doc: PdfDocument) -> ProviderPageLines:
  font_size = span["font"]["size"] or 0
  polygon = PolygonBox.from_bbox(span["bbox"], ensure_nonzero_area=True)
  span_chars = [Char(char=c['char'], polygon=PolygonBox.from_bbox(c['bbox'], ensure_nonzero_area=True), char_idx=c['char_idx']) for c in span["chars"]]
+ superscript = span.get("superscript", False)
+ text = self.normalize_spaces(fix_text(span["text"]))
+ if superscript:
+ text = text.strip()
  spans.append(
  SpanClass(
  polygon=polygon,
- text=self.normalize_spaces(fix_text(span["text"])),
+ text=text,
  font=font_name,
  font_weight=font_weight,
  font_size=font_size,
Expand All @@ -218,7 +222,7 @@ def pdftext_extraction(self, doc: PdfDocument) -> ProviderPageLines:
  page_id=page_id,
  text_extraction_method="pdftext",
  url=span.get("url"),
- has_superscript=span.get("superscript", False),
+ has_superscript=superscript,
  )
  )
  chars.append(span_chars)
Expand Down
1 change: 0 additions & 1 deletion marker/schema/text/span.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ def assemble_html(self, document, child_blocks, parent_structure):

  # Handle full block superscript
  if "<sup>" not in text:
- print(text)
  text = f"<sup>{text}</sup>"

  if self.url:
Expand Down

0 comments on commit e2a2861

Please sign in to comment.