Skip to content

Commit

Permalink
Merge pull request #3749 from mlibrary/HELIO-4784/replace-origami-with-qpdf
Browse files Browse the repository at this point in the history

HELIO-4784 remove Origami gem and use qpdf json to get pdf chapter metadata instead
  • Loading branch information
conorom authored Feb 17, 2025
2 parents 5f03ab9 + 00a9a06 commit c862996
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 73 deletions.
2 changes: 1 addition & 1 deletion .circleci/.force_rebuild
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# modify this file to force circleci to rebuild
2024-02-08.2
2025-02-13.1
3 changes: 0 additions & 3 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,6 @@ gem "nokogiri", ">= 1.13.6"

gem "okcomputer", "~> 1.18.4"

# Read PDF ToC
gem 'origami'

# Force epub search results to be sentences
gem 'pragmatic_segmenter', '~> 0.3'

Expand Down
4 changes: 0 additions & 4 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,6 @@ GEM
execjs
coffee-script-source (1.12.2)
colorator (1.1.0)
colorize (0.8.1)
commonjs (0.2.7)
concurrent-ruby (1.2.3)
config (5.1.0)
Expand Down Expand Up @@ -741,8 +740,6 @@ GEM
openurl (1.0.0)
marc
scrub_rb (~> 1.0)
origami (2.1.0)
colorize (~> 0.7)
orm_adapter (0.5.0)
os (1.1.4)
ostruct (0.6.0)
Expand Down Expand Up @@ -1302,7 +1299,6 @@ DEPENDENCIES
oauth
oauth2 (~> 1.2)
okcomputer (~> 1.18.4)
origami
pragmatic_segmenter (~> 0.3)
prawn (~> 2.2)
pry-rails
Expand Down
5 changes: 0 additions & 5 deletions lib/pdf_ebook.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,6 @@ def self.configure
end
end

#
# Require Dependencies
#
require 'origami'

#
# Require Relative
#
Expand Down
90 changes: 30 additions & 60 deletions lib/pdf_ebook/publication.rb
Original file line number Diff line number Diff line change
@@ -1,87 +1,58 @@
# frozen_string_literal: true

require "skylight"
require 'open3'

module PDFEbook
class Publication
include Skylight::Helpers
private_class_method :new
attr_reader :id
attr_reader :id, :path, :outlines

# Class Methods
def self.from_path_id(path, id)
file = File.new(path)
new(file, id)
new(path, id)
rescue StandardError => e
::PDFEbook.logger.info("Publication.from_path_id(#{path},#{id}) raised #{e} #{e.backtrace.join("\n")}")
PublicationNullObject.send(:new)
end

# Public method
def intervals
@intervals ||= extract_intervals
@intervals ||= extract_titles_and_pages(@outlines["outlines"])
end

private

instrument_method
def initialize(file, id)
@pdf = Origami::PDF.read(file, verbosity: Origami::Parser::VERBOSE_QUIET, lazy: true)
@id = id
@obj_to_page = {}
end

instrument_method
def extract_intervals
# Map of PDF page object number to a page number (pages start from 1)
if @obj_to_page.empty?
@pdf.pages.each_with_index do |p, i|
@obj_to_page[p.no] = i + 1
def extract_titles_and_pages(outlines, depth = 1)
intervals = []
index = 0
outlines.each do |outline|
intervals << PDFEbook::Interval.from_title_level_cfi(id, index, outline['title'], depth, "page=#{outline['destpageposfrom1']}")
index += 1
# Recursively process kids if they exist
if outline["kids"].any?
intervals.concat(extract_titles_and_pages(outline["kids"], depth + 1))
end
end
@pdf.Catalog.Outlines.present? ? iterate_outlines(@pdf.Catalog.Outlines[:First]&.solve, 1) : []

# Add an "overall_index" to each Interval; I don't remember why we're doing this
intervals.each_with_index { |interval, i| interval.overall_index = i }
end

# Takes Origami::OutlineItem and 1-based depth
instrument_method
def iterate_outlines(outline, depth) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
intervals = []
index = 0
until outline.nil?
page = nil
page = outline&.[](:A)&.solve&.[](:D)
# HELIO-3717 some "named destinations" have `:Dest` not `:A` here. The sample I'm looking at is PDF v1.3
page ||= outline&.[](:Dest)
def initialize(path, id)
@id = id
@path = path
command = "qpdf --json --json-key=outlines #{@path}"
stdin, stdout, stderr, wait_thr = Open3.popen3(command)
stdin.close
stdout.binmode
out = stdout.read
stdout.close
err = stderr.read
stderr.close

if page.is_a?(Origami::Reference) # skips external links
begin
target = page.solve
rescue Origami::InvalidReferenceError
outline = outline[:Next]&.solve
next
end
page = target
elsif page.is_a?(Origami::LiteralString)
# At this point some ToC entries are "named destinations", essentially strings for some...
# different type of lookup directory than a page number type destination. See HELIO-3377.
page = @pdf.get_destination_by_name(page)
end
raise StandardError.new "ERROR command: \"#{command}\"\n#{err}" unless wait_thr.value.success?

page = page&.[](0)&.solve # gets to Origami::Page
page ||= outline[:Dest]&.solve&.[](0)&.solve
unless page.nil?
page_number = @obj_to_page[page.no] || 0
# HELIO-4768: very rarely `title` is an `Origami::Reference` at this point, for whatever reason
title = outline[:Title].is_a?(Origami::Reference) ? outline[:Title]&.solve : outline[:Title]
intervals << PDFEbook::Interval.from_title_level_cfi(id, index, title.to_utf8, depth, "page=#{page_number}")
index += 1
end
unless outline[:First]&.solve.nil? # Child outline
intervals += iterate_outlines(outline[:First].solve, depth + 1)
end
outline = outline[:Next]&.solve
end
intervals.each_with_index { |interval, i| interval.overall_index = i }
@outlines = JSON.parse(out)
end
end

Expand All @@ -95,9 +66,8 @@ def intervals
private

def initialize
@pdf = ''
@path = ''
@id = ''
@obj_to_page = {}
end
end
end

0 comments on commit c862996

Please sign in to comment.