From 98f5bba8324a6e9ec8ed96ef38f06c26750a661a Mon Sep 17 00:00:00 2001 From: Pete Matsyburka Date: Mon, 15 Jan 2024 01:44:56 +0200 Subject: [PATCH] update pdf processor --- lib/pdf_processor.rb | 58 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/lib/pdf_processor.rb b/lib/pdf_processor.rb index 27257eea..c929dcc4 100644 --- a/lib/pdf_processor.rb +++ b/lib/pdf_processor.rb @@ -1,39 +1,71 @@ # frozen_string_literal: true class PdfProcessor < HexaPDF::Content::Processor - attr_reader :result, :contents - attr_accessor :handler + attr_accessor :handler, :serializer + + class ParseTextHandler + attr_accessor :pos, :num, :search_chars, :handler + + def initialize(handler) + @num = 0 + @pos = 0 + + @handler = handler + @search_chars = handler.search_chars + end + + TJS = %i[TJ Tj].freeze + + def call(processor, operator, operands) + return unless TJS.include?(operator) + + processor.send(:decode_text, *operands).chars.each do |char| + handler.tokens << [char, [@num, @pos]] if search_chars.include?(char) + + @pos += 1 + end + + @pos = 0 + @num += 1 + end + end def initialize(page) super - @contents = ''.b - @result = [] - @serializer = HexaPDF::Serializer.new end + def serialize(operator, operands) + operators[operator].serialize(serializer, *operands) + end + def process(operator, operands = []) super - contents << @operators[operator].serialize( - @serializer, - *handler.call(self, operator, operands) - ) + handler.call(self, operator, operands) end - def self.call(data, pdf_handler, result_handler, acc = {}) + def self.call(data, process_handler, result_handler, acc = {}) doc = HexaPDF::Document.new(io: StringIO.new(data)) doc.pages.each do |page| processor = PdfProcessor.new(page) - processor.handler = pdf_handler + process_handler_instance = process_handler.new + processor.handler = ParseTextHandler.new(process_handler_instance) page.process_contents(processor) - page.contents = processor.contents + if process_handler_instance.tokens? + processor = PdfProcessor.new(page) + processor.handler = process_handler_instance + + page.process_contents(processor) + + page.contents = process_handler_instance.contents + end - processor.result.each do |item| + process_handler_instance.result.each do |item| result_handler.call(item, page, acc) end end