From a265ca17167bbc5683056b8bc2355e45f04ad7cc Mon Sep 17 00:00:00 2001 From: Pete Matsyburka Date: Thu, 16 May 2024 13:37:23 +0300 Subject: [PATCH] remove pdf processor --- lib/pdf_processor.rb | 96 -------------------------------------------- 1 file changed, 96 deletions(-) delete mode 100644 lib/pdf_processor.rb diff --git a/lib/pdf_processor.rb b/lib/pdf_processor.rb deleted file mode 100644 index 3e08768a..00000000 --- a/lib/pdf_processor.rb +++ /dev/null @@ -1,96 +0,0 @@ -# frozen_string_literal: true - -class PdfProcessor < HexaPDF::Content::Processor - attr_accessor :handler, :serializer - - class ParseTextHandler - attr_accessor :pos, :num, :search_chars, :handler - - def initialize(handler) - @num = 0 - @pos = 0 - - @handler = handler - @search_chars = handler.search_chars - end - - TJS = %i[TJ Tj].freeze - - def call(processor, operator, operands) - return unless TJS.include?(operator) - - processor.send(:decode_text, *operands).chars.each do |char| - handler.tokens << [char, [@num, @pos]] if search_chars.include?(char) - - @pos += 1 - end - - @pos = 0 - @num += 1 - rescue HexaPDF::Error => e - Rails.logger.error(e.message) - - @pos = 0 - @num += 1 - end - end - - def initialize(page) - super - - @serializer = HexaPDF::Serializer.new - end - - def serialize(operator, operands) - operators[operator].serialize(serializer, *operands) - end - - def process(operator, operands = []) - super - - handler.call(self, operator, operands) - end - - def self.call(data, process_handler, result_handler, acc = {}, remove_tags: true) - doc = HexaPDF::Document.new(io: StringIO.new(data)) - - doc.pages.each do |page| - processor = PdfProcessor.new(page) - process_handler_instance = process_handler.new - processor.handler = ParseTextHandler.new(process_handler_instance) - - page.process_contents(processor) - - if process_handler_instance.tokens? - processor = PdfProcessor.new(page) - processor.handler = process_handler_instance - - page.process_contents(processor) - - page.contents = process_handler_instance.contents if process_handler_instance.result.present? && remove_tags - end - - page[:Annots].to_a.each do |annot| - next unless annot - - text = annot[:Contents].to_s.squish - - next unless text.starts_with?('{{') && text.ends_with?('}}') - - result_handler.call({ text:, rect: annot[:Rect] }, page, acc) - - page[:Annots].delete(annot) - end - - process_handler_instance.result.each do |item| - result_handler.call(item, page, acc) - end - end - - new_io = StringIO.new - - doc.write(new_io, validate: false) - - [new_io.tap(&:rewind).read, acc] - end -end