mirror of https://github.com/docusealco/docuseal
				
				
				
			
			You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							97 lines
						
					
					
						
							2.2 KiB
						
					
					
				
			
		
		
	
	
							97 lines
						
					
					
						
							2.2 KiB
						
					
					
				# frozen_string_literal: true
 | 
						|
 | 
						|
class PdfProcessor < HexaPDF::Content::Processor
 | 
						|
  attr_accessor :handler, :serializer
 | 
						|
 | 
						|
  class ParseTextHandler
 | 
						|
    attr_accessor :pos, :num, :search_chars, :handler
 | 
						|
 | 
						|
    def initialize(handler)
 | 
						|
      @num = 0
 | 
						|
      @pos = 0
 | 
						|
 | 
						|
      @handler = handler
 | 
						|
      @search_chars = handler.search_chars
 | 
						|
    end
 | 
						|
 | 
						|
    TJS = %i[TJ Tj].freeze
 | 
						|
 | 
						|
    def call(processor, operator, operands)
 | 
						|
      return unless TJS.include?(operator)
 | 
						|
 | 
						|
      processor.send(:decode_text, *operands).chars.each do |char|
 | 
						|
        handler.tokens << [char, [@num, @pos]] if search_chars.include?(char)
 | 
						|
 | 
						|
        @pos += 1
 | 
						|
      end
 | 
						|
 | 
						|
      @pos = 0
 | 
						|
      @num += 1
 | 
						|
    rescue HexaPDF::Error => e
 | 
						|
      Rails.logger.error(e.message)
 | 
						|
 | 
						|
      @pos = 0
 | 
						|
      @num += 1
 | 
						|
    end
 | 
						|
  end
 | 
						|
 | 
						|
  def initialize(page)
 | 
						|
    super
 | 
						|
 | 
						|
    @serializer = HexaPDF::Serializer.new
 | 
						|
  end
 | 
						|
 | 
						|
  def serialize(operator, operands)
 | 
						|
    operators[operator].serialize(serializer, *operands)
 | 
						|
  end
 | 
						|
 | 
						|
  def process(operator, operands = [])
 | 
						|
    super
 | 
						|
 | 
						|
    handler.call(self, operator, operands)
 | 
						|
  end
 | 
						|
 | 
						|
  def self.call(data, process_handler, result_handler, acc = {}, remove_tags: true)
 | 
						|
    doc = HexaPDF::Document.new(io: StringIO.new(data))
 | 
						|
 | 
						|
    doc.pages.each do |page|
 | 
						|
      processor = PdfProcessor.new(page)
 | 
						|
      process_handler_instance = process_handler.new
 | 
						|
      processor.handler = ParseTextHandler.new(process_handler_instance)
 | 
						|
 | 
						|
      page.process_contents(processor)
 | 
						|
 | 
						|
      if process_handler_instance.tokens?
 | 
						|
        processor = PdfProcessor.new(page)
 | 
						|
        processor.handler = process_handler_instance
 | 
						|
 | 
						|
        page.process_contents(processor)
 | 
						|
 | 
						|
        page.contents = process_handler_instance.contents if process_handler_instance.result.present? && remove_tags
 | 
						|
      end
 | 
						|
 | 
						|
      page[:Annots].to_a.each do |annot|
 | 
						|
        next unless annot
 | 
						|
 | 
						|
        text = annot[:Contents].to_s.squish
 | 
						|
 | 
						|
        next unless text.starts_with?('{{') && text.ends_with?('}}')
 | 
						|
 | 
						|
        result_handler.call({ text:, rect: annot[:Rect] }, page, acc)
 | 
						|
 | 
						|
        page[:Annots].delete(annot)
 | 
						|
      end
 | 
						|
 | 
						|
      process_handler_instance.result.each do |item|
 | 
						|
        result_handler.call(item, page, acc)
 | 
						|
      end
 | 
						|
    end
 | 
						|
 | 
						|
    new_io = StringIO.new
 | 
						|
 | 
						|
    doc.write(new_io, validate: false)
 | 
						|
 | 
						|
    [new_io.tap(&:rewind).read, acc]
 | 
						|
  end
 | 
						|
end
 |