diff --git a/lib/pdfium.rb b/lib/pdfium.rb index 464f95e2..b43c3b20 100644 --- a/lib/pdfium.rb +++ b/lib/pdfium.rb @@ -39,8 +39,17 @@ class Pdfium FPDF_RENDER_FORCEHALFTONE = 0x400 FPDF_PRINTING = 0x800 - TextNode = Struct.new(:content, :x, :y, :w, :h, keyword_init: true) - LineNode = Struct.new(:x, :y, :w, :h, :tilt, keyword_init: true) + TextNode = Struct.new(:content, :x, :y, :w, :h) do + def endx + @endx ||= x + w + end + + def endy + @endy ||= y + h + end + end + + LineNode = Struct.new(:x, :y, :w, :h, :tilt) # rubocop:disable Naming/ClassAndModuleCamelCase class FPDF_LIBRARY_CONFIG < FFI::Struct @@ -433,15 +442,15 @@ class Pdfium return @text_nodes if char_count.zero? - char_count.times do |i| - unicode = Pdfium.FPDFText_GetUnicode(text_page, i) - - char = [unicode].pack('U*') + left_ptr = FFI::MemoryPointer.new(:double) + right_ptr = FFI::MemoryPointer.new(:double) + bottom_ptr = FFI::MemoryPointer.new(:double) + top_ptr = FFI::MemoryPointer.new(:double) + origin_x_ptr = FFI::MemoryPointer.new(:double) + origin_y_ptr = FFI::MemoryPointer.new(:double) - left_ptr = FFI::MemoryPointer.new(:double) - right_ptr = FFI::MemoryPointer.new(:double) - bottom_ptr = FFI::MemoryPointer.new(:double) - top_ptr = FFI::MemoryPointer.new(:double) + char_count.times do |i| + char = Pdfium.FPDFText_GetUnicode(text_page, i).chr(Encoding::UTF_8) result = Pdfium.FPDFText_GetCharBox(text_page, i, left_ptr, right_ptr, bottom_ptr, top_ptr) @@ -450,12 +459,10 @@ class Pdfium left = left_ptr.read_double right = right_ptr.read_double - origin_x_ptr = FFI::MemoryPointer.new(:double) - origin_y_ptr = FFI::MemoryPointer.new(:double) - Pdfium.FPDFText_GetCharOrigin(text_page, i, origin_x_ptr, origin_y_ptr) origin_y = origin_y_ptr.read_double + origin_x = origin_x_ptr.read_double font_size = Pdfium.FPDFText_GetFontSize(text_page, i) font_size = 8 if font_size == 1 @@ -465,12 +472,12 @@ class Pdfium abs_width = right - left abs_height = font_size - x = abs_x / width + x = origin_x / width y = abs_y / height - node_width = abs_width / width + node_width = (abs_width + ((abs_x - origin_x).abs * 2)) / width node_height = abs_height / height - @text_nodes << TextNode.new(content: char, x: x, y: y, w: node_width, h: node_height) + @text_nodes << TextNode.new(char, x, y, node_width, node_height) end @text_nodes = @text_nodes.sort { |a, b| a.y == b.y ? a.x <=> b.x : a.y <=> b.y } @@ -539,7 +546,7 @@ class Pdfium norm_w = w / width norm_h = h / height - @line_nodes << LineNode.new(x: norm_x, y: norm_y, w: norm_w, h: norm_h, tilt: tilt) + @line_nodes << LineNode.new(norm_x, norm_y, norm_w, norm_h, tilt) end @line_nodes = @line_nodes.sort { |a, b| a.y == b.y ? a.x <=> b.x : a.y <=> b.y } diff --git a/lib/templates/image_to_fields.rb b/lib/templates/image_to_fields.rb index 786e9785..72c578a7 100755 --- a/lib/templates/image_to_fields.rb +++ b/lib/templates/image_to_fields.rb @@ -4,7 +4,11 @@ module Templates module ImageToFields module_function - Field = Struct.new(:type, :x, :y, :w, :h, :confidence, keyword_init: true) + Field = Struct.new(:type, :x, :y, :w, :h, :confidence, keyword_init: true) do + def endy + @endy ||= y + h + end + end MODEL_PATH = Rails.root.join('tmp/model.onnx') @@ -299,7 +303,7 @@ module Templates end def sort_fields(fields, y_threshold: 0.01) - sorted_fields = fields.sort { |a, b| a.y == b.y ? a.x <=> b.x : a.y <=> b.y } + sorted_fields = fields.sort { |a, b| a.endy == b.endy ? a.x <=> b.x : a.endy <=> b.endy } lines = [] current_line = []