inference v2

1 month ago · f5b4a0c5ab
parent b10ed46ccc
commit f5b4a0c5ab
2 changed files with 147 additions and 12 deletions
--- a/lib/templates/detect_fields.rb
+++ b/lib/templates/detect_fields.rb
@ -59,8 +59,8 @@ module Templates
    CHECKBOXES = ['☐', '□'].freeze

    # rubocop:disable Metrics, Style
-    def call(io, attachment: nil, confidence: 0.3, temperature: 1, inference: Templates::ImageToFields,
-             nms: 0.1, split_page: false, aspect_ratio: true, padding: 20, regexp_type: true, &)
+    def call(io, attachment: nil, confidence: 0.3, temperature: 1, inference: Templates::ImageToFields, nms: 0.1,
+             split_page: false, aspect_ratio: true, padding: inference.model_v2? ? nil : 20, regexp_type: true, &)
      fields, head_node =
        if attachment&.image?
          process_image_attachment(io, attachment:, confidence:, nms:, split_page:, inference:,
@ -114,7 +114,10 @@ module Templates
      fields = doc.page_count.times.flat_map do |page_number|
        page = doc.get_page(page_number)

-        data, width, height = page.render_to_bitmap(width: inference::RESOLUTION * 1.5)
+        size_key = page.width > page.height ? :width : :height
+        size = padding ? inference.resolution * 1.5 : inference.resolution
+
+        data, width, height = page.render_to_bitmap(size_key => size)

        image = Vips::Image.new_from_memory(data, width, height, 4, :uchar)

@ -126,8 +129,8 @@ module Templates

        fields = sort_fields(fields, y_threshold: 10.0 / image.height)

-        fields = increase_confidence_for_overlapping_fields(fields, text_fields)
-        fields = increase_confidence_for_overlapping_fields(fields, line_fields)
+        fields = increase_confidence_for_overlapping_fields(fields, text_fields, confidence:)
+        fields = increase_confidence_for_overlapping_fields(fields, line_fields, confidence:)

        fields = fields.reject { |f| f.confidence < confidence }

@ -477,10 +480,11 @@ module Templates
      !(box1.endx < box2.x || box2.endx < box1.x || box1.endy < box2.y || box2.endy < box1.y)
    end

-    def increase_confidence_for_overlapping_fields(image_fields, text_fields, by: 1.0)
+    def increase_confidence_for_overlapping_fields(image_fields, text_fields, confidence: 1, by: 1.0)
      return image_fields if text_fields.blank?

      image_fields.map do |image_field|
+        next if image_field.confidence >= confidence
        next if image_field.type != 'text'

        text_fields.each do |text_field|
--- a/lib/templates/image_to_fields.rb
+++ b/lib/templates/image_to_fields.rb
@ -16,7 +16,7 @@ module Templates

    MODEL_PATH = Rails.root.join('tmp/model.onnx')

-    RESOLUTION = 704
+    INPUT_NAMES = %w[images input].freeze

    ID_TO_CLASS = %w[text checkbox].freeze

@ -27,12 +27,14 @@ module Templates

    # rubocop:disable Metrics
    def call(image, confidence: 0.3, nms: 0.1, temperature: 1,
-             split_page: false, aspect_ratio: true, padding: nil, resolution: RESOLUTION)
-      base_image = image.extract_band(0, n: 3)
+             split_page: false, aspect_ratio: true, padding: nil, resolution: self.resolution)
+      image = image.extract_band(0, n: 3) if image.bands > 3

-      trimmed_base, base_offset_x, base_offset_y = trim_image_with_padding(base_image, padding)
+      trimmed_base, base_offset_x, base_offset_y = trim_image_with_padding(image, padding)

-      if split_page && image.height > image.width
+      if model_v2?
+        detections = call_v2(trimmed_base, base_offset_x, base_offset_y, split_page, confidence:, resolution:)
+      elsif split_page && image.height > image.width
        regions = build_split_image_regions(trimmed_base)

        detections = { xyxy: Numo::SFloat[], confidence: Numo::SFloat[], class_id: Numo::Int32[] }
@ -71,6 +73,127 @@ module Templates
      build_fields_from_detections(detections, image)
    end

+    # Runs the v2 model (the one taking an 'orig_target_sizes' input) over the image and
+    # returns a detections hash with :xyxy, :confidence and :class_id entries.
+    #
+    # image              - image to run inference on (the caller passes the trimmed page image).
+    # offset_x, offset_y - values added back to box coordinates during postprocessing.
+    # split_page         - when truthy and the image is taller than wide, the image is cut into
+    #                      regions by build_split_image_regions and each region is inferred on
+    #                      its own, with results accumulated into one hash.
+    # confidence:        - score threshold forwarded to postprocess_outputs_v2.
+    # resolution:        - side length forwarded to preprocess_image_v2.
+    def call_v2(image, offset_x, offset_y, split_page, confidence:, resolution:)
+      # Single inference pass shared by both paths; `acc` is the detections hash to merge into
+      # (nil means "return only this pass's detections").
+      infer = lambda do |img, shifted_offset_y, acc|
+        input_tensor, orig_size_tensor, transform_info = preprocess_image_v2(img, resolution)
+
+        outputs = model.predict({ 'images' => input_tensor, 'orig_target_sizes' => orig_size_tensor },
+                                output_type: :numo)
+
+        # Only the first batch item is read: one image is submitted per prediction.
+        boxes = outputs['boxes'][0, true, true]
+        labels = outputs['labels'][0, true]
+        scores = outputs['scores'][0, true]
+
+        postprocess_outputs_v2(boxes, labels, scores, acc,
+                               offset_x:, offset_y: shifted_offset_y,
+                               confidence:, transform_info:)
+      end
+
+      split = split_page && image.height > image.width
+
+      return infer.call(image, offset_y, nil) unless split
+
+      detections = { xyxy: Numo::SFloat[], confidence: Numo::SFloat[], class_id: Numo::Int32[] }
+
+      build_split_image_regions(image).reduce(detections) do |acc, region|
+        # Degenerate regions are skipped and leave the accumulator untouched.
+        next acc if region[:img].height <= 0 || region[:img].width <= 0
+
+        infer.call(region[:img], offset_y + region[:offset_y], acc)
+      end
+    end
+
+    # Builds the v2 model inputs for one image.
+    #
+    # The image is reduced to its first three bands when it has more, scaled by a single ratio so
+    # it fits inside a resolution x resolution square, placed on a black square canvas using
+    # half of the leftover space as the left/top offset, and divided by 255.
+    #
+    # Returns a three-element array:
+    #   * input tensor reshaped to (1, 3, resolution, resolution),
+    #   * Numo::Int64 [[resolution, resolution]] passed to the model as 'orig_target_sizes',
+    #   * { ratio:, pad_w:, pad_h: } needed to map boxes back in postprocess_outputs_v2.
+    def preprocess_image_v2(image, resolution)
+      rgb = image.bands > 3 ? image.extract_band(0, n: 3) : image
+
+      scale = [rgb.width, rgb.height].map { |side| resolution.to_f / side }.min
+      scaled_width = (rgb.width * scale).to_i
+      scaled_height = (rgb.height * scale).to_i
+
+      rgb = rgb.resize(scale, vscale: scale, kernel: :linear) unless scale == 1
+
+      pad_w, pad_h = [scaled_width, scaled_height].map { |side| (resolution - side) / 2 }
+
+      canvas = rgb.embed(pad_w, pad_h, resolution, resolution, background: [0, 0, 0]) / 255.0
+
+      # Raw buffer is read as height x width x channel, then reordered to channel-first.
+      pixels = Numo::SFloat.from_binary(canvas.write_to_memory, [resolution, resolution, 3])
+      input_tensor = pixels.transpose(2, 0, 1).reshape(1, 3, resolution, resolution)
+
+      [
+        input_tensor,
+        Numo::Int64[[resolution, resolution]],
+        { ratio: scale, pad_w: pad_w, pad_h: pad_h }
+      ]
+    end
+
+    # Filters v2 model outputs by score and maps the surviving boxes back to source coordinates.
+    #
+    # boxes, labels, scores - per-detection model outputs for one image.
+    # detections            - optional accumulator hash (:xyxy, :confidence, :class_id); when
+    #                         given, the new detections are appended after its existing rows.
+    # offset_x:, offset_y:  - added to x and y coordinates after undoing padding and scaling.
+    # confidence:           - only scores strictly greater than this value are kept.
+    # transform_info:       - { ratio:, pad_w:, pad_h: } as produced by preprocess_image_v2.
+    #
+    # Returns a hash with :xyxy, :confidence and :class_id. When nothing passes the threshold the
+    # accumulator is returned unchanged, or an empty detections hash if there is none.
+    def postprocess_outputs_v2(boxes, labels, scores, detections = nil, offset_x:, offset_y:, confidence:,
+                               transform_info:)
+      kept = scores.gt(confidence).where
+
+      if kept.empty?
+        return detections || { xyxy: Numo::SFloat[], confidence: Numo::SFloat[], class_id: Numo::Int32[] }
+      end
+
+      kept_scores = scores[kept]
+      kept_labels = labels[kept]
+      kept_boxes = boxes[kept, true]
+
+      scale = transform_info[:ratio]
+      x_pad = transform_info[:pad_w]
+      y_pad = transform_info[:pad_h]
+
+      # Columns are x1, y1, x2, y2: remove the canvas padding, undo the resize, re-apply the offset.
+      [[0, x_pad, offset_x], [1, y_pad, offset_y], [2, x_pad, offset_x], [3, y_pad, offset_y]].each do |col, pad, shift|
+        kept_boxes[true, col] = ((kept_boxes[true, col] - pad) / scale) + shift
+      end
+
+      return { xyxy: kept_boxes, confidence: kept_scores, class_id: kept_labels } unless detections
+
+      existing_n = detections[:xyxy].shape[0]
+      total = existing_n + kept_boxes.shape[0]
+
+      merged = {
+        xyxy: Numo::SFloat.zeros(total, 4),
+        confidence: Numo::SFloat.zeros(total),
+        class_id: Numo::Int32.zeros(total)
+      }
+
+      # The initial accumulator holds zero-length arrays, which are not copied.
+      if existing_n.positive?
+        head = 0...existing_n
+
+        merged[:xyxy][head, true] = detections[:xyxy]
+        merged[:confidence][head] = detections[:confidence]
+        merged[:class_id][head] = detections[:class_id]
+      end
+
+      tail = existing_n...total
+
+      merged[:xyxy][tail, true] = kept_boxes
+      merged[:confidence][tail] = kept_scores
+      merged[:class_id][tail] = kept_labels
+
+      merged
+    end
+
    def build_split_image_regions(image)
      half_h = image.height / 2
      top_h = half_h
@ -212,7 +335,7 @@ module Templates
    end

    def postprocess_outputs(boxes, logits, transform_info, detections = nil, confidence: 0.3, temperature: 1,
-                            resolution: RESOLUTION)
+                            resolution: self.resolution)
      scaled_logits = logits / temperature

      probs = 1.0 / (1.0 + Numo::NMath.exp(-scaled_logits))
@ -326,6 +449,14 @@ module Templates
        providers: ['CPUExecutionProvider']
      )
    end
+
+    # Returns the value at index 2 of the shape of the first model input whose name is listed in
+    # INPUT_NAMES, memoized in @resolution.
+    # NOTE(review): presumably the side length of the model's square image input — confirm
+    # against the exported model.
+    def resolution
+      return @resolution if @resolution
+
+      image_input = model.inputs.find { |input| INPUT_NAMES.include?(input[:name]) }
+
+      @resolution = image_input.dig(:shape, 2)
+    end
+
+    # Reports whether the loaded model declares an 'orig_target_sizes' input; callers use this to
+    # choose the v2 inference path.
+    #
+    # The result is cached behind a defined? guard rather than ||= so that a false answer is
+    # remembered too, instead of re-reading model.inputs on every call.
+    def model_v2?
+      return @model_v2 if defined?(@model_v2)
+
+      @model_v2 = model.inputs.pluck(:name).include?('orig_target_sizes')
+    end
    # rubocop:enable Metrics
  end
 end