diff --git a/lib/templates/detect_fields.rb b/lib/templates/detect_fields.rb index 2ade0efd..512d7804 100755 --- a/lib/templates/detect_fields.rb +++ b/lib/templates/detect_fields.rb @@ -59,8 +59,8 @@ module Templates CHECKBOXES = ['☐', '□'].freeze # rubocop:disable Metrics, Style - def call(io, attachment: nil, confidence: 0.3, temperature: 1, inference: Templates::ImageToFields, - nms: 0.1, split_page: false, aspect_ratio: true, padding: 20, regexp_type: true, &) + def call(io, attachment: nil, confidence: 0.3, temperature: 1, inference: Templates::ImageToFields, nms: 0.1, + split_page: false, aspect_ratio: true, padding: inference.model_v2? ? nil : 20, regexp_type: true, &) fields, head_node = if attachment&.image? process_image_attachment(io, attachment:, confidence:, nms:, split_page:, inference:, @@ -114,7 +114,10 @@ module Templates fields = doc.page_count.times.flat_map do |page_number| page = doc.get_page(page_number) - data, width, height = page.render_to_bitmap(width: inference::RESOLUTION * 1.5) + size_key = page.width > page.height ? :width : :height + size = padding ? inference.resolution * 1.5 : inference.resolution + + data, width, height = page.render_to_bitmap(size_key => size) image = Vips::Image.new_from_memory(data, width, height, 4, :uchar) @@ -126,8 +129,8 @@ module Templates fields = sort_fields(fields, y_threshold: 10.0 / image.height) - fields = increase_confidence_for_overlapping_fields(fields, text_fields) - fields = increase_confidence_for_overlapping_fields(fields, line_fields) + fields = increase_confidence_for_overlapping_fields(fields, text_fields, confidence:) + fields = increase_confidence_for_overlapping_fields(fields, line_fields, confidence:) fields = fields.reject { |f| f.confidence < confidence } @@ -477,10 +480,11 @@ module Templates !(box1.endx < box2.x || box2.endx < box1.x || box1.endy < box2.y || box2.endy < box1.y) end - def increase_confidence_for_overlapping_fields(image_fields, text_fields, by: 1.0) + def increase_confidence_for_overlapping_fields(image_fields, text_fields, confidence: 1, by: 1.0) return image_fields if text_fields.blank? image_fields.map do |image_field| + next if image_field.confidence >= confidence next if image_field.type != 'text' text_fields.each do |text_field| diff --git a/lib/templates/image_to_fields.rb b/lib/templates/image_to_fields.rb index d9c04221..a2dcc201 100755 --- a/lib/templates/image_to_fields.rb +++ b/lib/templates/image_to_fields.rb @@ -16,7 +16,7 @@ module Templates MODEL_PATH = Rails.root.join('tmp/model.onnx') - RESOLUTION = 704 + INPUT_NAMES = %w[images input].freeze ID_TO_CLASS = %w[text checkbox].freeze @@ -27,12 +27,14 @@ module Templates # rubocop:disable Metrics def call(image, confidence: 0.3, nms: 0.1, temperature: 1, - split_page: false, aspect_ratio: true, padding: nil, resolution: RESOLUTION) - base_image = image.extract_band(0, n: 3) + split_page: false, aspect_ratio: true, padding: nil, resolution: self.resolution) + image = image.extract_band(0, n: 3) if image.bands > 3 - trimmed_base, base_offset_x, base_offset_y = trim_image_with_padding(base_image, padding) + trimmed_base, base_offset_x, base_offset_y = trim_image_with_padding(image, padding) - if split_page && image.height > image.width + if model_v2? + detections = call_v2(trimmed_base, base_offset_x, base_offset_y, split_page, confidence:, resolution:) + elsif split_page && image.height > image.width regions = build_split_image_regions(trimmed_base) detections = { xyxy: Numo::SFloat[], confidence: Numo::SFloat[], class_id: Numo::Int32[] } @@ -71,6 +73,127 @@ module Templates build_fields_from_detections(detections, image) end + def call_v2(image, offset_x, offset_y, split_page, confidence:, resolution:) + if split_page && image.height > image.width + regions = build_split_image_regions(image) + + detections = { xyxy: Numo::SFloat[], confidence: Numo::SFloat[], class_id: Numo::Int32[] } + + regions.reduce(detections) do |acc, r| + next acc if r[:img].height <= 0 || r[:img].width <= 0 + + input_tensor, orig_size_tensor, transform_info = preprocess_image_v2(r[:img], resolution) + + outputs = model.predict({ 'images' => input_tensor, 'orig_target_sizes' => orig_size_tensor }, + output_type: :numo) + + boxes = outputs['boxes'][0, true, true] + labels = outputs['labels'][0, true] + scores = outputs['scores'][0, true] + + postprocess_outputs_v2(boxes, labels, scores, acc, + offset_x:, offset_y: offset_y + r[:offset_y], + confidence:, transform_info:) + end + else + input_tensor, orig_size_tensor, transform_info = preprocess_image_v2(image, resolution) + + outputs = model.predict({ 'images' => input_tensor, 'orig_target_sizes' => orig_size_tensor }, + output_type: :numo) + + boxes = outputs['boxes'][0, true, true] + labels = outputs['labels'][0, true] + scores = outputs['scores'][0, true] + + postprocess_outputs_v2(boxes, labels, scores, offset_x:, offset_y:, + confidence:, transform_info:) + end + end + + def preprocess_image_v2(image, resolution) + image = image.extract_band(0, n: 3) if image.bands > 3 + + ratio = [resolution.to_f / image.width, resolution.to_f / image.height].min + new_width = (image.width * ratio).to_i + new_height = (image.height * ratio).to_i + + image = image.resize(ratio, vscale: ratio, kernel: :linear) if ratio != 1 + + pad_w = (resolution - new_width) / 2 + pad_h = (resolution - new_height) / 2 + + padded = image.embed(pad_w, pad_h, resolution, resolution, background: [0, 0, 0]) + + padded /= 255.0 + + img_array = Numo::SFloat.from_binary(padded.write_to_memory, [resolution, resolution, 3]) + + img_array = img_array.transpose(2, 0, 1) + + input_tensor = img_array.reshape(1, 3, resolution, resolution) + + orig_size_tensor = Numo::Int64[[resolution, resolution]] + + transform_info = { ratio: ratio, pad_w: pad_w, pad_h: pad_h } + + [input_tensor, orig_size_tensor, transform_info] + end + + def postprocess_outputs_v2(boxes, labels, scores, detections = nil, offset_x:, offset_y:, confidence:, + transform_info:) + keep_mask = scores.gt(confidence) + keep_indices = keep_mask.where + + if keep_indices.empty? + detections || { + xyxy: Numo::SFloat[], + confidence: Numo::SFloat[], + class_id: Numo::Int32[] + } + else + scores = scores[keep_indices] + labels = labels[keep_indices] + boxes_xyxy = boxes[keep_indices, true] + + ratio = transform_info[:ratio] + pad_w = transform_info[:pad_w] + pad_h = transform_info[:pad_h] + + boxes_xyxy[true, 0] = ((boxes_xyxy[true, 0] - pad_w) / ratio) + offset_x + boxes_xyxy[true, 1] = ((boxes_xyxy[true, 1] - pad_h) / ratio) + offset_y + boxes_xyxy[true, 2] = ((boxes_xyxy[true, 2] - pad_w) / ratio) + offset_x + boxes_xyxy[true, 3] = ((boxes_xyxy[true, 3] - pad_h) / ratio) + offset_y + + if detections + existing_n = detections[:xyxy].shape[0] + new_n = boxes_xyxy.shape[0] + total = existing_n + new_n + + xyxy = Numo::SFloat.zeros(total, 4) + conf = Numo::SFloat.zeros(total) + cls = Numo::Int32.zeros(total) + + if existing_n.positive? + xyxy[0...existing_n, true] = detections[:xyxy] + conf[0...existing_n] = detections[:confidence] + cls[0...existing_n] = detections[:class_id] + end + + xyxy[existing_n...(existing_n + new_n), true] = boxes_xyxy + conf[existing_n...(existing_n + new_n)] = scores + cls[existing_n...(existing_n + new_n)] = labels + + { xyxy: xyxy, confidence: conf, class_id: cls } + else + { + xyxy: boxes_xyxy, + confidence: scores, + class_id: labels + } + end + end + end + def build_split_image_regions(image) half_h = image.height / 2 top_h = half_h @@ -212,7 +335,7 @@ module Templates end def postprocess_outputs(boxes, logits, transform_info, detections = nil, confidence: 0.3, temperature: 1, - resolution: RESOLUTION) + resolution: self.resolution) scaled_logits = logits / temperature probs = 1.0 / (1.0 + Numo::NMath.exp(-scaled_logits)) @@ -326,6 +449,14 @@ module Templates providers: ['CPUExecutionProvider'] ) end + + def resolution + @resolution ||= model.inputs.find { |i| INPUT_NAMES.include?(i[:name]) }.dig(:shape, 2) + end + + def model_v2? + @model_v2 ||= model.inputs.pluck(:name).include?('orig_target_sizes') + end # rubocop:enable Metrics end end