From bf6afc5e61bd3ce2092af6b4816f6b24ecad4201 Mon Sep 17 00:00:00 2001 From: Pete Matsyburka Date: Mon, 3 Nov 2025 17:00:52 +0200 Subject: [PATCH] adjust detection --- app/javascript/template_builder/fields.vue | 6 ++- lib/templates/detect_fields.rb | 22 +++++---- lib/templates/image_to_fields.rb | 57 +++++++++++++--------- 3 files changed, 50 insertions(+), 35 deletions(-) diff --git a/app/javascript/template_builder/fields.vue b/app/javascript/template_builder/fields.vue index 25a0d531..87209a3f 100644 --- a/app/javascript/template_builder/fields.vue +++ b/app/javascript/template_builder/fields.vue @@ -461,7 +461,11 @@ export default { const jsonStr = line.replace(/^data: /, '') const data = JSON.parse(jsonStr) - if (data.completed) { + if (data.error) { + alert(data.error) + + break + } else if (data.completed) { this.fieldPagesLoaded = null this.template.fields = fields this.save() diff --git a/lib/templates/detect_fields.rb b/lib/templates/detect_fields.rb index d9a81180..4eb90d55 100755 --- a/lib/templates/detect_fields.rb +++ b/lib/templates/detect_fields.rb @@ -7,23 +7,23 @@ module Templates TextFieldBox = Struct.new(:x, :y, :w, :h, keyword_init: true) # rubocop:disable Metrics - def call(io, attachment: nil, confidence: 0.3, temperature: 1, + def call(io, attachment: nil, confidence: 0.3, temperature: 1, inference: Templates::ImageToFields, nms: 0.1, split_page: false, aspect_ratio: true, padding: 20, &) if attachment&.image? - process_image_attachment(io, attachment:, confidence:, nms:, split_page:, + process_image_attachment(io, attachment:, confidence:, nms:, split_page:, inference:, temperature:, aspect_ratio:, padding:, &) else - process_pdf_attachment(io, attachment:, confidence:, nms:, split_page:, + process_pdf_attachment(io, attachment:, confidence:, nms:, split_page:, inference:, temperature:, aspect_ratio:, padding:, &) end end - def process_image_attachment(io, attachment:, confidence:, nms:, temperature: 1, + def process_image_attachment(io, attachment:, confidence:, nms:, temperature:, inference:, split_page: false, aspect_ratio: false, padding: nil) image = Vips::Image.new_from_buffer(io.read, '') - fields = Templates::ImageToFields.call(image, confidence:, nms:, split_page:, - temperature:, aspect_ratio:, padding:) + fields = inference.call(image, confidence:, nms:, split_page:, + temperature:, aspect_ratio:, padding:) fields = fields.map do |f| { @@ -47,19 +47,19 @@ module Templates fields end - def process_pdf_attachment(io, attachment:, confidence:, nms:, temperature: 1, + def process_pdf_attachment(io, attachment:, confidence:, nms:, temperature:, inference:, split_page: false, aspect_ratio: false, padding: nil) doc = Pdfium::Document.open_bytes(io.read) doc.page_count.times.flat_map do |page_number| page = doc.get_page(page_number) - data, width, height = page.render_to_bitmap(width: ImageToFields::RESOLUTION * 1.5) + data, width, height = page.render_to_bitmap(width: inference::RESOLUTION * 1.5) image = Vips::Image.new_from_memory(data, width, height, 4, :uchar) - fields = Templates::ImageToFields.call(image, confidence: 0.05, nms:, split_page:, - temperature:, aspect_ratio:, padding:) + fields = inference.call(image, confidence: 0.05, nms:, split_page:, + temperature:, aspect_ratio:, padding:) text_fields = extract_text_fields_from_page(page) line_fields = extract_line_fields_from_page(page) @@ -151,6 +151,8 @@ module Templates next_node = page.text_nodes[node_index + 1] + break unless next_node + break if next_node.x + next_node.w < line.x || line.x + line.w < next_node.x || next_node.y + next_node.h < line.y - next_node.h || line.y < next_node.y end diff --git a/lib/templates/image_to_fields.rb b/lib/templates/image_to_fields.rb index 5eb39af4..fa342f4e 100755 --- a/lib/templates/image_to_fields.rb +++ b/lib/templates/image_to_fields.rb @@ -19,49 +19,65 @@ module Templates # rubocop:disable Metrics def call(image, confidence: 0.3, nms: 0.1, temperature: 1, - split_page: false, aspect_ratio: true, padding: nil) + split_page: false, aspect_ratio: true, padding: nil, resolution: RESOLUTION) base_image = image.extract_band(0, n: 3) trimmed_base, base_offset_x, base_offset_y = trim_image_with_padding(base_image, padding) if split_page && image.height > image.width - half_h = trimmed_base.height / 2 - top_h = half_h - bottom_h = trimmed_base.height - half_h - - regions = [ - { img: trimmed_base.crop(0, 0, trimmed_base.width, top_h), offset_y: 0 }, - { img: trimmed_base.crop(0, top_h, trimmed_base.width, bottom_h), offset_y: top_h } - ] + regions = build_split_image_regions(trimmed_base) detections = { xyxy: Numo::SFloat[], confidence: Numo::SFloat[], class_id: Numo::Int32[] } detections = regions.reduce(detections) do |acc, r| next detections if r[:img].height <= 0 || r[:img].width <= 0 - input_tensor, transform_info = preprocess_image(r[:img], RESOLUTION, aspect_ratio:) + input_tensor, transform_info = preprocess_image(r[:img], resolution, aspect_ratio:) transform_info[:trim_offset_x] = base_offset_x transform_info[:trim_offset_y] = base_offset_y + r[:offset_y] outputs = model.predict({ 'input' => input_tensor }) - postprocess_outputs(outputs, transform_info, acc, confidence:, temperature:) + boxes = Numo::SFloat.cast(outputs['dets'])[0, true, true] + logits = Numo::SFloat.cast(outputs['labels'])[0, true, true] + + postprocess_outputs(boxes, logits, transform_info, acc, confidence:, temperature:, resolution:) end else - input_tensor, transform_info = preprocess_image(trimmed_base, RESOLUTION, aspect_ratio:) + input_tensor, transform_info = preprocess_image(trimmed_base, resolution, aspect_ratio:) transform_info[:trim_offset_x] = base_offset_x transform_info[:trim_offset_y] = base_offset_y outputs = model.predict({ 'input' => input_tensor }) - detections = postprocess_outputs(outputs, transform_info, confidence:, temperature:) + boxes = Numo::SFloat.cast(outputs['dets'])[0, true, true] + logits = Numo::SFloat.cast(outputs['labels'])[0, true, true] + + detections = postprocess_outputs(boxes, logits, transform_info, confidence:, temperature:, resolution:) end detections = apply_nms(detections, nms) - fields = Array.new(detections[:xyxy].shape[0]) do |i| + fields = build_fields_from_detections(detections, image) + + sort_fields(fields, y_threshold: 10.0 / image.height) + end + + def build_split_image_regions(image) + half_h = image.height / 2 + top_h = half_h + bottom_h = image.height - half_h + + [ + { img: image.crop(0, 0, image.width, top_h), offset_y: 0 }, + { img: image.crop(0, top_h, image.width, bottom_h), offset_y: top_h } + ] + end + + def build_fields_from_detections(detections, image) + Array.new(detections[:xyxy].shape[0]) do |i| x1 = detections[:xyxy][i, 0] y1 = detections[:xyxy][i, 1] x2 = detections[:xyxy][i, 2] @@ -87,8 +103,6 @@ module Templates confidence: ) end - - sort_fields(fields, y_threshold: 10.0 / image.height) end def trim_image_with_padding(image, padding = 0) @@ -185,13 +199,8 @@ module Templates Numo::Int32.cast(keep) end - def postprocess_outputs(outputs, transform_info, detections = nil, confidence: 0.3, temperature: 1) - boxes = Numo::SFloat.cast(outputs['dets']) - logits = Numo::SFloat.cast(outputs['labels']) - - boxes = boxes[0, true, true] # [300, 4] - logits = logits[0, true, true] # [300, num_classes] - + def postprocess_outputs(boxes, logits, transform_info, detections = nil, confidence: 0.3, temperature: 1, + resolution: RESOLUTION) scaled_logits = logits / temperature probs = 1.0 / (1.0 + Numo::NMath.exp(-scaled_logits)) @@ -215,7 +224,7 @@ module Templates boxes_xyxy[true, 2] = x2 boxes_xyxy[true, 3] = y2 - boxes_xyxy *= RESOLUTION + boxes_xyxy *= resolution pad_x = transform_info[:pad_x] pad_y = transform_info[:pad_y]