# frozen_string_literal: true

module Templates
  # Detects form fields on an uploaded image or PDF by combining the output of
  # an image inference model with structural hints taken from the PDF itself
  # (runs of underscore characters and horizontal lines), and links the
  # detected fields with the surrounding page text as a doubly linked list of
  # PageNode records so that a field type can be guessed from its label.
  module DetectFields
    module_function

    # Bounding box of a run of underscore characters found in the page text.
    TextFieldBox = Struct.new(:x, :y, :w, :h, keyword_init: true)

    # Doubly linked list node. `elem` is either a String (a run of page text)
    # or a detected field object.
    PageNode = Struct.new(:prev, :next, :elem, :page, :attachment_uuid, keyword_init: true)

    # Label text ending in a date-like keyword (optionally followed by ':' or '-').
    DATE_REGEXP = /
      (?:
          date
        | signed\sat
        | datum
      )
      \s*[:-]?\s*\z
    /ix

    # Label text ending in a number/amount-like keyword or currency sign.
    NUMBER_REGEXP = /
      (?:
          price
        | \$
        | €
        | total
        | quantity
        | prix
        | quantité
        | preis
        | summe
        | gesamt(?:betrag)?
        | menge
        | anzahl
        | stückzahl
      )
      \s*[:-]?\s*\z
    /ix

    # Label text ending in a signature-like keyword.
    SIGNATURE_REGEXP = /
      (?:
          signature
        | sign\shere
        | sign
        | signez\sici
        | signer\sici
        | unterschrift
        | unterschreiben
        | unterzeichnen
      )
      \s*[:-]?\s*\z
    /ix

    # rubocop:disable Metrics, Style

    # Entry point: dispatches to the image or the PDF pipeline depending on
    # `attachment&.image?`. The optional block is forwarded and receives
    # `[attachment_uuid, page_number, fields]` once per processed page.
    #
    # @param io [#read] source of the document bytes
    # @return [Array] `[fields, head_node]`; `head_node` is nil for images
    def call(io, attachment: nil, confidence: 0.3, temperature: 1, inference: Templates::ImageToFields,
             nms: 0.1, split_page: false, aspect_ratio: true, padding: 20, regexp_type: true, &)
      if attachment&.image?
        process_image_attachment(io, attachment:, confidence:, nms:, split_page:, inference:,
                                     temperature:, aspect_ratio:, padding:, &)
      else
        process_pdf_attachment(io, attachment:, confidence:, nms:, split_page:, inference:,
                                   temperature:, aspect_ratio:, regexp_type:, padding:, &)
      end
    end

    # Runs inference on a single image and converts every detected field into
    # a field hash located on page 0.
    #
    # @return [Array] `[fields, nil]` (no text node list exists for images)
    def process_image_attachment(io, attachment:, confidence:, nms:, temperature:, inference:,
                                 split_page: false, aspect_ratio: false, padding: nil)
      image = Vips::Image.new_from_buffer(io.read, '')

      detected = inference.call(image, confidence:, nms:, split_page:, temperature:, aspect_ratio:, padding:)

      fields = detected.map do |f|
        {
          uuid: SecureRandom.uuid,
          type: f.type,
          required: f.type == 'signature',
          preferences: {},
          areas: [{ x: f.x, y: f.y, w: f.w, h: f.h, page: 0, attachment_uuid: attachment&.uuid }]
        }
      end

      yield [attachment&.uuid, 0, fields] if block_given?

      [fields, nil]
    end

    # Runs inference on every page of a PDF. Inference is called with a
    # quarter of the requested confidence; detections that overlap underscore
    # runs or horizontal lines get their confidence raised, and only then is
    # the requested `confidence` threshold applied.
    #
    # @return [Array] `[fields, head_node]` where `head_node` is the first
    #   PageNode of the text/field list spanning all pages
    def process_pdf_attachment(io, attachment:, confidence:, nms:, temperature:, inference:,
                               split_page: false, aspect_ratio: false, padding: nil, regexp_type: false)
      doc = Pdfium::Document.open_bytes(io.read)

      head_node = PageNode.new(elem: ''.b, page: 0, attachment_uuid: attachment&.uuid)
      tail_node = head_node

      fields = doc.page_count.times.flat_map do |page_number|
        page = doc.get_page(page_number)

        data, width, height = page.render_to_bitmap(width: inference::RESOLUTION * 1.5)

        image = Vips::Image.new_from_memory(data, width, height, 4, :uchar)

        detected = inference.call(image, confidence: confidence / 4.0, nms:, split_page:,
                                         temperature:, aspect_ratio:, padding:)

        text_fields = extract_text_fields_from_page(page)
        line_fields = extract_line_fields_from_page(page)

        detected = increase_confidence_for_overlapping_fields(detected, text_fields)
        detected = increase_confidence_for_overlapping_fields(detected, line_fields)

        detected = detected.reject { |f| f.confidence < confidence }

        field_nodes, tail_node = build_page_nodes(page, detected, tail_node, attachment_uuid: attachment&.uuid)

        page_fields = field_nodes.map do |node|
          field = node.elem
          type = regexp_type ? type_from_page_node(node) : field.type

          {
            uuid: SecureRandom.uuid,
            type:,
            required: type == 'signature',
            preferences: {},
            areas: [{
              x: field.x, y: field.y, w: field.w, h: field.h,
              page: page_number,
              attachment_uuid: attachment&.uuid
            }]
          }
        end

        yield [attachment&.uuid, page_number, page_fields] if block_given?

        page_fields
      ensure
        # `page` is nil when get_page itself raised; a bare `page.close` would
        # replace the real error with a NoMethodError.
        page&.close
      end

      print_debug(head_node) if Rails.env.development?

      [fields, head_node]
    ensure
      # `doc` is nil when open_bytes raised; keep the original exception.
      doc&.close
    end

    # Logs the node list as one string: text nodes verbatim, field nodes as
    # `[Checkbox_N]` / `[Field_N]` placeholders with a running counter.
    def print_debug(head_node)
      current_node = head_node
      index = 0
      string = ''.b

      loop do
        string <<
          if current_node.elem.is_a?(String)
            current_node.elem
          else
            "[#{current_node.elem.type == 'checkbox' ? 'Checkbox' : 'Field'}_#{index += 1}]"
          end

        current_node = current_node.next

        break unless current_node
      end

      Rails.logger.info(string)
    end

    # Refines the type of a 'text' field from the text node right before it.
    # Fields of any other type, or fields not preceded by text, keep their type.
    #
    # @param node [PageNode] a field node that has a `prev` node
    # @return [String] the resolved field type
    def type_from_page_node(node)
      return node.elem.type unless node.prev.elem.is_a?(String)
      return node.elem.type unless node.elem.type == 'text'

      string = node.prev.elem

      return 'date' if string.match?(DATE_REGEXP)
      return 'signature' if string.match?(SIGNATURE_REGEXP)
      return 'number' if string.match?(NUMBER_REGEXP)

      'text'
    end

    # Merges the page's text nodes and the detected fields into the linked
    # list ending at `tail_node`. Consecutive text nodes are concatenated into
    # a single String node, which is flushed whenever a field is emitted.
    # NOTE(review): the merge assumes both `page.text_nodes` and `fields` are
    # already in reading order — confirm against the producers.
    #
    # @return [Array] `[field_nodes, tail_node]`: the PageNodes created for
    #   fields on this page and the new tail of the list
    def build_page_nodes(page, fields, tail_node, attachment_uuid: nil)
      field_nodes = []
      current_text = ''.b

      text_nodes = page.text_nodes

      text_idx = 0
      field_idx = 0

      while text_idx < text_nodes.length || field_idx < fields.length
        text_node = text_nodes[text_idx]
        field = fields[field_idx]

        process_text_node = false
        process_field_node = false

        if text_node && field
          text_y_center = text_node.y + (text_node.h / 2.0)
          field_y_center = field.y + (field.h / 2.0)
          y_threshold = text_node.h / 2.0
          vertical_distance = (text_y_center - field_y_center).abs

          if vertical_distance < y_threshold
            # Same visual row: order by x, except that an underscore sitting
            # just left of the field is treated as belonging to the field.
            is_underscore = text_node.content == '_'
            is_left_of_field = text_node.x < field.x

            if is_underscore && is_left_of_field
              text_x_end = text_node.x + text_node.w
              distance = field.x - text_x_end
              proximity_threshold = text_node.w * 3.0

              if distance < proximity_threshold
                process_field_node = true
              else
                process_text_node = true
              end
            elsif is_left_of_field
              process_text_node = true
            else
              process_field_node = true
            end
          elsif text_node.y < field.y
            process_text_node = true
          else
            process_field_node = true
          end
        elsif text_node
          process_text_node = true
        elsif field
          process_field_node = true
        end

        if process_field_node
          unless current_text.empty?
            new_text_node = PageNode.new(prev: tail_node, elem: current_text,
                                         page: page.page_index, attachment_uuid:)
            tail_node.next = new_text_node
            tail_node = new_text_node
            current_text = ''.b
          end

          new_field_node = PageNode.new(prev: tail_node, elem: field,
                                        page: page.page_index, attachment_uuid:)
          tail_node.next = new_field_node
          tail_node = new_field_node

          field_nodes << tail_node

          # Skip the underscores that visually make up this field. This loop
          # only runs while text nodes remain, i.e. when the `text_node && field`
          # branch above ran in this iteration, so `field_y_center` is fresh.
          while text_idx < text_nodes.length
            text_node_to_check = text_nodes[text_idx]

            is_part_of_field = false

            if text_node_to_check.content == '_'
              check_y_center = text_node_to_check.y + (text_node_to_check.h / 2.0)
              check_y_dist = (check_y_center - field_y_center).abs
              check_y_thresh = text_node_to_check.h / 2.0

              if check_y_dist < check_y_thresh
                padding = text_node_to_check.w * 3.0
                field_x_start = field.x - padding
                field_x_end = field.x + field.w + padding

                text_x_start = text_node_to_check.x
                text_x_end = text_node_to_check.x + text_node_to_check.w

                is_part_of_field = true if text_x_start <= field_x_end && field_x_start <= text_x_end
              end
            end

            break unless is_part_of_field

            text_idx += 1
          end

          field_idx += 1
        elsif process_text_node
          if text_idx > 0
            prev_text_node = text_nodes[text_idx - 1]

            # Insert a space when the horizontal gap is larger than twice the
            # wider of the two neighbouring nodes.
            x_gap = text_node.x - (prev_text_node.x + prev_text_node.w)
            gap_w = text_node.w > prev_text_node.w ? text_node.w : prev_text_node.w

            current_text << ' ' if x_gap > gap_w * 2
          end

          current_text << text_node.content

          text_idx += 1
        end
      end

      unless current_text.empty?
        new_text_node = PageNode.new(prev: tail_node, elem: current_text,
                                     page: page.page_index, attachment_uuid:)
        tail_node.next = new_text_node
        tail_node = new_text_node
      end

      [field_nodes, tail_node]
    end

    # Returns the horizontal line nodes of the page that look like fill-in
    # lines. Lines are dropped when they are very wide, when they touch a
    # vertical line (within a small tolerance), or when text sitting on the
    # line spans more than half of its width. The surviving line objects are
    # mutated in place: each is made taller and moved up by the same amount.
    # NOTE(review): thresholds such as 0.7 and `5.0 / page.height` suggest
    # coordinates are fractions of the page size — confirm.
    #
    # @return [Array] the surviving (mutated) horizontal line nodes
    def extract_line_fields_from_page(page)
      line_thickness = 5.0 / page.height

      vertical_lines, all_horizontal_lines = page.line_nodes.partition { |line| line.tilt == 90 }

      horizontal_lines = all_horizontal_lines.reject do |h_line|
        # NOTE(review): this condition used to be `h < 0.1 || h < 0.9`; the
        # first clause is implied by the second, so only the latter is kept.
        # Confirm which threshold was intended.
        next true if h_line.w > 0.7 && h_line.h < 0.9

        next false if vertical_lines.blank?

        h_y_avg = h_line.y + (h_line.h / 2)

        h_x_min_expanded = h_line.x - line_thickness
        h_x_max_expanded = h_line.x + h_line.w + line_thickness
        h_y_min_expanded = h_y_avg - line_thickness
        h_y_max_expanded = h_y_avg + line_thickness

        vertical_lines.any? do |v_line|
          v_x_avg = v_line.x + (v_line.w / 2)

          v_x_min_expanded = v_x_avg - line_thickness
          v_x_max_expanded = v_x_avg + line_thickness
          v_y_min_expanded = v_line.y - line_thickness
          v_y_max_expanded = v_line.y + v_line.h + line_thickness

          x_overlap = v_x_min_expanded <= h_x_max_expanded && v_x_max_expanded >= h_x_min_expanded
          y_overlap = h_y_min_expanded <= v_y_max_expanded && h_y_max_expanded >= v_y_min_expanded

          x_overlap && y_overlap
        end
      end

      text_nodes = page.text_nodes

      # The cursor is shared across lines, so text nodes are scanned once.
      # NOTE(review): the cursor is pre-incremented, so text node 0 is never
      # inspected, and the node that ends a scan is not re-checked for the
      # next line — kept as is; confirm whether this is intended.
      node_index = 0

      horizontal_lines = horizontal_lines.reject do |line|
        nodes = []

        loop do
          node = text_nodes[node_index += 1]

          break unless node
          break if node.y > line.y

          next if node.x + node.w < line.x ||
                  line.x + line.w < node.x ||
                  node.y + node.h < line.y - node.h ||
                  line.y < node.y

          nodes << node

          next_node = text_nodes[node_index + 1]

          break unless next_node

          break if next_node.x + next_node.w < line.x ||
                   line.x + line.w < next_node.x ||
                   next_node.y + next_node.h < line.y - next_node.h ||
                   line.y < next_node.y
        end

        next if nodes.blank?

        width = nodes.last.x + nodes.last.w - nodes.first.x

        next true if width > line.w / 2.0
      end

      horizontal_lines.each do |line|
        line.h += 4 * line_thickness
        line.y -= 4 * line_thickness
      end
    end

    # Finds runs of two or more adjacent '_' text nodes and returns one
    # bounding box per run. A run ends at the first node that is not '_', is
    # more than 0.02 to the right of the run's current end, or whose y differs
    # from the run's top by more than half the first node's height.
    #
    # @return [Array<TextFieldBox>]
    def extract_text_fields_from_page(page)
      text_nodes = page.text_nodes

      field_boxes = []

      i = 0

      while i < text_nodes.length
        node = text_nodes[i]

        unless node.content == '_'
          i += 1
          next
        end

        x1 = node.x
        y1 = node.y
        x2 = node.x + node.w
        y2 = node.y + node.h

        underscore_count = 1

        j = i + 1

        while j < text_nodes.length
          next_node = text_nodes[j]

          break unless next_node.content == '_'

          distance = next_node.x - x2
          height_diff = (next_node.y - y1).abs

          break if distance > 0.02 || height_diff > node.h * 0.5

          underscore_count += 1

          x2 = next_node.x + next_node.w
          y2 = [y2, next_node.y + next_node.h].max
          y1 = [y1, next_node.y].min

          j += 1
        end

        field_boxes << TextFieldBox.new(x: x1, y: y1, w: x2 - x1, h: y2 - y1) if underscore_count >= 2

        # Resume after the consumed run (j is always at least i + 1).
        i = j
      end

      field_boxes
    end

    # Intersection-over-union of two boxes responding to x, y, w and h.
    #
    # @return [Float] 0.0 when the boxes do not intersect
    def calculate_iou(box1, box2)
      x1 = [box1.x, box2.x].max
      y1 = [box1.y, box2.y].max
      x2 = [box1.x + box1.w, box2.x + box2.w].min
      y2 = [box1.y + box1.h, box2.y + box2.h].min

      intersection_width = [0, x2 - x1].max
      intersection_height = [0, y2 - y1].max
      intersection_area = intersection_width * intersection_height

      return 0.0 if intersection_area.zero?

      box1_area = box1.w * box1.h
      box2_area = box2.w * box2.h
      union_area = box1_area + box2_area - intersection_area

      # fdiv keeps the ratio fractional even when the coordinates are Integers.
      intersection_area.fdiv(union_area)
    end

    # True when the two boxes intersect or touch.
    def boxes_overlap?(box1, box2)
      !(box1.x + box1.w < box2.x ||
        box2.x + box2.w < box1.x ||
        box1.y + box1.h < box2.y ||
        box2.y + box2.h < box1.y)
    end

    # Adds `by` to the confidence of every 'text' image field that has an IoU
    # of at least 0.4 with one of `text_fields` (at most once per image
    # field). The image field objects are mutated in place.
    # NOTE(review): the scan stops at the first text field that starts below
    # the image field, which presumes `text_fields` is sorted by y — confirm.
    #
    # @return [Array] the same `image_fields` collection that was passed in
    def increase_confidence_for_overlapping_fields(image_fields, text_fields, by: 1.0)
      return image_fields if text_fields.blank?

      image_fields.each do |image_field|
        next if image_field.type != 'text'

        field_bottom = image_field.y + image_field.h

        text_fields.each do |text_field|
          break if text_field.y > field_bottom

          next if text_field.y + text_field.h < image_field.y
          next unless boxes_overlap?(image_field, text_field)
          next if calculate_iou(image_field, text_field) < 0.4

          break image_field.confidence += by
        end
      end

      image_fields
    end
    # rubocop:enable Metrics, Style
  end
end