You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
docuseal/lib/templates/detect_fields.rb

503 lines
14 KiB

# frozen_string_literal: true
module Templates
module DetectFields
module_function
# Axis-aligned box built by extract_text_fields_from_page for a run of
# underscore characters. Members are the origin (x, y) and size (w, h),
# passed as keyword arguments.
TextFieldBox = Struct.new(:x, :y, :w, :h, keyword_init: true) do
  # Far edge on the y axis (y + h). The value is cached on first call, so
  # later changes to y or h are not reflected.
  def endy
    return @endy if @endy

    @endy = y + h
  end

  # Far edge on the x axis (x + w). Cached the same way as #endy.
  def endx
    return @endx if @endx

    @endx = x + w
  end
end
# Node of the doubly linked list (prev/next) that interleaves page text with
# detected fields. In this file `elem` holds either a String of page text or a
# detected field object; `page` and `attachment_uuid` record where the node
# came from.
PageNode = Struct.new(:prev, :next, :elem, :page, :attachment_uuid, keyword_init: true)
# Matches text that ends with a date label ("date", "signed at", "datum"),
# optionally followed by any run of ':', '_', whitespace or '-'.
# Case-insensitive; written in extended (/x) syntax, so the layout whitespace
# below is not part of the pattern.
DATE_REGEXP = /
(?:
date
| signed\sat
| datum
)[:_\s-]*\z
/ix
# Matches text that ends with a price/amount/quantity label or a currency
# sign (English, French and German keywords), optionally followed by any run
# of ':', '_', whitespace or '-'. Case-insensitive, extended syntax.
NUMBER_REGEXP = /
(?:
price
| \$
| €
| total
| quantity
| prix
| quantité
| preis
| summe
| gesamt(?:betrag)?
| menge
| anzahl
| stückzahl
)[:_\s-]*\z
/ix
# Matches text that ends with a signature label (English, French and German
# keywords), optionally followed by any run of ':', '_', whitespace or '-'.
# Case-insensitive, extended syntax.
SIGNATURE_REGEXP = /
(?:
signature
| sign\shere
| sign
| signez\sici
| signer\sici
| unterschrift
| unterschreiben
| unterzeichnen
)[:_\s-]*\z
/ix
# Text-node contents treated as line breaks by build_page_nodes.
LINEBREAK = ["\n", "\r"].freeze
# Checkbox glyphs that build_page_nodes leaves out of the accumulated text.
CHECKBOXES = ['☐', '□'].freeze
# rubocop:disable Metrics, Style
# Entry point: detects form fields in an uploaded image or PDF.
#
# Chooses the image path when `attachment&.image?` is truthy and the PDF path
# otherwise. Every option is forwarded unchanged to the chosen processing
# method; `regexp_type` is forwarded on the PDF path only. A given block is
# passed through as well.
#
# @param io object responding to #read with the raw file bytes
# @param attachment optional attachment record (may be nil)
# @return [Array] two-element array `[fields, head_node]`
def call(io, attachment: nil, confidence: 0.3, temperature: 1, inference: Templates::ImageToFields,
         nms: 0.1, split_page: false, aspect_ratio: true, padding: 20, regexp_type: true, &)
  shared_options = {
    attachment:, confidence:, nms:, split_page:,
    inference:, temperature:, aspect_ratio:, padding:
  }

  fields, head_node =
    if attachment&.image?
      process_image_attachment(io, **shared_options, &)
    else
      process_pdf_attachment(io, **shared_options, regexp_type:, &)
    end

  [fields, head_node]
end
# Detects fields on a single image.
#
# Loads the bytes from `io` into a Vips image, runs `inference` on it, orders
# the detections with sort_fields (row tolerance: 10 divided by the image
# height) and converts each detection into a field hash located on page 0.
# Fields of type 'signature' are marked as required.
#
# Yields `[attachment uuid, 0, fields]` once when a block is given.
#
# @return [Array] `[fields, nil]` - the image path builds no text node chain
def process_image_attachment(io, attachment:, confidence:, nms:, temperature:, inference:,
                             split_page: false, aspect_ratio: false, padding: nil)
  image = Vips::Image.new_from_buffer(io.read, '')
  detections = inference.call(image, confidence:, nms:, split_page:,
                                     temperature:, aspect_ratio:, padding:)
  ordered = sort_fields(detections, y_threshold: 10.0 / image.height)

  fields = ordered.map do |detection|
    area = {
      x: detection.x,
      y: detection.y,
      w: detection.w,
      h: detection.h,
      page: 0,
      attachment_uuid: attachment&.uuid
    }

    {
      uuid: SecureRandom.uuid,
      type: detection.type,
      required: detection.type == 'signature',
      preferences: {},
      areas: [area]
    }
  end

  yield [attachment&.uuid, 0, fields] if block_given?

  [fields, nil]
end
# Detects fields on every page of a PDF.
#
# For each page the method:
#   1. renders a bitmap at 1.5x the inference resolution and runs `inference`
#      with a quarter of the requested confidence;
#   2. raises the confidence of detections that overlap underscore runs or
#      drawn lines found on the page;
#   3. drops detections that are still below `confidence`;
#   4. links the page text and remaining detections into the PageNode chain;
#   5. converts every detection into a field hash (type 'signature' is
#      marked required). With `regexp_type` the type is derived from the
#      text that precedes the field.
#
# Yields `[attachment uuid, page_number, page_fields]` after each page when a
# block is given.
#
# The page and the document are closed even when processing raises. Both
# handles are closed nil-safely so that a failure while opening the document
# or a page is reported as-is instead of being replaced by a NoMethodError
# from the cleanup code.
#
# @return [Array] `[fields_of_all_pages, head_node]`
def process_pdf_attachment(io, attachment:, confidence:, nms:, temperature:, inference:,
                           split_page: false, aspect_ratio: false, padding: nil, regexp_type: false)
  doc = Pdfium::Document.open_bytes(io.read)

  head_node = PageNode.new(elem: ''.b, page: 0, attachment_uuid: attachment&.uuid)
  tail_node = head_node

  fields = doc.page_count.times.flat_map do |page_number|
    page = doc.get_page(page_number)

    data, width, height = page.render_to_bitmap(width: inference::RESOLUTION * 1.5)
    image = Vips::Image.new_from_memory(data, width, height, 4, :uchar)

    # Detect with a lowered threshold; weak detections get a chance to be
    # confirmed by underscores/lines before the real threshold is applied.
    detections = inference.call(image, confidence: confidence / 4.0, nms:, split_page:,
                                       temperature:, aspect_ratio:, padding:)

    text_fields = extract_text_fields_from_page(page)
    line_fields = extract_line_fields_from_page(page)

    detections = sort_fields(detections, y_threshold: 10.0 / page.height)
    detections = increase_confidence_for_overlapping_fields(detections, text_fields)
    detections = increase_confidence_for_overlapping_fields(detections, line_fields)
    detections = detections.reject { |f| f.confidence < confidence }

    # tail_node is shared across pages so the chain continues page to page.
    field_nodes, tail_node = build_page_nodes(page, detections, tail_node, attachment_uuid: attachment&.uuid)

    page_fields = field_nodes.map do |node|
      field = node.elem
      type = regexp_type ? type_from_page_node(node) : field.type

      {
        uuid: SecureRandom.uuid,
        type:,
        required: type == 'signature',
        preferences: {},
        areas: [{
          x: field.x, y: field.y,
          w: field.w, h: field.h,
          page: page_number,
          attachment_uuid: attachment&.uuid
        }]
      }
    end

    yield [attachment&.uuid, page_number, page_fields] if block_given?

    page_fields
  ensure
    # page is nil when get_page itself raised.
    page&.close
  end

  print_debug(head_node) if Rails.env.development?

  [fields, head_node]
ensure
  # doc is nil when open_bytes itself raised.
  doc&.close
end
# Orders fields in reading order: rows from smaller to larger bottom edge
# (endy), and by x within a row. Two fields count as being on the same row
# when their bottom edges differ by less than `y_threshold`.
#
# @return [Array] a new sorted array
def sort_fields(fields, y_threshold: 0.01)
  fields.sort do |left, right|
    same_row = (left.endy - right.endy).abs < y_threshold

    if same_row
      left.x <=> right.x
    else
      left.endy <=> right.endy
    end
  end
end
# Logs the node chain starting at `head_node` as one string at info level.
# String nodes are written verbatim; every other node becomes a numbered
# placeholder, "[Checkbox_N]" when its type is 'checkbox' and "[Field_N]"
# otherwise (N counts all placeholders together).
def print_debug(head_node)
  output = ''.b
  placeholder_count = 0
  node = head_node

  loop do
    elem = node.elem

    output <<
      if elem.is_a?(String)
        elem
      else
        placeholder_count += 1
        label = elem.type == 'checkbox' ? 'Checkbox' : 'Field'
        "[#{label}_#{placeholder_count}]"
      end

    node = node.next
    break unless node
  end

  Rails.logger.info(output)
end
# Refines the type of a detected field using the text right before it.
#
# The detected type is returned unchanged unless it is 'text' and the
# previous node holds a String. Otherwise the preceding text is tested
# against the label patterns in this order: date, signature, number. The
# first match decides the type; without a match the type stays 'text'.
def type_from_page_node(node)
  preceding = node.prev.elem
  detected_type = node.elem.type

  return detected_type unless preceding.is_a?(String) && detected_type == 'text'

  if preceding.match?(DATE_REGEXP)
    'date'
  elsif preceding.match?(SIGNATURE_REGEXP)
    'signature'
  elsif preceding.match?(NUMBER_REGEXP)
    'number'
  else
    'text'
  end
end
# Interleaves the text of a page with its detected fields in the PageNode
# chain that ends at `tail_node`.
#
# Text is accumulated into String nodes and every field gets a node of its
# own, so the String node in front of a field holds the text that precedes
# it on the page. Line changes are written as "\n" and wide horizontal gaps
# as "\t".
#
# @param page object responding to #text_nodes, #width, #height, #page_index
# @param fields [Array] detected fields of the page, consumed in the given
#   order; the array itself is not modified
# @param tail_node [PageNode] current end of the chain; new nodes are linked
#   after it
# @param attachment_uuid stored on every node created here
# @return [Array] `[field_nodes, tail_node]` - the nodes created for the
#   fields and the new end of the chain
def build_page_nodes(page, fields, tail_node, attachment_uuid: nil)
  field_nodes = []

  y_threshold = 4.0 / page.height
  x_threshold = 30.0 / page.width

  text_nodes = page.text_nodes

  # Work on a copy so the caller's array is left untouched.
  pending_fields = fields.dup
  current_field = pending_fields.shift

  index = 0
  prev_node = nil

  # Links a new node holding `elem` after the current tail and returns it.
  append_node = lambda do |elem|
    new_node = PageNode.new(prev: tail_node, elem:, page: page.page_index, attachment_uuid:)
    tail_node.next = new_node
    tail_node = new_node
  end

  loop do
    node = text_nodes[index]

    break unless node

    # A line-break node that is followed by more text on the same visual line
    # does not end that line, so it is skipped.
    if node.content.in?(LINEBREAK)
      next_node = text_nodes[index + 1]

      if next_node && (next_node.endy - node.endy) < y_threshold
        index += 1

        next
      end
    end

    # Place every pending field that belongs in front of this text node: it
    # sits on the same line and starts at or before the node (or the node
    # ends the line), or it lies entirely above the node.
    loop do
      break unless current_field

      same_line = (current_field.endy - node.endy).abs < y_threshold
      belongs_here = (same_line && (current_field.x <= node.x || node.content.in?(LINEBREAK))) ||
                     current_field.endy < node.y

      break unless belongs_here

      if tail_node.elem.is_a?(Templates::ImageToFields::Field)
        previous_field = tail_node.elem

        # Two fields in a row get an explicit separator node between them.
        divider =
          if (previous_field.endy - current_field.endy).abs > y_threshold
            "\n".b
          elsif current_field.x - previous_field.endx > x_threshold
            # Horizontal gap from the end of the previous field to the start
            # of this one, measured like the text gaps further down.
            "\t".b
          else
            ' '.b
          end

        append_node.call(divider)
      elsif prev_node && (prev_node.endy - current_field.endy).abs > y_threshold
        append_node.call("\n".b)
      end

      field_nodes << append_node.call(current_field)
      current_field = pending_fields.shift
    end

    if tail_node.elem.is_a?(Templates::ImageToFields::Field)
      # Text after a field starts a fresh String node.
      prev_field = tail_node.elem

      append_node.call(''.b)

      if (node.endy - prev_field.endy).abs > y_threshold
        tail_node.elem << "\n"
      elsif (node.x - prev_field.endx) > x_threshold
        tail_node.elem << "\t"
      end
    elsif prev_node
      if (node.endy - prev_node.endy) > y_threshold && LINEBREAK.exclude?(prev_node.content)
        tail_node.elem << "\n"
      elsif (node.x - prev_node.endx) > x_threshold && !tail_node.elem.ends_with?("\t")
        tail_node.elem << "\t"
      end
    end

    # Long underscore runs are capped at three characters and checkbox glyphs
    # are left out of the text.
    if node.content != '_' || !tail_node.elem.ends_with?('___')
      tail_node.elem << node.content unless CHECKBOXES.include?(node.content)
    end

    prev_node = node
    index += 1
  end

  # Fields that were not placed in front of any text node go to the end.
  while current_field
    field_nodes << append_node.call(current_field)
    current_field = pending_fields.shift
  end

  # The page always ends with a newline.
  if tail_node.elem.is_a?(Templates::ImageToFields::Field)
    append_node.call("\n".b)
  else
    tail_node.elem << "\n"
  end

  [field_nodes, tail_node]
end
# Picks the drawn horizontal lines of a page that are kept as candidate
# fill-in lines and returns them with enlarged boxes.
#
# Steps, in order:
#   1. split page.line_nodes into vertical lines (tilt == 90) and all others;
#   2. drop horizontal lines that are very wide or that meet a vertical line;
#   3. drop horizontal lines that have text sitting on them across more than
#      half of their width;
#   4. grow the box of every remaining line (the line objects are modified in
#      place).
#
# Coordinates are presumably fractions of the page size - TODO confirm.
#
# @param page object responding to #height, #line_nodes and #text_nodes
# @return [Array] the remaining horizontal line objects
def extract_line_fields_from_page(page)
# Tolerance used both for the intersection test and for growing the boxes.
line_thickness = 5.0 / page.height
vertical_lines, all_horizontal_lines = page.line_nodes.partition { |line| line.tilt == 90 }
horizontal_lines = all_horizontal_lines.reject do |h_line|
# NOTE(review): `h < 0.1` is implied by `h < 0.9`, so the condition reduces to
# `w > 0.7 && h < 0.9` - confirm which bound was intended.
next true if h_line.w > 0.7 && (h_line.h < 0.1 || h_line.h < 0.9)
next false if vertical_lines.blank?
h_x_min = h_line.x
h_x_max = h_line.x + h_line.w
h_y_avg = h_line.y + (h_line.h / 2)
# Reject the line when any vertical line comes within line_thickness of it
# (presumably table or box borders - confirm).
vertical_lines.any? do |v_line|
v_x_avg = v_line.x + (v_line.w / 2)
v_y_min = v_line.y
v_y_max = v_line.y + v_line.h
h_x_min_expanded = h_x_min - line_thickness
h_x_max_expanded = h_x_max + line_thickness
h_y_min_expanded = h_y_avg - line_thickness
h_y_max_expanded = h_y_avg + line_thickness
v_x_min_expanded = v_x_avg - line_thickness
v_x_max_expanded = v_x_avg + line_thickness
v_y_min_expanded = v_y_min - line_thickness
v_y_max_expanded = v_y_max + line_thickness
x_overlap = v_x_min_expanded <= h_x_max_expanded && v_x_max_expanded >= h_x_min_expanded
y_overlap = h_y_min_expanded <= v_y_max_expanded && h_y_max_expanded >= v_y_min_expanded
x_overlap && y_overlap
end
end
# Cursor into page.text_nodes. It is shared by all lines and never reset, so
# each line continues scanning where the previous one stopped.
# NOTE(review): this assumes lines and text nodes are both ordered by y -
# confirm. Because the cursor is incremented before it is used,
# text_nodes[0] is never examined - confirm that is intended.
node_index = 0
horizontal_lines = horizontal_lines.reject do |line|
# Text nodes found on this line.
nodes = []
loop do
node = page.text_nodes[node_index += 1]
break unless node
# Stop at the first text node that starts at a larger y than the line.
break if node.y > line.y
# Skip nodes without horizontal overlap with the line and nodes whose far
# y edge is more than one node height away from the line.
next if node.x + node.w < line.x || line.x + line.w < node.x ||
node.y + node.h < line.y - node.h || line.y < node.y
nodes << node
# NOTE(review): nodes was just appended to, so this guard never fires.
next if nodes.blank?
# Look ahead: stop as soon as the following node no longer sits on the line.
next_node = page.text_nodes[node_index + 1]
break unless next_node
break if next_node.x + next_node.w < line.x || line.x + line.w < next_node.x ||
next_node.y + next_node.h < line.y - next_node.h || line.y < next_node.y
end
# No text on the line: the block returns nil and the line is kept.
next if nodes.blank?
# Reject the line when the text on it spans more than half of its width
# (presumably underlined text rather than a blank to fill in - confirm).
width = nodes.last.x + nodes.last.w - nodes.first.x
next true if width > line.w / 2.0
end
# Grow every kept line by 4 * line_thickness towards smaller y while keeping
# its far y edge in place. `each` returns its receiver, so the (modified)
# lines are the return value of the method.
horizontal_lines.each do |line|
line.h += 4 * line_thickness
line.y -= 4 * line_thickness
end
end
# Finds runs of underscore characters in the page text and returns one
# TextFieldBox per run of at least two underscores.
#
# An underscore extends the current run when it starts no more than 0.02 to
# the right of the run's current end and its y differs from the run's top by
# no more than half the height of the first underscore. The box spans from
# the first underscore's x to the last one's endx and covers the smallest y
# and largest endy seen in the run.
#
# @param page object responding to #text_nodes
# @return [Array<TextFieldBox>]
def extract_text_fields_from_page(page)
  nodes = page.text_nodes
  boxes = []
  cursor = 0

  while cursor < nodes.length
    first = nodes[cursor]

    unless first.content == '_'
      cursor += 1
      next
    end

    left = first.x
    top = first.y
    right = first.endx
    bottom = first.endy
    run_length = 1
    scan = cursor + 1

    while scan < nodes.length
      candidate = nodes[scan]

      break if candidate.content != '_'

      gap = candidate.x - right
      vertical_shift = (candidate.y - top).abs

      break if gap > 0.02 || vertical_shift > first.h * 0.5

      run_length += 1
      right = candidate.endx
      bottom = [bottom, candidate.endy].max
      top = [top, candidate.y].min
      scan += 1
    end

    if run_length >= 2
      boxes << TextFieldBox.new(x: left, y: top, w: right - left, h: bottom - top)
    end

    # Continue after the run; the node that ended it is examined next.
    cursor = scan
  end

  boxes
end
# Intersection over union of two axis-aligned boxes.
#
# Both boxes must respond to x, y, w, h, endx and endy.
#
# @return 0.0 when the boxes share no area, otherwise the shared area divided
#   by the area covered by both boxes together
def calculate_iou(box1, box2)
  left = [box1.x, box2.x].max
  top = [box1.y, box2.y].max
  right = [box1.endx, box2.endx].min
  bottom = [box1.endy, box2.endy].min

  # Negative extents mean the boxes are apart on that axis.
  shared_area = [0, right - left].max * [0, bottom - top].max

  return 0.0 if shared_area.zero?

  combined_area = (box1.w * box1.h) + (box2.w * box2.h) - shared_area

  shared_area / combined_area
end
# Tells whether two axis-aligned boxes intersect. Boxes that only share an
# edge or a corner count as overlapping.
def boxes_overlap?(box1, box2)
  # Apart on the x axis.
  return false if box1.endx < box2.x || box2.endx < box1.x
  # Apart on the y axis.
  return false if box1.endy < box2.y || box2.endy < box1.y

  true
end
# Raises the confidence of detected 'text' fields that coincide with a
# reference box (underscore run or drawn line).
#
# For every field of type 'text' the reference boxes are scanned in the given
# order. Scanning stops at the first box that starts beyond the field's endy,
# so the boxes are expected to be ordered by y. The first box that overlaps
# the field with an IoU of at least 0.4 adds `by` to the field's confidence;
# a field is boosted at most once per call.
#
# @param image_fields [Array] detected fields; modified in place
# @param text_fields [Array, nil] reference boxes
# @param by [Numeric] amount added to the confidence
# @return [Array] `image_fields`
def increase_confidence_for_overlapping_fields(image_fields, text_fields, by: 1.0)
  return image_fields if text_fields.blank?

  image_fields.each do |candidate|
    next unless candidate.type == 'text'

    text_fields.each do |reference|
      break if reference.y > candidate.endy
      next if reference.endy < candidate.y
      next unless boxes_overlap?(candidate, reference)
      next if calculate_iou(candidate, reference) < 0.4

      candidate.confidence += by
      break
    end
  end

  image_fields
end
# rubocop:enable Metrics, Style
end
end