mirror of https://github.com/docusealco/docuseal
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
506 lines
14 KiB
506 lines
14 KiB
# frozen_string_literal: true
|
|
|
|
module Templates
|
|
module DetectFields
|
|
module_function
|
|
|
|
# Box described by its top-left corner (x, y) and its size (w, h).
# Built by extract_text_fields_from_page for runs of underscore characters.
TextFieldBox = Struct.new(:x, :y, :w, :h, keyword_init: true) do
  # Bottom edge: y + h.
  # Computed on every call instead of being cached in an instance variable, so the value
  # stays correct after a member is reassigned and the method also works on frozen instances.
  def endy
    y + h
  end

  # Right edge: x + w. Not cached, for the same reasons as #endy.
  def endx
    x + w
  end
end
|
|
|
|
# Node of the doubly linked list (prev/next) built by build_page_nodes.
# +elem+ is either a String of page text or a detected field object; +page+ and
# +attachment_uuid+ record where the element comes from.
PageNode = Struct.new(:prev, :next, :elem, :page, :attachment_uuid, keyword_init: true)
|
|
|
|
# Matches text that ends with a date label ("date", "signed at", "datum"), optionally followed
# by ':' or '-' and whitespace. Case-insensitive, extended syntax, anchored at the end of the string.
# Used by type_from_page_node to turn a 'text' field into a 'date' field.
DATE_REGEXP = /
|
|
(?:
|
|
date
|
|
| signed\sat
|
|
| datum
|
|
)
|
|
\s*[:-]?\s*\z
|
|
/ix
|
|
|
|
# Matches text that ends with an amount/quantity label (English, French and German words, plus the
# '$' and '€' symbols), optionally followed by ':' or '-' and whitespace. Case-insensitive, extended
# syntax, anchored at the end of the string.
# Used by type_from_page_node to turn a 'text' field into a 'number' field.
NUMBER_REGEXP = /
|
|
(?:
|
|
price
|
|
| \$
|
|
| €
|
|
| total
|
|
| quantity
|
|
| prix
|
|
| quantité
|
|
| preis
|
|
| summe
|
|
| gesamt(?:betrag)?
|
|
| menge
|
|
| anzahl
|
|
| stückzahl
|
|
)
|
|
\s*[:-]?\s*\z
|
|
/ix
|
|
|
|
# Matches text that ends with a signature label (English, French and German words), optionally
# followed by ':' or '-' and whitespace. Case-insensitive, extended syntax, anchored at the end
# of the string.
# Used by type_from_page_node to turn a 'text' field into a 'signature' field.
SIGNATURE_REGEXP = /
|
|
(?:
|
|
signature
|
|
| sign\shere
|
|
| sign
|
|
| signez\sici
|
|
| signer\sici
|
|
| unterschrift
|
|
| unterschreiben
|
|
| unterzeichnen
|
|
)
|
|
\s*[:-]?\s*\z
|
|
/ix
|
|
|
|
# Text-node contents that build_page_nodes treats as line breaks.
LINEBREAK = ["\n", "\r"].freeze
|
|
# Checkbox glyphs that build_page_nodes leaves out of the collected page text.
# NOTE(review): the name is misspelled ("CHECKBOXES"); kept because build_page_nodes references it.
CHECBOXES = ['☐', '□'].freeze
|
|
|
|
# rubocop:disable Metrics, Style
|
|
# Entry point: detects form fields in the document read from +io+.
#
# An attachment that reports +image?+ is handled by process_image_attachment; anything else
# (including a nil attachment) is handled by process_pdf_attachment, which additionally
# receives +regexp_type+. A block, when given, is forwarded to the chosen processor.
#
# Returns a two-element array: [fields, head_node].
def call(io, attachment: nil, confidence: 0.3, temperature: 1, inference: Templates::ImageToFields,
         nms: 0.1, split_page: false, aspect_ratio: true, padding: 20, regexp_type: true, &)
  shared_options = {
    attachment:, confidence:, nms:, split_page:, inference:, temperature:, aspect_ratio:, padding:
  }

  detected =
    if attachment&.image?
      process_image_attachment(io, **shared_options, &)
    else
      process_pdf_attachment(io, **shared_options, regexp_type:, &)
    end

  fields, head_node = detected

  [fields, head_node]
end
|
|
|
|
# Detects fields on a single image.
#
# Loads the image from +io+, runs +inference+ on it, orders the detections with sort_fields and
# converts each one into a field hash with a single area on page 0.
#
# Yields [attachment_uuid, 0, fields] once when a block is given.
# Returns [fields, nil] (no page-node list is built for images).
def process_image_attachment(io, attachment:, confidence:, nms:, temperature:, inference:,
                             split_page: false, aspect_ratio: false, padding: nil)
  image = Vips::Image.new_from_buffer(io.read, '')
  attachment_uuid = attachment&.uuid

  detections = inference.call(image, confidence:, nms:, split_page:,
                                     temperature:, aspect_ratio:, padding:)

  # Row tolerance of 10 units relative to the image height.
  ordered = sort_fields(detections, y_threshold: 10.0 / image.height)

  fields = ordered.map do |detection|
    kind = detection.type

    {
      uuid: SecureRandom.uuid,
      type: kind,
      required: kind == 'signature',
      preferences: {},
      areas: [{
        x: detection.x,
        y: detection.y,
        w: detection.w,
        h: detection.h,
        page: 0,
        attachment_uuid:
      }]
    }
  end

  yield [attachment_uuid, 0, fields] if block_given?

  [fields, nil]
end
|
|
|
|
# Detects fields on every page of a PDF and links page text and fields into one PageNode list.
#
# Per page: renders a bitmap, runs +inference+ at a quarter of the requested confidence, raises the
# confidence of detections that overlap underscore runs or drawn lines, drops everything still
# below +confidence+, then threads the survivors into the node list via build_page_nodes.
# With +regexp_type+ the field type is refined from the preceding text (type_from_page_node).
#
# Yields [attachment_uuid, page_number, page_fields] once per page when a block is given.
# Returns [fields, head_node], where +fields+ holds the field hashes of all pages.
#
# Both the document and each page are closed in +ensure+. Safe navigation is used there so that
# a failure while opening the document or a page is not masked by a NoMethodError on nil.
def process_pdf_attachment(io, attachment:, confidence:, nms:, temperature:, inference:,
                           split_page: false, aspect_ratio: false, padding: nil, regexp_type: false)
  doc = Pdfium::Document.open_bytes(io.read)

  head_node = PageNode.new(elem: ''.b, page: 0, attachment_uuid: attachment&.uuid)
  tail_node = head_node

  fields = doc.page_count.times.flat_map do |page_number|
    page = doc.get_page(page_number)

    data, width, height = page.render_to_bitmap(width: inference::RESOLUTION * 1.5)

    image = Vips::Image.new_from_memory(data, width, height, 4, :uchar)

    # Detect with a lowered bar; candidates are filtered against the real threshold below.
    page_fields = inference.call(image, confidence: confidence / 4.0, nms:, split_page:,
                                        temperature:, aspect_ratio:, padding:)

    text_fields = extract_text_fields_from_page(page)
    line_fields = extract_line_fields_from_page(page)

    page_fields = sort_fields(page_fields, y_threshold: 10.0 / page.height)

    page_fields = increase_confidence_for_overlapping_fields(page_fields, text_fields)
    page_fields = increase_confidence_for_overlapping_fields(page_fields, line_fields)

    page_fields = page_fields.reject { |f| f.confidence < confidence }

    field_nodes, tail_node = build_page_nodes(page, page_fields, tail_node, attachment_uuid: attachment&.uuid)

    page_fields = field_nodes.map do |node|
      field = node.elem

      type = regexp_type ? type_from_page_node(node) : field.type

      {
        uuid: SecureRandom.uuid,
        type:,
        required: type == 'signature',
        preferences: {},
        areas: [{
          x: field.x, y: field.y,
          w: field.w, h: field.h,
          page: page_number,
          attachment_uuid: attachment&.uuid
        }]
      }
    end

    yield [attachment&.uuid, page_number, page_fields] if block_given?

    page_fields
  ensure
    # page is nil when get_page itself raised.
    page&.close
  end

  print_debug(head_node) if Rails.env.development?

  [fields, head_node]
ensure
  # doc is nil when open_bytes itself raised.
  doc&.close
end
|
|
|
|
# Returns a new array with the fields in reading order.
#
# Two fields whose bottom edges (+endy+) differ by less than +y_threshold+ count as the same row
# and are ordered by +x+; otherwise the field with the smaller +endy+ comes first.
def sort_fields(fields, y_threshold: 0.01)
  fields.sort do |left, right|
    same_row = (left.endy - right.endy).abs < y_threshold

    if same_row
      left.x <=> right.x
    else
      left.endy <=> right.endy
    end
  end
end
|
|
|
|
# Logs a one-string rendering of the node list starting at +head_node+ via Rails.logger.info.
#
# String elements are copied verbatim; every other element becomes a "[Checkbox_N]" or
# "[Field_N]" placeholder, where N counts the non-string elements seen so far.
def print_debug(head_node)
  node = head_node
  placeholder_count = 0
  output = ''.b

  loop do
    elem = node.elem

    piece =
      case elem
      when String
        elem
      else
        label = elem.type == 'checkbox' ? 'Checkbox' : 'Field'
        "[#{label}_#{placeholder_count += 1}]"
      end

    output << piece

    node = node.next

    break unless node
  end

  Rails.logger.info(output)
end
|
|
|
|
# Picks the field type for a field node, using the text node right before it.
#
# The detected type is returned unchanged unless the previous node holds a String and the
# detected type is 'text'. In that case the preceding text is tested against DATE_REGEXP,
# SIGNATURE_REGEXP and NUMBER_REGEXP, in that order; the first match wins, else 'text'.
def type_from_page_node(node)
  preceding = node.prev.elem
  detected_type = node.elem.type

  return detected_type unless preceding.is_a?(String)
  return detected_type unless detected_type == 'text'

  if preceding.match?(DATE_REGEXP)
    'date'
  elsif preceding.match?(SIGNATURE_REGEXP)
    'signature'
  elsif preceding.match?(NUMBER_REGEXP)
    'number'
  else
    'text'
  end
end
|
|
|
|
# Threads one page's text nodes and detected fields onto the PageNode list ending at +tail_node+.
#
# Walks +page.text_nodes+ in order. Before each text node, every pending field that sits on the
# same row at or left of the node, or entirely above it, is appended as its own node. Text is
# accumulated into String nodes, with "\n" / "\t" inserted on row changes and wide horizontal gaps.
# Fields left over after the last text node are appended at the end, and the page always finishes
# with a "\n".
#
# page            - page object exposing height, width, page_index and text_nodes.
# fields          - detected fields in reading order; this array is consumed with +shift+.
# tail_node       - current last PageNode of the list; its +next+ is updated.
# attachment_uuid - stored on every node that is created.
#
# Returns [field_nodes, new_tail_node], where field_nodes are the nodes created for +fields+.
def build_page_nodes(page, fields, tail_node, attachment_uuid: nil)
  field_nodes = []

  # Tolerances: 4 units vertically and 30 horizontally, relative to the page size.
  y_threshold = 4.0 / page.height
  x_threshold = 30.0 / page.width

  text_nodes = page.text_nodes

  # Appends a node holding +elem+ after the current tail, makes it the new tail and returns it.
  append_node = lambda do |elem|
    new_node = PageNode.new(prev: tail_node, elem:, page: page.page_index, attachment_uuid:)
    tail_node.next = new_node
    tail_node = new_node
  end

  current_field = fields.shift

  index = 0

  prev_node = nil

  loop do
    node = text_nodes[index]

    break unless node

    if node.content.in?(LINEBREAK)
      # Look at the node AFTER the line break. Reading text_nodes[index] here compared the line
      # break with itself, so the difference was always 0 and every line break was skipped.
      next_node = text_nodes[index + 1]

      # A line break followed by text that is not lower by the row tolerance is dropped.
      if next_node && (next_node.endy - node.endy) < y_threshold
        index += 1

        next
      end
    end

    # Emit every pending field that belongs before the current text node.
    while current_field
      field_due =
        ((current_field.endy - node.endy).abs < y_threshold &&
          (current_field.x <= node.x || node.content.in?(LINEBREAK))) ||
        current_field.endy < node.y

      break unless field_due

      if tail_node.elem.is_a?(Templates::ImageToFields::Field)
        # Two fields in a row: separate them with a one-character text node.
        divider =
          if (tail_node.elem.endy - current_field.endy).abs > y_threshold
            "\n".b
          # NOTE(review): this is "previous end minus current start", which is negative when the
          # previous field ends left of the current one; the text branch below measures the gap as
          # (node.x - prev_field.endx). Confirm the intended direction before changing it.
          elsif tail_node.elem.endx - current_field.x > x_threshold
            "\t".b
          else
            ' '.b
          end

        append_node.call(divider)
      elsif prev_node && (prev_node.endy - current_field.endy).abs > y_threshold
        append_node.call("\n".b)
      end

      field_nodes << append_node.call(current_field)

      current_field = fields.shift
    end

    if tail_node.elem.is_a?(Templates::ImageToFields::Field)
      # Text after a field starts a fresh String node.
      prev_field = tail_node.elem

      append_node.call(''.b)

      if (node.endy - prev_field.endy).abs > y_threshold
        tail_node.elem << "\n"
      elsif (node.x - prev_field.endx) > x_threshold
        tail_node.elem << "\t"
      end
    elsif prev_node
      if (node.endy - prev_node.endy) > y_threshold && LINEBREAK.exclude?(prev_node.content)
        tail_node.elem << "\n"
      elsif (node.x - prev_node.endx) > x_threshold && !tail_node.elem.ends_with?("\t")
        tail_node.elem << "\t"
      end
    end

    # Cap runs of underscores at three characters and leave checkbox glyphs out of the text.
    if node.content != '_' || !tail_node.elem.ends_with?('___')
      tail_node.elem << node.content unless CHECBOXES.include?(node.content)
    end

    prev_node = node

    index += 1
  end

  # Fields that were not placed before any text node go at the end of the page.
  while current_field
    field_nodes << append_node.call(current_field)

    current_field = fields.shift
  end

  # Terminate the page with a newline.
  if tail_node.elem.is_a?(Templates::ImageToFields::Field)
    append_node.call("\n".b)
  else
    tail_node.elem << "\n"
  end

  [field_nodes, tail_node]
end
|
|
|
|
# Returns the horizontal line nodes of +page+ that look like fill-in lines.
#
# Lines with tilt == 90 are treated as vertical, all others as horizontal. A horizontal line is
# discarded when
#   * it is wider than 0.7 and its height is below the size limit checked here,
#   * it comes within +line_thickness+ of a vertical line, or
#   * the text nodes collected for it span more than half of its width.
# The surviving line objects are mutated in place: their box is grown upwards by four line
# thicknesses (h increased, y decreased). The same array of survivors is returned.
def extract_line_fields_from_page(page)
  line_thickness = 5.0 / page.height

  vertical_lines, all_horizontal_lines = page.line_nodes.partition { |line| line.tilt == 90 }

  horizontal_lines = all_horizontal_lines.reject do |h_line|
    # NOTE(review): `h < 0.1 || h < 0.9` is equivalent to `h < 0.9`; kept exactly as written.
    next true if h_line.w > 0.7 && (h_line.h < 0.1 || h_line.h < 0.9)

    next false if vertical_lines.empty?

    h_y_avg = h_line.y + (h_line.h / 2)

    # Horizontal line grown by the tolerance: full x extent, and a band around its mid height.
    h_left = h_line.x - line_thickness
    h_right = h_line.x + h_line.w + line_thickness
    h_top = h_y_avg - line_thickness
    h_bottom = h_y_avg + line_thickness

    vertical_lines.any? do |v_line|
      v_x_avg = v_line.x + (v_line.w / 2)

      # Vertical line grown by the tolerance: band around its mid x, and its full y extent.
      v_left = v_x_avg - line_thickness
      v_right = v_x_avg + line_thickness
      v_top = v_line.y - line_thickness
      v_bottom = v_line.y + v_line.h + line_thickness

      x_overlap = v_left <= h_right && v_right >= h_left
      y_overlap = h_top <= v_bottom && h_bottom >= v_top

      x_overlap && y_overlap
    end
  end

  text_nodes = page.text_nodes

  # True when +node+ lies beside the line, more than one node height above it, or below it.
  misses_line = lambda do |node, line|
    node.x + node.w < line.x || line.x + line.w < node.x ||
      node.y + node.h < line.y - node.h || line.y < node.y
  end

  # The cursor is shared by all lines and is advanced BEFORE each read, so text node 0 is never
  # examined and a node consumed for one line is not revisited for the next.
  # NOTE(review): confirm that skipping index 0 is intended.
  node_index = 0

  horizontal_lines = horizontal_lines.reject do |line|
    covering = []

    loop do
      node_index += 1
      node = text_nodes[node_index]

      break unless node

      break if node.y > line.y

      next if misses_line.call(node, line)

      covering << node

      upcoming = text_nodes[node_index + 1]

      break unless upcoming

      break if misses_line.call(upcoming, line)
    end

    next false if covering.empty?

    covered_width = covering.last.x + covering.last.w - covering.first.x

    covered_width > line.w / 2.0
  end

  grow_by = 4 * line_thickness

  horizontal_lines.each do |line|
    line.h += grow_by
    line.y -= grow_by
  end
end
|
|
|
|
# Finds runs of underscore characters in the page text and returns one TextFieldBox per run.
#
# A run starts at a '_' text node and grows while the following node is also '_', starts no more
# than 0.02 to the right of the run's current right edge, and is vertically within half the first
# node's height of the run's top. Runs of fewer than two underscores are ignored. Each box spans
# from the first node's left edge to the last node's right edge, and covers the top-most and
# bottom-most edges seen in the run.
def extract_text_fields_from_page(page)
  text_nodes = page.text_nodes
  total = text_nodes.length

  boxes = []
  cursor = 0

  while cursor < total
    first = text_nodes[cursor]

    unless first.content == '_'
      cursor += 1
      next
    end

    left = first.x
    top = first.y
    right = first.endx
    bottom = first.endy

    run_length = 1
    lookahead = cursor + 1

    while lookahead < total
      candidate = text_nodes[lookahead]

      break if candidate.content != '_'

      gap = candidate.x - right
      vertical_shift = (candidate.y - top).abs

      break if gap > 0.02 || vertical_shift > first.h * 0.5

      run_length += 1

      right = candidate.endx
      bottom = [bottom, candidate.endy].max
      top = [top, candidate.y].min

      lookahead += 1
    end

    boxes << TextFieldBox.new(x: left, y: top, w: right - left, h: bottom - top) if run_length >= 2

    # Resume at the node that ended the run.
    cursor = lookahead
  end

  boxes
end
|
|
|
|
# Intersection over union of two boxes that respond to x, y, w, h, endx and endy.
#
# Returns 0.0 when the boxes share no area; otherwise the shared area divided by the area covered
# by either box.
def calculate_iou(box1, box2)
  left = [box1.x, box2.x].max
  top = [box1.y, box2.y].max
  right = [box1.endx, box2.endx].min
  bottom = [box1.endy, box2.endy].min

  # Negative extents mean the boxes do not overlap on that axis.
  shared_width = [0, right - left].max
  shared_height = [0, bottom - top].max
  shared_area = shared_width * shared_height

  return 0.0 if shared_area.zero?

  combined_area = (box1.w * box1.h) + (box2.w * box2.h) - shared_area

  shared_area / combined_area
end
|
|
|
|
# True unless one box lies strictly to the left of, right of, above or below the other.
# Boxes that only touch along an edge count as overlapping.
def boxes_overlap?(box1, box2)
  return false if box1.endx < box2.x
  return false if box2.endx < box1.x
  return false if box1.endy < box2.y
  return false if box2.endy < box1.y

  true
end
|
|
|
|
# Raises the confidence of detected 'text' fields that coincide with a box found in the page.
#
# For every image field of type 'text', the first +text_fields+ box that overlaps it with an IoU
# of at least 0.4 adds +by+ to the field's confidence (at most once per image field).
# The image fields are mutated in place.
#
# image_fields - detected fields responding to type, x, y, w, h, endx, endy and confidence=.
# text_fields  - candidate boxes; the scan for a field stops at the first box that starts below
#                it, so boxes are presumably ordered top to bottom — TODO confirm against callers.
# by           - amount added to the confidence of a matching field.
#
# Returns +image_fields+ (the same array that was passed in).
def increase_confidence_for_overlapping_fields(image_fields, text_fields, by: 1.0)
  return image_fields if text_fields.blank?

  # `each`, not `map`: the block is run for its side effect only and its result is never used.
  image_fields.each do |image_field|
    next if image_field.type != 'text'

    text_fields.each do |text_field|
      break if text_field.y > image_field.endy

      next if text_field.endy < image_field.y

      next unless boxes_overlap?(image_field, text_field)
      next if calculate_iou(image_field, text_field) < 0.4

      image_field.confidence += by

      break
    end
  end

  image_fields
end
|
|
# rubocop:enable Metrics, Style
|
|
end
|
|
end
|