<%= item['name'].presence || document.filename.base %>
@@ -103,7 +103,7 @@
<% (document.metadata.dig('pdf', 'number_of_pages') || (document.preview_images.loaded? ? preview_images_index.size : document.preview_images.size)).times do |index| %>
<% page = preview_images_index[index] || page_blob_struct.new(metadata: lazyload_metadata, url: preview_document_page_path(document.signed_uuid, "#{index}.jpg")) %>
" class="block before:border before:absolute before:top-0 before:bottom-0 before:left-0 before:right-0 before:rounded relative mb-4" style="container-type: size; aspect-ratio: <%= width = page.metadata['width'] %> / <%= height = page.metadata['height'] %>">
-
+
<% document_annots_index[index]&.each do |annot| %>
<%= render 'submissions/annotation', annot: %>
diff --git a/config/routes.rb b/config/routes.rb
index 43701da1..20a3035d 100644
--- a/config/routes.rb
+++ b/config/routes.rb
@@ -98,11 +98,9 @@ Rails.application.routes.draw do
resources :submissions_filters, only: %i[show], param: 'name'
resources :templates, only: %i[new create edit update show destroy] do
resource :debug, only: %i[show], controller: 'templates_debug' if Rails.env.development?
- resources :documents, only: %i[create], controller: 'template_documents'
+ resources :documents, only: %i[index create], controller: 'template_documents'
resources :clone_and_replace, only: %i[create], controller: 'templates_clone_and_replace'
- if !Docuseal.multitenant? || Docuseal.demo?
- resources :detect_fields, only: %i[create], controller: 'templates_detect_fields'
- end
+ resources :detect_fields, only: %i[create], controller: 'templates_detect_fields' unless Docuseal.multitenant?
resources :restore, only: %i[create], controller: 'templates_restore'
resources :archived, only: %i[index], controller: 'templates_archived_submissions'
resources :submissions, only: %i[new create]
diff --git a/lib/puma/plugin/sidekiq_embed.rb b/lib/puma/plugin/sidekiq_embed.rb
index c7b9db01..97c4e3be 100644
--- a/lib/puma/plugin/sidekiq_embed.rb
+++ b/lib/puma/plugin/sidekiq_embed.rb
@@ -39,6 +39,7 @@ Puma::Plugin.create do
configs = Sidekiq.configure_embed do |config|
config.logger.level = Logger::INFO
sidekiq_config = YAML.load_file('config/sidekiq.yml')
+ sidekiq_config['queues'] << 'fields' if ENV['DEMO'] == 'true'
config.queues = sidekiq_config['queues']
config.concurrency = ENV.fetch('SIDEKIQ_THREADS', 5).to_i
config.merge!(sidekiq_config)
diff --git a/lib/templates/detect_fields.rb b/lib/templates/detect_fields.rb
index 4eb90d55..1f3b5c0c 100755
--- a/lib/templates/detect_fields.rb
+++ b/lib/templates/detect_fields.rb
@@ -5,17 +5,63 @@ module Templates
module_function
TextFieldBox = Struct.new(:x, :y, :w, :h, keyword_init: true)
-
- # rubocop:disable Metrics
+ PageNode = Struct.new(:prev, :next, :elem, :page, :attachment_uuid, keyword_init: true) # doubly linked node; elem is a String of page text or a detected field
+
+ DATE_REGEXP = /
+ (?:
+ date
+ | signed\sat
+ | datum
+ )
+ \s*[:-]?\s*\z
+ /ix # text ending in a date label, optionally followed by ':' or '-'. NOTE(review): no left boundary, so "update" also matches - confirm intended
+
+ NUMBER_REGEXP = /
+ (?:
+ price
+ | \$
+ | €
+ | total
+ | quantity
+ | prix
+ | quantité
+ | preis
+ | summe
+ | gesamt(?:betrag)?
+ | menge
+ | anzahl
+ | stückzahl
+ )
+ \s*[:-]?\s*\z
+ /ix # text ending in a price, currency-sign or quantity label, optionally followed by ':' or '-'
+
+ SIGNATURE_REGEXP = /
+ (?:
+ signature
+ | sign\shere
+ | sign
+ | signez\sici
+ | signer\sici
+ | unterschrift
+ | unterschreiben
+ | unterzeichnen
+ )
+ \s*[:-]?\s*\z
+ /ix # text ending in a signature label. NOTE(review): bare "sign" has no left boundary, so "design" also matches - confirm intended
+
+ # rubocop:disable Metrics, Style
def call(io, attachment: nil, confidence: 0.3, temperature: 1, inference: Templates::ImageToFields,
- nms: 0.1, split_page: false, aspect_ratio: true, padding: 20, &)
- if attachment&.image?
- process_image_attachment(io, attachment:, confidence:, nms:, split_page:, inference:,
- temperature:, aspect_ratio:, padding:, &)
- else
- process_pdf_attachment(io, attachment:, confidence:, nms:, split_page:, inference:,
- temperature:, aspect_ratio:, padding:, &)
- end
+ nms: 0.1, split_page: false, aspect_ratio: true, padding: 20, regexp_type: true, &)
+ fields, head_node =
+ if attachment&.image?
+ process_image_attachment(io, attachment:, confidence:, nms:, split_page:, inference:,
+ temperature:, aspect_ratio:, padding:, &)
+ else
+ process_pdf_attachment(io, attachment:, confidence:, nms:, split_page:, inference:,
+ temperature:, aspect_ratio:, regexp_type:, padding:, &)
+ end
+
+ [fields, head_node]
end
def process_image_attachment(io, attachment:, confidence:, nms:, temperature:, inference:,
@@ -29,7 +75,7 @@ module Templates
{
uuid: SecureRandom.uuid,
type: f.type,
- required: true,
+ required: f.type != 'checkbox',
preferences: {},
areas: [{
x: f.x,
@@ -44,21 +90,24 @@ module Templates
yield [attachment&.uuid, 0, fields] if block_given?
- fields
+ [fields, nil]
end
def process_pdf_attachment(io, attachment:, confidence:, nms:, temperature:, inference:,
- split_page: false, aspect_ratio: false, padding: nil)
+ split_page: false, aspect_ratio: false, padding: nil, regexp_type: false)
doc = Pdfium::Document.open_bytes(io.read)
- doc.page_count.times.flat_map do |page_number|
+ head_node = PageNode.new(elem: ''.b, page: 0, attachment_uuid: attachment&.uuid)
+ tail_node = head_node
+
+ fields = doc.page_count.times.flat_map do |page_number|
page = doc.get_page(page_number)
data, width, height = page.render_to_bitmap(width: inference::RESOLUTION * 1.5)
image = Vips::Image.new_from_memory(data, width, height, 4, :uchar)
- fields = inference.call(image, confidence: 0.05, nms:, split_page:,
+ fields = inference.call(image, confidence: confidence / 4.0, nms:, split_page:,
temperature:, aspect_ratio:, padding:)
text_fields = extract_text_fields_from_page(page)
@@ -67,17 +116,23 @@ module Templates
fields = increase_confidence_for_overlapping_fields(fields, text_fields)
fields = increase_confidence_for_overlapping_fields(fields, line_fields)
- fields = fields.filter_map do |f|
- next if f.confidence < confidence
+ fields = fields.reject { |f| f.confidence < confidence }
+
+ field_nodes, tail_node = build_page_nodes(page, fields, tail_node, attachment_uuid: attachment&.uuid)
+
+ fields = field_nodes.map do |node|
+ field = node.elem
+
+ type = regexp_type ? type_from_page_node(node) : field.type
{
uuid: SecureRandom.uuid,
- type: f.type,
- required: true,
+ type:,
+ required: type != 'checkbox',
preferences: {},
areas: [{
- x: f.x, y: f.y,
- w: f.w, h: f.h,
+ x: field.x, y: field.y,
+ w: field.w, h: field.h,
page: page_number,
attachment_uuid: attachment&.uuid
}]
@@ -90,10 +145,170 @@ module Templates
ensure
page.close
end
+
+ print_debug(head_node) if Rails.env.development?
+
+ [fields, head_node]
ensure
doc.close
end
+ def print_debug(head_node) # logs the whole node chain as one string via Rails.logger.info
+ current_node = head_node
+ index = 0 # one counter shared by Field and Checkbox placeholders
+ string = ''.b
+
+ loop do
+ string <<
+ if current_node.elem.is_a?(String)
+ current_node.elem # text nodes are appended verbatim
+ else
+ "[#{current_node.elem.type == 'checkbox' ? 'Checkbox' : 'Field'}_#{index += 1}]"
+ end
+
+ current_node = current_node.next
+
+ break unless current_node # the last node has no next
+ end
+
+ Rails.logger.info(string)
+ end
+
+ def type_from_page_node(node) # returns the field type, re-typing 'text' fields from the text node right before them
+ return node.elem.type unless node.prev.elem.is_a?(String) # previous node is another field: keep the detected type; relies on node.prev being set
+ return node.elem.type unless node.elem.type == 'text' # only 'text' fields are re-typed
+
+ string = node.prev.elem
+
+ return 'date' if string.match?(DATE_REGEXP) # checked in order: date, signature, number
+ return 'signature' if string.match?(SIGNATURE_REGEXP)
+ return 'number' if string.match?(NUMBER_REGEXP)
+
+ return 'text'
+ end
+
+ def build_page_nodes(page, fields, tail_node, attachment_uuid: nil) # appends this page's text and fields to the list after tail_node; returns [field_nodes, new_tail_node]
+ field_nodes = []
+ current_text = ''.b # text accumulated since the last emitted node
+
+ text_nodes = page.text_nodes
+
+ text_idx = 0
+ field_idx = 0
+
+ while text_idx < text_nodes.length || field_idx < fields.length # merge the two sequences by position
+ text_node = text_nodes[text_idx]
+ field = fields[field_idx]
+
+ process_text_node = false
+ process_field_node = false
+
+ if text_node && field
+ text_y_center = text_node.y + (text_node.h / 2.0)
+ field_y_center = field.y + (field.h / 2.0) # also read by the underscore-skipping loop below
+ y_threshold = text_node.h / 2.0
+ vertical_distance = (text_y_center - field_y_center).abs
+
+ if vertical_distance < y_threshold # centers within half a text height: treated as the same line
+ is_underscore = text_node.content == '_'
+ is_left_of_field = text_node.x < field.x
+
+ if is_underscore && is_left_of_field
+ text_x_end = text_node.x + text_node.w
+
+ distance = field.x - text_x_end
+ proximity_threshold = text_node.w * 3.0
+
+ if distance < proximity_threshold # underscore within 3 of its widths of the field: emit the field first
+ process_field_node = true
+ else
+ process_text_node = true
+ end
+
+ elsif is_left_of_field # same line, text starts left of the field: text goes first
+ process_text_node = true
+ else
+ process_field_node = true
+ end
+
+ elsif text_node.y < field.y # different lines: whichever has the smaller y goes first
+ process_text_node = true
+ else
+ process_field_node = true
+ end
+
+ elsif text_node # only text remains
+ process_text_node = true
+ elsif field # only fields remain
+ process_field_node = true
+ end
+
+ if process_field_node
+ unless current_text.empty? # flush pending text as its own node before the field node
+ new_text_node = PageNode.new(prev: tail_node, elem: current_text, page: page.page_index, attachment_uuid:)
+ tail_node.next = new_text_node
+ tail_node = new_text_node
+ current_text = ''.b
+ end
+
+ new_field_node = PageNode.new(prev: tail_node, elem: field, page: page.page_index, attachment_uuid:)
+ tail_node.next = new_field_node
+ tail_node = new_field_node
+
+ field_nodes << tail_node
+
+ while text_idx < text_nodes.length # consume following '_' nodes that overlap the field; they are not added to the text
+ text_node_to_check = text_nodes[text_idx]
+
+ is_part_of_field = false
+
+ if text_node_to_check.content == '_'
+ check_y_center = text_node_to_check.y + (text_node_to_check.h / 2.0)
+ check_y_dist = (check_y_center - field_y_center).abs # field_y_center was set above in this iteration (text_node and field both present)
+ check_y_thresh = text_node_to_check.h / 2.0
+
+ if check_y_dist < check_y_thresh
+ padding = text_node_to_check.w * 3.0 # widen the field by 3 underscore widths on each side
+ field_x_start = field.x - padding
+ field_x_end = field.x + field.w + padding
+ text_x_start = text_node_to_check.x
+ text_x_end = text_node_to_check.x + text_node_to_check.w
+
+ is_part_of_field = true if text_x_start <= field_x_end && field_x_start <= text_x_end # horizontal interval overlap
+ end
+ end
+
+ break unless is_part_of_field # stop at the first node that does not belong to the field
+
+ text_idx += 1
+ end
+
+ field_idx += 1
+ elsif process_text_node
+ if text_idx > 0
+ prev_text_node = text_nodes[text_idx - 1]
+
+ x_gap = text_node.x - (prev_text_node.x + prev_text_node.w)
+
+ gap_w = text_node.w > prev_text_node.w ? text_node.w : prev_text_node.w # the wider of the two nodes
+
+ current_text << ' ' if x_gap > gap_w * 2 # NOTE(review): no separator when x_gap is not positive (presumably at line wraps) - confirm intended
+ end
+
+ current_text << text_node.content
+ text_idx += 1
+ end
+ end
+
+ unless current_text.empty? # flush text left over after the last field
+ new_text_node = PageNode.new(prev: tail_node, elem: current_text, page: page.page_index, attachment_uuid:)
+ tail_node.next = new_text_node
+ tail_node = new_text_node
+ end
+
+ [field_nodes, tail_node]
+ end
+
def extract_line_fields_from_page(page)
line_thickness = 5.0 / page.height
@@ -265,6 +480,6 @@ module Templates
image_fields
end
- # rubocop:enable Metrics
+ # rubocop:enable Metrics, Style
end
end