Compare commits

...

6 Commits

Author SHA1 Message Date
Alex Turchyn f87ef670d1
Merge from docusealco/wip
1 month ago
Pete Matsyburka daba2505bc fix detection
1 month ago
Pete Matsyburka 71528eda45 map page nodes
1 month ago
Pete Matsyburka 269e2beeaa fix preview
1 month ago
Pete Matsyburka 72dd2c6bc5 add template download
1 month ago
Pete Matsyburka 9758f7a15f adjust detection
1 month ago

@ -3,6 +3,12 @@
class TemplateDocumentsController < ApplicationController class TemplateDocumentsController < ApplicationController
load_and_authorize_resource :template load_and_authorize_resource :template
FILES_TTL = 5.minutes
# Responds with a JSON array of proxy URLs, one per schema document of the
# loaded template. Every URL is requested with an `expires_at` timestamp
# FILES_TTL from the moment it is built.
def index
  document_urls = @template.schema_documents.map do |document|
    ActiveStorage::Blob.proxy_url(document.blob, expires_at: FILES_TTL.from_now.to_i)
  end

  render json: document_urls
end
def create def create
if params[:blobs].blank? && params[:files].blank? if params[:blobs].blank? && params[:files].blank?
return render json: { error: I18n.t('file_is_missing') }, status: :unprocessable_content return render json: { error: I18n.t('file_is_missing') }, status: :unprocessable_content

@ -17,7 +17,7 @@ class TemplatesDebugController < ApplicationController
fields = Templates::FindAcroFields.call(pdf, attachment, data) fields = Templates::FindAcroFields.call(pdf, attachment, data)
end end
fields = Templates::DetectFields.call(StringIO.new(data), attachment:) if fields.blank? fields, = Templates::DetectFields.call(StringIO.new(data), attachment:) if fields.blank?
attachment.metadata['pdf'] ||= {} attachment.metadata['pdf'] ||= {}
attachment.metadata['pdf']['fields'] = fields attachment.metadata['pdf']['fields'] = fields

@ -167,6 +167,7 @@ safeRegisterElement('template-builder', class extends HTMLElement {
withConditions: this.dataset.withConditions === 'true', withConditions: this.dataset.withConditions === 'true',
withGoogleDrive: this.dataset.withGoogleDrive === 'true', withGoogleDrive: this.dataset.withGoogleDrive === 'true',
withReplaceAndCloneUpload: true, withReplaceAndCloneUpload: true,
withDownload: true,
currencies: (this.dataset.currencies || '').split(',').filter(Boolean), currencies: (this.dataset.currencies || '').split(',').filter(Boolean),
acceptFileTypes: this.dataset.acceptFileTypes, acceptFileTypes: this.dataset.acceptFileTypes,
showTourStartForm: this.dataset.showTourStartForm === 'true' showTourStartForm: this.dataset.showTourStartForm === 'true'

@ -175,7 +175,10 @@
{{ t('save') }} {{ t('save') }}
</span> </span>
</button> </button>
<div class="dropdown dropdown-end"> <div
class="dropdown dropdown-end"
:class="{ 'dropdown-open': isDownloading }"
>
<label <label
tabindex="0" tabindex="0"
class="base-button !rounded-l-none !pl-1 !pr-2 !border-l-neutral-500" class="base-button !rounded-l-none !pl-1 !pr-2 !border-l-neutral-500"
@ -209,6 +212,30 @@
<span class="whitespace-nowrap">{{ t('preferences') }}</span> <span class="whitespace-nowrap">{{ t('preferences') }}</span>
</a> </a>
</li> </li>
<li v-if="withDownload">
<button
class="flex space-x-2"
:disabled="isDownloading"
@click.stop.prevent="download"
>
<IconInnerShadowTop
v-if="isDownloading"
class="animate-spin w-6 h-6 flex-shrink-0"
/>
<IconDownload
v-else
class="w-6 h-6 flex-shrink-0"
/>
<span
v-if="isDownloading"
class="whitespace-nowrap"
>{{ t('downloading_') }}</span>
<span
v-else
class="whitespace-nowrap"
>{{ t('download') }}</span>
</button>
</li>
</ul> </ul>
</div> </div>
</span> </span>
@ -457,6 +484,7 @@
:show-tour-start-form="showTourStartForm" :show-tour-start-form="showTourStartForm"
@add-field="addField" @add-field="addField"
@set-draw="[drawField = $event.field, drawOption = $event.option]" @set-draw="[drawField = $event.field, drawOption = $event.option]"
@select-submitter="selectedSubmitter = $event"
@set-draw-type="[drawFieldType = $event, showDrawField = true]" @set-draw-type="[drawFieldType = $event, showDrawField = true]"
@set-drag="dragField = $event" @set-drag="dragField = $event"
@set-drag-placeholder="$refs.dragPlaceholder.dragPlaceholder = $event" @set-drag-placeholder="$refs.dragPlaceholder.dragPlaceholder = $event"
@ -511,7 +539,7 @@ import DocumentPreview from './preview'
import DocumentControls from './controls' import DocumentControls from './controls'
import MobileFields from './mobile_fields' import MobileFields from './mobile_fields'
import FieldSubmitter from './field_submitter' import FieldSubmitter from './field_submitter'
import { IconPlus, IconUsersPlus, IconDeviceFloppy, IconChevronDown, IconEye, IconWritingSign, IconInnerShadowTop, IconInfoCircle, IconAdjustments } from '@tabler/icons-vue' import { IconPlus, IconUsersPlus, IconDeviceFloppy, IconChevronDown, IconEye, IconWritingSign, IconInnerShadowTop, IconInfoCircle, IconAdjustments, IconDownload } from '@tabler/icons-vue'
import { v4 } from 'uuid' import { v4 } from 'uuid'
import { ref, computed, toRaw } from 'vue' import { ref, computed, toRaw } from 'vue'
import * as i18n from './i18n' import * as i18n from './i18n'
@ -537,6 +565,7 @@ export default {
Contenteditable, Contenteditable,
IconUsersPlus, IconUsersPlus,
IconChevronDown, IconChevronDown,
IconDownload,
IconAdjustments, IconAdjustments,
IconEye, IconEye,
IconDeviceFloppy IconDeviceFloppy
@ -584,6 +613,11 @@ export default {
required: false, required: false,
default: null default: null
}, },
withDownload: {
type: Boolean,
required: false,
default: false
},
backgroundColor: { backgroundColor: {
type: String, type: String,
required: false, required: false,
@ -805,6 +839,7 @@ export default {
return { return {
documentRefs: [], documentRefs: [],
isBreakpointLg: false, isBreakpointLg: false,
isDownloading: false,
isLoadingBlankPage: false, isLoadingBlankPage: false,
isSaving: false, isSaving: false,
selectedSubmitter: null, selectedSubmitter: null,
@ -963,6 +998,75 @@ export default {
}, },
methods: { methods: {
toRaw, toRaw,
download () {
this.isDownloading = true
this.baseFetch(`/templates/${this.template.id}/documents`).then(async (response) => {
if (response.ok) {
const urls = await response.json()
const isMobileSafariIos = 'ontouchstart' in window && navigator.maxTouchPoints > 0 && /AppleWebKit/i.test(navigator.userAgent)
const isSafariIos = isMobileSafariIos || /iPhone|iPad|iPod/i.test(navigator.userAgent)
if (isSafariIos && urls.length > 1) {
this.downloadSafariIos(urls)
} else {
this.downloadUrls(urls)
}
} else {
alert(this.t('failed_to_download_files'))
}
})
},
downloadUrls (urls) {
const fileRequests = urls.map((url) => {
return () => {
return fetch(url).then(async (resp) => {
const blobUrl = URL.createObjectURL(await resp.blob())
const link = document.createElement('a')
link.href = blobUrl
link.setAttribute('download', decodeURI(url.split('/').pop()))
link.click()
URL.revokeObjectURL(blobUrl)
})
}
})
fileRequests.reduce(
(prevPromise, request) => prevPromise.then(() => request()),
Promise.resolve()
).finally(() => {
this.isDownloading = false
})
},
downloadSafariIos (urls) {
const fileRequests = urls.map((url) => {
return fetch(url).then(async (resp) => {
const blob = await resp.blob()
const blobUrl = URL.createObjectURL(blob.slice(0, blob.size, 'application/octet-stream'))
const link = document.createElement('a')
link.href = blobUrl
link.setAttribute('download', decodeURI(url.split('/').pop()))
return link
})
})
Promise.all(fileRequests).then((links) => {
links.forEach((link, index) => {
setTimeout(() => {
link.click()
URL.revokeObjectURL(link.href)
}, index * 50)
})
}).finally(() => {
this.isDownloading = false
})
},
onDragover (e) { onDragover (e) {
if (this.$refs.dragPlaceholder?.dragPlaceholder) { if (this.$refs.dragPlaceholder?.dragPlaceholder) {
this.$refs.dragPlaceholder.isMask = e.target.id === 'mask' this.$refs.dragPlaceholder.isMask = e.target.id === 'mask'

@ -222,7 +222,16 @@
width="22" width="22"
class="animate-spin" class="animate-spin"
/> />
<span class="hidden md:inline"> <span
v-if="analyzingProgress"
class="hidden md:inline"
>
{{ Math.round(analyzingProgress * 100) }}% {{ t('analyzing_') }}
</span>
<span
v-else
class="hidden md:inline"
>
{{ fieldPagesLoaded }} / {{ numberOfPages }} {{ t('processing_') }} {{ fieldPagesLoaded }} / {{ numberOfPages }} {{ t('processing_') }}
</span> </span>
</template> </template>
@ -363,10 +372,11 @@ export default {
default: false default: false
} }
}, },
emits: ['add-field', 'set-draw', 'set-draw-type', 'set-drag', 'drag-end', 'scroll-to-area', 'change-submitter', 'set-drag-placeholder'], emits: ['add-field', 'set-draw', 'set-draw-type', 'set-drag', 'drag-end', 'scroll-to-area', 'change-submitter', 'set-drag-placeholder', 'select-submitter'],
data () { data () {
return { return {
fieldPagesLoaded: null, fieldPagesLoaded: null,
analyzingProgress: 0,
defaultFieldsSearch: '' defaultFieldsSearch: ''
} }
}, },
@ -448,8 +458,6 @@ export default {
while (true) { while (true) {
const { value, done } = await reader.read() const { value, done } = await reader.read()
if (done) break
buffer += decoder.decode(value, { stream: true }) buffer += decoder.decode(value, { stream: true })
const lines = buffer.split('\n\n') const lines = buffer.split('\n\n')
@ -464,10 +472,21 @@ export default {
if (data.error) { if (data.error) {
alert(data.error) alert(data.error)
this.template.fields = data.fields || fields
break break
} else if (data.analyzing) {
this.analyzingProgress = data.progress
} else if (data.completed) { } else if (data.completed) {
this.fieldPagesLoaded = null this.fieldPagesLoaded = null
this.template.fields = fields
if (data.submitters) {
this.template.submitters = data.submitters
this.$emit('select-submitter', this.template.submitters[0])
}
this.template.fields = data.fields || fields
this.save() this.save()
break break
@ -484,11 +503,14 @@ export default {
} }
} }
} }
if (done) break
} }
}).catch(error => { }).catch(error => {
console.error('Error in streaming message: ', error) console.error('Error in streaming message: ', error)
}).finally(() => { }).finally(() => {
this.fieldPagesLoaded = null this.fieldPagesLoaded = null
this.analyzingProgress = null
this.isFieldsLoading = false this.isFieldsLoading = false
}) })
}, },

@ -1,4 +1,7 @@
const en = { const en = {
analyzing_: 'Analyzing...',
download: 'Download',
downloading_: 'Downloading...',
view: 'View', view: 'View',
autodetect_fields: 'Autodetect fields', autodetect_fields: 'Autodetect fields',
payment_link: 'Payment link', payment_link: 'Payment link',
@ -185,6 +188,9 @@ const en = {
} }
const es = { const es = {
analyzing_: 'Analizando...',
download: 'Descargar',
downloading_: 'Descargando...',
view: 'Vista', view: 'Vista',
payment_link: 'Enlace de pago', payment_link: 'Enlace de pago',
strikeout: 'Tachar', strikeout: 'Tachar',
@ -370,6 +376,9 @@ const es = {
} }
const it = { const it = {
analyzing_: 'Analisi...',
download: 'Scarica',
downloading_: 'Download in corso...',
view: 'Vista', view: 'Vista',
payment_link: 'Link di pagamento', payment_link: 'Link di pagamento',
strikeout: 'Barrato', strikeout: 'Barrato',
@ -555,6 +564,9 @@ const it = {
} }
const pt = { const pt = {
analyzing_: 'Analisando...',
download: 'Baixar',
downloading_: 'Baixando...',
view: 'Visualizar', view: 'Visualizar',
payment_link: 'Link de pagamento', payment_link: 'Link de pagamento',
strikeout: 'Tachado', strikeout: 'Tachado',
@ -740,6 +752,9 @@ const pt = {
} }
const fr = { const fr = {
analyzing_: 'Analyse...',
download: 'Télécharger',
downloading_: 'Téléchargement...',
view: 'Voir', view: 'Voir',
payment_link: 'Lien de paiement', payment_link: 'Lien de paiement',
strikeout: 'Rature', strikeout: 'Rature',
@ -925,6 +940,9 @@ const fr = {
} }
const de = { const de = {
analyzing_: 'Analysiere...',
download: 'Download',
downloading_: 'Download...',
view: 'Anzeigen', view: 'Anzeigen',
payment_link: 'Zahlungslink', payment_link: 'Zahlungslink',
strikeout: 'Durchstreichen', strikeout: 'Durchstreichen',
@ -1110,6 +1128,9 @@ const de = {
} }
const nl = { const nl = {
analyzing_: 'Analyseren...',
download: 'Downloaden',
downloading_: 'Downloaden...',
view: 'Bekijken', view: 'Bekijken',
payment_link: 'Betaallink', payment_link: 'Betaallink',
strikeout: 'Doorhalen', strikeout: 'Doorhalen',

@ -81,7 +81,7 @@
<% document = @submission.schema_documents.find { |a| item['attachment_uuid'] == a.uuid } %> <% document = @submission.schema_documents.find { |a| item['attachment_uuid'] == a.uuid } %>
<% if document.preview_images.first %> <% if document.preview_images.first %>
<scroll-to data-selector-id="page-<%= document.uuid %>-0" class="block cursor-pointer"> <scroll-to data-selector-id="page-<%= document.uuid %>-0" class="block cursor-pointer">
<img src="<%= Docuseal::URL_CACHE.fetch([document.id, document.uuid, 0].join(':'), expires_in: 10.minutes) { document.preview_images.first.url } %>" width="<%= document.preview_images.first.metadata['width'] %>" height="<%= document.preview_images.first.metadata['height'] %>" class="rounded border" loading="lazy"> <img src="<%= (document.preview_images.find { |e| e.filename.base.to_i.zero? } || document.preview_images.first).url %>" width="<%= document.preview_images.first.metadata['width'] %>" height="<%= document.preview_images.first.metadata['height'] %>" class="rounded border" loading="lazy">
<div class="pb-2 pt-1.5 text-center" dir="auto"> <div class="pb-2 pt-1.5 text-center" dir="auto">
<%= item['name'].presence || document.filename.base %> <%= item['name'].presence || document.filename.base %>
</div> </div>
@ -103,7 +103,7 @@
<% (document.metadata.dig('pdf', 'number_of_pages') || (document.preview_images.loaded? ? preview_images_index.size : document.preview_images.size)).times do |index| %> <% (document.metadata.dig('pdf', 'number_of_pages') || (document.preview_images.loaded? ? preview_images_index.size : document.preview_images.size)).times do |index| %>
<% page = preview_images_index[index] || page_blob_struct.new(metadata: lazyload_metadata, url: preview_document_page_path(document.signed_uuid, "#{index}.jpg")) %> <% page = preview_images_index[index] || page_blob_struct.new(metadata: lazyload_metadata, url: preview_document_page_path(document.signed_uuid, "#{index}.jpg")) %>
<page-container id="<%= "page-#{document.uuid}-#{index}" %>" class="block before:border before:absolute before:top-0 before:bottom-0 before:left-0 before:right-0 before:rounded relative mb-4" style="container-type: size; aspect-ratio: <%= width = page.metadata['width'] %> / <%= height = page.metadata['height'] %>"> <page-container id="<%= "page-#{document.uuid}-#{index}" %>" class="block before:border before:absolute before:top-0 before:bottom-0 before:left-0 before:right-0 before:rounded relative mb-4" style="container-type: size; aspect-ratio: <%= width = page.metadata['width'] %> / <%= height = page.metadata['height'] %>">
<img loading="lazy" src="<%= Docuseal::URL_CACHE.fetch([document.id, document.uuid, index].join(':'), expires_in: 10.minutes) { page.url } %>" width="<%= width %>" class="rounded" height="<%= height %>"> <img loading="lazy" src="<%= page.url %>" width="<%= width %>" class="rounded" height="<%= height %>">
<div class="top-0 bottom-0 left-0 right-0 absolute"> <div class="top-0 bottom-0 left-0 right-0 absolute">
<% document_annots_index[index]&.each do |annot| %> <% document_annots_index[index]&.each do |annot| %>
<%= render 'submissions/annotation', annot: %> <%= render 'submissions/annotation', annot: %>

@ -98,11 +98,9 @@ Rails.application.routes.draw do
resources :submissions_filters, only: %i[show], param: 'name' resources :submissions_filters, only: %i[show], param: 'name'
resources :templates, only: %i[new create edit update show destroy] do resources :templates, only: %i[new create edit update show destroy] do
resource :debug, only: %i[show], controller: 'templates_debug' if Rails.env.development? resource :debug, only: %i[show], controller: 'templates_debug' if Rails.env.development?
resources :documents, only: %i[create], controller: 'template_documents' resources :documents, only: %i[index create], controller: 'template_documents'
resources :clone_and_replace, only: %i[create], controller: 'templates_clone_and_replace' resources :clone_and_replace, only: %i[create], controller: 'templates_clone_and_replace'
if !Docuseal.multitenant? || Docuseal.demo? resources :detect_fields, only: %i[create], controller: 'templates_detect_fields' unless Docuseal.multitenant?
resources :detect_fields, only: %i[create], controller: 'templates_detect_fields'
end
resources :restore, only: %i[create], controller: 'templates_restore' resources :restore, only: %i[create], controller: 'templates_restore'
resources :archived, only: %i[index], controller: 'templates_archived_submissions' resources :archived, only: %i[index], controller: 'templates_archived_submissions'
resources :submissions, only: %i[new create] resources :submissions, only: %i[new create]

@ -39,6 +39,7 @@ Puma::Plugin.create do
configs = Sidekiq.configure_embed do |config| configs = Sidekiq.configure_embed do |config|
config.logger.level = Logger::INFO config.logger.level = Logger::INFO
sidekiq_config = YAML.load_file('config/sidekiq.yml') sidekiq_config = YAML.load_file('config/sidekiq.yml')
sidekiq_config['queues'] << 'fields' if ENV['DEMO'] == 'true'
config.queues = sidekiq_config['queues'] config.queues = sidekiq_config['queues']
config.concurrency = ENV.fetch('SIDEKIQ_THREADS', 5).to_i config.concurrency = ENV.fetch('SIDEKIQ_THREADS', 5).to_i
config.merge!(sidekiq_config) config.merge!(sidekiq_config)

@ -5,17 +5,63 @@ module Templates
module_function module_function
TextFieldBox = Struct.new(:x, :y, :w, :h, keyword_init: true) TextFieldBox = Struct.new(:x, :y, :w, :h, keyword_init: true)
PageNode = Struct.new(:prev, :next, :elem, :page, :attachment_uuid, keyword_init: true)
# rubocop:disable Metrics
# Matches label text that ENDS with a date keyword ("date", "signed at",
# "datum"), optionally followed by whitespace and a single ':' or '-'.
# Case-insensitive (/i); whitespace inside the literal is ignored (/x).
# Used by type_from_page_node to turn a 'text' field into a 'date' field.
# NOTE(review): only the end is anchored (\z) and there is no word boundary
# before the keyword, so labels such as "Update:" also match - confirm intended.
DATE_REGEXP = /
(?:
date
| signed\sat
| datum
)
\s*[:-]?\s*\z
/ix
# Matches label text that ENDS with a price/quantity keyword or a currency
# sign, optionally followed by whitespace and a single ':' or '-'.
# Case-insensitive (/i); whitespace inside the literal is ignored (/x).
# Used by type_from_page_node to turn a 'text' field into a 'number' field.
#
# Every alternative must be non-empty: the pattern is anchored only at the end
# (\z), so an empty branch would make it match ANY string and every remaining
# text field would be typed as a number.
NUMBER_REGEXP = /
  (?:
    price
    | \$
    | €
    | total
    | quantity
    | prix
    | quantité
    | preis
    | summe
    | gesamt(?:betrag)?
    | menge
    | anzahl
    | stückzahl
  )
  \s*[:-]?\s*\z
/ix
# Matches label text that ENDS with a signature keyword (e.g. "signature",
# "sign here", "signez ici", "unterschrift"), optionally followed by
# whitespace and a single ':' or '-'.
# Case-insensitive (/i); whitespace inside the literal is ignored (/x).
# Used by type_from_page_node to turn a 'text' field into a 'signature' field.
# NOTE(review): only the end is anchored (\z) and there is no word boundary
# before the keyword, so labels such as "Design:" also match the bare "sign"
# alternative - confirm intended.
SIGNATURE_REGEXP = /
(?:
signature
| sign\shere
| sign
| signez\sici
| signer\sici
| unterschrift
| unterschreiben
| unterzeichnen
)
\s*[:-]?\s*\z
/ix
# rubocop:disable Metrics, Style
def call(io, attachment: nil, confidence: 0.3, temperature: 1, inference: Templates::ImageToFields, def call(io, attachment: nil, confidence: 0.3, temperature: 1, inference: Templates::ImageToFields,
nms: 0.1, split_page: false, aspect_ratio: true, padding: 20, &) nms: 0.1, split_page: false, aspect_ratio: true, padding: 20, regexp_type: true, &)
if attachment&.image? fields, head_node =
process_image_attachment(io, attachment:, confidence:, nms:, split_page:, inference:, if attachment&.image?
temperature:, aspect_ratio:, padding:, &) process_image_attachment(io, attachment:, confidence:, nms:, split_page:, inference:,
else temperature:, aspect_ratio:, padding:, &)
process_pdf_attachment(io, attachment:, confidence:, nms:, split_page:, inference:, else
temperature:, aspect_ratio:, padding:, &) process_pdf_attachment(io, attachment:, confidence:, nms:, split_page:, inference:,
end temperature:, aspect_ratio:, regexp_type:, padding:, &)
end
[fields, head_node]
end end
def process_image_attachment(io, attachment:, confidence:, nms:, temperature:, inference:, def process_image_attachment(io, attachment:, confidence:, nms:, temperature:, inference:,
@ -29,7 +75,7 @@ module Templates
{ {
uuid: SecureRandom.uuid, uuid: SecureRandom.uuid,
type: f.type, type: f.type,
required: true, required: f.type != 'checkbox',
preferences: {}, preferences: {},
areas: [{ areas: [{
x: f.x, x: f.x,
@ -44,21 +90,24 @@ module Templates
yield [attachment&.uuid, 0, fields] if block_given? yield [attachment&.uuid, 0, fields] if block_given?
fields [fields, nil]
end end
def process_pdf_attachment(io, attachment:, confidence:, nms:, temperature:, inference:, def process_pdf_attachment(io, attachment:, confidence:, nms:, temperature:, inference:,
split_page: false, aspect_ratio: false, padding: nil) split_page: false, aspect_ratio: false, padding: nil, regexp_type: false)
doc = Pdfium::Document.open_bytes(io.read) doc = Pdfium::Document.open_bytes(io.read)
doc.page_count.times.flat_map do |page_number| head_node = PageNode.new(elem: ''.b, page: 0, attachment_uuid: attachment&.uuid)
tail_node = head_node
fields = doc.page_count.times.flat_map do |page_number|
page = doc.get_page(page_number) page = doc.get_page(page_number)
data, width, height = page.render_to_bitmap(width: inference::RESOLUTION * 1.5) data, width, height = page.render_to_bitmap(width: inference::RESOLUTION * 1.5)
image = Vips::Image.new_from_memory(data, width, height, 4, :uchar) image = Vips::Image.new_from_memory(data, width, height, 4, :uchar)
fields = inference.call(image, confidence: 0.05, nms:, split_page:, fields = inference.call(image, confidence: confidence / 4.0, nms:, split_page:,
temperature:, aspect_ratio:, padding:) temperature:, aspect_ratio:, padding:)
text_fields = extract_text_fields_from_page(page) text_fields = extract_text_fields_from_page(page)
@ -67,17 +116,23 @@ module Templates
fields = increase_confidence_for_overlapping_fields(fields, text_fields) fields = increase_confidence_for_overlapping_fields(fields, text_fields)
fields = increase_confidence_for_overlapping_fields(fields, line_fields) fields = increase_confidence_for_overlapping_fields(fields, line_fields)
fields = fields.filter_map do |f| fields = fields.reject { |f| f.confidence < confidence }
next if f.confidence < confidence
field_nodes, tail_node = build_page_nodes(page, fields, tail_node, attachment_uuid: attachment&.uuid)
fields = field_nodes.map do |node|
field = node.elem
type = regexp_type ? type_from_page_node(node) : field.type
{ {
uuid: SecureRandom.uuid, uuid: SecureRandom.uuid,
type: f.type, type:,
required: true, required: type != 'checkbox',
preferences: {}, preferences: {},
areas: [{ areas: [{
x: f.x, y: f.y, x: field.x, y: field.y,
w: f.w, h: f.h, w: field.w, h: field.h,
page: page_number, page: page_number,
attachment_uuid: attachment&.uuid attachment_uuid: attachment&.uuid
}] }]
@ -90,10 +145,170 @@ module Templates
ensure ensure
page.close page.close
end end
print_debug(head_node) if Rails.env.development?
[fields, head_node]
ensure ensure
doc.close doc.close
end end
# Logs the whole node list starting at +head_node+ as a single line.
# String nodes are written verbatim; every other node is rendered as a
# "[Checkbox_N]" or "[Field_N]" placeholder, where N counts the non-string
# nodes seen so far (starting at 1).
def print_debug(head_node)
  placeholder_count = 0
  output = ''.b
  node = head_node

  loop do
    elem = node.elem

    if elem.is_a?(String)
      output << elem
    else
      label = elem.type == 'checkbox' ? 'Checkbox' : 'Field'
      placeholder_count += 1
      output << "[#{label}_#{placeholder_count}]"
    end

    node = node.next

    break unless node
  end

  Rails.logger.info(output)
end
# Refines the detected type of a field node from the text right before it.
#
# Only fields detected as 'text' whose previous node holds a String are
# re-classified. The preceding text is checked against DATE_REGEXP,
# SIGNATURE_REGEXP and NUMBER_REGEXP in that order; the first match wins and
# 'text' is kept when none matches. Any other node keeps its detected type.
def type_from_page_node(node)
  preceding = node.prev.elem
  detected_type = node.elem.type

  return detected_type unless preceding.is_a?(String) && detected_type == 'text'

  if preceding.match?(DATE_REGEXP)
    'date'
  elsif preceding.match?(SIGNATURE_REGEXP)
    'signature'
  elsif preceding.match?(NUMBER_REGEXP)
    'number'
  else
    'text'
  end
end
# Merges one page's text nodes and detected fields into the linked list of
# PageNode entries that currently ends at +tail_node+.
#
# Text node contents are accumulated into one string, which is flushed as a
# single String PageNode right before each field and once more at the end of
# the page. A field node's +prev+ therefore holds the text collected since the
# previous field, when any was collected.
#
# page            - object responding to +text_nodes+ and +page_index+.
# fields          - detected fields of the page; each responds to x, y, w, h.
#                   NOTE(review): the two-index merge below presumably expects
#                   both lists to already be in reading order - confirm.
# tail_node       - current last PageNode; new nodes are linked after it.
# attachment_uuid - stored on every PageNode created here.
#
# Returns [field_nodes, tail_node]: the PageNodes created for fields, in the
# order they were consumed, and the new tail of the list.
def build_page_nodes(page, fields, tail_node, attachment_uuid: nil)
field_nodes = []
current_text = ''.b
text_nodes = page.text_nodes
text_idx = 0
field_idx = 0
# Two-index merge: each iteration consumes either the current text node or the current field.
while text_idx < text_nodes.length || field_idx < fields.length
text_node = text_nodes[text_idx]
field = fields[field_idx]
process_text_node = false
process_field_node = false
if text_node && field
text_y_center = text_node.y + (text_node.h / 2.0)
field_y_center = field.y + (field.h / 2.0)
y_threshold = text_node.h / 2.0
vertical_distance = (text_y_center - field_y_center).abs
# Same line: the vertical centers are closer than half the text node's height.
if vertical_distance < y_threshold
is_underscore = text_node.content == '_'
is_left_of_field = text_node.x < field.x
if is_underscore && is_left_of_field
text_x_end = text_node.x + text_node.w
distance = field.x - text_x_end
proximity_threshold = text_node.w * 3.0
# An underscore ending less than three of its own widths before the field
# yields to the field (it is skipped below as part of the field); a more
# distant one is kept as ordinary text.
if distance < proximity_threshold
process_field_node = true
else
process_text_node = true
end
elsif is_left_of_field
process_text_node = true
else
process_field_node = true
end
# Different lines: whichever has the smaller y is consumed first.
elsif text_node.y < field.y
process_text_node = true
else
process_field_node = true
end
elsif text_node
process_text_node = true
elsif field
process_field_node = true
end
if process_field_node
# Flush the text gathered so far so it becomes the field node's predecessor.
unless current_text.empty?
new_text_node = PageNode.new(prev: tail_node, elem: current_text, page: page.page_index, attachment_uuid:)
tail_node.next = new_text_node
tail_node = new_text_node
current_text = ''.b
end
new_field_node = PageNode.new(prev: tail_node, elem: field, page: page.page_index, attachment_uuid:)
tail_node.next = new_field_node
tail_node = new_field_node
field_nodes << tail_node
# Skip the following '_' text nodes that sit on the field's line and
# horizontally overlap the field widened by three underscore widths on each
# side, so they never reach the text stream (presumably they are the blank
# drawn with underscores that the field covers).
# field_y_center is always fresh here: this loop only runs while a text node
# is left, which means the `text_node && field` branch above was taken.
while text_idx < text_nodes.length
text_node_to_check = text_nodes[text_idx]
is_part_of_field = false
if text_node_to_check.content == '_'
check_y_center = text_node_to_check.y + (text_node_to_check.h / 2.0)
check_y_dist = (check_y_center - field_y_center).abs
check_y_thresh = text_node_to_check.h / 2.0
if check_y_dist < check_y_thresh
padding = text_node_to_check.w * 3.0
field_x_start = field.x - padding
field_x_end = field.x + field.w + padding
text_x_start = text_node_to_check.x
text_x_end = text_node_to_check.x + text_node_to_check.w
is_part_of_field = true if text_x_start <= field_x_end && field_x_start <= text_x_end
end
end
break unless is_part_of_field
text_idx += 1
end
field_idx += 1
elsif process_text_node
# Insert a space when the horizontal gap to the previous text node exceeds
# twice the wider of the two nodes. The previous node is text_nodes[text_idx - 1]
# even if it was skipped as part of a field rather than appended.
if text_idx > 0
prev_text_node = text_nodes[text_idx - 1]
x_gap = text_node.x - (prev_text_node.x + prev_text_node.w)
gap_w = text_node.w > prev_text_node.w ? text_node.w : prev_text_node.w
current_text << ' ' if x_gap > gap_w * 2
end
current_text << text_node.content
text_idx += 1
end
end
# Text left after the page's last field becomes a trailing String node.
unless current_text.empty?
new_text_node = PageNode.new(prev: tail_node, elem: current_text, page: page.page_index, attachment_uuid:)
tail_node.next = new_text_node
tail_node = new_text_node
end
[field_nodes, tail_node]
end
def extract_line_fields_from_page(page) def extract_line_fields_from_page(page)
line_thickness = 5.0 / page.height line_thickness = 5.0 / page.height
@ -265,6 +480,6 @@ module Templates
image_fields image_fields
end end
# rubocop:enable Metrics # rubocop:enable Metrics, Style
end end
end end

Loading…
Cancel
Save