inference v2

pull/572/head
Pete Matsyburka 1 month ago
parent b10ed46ccc
commit f5b4a0c5ab

@ -59,8 +59,8 @@ module Templates
CHECKBOXES = ['☐', '□'].freeze
# rubocop:disable Metrics, Style
def call(io, attachment: nil, confidence: 0.3, temperature: 1, inference: Templates::ImageToFields,
nms: 0.1, split_page: false, aspect_ratio: true, padding: 20, regexp_type: true, &)
def call(io, attachment: nil, confidence: 0.3, temperature: 1, inference: Templates::ImageToFields, nms: 0.1,
split_page: false, aspect_ratio: true, padding: inference.model_v2? ? nil : 20, regexp_type: true, &)
fields, head_node =
if attachment&.image?
process_image_attachment(io, attachment:, confidence:, nms:, split_page:, inference:,
@ -114,7 +114,10 @@ module Templates
fields = doc.page_count.times.flat_map do |page_number|
page = doc.get_page(page_number)
data, width, height = page.render_to_bitmap(width: inference::RESOLUTION * 1.5)
size_key = page.width > page.height ? :width : :height
size = padding ? inference.resolution * 1.5 : inference.resolution
data, width, height = page.render_to_bitmap(size_key => size)
image = Vips::Image.new_from_memory(data, width, height, 4, :uchar)
@ -126,8 +129,8 @@ module Templates
fields = sort_fields(fields, y_threshold: 10.0 / image.height)
fields = increase_confidence_for_overlapping_fields(fields, text_fields)
fields = increase_confidence_for_overlapping_fields(fields, line_fields)
fields = increase_confidence_for_overlapping_fields(fields, text_fields, confidence:)
fields = increase_confidence_for_overlapping_fields(fields, line_fields, confidence:)
fields = fields.reject { |f| f.confidence < confidence }
@ -477,10 +480,11 @@ module Templates
!(box1.endx < box2.x || box2.endx < box1.x || box1.endy < box2.y || box2.endy < box1.y)
end
def increase_confidence_for_overlapping_fields(image_fields, text_fields, by: 1.0)
def increase_confidence_for_overlapping_fields(image_fields, text_fields, confidence: 1, by: 1.0)
return image_fields if text_fields.blank?
image_fields.map do |image_field|
next if image_field.confidence >= confidence
next if image_field.type != 'text'
text_fields.each do |text_field|

@ -16,7 +16,7 @@ module Templates
MODEL_PATH = Rails.root.join('tmp/model.onnx')
RESOLUTION = 704
INPUT_NAMES = %w[images input].freeze
ID_TO_CLASS = %w[text checkbox].freeze
@ -27,12 +27,14 @@ module Templates
# rubocop:disable Metrics
def call(image, confidence: 0.3, nms: 0.1, temperature: 1,
split_page: false, aspect_ratio: true, padding: nil, resolution: RESOLUTION)
base_image = image.extract_band(0, n: 3)
split_page: false, aspect_ratio: true, padding: nil, resolution: self.resolution)
image = image.extract_band(0, n: 3) if image.bands > 3
trimmed_base, base_offset_x, base_offset_y = trim_image_with_padding(base_image, padding)
trimmed_base, base_offset_x, base_offset_y = trim_image_with_padding(image, padding)
if split_page && image.height > image.width
if model_v2?
detections = call_v2(trimmed_base, base_offset_x, base_offset_y, split_page, confidence:, resolution:)
elsif split_page && image.height > image.width
regions = build_split_image_regions(trimmed_base)
detections = { xyxy: Numo::SFloat[], confidence: Numo::SFloat[], class_id: Numo::Int32[] }
@ -71,6 +73,127 @@ module Templates
build_fields_from_detections(detections, image)
end
# Runs the v2 model over `image` and returns the detections produced by
# `postprocess_outputs_v2`.
#
# image      - image object responding to `width`/`height`
#              (presumably a Vips::Image - confirm against callers).
# offset_x   - x offset forwarded to `postprocess_outputs_v2`.
# offset_y   - y offset forwarded to `postprocess_outputs_v2`; in split mode each
#              region's own `:offset_y` is added to it.
# split_page - when truthy and the image is taller than it is wide, the image is
#              cut with `build_split_image_regions` and every region is
#              processed separately, accumulating into one detections hash.
# confidence - score threshold forwarded to `postprocess_outputs_v2`.
# resolution - model input size forwarded to `preprocess_image_v2`.
def call_v2(image, offset_x, offset_y, split_page, confidence:, resolution:)
  # One inference pass: preprocess, predict, and take batch item 0 of each output.
  run_model = lambda do |img|
    input_tensor, orig_size_tensor, transform_info = preprocess_image_v2(img, resolution)
    outputs = model.predict({ 'images' => input_tensor, 'orig_target_sizes' => orig_size_tensor },
                            output_type: :numo)

    [outputs['boxes'][0, true, true], outputs['labels'][0, true], outputs['scores'][0, true], transform_info]
  end

  unless split_page && image.height > image.width
    boxes, labels, scores, transform_info = run_model.call(image)

    return postprocess_outputs_v2(boxes, labels, scores, offset_x:, offset_y:, confidence:, transform_info:)
  end

  split_regions = build_split_image_regions(image)
  empty_detections = { xyxy: Numo::SFloat[], confidence: Numo::SFloat[], class_id: Numo::Int32[] }

  split_regions.inject(empty_detections) do |collected, region|
    tile = region[:img]

    # Degenerate regions are skipped and leave the accumulator untouched.
    next collected if tile.height <= 0 || tile.width <= 0

    boxes, labels, scores, transform_info = run_model.call(tile)

    postprocess_outputs_v2(boxes, labels, scores, collected,
                           offset_x:, offset_y: offset_y + region[:offset_y],
                           confidence:, transform_info:)
  end
end
# Builds the v2 model inputs for a single image.
#
# The image is reduced to its first three bands (when it has more), scaled by a
# single ratio so it fits inside a `resolution` x `resolution` square, placed on
# a black canvas at an offset derived from the expected scaled size, and divided
# by 255.0.
#
# Returns a three-element array:
#   [input tensor reshaped to (1, 3, resolution, resolution),
#    Numo::Int64 tensor [[resolution, resolution]],
#    { ratio:, pad_w:, pad_h: } describing the applied scale and canvas offset]
def preprocess_image_v2(image, resolution)
  rgb = image.bands > 3 ? image.extract_band(0, n: 3) : image

  side = resolution.to_f
  ratio = [side / rgb.width, side / rgb.height].min

  # Offsets come from the truncated expected size, computed before resizing.
  pad_w = (resolution - (rgb.width * ratio).to_i) / 2
  pad_h = (resolution - (rgb.height * ratio).to_i) / 2

  rgb = rgb.resize(ratio, vscale: ratio, kernel: :linear) unless ratio == 1

  canvas = rgb.embed(pad_w, pad_h, resolution, resolution, background: [0, 0, 0])
  normalized = canvas / 255.0

  # NOTE(review): assumes the divided image serializes as float32 in
  # height x width x band order - confirm against Vips output format.
  pixels = Numo::SFloat.from_binary(normalized.write_to_memory, [resolution, resolution, 3])
  input_tensor = pixels.transpose(2, 0, 1).reshape(1, 3, resolution, resolution)

  [input_tensor, Numo::Int64[[resolution, resolution]], { ratio:, pad_w:, pad_h: }]
end
# Converts raw v2 model outputs into a detections hash with the keys
# :xyxy, :confidence and :class_id.
#
# Only entries whose score is strictly greater than `confidence` are kept.
# Box coordinates are mapped back by subtracting the canvas padding, dividing
# by the resize ratio (both taken from `transform_info`) and adding the given
# offsets.
#
# When `detections` is given, the kept entries are appended after the existing
# ones and a new hash is returned; when nothing passes the threshold,
# `detections` is returned unchanged (or an empty hash if it is nil).
def postprocess_outputs_v2(boxes, labels, scores, detections = nil, offset_x:, offset_y:, confidence:,
                           transform_info:)
  selected = scores.gt(confidence).where

  if selected.empty?
    return detections || { xyxy: Numo::SFloat[], confidence: Numo::SFloat[], class_id: Numo::Int32[] }
  end

  kept_scores = scores[selected]
  kept_labels = labels[selected]
  kept_boxes = boxes[selected, true]

  scale = transform_info[:ratio]
  horizontal = [transform_info[:pad_w], offset_x]
  vertical = [transform_info[:pad_h], offset_y]

  # Columns 0 and 2 use the horizontal pad/offset, columns 1 and 3 the vertical one.
  [horizontal, vertical, horizontal, vertical].each_with_index do |(pad, offset), column|
    kept_boxes[true, column] = ((kept_boxes[true, column] - pad) / scale) + offset
  end

  return { xyxy: kept_boxes, confidence: kept_scores, class_id: kept_labels } unless detections

  previous_count = detections[:xyxy].shape[0]
  merged_count = previous_count + kept_boxes.shape[0]

  merged_boxes = Numo::SFloat.zeros(merged_count, 4)
  merged_scores = Numo::SFloat.zeros(merged_count)
  merged_labels = Numo::Int32.zeros(merged_count)

  # Copy the earlier detections first; skipped when there are none.
  if previous_count.positive?
    earlier_rows = 0...previous_count

    merged_boxes[earlier_rows, true] = detections[:xyxy]
    merged_scores[earlier_rows] = detections[:confidence]
    merged_labels[earlier_rows] = detections[:class_id]
  end

  appended_rows = previous_count...merged_count

  merged_boxes[appended_rows, true] = kept_boxes
  merged_scores[appended_rows] = kept_scores
  merged_labels[appended_rows] = kept_labels

  { xyxy: merged_boxes, confidence: merged_scores, class_id: merged_labels }
end
def build_split_image_regions(image)
half_h = image.height / 2
top_h = half_h
@ -212,7 +335,7 @@ module Templates
end
def postprocess_outputs(boxes, logits, transform_info, detections = nil, confidence: 0.3, temperature: 1,
resolution: RESOLUTION)
resolution: self.resolution)
scaled_logits = logits / temperature
probs = 1.0 / (1.0 + Numo::NMath.exp(-scaled_logits))
@ -326,6 +449,14 @@ module Templates
providers: ['CPUExecutionProvider']
)
end
# Model input resolution, read from index 2 of the `:shape` of the first model
# input whose name is listed in INPUT_NAMES. The value is memoized in
# @resolution once it is truthy.
def resolution
  return @resolution if @resolution

  image_input = model.inputs.find { |input| INPUT_NAMES.include?(input[:name]) }

  @resolution = image_input.dig(:shape, 2)
end
# Whether the loaded model is a v2 model, detected by the presence of an input
# named 'orig_target_sizes'.
#
# The result is memoized with an explicit nil check rather than `||=`, so a
# `false` answer is cached too and `model.inputs` is not re-scanned on every
# call.
def model_v2?
  @model_v2 = model.inputs.any? { |input| input[:name] == 'orig_target_sizes' } if @model_v2.nil?

  @model_v2
end
# rubocop:enable Metrics
end
end

Loading…
Cancel
Save