|
|
|
@ -16,7 +16,7 @@ module Templates
|
|
|
|
|
|
|
|
|
|
|
|
MODEL_PATH = Rails.root.join('tmp/model.onnx')
|
|
|
|
MODEL_PATH = Rails.root.join('tmp/model.onnx')
|
|
|
|
|
|
|
|
|
|
|
|
RESOLUTION = 704
|
|
|
|
INPUT_NAMES = %w[images input].freeze
|
|
|
|
|
|
|
|
|
|
|
|
ID_TO_CLASS = %w[text checkbox].freeze
|
|
|
|
ID_TO_CLASS = %w[text checkbox].freeze
|
|
|
|
|
|
|
|
|
|
|
|
@ -27,12 +27,14 @@ module Templates
|
|
|
|
|
|
|
|
|
|
|
|
# rubocop:disable Metrics
|
|
|
|
# rubocop:disable Metrics
|
|
|
|
def call(image, confidence: 0.3, nms: 0.1, temperature: 1,
|
|
|
|
def call(image, confidence: 0.3, nms: 0.1, temperature: 1,
|
|
|
|
split_page: false, aspect_ratio: true, padding: nil, resolution: RESOLUTION)
|
|
|
|
split_page: false, aspect_ratio: true, padding: nil, resolution: self.resolution)
|
|
|
|
base_image = image.extract_band(0, n: 3)
|
|
|
|
image = image.extract_band(0, n: 3) if image.bands > 3
|
|
|
|
|
|
|
|
|
|
|
|
trimmed_base, base_offset_x, base_offset_y = trim_image_with_padding(base_image, padding)
|
|
|
|
trimmed_base, base_offset_x, base_offset_y = trim_image_with_padding(image, padding)
|
|
|
|
|
|
|
|
|
|
|
|
if split_page && image.height > image.width
|
|
|
|
if model_v2?
|
|
|
|
|
|
|
|
detections = call_v2(trimmed_base, base_offset_x, base_offset_y, split_page, confidence:, resolution:)
|
|
|
|
|
|
|
|
elsif split_page && image.height > image.width
|
|
|
|
regions = build_split_image_regions(trimmed_base)
|
|
|
|
regions = build_split_image_regions(trimmed_base)
|
|
|
|
|
|
|
|
|
|
|
|
detections = { xyxy: Numo::SFloat[], confidence: Numo::SFloat[], class_id: Numo::Int32[] }
|
|
|
|
detections = { xyxy: Numo::SFloat[], confidence: Numo::SFloat[], class_id: Numo::Int32[] }
|
|
|
|
@ -71,6 +73,127 @@ module Templates
|
|
|
|
build_fields_from_detections(detections, image)
|
|
|
|
build_fields_from_detections(detections, image)
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def call_v2(image, offset_x, offset_y, split_page, confidence:, resolution:)
|
|
|
|
|
|
|
|
if split_page && image.height > image.width
|
|
|
|
|
|
|
|
regions = build_split_image_regions(image)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
detections = { xyxy: Numo::SFloat[], confidence: Numo::SFloat[], class_id: Numo::Int32[] }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
regions.reduce(detections) do |acc, r|
|
|
|
|
|
|
|
|
next acc if r[:img].height <= 0 || r[:img].width <= 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
input_tensor, orig_size_tensor, transform_info = preprocess_image_v2(r[:img], resolution)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
outputs = model.predict({ 'images' => input_tensor, 'orig_target_sizes' => orig_size_tensor },
|
|
|
|
|
|
|
|
output_type: :numo)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
boxes = outputs['boxes'][0, true, true]
|
|
|
|
|
|
|
|
labels = outputs['labels'][0, true]
|
|
|
|
|
|
|
|
scores = outputs['scores'][0, true]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
postprocess_outputs_v2(boxes, labels, scores, acc,
|
|
|
|
|
|
|
|
offset_x:, offset_y: offset_y + r[:offset_y],
|
|
|
|
|
|
|
|
confidence:, transform_info:)
|
|
|
|
|
|
|
|
end
|
|
|
|
|
|
|
|
else
|
|
|
|
|
|
|
|
input_tensor, orig_size_tensor, transform_info = preprocess_image_v2(image, resolution)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
outputs = model.predict({ 'images' => input_tensor, 'orig_target_sizes' => orig_size_tensor },
|
|
|
|
|
|
|
|
output_type: :numo)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
boxes = outputs['boxes'][0, true, true]
|
|
|
|
|
|
|
|
labels = outputs['labels'][0, true]
|
|
|
|
|
|
|
|
scores = outputs['scores'][0, true]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
postprocess_outputs_v2(boxes, labels, scores, offset_x:, offset_y:,
|
|
|
|
|
|
|
|
confidence:, transform_info:)
|
|
|
|
|
|
|
|
end
|
|
|
|
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def preprocess_image_v2(image, resolution)
|
|
|
|
|
|
|
|
image = image.extract_band(0, n: 3) if image.bands > 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ratio = [resolution.to_f / image.width, resolution.to_f / image.height].min
|
|
|
|
|
|
|
|
new_width = (image.width * ratio).to_i
|
|
|
|
|
|
|
|
new_height = (image.height * ratio).to_i
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
image = image.resize(ratio, vscale: ratio, kernel: :linear) if ratio != 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pad_w = (resolution - new_width) / 2
|
|
|
|
|
|
|
|
pad_h = (resolution - new_height) / 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
padded = image.embed(pad_w, pad_h, resolution, resolution, background: [0, 0, 0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
padded /= 255.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
img_array = Numo::SFloat.from_binary(padded.write_to_memory, [resolution, resolution, 3])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
img_array = img_array.transpose(2, 0, 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
input_tensor = img_array.reshape(1, 3, resolution, resolution)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
orig_size_tensor = Numo::Int64[[resolution, resolution]]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
transform_info = { ratio: ratio, pad_w: pad_w, pad_h: pad_h }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
[input_tensor, orig_size_tensor, transform_info]
|
|
|
|
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def postprocess_outputs_v2(boxes, labels, scores, detections = nil, offset_x:, offset_y:, confidence:,
|
|
|
|
|
|
|
|
transform_info:)
|
|
|
|
|
|
|
|
keep_mask = scores.gt(confidence)
|
|
|
|
|
|
|
|
keep_indices = keep_mask.where
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if keep_indices.empty?
|
|
|
|
|
|
|
|
detections || {
|
|
|
|
|
|
|
|
xyxy: Numo::SFloat[],
|
|
|
|
|
|
|
|
confidence: Numo::SFloat[],
|
|
|
|
|
|
|
|
class_id: Numo::Int32[]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
else
|
|
|
|
|
|
|
|
scores = scores[keep_indices]
|
|
|
|
|
|
|
|
labels = labels[keep_indices]
|
|
|
|
|
|
|
|
boxes_xyxy = boxes[keep_indices, true]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ratio = transform_info[:ratio]
|
|
|
|
|
|
|
|
pad_w = transform_info[:pad_w]
|
|
|
|
|
|
|
|
pad_h = transform_info[:pad_h]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
boxes_xyxy[true, 0] = ((boxes_xyxy[true, 0] - pad_w) / ratio) + offset_x
|
|
|
|
|
|
|
|
boxes_xyxy[true, 1] = ((boxes_xyxy[true, 1] - pad_h) / ratio) + offset_y
|
|
|
|
|
|
|
|
boxes_xyxy[true, 2] = ((boxes_xyxy[true, 2] - pad_w) / ratio) + offset_x
|
|
|
|
|
|
|
|
boxes_xyxy[true, 3] = ((boxes_xyxy[true, 3] - pad_h) / ratio) + offset_y
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if detections
|
|
|
|
|
|
|
|
existing_n = detections[:xyxy].shape[0]
|
|
|
|
|
|
|
|
new_n = boxes_xyxy.shape[0]
|
|
|
|
|
|
|
|
total = existing_n + new_n
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
xyxy = Numo::SFloat.zeros(total, 4)
|
|
|
|
|
|
|
|
conf = Numo::SFloat.zeros(total)
|
|
|
|
|
|
|
|
cls = Numo::Int32.zeros(total)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if existing_n.positive?
|
|
|
|
|
|
|
|
xyxy[0...existing_n, true] = detections[:xyxy]
|
|
|
|
|
|
|
|
conf[0...existing_n] = detections[:confidence]
|
|
|
|
|
|
|
|
cls[0...existing_n] = detections[:class_id]
|
|
|
|
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
xyxy[existing_n...(existing_n + new_n), true] = boxes_xyxy
|
|
|
|
|
|
|
|
conf[existing_n...(existing_n + new_n)] = scores
|
|
|
|
|
|
|
|
cls[existing_n...(existing_n + new_n)] = labels
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
{ xyxy: xyxy, confidence: conf, class_id: cls }
|
|
|
|
|
|
|
|
else
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
xyxy: boxes_xyxy,
|
|
|
|
|
|
|
|
confidence: scores,
|
|
|
|
|
|
|
|
class_id: labels
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
end
|
|
|
|
|
|
|
|
end
|
|
|
|
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
|
|
def build_split_image_regions(image)
|
|
|
|
def build_split_image_regions(image)
|
|
|
|
half_h = image.height / 2
|
|
|
|
half_h = image.height / 2
|
|
|
|
top_h = half_h
|
|
|
|
top_h = half_h
|
|
|
|
@ -212,7 +335,7 @@ module Templates
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
|
|
def postprocess_outputs(boxes, logits, transform_info, detections = nil, confidence: 0.3, temperature: 1,
|
|
|
|
def postprocess_outputs(boxes, logits, transform_info, detections = nil, confidence: 0.3, temperature: 1,
|
|
|
|
resolution: RESOLUTION)
|
|
|
|
resolution: self.resolution)
|
|
|
|
scaled_logits = logits / temperature
|
|
|
|
scaled_logits = logits / temperature
|
|
|
|
|
|
|
|
|
|
|
|
probs = 1.0 / (1.0 + Numo::NMath.exp(-scaled_logits))
|
|
|
|
probs = 1.0 / (1.0 + Numo::NMath.exp(-scaled_logits))
|
|
|
|
@ -326,6 +449,14 @@ module Templates
|
|
|
|
providers: ['CPUExecutionProvider']
|
|
|
|
providers: ['CPUExecutionProvider']
|
|
|
|
)
|
|
|
|
)
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def resolution
|
|
|
|
|
|
|
|
@resolution ||= model.inputs.find { |i| INPUT_NAMES.include?(i[:name]) }.dig(:shape, 2)
|
|
|
|
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def model_v2?
|
|
|
|
|
|
|
|
@model_v2 ||= model.inputs.pluck(:name).include?('orig_target_sizes')
|
|
|
|
|
|
|
|
end
|
|
|
|
# rubocop:enable Metrics
|
|
|
|
# rubocop:enable Metrics
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|