# frozen_string_literal: true module Templates module CreateAttachments PDF_CONTENT_TYPE = 'application/pdf' ZIP_CONTENT_TYPE = 'application/zip' X_ZIP_CONTENT_TYPE = 'application/x-zip-compressed' JSON_CONTENT_TYPE = 'application/json' DOCUMENT_EXTENSIONS = %w[.docx .doc .xlsx .xls .odt .rtf].freeze DOCUMENT_CONTENT_TYPES = %w[ application/vnd.openxmlformats-officedocument.wordprocessingml.document application/msword application/vnd.openxmlformats-officedocument.spreadsheetml.sheet application/vnd.ms-excel application/vnd.oasis.opendocument.text application/rtf ].freeze ANNOTATIONS_SIZE_LIMIT = 6.megabytes InvalidFileType = Class.new(StandardError) PdfEncrypted = Class.new(StandardError) module_function def call(template, params, extract_fields: false) extract_zip_files(params[:files].presence || params[:file]).flat_map do |file| handle_file_types(template, file, params, extract_fields:) end end def handle_pdf_or_image(template, file, document_data = nil, params = {}, extract_fields: false, content_type_override: nil, filename_override: nil) document_data ||= file.read content_type = content_type_override || file.content_type filename = filename_override || file.original_filename if content_type == PDF_CONTENT_TYPE document_data = maybe_decrypt_pdf_or_raise(document_data, params) annotations = document_data.size < ANNOTATIONS_SIZE_LIMIT ? Templates::BuildAnnotations.call(document_data) : [] end sha256 = Base64.urlsafe_encode64(Digest::SHA256.digest(document_data)) blob = ActiveStorage::Blob.create_and_upload!( io: StringIO.new(document_data), filename: filename, metadata: { identified: content_type == PDF_CONTENT_TYPE, analyzed: content_type == PDF_CONTENT_TYPE, pdf: { annotations: }.compact_blank, sha256: }.compact_blank, content_type: content_type ) document = template.documents.create!(blob:) Templates::ProcessDocument.call(document, document_data, extract_fields:) end def maybe_decrypt_pdf_or_raise(data, params) if data.size < ANNOTATIONS_SIZE_LIMIT && PdfUtils.encrypted?(data) PdfUtils.decrypt(data, params[:password]) else data end rescue HexaPDF::EncryptionError raise PdfEncrypted end def extract_zip_files(files) extracted_files = [] Array.wrap(files).each do |file| if file.content_type == ZIP_CONTENT_TYPE || file.content_type == X_ZIP_CONTENT_TYPE Zip::File.open(file.tempfile).each do |entry| next if entry.directory? tempfile = Tempfile.new(entry.name) tempfile.binmode entry.get_input_stream { |in_stream| IO.copy_stream(in_stream, tempfile) } tempfile.rewind type = Marcel::MimeType.for(tempfile, name: entry.name) next if type.exclude?('image') && type != PDF_CONTENT_TYPE && type != JSON_CONTENT_TYPE && DOCUMENT_CONTENT_TYPES.exclude?(type) extracted_files << ActionDispatch::Http::UploadedFile.new( filename: File.basename(entry.name), type:, tempfile: ) end else extracted_files << file end end extracted_files end def handle_file_types(template, file, params, extract_fields:) if file.content_type.include?('image') || file.content_type == PDF_CONTENT_TYPE return handle_pdf_or_image(template, file, file.read, params, extract_fields:) end # Handle document types (DOCX, DOC, XLSX, etc.) by converting to PDF if DOCUMENT_CONTENT_TYPES.include?(file.content_type) pdf_data = convert_document_to_pdf(file) if pdf_data # Process the converted PDF with PDF content type and filename pdf_filename = File.basename(file.original_filename, '.*') + '.pdf' return handle_pdf_or_image(template, file, pdf_data, params, extract_fields: extract_fields, content_type_override: PDF_CONTENT_TYPE, filename_override: pdf_filename) else raise InvalidFileType, "Unable to convert #{file.content_type} to PDF. Please install LibreOffice (brew install --cask libreoffice on macOS or apt-get install libreoffice on Linux) or convert the document to PDF manually." end end raise InvalidFileType, file.content_type end def convert_document_to_pdf(file) # Try to use LibreOffice to convert document to PDF libreoffice_path = find_libreoffice return nil unless libreoffice_path # Create a temporary file for the input document input_temp = Tempfile.new(['input', File.extname(file.original_filename)]) input_temp.binmode file.rewind input_temp.write(file.read) input_temp.close output_dir = Dir.mktmpdir output_file = File.join(output_dir, File.basename(file.original_filename, '.*') + '.pdf') begin # Use LibreOffice headless mode to convert to PDF success = system(libreoffice_path, '--headless', '--convert-to', 'pdf', '--outdir', output_dir, input_temp.path, out: File::NULL, err: File::NULL) if success generated_pdf = Dir.glob(File.join(output_dir, '*.pdf')).first if generated_pdf && File.exist?(generated_pdf) return File.binread(generated_pdf) end end rescue StandardError => e Rails.logger.warn("Document conversion failed: #{e.message}") ensure input_temp.unlink if input_temp FileUtils.rm_rf(output_dir) if Dir.exist?(output_dir) end nil end def find_libreoffice # Check common LibreOffice installation paths paths = [ '/Applications/LibreOffice.app/Contents/MacOS/soffice', # macOS '/usr/bin/libreoffice', # Linux '/usr/local/bin/libreoffice', # Linux alternative `which libreoffice`.strip, # System PATH `which soffice`.strip # Alternative command name ].compact.reject(&:empty?) paths.find { |path| File.executable?(path) } end end end