mirror of https://github.com/docusealco/docuseal
- Create pdf_text_to_html.js: JS port of the Ruby heuristic parser (ALL_CAPS→h2, numbered headings→h3, bullets→ul/li, body→p dir=auto) - Add pdf_view, text_view, document_view_options keys to i18n.js (en) - Update document.vue: tab switcher shown when all pages have extracted text; PDF View renders the existing page images; Text View renders heuristic HTML in a prose container with per-page sections - ArrowLeft/ArrowRight keyboard navigation between tabs with focus management - Tab is hidden entirely for scanned/image-only PDFs (hasFullText gate) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>pull/599/head
parent
40dd223393
commit
797fb32a37
@ -0,0 +1,53 @@
|
||||
/**
 * Escape the five HTML-significant characters so the text can be safely
 * interpolated into markup as inert character data.
 *
 * @param {string} str - Raw text to escape.
 * @returns {string} The text with `&`, `<`, `>`, `"` and `'` replaced by entities.
 */
function escapeHtml (str) {
  // `&` must be handled first; otherwise the ampersands of the entities
  // produced by the later replacements would be escaped a second time.
  return str
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;')
}
|
||||
|
||||
/**
 * Decide whether a line looks like a numbered section heading such as
 * "1. Introduction".
 *
 * A line qualifies when it is at most 80 characters long, starts with one or
 * more digits followed by a dot, whitespace and an ASCII capital letter, and
 * does not end with sentence punctuation (`.`, `!`, `?`, `,` or `;`).
 *
 * @param {string} line - A single line of text (callers in this file pass it already trimmed).
 * @returns {boolean} True when the line matches the numbered-heading heuristic.
 */
function isNumberedHeading (line) {
  return line.length <= 80 && /^\d+\.\s+[A-Z]/.test(line) && !/[.!?,;]$/.test(line)
}
|
||||
|
||||
/**
 * Decide whether a line looks like an ALL-CAPS heading such as
 * "TERMS AND CONDITIONS".
 *
 * A line qualifies when it has at least 3 characters, does not end with
 * sentence punctuation (`.`, `!`, `?`, `,` or `;`), is unchanged by
 * `toUpperCase()`, and contains at least one ASCII capital letter (so that
 * lines made only of digits or symbols are rejected).
 *
 * @param {string} line - A single line of text (callers in this file pass it already trimmed).
 * @returns {boolean} True when the line matches the all-caps-heading heuristic.
 */
function isAllCapsHeading (line) {
  return line.length >= 3 && !/[.!?,;]$/.test(line) && line === line.toUpperCase() && /[A-Z]/.test(line)
}
|
||||
|
||||
/**
 * Convert the plain text of one PDF page into a simple HTML fragment using
 * line-based heuristics.
 *
 * Each non-blank line (after trimming) is classified in this order:
 *   1. numbered heading (see `isNumberedHeading`)  -> `<h3>`
 *   2. all-caps heading (see `isAllCapsHeading`)   -> `<h2>`
 *   3. bullet line starting with `•`, `*` or `-` followed by whitespace
 *      -> `<li>` inside a `<ul>` shared by consecutive bullet lines
 *   4. anything else                               -> `<p dir="auto">`
 *
 * Blank lines emit nothing, but they do close a list that is currently open.
 * All text is passed through `escapeHtml` before being placed in the output.
 *
 * @param {string} pageText - Text of a page; lines separated by `\n` or `\r\n`.
 * @returns {string} HTML fragment, or an empty string when `pageText` is falsy.
 */
export function pdfTextToHtml (pageText) {
  if (!pageText) return ''

  const lines = pageText.split(/\r?\n/)
  let output = ''
  // Tracks whether a <ul> has been opened and not yet closed.
  let inList = false

  for (const line of lines) {
    const stripped = line.trim()

    if (!stripped) {
      // A blank line ends the current bullet list, if any.
      if (inList) { output += '</ul>'; inList = false }
      continue
    }

    if (isNumberedHeading(stripped)) {
      if (inList) { output += '</ul>'; inList = false }
      output += `<h3>${escapeHtml(stripped)}</h3>`
    } else if (isAllCapsHeading(stripped)) {
      if (inList) { output += '</ul>'; inList = false }
      output += `<h2>${escapeHtml(stripped)}</h2>`
    } else {
      // Capture the bullet's text without the leading marker and whitespace.
      const match = stripped.match(/^[•*-]\s+(.+)/)

      if (match) {
        if (!inList) { output += '<ul>'; inList = true }
        output += `<li>${escapeHtml(match[1])}</li>`
      } else {
        if (inList) { output += '</ul>'; inList = false }
        output += `<p dir="auto">${escapeHtml(stripped)}</p>`
      }
    }
  }

  // Close a list that runs to the end of the page text.
  if (inList) output += '</ul>'

  return output
}
|
||||
Loading…
Reference in new issue