mirror of https://github.com/docusealco/docuseal
- Create pdf_text_to_html.js: JS port of the Ruby heuristic parser (ALL_CAPS → h2, numbered headings → h3, bullets → ul/li, body → p dir=auto)
- Add pdf_view, text_view, document_view_options keys to i18n.js (en)
- Update document.vue: tab switcher shown when all pages have extracted text; PDF View renders the existing page images; Text View renders heuristic HTML in a prose container with per-page sections
- ArrowLeft/ArrowRight keyboard navigation between tabs with focus management
- Tab switcher is hidden entirely for scanned/image-only PDFs (hasFullText gate)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
pull/599/head
parent
40dd223393
commit
797fb32a37
@ -0,0 +1,53 @@
|
|||||||
|
/**
 * Escape the five HTML-significant characters so that arbitrary text can be
 * embedded in element content or in a quoted attribute value.
 *
 * @param {string} str - Raw text to escape.
 * @returns {string} The text with `&`, `<`, `>`, `"` and `'` replaced by
 *   their HTML entities.
 */
function escapeHtml (str) {
  // `&` must be handled first; otherwise the ampersands introduced by the
  // other entities would themselves be escaped a second time.
  return str
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;')
}
|
||||||
|
|
||||||
|
/**
 * Heuristic: does this line look like a numbered section heading such as
 * "1. Introduction"?
 *
 * A line qualifies when it is at most 80 characters long, starts with one or
 * more digits followed by a dot, whitespace and an ASCII capital letter, and
 * does not end with sentence punctuation (`.`, `!`, `?`, `,` or `;`).
 *
 * @param {string} line - A single line of text; the caller passes it trimmed.
 * @returns {boolean} True when the line matches the numbered-heading pattern.
 */
function isNumberedHeading (line) {
  // Long lines are treated as body text, not headings.
  if (line.length > 80) return false

  // Trailing sentence punctuation suggests a numbered sentence or list item.
  if (/[.!?,;]$/.test(line)) return false

  return /^\d+\.\s+[A-Z]/.test(line)
}
|
||||||
|
|
||||||
|
/**
 * Heuristic: does this line look like an ALL-CAPS heading such as
 * "TERMS AND CONDITIONS"?
 *
 * A line qualifies when it is at least 3 characters long, does not end with
 * sentence punctuation (`.`, `!`, `?`, `,` or `;`), is unchanged by
 * `toUpperCase()`, and contains at least one ASCII capital letter (so lines
 * made only of digits or symbols are rejected).
 *
 * @param {string} line - A single line of text; the caller passes it trimmed.
 * @returns {boolean} True when the line matches the all-caps heading pattern.
 */
function isAllCapsHeading (line) {
  if (line.length < 3) return false
  if (/[.!?,;]$/.test(line)) return false

  // Require a real letter: "12345" equals its own upper-case form too.
  if (!/[A-Z]/.test(line)) return false

  return line === line.toUpperCase()
}
|
||||||
|
|
||||||
|
/**
 * Convert the plain text extracted from one PDF page into simple HTML using
 * line-based heuristics.
 *
 * Each non-blank line, after trimming, is classified in this order:
 *   1. numbered heading  -> `<h3>`
 *   2. ALL-CAPS heading  -> `<h2>`
 *   3. bullet line (`•`, `*` or `-` followed by whitespace) -> `<li>` inside
 *      a `<ul>`; consecutive bullet lines share one list
 *   4. anything else     -> `<p dir="auto">`
 *
 * A blank line, or any non-bullet line, closes an open list. All text is
 * passed through `escapeHtml` before being placed in the markup.
 *
 * NOTE(review): the heading checks run before the bullet check, so a bullet
 * line written entirely in capitals (e.g. "- ITEM ONE") is emitted as an
 * `<h2>` including its marker. Confirm this matches the server-side parser.
 *
 * @param {string} pageText - Extracted text of a single page.
 * @returns {string} HTML markup, or an empty string when `pageText` is falsy.
 */
export function pdfTextToHtml (pageText) {
  if (!pageText) return ''

  const parts = []
  let inList = false

  // Emit the closing tag for the current <ul>, if one is open.
  const closeList = () => {
    if (inList) {
      parts.push('</ul>')
      inList = false
    }
  }

  for (const rawLine of pageText.split(/\r?\n/)) {
    const line = rawLine.trim()

    // Blank lines produce no markup but terminate a running list.
    if (!line) {
      closeList()
      continue
    }

    if (isNumberedHeading(line)) {
      closeList()
      parts.push(`<h3>${escapeHtml(line)}</h3>`)
      continue
    }

    if (isAllCapsHeading(line)) {
      closeList()
      parts.push(`<h2>${escapeHtml(line)}</h2>`)
      continue
    }

    const bullet = line.match(/^[•*-]\s+(.+)/)

    if (bullet) {
      if (!inList) {
        parts.push('<ul>')
        inList = true
      }

      // Only the text after the bullet marker becomes the list item.
      parts.push(`<li>${escapeHtml(bullet[1])}</li>`)
    } else {
      closeList()
      parts.push(`<p dir="auto">${escapeHtml(line)}</p>`)
    }
  }

  // The page may end while a list is still open.
  closeList()

  return parts.join('')
}
|
||||||
Loading…
Reference in new issue