142 lines
5.2 KiB
TypeScript
142 lines
5.2 KiB
TypeScript
// Chargements dynamiques locaux (pdfjs-dist/tesseract.js)
|
|
// Module-level cache for the dynamically imported pdfjs-dist module, so the
// import and the worker setup below run at most once per successful load.
let _pdfjsLib: any | null = null

/**
 * Lazily imports `pdfjs-dist` and returns the module, caching it for later calls.
 *
 * On the first successful load it also tries to start a dedicated module worker
 * and hand it to pdf.js through `GlobalWorkerOptions.workerPort`. If the worker
 * cannot be created, the library is still returned without a worker port.
 *
 * @returns the `pdfjs-dist` module namespace (untyped).
 * @throws whatever the dynamic `import('pdfjs-dist')` rejects with.
 */
async function getPdfJs() {
  if (_pdfjsLib) return _pdfjsLib

  const pdfjsLib: any = await import('pdfjs-dist')
  try {
    // Use a real module worker so pdf.js does not fall back to its fake worker.
    const workerUrl = new URL('pdfjs-dist/build/pdf.worker.min.mjs', import.meta.url)
    // `pdfjsLib` is typed `any`, so this assignment needs no suppression directive
    // (an unused `@ts-expect-error` is itself a compile error).
    pdfjsLib.GlobalWorkerOptions.workerPort = new Worker(workerUrl, { type: 'module' })
  } catch (err) {
    // Best effort: keep going without a dedicated worker, but leave a trace.
    // eslint-disable-next-line no-console
    console.warn('[pdfjs] worker setup failed, continuing without workerPort', err)
  }

  _pdfjsLib = pdfjsLib
  return _pdfjsLib
}
|
|
|
|
/**
 * Extracts text from a user-supplied file, picking the strategy from its MIME
 * type and, failing that, its file name extension.
 *
 * - PDF (MIME contains "pdf" or name ends with ".pdf"): delegated to `extractFromPdf`.
 * - Image (MIME starts with "image/" or name ends with .png/.jpg/.jpeg):
 *   delegated to `extractFromImage`.
 * - Anything else: read as plain text; a failed read yields an empty string.
 *
 * @param file the file to read.
 * @returns the extracted text.
 */
export async function extractTextFromFile(file: File): Promise<string> {
  const mimeType = file.type || ''
  // Evaluated lazily, only when the MIME type alone is not conclusive.
  const nameEndsWith = (suffix: string): boolean => file.name.toLowerCase().endsWith(suffix)

  const isPdf = mimeType.includes('pdf') || nameEndsWith('.pdf')
  if (isPdf) {
    return extractFromPdf(file)
  }

  const isImage = mimeType.startsWith('image/') || ['.png', '.jpg', '.jpeg'].some(nameEndsWith)
  if (isImage) {
    return extractFromImage(file)
  }

  // Fallback: treat the file as raw text.
  let rawText = ''
  try {
    rawText = await file.text()
  } catch {
    rawText = ''
  }
  return rawText
}
|
|
|
|
/**
 * Extracts the text content of a PDF file, page by page.
 *
 * Only the first 50 pages are read. Pages whose text is empty or whitespace-only
 * are skipped; the remaining page texts are joined with newlines.
 *
 * @param file the PDF file to read.
 * @returns the extracted text, or `''` when pdf.js could not be loaded.
 * @throws whatever pdf.js rejects with when the document cannot be opened or read.
 */
async function extractFromPdf(file: File): Promise<string> {
  const pdfjsLib = await getPdfJs().catch(() => null)
  if (!pdfjsLib) return ''

  // Upper bound on the number of pages processed per document.
  const maxPages = 50

  const arrayBuffer = await file.arrayBuffer()
  const loadingTask = pdfjsLib.getDocument({ data: new Uint8Array(arrayBuffer) })
  try {
    const pdf = await loadingTask.promise
    const texts: string[] = []
    const numPages = Math.min(pdf.numPages, maxPages)
    // pdf.js pages are 1-indexed.
    for (let i = 1; i <= numPages; i += 1) {
      const page = await pdf.getPage(i)
      const content = await page.getTextContent()
      // Items without a `str` contribute an empty string.
      const pageText = content.items.map((it: any) => it.str || '').join(' ')
      if (pageText.trim()) texts.push(pageText)
    }
    return texts.join('\n')
  } finally {
    // Release the document so parsed PDFs do not accumulate in memory. Awaited so
    // cleanup has finished before this function settles; failures are ignored so
    // they never mask the extraction result or its error.
    try {
      await loadingTask.destroy()
    } catch {
      // cleanup is best-effort
    }
  }
}
|
|
|
|
/**
 * Runs OCR (tesseract.js, languages `fra+eng`) on an image file and returns the
 * best text found.
 *
 * The image is first upscaled and normalized when it is narrower than a minimum
 * width (1200px when the file name matches recto/verso/cni/carte, otherwise
 * 300px). Recognition is then attempted for several page segmentation modes and
 * the four right-angle orientations. Each attempt is scored as
 * `confidence * log(textLength + 1)` and the highest-scoring text wins. The
 * search stops as soon as one attempt reaches confidence >= 85 with more than
 * 40 characters of text.
 *
 * @param file the image file to recognize.
 * @returns the recognized text of the best attempt (possibly empty).
 */
async function extractFromImage(file: File): Promise<string> {
  const { createWorker } = await import('tesseract.js')

  // ID-card-like file names get a more aggressive upscale target.
  const minWidth = /recto|verso|cni|carte/i.test(file.name) ? 1200 : 300
  const source = await prepareOcrSource(file, minWidth)

  const worker = await createWorker()
  try {
    // The logger is attached after creation to avoid a DataCloneError.
    worker.setLogger?.((m: any) => {
      // eslint-disable-next-line no-console
      if (m?.progress != null) console.info('[OCR]', Math.round(m.progress * 100) + '%')
    })
    await worker.load()
    await worker.loadLanguage('fra+eng')
    await worker.initialize('fra+eng')

    const rotations = [0, 90, 180, 270]
    const psmModes = ['6', '7', '11'] // 6: block, 7: single line, 11: sparse text
    // Rotated images do not depend on the PSM mode: build each one at most once.
    const rotatedByDeg = new Map<number, Blob>()
    let bestText = ''
    let bestScore = -1
    let goodEnough = false

    for (const psm of psmModes) {
      await worker.setParameters({ tessedit_pageseg_mode: psm })
      for (const deg of rotations) {
        let rotatedBlob = rotatedByDeg.get(deg)
        if (!rotatedBlob) {
          rotatedBlob = await rotateBlob(source, deg)
          rotatedByDeg.set(deg, rotatedBlob)
        }
        const { data } = await worker.recognize(rotatedBlob)
        const text = data.text || ''
        const len = text.replace(/\s+/g, ' ').trim().length
        const score = (data.confidence || 0) * Math.log(len + 1)
        if (score > bestScore) {
          bestScore = score
          bestText = text
        }
        // Short-circuit on a very good result.
        if (data.confidence >= 85 && len > 40) {
          goodEnough = true
          break
        }
      }
      // Leave the outer loop too, otherwise the remaining PSM modes would still run.
      if (goodEnough) break
    }

    return bestText
  } finally {
    await worker.terminate()
  }
}

/**
 * Prepares the image handed to the OCR engine.
 *
 * When the decoded image is at least `minWidth` pixels wide, the original file
 * is returned unchanged. Otherwise it is upscaled to `minWidth` (never below
 * 300px), converted to grayscale with a mild contrast boost, and re-encoded.
 * The original file is returned if no 2D canvas context is available or the
 * canvas cannot be encoded.
 */
async function prepareOcrSource(file: File, minWidth: number): Promise<Blob> {
  const imgBitmap = await createImageBitmap(file)
  try {
    if (!(imgBitmap.width < minWidth)) return file

    const scale = minWidth / Math.max(1, imgBitmap.width)
    const canvas = document.createElement('canvas')
    canvas.width = Math.max(300, Math.floor(imgBitmap.width * scale))
    canvas.height = Math.floor(imgBitmap.height * scale)
    const ctx = canvas.getContext('2d')
    if (!ctx) return file
    ctx.imageSmoothingEnabled = true
    ctx.imageSmoothingQuality = 'high'
    ctx.drawImage(imgBitmap, 0, 0, canvas.width, canvas.height)

    // Grayscale conversion plus contrast enhancement, pixel by pixel (RGBA layout).
    const imgData = ctx.getImageData(0, 0, canvas.width, canvas.height)
    const data = imgData.data
    for (let i = 0; i < data.length; i += 4) {
      const r = data[i], g = data[i + 1], b = data[i + 2]
      // luma
      let y = 0.299 * r + 0.587 * g + 0.114 * b
      // simple contrast stretch around mid-gray, clamped to [0, 255]
      y = Math.max(0, Math.min(255, (y - 128) * 1.2 + 128))
      data[i] = data[i + 1] = data[i + 2] = y
    }
    ctx.putImageData(imgData, 0, 0)

    return await new Promise<Blob>((resolve) => canvas.toBlob((b) => resolve(b ?? file)))
  } finally {
    // The bitmap has been drawn (or was not needed): release its memory now.
    imgBitmap.close()
  }
}
|
|
|
|
/**
 * Returns an image blob rotated by `deg` degrees (clockwise for positive values,
 * per canvas `rotate` semantics), drawn on a canvas sized to the rotated
 * bounding box.
 *
 * @param blob the encoded source image.
 * @param deg rotation angle in degrees; any multiple of 360 returns `blob` as-is.
 * @returns the rotated image, or the input `blob` when no 2D canvas context is
 *          available or the canvas cannot be encoded.
 */
async function rotateBlob(blob: Blob, deg: number): Promise<Blob> {
  // Whole turns leave the image unchanged: skip the decode/encode round trip.
  if (deg % 360 === 0) return blob

  const bmp = await createImageBitmap(blob)
  try {
    const rad = (deg * Math.PI) / 180
    const sin = Math.abs(Math.sin(rad))
    const cos = Math.abs(Math.cos(rad))
    const w = bmp.width
    const h = bmp.height
    // Bounding box of the rotated rectangle.
    const newW = Math.floor(w * cos + h * sin)
    const newH = Math.floor(w * sin + h * cos)

    const canvas = document.createElement('canvas')
    canvas.width = newW
    canvas.height = newH
    const ctx = canvas.getContext('2d')
    // Same fallback as a failed encode below: hand back the input unrotated.
    if (!ctx) return blob
    ctx.imageSmoothingEnabled = true
    ctx.imageSmoothingQuality = 'high'
    // Rotate around the canvas center, then draw the image centered on the origin.
    ctx.translate(newW / 2, newH / 2)
    ctx.rotate(rad)
    ctx.drawImage(bmp, -w / 2, -h / 2)

    return await new Promise<Blob>((resolve) => canvas.toBlob((b) => resolve(b ?? blob)))
  } finally {
    // The pixels are already on the canvas: free the decoded bitmap.
    bmp.close()
  }
}
|