Press n or j to go to the next uncovered block, b, p or k for the previous block.
// Local, on-demand loading of the heavy dependencies (pdfjs-dist / tesseract.js).

/** Maximum number of PDF pages that are read; later pages are ignored. */
const MAX_PDF_PAGES = 50

/** Minimum count of non-whitespace characters for a PDF text layer to be trusted without OCR. */
const MIN_PDF_TEXT_CHARS = 30

/** Cached pdfjs-dist module once it has been loaded and configured. */
let _pdfjsLib: any | null = null

/** In-flight load of pdfjs-dist, shared so concurrent callers do not each spawn a worker. */
let _pdfjsLoading: Promise<any> | null = null

/**
 * Lazily imports pdfjs-dist and wires it to a module worker.
 *
 * Concurrent callers share a single in-flight load so that only one `Worker`
 * is ever created. A failed load is not cached: the next call retries.
 *
 * @returns The pdfjs-dist module namespace.
 * @throws Whatever the dynamic `import('pdfjs-dist')` rejects with.
 */
async function getPdfJs(): Promise<any> {
  if (_pdfjsLib) return _pdfjsLib
  if (!_pdfjsLoading) {
    _pdfjsLoading = (async () => {
      const pdfjsLib: any = await import('pdfjs-dist')
      try {
        // Use a real module worker instead of letting pdf.js fall back to its fake worker.
        const workerUrl = new URL('pdfjs-dist/build/pdf.worker.min.mjs', import.meta.url)
        pdfjsLib.GlobalWorkerOptions.workerPort = new Worker(workerUrl, { type: 'module' })
      } catch {
        // Best effort: continue without a dedicated worker if it cannot be created.
      }
      _pdfjsLib = pdfjsLib
      return pdfjsLib
    })().finally(() => {
      // Drop the in-flight marker so a failed load can be retried later.
      _pdfjsLoading = null
    })
  }
  return _pdfjsLoading
}

/**
 * Extracts text from a user-supplied file.
 *
 * - PDFs (by MIME type or `.pdf` extension) go through the PDF text layer, with OCR as a per-page fallback.
 * - Images (`image/*` MIME type or `.png` / `.jpg` / `.jpeg` extension) go through OCR.
 * - Anything else is read as plain text; a read failure yields an empty string.
 *
 * @param file The file to read.
 * @returns The extracted text (possibly empty).
 */
export async function extractTextFromFile(file: File): Promise<string> {
  const mime = file.type || ''
  const lowerName = file.name.toLowerCase()

  if (mime.includes('pdf') || lowerName.endsWith('.pdf')) {
    const pdfText = await extractFromPdf(file)
    if (import.meta.env.DEV) {
      // eslint-disable-next-line no-console
      console.info('[OCR][PDF]', file.name, 'len=', pdfText.length, 'peek=', pdfText.slice(0, 200))
    }
    return pdfText
  }

  if (mime.startsWith('image/') || ['.png', '.jpg', '.jpeg'].some((ext) => lowerName.endsWith(ext))) {
    const imgText = await extractFromImage(file)
    if (import.meta.env.DEV) {
      // eslint-disable-next-line no-console
      console.info('[OCR][IMG]', file.name, 'len=', imgText.length, 'peek=', imgText.slice(0, 200))
    }
    return imgText
  }

  // Fallback: read the file as raw text.
  try {
    return await file.text()
  } catch {
    return ''
  }
}

/**
 * Wraps the callback-based `canvas.toBlob` in a promise.
 *
 * @param canvas Canvas to encode.
 * @param type Optional MIME type; the browser default is used when omitted.
 * @returns The encoded blob, or `null` when the browser could not encode the canvas.
 */
function canvasToBlob(canvas: HTMLCanvasElement, type?: string): Promise<Blob | null> {
  return new Promise((resolve) => canvas.toBlob(resolve, type))
}

/**
 * Renders one PDF page to a canvas at 2x scale and runs OCR on the result.
 *
 * @param page A pdf.js page object.
 * @param imageName File name given to the rendered PNG (it influences OCR preprocessing).
 * @returns The OCR text, or `null` when the page could not be rasterised.
 */
async function ocrPdfPage(page: any, imageName: string): Promise<string | null> {
  const viewport = page.getViewport({ scale: 2 })
  const canvas = document.createElement('canvas')
  canvas.width = viewport.width
  canvas.height = viewport.height
  const ctx = canvas.getContext('2d')
  if (!ctx) return null
  await page.render({ canvasContext: ctx, viewport }).promise
  const blob = await canvasToBlob(canvas, 'image/png')
  // toBlob reports null when encoding fails; never wrap that in a File.
  if (!blob) return null
  return extractFromImage(new File([blob], imageName, { type: 'image/png' }))
}

/**
 * Extracts text from a PDF, page by page (at most {@link MAX_PDF_PAGES} pages).
 *
 * The embedded text layer is used when it holds at least
 * {@link MIN_PDF_TEXT_CHARS} non-whitespace characters; otherwise the page is
 * rendered and OCR'd. If the page cannot be rasterised, whatever text-layer
 * text exists is kept.
 *
 * @param file The PDF file.
 * @returns Page texts joined by newlines; empty string when pdf.js cannot be loaded.
 */
async function extractFromPdf(file: File): Promise<string> {
  const pdfjsLib = await getPdfJs().catch(() => null)
  if (!pdfjsLib) return ''

  const arrayBuffer = await file.arrayBuffer()
  const pdf = await pdfjsLib.getDocument({ data: new Uint8Array(arrayBuffer) }).promise
  const texts: string[] = []
  const numPages = Math.min(pdf.numPages, MAX_PDF_PAGES)

  for (let i = 1; i <= numPages; i += 1) {
    const page = await pdf.getPage(i)
    try {
      const content = await page.getTextContent().catch(() => null)
      let pageText = ''
      if (content) {
        pageText = content.items.map((it: any) => (it.str ? it.str : '')).join(' ')
      }
      // OCR fallback when the text layer is missing or too sparse to be useful.
      if (!pageText || pageText.replace(/\s+/g, '').length < MIN_PDF_TEXT_CHARS) {
        const ocrText = await ocrPdfPage(page, `${file.name}-p${i}.png`)
        if (ocrText !== null) pageText = ocrText
      }
      if (pageText.trim()) texts.push(pageText)
    } finally {
      // Release per-page resources held by pdf.js once the page is done.
      page.cleanup?.()
    }
  }
  return texts.join('\n')
}

/**
 * Upscales small images and normalises them (greyscale + mild contrast boost) before OCR.
 *
 * Files whose name matches recto/verso/cni/carte are upscaled to at least
 * 1200px wide; other images to at least 300px. Images that are already wide
 * enough are returned untouched.
 *
 * @param file The source image.
 * @returns The blob to feed to OCR (the original file when no processing was needed or possible).
 */
async function prepareImageForOcr(file: File): Promise<Blob> {
  const imgBitmap = await createImageBitmap(file)
  try {
    const minWidth = /recto|verso|cni|carte/i.test(file.name) ? 1200 : 300
    if (imgBitmap.width >= minWidth) return file

    const scale = minWidth / Math.max(1, imgBitmap.width)
    const canvas = document.createElement('canvas')
    canvas.width = Math.max(300, Math.floor(imgBitmap.width * scale))
    // Keep at least one row: getImageData throws on a zero-sized area.
    canvas.height = Math.max(1, Math.floor(imgBitmap.height * scale))
    const ctx = canvas.getContext('2d')
    if (!ctx) return file

    ctx.imageSmoothingEnabled = true
    ctx.imageSmoothingQuality = 'high'
    ctx.drawImage(imgBitmap, 0, 0, canvas.width, canvas.height)

    // Convert to greyscale and stretch the contrast around mid-grey.
    const imgData = ctx.getImageData(0, 0, canvas.width, canvas.height)
    const data = imgData.data
    for (let i = 0; i < data.length; i += 4) {
      const r = data[i], g = data[i + 1], b = data[i + 2]
      // Luma
      let y = 0.299 * r + 0.587 * g + 0.114 * b
      // Simple contrast boost
      y = Math.max(0, Math.min(255, (y - 128) * 1.2 + 128))
      data[i] = data[i + 1] = data[i + 2] = y
    }
    ctx.putImageData(imgData, 0, 0)

    return (await canvasToBlob(canvas)) ?? file
  } finally {
    // Free the decoded pixels right away instead of waiting for GC.
    imgBitmap.close()
  }
}

/**
 * Runs OCR (French + English) on an image.
 *
 * Several page-segmentation modes and the four right-angle orientations are
 * tried; the candidate with the highest `confidence * log(length + 1)` wins.
 * The whole search stops as soon as one candidate is very good
 * (confidence >= 85 and more than 40 characters).
 *
 * @param file The image to recognise.
 * @returns The best recognised text (possibly empty).
 */
async function extractFromImage(file: File): Promise<string> {
  const { createWorker } = await import('tesseract.js')
  const source = await prepareImageForOcr(file)

  const worker = await createWorker()
  try {
    // Configure the logger after creation to avoid a DataCloneError.
    // @ts-expect-error - setLogger is not directly on Worker type
    worker.setLogger?.((m: any) => {
      if (m?.progress != null) console.info('[OCR]', Math.round(m.progress * 100) + '%')
    })
    // NOTE(review): load/loadLanguage/initialize are marked as missing from the Worker
    // typings below — confirm they still take effect in the installed tesseract.js version.
    await worker.load()
    // @ts-expect-error - loadLanguage is not directly on Worker type
    await worker.loadLanguage('fra+eng')
    // @ts-expect-error - initialize is not directly on Worker type
    await worker.initialize('fra+eng')

    // Try several PSM modes and orientations (0/90/180/270) and keep the best result.
    const rotations = [0, 90, 180, 270]
    const psmModes = ['6', '7', '11'] // 6: block, 7: single line, 11: sparse text
    // Rotated images do not depend on the PSM mode: compute each angle only once.
    const rotatedByAngle = new Map<number, Blob>()
    let bestText = ''
    let bestScore = -1

    search: for (const psm of psmModes) {
      // @ts-expect-error - tessedit_pageseg_mode expects PSM enum, but string is used
      await worker.setParameters({ tessedit_pageseg_mode: psm })
      for (const deg of rotations) {
        let rotatedBlob = rotatedByAngle.get(deg)
        if (!rotatedBlob) {
          rotatedBlob = await rotateBlob(source, deg)
          rotatedByAngle.set(deg, rotatedBlob)
        }
        const { data } = await worker.recognize(rotatedBlob)
        const text = data.text || ''
        const len = text.replace(/\s+/g, ' ').trim().length
        const score = (data.confidence || 0) * Math.log(len + 1)
        if (score > bestScore) {
          bestScore = score
          bestText = text
        }
        // Short-circuit the whole search (not just this PSM mode) on a very good result.
        if (data.confidence >= 85 && len > 40) break search
      }
    }
    return bestText
  } finally {
    await worker.terminate()
  }
}

/**
 * Rotates an image blob clockwise by `deg` degrees around its centre.
 *
 * The output canvas is sized to the rotated bounding box.
 *
 * @param blob Source image.
 * @param deg Rotation in degrees; multiples of 360 return the input unchanged.
 * @returns The rotated image, or the original blob when it cannot be drawn or encoded.
 */
async function rotateBlob(blob: Blob, deg: number): Promise<Blob> {
  if (deg % 360 === 0) return blob

  const bmp = await createImageBitmap(blob)
  try {
    const rad = (deg * Math.PI) / 180
    const sin = Math.abs(Math.sin(rad))
    const cos = Math.abs(Math.cos(rad))
    const w = bmp.width
    const h = bmp.height
    const newW = Math.floor(w * cos + h * sin)
    const newH = Math.floor(w * sin + h * cos)

    const canvas = document.createElement('canvas')
    canvas.width = newW
    canvas.height = newH
    const ctx = canvas.getContext('2d')
    if (!ctx) return blob

    ctx.imageSmoothingEnabled = true
    ctx.imageSmoothingQuality = 'high'
    // Rotate around the centre of the destination canvas.
    ctx.translate(newW / 2, newH / 2)
    ctx.rotate(rad)
    ctx.drawImage(bmp, -w / 2, -h / 2)

    return (await canvasToBlob(canvas)) ?? blob
  } finally {
    // Free the decoded pixels right away instead of waiting for GC.
    bmp.close()
  }
}