Press n or j to go to the next uncovered block, b, p or k for the previous block.
// Lazily loaded local dependencies (pdfjs-dist / tesseract.js).
let _pdfjsLib: any | null = null

/**
 * Loads `pdfjs-dist` on first use and caches the module for later calls.
 *
 * Also tries to attach a real module worker to `GlobalWorkerOptions.workerPort`;
 * if that worker cannot be created the module is still returned.
 *
 * @returns The imported `pdfjs-dist` module.
 */
async function getPdfJs() {
  if (_pdfjsLib) return _pdfjsLib

  const pdfjsLib: any = await import('pdfjs-dist')
  try {
    // Use a real module worker so pdf.js does not fall back to its fake worker.
    const workerUrl = new URL('pdfjs-dist/build/pdf.worker.min.mjs', import.meta.url)
    pdfjsLib.GlobalWorkerOptions.workerPort = new Worker(workerUrl, { type: 'module' })
  } catch {
    // Deliberate best effort: carry on without a dedicated worker.
  }

  _pdfjsLib = pdfjsLib
  return _pdfjsLib
}

/**
 * Extracts text from a user-supplied file.
 *
 * - PDFs (by MIME type or `.pdf` extension) go through `extractFromPdf`.
 * - Images (by `image/*` MIME type or `.png`/`.jpg`/`.jpeg` extension) go through OCR.
 * - Anything else is read as plain text; a read failure yields `''`.
 *
 * @param file The file to read.
 * @returns The extracted text (possibly empty).
 */
export async function extractTextFromFile(file: File): Promise<string> {
  const mime = file.type || ''
  // Evaluated lazily so the name is only inspected when the MIME type is inconclusive.
  const hasExtension = (ext: string): boolean => file.name.toLowerCase().endsWith(ext)

  if (mime.includes('pdf') || hasExtension('.pdf')) {
    const pdfText = await extractFromPdf(file)
    if (import.meta.env.DEV) {
      // eslint-disable-next-line no-console
      console.info('[OCR][PDF]', file.name, 'len=', pdfText.length, 'peek=', pdfText.slice(0, 200))
    }
    return pdfText
  }

  if (mime.startsWith('image/') || ['.png', '.jpg', '.jpeg'].some(hasExtension)) {
    const imgText = await extractFromImage(file)
    if (import.meta.env.DEV) {
      // eslint-disable-next-line no-console
      console.info('[OCR][IMG]', file.name, 'len=', imgText.length, 'peek=', imgText.slice(0, 200))
    }
    return imgText
  }

  // Fallback: read the file as raw text.
  try {
    return await file.text()
  } catch {
    return ''
  }
}

/**
 * Renders one pdf.js page onto a canvas at scale 2 and encodes it as PNG.
 *
 * @param page A pdf.js page proxy.
 * @returns The PNG blob, or `null` when no 2D context is available or the
 *          canvas could not be encoded.
 */
async function renderPdfPageToPng(page: any): Promise<Blob | null> {
  const viewport = page.getViewport({ scale: 2 })
  const canvas = document.createElement('canvas')
  canvas.width = viewport.width
  canvas.height = viewport.height

  const ctx = canvas.getContext('2d')
  if (!ctx) return null

  await page.render({ canvasContext: ctx, viewport }).promise
  // toBlob hands back null when encoding fails; surface that instead of casting it away.
  return new Promise<Blob | null>((resolve) => canvas.toBlob((b) => resolve(b), 'image/png'))
}

/**
 * Extracts text from a PDF, reading at most the first 50 pages.
 *
 * Each page's embedded text is used when it holds at least 30 non-whitespace
 * characters; otherwise the page is rendered and sent through OCR. If the page
 * cannot be rendered to an image, whatever embedded text exists is kept.
 *
 * @param file The PDF file.
 * @returns Page texts joined with `\n`, or `''` when pdf.js cannot be loaded.
 */
async function extractFromPdf(file: File): Promise<string> {
  const pdfjsLib = await getPdfJs().catch(() => null)
  if (!pdfjsLib) return ''

  const arrayBuffer = await file.arrayBuffer()
  const pdf = await pdfjsLib.getDocument({ data: new Uint8Array(arrayBuffer) }).promise
  const texts: string[] = []
  const numPages = Math.min(pdf.numPages, 50)

  for (let i = 1; i <= numPages; i += 1) {
    const page = await pdf.getPage(i)
    const content = await page.getTextContent().catch(() => null)
    let pageText = ''
    if (content) {
      pageText = content.items.map((it: any) => (it.str ? it.str : '')).join(' ')
    }

    // OCR fallback when the page has no usable embedded text.
    if (!pageText || pageText.replace(/\s+/g, '').length < 30) {
      const png = await renderPdfPageToPng(page)
      if (png) {
        pageText = await extractFromImage(new File([png], `${file.name}-p${i}.png`, { type: 'image/png' }))
      }
    }

    if (pageText.trim()) texts.push(pageText)
  }

  return texts.join('\n')
}

/**
 * Upscales an image that is narrower than the OCR minimum width and, while
 * doing so, converts it to grayscale with a mild contrast boost.
 *
 * The minimum width is 1200px when the file name matches
 * `recto|verso|cni|carte`, otherwise 300px. Images that are already wide
 * enough are returned untouched.
 *
 * @param file The source image.
 * @returns The processed image blob, or `file` itself when no processing is
 *          needed or possible.
 */
async function prepareImageForOcr(file: File): Promise<Blob> {
  const imgBitmap = await createImageBitmap(file)
  try {
    const minWidth = /recto|verso|cni|carte/i.test(file.name) ? 1200 : 300
    if (imgBitmap.width >= minWidth) return file

    const scale = minWidth / Math.max(1, imgBitmap.width)
    const canvas = document.createElement('canvas')
    canvas.width = Math.max(300, Math.floor(imgBitmap.width * scale))
    canvas.height = Math.floor(imgBitmap.height * scale)

    const ctx = canvas.getContext('2d')
    if (!ctx) return file

    ctx.imageSmoothingEnabled = true
    ctx.imageSmoothingQuality = 'high'
    ctx.drawImage(imgBitmap, 0, 0, canvas.width, canvas.height)

    // Grayscale conversion plus a simple contrast stretch around mid-gray.
    const imgData = ctx.getImageData(0, 0, canvas.width, canvas.height)
    const data = imgData.data
    for (let i = 0; i < data.length; i += 4) {
      const r = data[i], g = data[i + 1], b = data[i + 2]
      // Luma
      let y = 0.299 * r + 0.587 * g + 0.114 * b
      // Simple contrast
      y = Math.max(0, Math.min(255, (y - 128) * 1.2 + 128))
      data[i] = data[i + 1] = data[i + 2] = y
    }
    ctx.putImageData(imgData, 0, 0)

    return await new Promise<Blob>((resolve) => canvas.toBlob((b) => resolve(b || file)))
  } finally {
    // Release the decoded bitmap eagerly instead of waiting for garbage collection.
    imgBitmap.close()
  }
}

/**
 * Runs OCR on an image and returns the best text found.
 *
 * Tries page segmentation modes 6, 7 and 11, each at 0/90/180/270 degrees, and
 * keeps the text with the highest `confidence * log(length + 1)` score. The
 * whole search stops as soon as one attempt reaches confidence >= 85 with more
 * than 40 characters.
 *
 * NOTE(review): `load`, `loadLanguage` and `initialize` depend on the installed
 * tesseract.js version; in releases where `createWorker` takes the language
 * list these calls may do nothing - confirm that 'fra+eng' is really loaded.
 *
 * @param file The image to recognise.
 * @returns The recognised text (possibly empty).
 */
async function extractFromImage(file: File): Promise<string> {
  const { createWorker } = await import('tesseract.js')
  const source = await prepareImageForOcr(file)

  const worker = await createWorker()
  try {
    // Configure the logger after creation to avoid a DataCloneError.
    // @ts-expect-error - setLogger is not directly on Worker type
    worker.setLogger?.((m: any) => {
      if (m?.progress != null) console.info('[OCR]', Math.round(m.progress * 100) + '%')
    })
    await worker.load()
    // @ts-expect-error - loadLanguage is not directly on Worker type
    await worker.loadLanguage('fra+eng')
    // @ts-expect-error - initialize is not directly on Worker type
    await worker.initialize('fra+eng')

    const rotations = [0, 90, 180, 270]
    const psmModes = ['6', '7', '11'] // 6: block, 7: single line, 11: sparse text
    let bestText = ''
    let bestScore = -1

    search: for (const psm of psmModes) {
      // @ts-expect-error - tessedit_pageseg_mode expects PSM enum, but string is used
      await worker.setParameters({ tessedit_pageseg_mode: psm })
      for (const deg of rotations) {
        const rotatedBlob = await rotateBlob(source, deg)
        const { data } = await worker.recognize(rotatedBlob)
        const text = data.text || ''
        const len = text.replace(/\s+/g, ' ').trim().length
        const score = (data.confidence || 0) * Math.log(len + 1)
        if (score > bestScore) {
          bestScore = score
          bestText = text
        }
        // Short-circuit on a very good result: leave BOTH loops, otherwise every
        // remaining segmentation mode would still be tried.
        if (data.confidence >= 85 && len > 40) break search
      }
    }

    return bestText
  } finally {
    await worker.terminate()
  }
}

/**
 * Rotates an image blob clockwise by `deg` degrees on a canvas sized to the
 * rotated bounding box.
 *
 * @param blob The source image.
 * @param deg  Rotation in degrees; multiples of 360 return `blob` unchanged.
 * @returns The rotated image, or `blob` itself when rotation is not possible.
 */
async function rotateBlob(blob: Blob, deg: number): Promise<Blob> {
  if (deg % 360 === 0) return blob

  const bmp = await createImageBitmap(blob)
  try {
    const rad = (deg * Math.PI) / 180
    const sin = Math.abs(Math.sin(rad))
    const cos = Math.abs(Math.cos(rad))
    const w = bmp.width
    const h = bmp.height
    const newW = Math.floor(w * cos + h * sin)
    const newH = Math.floor(w * sin + h * cos)

    const canvas = document.createElement('canvas')
    canvas.width = newW
    canvas.height = newH

    const ctx = canvas.getContext('2d')
    if (!ctx) return blob

    ctx.imageSmoothingEnabled = true
    ctx.imageSmoothingQuality = 'high'
    // Rotate about the centre of the target canvas.
    ctx.translate(newW / 2, newH / 2)
    ctx.rotate(rad)
    ctx.drawImage(bmp, -w / 2, -h / 2)

    return await new Promise<Blob>((resolve) => canvas.toBlob((b) => resolve(b || blob)))
  } finally {
    // Release the decoded bitmap eagerly instead of waiting for garbage collection.
    bmp.close()
  }
}