All files / src/services fileExtract.ts

0% Statements 0/141
100% Branches 1/1
100% Functions 1/1
0% Lines 0/141

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169                                                                                                                                                                                                                                                                                                                                                 
// Lazily loaded local libraries (pdfjs-dist / tesseract.js)
let _pdfjsLib: any | null = null
// In-flight load, shared by overlapping callers so that a single worker is spawned.
let _pdfjsLibLoading: Promise<any> | null = null

/**
 * Loads `pdfjs-dist` on first use and caches the module for later calls.
 *
 * On first load it tries to attach a real module worker to
 * `GlobalWorkerOptions.workerPort`; if the worker cannot be created the error
 * is ignored and the library is returned without it.
 *
 * Calls that overlap while the import is still pending share the same load,
 * so the import and the worker creation happen once. A failed import is not
 * cached: the next call tries again.
 *
 * @returns The pdf.js module namespace.
 * @throws Whatever the dynamic import of `pdfjs-dist` rejects with.
 */
async function getPdfJs() {
  if (_pdfjsLib) return _pdfjsLib
  if (!_pdfjsLibLoading) {
    _pdfjsLibLoading = (async () => {
      try {
        const pdfjsLib: any = await import('pdfjs-dist')
        try {
          // Use a real module worker to avoid the fake worker
          const workerUrl = new URL('pdfjs-dist/build/pdf.worker.min.mjs', import.meta.url)
          pdfjsLib.GlobalWorkerOptions.workerPort = new Worker(workerUrl, { type: 'module' })
        } catch {
          // Best effort: ignore if the worker cannot be found or started
        }
        _pdfjsLib = pdfjsLib
        return pdfjsLib
      } finally {
        // On success `_pdfjsLib` now serves callers; on failure this allows a retry.
        _pdfjsLibLoading = null
      }
    })()
  }
  return _pdfjsLibLoading
}
 
/**
 * Extracts text from a file, choosing the strategy from its MIME type or
 * file-name extension.
 *
 * - PDF (MIME contains `pdf` or name ends with `.pdf`): `extractFromPdf`.
 * - Image (MIME starts with `image/` or name ends with `.png`/`.jpg`/`.jpeg`):
 *   `extractFromImage`.
 * - Anything else: the file is read as raw text; a read failure yields `''`.
 *
 * In DEV builds the length and the first 200 characters of PDF/image results
 * are logged to the console.
 *
 * @param file - File to extract text from.
 * @returns The extracted text.
 */
export async function extractTextFromFile(file: File): Promise<string> {
  const mime = file.type || ''
  // Evaluated lazily so the name is only inspected when the MIME type is inconclusive.
  const nameEndsWith = (ext: string): boolean => file.name.toLowerCase().endsWith(ext)
  const logPeek = (tag: string, text: string): void => {
    if (!import.meta.env.DEV) return
    // eslint-disable-next-line no-console
    console.info(tag, file.name, 'len=', text.length, 'peek=', text.slice(0, 200))
  }

  const looksLikePdf = mime.includes('pdf') || nameEndsWith('.pdf')
  if (looksLikePdf) {
    const extracted = await extractFromPdf(file)
    logPeek('[OCR][PDF]', extracted)
    return extracted
  }

  const looksLikeImage =
    mime.startsWith('image/') || ['.png', '.jpg', '.jpeg'].some((ext) => nameEndsWith(ext))
  if (looksLikeImage) {
    const extracted = await extractFromImage(file)
    logPeek('[OCR][IMG]', extracted)
    return extracted
  }

  // Fallback: read the file as raw text
  try {
    const raw = await file.text()
    return raw
  } catch {
    return ''
  }
}
 
/**
 * Rasterises one pdf.js page at scale 2 and encodes it as a PNG blob.
 *
 * @param page - pdf.js page proxy (untyped here, like the rest of the pdf.js usage).
 * @returns The PNG blob, or `null` when no 2D context is available or the
 *          canvas could not be encoded.
 */
async function renderPdfPageToPng(page: any): Promise<Blob | null> {
  const viewport = page.getViewport({ scale: 2 })
  const canvas = document.createElement('canvas')
  canvas.width = viewport.width
  canvas.height = viewport.height
  const ctx = canvas.getContext('2d')
  if (!ctx) return null
  try {
    await page.render({ canvasContext: ctx, viewport }).promise
    // `toBlob` hands back null when encoding fails; pass that through instead of casting it away.
    return await new Promise<Blob | null>((resolve) => canvas.toBlob(resolve, 'image/png'))
  } finally {
    // Shrink the backing store so the page-sized bitmap can be reclaimed promptly.
    canvas.width = 0
    canvas.height = 0
  }
}

/**
 * Extracts the text of a PDF page by page (first 50 pages at most).
 *
 * Each page's embedded text layer is used when present. When a page yields
 * fewer than 30 non-whitespace characters, the page is rendered to a PNG and
 * run through `extractFromImage`; the OCR result replaces the text layer only
 * when it is non-blank, so a short but real text layer is not thrown away.
 *
 * The pdf.js document is destroyed once extraction finishes or fails.
 *
 * @param file - PDF file to read.
 * @returns Non-blank page texts joined with `\n`, or `''` when pdf.js could
 *          not be loaded.
 */
async function extractFromPdf(file: File): Promise<string> {
  const pdfjsLib = await getPdfJs().catch(() => null)
  if (!pdfjsLib) return ''
  const arrayBuffer = await file.arrayBuffer()
  const pdf = await pdfjsLib.getDocument({ data: new Uint8Array(arrayBuffer) }).promise
  try {
    const texts: string[] = []
    const numPages = Math.min(pdf.numPages, 50)
    for (let i = 1; i <= numPages; i += 1) {
      const page = await pdf.getPage(i)
      const content = await page.getTextContent().catch(() => null)
      let pageText: string = content
        ? content.items.map((it: any) => (it.str ? it.str : '')).join(' ')
        : ''
      // OCR fallback when the page has no usable text
      if (pageText.replace(/\s+/g, '').length < 30) {
        const png = await renderPdfPageToPng(page)
        if (png) {
          const ocrText = await extractFromImage(
            new File([png], `${file.name}-p${i}.png`, { type: 'image/png' }),
          )
          if (ocrText.trim()) pageText = ocrText
        }
      }
      // Release per-page resources held by pdf.js before moving on.
      page.cleanup?.()
      if (pageText.trim()) texts.push(pageText)
    }
    return texts.join('\n')
  } finally {
    try {
      await pdf.destroy?.()
    } catch {
      // Best effort: a failed teardown must not mask the extraction result or error
    }
  }
}
 
/**
 * Prepares an image for OCR.
 *
 * Images narrower than the minimum width (1200px when the file name matches
 * `recto|verso|cni|carte`, 300px otherwise) are upscaled on a canvas, converted
 * to grayscale and given a simple contrast boost. Wider images are returned
 * untouched.
 *
 * @param file - Source image.
 * @returns The blob to feed to the OCR engine (the original file when no
 *          preprocessing was needed or possible).
 */
async function prepareImageForOcr(file: File): Promise<Blob> {
  const bitmap = await createImageBitmap(file)
  try {
    // ID-card style scans get a more aggressive upscale
    const minWidth = /recto|verso|cni|carte/i.test(file.name) ? 1200 : 300
    if (bitmap.width >= minWidth) return file

    const scale = minWidth / Math.max(1, bitmap.width)
    const canvas = document.createElement('canvas')
    canvas.width = Math.max(300, Math.floor(bitmap.width * scale))
    // At least 1px high: getImageData rejects a zero-sized rectangle.
    canvas.height = Math.max(1, Math.floor(bitmap.height * scale))
    const ctx = canvas.getContext('2d')
    if (!ctx) return file
    ctx.imageSmoothingEnabled = true
    ctx.imageSmoothingQuality = 'high'
    ctx.drawImage(bitmap, 0, 0, canvas.width, canvas.height)

    // Grayscale conversion + contrast enhancement
    const imgData = ctx.getImageData(0, 0, canvas.width, canvas.height)
    const data = imgData.data
    for (let i = 0; i < data.length; i += 4) {
      const r = data[i], g = data[i + 1], b = data[i + 2]
      // luma
      let y = 0.299 * r + 0.587 * g + 0.114 * b
      // simple contrast stretch around mid-gray
      y = Math.max(0, Math.min(255, (y - 128) * 1.2 + 128))
      data[i] = data[i + 1] = data[i + 2] = y
    }
    ctx.putImageData(imgData, 0, 0)
    return await new Promise<Blob>((resolve) => canvas.toBlob((b) => resolve(b ?? file)))
  } finally {
    // The decoded bitmap is no longer needed once it has been drawn (or skipped).
    bitmap.close()
  }
}

/**
 * Runs Tesseract OCR (French + English) on an image and returns the best text.
 *
 * The image is first normalised by `prepareImageForOcr`, then recognised with
 * page-segmentation modes 6, 7 and 11 at 0/90/180/270 degrees. Each attempt is
 * scored as `confidence * log(textLength + 1)` and the highest-scoring text is
 * kept. The whole search stops as soon as one attempt reaches confidence >= 85
 * with more than 40 characters.
 *
 * NOTE(review): languages and the logger are passed to `createWorker`, which is
 * the tesseract.js v5+ signature; the previous `load`/`loadLanguage`/`initialize`
 * calls are deprecated no-ops there. Confirm the installed tesseract.js version.
 *
 * @param file - Image to recognise.
 * @returns The best recognised text, or `''` when nothing was recognised.
 */
async function extractFromImage(file: File): Promise<string> {
  const { createWorker } = await import('tesseract.js')

  const source = await prepareImageForOcr(file)

  // The logger goes in the options argument: it stays on the main thread and is
  // never posted to the worker, which is what caused DataCloneError.
  const worker = await createWorker(['fra', 'eng'], undefined, {
    logger: (m: any) => {
      if (m?.progress != null) console.info('[OCR]', Math.round(m.progress * 100) + '%')
    },
  })
  try {
    // Try several PSMs and orientations (0/90/180/270) and keep the best result
    const rotations = [0, 90, 180, 270]
    const psmModes = ['6', '7', '11'] // 6: block, 7: single line, 11: sparse text
    // Rotations do not depend on the PSM: render each one once and reuse it.
    const rotatedByDeg = new Map<number, Blob>()
    let bestText = ''
    let bestScore = -1

    search: for (const psm of psmModes) {
      // @ts-expect-error - tessedit_pageseg_mode expects PSM enum, but string is used
      await worker.setParameters({ tessedit_pageseg_mode: psm })
      for (const deg of rotations) {
        let candidate = rotatedByDeg.get(deg)
        if (!candidate) {
          candidate = await rotateBlob(source, deg)
          rotatedByDeg.set(deg, candidate)
        }
        const { data } = await worker.recognize(candidate)
        const text = data.text || ''
        const len = text.replace(/\s+/g, ' ').trim().length
        const confidence = data.confidence || 0
        const score = confidence * Math.log(len + 1)
        if (score > bestScore) {
          bestScore = score
          bestText = text
        }
        // Short-circuit on a very good result: leave BOTH loops, not just the rotations
        if (confidence >= 85 && len > 40) break search
      }
    }

    return bestText
  } finally {
    await worker.terminate()
  }
}
 
/**
 * Returns a copy of an image blob rotated by `deg` degrees.
 *
 * The output canvas is sized to the bounding box of the rotated image and the
 * image is drawn centred in it. Multiples of 360 degrees return the input blob
 * unchanged. If no 2D context is available, or the canvas cannot be encoded,
 * the original blob is returned as a fallback.
 *
 * @param blob - Image to rotate.
 * @param deg - Rotation angle in degrees.
 * @returns The rotated image, or `blob` itself when no rotation was applied.
 */
async function rotateBlob(blob: Blob, deg: number): Promise<Blob> {
  if (deg % 360 === 0) return blob
  const bmp = await createImageBitmap(blob)
  try {
    const rad = (deg * Math.PI) / 180
    const sin = Math.abs(Math.sin(rad))
    const cos = Math.abs(Math.cos(rad))
    const w = bmp.width
    const h = bmp.height
    // Bounding box of the rotated rectangle
    const newW = Math.floor(w * cos + h * sin)
    const newH = Math.floor(w * sin + h * cos)
    const canvas = document.createElement('canvas')
    canvas.width = newW
    canvas.height = newH
    const ctx = canvas.getContext('2d')
    // Same fallback as a failed encode: hand back the unrotated image.
    if (!ctx) return blob
    ctx.imageSmoothingEnabled = true
    ctx.imageSmoothingQuality = 'high'
    // Rotate around the canvas centre, then draw the bitmap centred on the origin.
    ctx.translate(newW / 2, newH / 2)
    ctx.rotate(rad)
    ctx.drawImage(bmp, -w / 2, -h / 2)
    return await new Promise<Blob>((resolve) => canvas.toBlob((b) => resolve(b ?? blob)))
  } finally {
    // Free the decoded bitmap; it is not needed once drawn.
    bmp.close()
  }
}