4NK_IA_front/src/services/fileExtract.ts

// Lazy local loaders (pdfjs-dist / tesseract.js)

// Resolved pdf.js module, once loading has completed.
let _pdfjsLib: any | null = null
// In-flight load, shared by every caller that arrives before it settles.
let _pdfjsLoading: Promise<any> | null = null

/**
 * Lazily imports `pdfjs-dist` and wires up its worker, memoizing the result.
 *
 * Concurrent callers share a single load, so the module is imported and the
 * worker is created at most once. If the import fails, the memoized promise is
 * dropped so that a later call can retry.
 *
 * @returns The pdf.js module namespace.
 * @throws Whatever the dynamic import of `pdfjs-dist` rejects with.
 */
async function getPdfJs() {
  if (_pdfjsLib) return _pdfjsLib
  if (!_pdfjsLoading) {
    _pdfjsLoading = (async () => {
      const pdfjsLib: any = await import('pdfjs-dist')
      try {
        // Use a real module worker so pdf.js does not fall back to its fake worker.
        const workerUrl = new URL('pdfjs-dist/build/pdf.worker.min.mjs', import.meta.url)
        pdfjsLib.GlobalWorkerOptions.workerPort = new Worker(workerUrl, { type: 'module' })
      } catch {
        // Deliberately best-effort: carry on without a dedicated worker if it cannot be created.
      }
      _pdfjsLib = pdfjsLib
      return pdfjsLib
    })().catch((err: unknown) => {
      // Forget the failed attempt so the next call retries instead of replaying the rejection.
      _pdfjsLoading = null
      throw err
    })
  }
  return _pdfjsLoading
}

/**
 * Extracts the textual content of a file, choosing a strategy from its MIME
 * type and file name.
 *
 * - PDF (MIME contains `pdf`, or name ends with `.pdf`): handled by `extractFromPdf`.
 * - Image (MIME starts with `image/`, or name ends with `.png`/`.jpg`/`.jpeg`):
 *   handled by `extractFromImage`.
 * - Anything else: read as plain text; an unreadable file yields `''`.
 *
 * In development builds, the length and the first 200 characters of PDF/image
 * results are logged.
 *
 * @param file The file to extract text from.
 * @returns The extracted text (possibly empty).
 */
export async function extractTextFromFile(file: File): Promise<string> {
  const mime = file.type || ''
  const nameEndsWith = (ext: string): boolean => file.name.toLowerCase().endsWith(ext)
  const debugPeek = (tag: string, text: string): void => {
    if (!import.meta.env.DEV) return
    // eslint-disable-next-line no-console
    console.info(tag, file.name, 'len=', text.length, 'peek=', text.slice(0, 200))
  }

  const isPdf = mime.includes('pdf') || nameEndsWith('.pdf')
  if (isPdf) {
    const pdfText = await extractFromPdf(file)
    debugPeek('[OCR][PDF]', pdfText)
    return pdfText
  }

  const isImage = mime.startsWith('image/') || ['.png', '.jpg', '.jpeg'].some((ext) => nameEndsWith(ext))
  if (isImage) {
    const imgText = await extractFromImage(file)
    debugPeek('[OCR][IMG]', imgText)
    return imgText
  }

  // Fallback: read the file as raw text, treating any failure as "no text".
  let rawText = ''
  try {
    rawText = await file.text()
  } catch {
    rawText = ''
  }
  return rawText
}

/**
 * Rasterizes one pdf.js page to a PNG blob at the given scale.
 *
 * @param page  A pdf.js page proxy (as returned by `pdf.getPage`).
 * @param scale Viewport scale factor used for rendering.
 * @returns The PNG blob, or `null` when no 2D context is available or the
 *          canvas could not be encoded.
 */
async function renderPdfPageToPng(page: any, scale: number): Promise<Blob | null> {
  const viewport = page.getViewport({ scale })
  const canvas = document.createElement('canvas')
  canvas.width = viewport.width
  canvas.height = viewport.height
  try {
    const ctx = canvas.getContext('2d')
    if (!ctx) return null
    await page.render({ canvasContext: ctx, viewport }).promise
    // toBlob hands `null` to its callback when encoding fails; surface that to the caller.
    return await new Promise<Blob | null>((resolve) => canvas.toBlob((b) => resolve(b), 'image/png'))
  } finally {
    // Shrink the canvas so its pixel buffer can be reclaimed without waiting for GC
    // (up to 50 full-page rasters may be produced per document).
    canvas.width = 0
    canvas.height = 0
  }
}

/**
 * Extracts text from a PDF, page by page (first 50 pages at most).
 *
 * For each page the embedded text layer is read first. When it yields fewer
 * than 30 non-whitespace characters, the page is rendered at scale 2 and run
 * through `extractFromImage` (OCR) instead. Pages whose final text is blank are
 * skipped; the remaining page texts are joined with newlines.
 *
 * @param file The PDF file.
 * @returns The concatenated page texts, or `''` when pdf.js cannot be loaded.
 * @throws If the document cannot be opened, a page cannot be fetched or
 *         rendered, or OCR fails.
 */
async function extractFromPdf(file: File): Promise<string> {
  const pdfjsLib = await getPdfJs().catch(() => null)
  if (!pdfjsLib) return ''
  const arrayBuffer = await file.arrayBuffer()
  // NOTE(review): the document itself is never destroyed. `loadingTask.destroy()` would free it,
  // but the worker port appears to be shared across documents — confirm it is safe with
  // concurrent extractions before adding it.
  const pdf = await pdfjsLib.getDocument({ data: new Uint8Array(arrayBuffer) }).promise
  const texts: string[] = []
  const numPages = Math.min(pdf.numPages, 50)
  for (let i = 1; i <= numPages; i += 1) {
    const page = await pdf.getPage(i)
    try {
      // A failing text layer is not fatal: fall through to OCR.
      const content = await page.getTextContent().catch(() => null)
      let pageText = ''
      if (content) {
        pageText = content.items.map((it: any) => (it.str ? it.str : '')).join(' ')
      }
      // OCR fallback when the text layer has no usable text.
      if (!pageText || pageText.replace(/\s+/g, '').length < 30) {
        const png = await renderPdfPageToPng(page, 2)
        if (png) {
          pageText = await extractFromImage(new File([png], `${file.name}-p${i}.png`, { type: 'image/png' }))
        } else {
          // Nothing to OCR: keep whatever the text layer produced rather than feeding
          // an invalid image to the OCR engine.
          console.warn('[OCR][PDF]', file.name, 'page', i, 'could not be rasterized, OCR skipped')
        }
      }
      if (pageText.trim()) texts.push(pageText)
    } finally {
      // Release per-page resources held by pdf.js once the page is done.
      page.cleanup?.()
    }
  }
  return texts.join('\n')
}

/**
 * Runs OCR (tesseract.js, `fra+eng`) on an image and returns the best text found.
 *
 * Steps:
 * 1. If the image is narrower than a minimum width (1200px when the file name
 *    matches `recto|verso|cni|carte`, otherwise 300px), it is upscaled to that
 *    width, converted to grayscale and given a mild contrast boost.
 * 2. The image is recognized under several page-segmentation modes and at
 *    0/90/180/270 degrees. Each attempt is scored from its confidence and text
 *    length; the highest-scoring text wins.
 * 3. The search stops early on a confident result (rotation loop) or once the
 *    best score exceeds 100 (segmentation-mode loop).
 *
 * Individual recognition failures are logged and skipped.
 *
 * @param file The image to recognize.
 * @returns The best recognized text, or `''` if no attempt produced any.
 * @throws If the image cannot be decoded or the OCR worker cannot be created.
 */
async function extractFromImage(file: File): Promise<string> {
  const { createWorker } = await import('tesseract.js')

  let source: Blob = file
  const imgBitmap = await createImageBitmap(file)
  try {
    // Identity-card-like file names get a much more aggressive upscale.
    const minWidth = /recto|verso|cni|carte/i.test(file.name) ? 1200 : 300
    if (imgBitmap.width < minWidth) {
      const scale = minWidth / Math.max(1, imgBitmap.width)
      const canvas = document.createElement('canvas')
      canvas.width = Math.max(300, Math.floor(imgBitmap.width * scale))
      canvas.height = Math.floor(imgBitmap.height * scale)
      const ctx = canvas.getContext('2d')
      // Without a 2D context there is nothing to preprocess: OCR the original file as-is.
      if (ctx) {
        ctx.imageSmoothingEnabled = true
        ctx.imageSmoothingQuality = 'high'
        ctx.drawImage(imgBitmap, 0, 0, canvas.width, canvas.height)
        // Grayscale conversion + contrast enhancement.
        const imgData = ctx.getImageData(0, 0, canvas.width, canvas.height)
        const data = imgData.data
        for (let i = 0; i < data.length; i += 4) {
          const r = data[i], g = data[i + 1], b = data[i + 2]
          // luma
          let y = 0.299 * r + 0.587 * g + 0.114 * b
          // simple contrast stretch around mid-gray
          y = Math.max(0, Math.min(255, (y - 128) * 1.2 + 128))
          data[i] = data[i + 1] = data[i + 2] = y
        }
        ctx.putImageData(imgData, 0, 0)
        // toBlob may report null; fall back to the untouched file in that case.
        source = await new Promise<Blob>((resolve) => canvas.toBlob((b) => resolve(b || file)))
      }
    }
  } finally {
    // The decoded bitmap is only needed for preprocessing; free it eagerly.
    imgBitmap.close()
  }

  // Rotated variants of `source` are identical for every segmentation mode, so build each once.
  const rotatedByDeg = new Map<number, Blob>()
  const getRotated = async (deg: number): Promise<Blob> => {
    const cached = rotatedByDeg.get(deg)
    if (cached) return cached
    const rotated = await rotateBlob(source, deg)
    rotatedByDeg.set(deg, rotated)
    return rotated
  }

  const worker = await createWorker('fra+eng')
  try {
    // Attach the logger after creation to avoid a DataCloneError.
    // @ts-expect-error - setLogger is not directly on Worker type
    worker.setLogger?.((m: any) => {
      if (m?.progress != null) console.info('[OCR]', Math.round(m.progress * 100) + '%')
    })
    // Settings tuned for identity cards and documents.
    const rotations = [0, 90, 180, 270]
    const psmModes = ['6', '7', '8', '11', '13'] // 6: block, 7: single line, 8: single word, 11: sparse text, 13: raw line
    let bestText = ''
    let bestScore = -1

    for (const psm of psmModes) {
      // Settings tuned for small images.
      const params = {
        tessedit_pageseg_mode: psm,
        tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ.,/-:() ',
        tessedit_ocr_engine_mode: '1', // LSTM OCR Engine
        preserve_interword_spaces: '1',
        textord_min_linesize: '2.0', // Lowers the minimum line size
        textord_min_xheight: '6', // Very low minimum x-height, for small text
        classify_bln_numeric_mode: '0',
        textord_heavy_nr: '1',
        textord_old_baselines: '0',
        textord_old_xheight: '0'
      }

      // @ts-expect-error - tessedit_pageseg_mode expects PSM enum, but string is used
      await worker.setParameters(params)

      for (const deg of rotations) {
        try {
          const rotatedBlob = await getRotated(deg)
          const { data } = await worker.recognize(rotatedBlob)
          const text = data.text || ''
          const len = text.replace(/\s+/g, ' ').trim().length

          // Score favouring both length and confidence.
          const confidence = Math.max(0, data.confidence || 0)
          const score = confidence * Math.log(len + 1) * (len > 10 ? 1.2 : 0.8)

          console.log(`[OCR] PSM:${psm} Rot:${deg}° Conf:${confidence.toFixed(1)}% Len:${len} Score:${score.toFixed(2)}`)

          if (score > bestScore) {
            bestScore = score
            bestText = text
            console.log(`[OCR] Nouveau meilleur résultat: "${text.substring(0, 100)}..."`)
          }

          // Short-circuit the remaining rotations on a very good result.
          if (confidence >= 80 && len > 20) {
            console.log(`[OCR] Résultat satisfaisant trouvé, arrêt de la recherche`)
            break
          }
        } catch (error) {
          // One failed attempt must not abort the whole search.
          console.warn(`[OCR] Erreur PSM:${psm} Rot:${deg}°:`, error instanceof Error ? error.message : String(error))
        }
      }

      // A good enough result makes the remaining segmentation modes unnecessary.
      if (bestScore > 100) break
    }

    return bestText
  } finally {
    await worker.terminate()
  }
}

/**
 * Rotates an image blob clockwise by `deg` degrees around its centre.
 *
 * The output canvas is sized to the bounding box of the rotated image. When
 * `deg` is a multiple of 360 the input blob is returned untouched. If the
 * canvas cannot be encoded, the original blob is returned as a fallback.
 *
 * @param blob Image data decodable by `createImageBitmap`.
 * @param deg  Rotation angle in degrees.
 * @returns The rotated image (canvas default encoding), or `blob` itself for a no-op rotation.
 * @throws If the blob cannot be decoded or no 2D canvas context is available.
 */
async function rotateBlob(blob: Blob, deg: number): Promise<Blob> {
  if (deg % 360 === 0) return blob
  const bmp = await createImageBitmap(blob)
  try {
    const rad = (deg * Math.PI) / 180
    const sin = Math.abs(Math.sin(rad))
    const cos = Math.abs(Math.cos(rad))
    const w = bmp.width
    const h = bmp.height
    // Bounding box of the rotated rectangle.
    const newW = Math.floor(w * cos + h * sin)
    const newH = Math.floor(w * sin + h * cos)
    const canvas = document.createElement('canvas')
    canvas.width = newW
    canvas.height = newH
    const ctx = canvas.getContext('2d')
    if (!ctx) throw new Error('2D canvas context unavailable')
    ctx.imageSmoothingEnabled = true
    ctx.imageSmoothingQuality = 'high'
    // Rotate about the canvas centre, then draw the bitmap centred on the origin.
    ctx.translate(newW / 2, newH / 2)
    ctx.rotate(rad)
    ctx.drawImage(bmp, -w / 2, -h / 2)
    // toBlob may report null; fall back to the unrotated input in that case.
    return await new Promise<Blob>((resolve) => canvas.toBlob((b) => resolve(b || blob)))
  } finally {
    // Free the decoded bitmap eagerly; this runs once per rotation attempt.
    bmp.close()
  }
}