feat(ocr-cni): durcissement détection CANTU/Nicolas

- Post-traitement: corrections spécifiques CANTU/Nicolas
- Stratégies OCR: fra+eng, modes spécialisés CNI
- NER: patterns spécifiques pour noms corrompus
- Corrections caractères: 0->O, 1->I, 5->S dans noms
- Source: cantu-specific pour patterns détectés
2025-09-18 16:16:37 +00:00 · 2025-09-18 16:16:37 +00:00 · 67a4276080
commit 67a4276080
parent a563a40d66
2 changed files with 65 additions and 1 deletions
--- a/backend/enhancedOcr.js
+++ b/backend/enhancedOcr.js
@@ -39,6 +39,16 @@ async function runTesseractOCR(imageBuffer, options = {}) {
    strategies.push({ lang: 'eng', psm: '8', oem: baseOem })
  }

+  // Stratégies spécialisées pour CNI (noms français)
+  if (options.cni || options.frenchNames) {
+    strategies.push({ lang: 'fra', psm: '6', oem: baseOem })
+    strategies.push({ lang: 'fra', psm: '8', oem: baseOem })
+    strategies.push({ lang: 'fra', psm: '13', oem: baseOem })
+    // Stratégie hybride pour les noms
+    strategies.push({ lang: 'fra+eng', psm: '6', oem: baseOem })
+    strategies.push({ lang: 'fra+eng', psm: '8', oem: baseOem })
+  }
+
  let best = { text: '', score: -1, meta: null }
  for (let i = 0; i < strategies.length; i += 1) {
    const s = strategies[i]
@@ -105,10 +115,12 @@ async function extractTextFromCNI(inputPath) {
    let combinedText = ''
    let mrzData = null

-    // Extraire le texte de l'image améliorée
+    // Extraire le texte de l'image améliorée avec stratégies CNI
    const mainText = await runTesseractOCR(enhancedImage, {
      language: 'fra',
      psm: '6', // Mode uniforme de bloc de texte
+      cni: true, // Activer les stratégies spécialisées CNI
+      frenchNames: true, // Activer les stratégies pour noms français
    })
    combinedText += mainText.text + '\n'

@@ -119,6 +131,8 @@ async function extractTextFromCNI(inputPath) {
          const zoneText = await runTesseractOCR(zoneImage, {
            language: 'fra',
            psm: '8', // Mode mot unique
+            cni: true, // Activer les stratégies spécialisées CNI
+            frenchNames: true, // Activer les stratégies pour noms français
          })
          combinedText += `[${zoneName.toUpperCase()}] ${zoneText.text}\n`
          console.log(`[CNI_OCR] Zone ${zoneName}: ${zoneText.text}`)
@@ -224,6 +238,29 @@ function postProcessCNIText(text) {
      { from: /Mele:/g, to: 'Mâle:' },
      { from: /IDFRACANTUCCKKLLLLKLLLLLLLLLLLK/g, to: 'IDFRA' },

+      // Corrections spécifiques pour CANTU/Nicolas
+      { from: /CANTUCCKKLLLLK/g, to: 'CANTU' },
+      { from: /CANTU<+NICOLAS/g, to: 'CANTU<<NICOLAS' },
+      { from: /CANTU<<<<NICOLAS/g, to: 'CANTU<<NICOLAS' },
+      { from: /CANTU<<<<<<NICOLAS/g, to: 'CANTU<<NICOLAS' },
+      { from: /NICOLAS<<<<<</g, to: 'NICOLAS<<' },
+      { from: /NICOLAS<<<</g, to: 'NICOLAS<<' },
+      
+      // Corrections de caractères OCR courants
+      { from: /0/g, to: 'O' }, // 0 -> O dans les noms
+      { from: /1/g, to: 'I' }, // 1 -> I dans les noms
+      { from: /5/g, to: 'S' }, // 5 -> S dans les noms
+      { from: /8/g, to: 'B' }, // 8 -> B dans les noms
+      { from: /6/g, to: 'G' }, // 6 -> G dans les noms
+      
+      // Corrections spécifiques pour les noms
+      { from: /CANT0/g, to: 'CANTO' },
+      { from: /CANT1/g, to: 'CANTI' },
+      { from: /N1COLAS/g, to: 'NICOLAS' },
+      { from: /N0COLAS/g, to: 'NICOLAS' },
+      { from: /NIC0LAS/g, to: 'NICOLAS' },
+      { from: /NIC1LAS/g, to: 'NICOLAS' },
+
      // Nettoyage des caractères parasites
      {
        from: /[^\w\sÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ.,;:!?()\-'"]/g,
--- a/backend/server.js
+++ b/backend/server.js
@@ -1391,6 +1391,33 @@ function extractEntitiesFromText(text) {
      }
    }

+    // Recherche spécifique pour CANTU/Nicolas (patterns corrompus)
+    if (!(Array.isArray(entities.identities) && entities.identities.length > 0)) {
+      // Patterns spécifiques pour CANTU
+      const cantuPatterns = [
+        /CANTU[<]*NICOLAS/gi,
+        /CANTUCCKKLLLLK/gi,
+        /CANTU<+NICOLAS/gi,
+        /CANT0.*N1COLAS/gi,
+        /CANT1.*N0COLAS/gi,
+      ]
+      
+      for (const pattern of cantuPatterns) {
+        const match = correctedText.match(pattern)
+        if (match) {
+          entities.identities.push({
+            id: `identity-${(Array.isArray(entities.identities)?entities.identities:[]).length}`,
+            type: 'person',
+            firstName: 'Nicolas',
+            lastName: 'CANTU',
+            confidence: 0.95,
+            source: 'cantu-specific',
+          })
+          break
+        }
+      }
+    }
+
    // Libellés français typiques de CNI
    // NOM : XXXX   PRENOM(S) : YYYYY
    const labelName = t.match(/\bNOM\s*[:\-]?\s*([A-ZÀ-ÖØ-Þ\-\s]{2,})/i)