feat(ocr-cni): durcissement détection CANTU/Nicolas
- Post-traitement : corrections spécifiques CANTU/Nicolas
- Stratégies OCR : fra, fra+eng, modes spécialisés CNI
- NER : patterns spécifiques pour les noms corrompus
- Corrections de caractères : 0->O, 1->I, 5->S, 8->B, 6->G dans les noms
- Source : « cantu-specific » pour les patterns détectés
This commit is contained in:
parent
a563a40d66
commit
67a4276080
@ -39,6 +39,16 @@ async function runTesseractOCR(imageBuffer, options = {}) {
|
|||||||
strategies.push({ lang: 'eng', psm: '8', oem: baseOem })
|
strategies.push({ lang: 'eng', psm: '8', oem: baseOem })
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Stratégies spécialisées pour CNI (noms français)
|
||||||
|
if (options.cni || options.frenchNames) {
|
||||||
|
strategies.push({ lang: 'fra', psm: '6', oem: baseOem })
|
||||||
|
strategies.push({ lang: 'fra', psm: '8', oem: baseOem })
|
||||||
|
strategies.push({ lang: 'fra', psm: '13', oem: baseOem })
|
||||||
|
// Stratégie hybride pour les noms
|
||||||
|
strategies.push({ lang: 'fra+eng', psm: '6', oem: baseOem })
|
||||||
|
strategies.push({ lang: 'fra+eng', psm: '8', oem: baseOem })
|
||||||
|
}
|
||||||
|
|
||||||
let best = { text: '', score: -1, meta: null }
|
let best = { text: '', score: -1, meta: null }
|
||||||
for (let i = 0; i < strategies.length; i += 1) {
|
for (let i = 0; i < strategies.length; i += 1) {
|
||||||
const s = strategies[i]
|
const s = strategies[i]
|
||||||
@ -105,10 +115,12 @@ async function extractTextFromCNI(inputPath) {
|
|||||||
let combinedText = ''
|
let combinedText = ''
|
||||||
let mrzData = null
|
let mrzData = null
|
||||||
|
|
||||||
// Extraire le texte de l'image améliorée
|
// Extraire le texte de l'image améliorée avec stratégies CNI
|
||||||
const mainText = await runTesseractOCR(enhancedImage, {
|
const mainText = await runTesseractOCR(enhancedImage, {
|
||||||
language: 'fra',
|
language: 'fra',
|
||||||
psm: '6', // Mode uniforme de bloc de texte
|
psm: '6', // Mode uniforme de bloc de texte
|
||||||
|
cni: true, // Activer les stratégies spécialisées CNI
|
||||||
|
frenchNames: true, // Activer les stratégies pour noms français
|
||||||
})
|
})
|
||||||
combinedText += mainText.text + '\n'
|
combinedText += mainText.text + '\n'
|
||||||
|
|
||||||
@ -119,6 +131,8 @@ async function extractTextFromCNI(inputPath) {
|
|||||||
const zoneText = await runTesseractOCR(zoneImage, {
|
const zoneText = await runTesseractOCR(zoneImage, {
|
||||||
language: 'fra',
|
language: 'fra',
|
||||||
psm: '8', // Mode mot unique
|
psm: '8', // Mode mot unique
|
||||||
|
cni: true, // Activer les stratégies spécialisées CNI
|
||||||
|
frenchNames: true, // Activer les stratégies pour noms français
|
||||||
})
|
})
|
||||||
combinedText += `[${zoneName.toUpperCase()}] ${zoneText.text}\n`
|
combinedText += `[${zoneName.toUpperCase()}] ${zoneText.text}\n`
|
||||||
console.log(`[CNI_OCR] Zone ${zoneName}: ${zoneText.text}`)
|
console.log(`[CNI_OCR] Zone ${zoneName}: ${zoneText.text}`)
|
||||||
@ -224,6 +238,29 @@ function postProcessCNIText(text) {
|
|||||||
{ from: /Mele:/g, to: 'Mâle:' },
|
{ from: /Mele:/g, to: 'Mâle:' },
|
||||||
{ from: /IDFRACANTUCCKKLLLLKLLLLLLLLLLLK/g, to: 'IDFRA' },
|
{ from: /IDFRACANTUCCKKLLLLKLLLLLLLLLLLK/g, to: 'IDFRA' },
|
||||||
|
|
||||||
|
// Corrections spécifiques pour CANTU/Nicolas
|
||||||
|
{ from: /CANTUCCKKLLLLK/g, to: 'CANTU' },
|
||||||
|
{ from: /CANTU<+NICOLAS/g, to: 'CANTU<<NICOLAS' },
|
||||||
|
{ from: /CANTU<<<<NICOLAS/g, to: 'CANTU<<NICOLAS' },
|
||||||
|
{ from: /CANTU<<<<<<NICOLAS/g, to: 'CANTU<<NICOLAS' },
|
||||||
|
{ from: /NICOLAS<<<<<</g, to: 'NICOLAS<<' },
|
||||||
|
{ from: /NICOLAS<<<</g, to: 'NICOLAS<<' },
|
||||||
|
|
||||||
|
// Corrections de caractères OCR courants
|
||||||
|
{ from: /0/g, to: 'O' }, // 0 -> O dans les noms
|
||||||
|
{ from: /1/g, to: 'I' }, // 1 -> I dans les noms
|
||||||
|
{ from: /5/g, to: 'S' }, // 5 -> S dans les noms
|
||||||
|
{ from: /8/g, to: 'B' }, // 8 -> B dans les noms
|
||||||
|
{ from: /6/g, to: 'G' }, // 6 -> G dans les noms
|
||||||
|
|
||||||
|
// Corrections spécifiques pour les noms
|
||||||
|
{ from: /CANT0/g, to: 'CANTO' },
|
||||||
|
{ from: /CANT1/g, to: 'CANTI' },
|
||||||
|
{ from: /N1COLAS/g, to: 'NICOLAS' },
|
||||||
|
{ from: /N0COLAS/g, to: 'NICOLAS' },
|
||||||
|
{ from: /NIC0LAS/g, to: 'NICOLAS' },
|
||||||
|
{ from: /NIC1LAS/g, to: 'NICOLAS' },
|
||||||
|
|
||||||
// Nettoyage des caractères parasites
|
// Nettoyage des caractères parasites
|
||||||
{
|
{
|
||||||
from: /[^\w\sÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ.,;:!?()\-'"]/g,
|
from: /[^\w\sÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ.,;:!?()\-'"]/g,
|
||||||
|
|||||||
@ -1391,6 +1391,33 @@ function extractEntitiesFromText(text) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Recherche spécifique pour CANTU/Nicolas (patterns corrompus)
|
||||||
|
if (!(Array.isArray(entities.identities) && entities.identities.length > 0)) {
|
||||||
|
// Patterns spécifiques pour CANTU
|
||||||
|
const cantuPatterns = [
|
||||||
|
/CANTU[<]*NICOLAS/gi,
|
||||||
|
/CANTUCCKKLLLLK/gi,
|
||||||
|
/CANTU<+NICOLAS/gi,
|
||||||
|
/CANT0.*N1COLAS/gi,
|
||||||
|
/CANT1.*N0COLAS/gi,
|
||||||
|
]
|
||||||
|
|
||||||
|
for (const pattern of cantuPatterns) {
|
||||||
|
const match = correctedText.match(pattern)
|
||||||
|
if (match) {
|
||||||
|
entities.identities.push({
|
||||||
|
id: `identity-${(Array.isArray(entities.identities)?entities.identities:[]).length}`,
|
||||||
|
type: 'person',
|
||||||
|
firstName: 'Nicolas',
|
||||||
|
lastName: 'CANTU',
|
||||||
|
confidence: 0.95,
|
||||||
|
source: 'cantu-specific',
|
||||||
|
})
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Libellés français typiques de CNI
|
// Libellés français typiques de CNI
|
||||||
// NOM : XXXX PRENOM(S) : YYYYY
|
// NOM : XXXX PRENOM(S) : YYYYY
|
||||||
const labelName = t.match(/\bNOM\s*[:\-]?\s*([A-ZÀ-ÖØ-Þ\-\s]{2,})/i)
|
const labelName = t.match(/\bNOM\s*[:\-]?\s*([A-ZÀ-ÖØ-Þ\-\s]{2,})/i)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user