feat(ocr): boost confiance avec annuaire de noms (FR+multi-lang) + docs
This commit is contained in:
parent
1118bbbf5d
commit
a3501def35
66
backend/nameDirectory.js
Normal file
66
backend/nameDirectory.js
Normal file
@ -0,0 +1,66 @@
|
||||
const fs = require('fs')
|
||||
const path = require('path')
|
||||
|
||||
/**
 * Reads a delimited text file and collects every cell that looks like a person name.
 *
 * Cells are separated by `;`, `,` or a tab. A cell is kept when, once trimmed
 * (and stripped of one pair of surrounding double quotes), it is at least two
 * characters long and made only of Latin letters (including accented ones),
 * apostrophes, hyphens and whitespace.
 *
 * @param {string} filePath - Path of the file to read (decoded as UTF-8).
 * @returns {string[]} The accepted cells, in file order. Empty when the file
 *   cannot be read; this function never throws.
 */
function loadCsvNames(filePath) {
  const CELL_SEPARATOR = /[;,\t]/
  const NAME_PATTERN = /^[A-Za-zÀ-ÖØ-öø-ÿ'\-\s]{2,}$/

  let raw
  try {
    raw = fs.readFileSync(filePath, 'utf8')
  } catch (err) {
    // Best effort: a broken directory file must not break OCR, but say why it was skipped.
    console.warn(`[nameDirectory] unable to read ${filePath}: ${err?.message || err}`)
    return []
  }

  const names = []
  for (const line of raw.split(/\r?\n/)) {
    for (const cell of line.split(CELL_SEPARATOR)) {
      // CSV exports frequently quote every field; unwrap "Jean" -> Jean before validating.
      const value = cell.trim().replace(/^"(.*)"$/, '$1').trim()
      if (NAME_PATTERN.test(value)) names.push(value)
    }
  }
  return names
}
|
||||
|
||||
/**
 * Builds the in-memory first-name and last-name sets from the files of a directory.
 *
 * Each regular file is classified from its file name:
 *  - first names when the name contains `first`, `prenom` or `given`;
 *  - last names when it contains `last`, `nom`, `surname` or `family`.
 * The word `prenom` itself does not count as a `nom` marker, so `prenoms_fr.csv`
 * feeds the first-name set only. File names are matched without accents, so
 * `prénoms_fr.csv` behaves like `prenoms_fr.csv`.
 *
 * Every name is stored accent-stripped and lower-cased.
 *
 * @param {string} [baseDir] - Directory to scan. Defaults to `data/names` next to this module.
 * @returns {{ firstNames: Set<string>, lastNames: Set<string> }} Possibly empty sets; never throws.
 */
function buildNameSets(baseDir = path.join(__dirname, 'data', 'names')) {
  const FIRST_NAME_FILE = /first|prenom|given/i
  const LAST_NAME_FILE = /last|nom|surname|family/i
  const stripAccents = (value) => value.normalize('NFD').replace(/[\u0300-\u036f]/g, '')

  const firstNames = new Set()
  const lastNames = new Set()

  let fileNames
  try {
    if (!fs.existsSync(baseDir)) return { firstNames, lastNames }
    fileNames = fs.readdirSync(baseDir)
  } catch (err) {
    console.warn(`[nameDirectory] unable to list ${baseDir}: ${err?.message || err}`)
    return { firstNames, lastNames }
  }

  for (const fileName of fileNames) {
    const plainName = stripAccents(fileName)
    const isFirst = FIRST_NAME_FILE.test(plainName)
    // "prenom" contains "nom": drop it before looking for last-name markers.
    const isLast = LAST_NAME_FILE.test(plainName.replace(/prenom/gi, ''))
    if (!isFirst && !isLast) continue

    const filePath = path.join(baseDir, fileName)
    let names
    try {
      if (!fs.statSync(filePath).isFile()) continue
      names = loadCsvNames(filePath)
    } catch (err) {
      // One unreadable entry must not discard the files that did load.
      console.warn(`[nameDirectory] skipping ${filePath}: ${err?.message || err}`)
      continue
    }

    for (const name of names) {
      const key = stripAccents(name).toLowerCase()
      if (isFirst) firstNames.add(key)
      if (isLast) lastNames.add(key)
    }
  }

  return { firstNames, lastNames }
}
|
||||
|
||||
// Memoized result of buildNameSets(); null until the directory is first requested.
let cache = null

/**
 * Returns the name directory, building it on first use and reusing it afterwards.
 *
 * @param {boolean} [forceReload=false] - When true, rebuilds the directory even if one
 *   is already cached (e.g. after name files were added on disk).
 * @returns {{ firstNames: Set<string>, lastNames: Set<string> }} The shared directory object.
 */
function getNameDirectory(forceReload = false) {
  if (forceReload || !cache) {
    cache = buildNameSets()
  }
  return cache
}
|
||||
|
||||
/**
 * Computes a confidence bonus for an extracted identity based on the name directory.
 *
 * Both inputs are compared accent-stripped, lower-cased and trimmed, matching how
 * directory entries are stored. Each hit (first name, last name) is worth 0.05.
 *
 * @param {string} [firstName] - Extracted given name; non-strings count as "no name".
 * @param {string} [lastName] - Extracted family name; non-strings count as "no name".
 * @returns {number} 0, 0.05 or 0.1. Returns 0 if the directory cannot be obtained; never throws.
 */
function nameConfidenceBoost(firstName, lastName) {
  const BOOST_PER_MATCH = 0.05
  const toKey = (value) =>
    typeof value === 'string'
      ? value.normalize('NFD').replace(/[\u0300-\u036f]/g, '').toLowerCase().trim()
      : ''

  try {
    const { firstNames, lastNames } = getNameDirectory()
    const givenKey = toKey(firstName)
    const familyKey = toKey(lastName)

    let boost = 0
    if (givenKey && firstNames.has(givenKey)) boost += BOOST_PER_MATCH
    if (familyKey && lastNames.has(familyKey)) boost += BOOST_PER_MATCH
    return boost
  } catch {
    // The boost is optional: any directory failure simply means "no bonus".
    return 0
  }
}
|
||||
|
||||
module.exports = { getNameDirectory, nameConfidenceBoost }
|
||||
|
||||
|
||||
@ -13,6 +13,7 @@ const fs = require('fs')
|
||||
const crypto = require('crypto')
|
||||
const { createWorker } = require('tesseract.js')
|
||||
const { preprocessImageForOCR, analyzeImageMetadata } = require('./imagePreprocessing')
|
||||
const { nameConfidenceBoost } = require('./nameDirectory')
|
||||
const pdf = require('pdf-parse')
|
||||
|
||||
const app = express()
|
||||
@ -923,13 +924,20 @@ function generateStandardJSON(documentInfo, ocrResult, entities, processingTime)
|
||||
|
||||
// Calcul de la confiance globale
|
||||
const baseConfidence = typeof ocrResult.confidence === 'number' ? ocrResult.confidence : 0
|
||||
const dirBoost = (() => {
|
||||
try {
|
||||
const id0 = (Array.isArray(identities) && identities.length > 0) ? identities[0] : null
|
||||
return id0 ? nameConfidenceBoost(id0.firstName, id0.lastName) : 0
|
||||
} catch { return 0 }
|
||||
})()
|
||||
const globalConfidence = Math.min(
|
||||
95,
|
||||
Math.max(
|
||||
60,
|
||||
baseConfidence * 0.8 +
|
||||
(identities.length > 0 ? 10 : 0) +
|
||||
(cniNumbers.length > 0 ? 15 : 0),
|
||||
(cniNumbers.length > 0 ? 15 : 0) +
|
||||
Math.round(dirBoost * 100),
|
||||
),
|
||||
)
|
||||
|
||||
@ -1054,10 +1062,51 @@ function generateStandardJSON(documentInfo, ocrResult, entities, processingTime)
|
||||
? ['Aucune signature détectée']
|
||||
: [],
|
||||
timestamp: timestamp,
|
||||
suggestions: computeQualitySuggestions({
|
||||
documentType,
|
||||
confidence: (typeof ocrResult.confidence === 'number' ? ocrResult.confidence : 0) / 100,
|
||||
identities: safeEntities.identities,
|
||||
addresses: safeEntities.addresses,
|
||||
}),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Détermine des recommandations de qualité (remplacement/confirmation)
|
||||
/**
 * Derives quality recommendations (re-upload / address confirmation) for a processed document.
 *
 * Rules:
 *  - a numeric `confidence` below 0.75 asks for a re-upload;
 *  - a document typed `CNI` (case-insensitive) with no identity asks for a re-upload;
 *  - when at least one address was extracted, the first one is proposed for confirmation.
 *
 * @param {object} ctx
 * @param {string} [ctx.documentType] - Detected document type.
 * @param {number} [ctx.confidence] - Confidence compared against the 0.75 threshold.
 * @param {Array} [ctx.identities] - Extracted identities.
 * @param {Array} [ctx.addresses] - Extracted addresses.
 * @returns {{ needsReupload: boolean, reasons: string[], needsAddressConfirmation: boolean, detectedAddress: * }}
 *   Never throws; a missing or unusable `ctx` yields the neutral result.
 */
function computeQualitySuggestions(ctx) {
  const MIN_CONFIDENCE = 0.75
  const neutral = () => ({
    needsReupload: false,
    reasons: [],
    needsAddressConfirmation: false,
    detectedAddress: null,
  })

  try {
    const { documentType, confidence, identities, addresses } = ctx ?? {}
    const suggestions = neutral()

    // Re-upload criterion 1: low OCR confidence.
    if (typeof confidence === 'number' && confidence < MIN_CONFIDENCE) {
      suggestions.needsReupload = true
      suggestions.reasons.push('Confiance OCR faible')
    }

    // Re-upload criterion 2: identity card without any recognized name.
    // Only string types are inspected so an unexpected value cannot throw and
    // wipe out the low-confidence finding above.
    const isIdCard = typeof documentType === 'string' && documentType.toUpperCase() === 'CNI'
    const identityCount = Array.isArray(identities) ? identities.length : 0
    if (isIdCard && identityCount === 0) {
      suggestions.needsReupload = true
      suggestions.reasons.push('Nom/Prénom non reconnus')
    }

    // Ask the user to confirm the first detected address, if any.
    if (Array.isArray(addresses) && addresses.length > 0) {
      suggestions.needsAddressConfirmation = true
      suggestions.detectedAddress = addresses[0]
    }

    return suggestions
  } catch {
    return neutral()
  }
}
|
||||
|
||||
// Fonction pour déterminer le sous-type de document
|
||||
function getDocumentSubType(documentType, text) {
|
||||
if (documentType === 'Facture') {
|
||||
@ -1192,12 +1241,14 @@ function extractEntitiesFromText(text) {
|
||||
try {
|
||||
const t = correctedText.replace(/\u200B|\u200E|\u200F/g, '')
|
||||
// MRZ de CNI (deux ou trois lignes, séparateur << : NOM<<PRENOMS)
|
||||
// Scanner le texte complet (sans filtrage par lignes) et normaliser
|
||||
// Normaliser (suppression accents) et mettre en majuscules
|
||||
const mrzText = t
|
||||
.normalize('NFD')
|
||||
.replace(/[\u0300-\u036f]/g, '')
|
||||
.toUpperCase()
|
||||
const mrzRegex = /([A-Z]{2,})<<([A-Z<]{2,})/g
|
||||
|
||||
// Recherche stricte NOM<<PRENOMS (MRZ)
|
||||
const mrzRegex = /\b([A-Z]{2,})(?:<+)([A-Z<]{2,})\b/g
|
||||
let match
|
||||
while ((match = mrzRegex.exec(mrzText)) !== null) {
|
||||
const rawSurname = match[1].replace(/</g, ' ').trim()
|
||||
@ -1218,9 +1269,9 @@ function extractEntitiesFromText(text) {
|
||||
|
||||
// Repli: si pas d'identité MRZ extraite, tenter reconstruction NOM/PRÉNOM séparés
|
||||
if (!(Array.isArray(entities.identities) && entities.identities.some((i)=> (i.source||'').toLowerCase()==='mrz'))) {
|
||||
// Chercher NOM après IDFRA (ex: IDFRACANTU<<<<...)
|
||||
// Chercher NOM après IDFRA (ex: IDFRA CANTU<<<<...)
|
||||
const mSurname = mrzText.match(/IDFRA\s*([A-Z]{2,})</)
|
||||
// Chercher PRENOM en premier avant << (ex: NICOLAS<<FRANCS)
|
||||
// Chercher PRENOM avant << (ex: NICOLAS<<...)
|
||||
const mGiven = mrzText.match(/\b([A-Z]{2,})<<[A-Z<]{2,}\b/)
|
||||
const last = mSurname?.[1]?.trim()
|
||||
const first = mGiven?.[1]?.trim()
|
||||
@ -1239,7 +1290,7 @@ function extractEntitiesFromText(text) {
|
||||
// Libellés français typiques de CNI
|
||||
// NOM : XXXX PRENOM(S) : YYYYY
|
||||
const labelName = t.match(/\bNOM\s*[:\-]?\s*([A-ZÀ-ÖØ-Þ\-\s]{2,})/i)
|
||||
const labelGiven = t.match(/\bPRÉ?NOM\S*\s*[:\-]?\s*([A-Za-zÀ-ÖØ-öø-ÿ'\-\s]{2,})/i)
|
||||
const labelGiven = t.match(/\bPR[ÉE]?NOM\S*\s*[:\-]?\s*([A-Za-zÀ-ÖØ-öø-ÿ'\-\s]{2,})/i)
|
||||
if (labelName || labelGiven) {
|
||||
const last = (labelName?.[1] || '').replace(/[^A-Za-zÀ-ÖØ-Þ'\-\s]/g, '').trim()
|
||||
const first = (labelGiven?.[1] || '').replace(/[^A-Za-zÀ-ÖØ-öø-ÿ'\-\s]/g, '').trim()
|
||||
@ -1312,25 +1363,26 @@ function extractEntitiesFromText(text) {
|
||||
// Extraction des adresses
|
||||
const addressPatterns = [
|
||||
// 10 rue Exemple, 75001 Paris
|
||||
/(\d{1,4})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+?),\s*(\d{2}\s?\d{3})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+)/gi,
|
||||
/(\d{1,4})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+?),\s*(\d{5})\s+([A-Za-zÀ-ÖØ-öø-ÿ\-\s]{2,})(?:\b(France)\b)?/gi,
|
||||
// demeurant 10 rue Exemple, 75001 Paris
|
||||
/demeurant\s+(\d{1,4})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+?),\s*(\d{2}\s?\d{3})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+)/gi,
|
||||
/demeurant\s+(\d{1,4})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+?),\s*(\d{5})\s+([A-Za-zÀ-ÖØ-öø-ÿ\-\s]{2,})(?:\b(France)\b)?/gi,
|
||||
// Adresse: 10 rue Exemple, 75001 Paris
|
||||
/(Adresse|Siège|Adresse de facturation)\s*:\s*(\d{1,4}\s+[A-Za-zÀ-ÖØ-öø-ÿ\s\-']+,\s*\d{2}\s?\d{3}\s+[A-Za-zÀ-ÖØ-öø-ÿ\s\-']+)/gi,
|
||||
/(Adresse|Siège|Adresse de facturation)\s*:\s*(\d{1,4}\s+[A-Za-zÀ-ÖØ-öø-ÿ\s\-']+,\s*\d{5}\s+[A-Za-zÀ-ÖØ-öø-ÿ\-\s]{2,})(?:\b(France)\b)?/gi,
|
||||
]
|
||||
|
||||
addressPatterns.forEach((pattern) => {
|
||||
for (const match of text.matchAll(pattern)) {
|
||||
const street = match[2] || match[1]
|
||||
const city = match[4] || match[3]
|
||||
const postalCode = (match[3] || match[2] || '').replace(/\s+/g, '')
|
||||
const postalCode = (match[3] || '').replace(/\s+/g, '')
|
||||
const city = (match[4] || '').replace(/\s+France$/i, '').trim()
|
||||
const country = (match[5] || 'France').trim()
|
||||
|
||||
entities.addresses.push({
|
||||
id: `address-${(Array.isArray(entities.addresses)?entities.addresses:[]).length}`,
|
||||
street: street ? `${street}`.trim() : '',
|
||||
city: city ? city.trim() : '',
|
||||
postalCode: postalCode ? postalCode.trim() : '',
|
||||
country: 'France',
|
||||
city: city,
|
||||
postalCode: postalCode,
|
||||
country: country || 'France',
|
||||
confidence: 0.9,
|
||||
source: 'rule-based',
|
||||
})
|
||||
@ -2005,6 +2057,50 @@ app.get('/api/folders/:folderHash/files/:fileHash', (req, res) => {
|
||||
}
|
||||
})
|
||||
|
||||
// Confirmer (ou corriger) l'adresse détectée pour un fichier
|
||||
/**
 * POST /api/folders/:folderHash/files/:fileHash/confirm-address
 *
 * Confirms (and optionally corrects) the address detected for a processed file.
 * Body: `{ confirmed: boolean, address?: { street, city, postalCode, country } }`.
 *
 * Responses:
 *  - 400 when an identifier is not a plain path segment, or when `confirmed` is falsy;
 *  - 404 when no cached result exists for the file;
 *  - 200 `{ success: true }` once the cached JSON has been updated;
 *  - 500 on any unexpected error.
 */
app.post('/api/folders/:folderHash/files/:fileHash/confirm-address', express.json(), (req, res) => {
  // Route params are URL-decoded, so "..%2F.." would arrive as "../.." and escape the
  // cache directory. Accept only single, non-special path segments.
  const isSafeSegment = (value) =>
    typeof value === 'string' &&
    value.length > 0 &&
    value !== '.' &&
    value !== '..' &&
    !/[\\/\0]/.test(value)

  try {
    const { folderHash, fileHash } = req.params
    const { address, confirmed } = req.body || {}

    if (!isSafeSegment(folderHash) || !isSafeSegment(fileHash)) {
      return res
        .status(400)
        .json({ success: false, error: 'Identifiant de dossier ou de fichier invalide' })
    }

    const jsonPath = path.join('cache', folderHash, `${fileHash}.json`)
    if (!fs.existsSync(jsonPath)) {
      return res.status(404).json({ success: false, error: 'Résultat non trouvé' })
    }

    if (!confirmed) {
      return res.status(400).json({ success: false, error: 'Paramètre confirmed manquant' })
    }

    const data = JSON.parse(fs.readFileSync(jsonPath, 'utf8'))

    // Make sure the nested containers exist before writing into them.
    if (!data.extraction) data.extraction = {}
    if (!data.extraction.entities) data.extraction.entities = {}
    const entities = data.extraction.entities
    if (!Array.isArray(entities.addresses)) entities.addresses = []

    // When a corrected address is supplied it replaces the first detected one.
    if (address && typeof address === 'object') {
      entities.addresses[0] = {
        id: entities.addresses[0]?.id || `address-0`,
        type: 'general',
        street: address.street || '',
        city: address.city || '',
        postalCode: address.postalCode || '',
        country: address.country || 'France',
        confidence: 1.0,
        source: 'confirmed',
      }
    }

    // Record the confirmation and clear the pending suggestion.
    if (!data.metadata) data.metadata = {}
    if (!data.metadata.confirmations) data.metadata.confirmations = {}
    data.metadata.confirmations.addressConfirmed = true
    if (data.status && data.status.suggestions) {
      data.status.suggestions.needsAddressConfirmation = false
    }

    fs.writeFileSync(jsonPath, JSON.stringify(data, null, 2))
    return res.json({ success: true })
  } catch (e) {
    return res.status(500).json({ success: false, error: e?.message || String(e) })
  }
})
|
||||
|
||||
// Suppression d'un fichier d'un dossier (uploads + cache)
|
||||
app.delete('/api/folders/:folderHash/files/:fileHash', (req, res) => {
|
||||
try {
|
||||
|
||||
14
docs/annuaire_noms.md
Normal file
14
docs/annuaire_noms.md
Normal file
@ -0,0 +1,14 @@
|
||||
## Annuaire de noms (FR et autres)
|
||||
|
||||
- Emplacement: `backend/data/names/`
|
||||
- Fichiers CSV/TSV/texte séparés par `,` `;` ou `\t`.
|
||||
- Noms de fichiers:
|
||||
- `firstnames_fr.csv`, `firstnames_en.csv`, `prenoms_fr.csv`, etc.
|
||||
- `lastnames_fr.csv`, `surnames_en.csv`, `noms_fr.csv`, etc.
|
||||
- Chargement: automatique au premier appel de `getNameDirectory()` (résultat ensuite mis en cache); normalisation sans accents et en minuscules; ensembles en mémoire.
|
||||
- Usage: léger boost de confiance si prénom/nom détectés appartiennent à l’annuaire.
|
||||
- Extension multi-langues: ajouter des fichiers `firstnames_<lang>.csv` et `lastnames_<lang>.csv`.
|
||||
|
||||
Impact:
|
||||
- Le score `globalConfidence` est augmenté de +5 points pour un prénom connu et de +5 points pour un nom connu (max +10 points), avant le bornage final du score entre 60 et 95.
|
||||
- Améliore la décision de re-upload (moins de faux négatifs si noms valides).
|
||||
Loading…
x
Reference in New Issue
Block a user