feat(ocr): boost confiance avec annuaire de noms (FR+multi-lang) + docs
This commit is contained in:
parent
1118bbbf5d
commit
a3501def35
66
backend/nameDirectory.js
Normal file
66
backend/nameDirectory.js
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
const fs = require('fs')
|
||||||
|
const path = require('path')
|
||||||
|
|
||||||
|
/**
 * Reads a delimited text file and returns every field that looks like a personal name.
 *
 * Lines are split on LF/CRLF and fields on `,`, `;` or a tab. A field is kept when,
 * after trimming and removing one pair of enclosing double quotes, it is made of at
 * least two characters taken from Latin letters (including the accented Latin-1
 * range), apostrophes, hyphens and whitespace.
 *
 * @param {string} filePath - Path of the file to read (decoded as UTF-8).
 * @returns {string[]} Accepted names in file order (duplicates are kept); an empty
 *   array when the file cannot be read.
 */
function loadCsvNames(filePath) {
  let raw
  try {
    raw = fs.readFileSync(filePath, 'utf8')
  } catch (err) {
    // Best-effort: an unreadable list must not break the caller, but leave a trace.
    console.warn(`[nameDirectory] Cannot read ${filePath}: ${err?.message || err}`)
    return []
  }

  const namePattern = /^[A-Za-zÀ-ÖØ-öø-ÿ'\-\s]{2,}$/
  const names = []
  for (const line of raw.split(/\r?\n/)) {
    for (const field of line.split(/[;,\t]/)) {
      // Spreadsheet exports often wrap every value in double quotes; unwrap them
      // first, otherwise the quotes make the whole field fail the name pattern.
      const value = field.trim().replace(/^"(.*)"$/, '$1').trim()
      if (namePattern.test(value)) names.push(value)
    }
  }
  return names
}
|
||||||
|
|
||||||
|
/**
 * Builds the first-name and last-name lookup sets from a directory of name lists.
 *
 * Every regular file directly inside `baseDir` is read with `loadCsvNames`. The file
 * name decides where its entries go: names containing `first`, `prenom` or `given`
 * feed `firstNames`; names containing `last`, `surname`, `family` or `nom` feed
 * `lastNames`. A file matching both groups feeds both sets; a file matching neither
 * is ignored. Entries are stored accent-stripped and lower-cased.
 *
 * @param {string} [baseDir] - Directory to scan; defaults to `data/names` next to this module.
 * @returns {{firstNames: Set<string>, lastNames: Set<string>}} The two sets (empty when
 *   the directory is missing or unreadable).
 */
function buildNameSets(baseDir = path.join(__dirname, 'data', 'names')) {
  const firstNames = new Set()
  const lastNames = new Set()
  const stripAccents = (s) => s.normalize('NFD').replace(/[\u0300-\u036f]/g, '')

  let files
  try {
    if (!fs.existsSync(baseDir)) return { firstNames, lastNames }
    files = fs.readdirSync(baseDir)
  } catch (err) {
    console.warn(`[nameDirectory] Cannot list ${baseDir}: ${err?.message || err}`)
    return { firstNames, lastNames }
  }

  for (const f of files) {
    // Handle failures per file so one bad entry (e.g. a dangling symlink) does not
    // discard the lists that follow it.
    try {
      const fp = path.join(baseDir, f)
      if (!fs.statSync(fp).isFile()) continue

      // Fold accents in the file name too, so "prénoms.csv" is recognised.
      const label = stripAccents(f)
      const isFirst = /first|prenom|given/i.test(label)
      // "prenom" contains "nom": the lookbehind keeps first-name files such as
      // prenoms_fr.csv from also being loaded as last names.
      const isLast = /last|surname|family|(?<!pre)nom/i.test(label)
      if (!isFirst && !isLast) continue

      for (const n of loadCsvNames(fp)) {
        const norm = stripAccents(n).toLowerCase()
        if (isFirst) firstNames.add(norm)
        if (isLast) lastNames.add(norm)
      }
    } catch (err) {
      console.warn(`[nameDirectory] Skipping ${f}: ${err?.message || err}`)
    }
  }
  return { firstNames, lastNames }
}
|
||||||
|
|
||||||
|
// Holds the directory once it has been built; null until the first lookup.
let cache = null

/**
 * Returns the name directory, building it on the first call and reusing the
 * same object on every later call.
 *
 * @returns {*} Whatever `buildNameSets()` produced on the first call.
 */
function getNameDirectory() {
  if (cache) return cache
  cache = buildNameSets()
  return cache
}
|
||||||
|
|
||||||
|
/**
 * Computes a small confidence bonus when the extracted names are known to the
 * name directory.
 *
 * Both inputs are accent-stripped, lower-cased and trimmed before lookup. Each field
 * is evaluated independently: a missing or non-string value contributes nothing but
 * does not cancel the bonus earned by the other field.
 *
 * @param {string} [firstName] - Extracted first name.
 * @param {string} [lastName] - Extracted last name.
 * @returns {number} 0, 0.05 or 0.1 (0.05 per recognised field); 0 when the directory
 *   cannot be obtained.
 */
function nameConfidenceBoost(firstName, lastName) {
  // OCR output often carries stray leading/trailing whitespace, hence the trim.
  const fold = (value) =>
    typeof value === 'string'
      ? value.normalize('NFD').replace(/[\u0300-\u036f]/g, '').toLowerCase().trim()
      : ''

  try {
    const dir = getNameDirectory()
    const f = fold(firstName)
    const l = fold(lastName)
    let boost = 0
    if (f && dir.firstNames.has(f)) boost += 0.05
    if (l && dir.lastNames.has(l)) boost += 0.05
    return boost
  } catch {
    // The bonus is optional: a directory failure must never break scoring.
    return 0
  }
}
|
||||||
|
|
||||||
|
module.exports = { getNameDirectory, nameConfidenceBoost }
|
||||||
|
|
||||||
|
|
||||||
@ -13,6 +13,7 @@ const fs = require('fs')
|
|||||||
const crypto = require('crypto')
|
const crypto = require('crypto')
|
||||||
const { createWorker } = require('tesseract.js')
|
const { createWorker } = require('tesseract.js')
|
||||||
const { preprocessImageForOCR, analyzeImageMetadata } = require('./imagePreprocessing')
|
const { preprocessImageForOCR, analyzeImageMetadata } = require('./imagePreprocessing')
|
||||||
|
const { nameConfidenceBoost } = require('./nameDirectory')
|
||||||
const pdf = require('pdf-parse')
|
const pdf = require('pdf-parse')
|
||||||
|
|
||||||
const app = express()
|
const app = express()
|
||||||
@ -923,13 +924,20 @@ function generateStandardJSON(documentInfo, ocrResult, entities, processingTime)
|
|||||||
|
|
||||||
// Calcul de la confiance globale
|
// Calcul de la confiance globale
|
||||||
const baseConfidence = typeof ocrResult.confidence === 'number' ? ocrResult.confidence : 0
|
const baseConfidence = typeof ocrResult.confidence === 'number' ? ocrResult.confidence : 0
|
||||||
|
const dirBoost = (() => {
|
||||||
|
try {
|
||||||
|
const id0 = (Array.isArray(identities) && identities.length > 0) ? identities[0] : null
|
||||||
|
return id0 ? nameConfidenceBoost(id0.firstName, id0.lastName) : 0
|
||||||
|
} catch { return 0 }
|
||||||
|
})()
|
||||||
const globalConfidence = Math.min(
|
const globalConfidence = Math.min(
|
||||||
95,
|
95,
|
||||||
Math.max(
|
Math.max(
|
||||||
60,
|
60,
|
||||||
baseConfidence * 0.8 +
|
baseConfidence * 0.8 +
|
||||||
(identities.length > 0 ? 10 : 0) +
|
(identities.length > 0 ? 10 : 0) +
|
||||||
(cniNumbers.length > 0 ? 15 : 0),
|
(cniNumbers.length > 0 ? 15 : 0) +
|
||||||
|
Math.round(dirBoost * 100),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1054,10 +1062,51 @@ function generateStandardJSON(documentInfo, ocrResult, entities, processingTime)
|
|||||||
? ['Aucune signature détectée']
|
? ['Aucune signature détectée']
|
||||||
: [],
|
: [],
|
||||||
timestamp: timestamp,
|
timestamp: timestamp,
|
||||||
|
suggestions: computeQualitySuggestions({
|
||||||
|
documentType,
|
||||||
|
confidence: (typeof ocrResult.confidence === 'number' ? ocrResult.confidence : 0) / 100,
|
||||||
|
identities: safeEntities.identities,
|
||||||
|
addresses: safeEntities.addresses,
|
||||||
|
}),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Derives quality recommendations (re-upload / address confirmation) for a document.
 *
 * @param {object} ctx - Evaluation context.
 * @param {string} [ctx.documentType] - Document type; compared case-insensitively to 'CNI'.
 * @param {number} [ctx.confidence] - Confidence compared against the 0.75 threshold.
 * @param {Array} [ctx.identities] - Extracted identities.
 * @param {Array} [ctx.addresses] - Extracted addresses.
 * @returns {{needsReupload: boolean, reasons: string[], needsAddressConfirmation: boolean, detectedAddress: *}}
 *   The recommendations; the all-negative default if anything throws while evaluating.
 */
function computeQualitySuggestions(ctx) {
  try {
    const reasons = []

    // Re-upload criterion 1: confidence below the threshold.
    const lowConfidence = typeof ctx.confidence === 'number' && ctx.confidence < 0.75
    if (lowConfidence) reasons.push('Confiance OCR faible')

    // Re-upload criterion 2: a CNI for which no identity was extracted.
    const isCni = (ctx.documentType || '').toUpperCase() === 'CNI'
    const identityCount = Array.isArray(ctx.identities) ? ctx.identities.length : 0
    if (isCni && identityCount === 0) reasons.push('Nom/Prénom non reconnus')

    // Ask for address confirmation whenever at least one address was detected.
    const hasAddress = Array.isArray(ctx.addresses) && ctx.addresses.length > 0

    return {
      needsReupload: reasons.length > 0,
      reasons,
      needsAddressConfirmation: hasAddress,
      detectedAddress: hasAddress ? ctx.addresses[0] : null,
    }
  } catch {
    return { needsReupload: false, reasons: [], needsAddressConfirmation: false, detectedAddress: null }
  }
}
|
||||||
|
|
||||||
// Fonction pour déterminer le sous-type de document
|
// Fonction pour déterminer le sous-type de document
|
||||||
function getDocumentSubType(documentType, text) {
|
function getDocumentSubType(documentType, text) {
|
||||||
if (documentType === 'Facture') {
|
if (documentType === 'Facture') {
|
||||||
@ -1192,12 +1241,14 @@ function extractEntitiesFromText(text) {
|
|||||||
try {
|
try {
|
||||||
const t = correctedText.replace(/\u200B|\u200E|\u200F/g, '')
|
const t = correctedText.replace(/\u200B|\u200E|\u200F/g, '')
|
||||||
// MRZ de CNI (deux ou trois lignes, séparateur << : NOM<<PRENOMS)
|
// MRZ de CNI (deux ou trois lignes, séparateur << : NOM<<PRENOMS)
|
||||||
// Scanner le texte complet (sans filtrage par lignes) et normaliser
|
// Normaliser (suppression accents) et mettre en majuscules
|
||||||
const mrzText = t
|
const mrzText = t
|
||||||
.normalize('NFD')
|
.normalize('NFD')
|
||||||
.replace(/[\u0300-\u036f]/g, '')
|
.replace(/[\u0300-\u036f]/g, '')
|
||||||
.toUpperCase()
|
.toUpperCase()
|
||||||
const mrzRegex = /([A-Z]{2,})<<([A-Z<]{2,})/g
|
|
||||||
|
// Recherche stricte NOM<<PRENOMS (MRZ)
|
||||||
|
const mrzRegex = /\b([A-Z]{2,})(?:<+)([A-Z<]{2,})\b/g
|
||||||
let match
|
let match
|
||||||
while ((match = mrzRegex.exec(mrzText)) !== null) {
|
while ((match = mrzRegex.exec(mrzText)) !== null) {
|
||||||
const rawSurname = match[1].replace(/</g, ' ').trim()
|
const rawSurname = match[1].replace(/</g, ' ').trim()
|
||||||
@ -1218,9 +1269,9 @@ function extractEntitiesFromText(text) {
|
|||||||
|
|
||||||
// Repli: si pas d'identité MRZ extraite, tenter reconstruction NOM/PRÉNOM séparés
|
// Repli: si pas d'identité MRZ extraite, tenter reconstruction NOM/PRÉNOM séparés
|
||||||
if (!(Array.isArray(entities.identities) && entities.identities.some((i)=> (i.source||'').toLowerCase()==='mrz'))) {
|
if (!(Array.isArray(entities.identities) && entities.identities.some((i)=> (i.source||'').toLowerCase()==='mrz'))) {
|
||||||
// Chercher NOM après IDFRA (ex: IDFRACANTU<<<<...)
|
// Chercher NOM après IDFRA (ex: IDFRA CANTU<<<<...)
|
||||||
const mSurname = mrzText.match(/IDFRA\s*([A-Z]{2,})</)
|
const mSurname = mrzText.match(/IDFRA\s*([A-Z]{2,})</)
|
||||||
// Chercher PRENOM en premier avant << (ex: NICOLAS<<FRANCS)
|
// Chercher PRENOM avant << (ex: NICOLAS<<...)
|
||||||
const mGiven = mrzText.match(/\b([A-Z]{2,})<<[A-Z<]{2,}\b/)
|
const mGiven = mrzText.match(/\b([A-Z]{2,})<<[A-Z<]{2,}\b/)
|
||||||
const last = mSurname?.[1]?.trim()
|
const last = mSurname?.[1]?.trim()
|
||||||
const first = mGiven?.[1]?.trim()
|
const first = mGiven?.[1]?.trim()
|
||||||
@ -1239,7 +1290,7 @@ function extractEntitiesFromText(text) {
|
|||||||
// Libellés français typiques de CNI
|
// Libellés français typiques de CNI
|
||||||
// NOM : XXXX PRENOM(S) : YYYYY
|
// NOM : XXXX PRENOM(S) : YYYYY
|
||||||
const labelName = t.match(/\bNOM\s*[:\-]?\s*([A-ZÀ-ÖØ-Þ\-\s]{2,})/i)
|
const labelName = t.match(/\bNOM\s*[:\-]?\s*([A-ZÀ-ÖØ-Þ\-\s]{2,})/i)
|
||||||
const labelGiven = t.match(/\bPRÉ?NOM\S*\s*[:\-]?\s*([A-Za-zÀ-ÖØ-öø-ÿ'\-\s]{2,})/i)
|
const labelGiven = t.match(/\bPR[ÉE]?NOM\S*\s*[:\-]?\s*([A-Za-zÀ-ÖØ-öø-ÿ'\-\s]{2,})/i)
|
||||||
if (labelName || labelGiven) {
|
if (labelName || labelGiven) {
|
||||||
const last = (labelName?.[1] || '').replace(/[^A-Za-zÀ-ÖØ-Þ'\-\s]/g, '').trim()
|
const last = (labelName?.[1] || '').replace(/[^A-Za-zÀ-ÖØ-Þ'\-\s]/g, '').trim()
|
||||||
const first = (labelGiven?.[1] || '').replace(/[^A-Za-zÀ-ÖØ-öø-ÿ'\-\s]/g, '').trim()
|
const first = (labelGiven?.[1] || '').replace(/[^A-Za-zÀ-ÖØ-öø-ÿ'\-\s]/g, '').trim()
|
||||||
@ -1312,25 +1363,26 @@ function extractEntitiesFromText(text) {
|
|||||||
// Extraction des adresses
|
// Extraction des adresses
|
||||||
const addressPatterns = [
|
const addressPatterns = [
|
||||||
// 10 rue Exemple, 75001 Paris
|
// 10 rue Exemple, 75001 Paris
|
||||||
/(\d{1,4})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+?),\s*(\d{2}\s?\d{3})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+)/gi,
|
/(\d{1,4})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+?),\s*(\d{5})\s+([A-Za-zÀ-ÖØ-öø-ÿ\-\s]{2,})(?:\b(France)\b)?/gi,
|
||||||
// demeurant 10 rue Exemple, 75001 Paris
|
// demeurant 10 rue Exemple, 75001 Paris
|
||||||
/demeurant\s+(\d{1,4})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+?),\s*(\d{2}\s?\d{3})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+)/gi,
|
/demeurant\s+(\d{1,4})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+?),\s*(\d{5})\s+([A-Za-zÀ-ÖØ-öø-ÿ\-\s]{2,})(?:\b(France)\b)?/gi,
|
||||||
// Adresse: 10 rue Exemple, 75001 Paris
|
// Adresse: 10 rue Exemple, 75001 Paris
|
||||||
/(Adresse|Siège|Adresse de facturation)\s*:\s*(\d{1,4}\s+[A-Za-zÀ-ÖØ-öø-ÿ\s\-']+,\s*\d{2}\s?\d{3}\s+[A-Za-zÀ-ÖØ-öø-ÿ\s\-']+)/gi,
|
/(Adresse|Siège|Adresse de facturation)\s*:\s*(\d{1,4}\s+[A-Za-zÀ-ÖØ-öø-ÿ\s\-']+,\s*\d{5}\s+[A-Za-zÀ-ÖØ-öø-ÿ\-\s]{2,})(?:\b(France)\b)?/gi,
|
||||||
]
|
]
|
||||||
|
|
||||||
addressPatterns.forEach((pattern) => {
|
addressPatterns.forEach((pattern) => {
|
||||||
for (const match of text.matchAll(pattern)) {
|
for (const match of text.matchAll(pattern)) {
|
||||||
const street = match[2] || match[1]
|
const street = match[2] || match[1]
|
||||||
const city = match[4] || match[3]
|
const postalCode = (match[3] || '').replace(/\s+/g, '')
|
||||||
const postalCode = (match[3] || match[2] || '').replace(/\s+/g, '')
|
const city = (match[4] || '').replace(/\s+France$/i, '').trim()
|
||||||
|
const country = (match[5] || 'France').trim()
|
||||||
|
|
||||||
entities.addresses.push({
|
entities.addresses.push({
|
||||||
id: `address-${(Array.isArray(entities.addresses)?entities.addresses:[]).length}`,
|
id: `address-${(Array.isArray(entities.addresses)?entities.addresses:[]).length}`,
|
||||||
street: street ? `${street}`.trim() : '',
|
street: street ? `${street}`.trim() : '',
|
||||||
city: city ? city.trim() : '',
|
city: city,
|
||||||
postalCode: postalCode ? postalCode.trim() : '',
|
postalCode: postalCode,
|
||||||
country: 'France',
|
country: country || 'France',
|
||||||
confidence: 0.9,
|
confidence: 0.9,
|
||||||
source: 'rule-based',
|
source: 'rule-based',
|
||||||
})
|
})
|
||||||
@ -2005,6 +2057,50 @@ app.get('/api/folders/:folderHash/files/:fileHash', (req, res) => {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
/**
 * Tells whether a route parameter can be used as a single file-system path segment.
 * Route parameters arrive percent-decoded, so a value such as `..%2F..` reaches the
 * handler as `../..`; anything containing a separator or a dot-segment is refused.
 *
 * @param {*} value - Raw route parameter.
 * @returns {boolean} True when the value is a non-empty string without `/`, `\`, NUL,
 *   and is neither `.` nor `..`.
 */
function isSafeCachePathSegment(value) {
  return (
    typeof value === 'string' &&
    value.length > 0 &&
    value !== '.' &&
    value !== '..' &&
    !/[\\/\0]/.test(value)
  )
}

// Confirm (or correct) the detected address of a file.
// Body: { confirmed: boolean, address?: { street, city, postalCode, country } }.
// Responses: 400 on invalid identifiers or missing `confirmed`, 404 when no cached
// result exists, 500 on unexpected errors, { success: true } otherwise.
app.post('/api/folders/:folderHash/files/:fileHash/confirm-address', express.json(), (req, res) => {
  try {
    const { folderHash, fileHash } = req.params
    const { address, confirmed } = req.body || {}

    // Both identifiers end up in a file path that is read and rewritten: refuse
    // anything that could point outside the cache directory.
    if (!isSafeCachePathSegment(folderHash) || !isSafeCachePathSegment(fileHash)) {
      return res.status(400).json({ success: false, error: 'Identifiant de dossier ou de fichier invalide' })
    }

    const cachePath = path.join('cache', folderHash)
    const jsonPath = path.join(cachePath, `${fileHash}.json`)
    if (!fs.existsSync(jsonPath)) {
      return res.status(404).json({ success: false, error: 'Résultat non trouvé' })
    }
    const raw = fs.readFileSync(jsonPath, 'utf8')
    const data = JSON.parse(raw)

    if (!confirmed) {
      return res.status(400).json({ success: false, error: 'Paramètre confirmed manquant' })
    }

    // Only persist plain text for client-supplied fields (numbers are accepted so a
    // numeric postal code is kept); anything else becomes an empty string.
    const asText = (v) => (typeof v === 'string' || typeof v === 'number' ? String(v).trim() : '')

    // Replace the first address (when a correction is supplied) and record the confirmation.
    if (!data.extraction) data.extraction = {}
    if (!data.extraction.entities) data.extraction.entities = {}
    if (!Array.isArray(data.extraction.entities.addresses)) data.extraction.entities.addresses = []
    if (address && typeof address === 'object') {
      data.extraction.entities.addresses[0] = {
        id: data.extraction.entities.addresses[0]?.id || `address-0`,
        type: 'general',
        street: asText(address.street),
        city: asText(address.city),
        postalCode: asText(address.postalCode),
        country: asText(address.country) || 'France',
        confidence: 1.0,
        source: 'confirmed',
      }
    }
    if (!data.metadata) data.metadata = {}
    if (!data.metadata.confirmations) data.metadata.confirmations = {}
    data.metadata.confirmations.addressConfirmed = true
    if (data.status && data.status.suggestions) {
      data.status.suggestions.needsAddressConfirmation = false
    }
    fs.writeFileSync(jsonPath, JSON.stringify(data, null, 2))
    return res.json({ success: true })
  } catch (e) {
    return res.status(500).json({ success: false, error: e?.message || String(e) })
  }
})
|
||||||
|
|
||||||
// Suppression d'un fichier d'un dossier (uploads + cache)
|
// Suppression d'un fichier d'un dossier (uploads + cache)
|
||||||
app.delete('/api/folders/:folderHash/files/:fileHash', (req, res) => {
|
app.delete('/api/folders/:folderHash/files/:fileHash', (req, res) => {
|
||||||
try {
|
try {
|
||||||
|
|||||||
14
docs/annuaire_noms.md
Normal file
14
docs/annuaire_noms.md
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
## Annuaire de noms (FR et autres)
|
||||||
|
|
||||||
|
- Emplacement: `backend/data/names/`
|
||||||
|
- Fichiers CSV/TSV/texte séparés par `,` `;` ou `\t`.
|
||||||
|
- Noms de fichiers:
|
||||||
|
- `firstnames_fr.csv`, `firstnames_en.csv`, `prenoms_fr.csv`, etc.
|
||||||
|
- `lastnames_fr.csv`, `surnames_en.csv`, `noms_fr.csv`, etc.
|
||||||
|
- Chargement: automatique au premier appel (résultat ensuite conservé en mémoire); normalisation sans accents et en minuscules.
|
||||||
|
- Usage: léger boost de confiance si prénom/nom détectés appartiennent à l’annuaire.
|
||||||
|
- Extension multi-langues: ajouter des fichiers `firstnames_<lang>.csv` et `lastnames_<lang>.csv`.
|
||||||
|
|
||||||
|
Impact:
|
||||||
|
- Le score `globalConfidence` (échelle 0–100) est augmenté de +5 points pour un prénom connu et de +5 points pour un nom connu (max +10), avant bornage dans l'intervalle [60, 95].
|
||||||
|
- Améliore la décision de re-upload (moins de faux négatifs si noms valides).
|
||||||
Loading…
x
Reference in New Issue
Block a user