4NK_IA_front/backend/server.js
Nicolas Cantu a5a0421b32 backend
2025-09-16 01:04:57 +02:00

741 lines
24 KiB
JavaScript

#!/usr/bin/env node
/**
* Serveur backend pour le traitement des documents
* Gère l'OCR, l'extraction NER et renvoie du JSON au frontend
*/
const express = require('express')
const multer = require('multer')
const cors = require('cors')
const path = require('path')
const fs = require('fs')
const { createWorker } = require('tesseract.js')
const { preprocessImageForOCR, analyzeImageMetadata } = require('./imagePreprocessing')
const pdf = require('pdf-parse')
// Express application on which every route of this file is registered
const app = express()
// Listening port: the PORT environment variable when set, otherwise 3001
const PORT = process.env.PORT || 3001
// Global middleware: CORS, JSON request bodies, and static files served from the 'public' directory
app.use(cors())
app.use(express.json())
app.use(express.static('public'))
// Multer disk storage: decides where uploaded files are written and how they are named
const storage = multer.diskStorage({
  // Resolves the upload directory, creating it on demand
  destination: (req, file, cb) => {
    const uploadDir = 'uploads/'
    try {
      // `recursive: true` makes mkdirSync succeed when the directory already exists,
      // so no separate existence check (and no check-then-create race) is needed
      fs.mkdirSync(uploadDir, { recursive: true })
    } catch (error) {
      // Report filesystem failures through multer's callback instead of throwing inside it
      cb(error)
      return
    }
    cb(null, uploadDir)
  },
  // Builds "<field>-<timestamp>-<random><original extension>" to avoid name collisions
  filename: (req, file, cb) => {
    const uniqueSuffix = `${Date.now()}-${Math.round(Math.random() * 1E9)}`
    cb(null, `${file.fieldname}-${uniqueSuffix}${path.extname(file.originalname)}`)
  }
})
// Upload middleware factory: disk storage, size cap and MIME-type allow-list
const upload = multer({
  storage,
  // Uploads larger than 10 MiB are rejected
  limits: { fileSize: 10 * 1024 * 1024 },
  // Accept JPEG, PNG and TIFF images plus PDF documents; refuse everything else
  fileFilter: (req, file, cb) => {
    const isSupported = ['image/jpeg', 'image/png', 'image/tiff', 'application/pdf'].includes(file.mimetype)
    if (!isSupported) {
      cb(new Error('Type de fichier non supporté'), false)
      return
    }
    cb(null, true)
  }
})
/**
 * Extracts the embedded text of a PDF file using pdf-parse.
 *
 * @param {string} pdfPath - Path of the PDF file on disk.
 * @returns {Promise<{text: string, confidence: number, words: string[]}>}
 *   The extracted text, a fixed confidence of 95, and the whitespace-separated words.
 * @throws Rethrows any read or parsing error after logging its message.
 */
async function extractTextFromPdf(pdfPath) {
  console.log(`[PDF] Début de l'extraction de texte pour: ${path.basename(pdfPath)}`)
  try {
    // Asynchronous read: a synchronous read here would block the event loop
    // for every other request while the file is loaded
    const dataBuffer = await fs.promises.readFile(pdfPath)
    const data = await pdf(dataBuffer)
    // Fall back to an empty string so the length/split calls below never hit undefined
    const text = data.text || ''
    console.log(`[PDF] Texte extrait: ${text.length} caractères`)
    console.log(`[PDF] Nombre de pages: ${data.numpages}`)
    return {
      text,
      confidence: 95, // fixed score: no OCR is involved when reading embedded PDF text
      words: text.split(/\s+/).filter(word => word.length > 0)
    }
  } catch (error) {
    console.error(`[PDF] Erreur lors de l'extraction:`, error.message)
    throw error
  }
}
/**
 * Runs OCR on an image: preprocesses it, tries several Tesseract
 * configurations and keeps the result with the highest confidence.
 *
 * @param {string} imagePath - Path of the image file on disk.
 * @returns {Promise<{text: string, confidence: number, words: Array}>}
 *   Best recognised text, its confidence as reported by Tesseract, and the
 *   recognised words. Text is empty and confidence 0 when every strategy failed.
 * @throws Propagates errors from metadata analysis, preprocessing, the temporary
 *   file write, or worker creation. Per-strategy recognition errors are logged
 *   and skipped.
 */
async function extractTextFromImage(imagePath) {
  console.log(`[OCR] Début de l'extraction pour: ${imagePath}`)
  // The result is not used here; the call is kept because the helper lives in
  // ./imagePreprocessing and may log or validate the image (not visible from this file)
  await analyzeImageMetadata(imagePath)
  console.log(`[OCR] Préprocessing de l'image...`)
  const preprocessedBuffer = await preprocessImageForOCR(imagePath, null, {
    width: 2000,
    contrast: 1.5,
    brightness: 1.1,
    grayscale: true,
    sharpen: true,
    denoise: true
  })
  // Build the temporary path from the parsed file name. A regex replacement of the
  // extension leaves the path unchanged when the upload has no extension, in which
  // case the original file would be overwritten and then deleted during cleanup.
  const { dir, name } = path.parse(imagePath)
  const tempPath = path.join(dir, `${name}_preprocessed.png`)
  // NOTE(review): this whitelist contains neither '€', '°' nor the apostrophe, while the
  // invoice/reference regexes elsewhere in this file require '€' and 'N°' — confirm intent
  const charWhitelist = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ.,/-:() '
  // Each strategy is a set of Tesseract parameters; the best-scoring one wins
  const strategies = [
    {
      name: 'Mode Standard',
      params: {
        tessedit_pageseg_mode: '6',
        tessedit_char_whitelist: charWhitelist,
        tessedit_ocr_engine_mode: '1',
        preserve_interword_spaces: '1',
        textord_min_linesize: '2.0',
        textord_min_xheight: '6'
      }
    },
    {
      name: 'Mode Fine',
      params: {
        tessedit_pageseg_mode: '8', // single word
        tessedit_char_whitelist: charWhitelist,
        tessedit_ocr_engine_mode: '1',
        textord_min_linesize: '1.0',
        textord_min_xheight: '4',
        textord_heavy_nr: '0'
      }
    },
    {
      name: 'Mode Ligne',
      params: {
        tessedit_pageseg_mode: '13', // raw text line
        tessedit_char_whitelist: charWhitelist,
        tessedit_ocr_engine_mode: '1',
        textord_min_linesize: '1.5',
        textord_min_xheight: '5'
      }
    }
  ]
  let worker = null
  try {
    // The write and the worker creation are inside the try block so that the
    // finally clause removes the temporary file even when one of them fails
    await fs.promises.writeFile(tempPath, preprocessedBuffer)
    console.log(`[OCR] Image préprocessée sauvegardée: ${tempPath}`)
    worker = await createWorker('fra+eng')
    let bestResult = { text: '', confidence: 0, words: [], strategy: 'none' }
    for (const strategy of strategies) {
      try {
        console.log(`[OCR] Test de la stratégie: ${strategy.name}`)
        await worker.setParameters(strategy.params)
        const { data } = await worker.recognize(tempPath)
        console.log(`[OCR] ${strategy.name} - Confiance: ${data.confidence}%`)
        if (data.confidence > bestResult.confidence) {
          bestResult = {
            text: data.text,
            confidence: data.confidence,
            words: data.words || [],
            strategy: strategy.name
          }
        }
      } catch (error) {
        // A failing strategy must not abort the others
        console.log(`[OCR] Erreur avec ${strategy.name}: ${error.message}`)
      }
    }
    console.log(`[OCR] Meilleur résultat (${bestResult.strategy}) - Confiance: ${bestResult.confidence}%`)
    console.log(`[OCR] Texte extrait (${bestResult.text.length} caractères): ${bestResult.text.substring(0, 200)}...`)
    return {
      text: bestResult.text,
      confidence: bestResult.confidence,
      words: bestResult.words
    }
  } finally {
    if (worker) {
      try {
        await worker.terminate()
      } catch (error) {
        // Best effort: a terminate failure must neither mask the OCR result nor skip the file cleanup
        console.warn(`[OCR] Erreur lors de l'arrêt du worker: ${error.message}`)
      }
    }
    // Temporary file cleanup (best effort)
    try {
      await fs.promises.unlink(tempPath)
      console.log(`[OCR] Fichier temporaire supprimé: ${tempPath}`)
    } catch (error) {
      // ENOENT only means the file was never written; nothing to report
      if (error.code !== 'ENOENT') {
        console.warn(`[OCR] Erreur lors de la suppression du fichier temporaire: ${error.message}`)
      }
    }
  }
}
// Fonction de correction de texte pour améliorer la détection
function correctOCRText(text) {
// Corrections courantes pour les erreurs OCR
const corrections = {
// Corrections générales courantes seulement
'0': 'o', '1': 'l', '5': 's', '@': 'a', '3': 'e'
}
let correctedText = text
for (const [wrong, correct] of Object.entries(corrections)) {
correctedText = correctedText.replace(new RegExp(wrong, 'gi'), correct)
}
return correctedText
}
/**
 * Assembles the standard JSON response returned to the frontend.
 *
 * @param {object} documentInfo - Uploaded file descriptor (originalname, size, mimetype).
 * @param {object} ocrResult - Text extraction result (text, confidence, words).
 * @param {object} entities - Entities produced by extractEntitiesFromText.
 * @param {number} processingTime - Elapsed processing time in milliseconds.
 * @returns {object} Payload with document, classification, extraction, metadata and status sections.
 */
function generateStandardJSON(documentInfo, ocrResult, entities, processingTime) {
  const timestamp = new Date().toISOString()
  const documentId = `doc-${Date.now()}`
  const rawText = ocrResult.text

  // Document classification and the text-derived sections
  const documentType = entities.documentType || 'Document'
  const subType = getDocumentSubType(documentType, rawText)
  const financial = extractFinancialInfo(rawText, documentType)
  const references = extractReferences(rawText, documentType)

  // Global confidence: 80% of the OCR score plus bonuses for detected identities
  // and CNI numbers, clamped to the 60..95 range
  const identityBonus = entities.identities.length > 0 ? 10 : 0
  const cniBonus = entities.cniNumbers.length > 0 ? 15 : 0
  const rawScore = ocrResult.confidence * 0.8 + identityBonus + cniBonus
  const globalConfidence = Math.min(95, Math.max(60, rawScore))

  const isPdf = documentInfo.mimetype === 'application/pdf'

  // Per-entity mappers: normalise optional fields to null or a default value
  const toPerson = (identity) => ({
    id: identity.id,
    type: 'person',
    firstName: identity.firstName,
    lastName: identity.lastName,
    role: identity.role || null,
    email: identity.email || null,
    phone: identity.phone || null,
    confidence: identity.confidence,
    source: identity.source
  })
  const toCompany = (company) => ({
    id: company.id,
    name: company.name,
    legalForm: company.legalForm || null,
    siret: company.siret || null,
    rcs: company.rcs || null,
    tva: company.tva || null,
    capital: company.capital || null,
    role: company.role || null,
    confidence: company.confidence,
    source: company.source
  })
  const toAddress = (address) => ({
    id: address.id,
    type: address.type || 'general',
    street: address.street,
    city: address.city,
    postalCode: address.postalCode,
    country: address.country,
    company: address.company || null,
    confidence: address.confidence,
    source: address.source
  })
  const toDate = (date) => ({
    id: date.id,
    type: date.type || 'general',
    value: date.date || date.value,
    formatted: formatDate(date.date || date.value),
    confidence: date.confidence,
    source: date.source
  })
  const toClause = (clause) => ({
    id: clause.id,
    type: clause.type,
    content: clause.text,
    confidence: clause.confidence
  })
  const toSignature = (signature) => ({
    id: signature.id,
    type: signature.type || 'électronique',
    present: signature.present || false,
    signatory: signature.signatory || null,
    date: signature.date || null,
    confidence: signature.confidence
  })

  const documentSection = {
    id: documentId,
    fileName: documentInfo.originalname,
    fileSize: documentInfo.size,
    mimeType: documentInfo.mimetype,
    uploadTimestamp: timestamp
  }
  const classificationSection = {
    documentType: documentType,
    confidence: globalConfidence / 100,
    subType: subType,
    language: 'fr',
    pageCount: 1
  }
  const extractionSection = {
    text: {
      raw: rawText,
      processed: correctOCRText(rawText),
      wordCount: ocrResult.words.length,
      characterCount: rawText.length,
      confidence: ocrResult.confidence / 100
    },
    entities: {
      persons: entities.identities.map(toPerson),
      companies: entities.companies.map(toCompany),
      addresses: entities.addresses.map(toAddress),
      financial: financial,
      dates: entities.dates.map(toDate),
      contractual: {
        clauses: entities.contractClauses.map(toClause),
        signatures: entities.signatures.map(toSignature)
      },
      references: references
    }
  }
  const metadataSection = {
    processing: {
      engine: '4NK_IA_Backend',
      version: '1.0.0',
      processingTime: `${processingTime}ms`,
      ocrEngine: isPdf ? 'pdf-parse' : 'tesseract.js',
      nerEngine: 'rule-based',
      preprocessing: {
        applied: !isPdf,
        reason: isPdf ? 'PDF direct text extraction' : 'Image preprocessing applied'
      }
    },
    quality: {
      globalConfidence: globalConfidence / 100,
      textExtractionConfidence: ocrResult.confidence / 100,
      entityExtractionConfidence: 0.90,
      classificationConfidence: globalConfidence / 100
    }
  }
  // A warning is raised when no signature entity was found
  const warnings = entities.signatures.length === 0 ? ['Aucune signature détectée'] : []

  return {
    document: documentSection,
    classification: classificationSection,
    extraction: extractionSection,
    metadata: metadataSection,
    status: {
      success: true,
      errors: [],
      warnings: warnings,
      timestamp: timestamp
    }
  }
}
/**
 * Refines a document type into a sub-type based on keywords found in the text.
 *
 * @param {string} documentType - Main type ('Facture', 'CNI', 'Contrat', ...).
 * @param {string} text - Document text searched for keywords (case-insensitive).
 * @returns {string} The sub-type label, or documentType itself when no rule applies.
 */
function getDocumentSubType(documentType, text) {
  const mentionsSale = () => /vente|achat/i.test(text)
  switch (documentType) {
    case 'Facture':
      // Service keywords take precedence over sale keywords
      if (/prestation|service/i.test(text)) return 'Facture de prestation'
      return mentionsSale() ? 'Facture de vente' : 'Facture'
    case 'CNI':
      return 'Carte Nationale d\'Identité'
    case 'Contrat':
      // Sale keywords take precedence over rental keywords
      if (mentionsSale()) return 'Contrat de vente'
      return /location|bail/i.test(text) ? 'Contrat de location' : 'Contrat'
    default:
      return documentType
  }
}
/**
 * Extracts financial information from invoice text.
 *
 * - `amounts`: every strictly positive amount followed by '€', once each, in document order.
 * - `totals`: `ht`, `ttc` and `tva` when a matching "Total H.T." / "Total T.T.C." /
 *   "T.V.A." label is found (dots optional); the first occurrence wins.
 * - `payment.terms`: payment delay when the "paiement se fera (maximum) N jours" wording is present.
 *
 * @param {string} text - Document text.
 * @param {string} documentType - Only 'Facture' triggers an extraction.
 * @returns {{amounts: object[], totals: object, payment: object}} Empty containers for
 *   non-invoice documents or non-string text.
 */
function extractFinancialInfo(text, documentType) {
  const amounts = []
  const totals = {}
  const payment = {}
  if (documentType !== 'Facture' || typeof text !== 'string') {
    return { amounts, totals, payment }
  }
  const parseAmount = (raw) => Number.parseFloat(raw.replace(',', '.'))

  // Only the generic pattern feeds `amounts`. The labelled patterns below always end
  // with the same "<number> €" shape, so also pushing their matches would list every
  // total twice.
  for (const match of text.matchAll(/(\d+(?:[.,]\d{2})?)\s*€/g)) {
    const value = parseAmount(match[1])
    if (value > 0) {
      amounts.push({
        id: `amount-${amounts.length}`,
        type: 'montant',
        value,
        currency: 'EUR',
        confidence: 0.9
      })
    }
  }

  // Labelled totals populate `totals`
  const totalPatterns = [
    ['ht', /Total\s+H\.?T\.?\s*[:\-]?\s*(\d+(?:[.,]\d{2})?)\s*€/i],
    ['ttc', /Total\s+T\.?T\.?C\.?\s*[:\-]?\s*(\d+(?:[.,]\d{2})?)\s*€/i],
    ['tva', /T\.?V\.?A\.?\s*[:\-]?\s*(\d+(?:[.,]\d{2})?)\s*€/i]
  ]
  for (const [key, pattern] of totalPatterns) {
    const match = text.match(pattern)
    if (match) {
      const value = parseAmount(match[1])
      // A zero total (e.g. VAT exemption) is meaningful, so only reject unparsable values
      if (Number.isFinite(value)) {
        totals[key] = value
      }
    }
  }

  // Payment terms
  const paymentMatch = text.match(/paiement\s+se\s+fera\s+\(maximum\)\s+(\d+)\s+jours/i)
  if (paymentMatch) {
    payment.terms = `${paymentMatch[1]} jours après émission`
  }
  return { amounts, totals, payment }
}
/**
 * Collects the invoice numbers announced by "Facture N° ..." in the text.
 *
 * @param {string} text - Document text.
 * @param {string} documentType - Only 'Facture' yields references.
 * @returns {Array<{id: string, type: string, number: string, confidence: number}>}
 *   One entry per match, in document order; empty for other document types.
 */
function extractReferences(text, documentType) {
  if (documentType !== 'Facture') {
    return []
  }
  const invoiceNumberRegex = /Facture\s+N°\s*[:\-]?\s*([A-Z0-9_-]+)/gi
  // The running index doubles as the reference identifier suffix
  return Array.from(text.matchAll(invoiceNumberRegex), (found, position) => ({
    id: `ref-${position}`,
    type: 'facture',
    number: found[1],
    confidence: 0.95
  }))
}
/**
 * Converts a French-style date to ISO `YYYY-MM-DD`.
 *
 * Recognised shapes (separator: '/', '-', '.' or a space):
 *   - numeric month: `05/03/2021`, `5-3-21`
 *   - French month name, with or without accents: `12-janvier-23`, `1 août 2022`
 * Two-digit years are interpreted as 20YY. The first valid date found in the
 * string is converted.
 *
 * @param {string} dateStr - Date text to convert.
 * @returns {string|null} ISO date; `null` for empty input; the input unchanged
 *   when no valid date is recognised or when it is not a string.
 */
function formatDate(dateStr) {
  if (!dateStr) return null
  if (typeof dateStr !== 'string') return dateStr
  // Keys are stored without accents; month names are stripped of accents before lookup
  const monthsByName = new Map([
    ['janvier', '01'], ['fevrier', '02'], ['mars', '03'], ['avril', '04'],
    ['mai', '05'], ['juin', '06'], ['juillet', '07'], ['aout', '08'],
    ['septembre', '09'], ['octobre', '10'], ['novembre', '11'], ['decembre', '12']
  ])
  // \p{L}+ (not \w+) so that accented month names such as "février" match; the year
  // alternative tries four digits first so "2023" is not truncated to "20"
  const datePattern = /(?<!\d)(\d{1,2})\s*[\/.\-\s]\s*(\d{1,2}|\p{L}+)\s*[\/.\-\s]\s*(\d{4}|\d{2})(?!\d)/gu
  for (const match of dateStr.normalize('NFC').matchAll(datePattern)) {
    const dayPart = match[1]
    const monthPart = match[2]
    const yearPart = match[3]
    let month
    if (/^\d+$/.test(monthPart)) {
      const monthNumber = Number.parseInt(monthPart, 10)
      if (monthNumber >= 1 && monthNumber <= 12) {
        month = String(monthNumber).padStart(2, '0')
      }
    } else {
      const key = monthPart.normalize('NFD').replace(/\p{M}/gu, '').toLowerCase()
      month = monthsByName.get(key)
    }
    const day = Number.parseInt(dayPart, 10)
    if (!month || day < 1 || day > 31) continue
    const year = yearPart.length === 4 ? yearPart : `20${yearPart}`
    return `${year}-${month}-${String(day).padStart(2, '0')}`
  }
  return dateStr
}
/**
 * Rule-based named-entity extraction.
 *
 * Scans the text with regular expressions and returns the detected identities,
 * companies, addresses, CNI numbers, dates, contract clauses and signatures,
 * plus a keyword-based document type.
 *
 * Notes on the rules:
 *   - Names are searched in the OCR-corrected text; everything else in the raw text.
 *   - Identities are de-duplicated by full name (case-insensitive), because the generic
 *     "Firstname Lastname" pattern re-matches names already found by the role pattern.
 *     The role ("Vendeur", "Locataire", ...) is kept when present.
 *   - Addresses use a single pattern, so a "demeurant ..." or "Adresse : ..." prefix
 *     no longer yields a second entry with wrongly mapped fields. The house number
 *     is included in `street`.
 *   - A date preceded by "né le" / "née le" is reported once, with type 'birth'.
 *
 * @param {string} text - Raw document text (non-string input is treated as empty).
 * @returns {object} Entities: identities, companies, addresses, cniNumbers, dates,
 *   contractClauses, signatures (arrays) and documentType (string).
 */
function extractEntitiesFromText(text) {
  const source = typeof text === 'string' ? text : ''
  console.log(`[NER] Début de l'extraction d'entités pour ${source.length} caractères`)
  // OCR text correction
  const correctedText = correctOCRText(source)
  if (correctedText !== source) {
    console.log(`[NER] Texte corrigé: ${correctedText.substring(0, 100)}...`)
  }
  const entities = {
    identities: [],
    companies: [],
    addresses: [],
    cniNumbers: [],
    dates: [],
    contractClauses: [],
    signatures: [],
    documentType: 'Document'
  }

  // --- Names ---
  const namePatterns = [
    // Official documents: "<role> : Firstname Lastname" (group 1 = role, group 2 = name)
    { hasRole: true, regex: /(Vendeur|Acheteur|Vendeuse|Acheteuse|Propriétaire|Locataire|Bailleur|Preneur)\s*:\s*([A-Z][a-zà-öø-ÿ'\-]+\s+[A-Z][a-zà-öø-ÿ'\-]+)/gi },
    // Upper-case lines (full names)
    { hasRole: false, regex: /^([A-Z][A-ZÀ-ÖØ-öø-ÿ\s\-']{2,30})$/gm },
    // Firstname + lastname
    { hasRole: false, regex: /([A-Z][a-zà-öø-ÿ'\-]+\s+[A-Z][a-zà-öø-ÿ'\-]+)/g }
  ]
  const seenNames = new Set()
  for (const { hasRole, regex } of namePatterns) {
    for (const match of correctedText.matchAll(regex)) {
      const fullName = hasRole ? match[2] : match[1]
      if (!fullName || fullName.length <= 3) continue
      const nameParts = fullName.trim().split(/\s+/)
      if (nameParts.length < 2) continue
      const nameKey = nameParts.join(' ').toLowerCase()
      if (seenNames.has(nameKey)) continue
      seenNames.add(nameKey)
      const identity = {
        id: `identity-${entities.identities.length}`,
        type: 'person',
        firstName: nameParts[0],
        lastName: nameParts.slice(1).join(' '),
        confidence: 0.9,
        source: 'rule-based'
      }
      if (hasRole) {
        identity.role = match[1]
      }
      entities.identities.push(identity)
    }
  }

  // --- Companies ---
  const companyPatterns = [
    /(S\.A\.R\.L\.|SAS|SASU|EURL|SNC|SCI|SARL|SA|SAS|SASU|EURL|SNC|SCI|S\.A\.|S\.A\.R\.L\.|S\.A\.S\.|S\.A\.S\.U\.|E\.U\.R\.L\.|S\.N\.C\.|S\.C\.I\.)/gi,
    /([A-Z][A-Za-zÀ-ÖØ-öø-ÿ\s\-']{3,50})\s+(S\.A\.R\.L\.|SAS|SASU|EURL|SNC|SCI|SARL|SA)/gi,
    /(Entreprise|Société|Compagnie|Groupe|Corporation|Corp\.|Inc\.|Ltd\.|LLC)/gi
  ]
  for (const pattern of companyPatterns) {
    for (const match of source.matchAll(pattern)) {
      const companyName = match[1] || match[0]
      if (companyName && companyName.length > 3) {
        entities.companies.push({
          id: `company-${entities.companies.length}`,
          name: companyName.trim(),
          type: 'company',
          confidence: 0.8,
          source: 'rule-based'
        })
      }
    }
  }

  // --- Addresses ---
  // "<number> <street>, <5-digit postal code> <city>". The city is limited to one line
  // so that text on the following line is not swallowed into it.
  const addressPattern = /(\d{1,4})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+?),\s*(\d{5})\s+([A-Za-zÀ-ÖØ-öø-ÿ\-'][A-Za-zÀ-ÖØ-öø-ÿ \t\-']*)/gi
  for (const match of source.matchAll(addressPattern)) {
    const streetName = match[2].replace(/\s+/g, ' ').trim()
    entities.addresses.push({
      id: `address-${entities.addresses.length}`,
      street: `${match[1]} ${streetName}`,
      city: match[4].trim(),
      postalCode: match[3],
      country: 'France',
      confidence: 0.9,
      source: 'rule-based'
    })
  }

  // --- Identity card numbers ---
  const cniPattern = /([A-Z]{2}\d{6})/g
  for (const match of source.matchAll(cniPattern)) {
    entities.cniNumbers.push({
      id: `cni-${entities.cniNumbers.length}`,
      number: match[1],
      confidence: 0.95,
      source: 'rule-based'
    })
  }

  // --- Dates ---
  // Group 1 is the optional birth marker, group 2 the DD/MM/YYYY date
  const datePattern = /(?:(née?)\s+le\s+)?(\d{2}\/\d{2}\/\d{4})/gi
  for (const match of source.matchAll(datePattern)) {
    entities.dates.push({
      id: `date-${entities.dates.length}`,
      date: match[2],
      type: match[1] ? 'birth' : 'general',
      confidence: 0.9,
      source: 'rule-based'
    })
  }

  // --- Contract clauses ---
  const clausePatterns = [
    /(Article\s+\d+[:\-]?\s*[^\.]+\.)/gi,
    /(Clause\s+\d+[:\-]?\s*[^\.]+\.)/gi,
    /(Conditions\s+générales[^\.]+\.)/gi,
    /(Modalités\s+de\s+[^\.]+\.)/gi,
    /(Obligations\s+du\s+[^\.]+\.)/gi,
    /(Responsabilités[^\.]+\.)/gi
  ]
  for (const pattern of clausePatterns) {
    for (const match of source.matchAll(pattern)) {
      const clause = match[1] || match[0]
      if (clause && clause.length > 10) {
        entities.contractClauses.push({
          id: `clause-${entities.contractClauses.length}`,
          text: clause.trim(),
          type: 'contractual',
          confidence: 0.8,
          source: 'rule-based'
        })
      }
    }
  }

  // --- Signatures ---
  const signaturePatterns = [
    /(Signé\s+le\s+\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4})/gi,
    /(Signature\s+de\s+[A-Z][a-zà-öø-ÿ'\-]+\s+[A-Z][a-zà-öø-ÿ'\-]+)/gi,
    /(Par\s+[A-Z][a-zà-öø-ÿ'\-]+\s+[A-Z][a-zà-öø-ÿ'\-]+)/gi,
    /(Fait\s+et\s+signé\s+[^\.]+\.)/gi
  ]
  for (const pattern of signaturePatterns) {
    for (const match of source.matchAll(pattern)) {
      const signature = match[1] || match[0]
      if (signature && signature.length > 5) {
        entities.signatures.push({
          id: `signature-${entities.signatures.length}`,
          text: signature.trim(),
          type: 'signature',
          confidence: 0.8,
          source: 'rule-based'
        })
      }
    }
  }

  // --- Document type classification (first matching rule wins) ---
  if (/carte\s+nationale\s+d'identité|cni|mrz|identite/i.test(source)) {
    entities.documentType = 'CNI'
  } else if (/facture|tva|siren|montant|facturation/i.test(source)) {
    entities.documentType = 'Facture'
  } else if (/attestation|certificat/i.test(source)) {
    entities.documentType = 'Attestation'
  } else if (/contrat|vente|achat|acte/i.test(source)) {
    entities.documentType = 'Contrat'
  }

  console.log(`[NER] Extraction terminée:`)
  console.log(` - Identités: ${entities.identities.length}`)
  console.log(` - Sociétés: ${entities.companies.length}`)
  console.log(` - Adresses: ${entities.addresses.length}`)
  console.log(` - Numéros CNI: ${entities.cniNumbers.length}`)
  console.log(` - Dates: ${entities.dates.length}`)
  console.log(` - Clauses contractuelles: ${entities.contractClauses.length}`)
  console.log(` - Signatures: ${entities.signatures.length}`)
  console.log(` - Type: ${entities.documentType}`)
  return entities
}
/**
 * POST /api/extract — document extraction route.
 *
 * Expects a multipart upload in the `document` field. PDFs go through direct
 * text extraction, images through OCR; the text is then run through the
 * rule-based NER and returned as the standard JSON payload.
 *
 * Responses: 400 when no file is provided, 500 with `success: false` on a
 * processing error, 200 with the standard JSON otherwise. The uploaded file
 * is removed in every case once the request has been handled.
 */
app.post('/api/extract', upload.single('document'), async (req, res) => {
  const startTime = Date.now()
  try {
    if (!req.file) {
      return res.status(400).json({ error: 'Aucun fichier fourni' })
    }
    console.log(`[API] Traitement du fichier: ${req.file.originalname}`)
    let ocrResult
    if (req.file.mimetype === 'application/pdf') {
      // PDF: read the embedded text directly
      console.log(`[API] Extraction de texte depuis PDF...`)
      try {
        ocrResult = await extractTextFromPdf(req.file.path)
        console.log(`[API] Texte extrait du PDF: ${ocrResult.text.length} caractères`)
      } catch (error) {
        console.error(`[API] Erreur lors de l'extraction PDF:`, error.message)
        // The finally block below still deletes the upload on this early return
        return res.status(500).json({
          success: false,
          error: 'Erreur lors de l\'extraction PDF',
          details: error.message
        })
      }
    } else {
      // Images: OCR with preprocessing
      ocrResult = await extractTextFromImage(req.file.path)
    }
    // NER extraction
    const entities = extractEntitiesFromText(ocrResult.text)
    // Processing time measurement
    const processingTime = Date.now() - startTime
    // Standard JSON generation
    const result = generateStandardJSON(req.file, ocrResult, entities, processingTime)
    console.log(`[API] Traitement terminé avec succès - Confiance: ${Math.round(result.metadata.quality.globalConfidence * 100)}%`)
    res.json(result)
  } catch (error) {
    console.error('[API] Erreur lors du traitement:', error)
    res.status(500).json({
      success: false,
      error: 'Erreur lors du traitement du document',
      details: error.message
    })
  } finally {
    // Single cleanup point covering success, failure and early returns.
    // Best effort: a cleanup failure must not turn a successful response into an error.
    if (req.file) {
      try {
        await fs.promises.unlink(req.file.path)
      } catch (cleanupError) {
        if (cleanupError.code !== 'ENOENT') {
          console.warn(`[API] Impossible de supprimer le fichier uploadé: ${cleanupError.message}`)
        }
      }
    }
  }
})
/**
 * GET /api/test-files — lists the sample documents found in ../test-files.
 * Only files with a supported extension are returned, each with its name,
 * size, lower-cased extension and last modification time.
 * Any filesystem error produces a 500 response carrying the error message.
 */
app.get('/api/test-files', (req, res) => {
  const acceptedExtensions = ['.jpg', '.jpeg', '.png', '.pdf', '.tiff']
  try {
    const testFilesDir = path.join(__dirname, '..', 'test-files')
    const files = []
    for (const name of fs.readdirSync(testFilesDir)) {
      const type = path.extname(name).toLowerCase()
      if (!acceptedExtensions.includes(type)) {
        continue
      }
      const { size, mtime } = fs.statSync(path.join(testFilesDir, name))
      files.push({ name, size, type, lastModified: mtime })
    }
    res.json({ success: true, files })
  } catch (error) {
    res.status(500).json({ success: false, error: error.message })
  }
})
/**
 * GET /api/health — health-check route.
 * Replies with a fixed status, the current ISO timestamp and the API version.
 */
app.get('/api/health', (req, res) => {
  const payload = {
    status: 'OK',
    timestamp: new Date().toISOString(),
    version: '1.0.0'
  }
  res.json(payload)
})
// Server start-up: listen on PORT and log the main endpoints once listening
app.listen(PORT, () => {
console.log(`🚀 Serveur backend démarré sur le port ${PORT}`)
console.log(`📡 API disponible sur: http://localhost:${PORT}/api`)
console.log(`🏥 Health check: http://localhost:${PORT}/api/health`)
console.log(`📁 Test files: http://localhost:${PORT}/api/test-files`)
})
// Export the Express application so that other modules can require it
// NOTE(review): app.listen above is a top-level statement, so the server also starts
// when this module is merely required — confirm this is intended
module.exports = app