#!/usr/bin/env node

/**
 * Backend server for document processing.
 *
 * Receives an uploaded document, extracts its text (direct extraction for
 * PDFs, OCR for images), runs a rule-based entity extraction on the text and
 * returns a standard JSON payload to the frontend.
 */

const express = require('express')
const multer = require('multer')
const cors = require('cors')
const path = require('path')
const fs = require('fs')
const { createWorker } = require('tesseract.js')
const { preprocessImageForOCR, analyzeImageMetadata } = require('./imagePreprocessing')
const pdf = require('pdf-parse')

const app = express()
const PORT = process.env.PORT || 3001

// Middleware
app.use(cors())
app.use(express.json())
app.use(express.static('public'))

// Multer configuration for file uploads
const storage = multer.diskStorage({
  destination: (req, file, cb) => {
    const uploadDir = 'uploads/'
    if (!fs.existsSync(uploadDir)) {
      fs.mkdirSync(uploadDir, { recursive: true })
    }
    cb(null, uploadDir)
  },
  filename: (req, file, cb) => {
    const uniqueSuffix = Date.now() + '-' + Math.round(Math.random() * 1E9)
    cb(null, file.fieldname + '-' + uniqueSuffix + path.extname(file.originalname))
  }
})

const upload = multer({
  storage,
  limits: { fileSize: 10 * 1024 * 1024 }, // 10MB max
  fileFilter: (req, file, cb) => {
    const allowedTypes = ['image/jpeg', 'image/png', 'image/tiff', 'application/pdf']
    if (allowedTypes.includes(file.mimetype)) {
      cb(null, true)
    } else {
      cb(new Error('Type de fichier non supporté'), false)
    }
  }
})

/**
 * Extracts the text layer of a PDF file.
 *
 * @param {string} pdfPath - Path of the PDF file on disk.
 * @returns {Promise<{text: string, confidence: number, words: string[], pageCount: number}>}
 *   Extracted text, a fixed confidence of 95, the whitespace-separated words
 *   and the page count reported by pdf-parse.
 * @throws Re-throws any read or parse error after logging it.
 */
async function extractTextFromPdf(pdfPath) {
  console.log(`[PDF] Début de l'extraction de texte pour: ${path.basename(pdfPath)}`)

  try {
    // Non-blocking read: this runs inside a request handler.
    const dataBuffer = await fs.promises.readFile(pdfPath)
    const data = await pdf(dataBuffer)

    console.log(`[PDF] Texte extrait: ${data.text.length} caractères`)
    console.log(`[PDF] Nombre de pages: ${data.numpages}`)

    return {
      text: data.text,
      confidence: 95, // Fixed value: direct PDF text extraction is treated as very reliable
      words: data.text.split(/\s+/).filter(word => word.length > 0),
      pageCount: data.numpages
    }
  } catch (error) {
    console.error(`[PDF] Erreur lors de l'extraction:`, error.message)
    throw error
  }
}

// Characters Tesseract is allowed to output (shared by every OCR strategy).
const OCR_CHAR_WHITELIST = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ.,/-:() '

// Parameter sets tried one after the other; the most confident result wins.
const OCR_STRATEGIES = [
  {
    name: 'Mode Standard',
    params: {
      tessedit_pageseg_mode: '6',
      tessedit_char_whitelist: OCR_CHAR_WHITELIST,
      tessedit_ocr_engine_mode: '1',
      preserve_interword_spaces: '1',
      textord_min_linesize: '2.0',
      textord_min_xheight: '6'
    }
  },
  {
    name: 'Mode Fine',
    params: {
      tessedit_pageseg_mode: '8', // Single word
      tessedit_char_whitelist: OCR_CHAR_WHITELIST,
      tessedit_ocr_engine_mode: '1',
      textord_min_linesize: '1.0',
      textord_min_xheight: '4',
      textord_heavy_nr: '0'
    }
  },
  {
    name: 'Mode Ligne',
    params: {
      tessedit_pageseg_mode: '13', // Raw text line
      tessedit_char_whitelist: OCR_CHAR_WHITELIST,
      tessedit_ocr_engine_mode: '1',
      textord_min_linesize: '1.5',
      textord_min_xheight: '5'
    }
  }
]

/**
 * Runs OCR on an image after preprocessing it.
 *
 * The preprocessed image is written next to the original as
 * `<name>_preprocessed.png`, recognised with each entry of OCR_STRATEGIES,
 * and removed again before returning (also on failure).
 *
 * @param {string} imagePath - Path of the uploaded image.
 * @returns {Promise<{text: string, confidence: number, words: Array}>}
 *   Best recognition result; empty text with confidence 0 when every
 *   strategy failed.
 */
async function extractTextFromImage(imagePath) {
  console.log(`[OCR] Début de l'extraction pour: ${imagePath}`)

  // The returned metadata is not used here; the call is kept so that whatever
  // the helper does (logging, validation) still happens.
  // NOTE(review): confirm whether this call can be dropped.
  await analyzeImageMetadata(imagePath)

  // Preprocess the image to improve OCR quality
  console.log(`[OCR] Préprocessing de l'image...`)
  const preprocessedBuffer = await preprocessImageForOCR(imagePath, null, {
    width: 2000,
    contrast: 1.5,
    brightness: 1.1,
    grayscale: true,
    sharpen: true,
    denoise: true
  })

  // Build the temp path from the parsed name so it can never equal imagePath,
  // even when the upload has no extension (a regex replace on the extension
  // would then leave the path unchanged and the upload would be overwritten).
  const { dir, name } = path.parse(imagePath)
  const tempPath = path.join(dir, `${name}_preprocessed.png`)

  let worker = null
  try {
    await fs.promises.writeFile(tempPath, preprocessedBuffer)
    console.log(`[OCR] Image préprocessée sauvegardée: ${tempPath}`)

    worker = await createWorker('fra+eng')

    let bestResult = { text: '', confidence: 0, words: [], strategy: 'none' }

    for (const strategy of OCR_STRATEGIES) {
      try {
        console.log(`[OCR] Test de la stratégie: ${strategy.name}`)
        await worker.setParameters(strategy.params)
        const { data } = await worker.recognize(tempPath)
        console.log(`[OCR] ${strategy.name} - Confiance: ${data.confidence}%`)

        // Accept the first successful result even at confidence 0, then only
        // strictly better ones.
        if (bestResult.strategy === 'none' || data.confidence > bestResult.confidence) {
          bestResult = {
            text: data.text,
            confidence: data.confidence,
            words: data.words || [],
            strategy: strategy.name
          }
        }
      } catch (error) {
        // Best effort: a failing strategy must not prevent the others from running.
        console.log(`[OCR] Erreur avec ${strategy.name}: ${error.message}`)
      }
    }

    console.log(`[OCR] Meilleur résultat (${bestResult.strategy}) - Confiance: ${bestResult.confidence}%`)
    console.log(`[OCR] Texte extrait (${bestResult.text.length} caractères): ${bestResult.text.substring(0, 200)}...`)

    return {
      text: bestResult.text,
      confidence: bestResult.confidence,
      words: bestResult.words
    }
  } finally {
    try {
      if (worker) {
        await worker.terminate()
      }
    } finally {
      // Remove the temporary file even if terminating the worker failed.
      try {
        if (fs.existsSync(tempPath)) {
          await fs.promises.unlink(tempPath)
          console.log(`[OCR] Fichier temporaire supprimé: ${tempPath}`)
        }
      } catch (error) {
        console.warn(`[OCR] Erreur lors de la suppression du fichier temporaire: ${error.message}`)
      }
    }
  }
}

// Characters commonly misread by OCR and the letter they usually stand for.
const OCR_CONFUSABLES = { '0': 'o', '1': 'l', '5': 's', '@': 'a', '3': 'e' }

// A confusable character is only corrected when it sits between two letters.
// '@' is left alone when what follows looks like an email domain.
const OCR_CONFUSABLE_PATTERN = /(?<=\p{L})(?:[0135]|@(?![\p{L}\d.-]*\.\p{L}{2,}))(?=\p{L})/gu

/**
 * Fixes common OCR character confusions inside words.
 *
 * Only characters surrounded by letters are replaced, so genuine numbers
 * (dates, postal codes, amounts, identifiers) and email addresses are kept
 * intact.
 *
 * @param {string} text - Raw OCR text.
 * @returns {string} Corrected text ('' for empty or missing input).
 */
function correctOCRText(text) {
  if (!text) return ''
  return text.replace(OCR_CONFUSABLE_PATTERN, char => OCR_CONFUSABLES[char])
}

/**
 * Builds the standard JSON response for a processed document.
 *
 * @param {object} documentInfo - Uploaded file descriptor (reads originalname, size, mimetype).
 * @param {object} ocrResult - Text extraction result (reads text, confidence, words, optional pageCount).
 * @param {object} entities - Result of extractEntitiesFromText.
 * @param {number} processingTime - Processing duration in milliseconds.
 * @returns {object} Payload with document, classification, extraction, metadata and status sections.
 */
function generateStandardJSON(documentInfo, ocrResult, entities, processingTime) {
  const timestamp = new Date().toISOString()
  const documentId = `doc-${Date.now()}`
  const isPdf = documentInfo.mimetype === 'application/pdf'

  // Document classification
  const documentType = entities.documentType || 'Document'
  const subType = getDocumentSubType(documentType, ocrResult.text)

  // Financial information (only populated for invoices)
  const financial = extractFinancialInfo(ocrResult.text, documentType)

  // References
  const references = extractReferences(ocrResult.text, documentType)

  // Global confidence: weighted OCR confidence plus bonuses, clamped to [60, 95]
  const identityBonus = entities.identities.length > 0 ? 10 : 0
  const cniBonus = entities.cniNumbers.length > 0 ? 15 : 0
  const globalConfidence = Math.min(95, Math.max(60, ocrResult.confidence * 0.8 + identityBonus + cniBonus))

  return {
    document: {
      id: documentId,
      fileName: documentInfo.originalname,
      fileSize: documentInfo.size,
      mimeType: documentInfo.mimetype,
      uploadTimestamp: timestamp
    },
    classification: {
      documentType: documentType,
      confidence: globalConfidence / 100,
      subType: subType,
      language: 'fr',
      // Real page count when the extractor reports one, 1 otherwise (images).
      pageCount: ocrResult.pageCount ?? 1
    },
    extraction: {
      text: {
        raw: ocrResult.text,
        processed: correctOCRText(ocrResult.text),
        wordCount: ocrResult.words.length,
        characterCount: ocrResult.text.length,
        confidence: ocrResult.confidence / 100
      },
      entities: {
        persons: entities.identities.map(identity => ({
          id: identity.id,
          type: 'person',
          firstName: identity.firstName,
          lastName: identity.lastName,
          role: identity.role || null,
          email: identity.email || null,
          phone: identity.phone || null,
          confidence: identity.confidence,
          source: identity.source
        })),
        companies: entities.companies.map(company => ({
          id: company.id,
          name: company.name,
          legalForm: company.legalForm || null,
          siret: company.siret || null,
          rcs: company.rcs || null,
          tva: company.tva || null,
          capital: company.capital || null,
          role: company.role || null,
          confidence: company.confidence,
          source: company.source
        })),
        addresses: entities.addresses.map(address => ({
          id: address.id,
          type: address.type || 'general',
          street: address.street,
          city: address.city,
          postalCode: address.postalCode,
          country: address.country,
          company: address.company || null,
          confidence: address.confidence,
          source: address.source
        })),
        financial: financial,
        dates: entities.dates.map(date => ({
          id: date.id,
          type: date.type || 'general',
          value: date.date || date.value,
          formatted: formatDate(date.date || date.value),
          confidence: date.confidence,
          source: date.source
        })),
        contractual: {
          clauses: entities.contractClauses.map(clause => ({
            id: clause.id,
            type: clause.type,
            content: clause.text,
            confidence: clause.confidence
          })),
          signatures: entities.signatures.map(signature => ({
            id: signature.id,
            type: signature.type || 'électronique',
            present: signature.present || false,
            signatory: signature.signatory || null,
            date: signature.date || null,
            confidence: signature.confidence
          }))
        },
        references: references
      }
    },
    metadata: {
      processing: {
        engine: '4NK_IA_Backend',
        version: '1.0.0',
        processingTime: `${processingTime}ms`,
        ocrEngine: isPdf ? 'pdf-parse' : 'tesseract.js',
        nerEngine: 'rule-based',
        preprocessing: {
          applied: !isPdf,
          reason: isPdf ? 'PDF direct text extraction' : 'Image preprocessing applied'
        }
      },
      quality: {
        globalConfidence: globalConfidence / 100,
        textExtractionConfidence: ocrResult.confidence / 100,
        entityExtractionConfidence: 0.90,
        classificationConfidence: globalConfidence / 100
      }
    },
    status: {
      success: true,
      errors: [],
      warnings: entities.signatures.length === 0 ? ['Aucune signature détectée'] : [],
      timestamp: timestamp
    }
  }
}

/**
 * Derives a document sub-type from the document type and keywords in the text.
 *
 * @param {string} documentType - Type set by extractEntitiesFromText.
 * @param {string} text - Document text.
 * @returns {string} Sub-type label; the document type itself when nothing more specific applies.
 */
function getDocumentSubType(documentType, text) {
  if (documentType === 'Facture') {
    if (/prestation|service/i.test(text)) return 'Facture de prestation'
    if (/vente|achat/i.test(text)) return 'Facture de vente'
    return 'Facture'
  }
  if (documentType === 'CNI') return 'Carte Nationale d\'Identité'
  if (documentType === 'Contrat') {
    if (/vente|achat/i.test(text)) return 'Contrat de vente'
    if (/location|bail/i.test(text)) return 'Contrat de location'
    return 'Contrat'
  }
  return documentType
}

/**
 * Extracts amounts, totals and payment terms from an invoice.
 *
 * @param {string} text - Document text.
 * @param {string} documentType - Only 'Facture' is analysed.
 * @returns {{amounts: object[], totals: object, payment: object}}
 *   `amounts` lists every positive euro amount once; `totals` may contain the
 *   numeric keys `ht`, `ttc` and `tva` when the matching label is found;
 *   `payment.terms` is set when the payment-delay sentence is found.
 */
function extractFinancialInfo(text, documentType) {
  const amounts = []
  const totals = {}
  const payment = {}

  if (documentType !== 'Facture') {
    return { amounts, totals, payment }
  }

  const parseAmount = raw => parseFloat(raw.replace(',', '.'))

  // Every euro amount in the text. The labelled totals below are also matched
  // by this pattern, so it is the only one feeding `amounts` (no duplicates).
  for (const match of text.matchAll(/(\d+(?:[.,]\d{2})?)\s*€/g)) {
    const amount = parseAmount(match[1])
    if (amount > 0) {
      amounts.push({
        id: `amount-${amounts.length}`,
        type: 'montant',
        value: amount,
        currency: 'EUR',
        confidence: 0.9
      })
    }
  }

  // Labelled totals
  const totalPatterns = {
    ht: /Total\s+H\.T\.\s*[:\-]?\s*(\d+(?:[.,]\d{2})?)\s*€/i,
    ttc: /Total\s+T\.T\.C\.\s*[:\-]?\s*(\d+(?:[.,]\d{2})?)\s*€/i,
    tva: /T\.V\.A\.\s*[:\-]?\s*(\d+(?:[.,]\d{2})?)\s*€/i
  }
  for (const [key, pattern] of Object.entries(totalPatterns)) {
    const match = pattern.exec(text)
    if (match) {
      totals[key] = parseAmount(match[1])
    }
  }

  // Payment terms
  const paymentMatch = /paiement\s+se\s+fera\s+\(maximum\)\s+(\d+)\s+jours/i.exec(text)
  if (paymentMatch) {
    payment.terms = `${paymentMatch[1]} jours après émission`
  }

  return { amounts, totals, payment }
}

/**
 * Extracts document references (currently invoice numbers only).
 *
 * @param {string} text - Document text.
 * @param {string} documentType - Only 'Facture' yields references.
 * @returns {object[]} One entry per "Facture N° …" occurrence.
 */
function extractReferences(text, documentType) {
  const references = []

  if (documentType === 'Facture') {
    const facturePattern = /Facture\s+N°\s*[:\-]?\s*([A-Z0-9_-]+)/gi
    for (const match of text.matchAll(facturePattern)) {
      references.push({
        id: `ref-${references.length}`,
        type: 'facture',
        number: match[1],
        confidence: 0.95
      })
    }
  }

  return references
}

// French month names (with unaccented variants, which OCR output often contains).
const FRENCH_MONTHS = {
  'janvier': '01',
  'février': '02',
  'fevrier': '02',
  'mars': '03',
  'avril': '04',
  'mai': '05',
  'juin': '06',
  'juillet': '07',
  'août': '08',
  'aout': '08',
  'septembre': '09',
  'octobre': '10',
  'novembre': '11',
  'décembre': '12',
  'decembre': '12'
}

/**
 * Normalises a French date to YYYY-MM-DD.
 *
 * Recognised forms: day + month name + year ("15-mars-2024", "15 août 24")
 * and numeric day/month/year ("15/03/2024", "15-03-24"). Two-digit years are
 * assumed to be in the 2000s.
 *
 * @param {string} dateStr - Date as found in the document.
 * @returns {string|null} ISO-style date, the input unchanged when it is not
 *   recognised, or null for empty input.
 */
function formatDate(dateStr) {
  if (!dateStr) return null
  if (typeof dateStr !== 'string') return dateStr

  const toIso = (day, month, year) => {
    const fullYear = year.length === 2 ? '20' + year : year
    return `${fullYear}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`
  }

  // \p{L} (not \w) so that accented month names are matched; the year group
  // takes four digits when available so "2024" is not cut down to "20".
  const named = dateStr.match(/(?<!\d)(\d{1,2})[-\s]+(\p{L}+)[-\s]+(\d{4}|\d{2})(?!\d)/u)
  if (named) {
    const month = FRENCH_MONTHS[named[2].toLowerCase()]
    if (month) {
      return toIso(named[1], month, named[3])
    }
  }

  const numeric = dateStr.match(/(?<!\d)(\d{1,2})[\/.-](\d{1,2})[\/.-](\d{4}|\d{2})(?!\d)/)
  if (numeric) {
    const [, day, month, year] = numeric
    const isValidDay = Number(day) >= 1 && Number(day) <= 31
    const isValidMonth = Number(month) >= 1 && Number(month) <= 12
    if (isValidDay && isValidMonth) {
      return toIso(day, month, year)
    }
  }

  return dateStr
}

/**
 * Rule-based entity extraction.
 *
 * Scans the text with regular expressions for person names, companies,
 * addresses, ID-card numbers, dates, contract clauses and signature mentions,
 * and classifies the document type from keywords.
 *
 * @param {string} text - Document text.
 * @returns {object} Entities object with the arrays identities, companies,
 *   addresses, cniNumbers, dates, contractClauses, signatures and the string
 *   documentType.
 */
function extractEntitiesFromText(text) {
  console.log(`[NER] Début de l'extraction d'entités pour ${text.length} caractères`)

  // OCR text correction (used for name detection)
  const correctedText = correctOCRText(text)
  if (correctedText !== text) {
    console.log(`[NER] Texte corrigé: ${correctedText.substring(0, 100)}...`)
  }

  const entities = {
    identities: [],
    companies: [],
    addresses: [],
    cniNumbers: [],
    dates: [],
    contractClauses: [],
    signatures: [],
    documentType: 'Document'
  }

  // --- Names ---
  const namePatterns = [
    // Role-prefixed names in official documents
    /(Vendeur|Acheteur|Vendeuse|Acheteuse|Propriétaire|Locataire|Bailleur|Preneur)\s*:\s*([A-Z][a-zà-öø-ÿ'\-]+\s+[A-Z][a-zà-öø-ÿ'\-]+)/gi,
    // Lines in UPPERCASE (full names)
    /^([A-Z][A-ZÀ-ÖØ-öø-ÿ\s\-']{2,30})$/gm,
    // First name + last name
    /([A-Z][a-zà-öø-ÿ'\-]+\s+[A-Z][a-zà-öø-ÿ'\-]+)/g
  ]

  // The patterns overlap (a role-prefixed name is also a "first + last" name),
  // so each distinct name is recorded once.
  const seenNames = new Set()
  for (const pattern of namePatterns) {
    for (const match of correctedText.matchAll(pattern)) {
      const fullName = (match[2] || match[1] || match[0] || '').trim()
      if (fullName.length <= 3) continue

      const nameParts = fullName.split(/\s+/)
      if (nameParts.length < 2) continue

      const nameKey = nameParts.join(' ').toLowerCase()
      if (seenNames.has(nameKey)) continue
      seenNames.add(nameKey)

      entities.identities.push({
        id: `identity-${entities.identities.length}`,
        type: 'person',
        firstName: nameParts[0],
        lastName: nameParts.slice(1).join(' '),
        confidence: 0.9,
        source: 'rule-based'
      })
    }
  }

  // --- Companies ---
  const companyPatterns = [
    /(S\.A\.R\.L\.|SAS|SASU|EURL|SNC|SCI|SARL|SA|SAS|SASU|EURL|SNC|SCI|S\.A\.|S\.A\.R\.L\.|S\.A\.S\.|S\.A\.S\.U\.|E\.U\.R\.L\.|S\.N\.C\.|S\.C\.I\.)/gi,
    /([A-Z][A-Za-zÀ-ÖØ-öø-ÿ\s\-']{3,50})\s+(S\.A\.R\.L\.|SAS|SASU|EURL|SNC|SCI|SARL|SA)/gi,
    /(Entreprise|Société|Compagnie|Groupe|Corporation|Corp\.|Inc\.|Ltd\.|LLC)/gi
  ]

  const seenCompanies = new Set()
  for (const pattern of companyPatterns) {
    for (const match of text.matchAll(pattern)) {
      const companyName = match[1] || match[0]
      if (!companyName || companyName.length <= 3) continue

      const companyKey = companyName.trim().toLowerCase()
      if (seenCompanies.has(companyKey)) continue
      seenCompanies.add(companyKey)

      entities.companies.push({
        id: `company-${entities.companies.length}`,
        name: companyName.trim(),
        type: 'company',
        confidence: 0.8,
        source: 'rule-based'
      })
    }
  }

  // --- Addresses ---
  // One pattern: "<number> <street>, <postal code> <city>". Prefixed variants
  // ("demeurant …", "Adresse : …") contain this same core, so matching them
  // separately would only create duplicates. Street and city are limited to a
  // single line so the city does not swallow the following lines.
  const addressPattern = /(\d{1,4})\s+([A-Za-zÀ-ÖØ-öø-ÿ \-']+?),\s*(\d{5})\s+([A-Za-zÀ-ÖØ-öø-ÿ][A-Za-zÀ-ÖØ-öø-ÿ \-']*)/gi

  const seenAddresses = new Set()
  for (const match of text.matchAll(addressPattern)) {
    const street = `${match[1]} ${match[2].trim()}`
    const postalCode = match[3]
    const city = match[4].trim()

    const addressKey = `${street}|${postalCode}|${city}`.toLowerCase()
    if (seenAddresses.has(addressKey)) continue
    seenAddresses.add(addressKey)

    entities.addresses.push({
      id: `address-${entities.addresses.length}`,
      street,
      city,
      postalCode,
      country: 'France',
      confidence: 0.9,
      source: 'rule-based'
    })
  }

  // --- ID-card numbers ---
  const cniPattern = /([A-Z]{2}\d{6})/g
  for (const match of text.matchAll(cniPattern)) {
    entities.cniNumbers.push({
      id: `cni-${entities.cniNumbers.length}`,
      number: match[1],
      confidence: 0.95,
      source: 'rule-based'
    })
  }

  // --- Dates ---
  // Single pass with an optional "né(e) le" prefix: each date is reported once,
  // in document order. The lookbehind stops "signé le" / "terminé le" from
  // being read as a birth date.
  const datePattern = /(?:(?<!\p{L})(née?)\s+le\s+)?(\d{2}\/\d{2}\/\d{4})/giu
  for (const match of text.matchAll(datePattern)) {
    entities.dates.push({
      id: `date-${entities.dates.length}`,
      date: match[2],
      type: match[1] ? 'birth' : 'general',
      confidence: 0.9,
      source: 'rule-based'
    })
  }

  // --- Contract clauses ---
  const clausePatterns = [
    /(Article\s+\d+[:\-]?\s*[^\.]+\.)/gi,
    /(Clause\s+\d+[:\-]?\s*[^\.]+\.)/gi,
    /(Conditions\s+générales[^\.]+\.)/gi,
    /(Modalités\s+de\s+[^\.]+\.)/gi,
    /(Obligations\s+du\s+[^\.]+\.)/gi,
    /(Responsabilités[^\.]+\.)/gi
  ]

  for (const pattern of clausePatterns) {
    for (const match of text.matchAll(pattern)) {
      const clause = match[1] || match[0]
      if (clause && clause.length > 10) {
        entities.contractClauses.push({
          id: `clause-${entities.contractClauses.length}`,
          text: clause.trim(),
          type: 'contractual',
          confidence: 0.8,
          source: 'rule-based'
        })
      }
    }
  }

  // --- Signatures ---
  const signaturePatterns = [
    /(Signé\s+le\s+\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4})/gi,
    /(Signature\s+de\s+[A-Z][a-zà-öø-ÿ'\-]+\s+[A-Z][a-zà-öø-ÿ'\-]+)/gi,
    /(Par\s+[A-Z][a-zà-öø-ÿ'\-]+\s+[A-Z][a-zà-öø-ÿ'\-]+)/gi,
    /(Fait\s+et\s+signé\s+[^\.]+\.)/gi
  ]

  for (const pattern of signaturePatterns) {
    for (const match of text.matchAll(pattern)) {
      const signature = match[1] || match[0]
      if (signature && signature.length > 5) {
        entities.signatures.push({
          id: `signature-${entities.signatures.length}`,
          text: signature.trim(),
          type: 'signature',
          confidence: 0.8,
          source: 'rule-based'
        })
      }
    }
  }

  // --- Document type classification (first matching rule wins) ---
  if (/carte\s+nationale\s+d'identité|cni|mrz|identite/i.test(text)) {
    entities.documentType = 'CNI'
  } else if (/facture|tva|siren|montant|facturation/i.test(text)) {
    entities.documentType = 'Facture'
  } else if (/attestation|certificat/i.test(text)) {
    entities.documentType = 'Attestation'
  } else if (/contrat|vente|achat|acte/i.test(text)) {
    entities.documentType = 'Contrat'
  }

  console.log(`[NER] Extraction terminée:`)
  console.log(` - Identités: ${entities.identities.length}`)
  console.log(` - Sociétés: ${entities.companies.length}`)
  console.log(` - Adresses: ${entities.addresses.length}`)
  console.log(` - Numéros CNI: ${entities.cniNumbers.length}`)
  console.log(` - Dates: ${entities.dates.length}`)
  console.log(` - Clauses contractuelles: ${entities.contractClauses.length}`)
  console.log(` - Signatures: ${entities.signatures.length}`)
  console.log(` - Type: ${entities.documentType}`)

  return entities
}

/**
 * Deletes an uploaded file, ignoring the case where it is already gone.
 * Never throws: a cleanup failure is only logged.
 *
 * @param {string} filePath - Path of the uploaded file.
 * @returns {Promise<void>}
 */
async function removeUploadedFile(filePath) {
  try {
    await fs.promises.unlink(filePath)
  } catch (error) {
    if (error.code !== 'ENOENT') {
      console.warn(`[API] Erreur lors de la suppression du fichier temporaire: ${error.message}`)
    }
  }
}

// Document extraction route
app.post('/api/extract', upload.single('document'), async (req, res) => {
  const startTime = Date.now()

  if (!req.file) {
    return res.status(400).json({ error: 'Aucun fichier fourni' })
  }

  try {
    console.log(`[API] Traitement du fichier: ${req.file.originalname}`)

    let ocrResult

    if (req.file.mimetype === 'application/pdf') {
      // PDFs: extract the text layer directly
      console.log(`[API] Extraction de texte depuis PDF...`)
      try {
        ocrResult = await extractTextFromPdf(req.file.path)
        console.log(`[API] Texte extrait du PDF: ${ocrResult.text.length} caractères`)
      } catch (error) {
        console.error(`[API] Erreur lors de l'extraction PDF:`, error.message)
        // The `finally` below still removes the uploaded file.
        return res.status(500).json({
          success: false,
          error: 'Erreur lors de l\'extraction PDF',
          details: error.message
        })
      }
    } else {
      // Images: OCR with preprocessing
      ocrResult = await extractTextFromImage(req.file.path)
    }

    // Entity extraction
    const entities = extractEntitiesFromText(ocrResult.text)

    // Processing time measurement
    const processingTime = Date.now() - startTime

    // Standard JSON generation
    const result = generateStandardJSON(req.file, ocrResult, entities, processingTime)

    console.log(`[API] Traitement terminé avec succès - Confiance: ${Math.round(result.metadata.quality.globalConfidence * 100)}%`)

    res.json(result)
  } catch (error) {
    console.error('[API] Erreur lors du traitement:', error)

    res.status(500).json({
      success: false,
      error: 'Erreur lors du traitement du document',
      details: error.message
    })
  } finally {
    // Single cleanup point: covers success, PDF failure and unexpected errors.
    await removeUploadedFile(req.file.path)
  }
})

// Route listing the available test files
app.get('/api/test-files', (req, res) => {
  try {
    const testFilesDir = path.join(__dirname, '..', 'test-files')
    const files = fs.readdirSync(testFilesDir)
      .filter(file => {
        const ext = path.extname(file).toLowerCase()
        return ['.jpg', '.jpeg', '.png', '.pdf', '.tiff'].includes(ext)
      })
      .map(file => {
        const filePath = path.join(testFilesDir, file)
        const stats = fs.statSync(filePath)
        return {
          name: file,
          size: stats.size,
          type: path.extname(file).toLowerCase(),
          lastModified: stats.mtime
        }
      })

    res.json({ success: true, files })
  } catch (error) {
    res.status(500).json({ success: false, error: error.message })
  }
})

// Health-check route
app.get('/api/health', (req, res) => {
  res.json({
    status: 'OK',
    timestamp: new Date().toISOString(),
    version: '1.0.0'
  })
})

// Server start-up
app.listen(PORT, () => {
  console.log(`🚀 Serveur backend démarré sur le port ${PORT}`)
  console.log(`📡 API disponible sur: http://localhost:${PORT}/api`)
  console.log(`🏥 Health check: http://localhost:${PORT}/api/health`)
  console.log(`📁 Test files: http://localhost:${PORT}/api/test-files`)
})

module.exports = app