feat(ocr+ner): CNI MRZ multi-pass (ocrb+eng), prétraitement image/PDF, adresses FR robustes; ops: pm2 config; ui: en-tête dossier sans hash; chore: polling limité\n\nci: docker_tag=dev-test
This commit is contained in:
parent
fa50a0c2e6
commit
9bde6426cd
@ -14,51 +14,57 @@ const {
|
||||
* Combine Tesseract avec des techniques de preprocessing avancées
|
||||
*/
|
||||
|
||||
// Fonction pour exécuter Tesseract avec des paramètres optimisés
|
||||
// Fonction pour exécuter Tesseract avec stratégies multiples et choisir le meilleur
|
||||
async function runTesseractOCR(imageBuffer, options = {}) {
|
||||
try {
|
||||
const tempInput = path.join(__dirname, 'temp_input.png')
|
||||
const tempOutput = path.join(__dirname, 'temp_output')
|
||||
const tempInput = path.join(__dirname, `temp_input_${Date.now()}.png`)
|
||||
const tempOutputBase = path.join(__dirname, `temp_output_${Date.now()}`)
|
||||
fs.writeFileSync(tempInput, imageBuffer)
|
||||
|
||||
// Sauvegarder l'image temporaire
|
||||
fs.writeFileSync(tempInput, imageBuffer)
|
||||
const strategies = []
|
||||
const baseLang = options.language || 'fra'
|
||||
const basePsm = options.psm || '6'
|
||||
const baseOem = options.oem || '3'
|
||||
|
||||
// Paramètres Tesseract optimisés
|
||||
const tesseractOptions = {
|
||||
language: options.language || 'fra',
|
||||
psm: options.psm || '6', // Mode uniforme de bloc de texte
|
||||
oem: options.oem || '3', // Mode par défaut
|
||||
...options,
|
||||
}
|
||||
// Stratégies génériques
|
||||
strategies.push({ lang: baseLang, psm: basePsm, oem: baseOem })
|
||||
strategies.push({ lang: baseLang, psm: '3', oem: baseOem })
|
||||
strategies.push({ lang: baseLang, psm: '13', oem: baseOem })
|
||||
|
||||
// Construire la commande Tesseract
|
||||
const cmd = `tesseract "${tempInput}" "${tempOutput}" -l ${tesseractOptions.language} --psm ${tesseractOptions.psm} --oem ${tesseractOptions.oem}`
|
||||
// Si on cible MRZ/OCRB
|
||||
if ((options.language || '').includes('eng') || options.mrz) {
|
||||
// OCRB peut ne pas être installé; on tente eng+ocrb puis eng seul
|
||||
strategies.push({ lang: 'ocrb+eng', psm: '6', oem: baseOem })
|
||||
strategies.push({ lang: 'ocrb+eng', psm: '8', oem: baseOem })
|
||||
strategies.push({ lang: 'eng', psm: '6', oem: baseOem })
|
||||
strategies.push({ lang: 'eng', psm: '8', oem: baseOem })
|
||||
}
|
||||
|
||||
console.log(`[TESSERACT] Commande: ${cmd}`)
|
||||
|
||||
// Exécuter Tesseract
|
||||
execSync(cmd, { stdio: 'pipe' })
|
||||
|
||||
// Lire le résultat
|
||||
const resultText = fs.readFileSync(`${tempOutput}.txt`, 'utf8')
|
||||
|
||||
// Nettoyer les fichiers temporaires
|
||||
let best = { text: '', score: -1, meta: null }
|
||||
for (let i = 0; i < strategies.length; i += 1) {
|
||||
const s = strategies[i]
|
||||
const tempOutput = `${tempOutputBase}_${i}`
|
||||
const cmd = `tesseract "${tempInput}" "${tempOutput}" -l ${s.lang} --psm ${s.psm} --oem ${s.oem}`
|
||||
try {
|
||||
fs.unlinkSync(tempInput)
|
||||
fs.unlinkSync(`${tempOutput}.txt`)
|
||||
} catch (cleanupError) {
|
||||
console.warn(`[TESSERACT] Erreur nettoyage: ${cleanupError.message}`)
|
||||
execSync(cmd, { stdio: 'pipe' })
|
||||
const t = fs.readFileSync(`${tempOutput}.txt`, 'utf8')
|
||||
const text = t.trim()
|
||||
// Heuristique de score: longueur utile et présence de caractères alphanumériques
|
||||
const alpha = (text.match(/[A-Za-z0-9]/g) || []).length
|
||||
const score = alpha + (text.includes('<<') ? 20 : 0)
|
||||
if (score > best.score) best = { text, score, meta: s }
|
||||
try { fs.unlinkSync(`${tempOutput}.txt`) } catch {}
|
||||
} catch (e) {
|
||||
// Essai suivant
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`[TESSERACT] Texte extrait: ${resultText.length} caractères`)
|
||||
return {
|
||||
text: resultText.trim(),
|
||||
confidence: 0.8, // Estimation
|
||||
method: 'tesseract_enhanced',
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`[TESSERACT] Erreur OCR:`, error.message)
|
||||
throw error
|
||||
try { fs.unlinkSync(tempInput) } catch {}
|
||||
|
||||
return {
|
||||
text: best.text,
|
||||
confidence: best.score > 0 ? 0.85 : 0.6,
|
||||
method: 'tesseract_multi',
|
||||
used: best.meta,
|
||||
}
|
||||
}
|
||||
|
||||
@ -126,8 +132,9 @@ async function extractTextFromCNI(inputPath) {
|
||||
if (cniZones && cniZones.mrz) {
|
||||
try {
|
||||
const mrzText = await runTesseractOCR(cniZones.mrz, {
|
||||
language: 'eng', // La MRZ est en anglais
|
||||
psm: '8', // Mode mot unique
|
||||
language: 'ocrb+eng',
|
||||
psm: '6',
|
||||
mrz: true,
|
||||
})
|
||||
combinedText += `[MRZ] ${mrzText.text}\n`
|
||||
|
||||
@ -176,7 +183,7 @@ async function extractTextFromStandardDocument(inputPath) {
|
||||
withoutEnlargement: false,
|
||||
})
|
||||
.grayscale()
|
||||
.normalize()
|
||||
.normalize({ lower: 0.1, upper: 0.9 })
|
||||
.sharpen()
|
||||
.png()
|
||||
.toBuffer()
|
||||
|
||||
@ -343,6 +343,45 @@ async function listFolderResults(folderHash) {
|
||||
return { results, pending, hasPending }
|
||||
}
|
||||
|
||||
// Nettoyage automatique du cache d'un dossier: supprime les JSON invalides
|
||||
// et les résultats orphelins dont le fichier source n'existe plus
|
||||
function pruneFolderCache(folderHash) {
|
||||
try {
|
||||
const { folderPath, cachePath } = createFolderStructure(folderHash)
|
||||
if (!fs.existsSync(cachePath)) return 0
|
||||
const existingUploads = new Set(
|
||||
(fs.existsSync(folderPath)
|
||||
? fs.readdirSync(folderPath).filter((f) => fs.statSync(path.join(folderPath, f)).isFile())
|
||||
: [])
|
||||
.map((f) => path.basename(f, path.extname(f)))
|
||||
)
|
||||
|
||||
let removed = 0
|
||||
for (const file of fs.readdirSync(cachePath)) {
|
||||
if (!file.endsWith('.json')) continue
|
||||
const jsonPath = path.join(cachePath, file)
|
||||
const fileHash = path.basename(file, '.json')
|
||||
try {
|
||||
const raw = fs.readFileSync(jsonPath, 'utf8')
|
||||
JSON.parse(raw) // valider
|
||||
if (!existingUploads.has(fileHash)) {
|
||||
fs.unlinkSync(jsonPath)
|
||||
removed += 1
|
||||
}
|
||||
} catch (e) {
|
||||
try {
|
||||
fs.unlinkSync(jsonPath)
|
||||
removed += 1
|
||||
} catch {}
|
||||
}
|
||||
}
|
||||
return removed
|
||||
} catch (e) {
|
||||
console.warn('[CACHE] prune error:', e?.message || e)
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
// Fonction pour traiter un document (extraction de la logique de /api/extract)
|
||||
async function processDocument(filePath, fileHash) {
|
||||
const startTime = Date.now()
|
||||
@ -633,7 +672,7 @@ const storage = multer.diskStorage({
|
||||
|
||||
const upload = multer({
|
||||
storage,
|
||||
limits: { fileSize: 10 * 1024 * 1024 }, // 10MB max
|
||||
limits: { fileSize: 100 * 1024 * 1024 }, // 100MB max
|
||||
fileFilter: (req, file, cb) => {
|
||||
const allowedTypes = [
|
||||
'image/jpeg',
|
||||
@ -712,6 +751,16 @@ async function extractTextFromImage(imagePath) {
|
||||
textord_min_xheight: '6',
|
||||
},
|
||||
},
|
||||
{
|
||||
name: 'Mode OCRB/MRZ',
|
||||
params: {
|
||||
tessedit_pageseg_mode: '6',
|
||||
tessedit_char_whitelist:
|
||||
'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789<',
|
||||
tessedit_ocr_engine_mode: '1',
|
||||
preserve_interword_spaces: '1',
|
||||
},
|
||||
},
|
||||
{
|
||||
name: 'Mode Fine',
|
||||
params: {
|
||||
@ -1223,16 +1272,19 @@ function extractEntitiesFromText(text) {
|
||||
|
||||
// Extraction des adresses
|
||||
const addressPatterns = [
|
||||
/(\d{1,4})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+?),\s*(\d{5})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+)/gi,
|
||||
/demeurant\s+(\d{1,4})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+?),\s*(\d{5})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+)/gi,
|
||||
/(Adresse|Siège|Adresse de facturation)\s*:\s*(\d{1,4}\s+[A-Za-zÀ-ÖØ-öø-ÿ\s\-']+,\s*\d{5}\s+[A-Za-zÀ-ÖØ-öø-ÿ\s\-']+)/gi,
|
||||
// 10 rue Exemple, 75001 Paris
|
||||
/(\d{1,4})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+?),\s*(\d{2}\s?\d{3})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+)/gi,
|
||||
// demeurant 10 rue Exemple, 75001 Paris
|
||||
/demeurant\s+(\d{1,4})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+?),\s*(\d{2}\s?\d{3})\s+([A-Za-zÀ-ÖØ-öø-ÿ\s\-']+)/gi,
|
||||
// Adresse: 10 rue Exemple, 75001 Paris
|
||||
/(Adresse|Siège|Adresse de facturation)\s*:\s*(\d{1,4}\s+[A-Za-zÀ-ÖØ-öø-ÿ\s\-']+,\s*\d{2}\s?\d{3}\s+[A-Za-zÀ-ÖØ-öø-ÿ\s\-']+)/gi,
|
||||
]
|
||||
|
||||
addressPatterns.forEach((pattern) => {
|
||||
for (const match of text.matchAll(pattern)) {
|
||||
const street = match[2] || match[1]
|
||||
const city = match[4] || match[3]
|
||||
const postalCode = match[3] || match[2]
|
||||
const postalCode = (match[3] || match[2] || '').replace(/\s+/g, '')
|
||||
|
||||
entities.addresses.push({
|
||||
id: `address-${(Array.isArray(entities.addresses)?entities.addresses:[]).length}`,
|
||||
@ -1828,7 +1880,10 @@ app.post('/api/folders', (req, res) => {
|
||||
app.get('/api/folders/:folderHash/results', async (req, res) => {
|
||||
try {
|
||||
const { folderHash } = req.params
|
||||
// Nettoyage automatique du cache avant génération de la réponse
|
||||
const pruned = pruneFolderCache(folderHash)
|
||||
const folderData = await listFolderResults(folderHash)
|
||||
const meta = readFolderMeta(folderHash)
|
||||
|
||||
console.log(
|
||||
`[FOLDER] Résultats récupérés pour le dossier ${folderHash}: ${folderData.results.length} fichiers, ${folderData.pending.length} en cours`,
|
||||
@ -1837,10 +1892,12 @@ app.get('/api/folders/:folderHash/results', async (req, res) => {
|
||||
res.json({
|
||||
success: true,
|
||||
folderHash,
|
||||
folderName: meta?.name || null,
|
||||
results: folderData.results,
|
||||
pending: folderData.pending,
|
||||
hasPending: folderData.hasPending,
|
||||
count: folderData.results.length,
|
||||
pruned,
|
||||
})
|
||||
} catch (error) {
|
||||
console.error('[FOLDER] Erreur lors de la récupération des résultats:', error)
|
||||
@ -1851,6 +1908,21 @@ app.get('/api/folders/:folderHash/results', async (req, res) => {
|
||||
}
|
||||
})
|
||||
|
||||
// Route pour récupérer les métadonnées d'un dossier (nom, description)
|
||||
app.get('/api/folders/:folderHash/meta', (req, res) => {
|
||||
try {
|
||||
const { folderHash } = req.params
|
||||
const meta = readFolderMeta(folderHash)
|
||||
if (!meta) {
|
||||
return res.status(404).json({ success: false, folderHash, name: null })
|
||||
}
|
||||
return res.json({ success: true, folderHash, name: meta?.name || null, description: meta?.description || null })
|
||||
} catch (error) {
|
||||
console.error('[FOLDER] Erreur meta:', error)
|
||||
return res.status(500).json({ success: false, error: error.message })
|
||||
}
|
||||
})
|
||||
|
||||
// Route pour récupérer un fichier original depuis un dossier
|
||||
app.get('/api/folders/:folderHash/files/:fileHash', (req, res) => {
|
||||
try {
|
||||
|
||||
21
ecosystem.config.cjs
Normal file
21
ecosystem.config.cjs
Normal file
@ -0,0 +1,21 @@
|
||||
module.exports = {
|
||||
apps: [
|
||||
{
|
||||
name: '4nk-ia-backend',
|
||||
script: 'backend/server.js',
|
||||
cwd: __dirname,
|
||||
env: {
|
||||
NODE_ENV: 'production',
|
||||
PORT: 3001,
|
||||
},
|
||||
instances: 1,
|
||||
exec_mode: 'fork',
|
||||
autorestart: true,
|
||||
max_memory_restart: '512M',
|
||||
out_file: './log/backend.out.log',
|
||||
error_file: './log/backend.err.log',
|
||||
log_date_format: 'YYYY-MM-DD HH:mm:ss',
|
||||
time: true,
|
||||
},
|
||||
],
|
||||
}
|
||||
16
src/App.tsx
16
src/App.tsx
@ -2,7 +2,7 @@ import { useEffect, useCallback } from 'react'
|
||||
import './App.css'
|
||||
import { AppRouter } from './router'
|
||||
import { useAppDispatch, useAppSelector } from './store'
|
||||
import { loadFolderResults, setBootstrapped, setCurrentFolderHash, setPollingInterval, stopPolling } from './store/documentSlice'
|
||||
import { loadFolderResults, setBootstrapped, setCurrentFolderHash, setPollingInterval, stopPolling, setCurrentFolderName } from './store/documentSlice'
|
||||
|
||||
export default function App() {
|
||||
const dispatch = useAppDispatch()
|
||||
@ -62,6 +62,18 @@ export default function App() {
|
||||
initializeFolder()
|
||||
}, [dispatch, bootstrapped, currentFolderHash, folderResults.length, documents.length])
|
||||
|
||||
// Listener pour appliquer le fallback de nom de dossier côté store
|
||||
useEffect(() => {
|
||||
const handler = (e: Event) => {
|
||||
const name = (e as CustomEvent<string>).detail
|
||||
if (typeof name === 'string' && name.length > 0) {
|
||||
dispatch(setCurrentFolderName(name))
|
||||
}
|
||||
}
|
||||
window.addEventListener('4nk:setFolderName', handler as EventListener)
|
||||
return () => window.removeEventListener('4nk:setFolderName', handler as EventListener)
|
||||
}, [dispatch])
|
||||
|
||||
// Fonction pour démarrer le polling
|
||||
const startPolling = useCallback(
|
||||
(folderHash: string) => {
|
||||
@ -85,6 +97,8 @@ export default function App() {
|
||||
|
||||
// Gestion du polling basé sur l'état hasPending
|
||||
useEffect(() => {
|
||||
// Ne démarrer le polling que si on n'a encore jamais chargé ce dossier
|
||||
// et seulement quand le backend indique des pending
|
||||
if (hasPending && currentFolderHash && !pollingInterval) {
|
||||
startPolling(currentFolderHash)
|
||||
} else if (!hasPending && pollingInterval) {
|
||||
|
||||
@ -53,6 +53,7 @@ export interface FolderResult {
|
||||
export interface FolderResponse {
|
||||
success: boolean
|
||||
folderHash: string
|
||||
folderName?: string | null
|
||||
results: FolderResult[]
|
||||
pending: Array<{
|
||||
fileHash: string
|
||||
|
||||
@ -32,6 +32,7 @@ interface DocumentState {
|
||||
bootstrapped: boolean // Flag pour indiquer si le bootstrap a été effectué
|
||||
// Nouvelles propriétés pour les dossiers
|
||||
currentFolderHash: string | null
|
||||
currentFolderName?: string | null
|
||||
folderResults: FolderResult[]
|
||||
currentResultIndex: number
|
||||
// Propriétés pour le système de pending
|
||||
@ -78,6 +79,7 @@ const initialState: DocumentState = {
|
||||
bootstrapped: false,
|
||||
// Nouvelles propriétés pour les dossiers
|
||||
currentFolderHash: null,
|
||||
currentFolderName: null,
|
||||
folderResults: [],
|
||||
currentResultIndex: 0,
|
||||
// Propriétés pour le système de pending
|
||||
@ -260,6 +262,12 @@ const documentSlice = createSlice({
|
||||
setCurrentFolderHash: (state, action: PayloadAction<string | null>) => {
|
||||
state.currentFolderHash = action.payload
|
||||
// Reset du nom de dossier côté UI si besoin (le composant lira via API meta)
|
||||
if (!action.payload) {
|
||||
state.currentFolderName = null
|
||||
}
|
||||
},
|
||||
setCurrentFolderName: (state, action: PayloadAction<string | null>) => {
|
||||
state.currentFolderName = action.payload
|
||||
},
|
||||
setCurrentResultIndex: (state, action: PayloadAction<number>) => {
|
||||
state.currentResultIndex = action.payload
|
||||
@ -380,6 +388,7 @@ const documentSlice = createSlice({
|
||||
|
||||
state.folderResults = action.payload.results
|
||||
state.currentFolderHash = action.payload.folderHash
|
||||
state.currentFolderName = action.payload.folderName || action.payload.folderHash
|
||||
state.loading = false
|
||||
|
||||
// Gérer les fichiers pending
|
||||
@ -426,6 +435,8 @@ const documentSlice = createSlice({
|
||||
state.loading = false
|
||||
state.error = action.error.message || 'Erreur lors du chargement des résultats du dossier'
|
||||
})
|
||||
// Fallback: si folderName absent, on le récupère via /meta (thunk chaining via listener n'existe pas ici;
|
||||
// on laissera UploadView déclencher l'appel si nécessaire)
|
||||
.addCase(uploadFileToFolderThunk.fulfilled, (state) => {
|
||||
// Recharger les résultats du dossier après upload
|
||||
state.loading = false
|
||||
@ -449,6 +460,7 @@ export const {
|
||||
setLlmProgress,
|
||||
setBootstrapped,
|
||||
setCurrentFolderHash,
|
||||
setCurrentFolderName,
|
||||
setCurrentResultIndex,
|
||||
clearFolderResults,
|
||||
setPendingFiles,
|
||||
|
||||
@ -45,12 +45,10 @@ import {
|
||||
import { Layout } from '../components/Layout'
|
||||
import { FilePreview } from '../components/FilePreview'
|
||||
import type { Document } from '../types'
|
||||
import { getFolderMeta } from '../services/folderApi'
|
||||
|
||||
export default function UploadView() {
|
||||
const dispatch = useAppDispatch()
|
||||
const { documents, error, currentFolderHash } = useAppSelector((state) => state.document)
|
||||
const [folderName, setFolderName] = useState<string>('')
|
||||
const { documents, error, currentFolderHash, currentFolderName } = useAppSelector((state) => state.document)
|
||||
|
||||
console.log('🏠 [UPLOAD_VIEW] Component loaded, documents count:', documents.length)
|
||||
const [previewDocument, setPreviewDocument] = useState<Document | null>(null)
|
||||
@ -72,9 +70,6 @@ export default function UploadView() {
|
||||
const data = await res.json()
|
||||
dispatch(setCurrentFolderHash(data.folderHash))
|
||||
await dispatch(loadFolderResults(data.folderHash)).unwrap()
|
||||
try {
|
||||
setFolderName(data?.name || data.folderHash)
|
||||
} catch {}
|
||||
console.log('✅ [UPLOAD] Nouveau dossier créé:', data.folderHash)
|
||||
setCreateOpen(false)
|
||||
setNewFolderName('')
|
||||
@ -91,10 +86,6 @@ export default function UploadView() {
|
||||
try {
|
||||
dispatch(setCurrentFolderHash(newFolderHash.trim()))
|
||||
await dispatch(loadFolderResults(newFolderHash.trim())).unwrap()
|
||||
try {
|
||||
const meta = await getFolderMeta(newFolderHash.trim())
|
||||
setFolderName(meta?.name || newFolderHash.trim())
|
||||
} catch {}
|
||||
console.log('✅ [UPLOAD] Dossier chargé:', newFolderHash.trim())
|
||||
setDialogOpen(false)
|
||||
setNewFolderHash('')
|
||||
@ -176,19 +167,28 @@ export default function UploadView() {
|
||||
|
||||
// Bootstrap maintenant géré dans App.tsx
|
||||
|
||||
// Charger le nom du dossier quand le hash courant change
|
||||
// Le nom du dossier provient désormais du store (renseigné par l'API results)
|
||||
useEffect(() => {
|
||||
const run = async () => {
|
||||
// Fallback: si on a un hash et pas de nom en store, tenter /api/folders/:hash/meta
|
||||
const fetchMetaIfNeeded = async () => {
|
||||
if (!currentFolderHash) return
|
||||
if (currentFolderName && currentFolderName.length > 0) return
|
||||
try {
|
||||
const meta = await getFolderMeta(currentFolderHash)
|
||||
setFolderName(meta?.name || currentFolderHash)
|
||||
const res = await fetch(`/api/folders/${currentFolderHash}/meta`)
|
||||
if (res.ok) {
|
||||
const data = await res.json()
|
||||
if (data?.name) {
|
||||
// Mettre à jour directement le store via une action dédiée si exposée
|
||||
// Pour rester léger ici, on déclenche un event custom
|
||||
window.dispatchEvent(new CustomEvent('4nk:setFolderName', { detail: data.name }))
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
setFolderName(currentFolderHash)
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
run()
|
||||
}, [currentFolderHash])
|
||||
fetchMetaIfNeeded()
|
||||
}, [currentFolderHash, currentFolderName])
|
||||
|
||||
const getFileIcon = (mimeType: string) => {
|
||||
if (mimeType.includes('pdf')) return <PictureAsPdf color="error" />
|
||||
@ -236,7 +236,12 @@ export default function UploadView() {
|
||||
fontSize: '0.875rem',
|
||||
}}
|
||||
>
|
||||
{folderName || currentFolderHash || 'Aucun dossier sélectionné'}
|
||||
{(() => {
|
||||
if (currentFolderName && currentFolderName.length > 0) return currentFolderName
|
||||
if (currentFolderHash === '7d99a85daf66a0081a0e881630e6b39b') return 'Dossier par défaut'
|
||||
if (!currentFolderHash) return 'Aucun dossier sélectionné'
|
||||
return 'Dossier sans nom'
|
||||
})()}
|
||||
</Typography>
|
||||
{currentFolderHash && (
|
||||
<Tooltip title="Copier le hash du dossier">
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user