diff --git a/.env b/.env index 07368fa..54fcb42 100644 --- a/.env +++ b/.env @@ -4,6 +4,8 @@ VITE_API_URL=http://localhost:18000 # Configuration pour le développement VITE_APP_NAME=4NK IA Lecoffre.io VITE_APP_VERSION=0.1.0 +VITE_USE_RULE_NER=true +VITE_LLM_CLASSIFY_ONLY=true # Configuration des services externes (optionnel) VITE_CADASTRE_API_URL=https://apicarto.ign.fr/api/cadastre diff --git a/README.md b/README.md index dc49578..374b085 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,7 @@ VITE_USE_OPENAI=false VITE_OPENAI_API_KEY= VITE_OPENAI_BASE_URL=https://api.openai.com/v1 VITE_OPENAI_MODEL=gpt-4o-mini +VITE_USE_RULE_NER=true VITE_CADASTRE_API_URL=https://api.cadastre.gouv.fr VITE_GEORISQUES_API_URL=https://www.georisques.gouv.fr/api VITE_GEOFONCIER_API_URL=https://api.geofoncier.fr diff --git a/index.html b/index.html index e4b78ea..f775e5d 100644 --- a/index.html +++ b/index.html @@ -4,7 +4,7 @@ - Vite + React + TS + 4NK IA - Lecoffre.io
diff --git a/src/components/Layout.tsx b/src/components/Layout.tsx index 050fc15..776906a 100644 --- a/src/components/Layout.tsx +++ b/src/components/Layout.tsx @@ -1,7 +1,9 @@ -import React from 'react' -import { AppBar, Toolbar, Typography, Container, Box } from '@mui/material' +import React, { useEffect } from 'react' +import { AppBar, Toolbar, Typography, Container, Box, LinearProgress } from '@mui/material' import { useNavigate, useLocation } from 'react-router-dom' import { NavigationTabs } from './NavigationTabs' +import { useAppDispatch, useAppSelector } from '../store' +import { extractDocument, analyzeDocument, getContextData, getConseil } from '../store/documentSlice' interface LayoutProps { children: React.ReactNode @@ -10,6 +12,24 @@ interface LayoutProps { export const Layout: React.FC = ({ children }) => { const navigate = useNavigate() const location = useLocation() + const dispatch = useAppDispatch() + const { documents, extractionById, loading, currentDocument, contextResult, conseilResult, analysisResult } = useAppSelector((s) => s.document) + + // Au chargement/nav: lancer OCR+classification pour tous les documents sans résultat + useEffect(() => { + documents.forEach((doc) => { + if (!extractionById[doc.id]) dispatch(extractDocument(doc.id)) + }) + }, [documents, extractionById, dispatch]) + + // Déclencher contexte et conseil globaux une fois qu'un document courant existe + useEffect(() => { + if (currentDocument) { + if (!analysisResult) dispatch(analyzeDocument(currentDocument.id)) + if (!contextResult) dispatch(getContextData(currentDocument.id)) + if (!conseilResult) dispatch(getConseil(currentDocument.id)) + } + }, [currentDocument, analysisResult, contextResult, conseilResult, dispatch]) return ( @@ -28,6 +48,12 @@ export const Layout: React.FC = ({ children }) => { + {loading && ( + + + + )} + {children} diff --git a/src/services/fileExtract.ts b/src/services/fileExtract.ts index b572e76..47ff0d3 100644 --- a/src/services/fileExtract.ts +++ b/src/services/fileExtract.ts @@ -18,10 +18,20 @@ async function getPdfJs() { export async function extractTextFromFile(file: File): Promise { const mime = file.type || '' if (mime.includes('pdf') || file.name.toLowerCase().endsWith('.pdf')) { - return extractFromPdf(file) + const pdfText = await extractFromPdf(file) + if (import.meta.env.DEV) { + // eslint-disable-next-line no-console + console.info('[OCR][PDF]', file.name, 'len=', pdfText.length, 'peek=', pdfText.slice(0, 200)) + } + return pdfText } if (mime.startsWith('image/') || ['.png', '.jpg', '.jpeg'].some((ext) => file.name.toLowerCase().endsWith(ext))) { - return extractFromImage(file) + const imgText = await extractFromImage(file) + if (import.meta.env.DEV) { + // eslint-disable-next-line no-console + console.info('[OCR][IMG]', file.name, 'len=', imgText.length, 'peek=', imgText.slice(0, 200)) + } + return imgText } // Fallback: lecture texte brut try { @@ -40,8 +50,23 @@ async function extractFromPdf(file: File): Promise { const numPages = Math.min(pdf.numPages, 50) for (let i = 1; i <= numPages; i += 1) { const page = await pdf.getPage(i) - const content = await page.getTextContent() - const pageText = content.items.map((it: any) => (it.str ? it.str : '')).join(' ') + const content = await page.getTextContent().catch(() => null) + let pageText = '' + if (content) { + pageText = content.items.map((it: any) => (it.str ? it.str : '')).join(' ') + } + // Fallback OCR si pas de texte exploitable + if (!pageText || pageText.replace(/\s+/g, '').length < 30) { + const viewport = page.getViewport({ scale: 2 }) + const canvas = document.createElement('canvas') + canvas.width = viewport.width + canvas.height = viewport.height + const ctx = canvas.getContext('2d') as any + await page.render({ canvasContext: ctx, viewport }).promise + const blob: Blob = await new Promise((resolve) => canvas.toBlob((b) => resolve(b as Blob), 'image/png')) + const ocrText = await extractFromImage(new File([blob], `${file.name}-p${i}.png`, { type: 'image/png' })) + pageText = ocrText + } if (pageText.trim()) texts.push(pageText) } return texts.join('\n') diff --git a/src/services/openai.ts b/src/services/openai.ts index 9876a25..1f854b2 100644 --- a/src/services/openai.ts +++ b/src/services/openai.ts @@ -10,6 +10,7 @@ import type { ConseilResult, } from '../types' import { extractTextFromFile } from './fileExtract' +import { runRuleNER } from './ruleNer' const OPENAI_API_KEY = import.meta.env.VITE_OPENAI_API_KEY const OPENAI_BASE_URL = import.meta.env.VITE_OPENAI_BASE_URL || 'https://api.openai.com/v1' @@ -83,28 +84,105 @@ export const openaiDocumentApi = { localText = '' } } + // Flags de mode + const useRuleNer = import.meta.env.VITE_USE_RULE_NER === 'true' + const classifyOnly = import.meta.env.VITE_LLM_CLASSIFY_ONLY === 'true' + + // Si NER local actif, on l'utilise pour tout (identités/adresses/...) puis, si demandé, + // on peut consulter le LLM uniquement pour classifier le type de document + if (useRuleNer) { + let res = runRuleNER(documentId, localText) + if (classifyOnly && OPENAI_API_KEY && localText) { + try { + hooks?.onLlmProgress?.(0) + const cls = await callOpenAIChat([ + { role: 'system', content: 'Tu es un classifieur. Retourne uniquement un JSON strict.' }, + { role: 'user', content: `Classifie ce texte en une des catégories suivantes: [CNI, Facture, Attestation, Document]. Réponds strictement sous la forme {"documentType":"..."}.\nTexte:\n${localText.slice(0, 8000)}` }, + ]) + const parsed = JSON.parse(cls) + if (parsed && typeof parsed.documentType === 'string') { + res = { ...res, documentType: parsed.documentType } + res.confidenceReasons = [...(res.confidenceReasons || []), 'Classification LLM limitée au documentType'] + } + hooks?.onLlmProgress?.(1) + } catch { + // ignore échec de classification + hooks?.onLlmProgress?.(1) + } + } + return res + } + hooks?.onLlmProgress?.(0) + // Si on demande uniquement la classification par LLM, ne demander que le type; + // sinon on demande la structuration complète (mode précédent) + if (classifyOnly) { + try { + const cls = await callOpenAIChat([ + { role: 'system', content: 'Tu es un classifieur. Retourne uniquement un JSON strict.' }, + { role: 'user', content: `Classifie ce texte en une des catégories suivantes: [CNI, Facture, Attestation, Document]. Réponds strictement sous la forme {"documentType":"..."}.\nTexte:\n${localText.slice(0, 8000)}` }, + ]) + const parsed = JSON.parse(cls) + hooks?.onLlmProgress?.(1) + return { + documentId, + text: localText || '', + language: 'fr', + documentType: (parsed && parsed.documentType) || 'Document', + identities: [], + addresses: [], + properties: [], + contracts: [], + signatures: [], + confidence: 0.6, + confidenceReasons: ['Classification LLM sans contexte, pas d\'extraction d\'identités'], + } + } catch { + hooks?.onLlmProgress?.(1) + return { + documentId, + text: localText || '', + language: 'fr', + documentType: 'Document', + identities: [], + addresses: [], + properties: [], + contracts: [], + signatures: [], + confidence: 0.6, + confidenceReasons: ['Classification LLM échouée, valeur par défaut'], + } + } + } + const content = await callOpenAIChat([ { role: 'system', content: - 'Tu es un assistant qui extrait des informations structurées (identités, adresses, biens, contrats) à partir de documents. Réponds en JSON strict, sans texte autour.', + 'Tu extrais uniquement les informations présentes dans le texte OCR. Interdiction d\'inventer. Interdiction d\'utiliser le nom du fichier comme identité. Réponds en JSON strict, sans texte autour.', }, { role: 'user', - content: `Document ID: ${documentId}. Texte: ${localText.slice(0, 8000)}\nRetourne un JSON avec la forme suivante: {"language":"fr","documentType":"...","identities":[{"id":"id-1","type":"person","firstName":"...","lastName":"...","confidence":0.9}],"addresses":[{"street":"...","city":"...","postalCode":"...","country":"..."}],"properties":[{"id":"prop-1","type":"apartment","address":{"street":"...","city":"...","postalCode":"...","country":"..."},"surface":75}],"contracts":[{"id":"contract-1","type":"sale","parties":[],"amount":0,"date":"YYYY-MM-DD","clauses":["..."]}],"signatures":[],"confidence":0.7,"confidenceReasons":["..."]}`, + content: `Document ID: ${documentId}. Texte OCR (tronqué): ${localText.slice(0, 8000)}\nRègles: 1) ne pas inventer, 2) si incertitude, laisser vide, 3) ne JAMAIS utiliser le nom du fichier comme identité. Schéma JSON: {"language":"fr","documentType":"...","identities":[{"id":"id-1","type":"person","firstName":"...","lastName":"...","confidence":0.9}],"addresses":[{"street":"...","city":"...","postalCode":"...","country":"..."}],"properties":[{"id":"prop-1","type":"apartment","address":{"street":"...","city":"...","postalCode":"...","country":"..."},"surface":75}],"contracts":[{"id":"contract-1","type":"sale","parties":[],"amount":0,"date":"YYYY-MM-DD","clauses":["..."]}],"signatures":[],"confidence":0.7,"confidenceReasons":["sources présentes dans le texte"]}`, }, ]) // Essaye d'analyser le JSON, sinon fallback heuristique try { const parsed = JSON.parse(content) hooks?.onLlmProgress?.(1) + // Post-traitement: filtrage des identités qui ressemblent au nom de fichier + const docBase = (file?.name || '').toLowerCase().replace(/\.[a-z0-9]+$/, '') + const safeIdentities = (parsed.identities || []).filter((it: any) => { + const full = `${it.firstName || ''} ${it.lastName || ''}`.trim().toLowerCase() + return full && !docBase || (full && !docBase.includes(full) && !full.includes(docBase)) + }) + return { documentId, text: localText || '', language: parsed.language || 'fr', documentType: parsed.documentType || 'Document', - identities: parsed.identities || [], + identities: safeIdentities, addresses: parsed.addresses || [], properties: parsed.properties || [], contracts: parsed.contracts || [], diff --git a/src/services/ruleNer.ts b/src/services/ruleNer.ts new file mode 100644 index 0000000..481c433 --- /dev/null +++ b/src/services/ruleNer.ts @@ -0,0 +1,140 @@ +import type { ExtractionResult, Identity, Address, Property, Contract } from '../types' + +function toTitleCase(input: string): string { + return input + .toLowerCase() + .split(/\s+/) + .map((w) => w.charAt(0).toUpperCase() + w.slice(1)) + .join(' ') +} + +function extractMRZ(text: string): { firstName?: string; lastName?: string } | null { + // Cherche MRZ (deux lignes, < comme séparateur). Stricte A-Z0-9< + const lines = text.split(/\n|\r/).map((l) => l.trim().toUpperCase()) + for (let i = 0; i < lines.length - 1; i += 1) { + const a = lines[i].replace(/[^A-Z0-9<]/g, '') + const b = lines[i + 1].replace(/[^A-Z0-9<]/g, '') + if (a.includes('<<') || b.includes('<<')) { + const target = a.length >= b.length ? a : b + const parts = target.split('<<') + if (parts.length >= 2) { + const rawLast = parts[0].replace(/<+/g, ' ').trim() + const rawFirst = parts[1].replace(/<+/g, ' ').trim() + if (rawLast && rawFirst) return { firstName: toTitleCase(rawFirst), lastName: rawLast.replace(/\s+/g, ' ') } + } + } + } + return null +} + +function extractDates(text: string): string[] { + const results = new Set() + const patterns = [ + /(\b\d{2}[\/\-]\d{2}[\/\-]\d{4}\b)/g, // JJ/MM/AAAA ou JJ-MM-AAAA + /(\b\d{4}[\/\-]\d{2}[\/\-]\d{2}\b)/g, // AAAA/MM/JJ + ] + for (const re of patterns) { + for (const m of text.matchAll(re)) results.add(m[1]) + } + return Array.from(results) +} + +function extractCniNumbers(text: string): string[] { + const results = new Set() + const re = /\b[A-Z0-9]{12,15}\b/g + for (const m of text.toUpperCase().matchAll(re)) results.add(m[0]) + return Array.from(results) +} + +function extractAddresses(text: string): Address[] { + const items: Address[] = [] + const typeVoie = '(rue|avenue|av\.?|bd\.?|boulevard|impasse|chemin|all(é|e)e|route|place|quai|passage|square|voie|faubourg|fg\.?|cours|sentier|residence|résidence)' + const re = new RegExp(`(\\b\\d{1,4})\\s+([A-Za-zÀ-ÖØ-öø-ÿ\\-']{2,})\\s+${typeVoie}\\s+([A-Za-zÀ-ÖØ-öø-ÿ\\-']{2,})(?:\\s+|,)+(\\b\\d{5}\\b)\\s+([A-Za-zÀ-ÖØ-öø-ÿ\\-']{2,})`, 'gi') + for (const m of text.matchAll(re)) { + const street = `${m[1]} ${toTitleCase(`${m[2]} ${m[3]} ${m[4]}`)}`.trim() + const postalCode = m[5] + const city = toTitleCase(m[6]) + items.push({ street, city, postalCode, country: 'France' }) + } + return items +} + +function extractNames(text: string): Identity[] { + const identities: Identity[] = [] + // Heuristique: lignes en MAJUSCULES pour NOM; prénoms capitalisés à proximité + const lines = text.split(/\n|\r/).map((l) => l.trim()).filter(Boolean) + for (let i = 0; i < lines.length; i += 1) { + const line = lines[i] + if (/^[A-ZÀ-ÖØ-Þ\-\s]{3,}$/.test(line) && line.length <= 40) { + const lastName = line.replace(/\s+/g, ' ').trim() + // Cherche un prénom sur la ligne suivante ou la même ligne + const cand = (lines[i + 1] || '').trim() + const firstNameMatch = cand.match(/^[A-Z][a-zà-öø-ÿ'\-]{1,}(?:\s+[A-Z][a-zà-öø-ÿ'\-]{1,})?$/) + const firstName = firstNameMatch ? cand : undefined + if (lastName && (!firstName || firstName.length <= 40)) { + identities.push({ + id: `id-${i}`, + type: 'person', + firstName: firstName ? toTitleCase(firstName) : undefined, + lastName, + confidence: firstName ? 0.85 : 0.7, + }) + } + } + } + return identities +} + +export function runRuleNER(documentId: string, text: string): ExtractionResult { + const identitiesFromMRZ = extractMRZ(text) + const identities = identitiesFromMRZ + ? [ + { + id: 'mrz-1', + type: 'person', + firstName: identitiesFromMRZ.firstName, + lastName: identitiesFromMRZ.lastName!, + confidence: 0.9, + } as Identity, + ] + : extractNames(text) + + const addresses = extractAddresses(text) + const cniNumbers = extractCniNumbers(text) + const dates = extractDates(text) + + const contracts: Contract[] = [] + const properties: Property[] = [] + + const reasons: string[] = [] + if (identities.length) reasons.push('Identités détectées par règles') + if (addresses.length) reasons.push('Adresse(s) détectée(s) par motifs') + if (cniNumbers.length) reasons.push('Numéro CNI plausible détecté') + if (dates.length) reasons.push('Dates détectées') + + let documentType = 'Document' + if (/carte\s+nationale\s+d'identité|cni|mrz|identite/i.test(text)) documentType = 'CNI' + else if (/facture|tva|siren|montant/i.test(text)) documentType = 'Facture' + else if (/attestation|certificat/i.test(text)) documentType = 'Attestation' + + // Confiance: base 0.6 + bonus par signal + let confidence = 0.6 + if (identities.length) confidence += 0.15 + if (cniNumbers.length) confidence += 0.15 + if (addresses.length) confidence += 0.05 + confidence = Math.max(0, Math.min(1, confidence)) + + return { + documentId, + text, + language: 'fr', + documentType, + identities, + addresses, + properties, + contracts, + signatures: [], + confidence, + confidenceReasons: reasons, + } +} diff --git a/src/store/documentSlice.ts b/src/store/documentSlice.ts index 17cb427..7a7a614 100644 --- a/src/store/documentSlice.ts +++ b/src/store/documentSlice.ts @@ -9,6 +9,7 @@ interface DocumentState { currentDocument: Document | null extractionResult: ExtractionResult | null extractionById: Record + fileById: Record analysisResult: AnalysisResult | null contextResult: ContextResult | null conseilResult: ConseilResult | null @@ -22,6 +23,7 @@ const initialState: DocumentState = { currentDocument: null, extractionResult: null, extractionById: {}, + fileById: {}, analysisResult: null, contextResult: null, conseilResult: null, @@ -148,6 +150,11 @@ const documentSlice = createSlice({ state.loading = false state.documents.push(action.payload) state.currentDocument = action.payload + // Capture le File depuis l'URL blob si disponible + if (action.payload.previewUrl?.startsWith('blob:')) { + // On ne peut pas récupérer l'objet File initial ici sans passer par onDrop; + // il est reconstruit lors de l'extraction via fetch blob. + } }) .addCase(uploadDocument.rejected, (state, action) => { state.loading = false diff --git a/src/views/AnalyseView.tsx b/src/views/AnalyseView.tsx index bd3ed1c..8d76650 100644 --- a/src/views/AnalyseView.tsx +++ b/src/views/AnalyseView.tsx @@ -24,20 +24,19 @@ import { } from '@mui/icons-material' import type { ChipProps, LinearProgressProps } from '@mui/material' import { useAppDispatch, useAppSelector } from '../store' -import { analyzeDocument } from '../store/documentSlice' +import { analyzeDocument, getConseil, getContextData } from '../store/documentSlice' import { Layout } from '../components/Layout' export default function AnalyseView() { const dispatch = useAppDispatch() - const { currentDocument, analysisResult, loading } = useAppSelector( - (state) => state.document - ) + const { currentDocument, analysisResult, loading, conseilResult, contextResult } = useAppSelector((state) => state.document) useEffect(() => { - if (currentDocument && !analysisResult) { - dispatch(analyzeDocument(currentDocument.id)) - } - }, [currentDocument, analysisResult, dispatch]) + if (!currentDocument) return + if (!analysisResult) dispatch(analyzeDocument(currentDocument.id)) + if (!conseilResult) dispatch(getConseil(currentDocument.id)) + if (!contextResult) dispatch(getContextData(currentDocument.id)) + }, [currentDocument, analysisResult, conseilResult, contextResult, dispatch]) if (!currentDocument) { return ( @@ -97,16 +96,10 @@ export default function AnalyseView() { } - label={`Score de vraisemblance: ${(analysisResult.credibilityScore * 100).toFixed(1)}%`} + label={`Avancement: ${Math.round(analysisResult.credibilityScore * 100)}%`} color={getScoreColor(analysisResult.credibilityScore)} variant="filled" /> - } - label={`Type: ${analysisResult.documentType}`} - color="primary" - variant="outlined" - /> {analysisResult.isCNI && ( } diff --git a/src/views/ExtractionView.tsx b/src/views/ExtractionView.tsx index 04844e1..124cb1f 100644 --- a/src/views/ExtractionView.tsx +++ b/src/views/ExtractionView.tsx @@ -68,7 +68,9 @@ export default function ExtractionView() { ) } - if (!extractionResult) { + const activeResult = currentDocument ? (extractionById[currentDocument.id] || extractionResult) : extractionResult + + if (!activeResult) { return ( @@ -113,28 +115,28 @@ export default function ExtractionView() { } - label={`Langue: ${ (extractionById[currentDocument!.id] || extractionResult)!.language }`} + label={`Langue: ${ activeResult.language }`} color="primary" variant="outlined" /> } - label={`Type: ${ (extractionById[currentDocument!.id] || extractionResult)!.documentType }`} + label={`Type: ${ activeResult.documentType }`} color="secondary" variant="outlined" /> { const r = (extractionById[currentDocument!.id] || extractionResult)!; return (r.confidenceReasons && r.confidenceReasons.length > 0) - ? r.confidenceReasons.join(' • ') - : `Évaluation automatique basée sur le contenu et le type (${r.documentType}).` })() + (activeResult.confidenceReasons && activeResult.confidenceReasons.length > 0) + ? activeResult.confidenceReasons.join(' • ') + : `Évaluation automatique basée sur le contenu et le type (${activeResult.documentType}).` } > } - label={`Confiance: ${(() => { const r = (extractionById[currentDocument!.id] || extractionResult)!; return Math.round(r.confidence * 100)})()}%`} - color={(() => { const r = (extractionById[currentDocument!.id] || extractionResult)!; return r.confidence > 0.8 ? 'success' : 'warning' })()} + label={`Confiance: ${Math.round(activeResult.confidence * 100)}%`} + color={activeResult.confidence > 0.8 ? 'success' : 'warning'} variant="outlined" /> @@ -210,10 +212,10 @@ export default function ExtractionView() { - Identités ({extractionResult.identities?.length || 0}) + Identités ({activeResult.identities?.length || 0}) - {(extractionResult.identities || []).map((identity, index) => ( + {(activeResult.identities || []).map((identity, index) => ( - Adresses ({extractionResult.addresses?.length || 0}) + Adresses ({activeResult.addresses?.length || 0}) - {(extractionResult.addresses || []).map((address, index) => ( + {(activeResult.addresses || []).map((address, index) => ( - Biens ({extractionResult.properties?.length || 0}) + Biens ({activeResult.properties?.length || 0}) - {(extractionResult.properties || []).map((property, index) => ( + {(activeResult.properties || []).map((property, index) => ( - Contrats ({extractionResult.contracts?.length || 0}) + Contrats ({activeResult.contracts?.length || 0}) - {(extractionResult.contracts || []).map((contract, index) => ( + {(activeResult.contracts || []).map((contract, index) => ( - Signatures détectées ({extractionResult.signatures?.length || 0}) + Signatures détectées ({activeResult.signatures?.length || 0}) - {(extractionResult.signatures || []).map((signature: any, index: number) => { + {(activeResult.signatures || []).map((signature: any, index: number) => { const label = typeof signature === 'string' ? signature : signature?.name || signature?.title || signature?.date || JSON.stringify(signature) @@ -387,7 +389,7 @@ export default function ExtractionView() { }} > - {extractionResult.text} + {activeResult.text}