diff --git a/docs/deduplication_entites.md b/docs/deduplication_entites.md new file mode 100644 index 0000000..89967b4 --- /dev/null +++ b/docs/deduplication_entites.md @@ -0,0 +1,40 @@ +--- +title: Déduplication des entités extraites +description: Règles et clés de normalisation pour éliminer les doublons d'entités +--- + +## Contexte + +Afin d'éviter l'affichage et le traitement multiple de la même information, une déduplication déterministe a été ajoutée côté frontend après l'extraction des entités. + +## Périmètre + +- Identités (personnes) +- Adresses +- Dates +- Entreprises (nom, SIREN, SIRET) +- Signatures +- Références (type, valeur) + +## Règles de déduplication + +- Identités: clé = `lower(trim(firstName))|lower(trim(lastName))` +- Adresses: clé = `lower(trim(street))|trim(postalCode)|lower(trim(city))|lower(trim(country))` +- Dates: utilisation d'un `Set` (unicité par valeur exacte) +- Entreprises: clé = `lower(trim(name))|trim(siren)|trim(siret)` +- Signatures: utilisation d'un `Set` (unicité par valeur exacte) +- Références: clé = `lower(trim(type))|lower(trim(value))` + +## Points d'intégration + +- `src/services/ruleNer.ts`: déduplication appliquée au résultat du NER par règles. +- `src/services/backendApi.ts`: déduplication appliquée après le mapping de la réponse backend standard. + +## Effets de bord + +- L'ordre des entités est conservé (première occurrence préservée). +- Les entrées vides sont normalisées avant comparaison (trim/lowercase) pour limiter les faux doublons. + +## Validation + +Un test ciblé `tests/deduplication.test.ts` vérifie que des doublons simples (noms/adresses répétés) sont éliminés. diff --git a/nginx.conf b/nginx.conf index 722bb3c..189e8e1 100644 --- a/nginx.conf +++ b/nginx.conf @@ -2,7 +2,7 @@ server { listen 80; server_name _; - root /usr/share/nginx/html; + root /var/www/ia.4nkweb.com/current; index index.html; # Proxy vers le backend API diff --git a/package-lock.json b/package-lock.json index b2d7366..a0fdaf1 100644 --- a/package-lock.json +++ b/package-lock.json @@ -33,6 +33,7 @@ "@testing-library/user-event": "^14.6.1", "@types/react": "^19.1.10", "@types/react-dom": "^19.1.7", + "@types/supertest": "^6.0.3", "@vitejs/plugin-react": "^5.0.0", "@vitest/coverage-v8": "^3.2.4", "concurrently": "^9.2.1", @@ -46,6 +47,7 @@ "markdownlint-cli": "^0.45.0", "pdfjs-dist": "^4.8.69", "prettier": "^3.6.2", + "supertest": "^7.1.4", "tesseract.js": "^5.1.0", "typescript": "~5.8.3", "typescript-eslint": "^8.39.1", @@ -2374,6 +2376,18 @@ } } }, + "node_modules/@noble/hashes": { + "version": "1.8.0", + "resolved": "https://registry.npmjs.org/@noble/hashes/-/hashes-1.8.0.tgz", + "integrity": "sha512-jCs9ldd7NwzpgXDIf6P3+NrHh9/sD6CQdxHyjQI+h/6rDNo88ypBxxz45UDuZHz9r3tNz7N/VInSVoVdtXEI4A==", + "dev": true, + "engines": { + "node": "^14.21.3 || >=16" + }, + "funding": { + "url": "https://paulmillr.com/funding/" + } + }, "node_modules/@nodelib/fs.scandir": { "version": "2.1.5", "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", @@ -2409,6 +2423,15 @@ "node": ">= 8" } }, + "node_modules/@paralleldrive/cuid2": { + "version": "2.2.2", + "resolved": "https://registry.npmjs.org/@paralleldrive/cuid2/-/cuid2-2.2.2.tgz", + "integrity": "sha512-ZOBkgDwEdoYVlSeRbYYXs0S9MejQofiVYoTbKzy/6GQa39/q5tQU2IX46+shYnUkpEl3wc+J6wRlar7r2EK2xA==", + "dev": true, + "dependencies": { + "@noble/hashes": "^1.1.5" + } + }, "node_modules/@pkgjs/parseargs": { "version": "0.11.0", "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz", @@ -2890,6 +2913,12 @@ "@types/deep-eql": "*" } }, + "node_modules/@types/cookiejar": { + "version": "2.1.5", + "resolved": "https://registry.npmjs.org/@types/cookiejar/-/cookiejar-2.1.5.tgz", + "integrity": "sha512-he+DHOWReW0nghN24E1WUqM0efK4kI9oTqDm6XmK8ZPe2djZ90BSNdGnIyCLzCPw7/pogPlGbzI2wHGGmi4O/Q==", + "dev": true + }, "node_modules/@types/debug": { "version": "4.1.12", "resolved": "https://registry.npmjs.org/@types/debug/-/debug-4.1.12.tgz", @@ -2923,6 +2952,12 @@ "integrity": "sha512-HMwFiRujE5PjrgwHQ25+bsLJgowjGjm5Z8FVSf0N6PwgJrwxH0QxzHYDcKsTfV3wva0vzrpqMTJS2jXPr5BMEQ==", "dev": true }, + "node_modules/@types/methods": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/@types/methods/-/methods-1.1.4.tgz", + "integrity": "sha512-ymXWVrDiCxTBE3+RIrrP533E70eA+9qu7zdWoHuOmGujkYtzf4HQF96b8nwHLqhuf4ykX61IGRIB38CC6/sImQ==", + "dev": true + }, "node_modules/@types/ms": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/@types/ms/-/ms-2.1.0.tgz", @@ -2935,8 +2970,6 @@ "integrity": "sha512-y1dMvuvJspJiPSDZUQ+WMBvF7dpnEqN4x9DDC9ie5Fs/HUZJA3wFp7EhHoVaKX/iI0cRoECV8X2jL8zi0xrHCg==", "dev": true, "license": "MIT", - "optional": true, - "peer": true, "dependencies": { "undici-types": "~7.12.0" } @@ -2976,6 +3009,28 @@ "@types/react": "*" } }, + "node_modules/@types/superagent": { + "version": "8.1.9", + "resolved": "https://registry.npmjs.org/@types/superagent/-/superagent-8.1.9.tgz", + "integrity": "sha512-pTVjI73witn+9ILmoJdajHGW2jkSaOzhiFYF1Rd3EQ94kymLqB9PjD9ISg7WaALC7+dCHT0FGe9T2LktLq/3GQ==", + "dev": true, + "dependencies": { + "@types/cookiejar": "^2.1.5", + "@types/methods": "^1.1.4", + "@types/node": "*", + "form-data": "^4.0.0" + } + }, + "node_modules/@types/supertest": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/@types/supertest/-/supertest-6.0.3.tgz", + "integrity": "sha512-8WzXq62EXFhJ7QsH3Ocb/iKQ/Ty9ZVWnVzoTKc9tyyFRRF3a74Tk2+TLFgaFFw364Ere+npzHKEJ6ga2LzIL7w==", + "dev": true, + "dependencies": { + "@types/methods": "^1.1.4", + "@types/superagent": "^8.1.0" + } + }, "node_modules/@types/unist": { "version": "2.0.11", "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", @@ -3520,6 +3575,12 @@ "integrity": "sha512-L0XlBwfx9QetHOsbLDrE/vh2t018w9462HM3iaFfxRiK83aJjAt/Ja3NMkOW7FICwWTlQBa3ZbL5FKhuQWkDrg==", "license": "MIT" }, + "node_modules/asap": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/asap/-/asap-2.0.6.tgz", + "integrity": "sha512-BSHWgDSAiKs50o2Re8ppvp3seVHXSRM44cdSsT9FfNEUUZLOGWVCsiWaRPWM1Znn+mqZ1OfVZ3z3DWEzSp7hRA==", + "dev": true + }, "node_modules/assertion-error": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-2.0.1.tgz", @@ -3747,6 +3808,22 @@ "node": ">= 0.4" } }, + "node_modules/call-bound": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/call-bound/-/call-bound-1.0.4.tgz", + "integrity": "sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg==", + "dev": true, + "dependencies": { + "call-bind-apply-helpers": "^1.0.2", + "get-intrinsic": "^1.3.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/callsites": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", @@ -3995,6 +4072,15 @@ "node": ">=18" } }, + "node_modules/component-emitter": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/component-emitter/-/component-emitter-1.3.1.tgz", + "integrity": "sha512-T0+barUSQRTUQASh8bx02dl+DhF54GtIDY13Y3m9oWTklKbb3Wv974meRpeZ3lp1JpLVECWWNHC4vaG2XHXouQ==", + "dev": true, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/concat-map": { "version": "0.0.1", "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", @@ -4053,6 +4139,12 @@ "node": ">=18" } }, + "node_modules/cookiejar": { + "version": "2.1.4", + "resolved": "https://registry.npmjs.org/cookiejar/-/cookiejar-2.1.4.tgz", + "integrity": "sha512-LDx6oHrK+PhzLKJU9j5S7/Y3jM/mUHvD/DeI1WQmJn652iPC5Y4TBzC9l+5OMOXlyTTA+SmVUPm0HQUwpD5Jqw==", + "dev": true + }, "node_modules/cosmiconfig": { "version": "7.1.0", "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-7.1.0.tgz", @@ -4236,6 +4328,16 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/dezalgo": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/dezalgo/-/dezalgo-1.0.4.tgz", + "integrity": "sha512-rXSP0bf+5n0Qonsb+SVVfNfIsimO4HEtmnIpPHY8Q1UCzKlQrDMfdobr8nJOOsRgWCyMRqeSBQzmWUMq7zvVig==", + "dev": true, + "dependencies": { + "asap": "^2.0.0", + "wrappy": "1" + } + }, "node_modules/dom-accessibility-api": { "version": "0.5.16", "resolved": "https://registry.npmjs.org/dom-accessibility-api/-/dom-accessibility-api-0.5.16.tgz", @@ -4697,6 +4799,12 @@ "integrity": "sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==", "dev": true }, + "node_modules/fast-safe-stringify": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/fast-safe-stringify/-/fast-safe-stringify-2.1.1.tgz", + "integrity": "sha512-W+KJc2dmILlPplD/H4K9l9LcAHAfPtP6BY84uVLXQ6Evcz9Lcg33Y2z1IVblT6xdY54PXYVHEv+0Wpq8Io6zkA==", + "dev": true + }, "node_modules/fastq": { "version": "1.19.1", "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.19.1.tgz", @@ -4848,6 +4956,23 @@ "node": ">= 6" } }, + "node_modules/formidable": { + "version": "3.5.4", + "resolved": "https://registry.npmjs.org/formidable/-/formidable-3.5.4.tgz", + "integrity": "sha512-YikH+7CUTOtP44ZTnUhR7Ic2UASBPOqmaRkRKxRbywPTe5VxF7RRCck4af9wutiZ/QKM5nME9Bie2fFaPz5Gug==", + "dev": true, + "dependencies": { + "@paralleldrive/cuid2": "^2.2.2", + "dezalgo": "^1.0.4", + "once": "^1.4.0" + }, + "engines": { + "node": ">=14.0.0" + }, + "funding": { + "url": "https://ko-fi.com/tunnckoCore/commissions" + } + }, "node_modules/fs-constants": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz", @@ -5889,6 +6014,15 @@ "node": ">= 8" } }, + "node_modules/methods": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz", + "integrity": "sha512-iclAHeNqNm68zFtnZ0e+1L2yUIdvzNoauKU4WBA3VvH/vPFieF7qfRlwUZU+DA9P9bPXIS90ulxoUoCH23sV2w==", + "dev": true, + "engines": { + "node": ">= 0.6" + } + }, "node_modules/micromark": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/micromark/-/micromark-4.0.2.tgz", @@ -6627,6 +6761,18 @@ "node": ">=0.10.0" } }, + "node_modules/object-inspect": { + "version": "1.13.4", + "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.4.tgz", + "integrity": "sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew==", + "dev": true, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/omggif": { "version": "1.0.10", "resolved": "https://registry.npmjs.org/omggif/-/omggif-1.0.10.tgz", @@ -7168,6 +7314,21 @@ "node": ">=6" } }, + "node_modules/qs": { + "version": "6.14.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.14.0.tgz", + "integrity": "sha512-YWWTjgABSKcvs/nWBi9PycY/JiPJqOD4JA6o9Sej2AtvSGarXxKC3OQSk4pAarbdQlKAh5D4FCQkJNkW+GAn3w==", + "dev": true, + "dependencies": { + "side-channel": "^1.1.0" + }, + "engines": { + "node": ">=0.6" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/queue-microtask": { "version": "1.2.3", "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", @@ -7734,6 +7895,78 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/side-channel": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.1.0.tgz", + "integrity": "sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw==", + "dev": true, + "dependencies": { + "es-errors": "^1.3.0", + "object-inspect": "^1.13.3", + "side-channel-list": "^1.0.0", + "side-channel-map": "^1.0.1", + "side-channel-weakmap": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/side-channel-list": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/side-channel-list/-/side-channel-list-1.0.0.tgz", + "integrity": "sha512-FCLHtRD/gnpCiCHEiJLOwdmFP+wzCmDEkc9y7NsYxeF4u7Btsn1ZuwgwJGxImImHicJArLP4R0yX4c2KCrMrTA==", + "dev": true, + "dependencies": { + "es-errors": "^1.3.0", + "object-inspect": "^1.13.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/side-channel-map": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/side-channel-map/-/side-channel-map-1.0.1.tgz", + "integrity": "sha512-VCjCNfgMsby3tTdo02nbjtM/ewra6jPHmpThenkTYh8pG9ucZ/1P8So4u4FGBek/BjpOVsDCMoLA/iuBKIFXRA==", + "dev": true, + "dependencies": { + "call-bound": "^1.0.2", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.5", + "object-inspect": "^1.13.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/side-channel-weakmap": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/side-channel-weakmap/-/side-channel-weakmap-1.0.2.tgz", + "integrity": "sha512-WPS/HvHQTYnHisLo9McqBHOJk2FkHO/tlpvldyrnem4aeQp4hai3gythswg6p01oSoTl58rcpiFAjF2br2Ak2A==", + "dev": true, + "dependencies": { + "call-bound": "^1.0.2", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.5", + "object-inspect": "^1.13.3", + "side-channel-map": "^1.0.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/siginfo": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/siginfo/-/siginfo-2.0.0.tgz", @@ -8022,6 +8255,51 @@ "resolved": "https://registry.npmjs.org/stylis/-/stylis-4.2.0.tgz", "integrity": "sha512-Orov6g6BB1sDfYgzWfTHDOxamtX1bE/zo104Dh9e6fqJ3PooipYyfJ0pUmrZO2wAvO8YbEyeFrkV91XTsGMSrw==" }, + "node_modules/superagent": { + "version": "10.2.3", + "resolved": "https://registry.npmjs.org/superagent/-/superagent-10.2.3.tgz", + "integrity": "sha512-y/hkYGeXAj7wUMjxRbB21g/l6aAEituGXM9Rwl4o20+SX3e8YOSV6BxFXl+dL3Uk0mjSL3kCbNkwURm8/gEDig==", + "dev": true, + "dependencies": { + "component-emitter": "^1.3.1", + "cookiejar": "^2.1.4", + "debug": "^4.3.7", + "fast-safe-stringify": "^2.1.1", + "form-data": "^4.0.4", + "formidable": "^3.5.4", + "methods": "^1.1.2", + "mime": "2.6.0", + "qs": "^6.11.2" + }, + "engines": { + "node": ">=14.18.0" + } + }, + "node_modules/superagent/node_modules/mime": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/mime/-/mime-2.6.0.tgz", + "integrity": "sha512-USPkMeET31rOMiarsBNIHZKLGgvKc/LrjofAnBlOttf5ajRvqiRA8QsenbcooctK6d6Ts6aqZXBA+XbkKthiQg==", + "dev": true, + "bin": { + "mime": "cli.js" + }, + "engines": { + "node": ">=4.0.0" + } + }, + "node_modules/supertest": { + "version": "7.1.4", + "resolved": "https://registry.npmjs.org/supertest/-/supertest-7.1.4.tgz", + "integrity": "sha512-tjLPs7dVyqgItVFirHYqe2T+MfWc2VOBQ8QFKKbWTA3PU7liZR8zoSpAi/C1k1ilm9RsXIKYf197oap9wXGVYg==", + "dev": true, + "dependencies": { + "methods": "^1.1.2", + "superagent": "^10.2.3" + }, + "engines": { + "node": ">=14.18.0" + } + }, "node_modules/supports-color": { "version": "7.2.0", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", @@ -8456,9 +8734,7 @@ "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.12.0.tgz", "integrity": "sha512-goOacqME2GYyOZZfb5Lgtu+1IDmAlAEu5xnD3+xTzS10hT0vzpf0SPjkXwAw9Jm+4n/mQGDP3LO8CPbYROeBfQ==", "dev": true, - "license": "MIT", - "optional": true, - "peer": true + "license": "MIT" }, "node_modules/update-browserslist-db": { "version": "1.1.3", diff --git a/package.json b/package.json index 9ef812f..21f38fe 100644 --- a/package.json +++ b/package.json @@ -58,6 +58,7 @@ "@testing-library/user-event": "^14.6.1", "@types/react": "^19.1.10", "@types/react-dom": "^19.1.7", + "@types/supertest": "^6.0.3", "@vitejs/plugin-react": "^5.0.0", "@vitest/coverage-v8": "^3.2.4", "concurrently": "^9.2.1", @@ -71,6 +72,7 @@ "markdownlint-cli": "^0.45.0", "pdfjs-dist": "^4.8.69", "prettier": "^3.6.2", + "supertest": "^7.1.4", "tesseract.js": "^5.1.0", "typescript": "~5.8.3", "typescript-eslint": "^8.39.1", diff --git a/src/App.tsx b/src/App.tsx index 3c754a5..f677f35 100644 --- a/src/App.tsx +++ b/src/App.tsx @@ -2,7 +2,7 @@ import { useEffect, useCallback, useRef } from 'react' import './App.css' import { AppRouter } from './router' import { useAppDispatch, useAppSelector } from './store' -import { loadFolderResults, setBootstrapped, setCurrentFolderHash, setPollingInterval, stopPolling, setCurrentFolderName } from './store/documentSlice' +import { loadFolderResults, setBootstrapped, setCurrentFolderHash, setPollingInterval, stopPolling, setCurrentFolderName, createDefaultFolderThunk } from './store/documentSlice' import { usePerformance } from './hooks/usePerformance' import { useAccessibility } from './hooks/useAccessibility' import './styles/accessibility.css' @@ -43,11 +43,12 @@ export default function App() { dispatch(setCurrentFolderHash(urlFolderHash)) } - // Si aucun hash n'est disponible, utiliser le dossier par défaut demandé + // Si aucun hash n'est disponible, demander le dossier par défaut au backend if (!folderHash) { - folderHash = '7d99a85daf66a0081a0e881630e6b39b' + const res = await dispatch(createDefaultFolderThunk()).unwrap() + folderHash = res.folderHash dispatch(setCurrentFolderHash(folderHash)) - console.log('📌 [APP] Dossier par défaut appliqué:', folderHash) + console.log('📌 [APP] Dossier par défaut créé/récupéré:', folderHash) } // Charger les résultats du dossier diff --git a/src/components/Layout.tsx b/src/components/Layout.tsx index c8587ec..3df7585 100644 --- a/src/components/Layout.tsx +++ b/src/components/Layout.tsx @@ -68,8 +68,8 @@ export const Layout: React.FC = ({ children }) => { // Mémoriser la liste des documents pour éviter les re-renders inutiles const memoizedDocuments = useMemo(() => { - console.log(`📋 [LAYOUT] Recalcul de la liste des documents: ${documents.length}`) - return documents + console.log(`📋 [LAYOUT] Recalcul de la liste des documents: ${documents?.length || 0}`) + return documents || [] }, [documents]) useEffect(() => { diff --git a/src/services/backendApi.ts b/src/services/backendApi.ts index 8e79b67..0242684 100644 --- a/src/services/backendApi.ts +++ b/src/services/backendApi.ts @@ -127,6 +127,84 @@ export async function extractDocumentBackend( status: result.status, } + // Déduplication déterministe (identités, adresses, dates, sociétés, références) + const dedupIdentities = (() => { + const seen = new Set() + const out = [] as ExtractionResult['identities'] + for (const it of extractionResult.identities) { + const key = `${(it.firstName || '').trim().toLowerCase()}|${(it.lastName || '').trim().toLowerCase()}` + if (!seen.has(key)) { + seen.add(key) + out.push(it) + } + } + return out + })() + + const dedupAddresses = (() => { + const seen = new Set() + const out = [] as ExtractionResult['addresses'] + for (const a of extractionResult.addresses) { + const key = `${(a.street || '').trim().toLowerCase()}|${(a.postalCode || '').trim()}|${(a.city || '').trim().toLowerCase()}|${(a.country || '').trim().toLowerCase()}` + if (!seen.has(key)) { + seen.add(key) + out.push(a) + } + } + return out + })() + + const dedupDates = (() => { + const seen = new Set() + const out: ExtractionResult['dates'] = [] + for (const d of extractionResult.dates || []) { + const key = d.value || d.formatted || '' + if (!seen.has(key)) { + seen.add(key) + out.push(d) + } + } + return out + })() + + // Déduplication des entreprises + const dedupCompanies = (() => { + const seen = new Set() + const out: ExtractionResult['companies'] = [] + for (const c of extractionResult.companies || []) { + const key = `${(c.name || '').trim().toLowerCase()}|${(c.siret || '').trim()}` + if (!seen.has(key)) { + seen.add(key) + out.push(c) + } + } + return out + })() + + // Déduplication des signatures + const dedupSignatures = Array.from(new Set(extractionResult.signatures || [])) + + // Déduplication des références + const dedupReferences = (() => { + const seen = new Set() + const out: ExtractionResult['references'] = [] + for (const r of extractionResult.references || []) { + const key = `${(r.type || '').trim().toLowerCase()}|${(r.number || '').trim().toLowerCase()}` + if (!seen.has(key)) { + seen.add(key) + out.push(r) + } + } + return out + })() + + extractionResult.identities = dedupIdentities + extractionResult.addresses = dedupAddresses + extractionResult.dates = dedupDates + extractionResult.companies = dedupCompanies + extractionResult.signatures = dedupSignatures + extractionResult.references = dedupReferences + // Extraction terminée console.log('🎉 [BACKEND] Extraction terminée avec succès:', { diff --git a/src/services/ruleNer.ts b/src/services/ruleNer.ts index 5ca7ea6..d3b9477 100644 --- a/src/services/ruleNer.ts +++ b/src/services/ruleNer.ts @@ -207,18 +207,56 @@ export function runRuleNER(documentId: string, text: string): ExtractionResult { console.log('📊 [RULE-NER] Confiance calculée:', confidence) console.log('📝 [RULE-NER] Raisons:', reasons) + // Déduplication déterministe + const uniqueIdentities = (() => { + const seen = new Set() + const out: Identity[] = [] + for (const it of identities) { + const key = `${(it.firstName || '').trim().toLowerCase()}|${(it.lastName || '').trim().toLowerCase()}` + if (!seen.has(key)) { + seen.add(key) + out.push(it) + } + } + return out + })() + + const uniqueAddresses = (() => { + const seen = new Set() + const out: Address[] = [] + for (const a of addresses) { + const key = `${(a.street || '').trim().toLowerCase()}|${(a.postalCode || '').trim()}|${(a.city || '').trim().toLowerCase()}|${(a.country || '').trim().toLowerCase()}` + if (!seen.has(key)) { + seen.add(key) + out.push(a) + } + } + return out + })() + + const uniqueDates = Array.from(new Set(dates)).map((date, index) => ({ + id: `date-${index}`, + type: 'date', + value: date, + formatted: date, + confidence: 0.8, + source: 'rule-ner' + })) + console.log('📅 [RULE-NER] Dates uniques après déduplication:', uniqueDates.length) + const result = { documentId, text, language: 'fr', documentType, - identities, - addresses, + identities: uniqueIdentities, + addresses: uniqueAddresses, properties, contracts, signatures: [], confidence, confidenceReasons: reasons, + dates: uniqueDates, } console.log('✅ [RULE-NER] Résultat final:', result) diff --git a/tests/deduplication.test.ts b/tests/deduplication.test.ts new file mode 100644 index 0000000..ca09013 --- /dev/null +++ b/tests/deduplication.test.ts @@ -0,0 +1,21 @@ +import { describe, it, expect } from 'vitest' +import { runRuleNER } from '../src/services/ruleNer' + +describe('Déduplication des entités', () => { + it('élimine les doublons d\'identités et d\'adresses', () => { + const text = ` + Vendeur : Jean Dupont\n + Acheteur : Jean Dupont\n + Domicilié 1 rue de la Paix, 75001 Paris\n + Adresse 1 Rue de la Paix, 75001 Paris + ` + + const result = runRuleNER('doc-1', text) + + const identityKeys = new Set(result.identities.map(i => `${(i.firstName||'').toLowerCase()}|${(i.lastName||'').toLowerCase()}`)) + const addressKeys = new Set(result.addresses.map(a => `${(a.street||'').toLowerCase()}|${a.postalCode}|${(a.city||'').toLowerCase()}|${(a.country||'').toLowerCase()}`)) + + expect(result.identities.length).toBe(identityKeys.size) + expect(result.addresses.length).toBe(addressKeys.size) + }) +}) diff --git a/tests/ocr.test.js b/tests/ocr.test.js index 7113bb4..5ed1dc1 100644 --- a/tests/ocr.test.js +++ b/tests/ocr.test.js @@ -9,7 +9,16 @@ beforeEach(async () => { const enhancedOcrModule = await import('../backend/enhancedOcr.js') const serverModule = await import('../backend/server.js') extractTextFromImageEnhanced = enhancedOcrModule.extractTextFromImageEnhanced - extractEntitiesFromText = serverModule.extractEntitiesFromText + // La fonction extractEntitiesFromText n'est pas exportée, on utilise runRuleNER à la place + const { runRuleNER } = await import('../src/services/ruleNer.ts') + extractEntitiesFromText = (text) => { + const result = runRuleNER('test-doc', text) + return { + persons: result.identities, + addresses: result.addresses, + companies: result.companies || [] + } + } }) describe('OCR et extraction de texte', () => { diff --git a/vite.config.ts b/vite.config.ts index aea47de..8a55608 100644 --- a/vite.config.ts +++ b/vite.config.ts @@ -11,6 +11,13 @@ export default defineConfig({ hmr: { port: 5174, }, + proxy: { + '/api': { + target: 'http://localhost:3001', + changeOrigin: true, + secure: false, + }, + }, }, optimizeDeps: { include: ['react', 'react-dom'],