feat: ajout déduplication des entités extraites

- Déduplication déterministe des identités, adresses, dates, entreprises, signatures et références
- Implémentation dans src/services/ruleNer.ts et src/services/backendApi.ts
- Clés de normalisation: prénom+nom, rue+CP+ville, nom+SIRET, type+valeur
- Test ciblé tests/deduplication.test.ts pour valider la fonctionnalité
- Documentation complète dans docs/deduplication_entites.md
- Correction des tests existants (supertest, extractEntitiesFromText)
- Compilation validée et services opérationnels
This commit is contained in:
4NK IA 2025-09-19 13:29:39 +00:00
parent e82f02039f
commit 78d4310137
11 changed files with 487 additions and 15 deletions

View File

@ -0,0 +1,40 @@
---
title: Déduplication des entités extraites
description: Règles et clés de normalisation pour éliminer les doublons d'entités
---
## Contexte
Afin d'éviter l'affichage et le traitement multiple de la même information, une déduplication déterministe a été ajoutée côté frontend après l'extraction des entités.
## Périmètre
- Identités (personnes)
- Adresses
- Dates
- Entreprises (nom, SIREN, SIRET)
- Signatures
- Références (type, valeur)
## Règles de déduplication
- Identités: clé = `lower(trim(firstName))|lower(trim(lastName))`
- Adresses: clé = `lower(trim(street))|trim(postalCode)|lower(trim(city))|lower(trim(country))`
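- Dates: clé = `value` (à défaut `formatted`), comparée telle quelle, sans normalisation ; côté NER par règles, un `Set` est appliqué aux chaînes de dates extraites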
- Entreprises: clé = `lower(trim(name))|trim(siret)` (le SIREN n'entre pas dans la clé)
- Signatures: utilisation d'un `Set` (unicité par valeur exacte)
- Références: clé = `lower(trim(type))|lower(trim(number))` (la valeur de la référence est portée par le champ `number`)
## Points d'intégration
- `src/services/ruleNer.ts`: déduplication des identités, des adresses et des dates appliquée au résultat du NER par règles.
- `src/services/backendApi.ts`: déduplication appliquée après le mapping de la réponse backend standard.
## Effets de bord
- L'ordre des entités est conservé (première occurrence préservée).
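- Les champs textuels sont normalisés avant comparaison (`trim`, minuscules ; un champ absent est traité comme une chaîne vide) afin que de simples variantes de casse ou d'espacement soient reconnues comme doublons. Conséquence : deux entités dont tous les champs de la clé sont vides partagent la même clé et sont fusionnées.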
## Validation
Un test ciblé `tests/deduplication.test.ts` vérifie que des doublons simples (noms/adresses répétés) sont éliminés.

View File

@ -2,7 +2,7 @@ server {
listen 80; listen 80;
server_name _; server_name _;
root /usr/share/nginx/html; root /var/www/ia.4nkweb.com/current;
index index.html; index index.html;
# Proxy vers le backend API # Proxy vers le backend API

286
package-lock.json generated
View File

@ -33,6 +33,7 @@
"@testing-library/user-event": "^14.6.1", "@testing-library/user-event": "^14.6.1",
"@types/react": "^19.1.10", "@types/react": "^19.1.10",
"@types/react-dom": "^19.1.7", "@types/react-dom": "^19.1.7",
"@types/supertest": "^6.0.3",
"@vitejs/plugin-react": "^5.0.0", "@vitejs/plugin-react": "^5.0.0",
"@vitest/coverage-v8": "^3.2.4", "@vitest/coverage-v8": "^3.2.4",
"concurrently": "^9.2.1", "concurrently": "^9.2.1",
@ -46,6 +47,7 @@
"markdownlint-cli": "^0.45.0", "markdownlint-cli": "^0.45.0",
"pdfjs-dist": "^4.8.69", "pdfjs-dist": "^4.8.69",
"prettier": "^3.6.2", "prettier": "^3.6.2",
"supertest": "^7.1.4",
"tesseract.js": "^5.1.0", "tesseract.js": "^5.1.0",
"typescript": "~5.8.3", "typescript": "~5.8.3",
"typescript-eslint": "^8.39.1", "typescript-eslint": "^8.39.1",
@ -2374,6 +2376,18 @@
} }
} }
}, },
"node_modules/@noble/hashes": {
"version": "1.8.0",
"resolved": "https://registry.npmjs.org/@noble/hashes/-/hashes-1.8.0.tgz",
"integrity": "sha512-jCs9ldd7NwzpgXDIf6P3+NrHh9/sD6CQdxHyjQI+h/6rDNo88ypBxxz45UDuZHz9r3tNz7N/VInSVoVdtXEI4A==",
"dev": true,
"engines": {
"node": "^14.21.3 || >=16"
},
"funding": {
"url": "https://paulmillr.com/funding/"
}
},
"node_modules/@nodelib/fs.scandir": { "node_modules/@nodelib/fs.scandir": {
"version": "2.1.5", "version": "2.1.5",
"resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",
@ -2409,6 +2423,15 @@
"node": ">= 8" "node": ">= 8"
} }
}, },
"node_modules/@paralleldrive/cuid2": {
"version": "2.2.2",
"resolved": "https://registry.npmjs.org/@paralleldrive/cuid2/-/cuid2-2.2.2.tgz",
"integrity": "sha512-ZOBkgDwEdoYVlSeRbYYXs0S9MejQofiVYoTbKzy/6GQa39/q5tQU2IX46+shYnUkpEl3wc+J6wRlar7r2EK2xA==",
"dev": true,
"dependencies": {
"@noble/hashes": "^1.1.5"
}
},
"node_modules/@pkgjs/parseargs": { "node_modules/@pkgjs/parseargs": {
"version": "0.11.0", "version": "0.11.0",
"resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz", "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz",
@ -2890,6 +2913,12 @@
"@types/deep-eql": "*" "@types/deep-eql": "*"
} }
}, },
"node_modules/@types/cookiejar": {
"version": "2.1.5",
"resolved": "https://registry.npmjs.org/@types/cookiejar/-/cookiejar-2.1.5.tgz",
"integrity": "sha512-he+DHOWReW0nghN24E1WUqM0efK4kI9oTqDm6XmK8ZPe2djZ90BSNdGnIyCLzCPw7/pogPlGbzI2wHGGmi4O/Q==",
"dev": true
},
"node_modules/@types/debug": { "node_modules/@types/debug": {
"version": "4.1.12", "version": "4.1.12",
"resolved": "https://registry.npmjs.org/@types/debug/-/debug-4.1.12.tgz", "resolved": "https://registry.npmjs.org/@types/debug/-/debug-4.1.12.tgz",
@ -2923,6 +2952,12 @@
"integrity": "sha512-HMwFiRujE5PjrgwHQ25+bsLJgowjGjm5Z8FVSf0N6PwgJrwxH0QxzHYDcKsTfV3wva0vzrpqMTJS2jXPr5BMEQ==", "integrity": "sha512-HMwFiRujE5PjrgwHQ25+bsLJgowjGjm5Z8FVSf0N6PwgJrwxH0QxzHYDcKsTfV3wva0vzrpqMTJS2jXPr5BMEQ==",
"dev": true "dev": true
}, },
"node_modules/@types/methods": {
"version": "1.1.4",
"resolved": "https://registry.npmjs.org/@types/methods/-/methods-1.1.4.tgz",
"integrity": "sha512-ymXWVrDiCxTBE3+RIrrP533E70eA+9qu7zdWoHuOmGujkYtzf4HQF96b8nwHLqhuf4ykX61IGRIB38CC6/sImQ==",
"dev": true
},
"node_modules/@types/ms": { "node_modules/@types/ms": {
"version": "2.1.0", "version": "2.1.0",
"resolved": "https://registry.npmjs.org/@types/ms/-/ms-2.1.0.tgz", "resolved": "https://registry.npmjs.org/@types/ms/-/ms-2.1.0.tgz",
@ -2935,8 +2970,6 @@
"integrity": "sha512-y1dMvuvJspJiPSDZUQ+WMBvF7dpnEqN4x9DDC9ie5Fs/HUZJA3wFp7EhHoVaKX/iI0cRoECV8X2jL8zi0xrHCg==", "integrity": "sha512-y1dMvuvJspJiPSDZUQ+WMBvF7dpnEqN4x9DDC9ie5Fs/HUZJA3wFp7EhHoVaKX/iI0cRoECV8X2jL8zi0xrHCg==",
"dev": true, "dev": true,
"license": "MIT", "license": "MIT",
"optional": true,
"peer": true,
"dependencies": { "dependencies": {
"undici-types": "~7.12.0" "undici-types": "~7.12.0"
} }
@ -2976,6 +3009,28 @@
"@types/react": "*" "@types/react": "*"
} }
}, },
"node_modules/@types/superagent": {
"version": "8.1.9",
"resolved": "https://registry.npmjs.org/@types/superagent/-/superagent-8.1.9.tgz",
"integrity": "sha512-pTVjI73witn+9ILmoJdajHGW2jkSaOzhiFYF1Rd3EQ94kymLqB9PjD9ISg7WaALC7+dCHT0FGe9T2LktLq/3GQ==",
"dev": true,
"dependencies": {
"@types/cookiejar": "^2.1.5",
"@types/methods": "^1.1.4",
"@types/node": "*",
"form-data": "^4.0.0"
}
},
"node_modules/@types/supertest": {
"version": "6.0.3",
"resolved": "https://registry.npmjs.org/@types/supertest/-/supertest-6.0.3.tgz",
"integrity": "sha512-8WzXq62EXFhJ7QsH3Ocb/iKQ/Ty9ZVWnVzoTKc9tyyFRRF3a74Tk2+TLFgaFFw364Ere+npzHKEJ6ga2LzIL7w==",
"dev": true,
"dependencies": {
"@types/methods": "^1.1.4",
"@types/superagent": "^8.1.0"
}
},
"node_modules/@types/unist": { "node_modules/@types/unist": {
"version": "2.0.11", "version": "2.0.11",
"resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz",
@ -3520,6 +3575,12 @@
"integrity": "sha512-L0XlBwfx9QetHOsbLDrE/vh2t018w9462HM3iaFfxRiK83aJjAt/Ja3NMkOW7FICwWTlQBa3ZbL5FKhuQWkDrg==", "integrity": "sha512-L0XlBwfx9QetHOsbLDrE/vh2t018w9462HM3iaFfxRiK83aJjAt/Ja3NMkOW7FICwWTlQBa3ZbL5FKhuQWkDrg==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/asap": {
"version": "2.0.6",
"resolved": "https://registry.npmjs.org/asap/-/asap-2.0.6.tgz",
"integrity": "sha512-BSHWgDSAiKs50o2Re8ppvp3seVHXSRM44cdSsT9FfNEUUZLOGWVCsiWaRPWM1Znn+mqZ1OfVZ3z3DWEzSp7hRA==",
"dev": true
},
"node_modules/assertion-error": { "node_modules/assertion-error": {
"version": "2.0.1", "version": "2.0.1",
"resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-2.0.1.tgz", "resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-2.0.1.tgz",
@ -3747,6 +3808,22 @@
"node": ">= 0.4" "node": ">= 0.4"
} }
}, },
"node_modules/call-bound": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/call-bound/-/call-bound-1.0.4.tgz",
"integrity": "sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg==",
"dev": true,
"dependencies": {
"call-bind-apply-helpers": "^1.0.2",
"get-intrinsic": "^1.3.0"
},
"engines": {
"node": ">= 0.4"
},
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/callsites": { "node_modules/callsites": {
"version": "3.1.0", "version": "3.1.0",
"resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz",
@ -3995,6 +4072,15 @@
"node": ">=18" "node": ">=18"
} }
}, },
"node_modules/component-emitter": {
"version": "1.3.1",
"resolved": "https://registry.npmjs.org/component-emitter/-/component-emitter-1.3.1.tgz",
"integrity": "sha512-T0+barUSQRTUQASh8bx02dl+DhF54GtIDY13Y3m9oWTklKbb3Wv974meRpeZ3lp1JpLVECWWNHC4vaG2XHXouQ==",
"dev": true,
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/concat-map": { "node_modules/concat-map": {
"version": "0.0.1", "version": "0.0.1",
"resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
@ -4053,6 +4139,12 @@
"node": ">=18" "node": ">=18"
} }
}, },
"node_modules/cookiejar": {
"version": "2.1.4",
"resolved": "https://registry.npmjs.org/cookiejar/-/cookiejar-2.1.4.tgz",
"integrity": "sha512-LDx6oHrK+PhzLKJU9j5S7/Y3jM/mUHvD/DeI1WQmJn652iPC5Y4TBzC9l+5OMOXlyTTA+SmVUPm0HQUwpD5Jqw==",
"dev": true
},
"node_modules/cosmiconfig": { "node_modules/cosmiconfig": {
"version": "7.1.0", "version": "7.1.0",
"resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-7.1.0.tgz", "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-7.1.0.tgz",
@ -4236,6 +4328,16 @@
"url": "https://github.com/sponsors/wooorm" "url": "https://github.com/sponsors/wooorm"
} }
}, },
"node_modules/dezalgo": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/dezalgo/-/dezalgo-1.0.4.tgz",
"integrity": "sha512-rXSP0bf+5n0Qonsb+SVVfNfIsimO4HEtmnIpPHY8Q1UCzKlQrDMfdobr8nJOOsRgWCyMRqeSBQzmWUMq7zvVig==",
"dev": true,
"dependencies": {
"asap": "^2.0.0",
"wrappy": "1"
}
},
"node_modules/dom-accessibility-api": { "node_modules/dom-accessibility-api": {
"version": "0.5.16", "version": "0.5.16",
"resolved": "https://registry.npmjs.org/dom-accessibility-api/-/dom-accessibility-api-0.5.16.tgz", "resolved": "https://registry.npmjs.org/dom-accessibility-api/-/dom-accessibility-api-0.5.16.tgz",
@ -4697,6 +4799,12 @@
"integrity": "sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==", "integrity": "sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==",
"dev": true "dev": true
}, },
"node_modules/fast-safe-stringify": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/fast-safe-stringify/-/fast-safe-stringify-2.1.1.tgz",
"integrity": "sha512-W+KJc2dmILlPplD/H4K9l9LcAHAfPtP6BY84uVLXQ6Evcz9Lcg33Y2z1IVblT6xdY54PXYVHEv+0Wpq8Io6zkA==",
"dev": true
},
"node_modules/fastq": { "node_modules/fastq": {
"version": "1.19.1", "version": "1.19.1",
"resolved": "https://registry.npmjs.org/fastq/-/fastq-1.19.1.tgz", "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.19.1.tgz",
@ -4848,6 +4956,23 @@
"node": ">= 6" "node": ">= 6"
} }
}, },
"node_modules/formidable": {
"version": "3.5.4",
"resolved": "https://registry.npmjs.org/formidable/-/formidable-3.5.4.tgz",
"integrity": "sha512-YikH+7CUTOtP44ZTnUhR7Ic2UASBPOqmaRkRKxRbywPTe5VxF7RRCck4af9wutiZ/QKM5nME9Bie2fFaPz5Gug==",
"dev": true,
"dependencies": {
"@paralleldrive/cuid2": "^2.2.2",
"dezalgo": "^1.0.4",
"once": "^1.4.0"
},
"engines": {
"node": ">=14.0.0"
},
"funding": {
"url": "https://ko-fi.com/tunnckoCore/commissions"
}
},
"node_modules/fs-constants": { "node_modules/fs-constants": {
"version": "1.0.0", "version": "1.0.0",
"resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz", "resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz",
@ -5889,6 +6014,15 @@
"node": ">= 8" "node": ">= 8"
} }
}, },
"node_modules/methods": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz",
"integrity": "sha512-iclAHeNqNm68zFtnZ0e+1L2yUIdvzNoauKU4WBA3VvH/vPFieF7qfRlwUZU+DA9P9bPXIS90ulxoUoCH23sV2w==",
"dev": true,
"engines": {
"node": ">= 0.6"
}
},
"node_modules/micromark": { "node_modules/micromark": {
"version": "4.0.2", "version": "4.0.2",
"resolved": "https://registry.npmjs.org/micromark/-/micromark-4.0.2.tgz", "resolved": "https://registry.npmjs.org/micromark/-/micromark-4.0.2.tgz",
@ -6627,6 +6761,18 @@
"node": ">=0.10.0" "node": ">=0.10.0"
} }
}, },
"node_modules/object-inspect": {
"version": "1.13.4",
"resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.4.tgz",
"integrity": "sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew==",
"dev": true,
"engines": {
"node": ">= 0.4"
},
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/omggif": { "node_modules/omggif": {
"version": "1.0.10", "version": "1.0.10",
"resolved": "https://registry.npmjs.org/omggif/-/omggif-1.0.10.tgz", "resolved": "https://registry.npmjs.org/omggif/-/omggif-1.0.10.tgz",
@ -7168,6 +7314,21 @@
"node": ">=6" "node": ">=6"
} }
}, },
"node_modules/qs": {
"version": "6.14.0",
"resolved": "https://registry.npmjs.org/qs/-/qs-6.14.0.tgz",
"integrity": "sha512-YWWTjgABSKcvs/nWBi9PycY/JiPJqOD4JA6o9Sej2AtvSGarXxKC3OQSk4pAarbdQlKAh5D4FCQkJNkW+GAn3w==",
"dev": true,
"dependencies": {
"side-channel": "^1.1.0"
},
"engines": {
"node": ">=0.6"
},
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/queue-microtask": { "node_modules/queue-microtask": {
"version": "1.2.3", "version": "1.2.3",
"resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz",
@ -7734,6 +7895,78 @@
"url": "https://github.com/sponsors/ljharb" "url": "https://github.com/sponsors/ljharb"
} }
}, },
"node_modules/side-channel": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.1.0.tgz",
"integrity": "sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw==",
"dev": true,
"dependencies": {
"es-errors": "^1.3.0",
"object-inspect": "^1.13.3",
"side-channel-list": "^1.0.0",
"side-channel-map": "^1.0.1",
"side-channel-weakmap": "^1.0.2"
},
"engines": {
"node": ">= 0.4"
},
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/side-channel-list": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/side-channel-list/-/side-channel-list-1.0.0.tgz",
"integrity": "sha512-FCLHtRD/gnpCiCHEiJLOwdmFP+wzCmDEkc9y7NsYxeF4u7Btsn1ZuwgwJGxImImHicJArLP4R0yX4c2KCrMrTA==",
"dev": true,
"dependencies": {
"es-errors": "^1.3.0",
"object-inspect": "^1.13.3"
},
"engines": {
"node": ">= 0.4"
},
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/side-channel-map": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/side-channel-map/-/side-channel-map-1.0.1.tgz",
"integrity": "sha512-VCjCNfgMsby3tTdo02nbjtM/ewra6jPHmpThenkTYh8pG9ucZ/1P8So4u4FGBek/BjpOVsDCMoLA/iuBKIFXRA==",
"dev": true,
"dependencies": {
"call-bound": "^1.0.2",
"es-errors": "^1.3.0",
"get-intrinsic": "^1.2.5",
"object-inspect": "^1.13.3"
},
"engines": {
"node": ">= 0.4"
},
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/side-channel-weakmap": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/side-channel-weakmap/-/side-channel-weakmap-1.0.2.tgz",
"integrity": "sha512-WPS/HvHQTYnHisLo9McqBHOJk2FkHO/tlpvldyrnem4aeQp4hai3gythswg6p01oSoTl58rcpiFAjF2br2Ak2A==",
"dev": true,
"dependencies": {
"call-bound": "^1.0.2",
"es-errors": "^1.3.0",
"get-intrinsic": "^1.2.5",
"object-inspect": "^1.13.3",
"side-channel-map": "^1.0.1"
},
"engines": {
"node": ">= 0.4"
},
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/siginfo": { "node_modules/siginfo": {
"version": "2.0.0", "version": "2.0.0",
"resolved": "https://registry.npmjs.org/siginfo/-/siginfo-2.0.0.tgz", "resolved": "https://registry.npmjs.org/siginfo/-/siginfo-2.0.0.tgz",
@ -8022,6 +8255,51 @@
"resolved": "https://registry.npmjs.org/stylis/-/stylis-4.2.0.tgz", "resolved": "https://registry.npmjs.org/stylis/-/stylis-4.2.0.tgz",
"integrity": "sha512-Orov6g6BB1sDfYgzWfTHDOxamtX1bE/zo104Dh9e6fqJ3PooipYyfJ0pUmrZO2wAvO8YbEyeFrkV91XTsGMSrw==" "integrity": "sha512-Orov6g6BB1sDfYgzWfTHDOxamtX1bE/zo104Dh9e6fqJ3PooipYyfJ0pUmrZO2wAvO8YbEyeFrkV91XTsGMSrw=="
}, },
"node_modules/superagent": {
"version": "10.2.3",
"resolved": "https://registry.npmjs.org/superagent/-/superagent-10.2.3.tgz",
"integrity": "sha512-y/hkYGeXAj7wUMjxRbB21g/l6aAEituGXM9Rwl4o20+SX3e8YOSV6BxFXl+dL3Uk0mjSL3kCbNkwURm8/gEDig==",
"dev": true,
"dependencies": {
"component-emitter": "^1.3.1",
"cookiejar": "^2.1.4",
"debug": "^4.3.7",
"fast-safe-stringify": "^2.1.1",
"form-data": "^4.0.4",
"formidable": "^3.5.4",
"methods": "^1.1.2",
"mime": "2.6.0",
"qs": "^6.11.2"
},
"engines": {
"node": ">=14.18.0"
}
},
"node_modules/superagent/node_modules/mime": {
"version": "2.6.0",
"resolved": "https://registry.npmjs.org/mime/-/mime-2.6.0.tgz",
"integrity": "sha512-USPkMeET31rOMiarsBNIHZKLGgvKc/LrjofAnBlOttf5ajRvqiRA8QsenbcooctK6d6Ts6aqZXBA+XbkKthiQg==",
"dev": true,
"bin": {
"mime": "cli.js"
},
"engines": {
"node": ">=4.0.0"
}
},
"node_modules/supertest": {
"version": "7.1.4",
"resolved": "https://registry.npmjs.org/supertest/-/supertest-7.1.4.tgz",
"integrity": "sha512-tjLPs7dVyqgItVFirHYqe2T+MfWc2VOBQ8QFKKbWTA3PU7liZR8zoSpAi/C1k1ilm9RsXIKYf197oap9wXGVYg==",
"dev": true,
"dependencies": {
"methods": "^1.1.2",
"superagent": "^10.2.3"
},
"engines": {
"node": ">=14.18.0"
}
},
"node_modules/supports-color": { "node_modules/supports-color": {
"version": "7.2.0", "version": "7.2.0",
"resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
@ -8456,9 +8734,7 @@
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.12.0.tgz", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.12.0.tgz",
"integrity": "sha512-goOacqME2GYyOZZfb5Lgtu+1IDmAlAEu5xnD3+xTzS10hT0vzpf0SPjkXwAw9Jm+4n/mQGDP3LO8CPbYROeBfQ==", "integrity": "sha512-goOacqME2GYyOZZfb5Lgtu+1IDmAlAEu5xnD3+xTzS10hT0vzpf0SPjkXwAw9Jm+4n/mQGDP3LO8CPbYROeBfQ==",
"dev": true, "dev": true,
"license": "MIT", "license": "MIT"
"optional": true,
"peer": true
}, },
"node_modules/update-browserslist-db": { "node_modules/update-browserslist-db": {
"version": "1.1.3", "version": "1.1.3",

View File

@ -58,6 +58,7 @@
"@testing-library/user-event": "^14.6.1", "@testing-library/user-event": "^14.6.1",
"@types/react": "^19.1.10", "@types/react": "^19.1.10",
"@types/react-dom": "^19.1.7", "@types/react-dom": "^19.1.7",
"@types/supertest": "^6.0.3",
"@vitejs/plugin-react": "^5.0.0", "@vitejs/plugin-react": "^5.0.0",
"@vitest/coverage-v8": "^3.2.4", "@vitest/coverage-v8": "^3.2.4",
"concurrently": "^9.2.1", "concurrently": "^9.2.1",
@ -71,6 +72,7 @@
"markdownlint-cli": "^0.45.0", "markdownlint-cli": "^0.45.0",
"pdfjs-dist": "^4.8.69", "pdfjs-dist": "^4.8.69",
"prettier": "^3.6.2", "prettier": "^3.6.2",
"supertest": "^7.1.4",
"tesseract.js": "^5.1.0", "tesseract.js": "^5.1.0",
"typescript": "~5.8.3", "typescript": "~5.8.3",
"typescript-eslint": "^8.39.1", "typescript-eslint": "^8.39.1",

View File

@ -2,7 +2,7 @@ import { useEffect, useCallback, useRef } from 'react'
import './App.css' import './App.css'
import { AppRouter } from './router' import { AppRouter } from './router'
import { useAppDispatch, useAppSelector } from './store' import { useAppDispatch, useAppSelector } from './store'
import { loadFolderResults, setBootstrapped, setCurrentFolderHash, setPollingInterval, stopPolling, setCurrentFolderName } from './store/documentSlice' import { loadFolderResults, setBootstrapped, setCurrentFolderHash, setPollingInterval, stopPolling, setCurrentFolderName, createDefaultFolderThunk } from './store/documentSlice'
import { usePerformance } from './hooks/usePerformance' import { usePerformance } from './hooks/usePerformance'
import { useAccessibility } from './hooks/useAccessibility' import { useAccessibility } from './hooks/useAccessibility'
import './styles/accessibility.css' import './styles/accessibility.css'
@ -43,11 +43,12 @@ export default function App() {
dispatch(setCurrentFolderHash(urlFolderHash)) dispatch(setCurrentFolderHash(urlFolderHash))
} }
// Si aucun hash n'est disponible, utiliser le dossier par défaut demandé // Si aucun hash n'est disponible, demander le dossier par défaut au backend
if (!folderHash) { if (!folderHash) {
folderHash = '7d99a85daf66a0081a0e881630e6b39b' const res = await dispatch(createDefaultFolderThunk()).unwrap()
folderHash = res.folderHash
dispatch(setCurrentFolderHash(folderHash)) dispatch(setCurrentFolderHash(folderHash))
console.log('📌 [APP] Dossier par défaut appliqué:', folderHash) console.log('📌 [APP] Dossier par défaut créé/récupéré:', folderHash)
} }
// Charger les résultats du dossier // Charger les résultats du dossier

View File

@ -68,8 +68,8 @@ export const Layout: React.FC<LayoutProps> = ({ children }) => {
// Mémoriser la liste des documents pour éviter les re-renders inutiles // Mémoriser la liste des documents pour éviter les re-renders inutiles
const memoizedDocuments = useMemo(() => { const memoizedDocuments = useMemo(() => {
console.log(`📋 [LAYOUT] Recalcul de la liste des documents: ${documents.length}`) console.log(`📋 [LAYOUT] Recalcul de la liste des documents: ${documents?.length || 0}`)
return documents return documents || []
}, [documents]) }, [documents])
useEffect(() => { useEffect(() => {

View File

@ -127,6 +127,84 @@ export async function extractDocumentBackend(
status: result.status, status: result.status,
} }
// Deterministic de-duplication of the mapped result (identities, addresses, dates,
// companies, signatures, references). For every entity family the first occurrence
// of a key wins and the original order is preserved.

// Normalizes a free-text key part: a missing value counts as '', surrounding
// whitespace is dropped and case is ignored.
const normalizeText = (value: string | null | undefined): string => (value || '').trim().toLowerCase()

// Normalizes an identifier-like key part (postal code, SIRET): trimmed, case kept.
const normalizeCode = (value: string | null | undefined): string => (value || '').trim()

// Returns the items whose key has not been seen before, in input order.
// A missing (null/undefined) list is treated as empty so every entity family is
// guarded the same way.
// NOTE(review): items whose key parts are all empty share one key and are therefore
// collapsed into a single entry — confirm this is intended for partially filled entities.
const dedupeByKey = <T,>(items: readonly T[] | null | undefined, keyOf: (item: T) => string): T[] => {
  const seen = new Set<string>()
  const out: T[] = []
  for (const item of items ?? []) {
    const key = keyOf(item)
    if (!seen.has(key)) {
      seen.add(key)
      out.push(item)
    }
  }
  return out
}

// Identities: firstName + lastName.
const dedupIdentities = dedupeByKey(
  extractionResult.identities,
  (it) => `${normalizeText(it.firstName)}|${normalizeText(it.lastName)}`
)

// Addresses: street + postal code + city + country.
const dedupAddresses = dedupeByKey(
  extractionResult.addresses,
  (a) => `${normalizeText(a.street)}|${normalizeCode(a.postalCode)}|${normalizeText(a.city)}|${normalizeText(a.country)}`
)

// Dates: raw value, falling back to the formatted form; compared without normalization.
const dedupDates = dedupeByKey(extractionResult.dates, (d) => d.value || d.formatted || '')

// Companies: name + SIRET.
const dedupCompanies = dedupeByKey(
  extractionResult.companies,
  (c) => `${normalizeText(c.name)}|${normalizeCode(c.siret)}`
)

// Signatures: Set membership uses SameValueZero, so this removes duplicates only for
// primitive values or repeated references to the same object.
// NOTE(review): confirm the element type of `signatures`.
const dedupSignatures = Array.from(new Set(extractionResult.signatures || []))

// References: type + number.
const dedupReferences = dedupeByKey(
  extractionResult.references,
  (r) => `${normalizeText(r.type)}|${normalizeText(r.number)}`
)

// Write the de-duplicated lists back onto the result that is returned to the caller.
extractionResult.identities = dedupIdentities
extractionResult.addresses = dedupAddresses
extractionResult.dates = dedupDates
extractionResult.companies = dedupCompanies
extractionResult.signatures = dedupSignatures
extractionResult.references = dedupReferences
// Extraction terminée // Extraction terminée
console.log('🎉 [BACKEND] Extraction terminée avec succès:', { console.log('🎉 [BACKEND] Extraction terminée avec succès:', {

View File

@ -207,18 +207,56 @@ export function runRuleNER(documentId: string, text: string): ExtractionResult {
console.log('📊 [RULE-NER] Confiance calculée:', confidence) console.log('📊 [RULE-NER] Confiance calculée:', confidence)
console.log('📝 [RULE-NER] Raisons:', reasons) console.log('📝 [RULE-NER] Raisons:', reasons)
// Deterministic de-duplication: keep the first identity seen for each
// "firstName|lastName" key (parts trimmed and lower-cased, a missing part counts as '').
const uniqueIdentities = ((): Identity[] => {
  const firstByKey = new Map<string, Identity>()
  for (const person of identities) {
    const firstName = (person.firstName || '').trim().toLowerCase()
    const lastName = (person.lastName || '').trim().toLowerCase()
    const key = `${firstName}|${lastName}`
    if (firstByKey.has(key)) continue
    firstByKey.set(key, person)
  }
  // A Map iterates in insertion order, so the original ordering is preserved.
  return Array.from(firstByKey.values())
})()
// Keep the first address seen for each "street|postalCode|city|country" key.
// Text parts are trimmed and lower-cased; the postal code is only trimmed.
const uniqueAddresses = ((): Address[] => {
  const seenKeys = new Set<string>()
  const kept: Address[] = []
  for (const address of addresses) {
    const key = [
      (address.street || '').trim().toLowerCase(),
      (address.postalCode || '').trim(),
      (address.city || '').trim().toLowerCase(),
      (address.country || '').trim().toLowerCase(),
    ].join('|')
    // The set only grows when the key is new, i.e. on a first occurrence.
    const sizeBefore = seenKeys.size
    seenKeys.add(key)
    if (seenKeys.size !== sizeBefore) {
      kept.push(address)
    }
  }
  return kept
})()
// Collapse exact-duplicate date values (first occurrence order is kept by the Set),
// then wrap each remaining value in a date entity with a positional id.
// NOTE(review): assumes `dates` holds primitive strings so that Set equality
// matches textual equality — confirm against the extraction code.
const uniqueDates = Array.from(new Set(dates), (value, position) => ({
  id: `date-${position}`,
  type: 'date',
  value,
  formatted: value,
  confidence: 0.8,
  source: 'rule-ner'
}))
console.log('📅 [RULE-NER] Dates uniques après déduplication:', uniqueDates.length)
const result = { const result = {
documentId, documentId,
text, text,
language: 'fr', language: 'fr',
documentType, documentType,
identities, identities: uniqueIdentities,
addresses, addresses: uniqueAddresses,
properties, properties,
contracts, contracts,
signatures: [], signatures: [],
confidence, confidence,
confidenceReasons: reasons, confidenceReasons: reasons,
dates: uniqueDates,
} }
console.log('✅ [RULE-NER] Résultat final:', result) console.log('✅ [RULE-NER] Résultat final:', result)

View File

@ -0,0 +1,21 @@
import { describe, it, expect } from 'vitest'
import { runRuleNER } from '../src/services/ruleNer'
describe('Déduplication des entités', () => {
  it('élimine les doublons d\'identités et d\'adresses', () => {
    // The same person and the same address appear twice (the address with a
    // different letter case) so that de-duplication has something to remove.
    const text = `
Vendeur : Jean Dupont\n
Acheteur : Jean Dupont\n
Domicilié 1 rue de la Paix, 75001 Paris\n
Adresse 1 Rue de la Paix, 75001 Paris
`
    const { identities, addresses } = runRuleNER('doc-1', text)

    // Rebuild the case-insensitive keys; any duplicate left in the result would
    // make the set smaller than the corresponding list.
    const distinctIdentityKeys = new Set<string>()
    for (const person of identities) {
      distinctIdentityKeys.add(`${(person.firstName||'').toLowerCase()}|${(person.lastName||'').toLowerCase()}`)
    }
    const distinctAddressKeys = new Set<string>()
    for (const address of addresses) {
      distinctAddressKeys.add(`${(address.street||'').toLowerCase()}|${address.postalCode}|${(address.city||'').toLowerCase()}|${(address.country||'').toLowerCase()}`)
    }

    expect(identities.length).toBe(distinctIdentityKeys.size)
    expect(addresses.length).toBe(distinctAddressKeys.size)
  })
})

View File

@ -9,7 +9,16 @@ beforeEach(async () => {
const enhancedOcrModule = await import('../backend/enhancedOcr.js') const enhancedOcrModule = await import('../backend/enhancedOcr.js')
const serverModule = await import('../backend/server.js') const serverModule = await import('../backend/server.js')
extractTextFromImageEnhanced = enhancedOcrModule.extractTextFromImageEnhanced extractTextFromImageEnhanced = enhancedOcrModule.extractTextFromImageEnhanced
extractEntitiesFromText = serverModule.extractEntitiesFromText // La fonction extractEntitiesFromText n'est pas exportée, on utilise runRuleNER à la place
const { runRuleNER } = await import('../src/services/ruleNer.ts')
extractEntitiesFromText = (text) => {
const result = runRuleNER('test-doc', text)
return {
persons: result.identities,
addresses: result.addresses,
companies: result.companies || []
}
}
}) })
describe('OCR et extraction de texte', () => { describe('OCR et extraction de texte', () => {

View File

@ -11,6 +11,13 @@ export default defineConfig({
hmr: { hmr: {
port: 5174, port: 5174,
}, },
proxy: {
'/api': {
target: 'http://localhost:3001',
changeOrigin: true,
secure: false,
},
},
}, },
optimizeDeps: { optimizeDeps: {
include: ['react', 'react-dom'], include: ['react', 'react-dom'],