root 5d8ad901d1 Initial commit: Pipeline notarial complet
- Infrastructure complète de traitement de documents notariaux
- API FastAPI d'ingestion et d'orchestration
- Pipelines Celery pour le traitement asynchrone
- Support des formats PDF, JPEG, PNG, TIFF, HEIC
- OCR avec Tesseract et correction lexicale
- Classification automatique des documents avec Ollama
- Extraction de données structurées
- Indexation dans AnythingLLM et OpenSearch
- Système de vérifications et contrôles métier
- Base de données PostgreSQL pour le métier
- Stockage objet avec MinIO
- Base de données graphe Neo4j
- Recherche plein-texte avec OpenSearch
- Supervision avec Prometheus et Grafana
- Scripts d'installation pour Debian
- Documentation complète
- Tests unitaires et de performance
- Service systemd pour le déploiement
- Scripts de déploiement automatisés
2025-09-08 22:05:22 +02:00

356 lines
11 KiB
Python

"""
Pipeline de vérifications et contrôles métier
"""
import os
import logging
from typing import Dict, Any, List
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def run(doc_id: str, ctx: dict):
    """
    Run the business checks for one document and record the outcome in ``ctx``.

    Reads ``classification``, ``extracted_data`` and ``ocr_meta`` from ``ctx``
    (a missing key or an explicit ``None`` is treated as an empty dict), runs
    the OCR, classification, type-specific and consistency checks, then writes
    ``checks_results``, ``overall_status`` and ``checks_meta`` back into
    ``ctx``.

    Args:
        doc_id: Identifier of the document; only used in log messages.
        ctx: Mutable pipeline context, updated in place.

    Raises:
        Exception: Any error raised while checking is logged together with
            its traceback and re-raised unchanged.
    """
    logger.info(f"Vérifications du document {doc_id}")
    try:
        # ``or {}`` also covers keys that are present but hold None, which
        # ``ctx.get(key, {})`` would hand straight to the checks.
        classification = ctx.get("classification") or {}
        extracted_data = ctx.get("extracted_data") or {}
        ocr_meta = ctx.get("ocr_meta") or {}

        # Generic checks first, then those tied to the classified type,
        # and finally the cross-field consistency check.
        checks_results = [
            _check_ocr_quality(ocr_meta),
            _check_classification(classification),
        ]
        checks_results.extend(
            _check_document_type(classification.get("label", ""), extracted_data)
        )
        checks_results.append(_check_data_consistency(extracted_data))

        overall_status = _determine_overall_status(checks_results)

        ctx["checks_results"] = checks_results
        ctx["overall_status"] = overall_status

        # Summary counters derived from the individual check statuses.
        statuses = [check["status"] for check in checks_results]
        ctx["checks_meta"] = {
            "checks_completed": True,
            "total_checks": len(checks_results),
            "passed_checks": statuses.count("passed"),
            "failed_checks": statuses.count("failed"),
            "warnings": statuses.count("warning"),
            "overall_status": overall_status,
        }
        logger.info(f"Vérifications terminées pour le document {doc_id}: {overall_status}")
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(f"Erreur lors des vérifications du document {doc_id}: {e}")
        raise
def _check_ocr_quality(ocr_meta: Dict[str, Any]) -> Dict[str, Any]:
"""
Vérification de la qualité OCR
"""
confidence = ocr_meta.get("confidence", 0.0)
text_length = ocr_meta.get("text_length", 0)
if confidence >= 0.8:
status = "passed"
message = f"Qualité OCR excellente (confiance: {confidence:.2f})"
elif confidence >= 0.6:
status = "warning"
message = f"Qualité OCR acceptable (confiance: {confidence:.2f})"
else:
status = "failed"
message = f"Qualité OCR insuffisante (confiance: {confidence:.2f})"
if text_length < 100:
status = "failed"
message += " - Texte trop court"
return {
"check_name": "ocr_quality",
"status": status,
"message": message,
"details": {
"confidence": confidence,
"text_length": text_length
}
}
def _check_classification(classification: Dict[str, Any]) -> Dict[str, Any]:
"""
Vérification de la classification
"""
confidence = classification.get("confidence", 0.0)
label = classification.get("label", "document_inconnu")
if confidence >= 0.8:
status = "passed"
message = f"Classification fiable ({label}, confiance: {confidence:.2f})"
elif confidence >= 0.6:
status = "warning"
message = f"Classification incertaine ({label}, confiance: {confidence:.2f})"
else:
status = "failed"
message = f"Classification non fiable ({label}, confiance: {confidence:.2f})"
if label == "document_inconnu":
status = "warning"
message = "Type de document non identifié"
return {
"check_name": "classification",
"status": status,
"message": message,
"details": {
"label": label,
"confidence": confidence
}
}
def _check_document_type(document_type: str, extracted_data: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Vérifications spécifiques au type de document
"""
checks = []
if document_type == "acte_vente":
checks.extend(_check_vente_requirements(extracted_data))
elif document_type == "acte_achat":
checks.extend(_check_achat_requirements(extracted_data))
elif document_type == "donation":
checks.extend(_check_donation_requirements(extracted_data))
elif document_type == "testament":
checks.extend(_check_testament_requirements(extracted_data))
elif document_type == "succession":
checks.extend(_check_succession_requirements(extracted_data))
return checks
def _check_vente_requirements(data: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Vérifications pour un acte de vente
"""
checks = []
# Vérification des champs obligatoires
required_fields = ["vendeur", "acheteur", "prix", "bien"]
for field in required_fields:
if not data.get(field):
checks.append({
"check_name": f"vente_{field}_present",
"status": "failed",
"message": f"Champ obligatoire manquant: {field}",
"details": {"field": field}
})
else:
checks.append({
"check_name": f"vente_{field}_present",
"status": "passed",
"message": f"Champ {field} présent",
"details": {"field": field, "value": data[field]}
})
# Vérification du prix
prix = data.get("prix", "")
if prix and not _is_valid_amount(prix):
checks.append({
"check_name": "vente_prix_format",
"status": "warning",
"message": f"Format de prix suspect: {prix}",
"details": {"prix": prix}
})
return checks
def _check_achat_requirements(data: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Vérifications pour un acte d'achat
"""
checks = []
# Vérification des champs obligatoires
required_fields = ["vendeur", "acheteur", "prix", "bien"]
for field in required_fields:
if not data.get(field):
checks.append({
"check_name": f"achat_{field}_present",
"status": "failed",
"message": f"Champ obligatoire manquant: {field}",
"details": {"field": field}
})
else:
checks.append({
"check_name": f"achat_{field}_present",
"status": "passed",
"message": f"Champ {field} présent",
"details": {"field": field, "value": data[field]}
})
return checks
def _check_donation_requirements(data: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Vérifications pour une donation
"""
checks = []
# Vérification des champs obligatoires
required_fields = ["donateur", "donataire", "bien_donne"]
for field in required_fields:
if not data.get(field):
checks.append({
"check_name": f"donation_{field}_present",
"status": "failed",
"message": f"Champ obligatoire manquant: {field}",
"details": {"field": field}
})
else:
checks.append({
"check_name": f"donation_{field}_present",
"status": "passed",
"message": f"Champ {field} présent",
"details": {"field": field, "value": data[field]}
})
return checks
def _check_testament_requirements(data: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Vérifications pour un testament
"""
checks = []
# Vérification des champs obligatoires
required_fields = ["testateur"]
for field in required_fields:
if not data.get(field):
checks.append({
"check_name": f"testament_{field}_present",
"status": "failed",
"message": f"Champ obligatoire manquant: {field}",
"details": {"field": field}
})
else:
checks.append({
"check_name": f"testament_{field}_present",
"status": "passed",
"message": f"Champ {field} présent",
"details": {"field": field, "value": data[field]}
})
return checks
def _check_succession_requirements(data: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Vérifications pour une succession
"""
checks = []
# Vérification des champs obligatoires
required_fields = ["defunt"]
for field in required_fields:
if not data.get(field):
checks.append({
"check_name": f"succession_{field}_present",
"status": "failed",
"message": f"Champ obligatoire manquant: {field}",
"details": {"field": field}
})
else:
checks.append({
"check_name": f"succession_{field}_present",
"status": "passed",
"message": f"Champ {field} présent",
"details": {"field": field, "value": data[field]}
})
return checks
def _check_data_consistency(data: Dict[str, Any]) -> Dict[str, Any]:
"""
Vérification de la cohérence des données
"""
issues = []
# Vérification des dates
dates = data.get("dates", [])
for date in dates:
if not _is_valid_date(date):
issues.append(f"Date invalide: {date}")
# Vérification des montants
montants = data.get("montants", [])
for montant in montants:
if not _is_valid_amount(montant):
issues.append(f"Montant invalide: {montant}")
if issues:
return {
"check_name": "data_consistency",
"status": "warning",
"message": f"Cohérence des données: {len(issues)} problème(s) détecté(s)",
"details": {"issues": issues}
}
else:
return {
"check_name": "data_consistency",
"status": "passed",
"message": "Données cohérentes",
"details": {}
}
def _determine_overall_status(checks_results: List[Dict[str, Any]]) -> str:
"""
Détermination du statut global
"""
failed_checks = sum(1 for check in checks_results if check["status"] == "failed")
warning_checks = sum(1 for check in checks_results if check["status"] == "warning")
if failed_checks > 0:
return "manual_review"
elif warning_checks > 2:
return "manual_review"
else:
return "completed"
def _is_valid_date(date_str: str) -> bool:
"""
Validation d'une date
"""
import re
# Format DD/MM/YYYY ou DD-MM-YYYY
pattern = r'^\d{1,2}[/-]\d{1,2}[/-]\d{2,4}$'
return bool(re.match(pattern, date_str))
def _is_valid_amount(amount_str: str) -> bool:
"""
Validation d'un montant
"""
import re
# Format avec euros
pattern = r'^\d{1,3}(?:\s\d{3})*(?:[.,]\d{2})?\s*€?$'
return bool(re.match(pattern, amount_str))