
- Infrastructure complète de traitement de documents notariaux - API FastAPI d'ingestion et d'orchestration - Pipelines Celery pour le traitement asynchrone - Support des formats PDF, JPEG, PNG, TIFF, HEIC - OCR avec Tesseract et correction lexicale - Classification automatique des documents avec Ollama - Extraction de données structurées - Indexation dans AnythingLLM et OpenSearch - Système de vérifications et contrôles métier - Base de données PostgreSQL pour le métier - Stockage objet avec MinIO - Base de données graphe Neo4j - Recherche plein-texte avec OpenSearch - Supervision avec Prometheus et Grafana - Scripts d'installation pour Debian - Documentation complète - Tests unitaires et de performance - Service systemd pour le déploiement - Scripts de déploiement automatisés
356 lines
11 KiB
Python
356 lines
11 KiB
Python
"""
|
|
Pipeline de vérifications et contrôles métier
|
|
"""
|
|
import os
|
|
import logging
|
|
from typing import Dict, Any, List
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
def run(doc_id: str, ctx: dict):
    """
    Run every business check for one document and record the outcome in ``ctx``.

    Reads ``classification``, ``extracted_data`` and ``ocr_meta`` from ``ctx``
    (each defaults to an empty dict when the key is absent), then runs, in
    order: the OCR-quality check, the classification check, the checks tied to
    the classification label, and the data-consistency check.

    On success three keys are written to ``ctx``:
      * ``checks_results``: the list of individual check dicts,
      * ``overall_status``: the value returned by ``_determine_overall_status``,
      * ``checks_meta``: counters per status plus the overall status.

    Nothing is returned. Any exception is logged and re-raised unchanged.
    """
    logger.info(f"Vérifications du document {doc_id}")

    try:
        # Inputs read from the shared context
        classification = ctx.get("classification", {})
        extracted_data = ctx.get("extracted_data", {})
        ocr_meta = ctx.get("ocr_meta", {})

        # Generic checks first, then the label-specific ones, then consistency
        results = [
            _check_ocr_quality(ocr_meta),
            _check_classification(classification),
        ]
        results.extend(
            _check_document_type(classification.get("label", ""), extracted_data)
        )
        results.append(_check_data_consistency(extracted_data))

        final_status = _determine_overall_status(results)

        # Publish the results before building the summary
        ctx["checks_results"] = results
        ctx["overall_status"] = final_status

        statuses = [entry["status"] for entry in results]
        ctx["checks_meta"] = {
            "checks_completed": True,
            "total_checks": len(results),
            "passed_checks": statuses.count("passed"),
            "failed_checks": statuses.count("failed"),
            "warnings": statuses.count("warning"),
            "overall_status": final_status,
        }

        logger.info(f"Vérifications terminées pour le document {doc_id}: {final_status}")

    except Exception as e:
        logger.error(f"Erreur lors des vérifications du document {doc_id}: {e}")
        raise
|
|
|
|
def _check_ocr_quality(ocr_meta: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""
|
|
Vérification de la qualité OCR
|
|
"""
|
|
confidence = ocr_meta.get("confidence", 0.0)
|
|
text_length = ocr_meta.get("text_length", 0)
|
|
|
|
if confidence >= 0.8:
|
|
status = "passed"
|
|
message = f"Qualité OCR excellente (confiance: {confidence:.2f})"
|
|
elif confidence >= 0.6:
|
|
status = "warning"
|
|
message = f"Qualité OCR acceptable (confiance: {confidence:.2f})"
|
|
else:
|
|
status = "failed"
|
|
message = f"Qualité OCR insuffisante (confiance: {confidence:.2f})"
|
|
|
|
if text_length < 100:
|
|
status = "failed"
|
|
message += " - Texte trop court"
|
|
|
|
return {
|
|
"check_name": "ocr_quality",
|
|
"status": status,
|
|
"message": message,
|
|
"details": {
|
|
"confidence": confidence,
|
|
"text_length": text_length
|
|
}
|
|
}
|
|
|
|
def _check_classification(classification: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""
|
|
Vérification de la classification
|
|
"""
|
|
confidence = classification.get("confidence", 0.0)
|
|
label = classification.get("label", "document_inconnu")
|
|
|
|
if confidence >= 0.8:
|
|
status = "passed"
|
|
message = f"Classification fiable ({label}, confiance: {confidence:.2f})"
|
|
elif confidence >= 0.6:
|
|
status = "warning"
|
|
message = f"Classification incertaine ({label}, confiance: {confidence:.2f})"
|
|
else:
|
|
status = "failed"
|
|
message = f"Classification non fiable ({label}, confiance: {confidence:.2f})"
|
|
|
|
if label == "document_inconnu":
|
|
status = "warning"
|
|
message = "Type de document non identifié"
|
|
|
|
return {
|
|
"check_name": "classification",
|
|
"status": status,
|
|
"message": message,
|
|
"details": {
|
|
"label": label,
|
|
"confidence": confidence
|
|
}
|
|
}
|
|
|
|
def _check_document_type(document_type: str, extracted_data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""
|
|
Vérifications spécifiques au type de document
|
|
"""
|
|
checks = []
|
|
|
|
if document_type == "acte_vente":
|
|
checks.extend(_check_vente_requirements(extracted_data))
|
|
elif document_type == "acte_achat":
|
|
checks.extend(_check_achat_requirements(extracted_data))
|
|
elif document_type == "donation":
|
|
checks.extend(_check_donation_requirements(extracted_data))
|
|
elif document_type == "testament":
|
|
checks.extend(_check_testament_requirements(extracted_data))
|
|
elif document_type == "succession":
|
|
checks.extend(_check_succession_requirements(extracted_data))
|
|
|
|
return checks
|
|
|
|
def _check_vente_requirements(data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""
|
|
Vérifications pour un acte de vente
|
|
"""
|
|
checks = []
|
|
|
|
# Vérification des champs obligatoires
|
|
required_fields = ["vendeur", "acheteur", "prix", "bien"]
|
|
|
|
for field in required_fields:
|
|
if not data.get(field):
|
|
checks.append({
|
|
"check_name": f"vente_{field}_present",
|
|
"status": "failed",
|
|
"message": f"Champ obligatoire manquant: {field}",
|
|
"details": {"field": field}
|
|
})
|
|
else:
|
|
checks.append({
|
|
"check_name": f"vente_{field}_present",
|
|
"status": "passed",
|
|
"message": f"Champ {field} présent",
|
|
"details": {"field": field, "value": data[field]}
|
|
})
|
|
|
|
# Vérification du prix
|
|
prix = data.get("prix", "")
|
|
if prix and not _is_valid_amount(prix):
|
|
checks.append({
|
|
"check_name": "vente_prix_format",
|
|
"status": "warning",
|
|
"message": f"Format de prix suspect: {prix}",
|
|
"details": {"prix": prix}
|
|
})
|
|
|
|
return checks
|
|
|
|
def _check_achat_requirements(data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""
|
|
Vérifications pour un acte d'achat
|
|
"""
|
|
checks = []
|
|
|
|
# Vérification des champs obligatoires
|
|
required_fields = ["vendeur", "acheteur", "prix", "bien"]
|
|
|
|
for field in required_fields:
|
|
if not data.get(field):
|
|
checks.append({
|
|
"check_name": f"achat_{field}_present",
|
|
"status": "failed",
|
|
"message": f"Champ obligatoire manquant: {field}",
|
|
"details": {"field": field}
|
|
})
|
|
else:
|
|
checks.append({
|
|
"check_name": f"achat_{field}_present",
|
|
"status": "passed",
|
|
"message": f"Champ {field} présent",
|
|
"details": {"field": field, "value": data[field]}
|
|
})
|
|
|
|
return checks
|
|
|
|
def _check_donation_requirements(data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""
|
|
Vérifications pour une donation
|
|
"""
|
|
checks = []
|
|
|
|
# Vérification des champs obligatoires
|
|
required_fields = ["donateur", "donataire", "bien_donne"]
|
|
|
|
for field in required_fields:
|
|
if not data.get(field):
|
|
checks.append({
|
|
"check_name": f"donation_{field}_present",
|
|
"status": "failed",
|
|
"message": f"Champ obligatoire manquant: {field}",
|
|
"details": {"field": field}
|
|
})
|
|
else:
|
|
checks.append({
|
|
"check_name": f"donation_{field}_present",
|
|
"status": "passed",
|
|
"message": f"Champ {field} présent",
|
|
"details": {"field": field, "value": data[field]}
|
|
})
|
|
|
|
return checks
|
|
|
|
def _check_testament_requirements(data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""
|
|
Vérifications pour un testament
|
|
"""
|
|
checks = []
|
|
|
|
# Vérification des champs obligatoires
|
|
required_fields = ["testateur"]
|
|
|
|
for field in required_fields:
|
|
if not data.get(field):
|
|
checks.append({
|
|
"check_name": f"testament_{field}_present",
|
|
"status": "failed",
|
|
"message": f"Champ obligatoire manquant: {field}",
|
|
"details": {"field": field}
|
|
})
|
|
else:
|
|
checks.append({
|
|
"check_name": f"testament_{field}_present",
|
|
"status": "passed",
|
|
"message": f"Champ {field} présent",
|
|
"details": {"field": field, "value": data[field]}
|
|
})
|
|
|
|
return checks
|
|
|
|
def _check_succession_requirements(data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""
|
|
Vérifications pour une succession
|
|
"""
|
|
checks = []
|
|
|
|
# Vérification des champs obligatoires
|
|
required_fields = ["defunt"]
|
|
|
|
for field in required_fields:
|
|
if not data.get(field):
|
|
checks.append({
|
|
"check_name": f"succession_{field}_present",
|
|
"status": "failed",
|
|
"message": f"Champ obligatoire manquant: {field}",
|
|
"details": {"field": field}
|
|
})
|
|
else:
|
|
checks.append({
|
|
"check_name": f"succession_{field}_present",
|
|
"status": "passed",
|
|
"message": f"Champ {field} présent",
|
|
"details": {"field": field, "value": data[field]}
|
|
})
|
|
|
|
return checks
|
|
|
|
def _check_data_consistency(data: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""
|
|
Vérification de la cohérence des données
|
|
"""
|
|
issues = []
|
|
|
|
# Vérification des dates
|
|
dates = data.get("dates", [])
|
|
for date in dates:
|
|
if not _is_valid_date(date):
|
|
issues.append(f"Date invalide: {date}")
|
|
|
|
# Vérification des montants
|
|
montants = data.get("montants", [])
|
|
for montant in montants:
|
|
if not _is_valid_amount(montant):
|
|
issues.append(f"Montant invalide: {montant}")
|
|
|
|
if issues:
|
|
return {
|
|
"check_name": "data_consistency",
|
|
"status": "warning",
|
|
"message": f"Cohérence des données: {len(issues)} problème(s) détecté(s)",
|
|
"details": {"issues": issues}
|
|
}
|
|
else:
|
|
return {
|
|
"check_name": "data_consistency",
|
|
"status": "passed",
|
|
"message": "Données cohérentes",
|
|
"details": {}
|
|
}
|
|
|
|
def _determine_overall_status(checks_results: List[Dict[str, Any]]) -> str:
|
|
"""
|
|
Détermination du statut global
|
|
"""
|
|
failed_checks = sum(1 for check in checks_results if check["status"] == "failed")
|
|
warning_checks = sum(1 for check in checks_results if check["status"] == "warning")
|
|
|
|
if failed_checks > 0:
|
|
return "manual_review"
|
|
elif warning_checks > 2:
|
|
return "manual_review"
|
|
else:
|
|
return "completed"
|
|
|
|
def _is_valid_date(date_str: str) -> bool:
|
|
"""
|
|
Validation d'une date
|
|
"""
|
|
import re
|
|
# Format DD/MM/YYYY ou DD-MM-YYYY
|
|
pattern = r'^\d{1,2}[/-]\d{1,2}[/-]\d{2,4}$'
|
|
return bool(re.match(pattern, date_str))
|
|
|
|
def _is_valid_amount(amount_str: str) -> bool:
|
|
"""
|
|
Validation d'un montant
|
|
"""
|
|
import re
|
|
# Format avec euros
|
|
pattern = r'^\d{1,3}(?:\s\d{3})*(?:[.,]\d{2})?\s*€?$'
|
|
return bool(re.match(pattern, amount_str))
|