
"""
|
|
Worker Celery pour l'orchestration des pipelines de traitement
|
|
"""
|
|
|
|
import os
|
|
import logging
|
|
from celery import Celery
|
|
from typing import Dict, Any
|
|
import traceback
|
|
|
|
# Configuration du logging
|
|
logging.basicConfig(level=logging.INFO)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Configuration Celery
|
|
redis_url = os.getenv("REDIS_URL", "redis://localhost:6379/0")
|
|
app = Celery('worker', broker=redis_url, backend=redis_url)
|
|
|
|
# Configuration des tâches
|
|
app.conf.update(
|
|
task_serializer='json',
|
|
accept_content=['json'],
|
|
result_serializer='json',
|
|
timezone='Europe/Paris',
|
|
enable_utc=True,
|
|
task_track_started=True,
|
|
task_time_limit=30 * 60, # 30 minutes
|
|
task_soft_time_limit=25 * 60, # 25 minutes
|
|
worker_prefetch_multiplier=1,
|
|
worker_max_tasks_per_child=1000,
|
|
)
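# Note: when task_soft_time_limit expires, Celery raises SoftTimeLimitExceeded
# inside the running task, leaving a 5-minute window before task_time_limit
# terminates it outright.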

# Pipeline step imports
from pipelines import preprocess, ocr, classify, extract, index, checks, finalize
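# Each pipeline module is expected to expose a run(doc_id, ctx) callable; the
# shared ctx dict is handed to every step (see the orchestration loop below).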


@app.task(bind=True, name='pipeline.process_document')
def process_document(self, doc_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
    """
    Main orchestration task for the document-processing pipeline.

    Args:
        doc_id: Document identifier
        metadata: Document metadata

    Returns:
        Processing result
    """
    logger.info(f"🚀 Starting processing of document {doc_id}")

    # Context shared by all pipeline steps
    ctx = {
        "doc_id": doc_id,
        "metadata": metadata,
        "task_id": self.request.id,
        "start_time": self.request.get("start_time"),  # None unless the caller supplies it
        "steps_completed": [],
        "steps_failed": []
    }

    try:
        # Status update
        self.update_state(
            state='PROGRESS',
            meta={'step': 'initialization', 'progress': 0}
        )
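        # PROGRESS is a custom state: clients polling AsyncResult(task_id) see it
        # in .state and the meta dict in .info while the task is running.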

        # Processing pipeline: (step name, callable, progress percentage)
        pipeline_steps = [
            ("preprocess", preprocess.run, 10),
            ("ocr", ocr.run, 30),
            ("classify", classify.run, 50),
            ("extract", extract.run, 70),
            ("index", index.run, 85),
            ("checks", checks.run, 95),
            ("finalize", finalize.run, 100)
        ]
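        # Step order matters: each step presumably builds on what the previous
        # ones left in ctx or in the document's working directory.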

        for step_name, step_func, progress in pipeline_steps:
            try:
                logger.info(f"📋 Running step: {step_name}")

                # Status update
                self.update_state(
                    state='PROGRESS',
                    meta={
                        'step': step_name,
                        'progress': progress,
                        'doc_id': doc_id
                    }
                )

                # Run the step
                step_func(doc_id, ctx)
                ctx["steps_completed"].append(step_name)

                logger.info(f"✅ Step {step_name} completed successfully")

            except Exception as e:
                error_msg = f"Error in step {step_name}: {str(e)}"
                logger.error(f"❌ {error_msg}")
                logger.error(traceback.format_exc())

                ctx["steps_failed"].append({
                    "step": step_name,
                    "error": str(e),
                    "traceback": traceback.format_exc()
                })

                # Abort if the failing step is critical
                if step_name in ["preprocess", "ocr"]:
                    raise

                # Otherwise, carry on with the remaining steps
                logger.warning(f"⚠️ Continuing despite the error in {step_name}")

        # Processing finished successfully
        result = {
            "status": "completed",
            "doc_id": doc_id,
            "steps_completed": ctx["steps_completed"],
            "steps_failed": ctx["steps_failed"],
            "final_context": ctx
        }

        logger.info(f"🎉 Processing completed successfully for {doc_id}")
        return result

    except Exception as e:
        error_msg = f"Critical error while processing {doc_id}: {str(e)}"
        logger.error(f"💥 {error_msg}")
        logger.error(traceback.format_exc())

        # Report the failure state
        self.update_state(
            state='FAILURE',
            meta={
                'error': str(e),
                'traceback': traceback.format_exc(),
                'doc_id': doc_id,
                'steps_completed': ctx.get("steps_completed", []),
                'steps_failed': ctx.get("steps_failed", [])
            }
        )

        return {
            "status": "failed",
            "doc_id": doc_id,
            "error": str(e),
            "traceback": traceback.format_exc(),
            "steps_completed": ctx.get("steps_completed", []),
            "steps_failed": ctx.get("steps_failed", [])
        }
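
        # Note: because this except handler returns normally, Celery records the
        # task's final state as SUCCESS with this dict as its result; the FAILURE
        # state set above is transient, so callers should inspect the returned
        # "status" field rather than the Celery task state.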


@app.task(name='pipeline.health_check')
def health_check() -> Dict[str, Any]:
    """Worker health check"""
    return {
        "status": "healthy",
        "worker": "notariat-worker",
        "version": "1.0.0"
    }


@app.task(name='pipeline.get_stats')
def get_stats() -> Dict[str, Any]:
    """Return worker statistics"""
    try:
        # Task counters
        stats = {
            "total_tasks": 0,
            "completed_tasks": 0,
            "failed_tasks": 0,
            "active_tasks": 0
        }

        # Query the workers over the Redis broker via Celery's inspect API
        from celery import current_app
        inspect = current_app.control.inspect()
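        # inspect.active() / inspect.reserved() broadcast to all workers and
        # return None when no worker replies, hence the guards below.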

        # Active tasks
        active = inspect.active()
        if active:
            stats["active_tasks"] = sum(len(tasks) for tasks in active.values())

        # Reserved (prefetched but not yet started) tasks
        reserved = inspect.reserved()
        if reserved:
            stats["reserved_tasks"] = sum(len(tasks) for tasks in reserved.values())

        return stats

    except Exception as e:
        logger.error(f"Error while retrieving statistics: {e}")
        return {"error": str(e)}


@app.task(name='pipeline.cleanup')
def cleanup(doc_id: str) -> Dict[str, Any]:
    """Remove a document's temporary files"""
    logger.info(f"🧹 Cleaning up temporary files for {doc_id}")

    try:
        work_base = os.getenv("WORK_DIR", "/tmp/processing")
        work_dir = os.path.join(work_base, doc_id)
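        # Assumption: WORK_DIR points to the same base directory the pipeline
        # steps write their intermediate files into (one sub-directory per doc_id).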

        if os.path.exists(work_dir):
            import shutil
            shutil.rmtree(work_dir)
            logger.info(f"✅ Removed directory {work_dir}")

        return {
            "status": "cleaned",
            "doc_id": doc_id,
            "work_dir": work_dir
        }

    except Exception as e:
        logger.error(f"❌ Error while cleaning up {doc_id}: {e}")
        return {
            "status": "error",
            "doc_id": doc_id,
            "error": str(e)
        }


# Task routing configuration
app.conf.task_routes = {
    'pipeline.process_document': {'queue': 'processing'},
    'pipeline.health_check': {'queue': 'monitoring'},
    'pipeline.get_stats': {'queue': 'monitoring'},
    'pipeline.cleanup': {'queue': 'cleanup'},
}
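
# Workers consume only the default "celery" queue unless told otherwise, so the
# routed queues above must be listed explicitly when starting them, e.g.
# (assuming this module is importable as "worker"):
#   celery -A worker worker -Q processing,monitoring,cleanup --loglevel=info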


if __name__ == '__main__':
    # Start the worker
    app.start()
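
# Illustrative client-side call (document id and metadata are hypothetical):
#   from worker import process_document
#   async_result = process_document.delay("doc-123", {"filename": "acte.pdf"})
#   print(async_result.get(timeout=30 * 60))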