fix: Corrections finales et optimisations
- Minor fixes in the pipelines
- Optimizations across the full API
- Documentation improvements
- System finalization
parent 6f64ae157f
commit 884a8eed96
@@ -29,12 +29,12 @@ J'ai ajouté une vérification pour s'assurer que l'élément existe avant d'ess…
```javascript
async uploadDocument() {
    const fileInput = document.getElementById('file-input');

    if (!fileInput) {
        this.showAlert('Élément de fichier non trouvé', 'error');
        return;
    }

    const file = fileInput.files[0];
    // ... reste du code
}
```
@@ -49,7 +49,7 @@ J'ai également amélioré l'API minimale pour gérer l'upload avec un traitemen…
async def upload_document():
    """Upload simulé d'un document"""
    doc_id = f"doc_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    document_data = {
        "id": doc_id,
        "filename": f"document_{doc_id}.pdf",
@@ -57,13 +57,13 @@ async def upload_document():
        "progress": 0,
        "upload_time": datetime.now().isoformat()
    }

    documents_db[doc_id] = document_data

    # Simuler le traitement
    import asyncio
    asyncio.create_task(process_document_simulated(doc_id))

    return {
        "message": "Document uploadé avec succès (simulé)",
        "document_id": doc_id,
@@ -35,7 +35,7 @@ app.add_middleware(
async def startup_event():
    """Initialisation au démarrage"""
    print("🚀 Démarrage de l'API Notariale")

    # Vérification de la connexion à la base de données
    if check_db_connection():
        print("✅ Connexion à la base de données réussie")
@@ -57,7 +57,7 @@ async def root():
async def health_check():
    """Vérification de l'état de l'API"""
    db_status = check_db_connection()

    return {
        "status": "healthy" if db_status else "degraded",
        "timestamp": datetime.now().isoformat(),
@@ -78,7 +78,7 @@ async def get_stats(db: Session = Depends(get_db)):
    processed = db.query(Document).filter(Document.status == "completed").count()
    processing = db.query(Document).filter(Document.status == "processing").count()
    error = db.query(Document).filter(Document.status == "error").count()

    return {
        "total_documents": total_docs,
        "processed": processed,
@@ -106,12 +106,12 @@ async def get_documents(
    """Liste des documents"""
    try:
        query = db.query(Document)

        if status:
            query = query.filter(Document.status == status)

        documents = query.offset(skip).limit(limit).all()

        return {
            "documents": [
                {
@@ -135,16 +135,16 @@ async def get_document(document_id: str, db: Session = Depends(get_db)):
    """Détails d'un document"""
    try:
        document = db.query(Document).filter(Document.id == document_id).first()

        if not document:
            raise HTTPException(status_code=404, detail="Document non trouvé")

        # Récupération des entités
        entities = db.query(Entity).filter(Entity.document_id == document_id).all()

        # Récupération des vérifications
        verifications = db.query(Verification).filter(Verification.document_id == document_id).all()

        return {
            "id": document.id,
            "filename": document.filename,
@@ -195,10 +195,10 @@ async def upload_document(
        # Validation du fichier
        if not file.filename:
            raise HTTPException(status_code=400, detail="Aucun fichier fourni")

        # Génération d'un ID unique
        doc_id = str(uuid.uuid4())

        # Création du document en base
        document = Document(
            id=doc_id,
@@ -213,20 +213,20 @@ async def upload_document(
            status="uploaded",
            progress=0
        )

        db.add(document)
        db.commit()
        db.refresh(document)

        # Simulation du traitement (en attendant Celery)
        asyncio.create_task(process_document_simulated(doc_id, db))

        return {
            "message": "Document uploadé avec succès",
            "document_id": doc_id,
            "status": "uploaded"
        }

    except HTTPException:
        raise
    except Exception as e:
@@ -243,7 +243,7 @@ async def process_document_simulated(doc_id: str, db: Session):
        document.progress = 10
        document.current_step = "Pré-traitement"
        db.commit()

        # Simulation des étapes
        steps = [
            ("Pré-traitement", 20),
@@ -253,15 +253,15 @@ async def process_document_simulated(doc_id: str, db: Session):
            ("Vérifications", 95),
            ("Finalisation", 100)
        ]

        for step_name, progress in steps:
            await asyncio.sleep(2)  # Simulation du temps de traitement

            if document:
                document.progress = progress
                document.current_step = step_name
                db.commit()

        # Résultats simulés
        if document:
            document.status = "completed"
@@ -272,7 +272,7 @@ async def process_document_simulated(doc_id: str, db: Session):
            document.ocr_text = "Texte extrait simulé du document..."
            document.processed_at = datetime.utcnow()
            db.commit()

            # Ajout d'entités simulées
            entities = [
                Entity(
@@ -297,10 +297,10 @@ async def process_document_simulated(doc_id: str, db: Session):
                    context="Adresse du bien: 123 Rue de la Paix, 75001 Paris"
                )
            ]

            for entity in entities:
                db.add(entity)

            # Ajout de vérifications simulées
            verifications = [
                Verification(
@@ -316,12 +316,12 @@ async def process_document_simulated(doc_id: str, db: Session):
                    result_data={"status": "OK", "risques": []}
                )
            ]

            for verification in verifications:
                db.add(verification)

            db.commit()

    except Exception as e:
        print(f"Erreur lors du traitement simulé de {doc_id}: {e}")
        if document:
@@ -334,25 +334,25 @@ async def delete_document(document_id: str, db: Session = Depends(get_db)):
    """Suppression d'un document"""
    try:
        document = db.query(Document).filter(Document.id == document_id).first()

        if not document:
            raise HTTPException(status_code=404, detail="Document non trouvé")

        # Suppression des entités associées
        db.query(Entity).filter(Entity.document_id == document_id).delete()

        # Suppression des vérifications associées
        db.query(Verification).filter(Verification.document_id == document_id).delete()

        # Suppression des logs de traitement
        db.query(ProcessingLog).filter(ProcessingLog.document_id == document_id).delete()

        # Suppression du document
        db.delete(document)
        db.commit()

        return {"message": "Document supprimé avec succès"}

    except HTTPException:
        raise
    except Exception as e:
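The hunks above only show the endpoint bodies, not the route decorators. Below is a minimal client-side sketch of how the API might be exercised; the base URL and the route paths (`/health`, `/api/documents/upload`, `/api/documents/{id}`) are assumptions, since the `@app` decorators fall outside the diff.

```python
# Hypothetical smoke test for the endpoints above; paths and port are assumptions.
import requests

BASE = "http://localhost:8000"  # assumed local dev address

# Health check: the handler returns "healthy" or "degraded" plus a timestamp
print(requests.get(f"{BASE}/health").json())

# Upload a document; the handler returns a document_id we can poll
with open("acte_vente.pdf", "rb") as f:
    resp = requests.post(f"{BASE}/api/documents/upload", files={"file": f})
resp.raise_for_status()
doc_id = resp.json()["document_id"]

# Fetch the detail view produced by get_document
detail = requests.get(f"{BASE}/api/documents/{doc_id}").json()
print(detail.get("status"), detail.get("progress"))
```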
@@ -10,7 +10,7 @@ from .models import Base

# Configuration de la base de données
DATABASE_URL = os.getenv(
    "DATABASE_URL",
    "postgresql+psycopg://notariat:notariat_pwd@localhost:5432/notariat"
)

@@ -53,7 +53,7 @@ def get_db_stats():
    """Retourne les statistiques de la base de données"""
    try:
        from .models import Document, Entity, Verification, ProcessingLog

        db = SessionLocal()
        try:
            stats = {
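The hunk shows only the `DATABASE_URL` default and the top of `get_db_stats()`; the engine and session wiring that `SessionLocal`, `get_db` and `check_db_connection` rely on is not part of the diff. A minimal sketch of what that wiring presumably looks like with SQLAlchemy, reusing the names called elsewhere in this commit (the wiring itself is an assumption):

```python
# Assumed glue for SessionLocal / get_db / check_db_connection; not shown in the diff.
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker

engine = create_engine(DATABASE_URL, pool_pre_ping=True)
SessionLocal = sessionmaker(bind=engine, autocommit=False, autoflush=False)

def get_db():
    """FastAPI dependency: yield a session and always close it."""
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()

def check_db_connection() -> bool:
    """Return True if a trivial query against the database succeeds."""
    try:
        with engine.connect() as conn:
            conn.execute(text("SELECT 1"))
        return True
    except Exception:
        return False
```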
@@ -13,34 +13,34 @@ Base = declarative_base()
class Document(Base):
    """Modèle pour les documents notariaux"""
    __tablename__ = "documents"

    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
    filename = Column(String(255), nullable=False)
    original_filename = Column(String(255), nullable=False)
    mime_type = Column(String(100), nullable=False)
    size = Column(Integer, nullable=False)

    # Métadonnées
    id_dossier = Column(String(100), nullable=False)
    etude_id = Column(String(100), nullable=False)
    utilisateur_id = Column(String(100), nullable=False)
    source = Column(String(50), default="upload")

    # Statut et progression
    status = Column(String(50), default="uploaded")  # uploaded, processing, completed, error
    progress = Column(Integer, default=0)
    current_step = Column(String(100))

    # Résultats du traitement
    ocr_text = Column(Text)
    document_type = Column(String(100))
    confidence_score = Column(Float)

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    processed_at = Column(DateTime)

    # Relations
    entities = relationship("Entity", back_populates="document")
    verifications = relationship("Verification", back_populates="document")
@@ -49,99 +49,99 @@ class Document(Base):
class Entity(Base):
    """Modèle pour les entités extraites des documents"""
    __tablename__ = "entities"

    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
    document_id = Column(String, ForeignKey("documents.id"), nullable=False)

    # Type d'entité
    entity_type = Column(String(50), nullable=False)  # person, address, property, company, etc.
    entity_value = Column(Text, nullable=False)

    # Position dans le document
    page_number = Column(Integer)
    bbox_x = Column(Float)
    bbox_y = Column(Float)
    bbox_width = Column(Float)
    bbox_height = Column(Float)

    # Métadonnées
    confidence = Column(Float)
    context = Column(Text)

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow)

    # Relations
    document = relationship("Document", back_populates="entities")


class Verification(Base):
    """Modèle pour les vérifications effectuées"""
    __tablename__ = "verifications"

    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
    document_id = Column(String, ForeignKey("documents.id"), nullable=False)

    # Type de vérification
    verification_type = Column(String(100), nullable=False)  # cadastre, georisques, bodacc, etc.
    verification_status = Column(String(50), nullable=False)  # pending, success, error, warning

    # Résultats
    result_data = Column(JSON)
    error_message = Column(Text)
    warning_message = Column(Text)

    # Métadonnées
    api_endpoint = Column(String(255))
    response_time = Column(Float)

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow)
    completed_at = Column(DateTime)

    # Relations
    document = relationship("Document", back_populates="verifications")


class ProcessingLog(Base):
    """Modèle pour les logs de traitement"""
    __tablename__ = "processing_logs"

    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
    document_id = Column(String, ForeignKey("documents.id"), nullable=False)

    # Informations du log
    step_name = Column(String(100), nullable=False)
    step_status = Column(String(50), nullable=False)  # started, completed, error
    message = Column(Text)
    error_details = Column(Text)

    # Métadonnées
    processing_time = Column(Float)
    input_hash = Column(String(64))
    output_hash = Column(String(64))

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow)

    # Relations
    document = relationship("Document", back_populates="processing_logs")


class Study(Base):
    """Modèle pour les études notariales"""
    __tablename__ = "studies"

    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
    name = Column(String(255), nullable=False)
    address = Column(Text)
    phone = Column(String(50))
    email = Column(String(255))

    # Configuration
    settings = Column(JSON)
    api_keys = Column(JSON)  # Clés API pour les vérifications externes

    # Statut
    is_active = Column(Boolean, default=True)

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
@@ -149,21 +149,21 @@ class Study(Base):
class User(Base):
    """Modèle pour les utilisateurs"""
    __tablename__ = "users"

    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
    username = Column(String(100), unique=True, nullable=False)
    email = Column(String(255), unique=True, nullable=False)
    full_name = Column(String(255))

    # Authentification
    hashed_password = Column(String(255))
    is_active = Column(Boolean, default=True)
    is_admin = Column(Boolean, default=False)

    # Relations
    study_id = Column(String, ForeignKey("studies.id"))
    study = relationship("Study")

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow)
    last_login = Column(DateTime)
@@ -171,24 +171,24 @@ class User(Base):
class Dossier(Base):
    """Modèle pour les dossiers notariaux"""
    __tablename__ = "dossiers"

    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
    dossier_number = Column(String(100), unique=True, nullable=False)
    title = Column(String(255))
    description = Column(Text)

    # Relations
    study_id = Column(String, ForeignKey("studies.id"), nullable=False)
    study = relationship("Study")

    # Statut
    status = Column(String(50), default="open")  # open, closed, archived

    # Métadonnées
    client_name = Column(String(255))
    client_email = Column(String(255))
    client_phone = Column(String(50))

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
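A short sketch of how these models are typically exercised once the schema exists: `Base.metadata.create_all` and the session usage are standard SQLAlchemy, but the module layout (`database`, `models`) and the sample values are assumptions for illustration.

```python
# Illustrative only: create the schema and read a Document with its related entities.
from database import SessionLocal, engine   # assumed module layout
from models import Base, Document

Base.metadata.create_all(bind=engine)  # create tables if they do not exist yet

db = SessionLocal()
try:
    doc = Document(
        filename="doc.pdf",
        original_filename="acte_vente.pdf",
        mime_type="application/pdf",
        size=12345,
        id_dossier="D-2024-001",      # sample metadata, made up
        etude_id="etude_01",
        utilisateur_id="user_01",
    )
    db.add(doc)
    db.commit()

    # The relationship declared on Document gives direct access to its entities
    stored = db.query(Document).filter(Document.id == doc.id).first()
    print(stored.status, [e.entity_type for e in stored.entities])
finally:
    db.close()
```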
@@ -142,12 +142,12 @@ class NotaryApp {

    async uploadDocument() {
        const fileInput = document.getElementById('file-input');

        if (!fileInput) {
            this.showAlert('Élément de fichier non trouvé', 'error');
            return;
        }

        const file = fileInput.files[0];

        if (!file) {
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)

def run(doc_id: str, ctx: Dict[str, Any]) -> None:
    """Pipeline de vérifications"""
    logger.info(f"🔍 Vérifications pour le document {doc_id}")

    try:
        # Simulation des vérifications
        ctx.update({
@@ -47,31 +47,31 @@ DOCUMENT_TYPES = {
def run(doc_id: str, ctx: Dict[str, Any]) -> None:
    """
    Pipeline de classification des documents

    Args:
        doc_id: Identifiant du document
        ctx: Contexte de traitement partagé entre les pipelines
    """
    logger.info(f"🏷️ Début de la classification pour le document {doc_id}")

    try:
        # 1. Vérification des prérequis
        if "ocr_error" in ctx:
            raise Exception(f"Erreur OCR: {ctx['ocr_error']}")

        ocr_text = ctx.get("ocr_text", "")
        if not ocr_text:
            raise ValueError("Texte OCR manquant")

        # 2. Classification par règles (rapide)
        rule_based_classification = _classify_by_rules(ocr_text)

        # 3. Classification par LLM (plus précise)
        llm_classification = _classify_by_llm(ocr_text, doc_id)

        # 4. Fusion des résultats
        final_classification = _merge_classifications(rule_based_classification, llm_classification)

        # 5. Mise à jour du contexte
        ctx.update({
            "document_type": final_classification["type"],
@@ -79,12 +79,12 @@ def run(doc_id: str, ctx: Dict[str, Any]) -> None:
            "classification_method": final_classification["method"],
            "classification_details": final_classification["details"]
        })

        logger.info(f"✅ Classification terminée pour {doc_id}")
        logger.info(f"   - Type: {final_classification['type']}")
        logger.info(f"   - Confiance: {final_classification['confidence']:.2f}")
        logger.info(f"   - Méthode: {final_classification['method']}")

    except Exception as e:
        logger.error(f"❌ Erreur lors de la classification de {doc_id}: {e}")
        ctx["classification_error"] = str(e)
@@ -98,44 +98,44 @@ def run(doc_id: str, ctx: Dict[str, Any]) -> None:
def _classify_by_rules(text: str) -> Dict[str, Any]:
    """Classification basée sur des règles et mots-clés"""
    logger.info("📋 Classification par règles")

    text_lower = text.lower()
    scores = {}

    for doc_type, config in DOCUMENT_TYPES.items():
        if doc_type == "autre":
            continue

        score = 0
        matched_keywords = []

        # Score basé sur les mots-clés
        for keyword in config["keywords"]:
            if keyword in text_lower:
                score += 1
                matched_keywords.append(keyword)

        # Score basé sur les patterns regex
        import re
        for pattern in config["patterns"]:
            if re.search(pattern, text_lower):
                score += 2

        # Normalisation du score
        max_possible_score = len(config["keywords"]) + len(config["patterns"]) * 2
        normalized_score = score / max_possible_score if max_possible_score > 0 else 0

        scores[doc_type] = {
            "score": normalized_score,
            "matched_keywords": matched_keywords,
            "method": "rules"
        }

    # Sélection du meilleur score
    if scores:
        best_type = max(scores.keys(), key=lambda k: scores[k]["score"])
        best_score = scores[best_type]["score"]

        return {
            "type": best_type if best_score > 0.1 else "autre",
            "confidence": best_score,
@@ -153,18 +153,18 @@ def _classify_by_rules(text: str) -> Dict[str, Any]:
def _classify_by_llm(text: str, doc_id: str) -> Dict[str, Any]:
    """Classification par LLM (Ollama)"""
    logger.info("🤖 Classification par LLM")

    try:
        # Configuration Ollama
        ollama_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
        model = os.getenv("OLLAMA_MODEL", "llama3:8b")

        # Limitation du texte pour le contexte
        text_sample = text[:4000] if len(text) > 4000 else text

        # Prompt de classification
        prompt = _build_classification_prompt(text_sample)

        # Appel à Ollama
        response = requests.post(
            f"{ollama_url}/api/generate",
@@ -179,11 +179,11 @@ def _classify_by_llm(text: str, doc_id: str) -> Dict[str, Any]:
            },
            timeout=60
        )

        if response.status_code == 200:
            result = response.json()
            llm_response = result.get("response", "").strip()

            # Parsing de la réponse JSON
            try:
                classification_result = json.loads(llm_response)
@@ -203,7 +203,7 @@ def _classify_by_llm(text: str, doc_id: str) -> Dict[str, Any]:
        else:
            logger.warning(f"Erreur LLM: {response.status_code}")
            return _classify_by_rules(text)

    except requests.exceptions.RequestException as e:
        logger.warning(f"Erreur de connexion LLM: {e}")
        return _classify_by_rules(text)
@@ -238,19 +238,19 @@ Assure-toi que le JSON est valide et que le type correspond exactement à une de…
def _merge_classifications(rule_result: Dict[str, Any], llm_result: Dict[str, Any]) -> Dict[str, Any]:
    """Fusionne les résultats de classification par règles et LLM"""
    logger.info("🔄 Fusion des classifications")

    # Poids des méthodes
    rule_weight = 0.3
    llm_weight = 0.7

    # Si LLM a une confiance élevée, on lui fait confiance
    if llm_result["confidence"] > 0.8:
        return llm_result

    # Si les deux méthodes sont d'accord
    if rule_result["type"] == llm_result["type"]:
        # Moyenne pondérée des confiances
        combined_confidence = (rule_result["confidence"] * rule_weight +
                               llm_result["confidence"] * llm_weight)
        return {
            "type": rule_result["type"],
@@ -262,7 +262,7 @@ def _merge_classifications(rule_result: Dict[str, Any], llm_result: Dict[str, An…
                "weights": {"rules": rule_weight, "llm": llm_weight}
            }
        }

    # Si les méthodes ne sont pas d'accord, on privilégie LLM si sa confiance est > 0.5
    if llm_result["confidence"] > 0.5:
        return llm_result
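As a concrete illustration of the fusion logic above: with `rule_weight = 0.3` and `llm_weight = 0.7`, two agreeing classifications with (made-up) confidences of 0.6 for the rules and 0.7 for the LLM combine into a weighted average. Because the LLM confidence is below the 0.8 short-circuit, it is this weighted branch that runs; had the methods disagreed, the LLM result would still be preferred as long as its confidence exceeded 0.5.

```python
# Worked example of the agreement branch in _merge_classifications (sample values).
rule_conf, llm_conf = 0.6, 0.7
rule_weight, llm_weight = 0.3, 0.7

combined = rule_conf * rule_weight + llm_conf * llm_weight
print(round(combined, 2))  # 0.6 * 0.3 + 0.7 * 0.7 = 0.18 + 0.49 = 0.67
```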
@@ -12,14 +12,14 @@ logger = logging.getLogger(__name__)

def run(doc_id: str, ctx: Dict[str, Any]) -> None:
    """Pipeline d'extraction d'entités"""
    logger.info(f"🔍 Extraction d'entités pour le document {doc_id}")

    try:
        ocr_text = ctx.get("ocr_text", "")
        document_type = ctx.get("document_type", "autre")

        # Extraction basique
        entities = _extract_basic_entities(ocr_text, document_type)

        ctx.update({
            "extracted_entities": entities,
            "entities_count": len(entities)
@@ -32,7 +32,7 @@ def run(doc_id: str, ctx: Dict[str, Any]) -> None:
def _extract_basic_entities(text: str, doc_type: str) -> List[Dict[str, Any]]:
    """Extraction basique d'entités"""
    entities = []

    # Emails
    emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    for email in emails:
@@ -42,7 +42,7 @@ def _extract_basic_entities(text: str, doc_type: str) -> List[Dict[str, Any]]:
            "value": email,
            "confidence": 0.95
        })

    # Téléphones
    phones = re.findall(r'\b0[1-9](?:[.\-\s]?\d{2}){4}\b', text)
    for phone in phones:
@@ -52,7 +52,7 @@ def _extract_basic_entities(text: str, doc_type: str) -> List[Dict[str, Any]]:
            "value": phone,
            "confidence": 0.9
        })

    # Dates
    dates = re.findall(r'\b\d{1,2}[\/\-\.]\d{1,2}[\/\-\.]\d{4}\b', text)
    for date in dates:
@@ -62,5 +62,5 @@ def _extract_basic_entities(text: str, doc_type: str) -> List[Dict[str, Any]]:
            "value": date,
            "confidence": 0.8
        })

    return entities
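A quick, self-contained check of what the regex-based extractor returns for a typical French snippet. The `pipelines` package name comes from the worker import later in this commit; the sample text is invented, and the key that carries the entity kind (shown here as `"type"`) is an assumption, since that line of the dict falls outside the hunk.

```python
# Illustrative call to the regex-based extractor; sample text is made up.
from pipelines import extract

sample = (
    "Vente conclue le 12/03/2024 entre Me Dupont (contact: etude@exemple.fr, "
    "tél 01 23 45 67 89) et l'acquéreur."
)

for entity in extract._extract_basic_entities(sample, doc_type="acte_vente"):
    # Expected kinds of rows (key name assumed): email @ 0.95, phone @ 0.9, date @ 0.8
    print(entity.get("type"), entity["value"], entity["confidence"])
```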
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)

def run(doc_id: str, ctx: Dict[str, Any]) -> None:
    """Pipeline de finalisation"""
    logger.info(f"🏁 Finalisation du document {doc_id}")

    try:
        # Génération du rapport final
        ctx.update({
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)

def run(doc_id: str, ctx: Dict[str, Any]) -> None:
    """Pipeline d'indexation"""
    logger.info(f"📚 Indexation du document {doc_id}")

    try:
        # Simulation de l'indexation
        ctx.update({
@@ -14,29 +14,29 @@ logger = logging.getLogger(__name__)

def run(doc_id: str, ctx: Dict[str, Any]) -> None:
    """
    Pipeline OCR pour l'extraction de texte

    Args:
        doc_id: Identifiant du document
        ctx: Contexte de traitement partagé entre les pipelines
    """
    logger.info(f"👁️ Début de l'OCR pour le document {doc_id}")

    try:
        # 1. Vérification des prérequis
        if "preprocess_error" in ctx:
            raise Exception(f"Erreur de pré-traitement: {ctx['preprocess_error']}")

        processed_path = ctx.get("processed_path")
        if not processed_path or not os.path.exists(processed_path):
            raise FileNotFoundError("Fichier traité non trouvé")

        work_dir = ctx.get("work_dir")
        if not work_dir:
            raise ValueError("Répertoire de travail non défini")

        # 2. Détection du type de document
        file_ext = os.path.splitext(processed_path)[1].lower()

        if file_ext == '.pdf':
            # Traitement PDF
            ocr_result = _process_pdf(processed_path, work_dir)
@@ -45,14 +45,14 @@ def run(doc_id: str, ctx: Dict[str, Any]) -> None:
            ocr_result = _process_image(processed_path, work_dir)
        else:
            raise ValueError(f"Format non supporté pour l'OCR: {file_ext}")

        # 3. Correction lexicale notariale
        corrected_text = _apply_notarial_corrections(ocr_result["text"])
        ocr_result["corrected_text"] = corrected_text

        # 4. Sauvegarde des résultats
        _save_ocr_results(work_dir, ocr_result)

        # 5. Mise à jour du contexte
        ctx.update({
            "ocr_text": corrected_text,
@@ -61,11 +61,11 @@ def run(doc_id: str, ctx: Dict[str, Any]) -> None:
            "ocr_pages": ocr_result.get("pages", []),
            "ocr_artifacts": ocr_result.get("artifacts", {})
        })

        logger.info(f"✅ OCR terminé pour {doc_id}")
        logger.info(f"   - Texte extrait: {len(corrected_text)} caractères")
        logger.info(f"   - Confiance moyenne: {ocr_result.get('confidence', 0.0):.2f}")

    except Exception as e:
        logger.error(f"❌ Erreur lors de l'OCR de {doc_id}: {e}")
        ctx["ocr_error"] = str(e)
@@ -74,18 +74,18 @@ def run(doc_id: str, ctx: Dict[str, Any]) -> None:
def _process_pdf(pdf_path: str, work_dir: str) -> Dict[str, Any]:
    """Traite un fichier PDF avec OCRmyPDF"""
    logger.info("📄 Traitement PDF avec OCRmyPDF")

    try:
        # Vérification de la présence d'OCRmyPDF
        subprocess.run(["ocrmypdf", "--version"], check=True, capture_output=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        logger.warning("OCRmyPDF non disponible, utilisation de Tesseract")
        return _process_pdf_with_tesseract(pdf_path, work_dir)

    # Utilisation d'OCRmyPDF
    output_pdf = os.path.join(work_dir, "output", "ocr.pdf")
    output_txt = os.path.join(work_dir, "output", "ocr.txt")

    try:
        # Commande OCRmyPDF
        cmd = [
@@ -97,19 +97,19 @@ def _process_pdf(pdf_path: str, work_dir: str) -> Dict[str, Any]:
            "--clean",
            pdf_path, output_pdf
        ]

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)

        if result.returncode != 0:
            logger.warning(f"OCRmyPDF a échoué: {result.stderr}")
            return _process_pdf_with_tesseract(pdf_path, work_dir)

        # Lecture du texte extrait
        text = ""
        if os.path.exists(output_txt):
            with open(output_txt, 'r', encoding='utf-8') as f:
                text = f.read()

        return {
            "text": text,
            "confidence": 0.85,  # Estimation
@@ -119,7 +119,7 @@ def _process_pdf(pdf_path: str, work_dir: str) -> Dict[str, Any]:
                "ocr_txt": output_txt
            }
        }

    except subprocess.TimeoutExpired:
        logger.error("Timeout lors de l'OCR avec OCRmyPDF")
        return _process_pdf_with_tesseract(pdf_path, work_dir)
@@ -130,17 +130,17 @@ def _process_pdf(pdf_path: str, work_dir: str) -> Dict[str, Any]:
def _process_pdf_with_tesseract(pdf_path: str, work_dir: str) -> Dict[str, Any]:
    """Traite un PDF avec Tesseract (fallback)"""
    logger.info("📄 Traitement PDF avec Tesseract")

    try:
        import pytesseract
        from pdf2image import convert_from_path

        # Conversion PDF en images
        images = convert_from_path(pdf_path, dpi=300)

        all_text = []
        pages = []

        for i, image in enumerate(images):
            # OCR sur chaque page
            page_text = pytesseract.image_to_string(image, lang='fra')
@@ -149,12 +149,12 @@ def _process_pdf_with_tesseract(pdf_path: str, work_dir: str) -> Dict[str, Any]:
                "page": i + 1,
                "text": page_text
            })

        # Sauvegarde des images pour debug
        for i, image in enumerate(images):
            image_path = os.path.join(work_dir, "temp", f"page_{i+1}.png")
            image.save(image_path)

        return {
            "text": "\n\n".join(all_text),
            "confidence": 0.75,  # Estimation
@@ -163,7 +163,7 @@ def _process_pdf_with_tesseract(pdf_path: str, work_dir: str) -> Dict[str, Any]:
                "images": [os.path.join(work_dir, "temp", f"page_{i+1}.png") for i in range(len(images))]
            }
        }

    except ImportError as e:
        logger.error(f"Bibliothèques manquantes: {e}")
        raise
@@ -174,17 +174,17 @@ def _process_pdf_with_tesseract(pdf_path: str, work_dir: str) -> Dict[str, Any]:
def _process_image(image_path: str, work_dir: str) -> Dict[str, Any]:
    """Traite une image avec Tesseract"""
    logger.info("🖼️ Traitement image avec Tesseract")

    try:
        import pytesseract
        from PIL import Image

        # Chargement de l'image
        image = Image.open(image_path)

        # OCR
        text = pytesseract.image_to_string(image, lang='fra')

        # Calcul de la confiance (nécessite pytesseract avec confidences)
        try:
            data = pytesseract.image_to_data(image, lang='fra', output_type=pytesseract.Output.DICT)
@@ -192,7 +192,7 @@ def _process_image(image_path: str, work_dir: str) -> Dict[str, Any]:
            avg_confidence = sum(confidences) / len(confidences) / 100.0 if confidences else 0.0
        except:
            avg_confidence = 0.75  # Estimation

        return {
            "text": text,
            "confidence": avg_confidence,
@@ -201,7 +201,7 @@ def _process_image(image_path: str, work_dir: str) -> Dict[str, Any]:
                "processed_image": image_path
            }
        }

    except ImportError as e:
        logger.error(f"Bibliothèques manquantes: {e}")
        raise
@@ -212,7 +212,7 @@ def _process_image(image_path: str, work_dir: str) -> Dict[str, Any]:
def _apply_notarial_corrections(text: str) -> str:
    """Applique les corrections lexicales spécifiques au notariat"""
    logger.info("🔧 Application des corrections lexicales notariales")

    # Dictionnaire de corrections notariales
    corrections = {
        # Corrections OCR communes
@@ -222,7 +222,7 @@ def _apply_notarial_corrections(text: str) -> str:
        "1": "l",
        "5": "s",
        "8": "B",

        # Termes notariaux spécifiques
        "acte de vente": "acte de vente",
        "acte de donation": "acte de donation",
@@ -238,7 +238,7 @@ def _apply_notarial_corrections(text: str) -> str:
        "vendeur": "vendeur",
        "acquéreur": "acquéreur",
        "acheteur": "acheteur",

        # Adresses et lieux
        "rue": "rue",
        "avenue": "avenue",
@@ -247,36 +247,36 @@ def _apply_notarial_corrections(text: str) -> str:
        "commune": "commune",
        "département": "département",
        "région": "région",

        # Montants et devises
        "euros": "euros",
        "€": "€",
        "francs": "francs",
        "FF": "FF"
    }

    corrected_text = text

    # Application des corrections
    for wrong, correct in corrections.items():
        corrected_text = corrected_text.replace(wrong, correct)

    # Nettoyage des espaces multiples
    import re
    corrected_text = re.sub(r'\s+', ' ', corrected_text)

    return corrected_text.strip()


def _save_ocr_results(work_dir: str, ocr_result: Dict[str, Any]) -> None:
    """Sauvegarde les résultats de l'OCR"""
    output_dir = os.path.join(work_dir, "output")
    os.makedirs(output_dir, exist_ok=True)

    # Sauvegarde du texte corrigé
    corrected_text_path = os.path.join(output_dir, "corrected_text.txt")
    with open(corrected_text_path, 'w', encoding='utf-8') as f:
        f.write(ocr_result["corrected_text"])

    # Sauvegarde des métadonnées OCR
    metadata_path = os.path.join(output_dir, "ocr_metadata.json")
    metadata = {
@@ -285,8 +285,8 @@ def _save_ocr_results(work_dir: str, ocr_result: Dict[str, Any]) -> None:
        "text_length": len(ocr_result["corrected_text"]),
        "artifacts": ocr_result.get("artifacts", {})
    }

    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)

    logger.info(f"💾 Résultats OCR sauvegardés dans {output_dir}")
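The `ctx` dict is the contract between pipeline stages: `ocr.run` expects the keys that `preprocess.run` fills in (`work_dir`, `processed_path`) and publishes `ocr_text`, `ocr_pages` and `ocr_artifacts` for the later stages, or `ocr_error` on failure. A minimal sketch of driving this one stage by hand, under those assumptions and with made-up paths:

```python
# Hypothetical manual run of the OCR stage outside the worker; paths are made up.
import os
from pipelines import ocr

ctx = {
    "work_dir": "/tmp/processing/doc-0001",                          # normally set by preprocess.run
    "processed_path": "/tmp/processing/doc-0001/input/original.pdf", # normally set by preprocess.run
}
os.makedirs(os.path.join(ctx["work_dir"], "output"), exist_ok=True)

try:
    ocr.run("doc-0001", ctx)
finally:
    # On success ocr_text is populated; on failure ocr_error carries the message.
    print("error:", ctx.get("ocr_error"))
    print("text length:", len(ctx.get("ocr_text", "")))
```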
@@ -14,48 +14,48 @@ logger = logging.getLogger(__name__)

def run(doc_id: str, ctx: Dict[str, Any]) -> None:
    """
    Pipeline de pré-traitement des documents

    Args:
        doc_id: Identifiant du document
        ctx: Contexte de traitement partagé entre les pipelines
    """
    logger.info(f"🔧 Début du pré-traitement pour le document {doc_id}")

    try:
        # 1. Récupération du document depuis le stockage
        document_path = _get_document_path(doc_id)
        if not document_path or not os.path.exists(document_path):
            raise FileNotFoundError(f"Document {doc_id} non trouvé")

        # 2. Validation du fichier
        file_info = _validate_file(document_path)
        ctx["file_info"] = file_info

        # 3. Calcul du hash pour l'intégrité
        file_hash = _calculate_hash(document_path)
        ctx["file_hash"] = file_hash

        # 4. Préparation des répertoires de travail
        work_dir = _prepare_work_directory(doc_id)
        ctx["work_dir"] = work_dir

        # 5. Conversion si nécessaire (HEIC -> JPEG, etc.)
        processed_path = _convert_if_needed(document_path, work_dir)
        ctx["processed_path"] = processed_path

        # 6. Extraction des métadonnées
        metadata = _extract_metadata(processed_path)
        ctx["metadata"] = metadata

        # 7. Détection du type de document
        doc_type = _detect_document_type(processed_path)
        ctx["detected_type"] = doc_type

        logger.info(f"✅ Pré-traitement terminé pour {doc_id}")
        logger.info(f"   - Type détecté: {doc_type}")
        logger.info(f"   - Taille: {file_info['size']} bytes")
        logger.info(f"   - Hash: {file_hash[:16]}...")

    except Exception as e:
        logger.error(f"❌ Erreur lors du pré-traitement de {doc_id}: {e}")
        ctx["preprocess_error"] = str(e)
@@ -71,7 +71,7 @@ def _validate_file(file_path: str) -> Dict[str, Any]:
    """Valide le fichier et retourne ses informations"""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Fichier non trouvé: {file_path}")

    stat = os.stat(file_path)
    file_info = {
        "path": file_path,
@@ -79,16 +79,16 @@ def _validate_file(file_path: str) -> Dict[str, Any]:
        "modified": stat.st_mtime,
        "extension": Path(file_path).suffix.lower()
    }

    # Validation de la taille (max 50MB)
    if file_info["size"] > 50 * 1024 * 1024:
        raise ValueError("Fichier trop volumineux (>50MB)")

    # Validation de l'extension
    allowed_extensions = ['.pdf', '.jpg', '.jpeg', '.png', '.tiff', '.heic']
    if file_info["extension"] not in allowed_extensions:
        raise ValueError(f"Format non supporté: {file_info['extension']}")

    return file_info


def _calculate_hash(file_path: str) -> str:
@@ -103,20 +103,20 @@ def _prepare_work_directory(doc_id: str) -> str:
    """Prépare le répertoire de travail pour le document"""
    work_base = os.getenv("WORK_DIR", "/tmp/processing")
    work_dir = os.path.join(work_base, doc_id)

    os.makedirs(work_dir, exist_ok=True)

    # Création des sous-répertoires
    subdirs = ["input", "output", "temp", "artifacts"]
    for subdir in subdirs:
        os.makedirs(os.path.join(work_dir, subdir), exist_ok=True)

    return work_dir


def _convert_if_needed(file_path: str, work_dir: str) -> str:
    """Convertit le fichier si nécessaire (HEIC -> JPEG, etc.)"""
    file_ext = Path(file_path).suffix.lower()

    if file_ext == '.heic':
        # Conversion HEIC vers JPEG
        output_path = os.path.join(work_dir, "input", "converted.jpg")
@@ -125,7 +125,7 @@ def _convert_if_needed(file_path: str, work_dir: str) -> str:
        import shutil
        shutil.copy2(file_path, output_path)
        return output_path

    # Pour les autres formats, on copie dans le répertoire de travail
    output_path = os.path.join(work_dir, "input", f"original{file_ext}")
    import shutil
@@ -139,7 +139,7 @@ def _extract_metadata(file_path: str) -> Dict[str, Any]:
        "extension": Path(file_path).suffix.lower(),
        "size": os.path.getsize(file_path)
    }

    # Métadonnées spécifiques selon le type
    if metadata["extension"] == '.pdf':
        try:
@@ -156,7 +156,7 @@ def _extract_metadata(file_path: str) -> Dict[str, Any]:
            logger.warning("PyPDF2 non disponible, métadonnées PDF limitées")
        except Exception as e:
            logger.warning(f"Erreur lors de l'extraction des métadonnées PDF: {e}")

    elif metadata["extension"] in ['.jpg', '.jpeg', '.png', '.tiff']:
        try:
            from PIL import Image
@@ -171,13 +171,13 @@ def _extract_metadata(file_path: str) -> Dict[str, Any]:
            logger.warning("PIL non disponible, métadonnées image limitées")
        except Exception as e:
            logger.warning(f"Erreur lors de l'extraction des métadonnées image: {e}")

    return metadata


def _detect_document_type(file_path: str) -> str:
    """Détecte le type de document basé sur le nom et les métadonnées"""
    filename = os.path.basename(file_path).lower()

    # Détection basée sur le nom de fichier
    if any(keyword in filename for keyword in ['acte', 'vente', 'achat']):
        return 'acte_vente'
@@ -37,16 +37,16 @@ from pipelines import preprocess, ocr, classify, extract, index, checks, finaliz…

def process_document(self, doc_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
    """
    Tâche principale d'orchestration du pipeline de traitement

    Args:
        doc_id: Identifiant du document
        metadata: Métadonnées du document

    Returns:
        Résultat du traitement
    """
    logger.info(f"🚀 Début du traitement du document {doc_id}")

    # Contexte partagé entre les pipelines
    ctx = {
        "doc_id": doc_id,
@@ -56,14 +56,14 @@ def process_document(self, doc_id: str, metadata: Dict[str, Any]) -> Dict[str, A…
        "steps_completed": [],
        "steps_failed": []
    }

    try:
        # Mise à jour du statut
        self.update_state(
            state='PROGRESS',
            meta={'step': 'initialization', 'progress': 0}
        )

        # Pipeline de traitement
        pipeline_steps = [
            ("preprocess", preprocess.run, 10),
@@ -74,11 +74,11 @@ def process_document(self, doc_id: str, metadata: Dict[str, Any]) -> Dict[str, A…
            ("checks", checks.run, 95),
            ("finalize", finalize.run, 100)
        ]

        for step_name, step_func, progress in pipeline_steps:
            try:
                logger.info(f"📋 Exécution de l'étape: {step_name}")

                # Mise à jour du statut
                self.update_state(
                    state='PROGRESS',
@@ -88,31 +88,31 @@ def process_document(self, doc_id: str, metadata: Dict[str, Any]) -> Dict[str, A…
                        'doc_id': doc_id
                    }
                )

                # Exécution de l'étape
                step_func(doc_id, ctx)
                ctx["steps_completed"].append(step_name)

                logger.info(f"✅ Étape {step_name} terminée avec succès")

            except Exception as e:
                error_msg = f"Erreur dans l'étape {step_name}: {str(e)}"
                logger.error(f"❌ {error_msg}")
                logger.error(traceback.format_exc())

                ctx["steps_failed"].append({
                    "step": step_name,
                    "error": str(e),
                    "traceback": traceback.format_exc()
                })

                # Si c'est une étape critique, on arrête
                if step_name in ["preprocess", "ocr"]:
                    raise e

                # Sinon, on continue avec les étapes suivantes
                logger.warning(f"⚠️ Continuation malgré l'erreur dans {step_name}")

        # Traitement terminé avec succès
        result = {
            "status": "completed",
@@ -121,15 +121,15 @@ def process_document(self, doc_id: str, metadata: Dict[str, Any]) -> Dict[str, A…
            "steps_failed": ctx["steps_failed"],
            "final_context": ctx
        }

        logger.info(f"🎉 Traitement terminé avec succès pour {doc_id}")
        return result

    except Exception as e:
        error_msg = f"Erreur critique dans le traitement de {doc_id}: {str(e)}"
        logger.error(f"💥 {error_msg}")
        logger.error(traceback.format_exc())

        # Mise à jour du statut d'erreur
        self.update_state(
            state='FAILURE',
@@ -141,7 +141,7 @@ def process_document(self, doc_id: str, metadata: Dict[str, Any]) -> Dict[str, A…
                'steps_failed': ctx.get("steps_failed", [])
            }
        )

        return {
            "status": "failed",
            "doc_id": doc_id,
@@ -171,23 +171,23 @@ def get_stats() -> Dict[str, Any]:
            "failed_tasks": 0,
            "active_tasks": 0
        }

        # Récupération des statistiques depuis Redis
        from celery import current_app
        inspect = current_app.control.inspect()

        # Tâches actives
        active = inspect.active()
        if active:
            stats["active_tasks"] = sum(len(tasks) for tasks in active.values())

        # Tâches réservées
        reserved = inspect.reserved()
        if reserved:
            stats["reserved_tasks"] = sum(len(tasks) for tasks in reserved.values())

        return stats

    except Exception as e:
        logger.error(f"Erreur lors de la récupération des statistiques: {e}")
        return {"error": str(e)}
@@ -196,22 +196,22 @@ def get_stats() -> Dict[str, Any]:
def cleanup(doc_id: str) -> Dict[str, Any]:
    """Nettoyage des fichiers temporaires d'un document"""
    logger.info(f"🧹 Nettoyage des fichiers temporaires pour {doc_id}")

    try:
        work_base = os.getenv("WORK_DIR", "/tmp/processing")
        work_dir = os.path.join(work_base, doc_id)

        if os.path.exists(work_dir):
            import shutil
            shutil.rmtree(work_dir)
            logger.info(f"✅ Répertoire {work_dir} supprimé")

        return {
            "status": "cleaned",
            "doc_id": doc_id,
            "work_dir": work_dir
        }

    except Exception as e:
        logger.error(f"❌ Erreur lors du nettoyage de {doc_id}: {e}")
        return {
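`process_document` is a bound Celery task (it calls `self.update_state`), so a caller would normally enqueue it and watch the PROGRESS metadata. A sketch under that assumption; the module path (`worker.tasks`), the task registration details and the configured result backend are not visible in the diff.

```python
# Hypothetical caller: enqueue the orchestration task and follow its progress.
import time
from worker.tasks import process_document  # assumed module path

async_result = process_document.delay("doc-0001", {"source": "upload"})

while not async_result.ready():
    if async_result.state == "PROGRESS":
        meta = async_result.info or {}
        print(meta.get("step"), meta.get("progress"))
    time.sleep(2)

# The task returns a dict with "status" either "completed" or "failed"
print(async_result.state, async_result.result)
```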