""" Pipeline d'extraction d'entités """ import os import logging import re from typing import Dict, Any, List from services.worker.utils.llm_client import WorkerLLMClient logger = logging.getLogger(__name__) def run(doc_id: str, ctx: Dict[str, Any]) -> None: """Pipeline d'extraction d'entités""" logger.info(f"🔍 Extraction d'entités pour le document {doc_id}") try: ocr_text = ctx.get("ocr_text", "") document_type = ctx.get("document_type", "autre") # Extraction basique entities = _extract_basic_entities(ocr_text, document_type) # Extraction avancée via LLM (merge non destructif) llm = WorkerLLMClient() prompt = _build_extraction_prompt(ocr_text[:3000] if ocr_text else "", document_type) llm_response = llm.generate(prompt) llm_json = WorkerLLMClient.extract_first_json(llm_response) or {} entities = _merge_entities_basic_with_llm(entities, llm_json) ctx.update({ "extracted_entities": entities, "entities_count": len(entities) }) logger.info(f"✅ Extraction terminée pour {doc_id}: {len(entities)} entités") except Exception as e: logger.error(f"❌ Erreur extraction {doc_id}: {e}") ctx["extraction_error"] = str(e) def _extract_basic_entities(text: str, doc_type: str) -> List[Dict[str, Any]]: """Extraction basique d'entités""" entities = [] # Emails emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text) for email in emails: entities.append({ "type": "contact", "subtype": "email", "value": email, "confidence": 0.95 }) # Téléphones phones = re.findall(r'\b0[1-9](?:[.\-\s]?\d{2}){4}\b', text) for phone in phones: entities.append({ "type": "contact", "subtype": "phone", "value": phone, "confidence": 0.9 }) # Dates dates = re.findall(r'\b\d{1,2}[\/\-\.]\d{1,2}[\/\-\.]\d{4}\b', text) for date in dates: entities.append({ "type": "date", "subtype": "generic", "value": date, "confidence": 0.8 }) return entities def _build_extraction_prompt(text: str, doc_type: str) -> str: return f""" Tu es un extracteur d'entités pour documents notariaux. Type de document: {doc_type} Extrait en JSON strict les objets: identites, adresses, biens, entreprises, montants, dates. Réponds UNIQUEMENT par un JSON. TEXTE: {text} """ def _merge_entities_basic_with_llm(basic: List[Dict[str, Any]], advanced: Dict[str, Any]) -> List[Dict[str, Any]]: merged = list(basic) if not isinstance(advanced, dict): return merged # Aplatit les entités LLM en liste simple type/value pour compatibilité minimale for key in ["identites", "adresses", "biens", "entreprises", "montants", "dates"]: items = advanced.get(key, []) or [] for item in items: try: value = item.get("adresse_complete") or item.get("date") or item.get("montant") or item.get("nom") or item.get("description") or str(item) if value: merged.append({"type": key, "value": value, "confidence": item.get("confidence", 0.8)}) except Exception: continue return merged