""" Tâches d'indexation des documents """ import logging from typing import Dict, Any from services.worker.celery_app import app logger = logging.getLogger(__name__) @app.task(bind=True, name='indexing.index_document') def index_document(self, doc_id: str, text: str, entities: Dict[str, Any], doc_type: str, context: Dict[str, Any]) -> Dict[str, Any]: """ Indexation d'un document dans les systèmes de recherche Args: doc_id: Identifiant du document text: Texte extrait du document entities: Entités extraites doc_type: Type de document context: Contexte de traitement Returns: Résultat de l'indexation """ try: logger.info(f"Début de l'indexation pour le document {doc_id}") # Mise à jour du statut self.update_state( state='PROGRESS', meta={'current_step': 'indexing', 'progress': 0} ) # Indexation dans les différents systèmes indexing_results = {} # 1. Indexation dans AnythingLLM try: from services.worker.utils.anythingllm_client import AnythingLLMClient anyllm_client = AnythingLLMClient() anyllm_result = await anyllm_client.index_document_for_actes( doc_id, text, entities, doc_type ) indexing_results['anythingllm'] = anyllm_result except Exception as e: logger.error(f"Erreur indexation AnythingLLM: {e}") indexing_results['anythingllm'] = {'status': 'error', 'error': str(e)} # 2. Indexation dans OpenSearch try: from services.worker.utils.opensearch_client import OpenSearchClient opensearch_client = OpenSearchClient() opensearch_result = await opensearch_client.index_document(doc_id, { 'text_content': text, 'entities': entities, 'doc_type': doc_type, 'filename': f"{doc_id}.pdf", 'status': 'processed' }) indexing_results['opensearch'] = opensearch_result except Exception as e: logger.error(f"Erreur indexation OpenSearch: {e}") indexing_results['opensearch'] = {'status': 'error', 'error': str(e)} # 3. 
        try:
            from services.worker.utils.neo4j_client import Neo4jClient
            neo4j_client = Neo4jClient()
            # Attach the extracted entities to the document node in the graph
            neo4j_result = asyncio.run(
                neo4j_client.add_entities_to_document(doc_id, entities)
            )
            indexing_results['neo4j'] = neo4j_result
        except Exception as e:
            logger.error(f"Neo4j indexing error: {e}")
            indexing_results['neo4j'] = {'status': 'error', 'error': str(e)}

        result = {
            'doc_id': doc_id,
            'status': 'completed',
            'indexing_results': indexing_results,
            'chunks_created': indexing_results.get('anythingllm', {}).get('chunks_created', 0),
            'processing_time': round(time.monotonic() - start_time, 3)
        }

        logger.info(f"Indexing completed for document {doc_id}")
        return result

    except Exception as e:
        logger.error(f"Error while indexing document {doc_id}: {e}")
        raise


@app.task(name='indexing.batch_index')
def batch_index_documents(doc_ids: list, texts: list, entities_list: list,
                          doc_types: list) -> Dict[str, Any]:
    """
    Batch indexing of documents

    Args:
        doc_ids: List of document identifiers
        texts: List of corresponding texts
        entities_list: List of corresponding entities
        doc_types: List of corresponding document types

    Returns:
        Batch indexing results
    """
    if len(doc_ids) != len(texts) or len(doc_ids) != len(entities_list) or len(doc_ids) != len(doc_types):
        raise ValueError("doc_ids, texts, entities_list and doc_types must have the same length")

    logger.info(f"Batch indexing of {len(doc_ids)} documents")

    results = []
    for doc_id, text, entities, doc_type in zip(doc_ids, texts, entities_list, doc_types):
        try:
            # Run the indexing task eagerly in this process: calling .get() on
            # a .delay() result from inside another task would block a worker
            # waiting on another worker and is rejected by Celery by default.
            result = index_document.apply(
                args=(doc_id, text, entities, doc_type, {})
            ).get()
            results.append(result)
        except Exception as e:
            logger.error(f"Batch indexing error for {doc_id}: {e}")
            results.append({
                'doc_id': doc_id,
                'status': 'failed',
                'error': str(e)
            })

    return {
        'batch_status': 'completed',
        'total_documents': len(doc_ids),
        'results': results
    }
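

# Usage sketch (illustrative only): enqueue a single-document indexing job from
# another service. The import path, document id, text, entity payload, and
# doc_type below are assumed placeholders, not values defined in this project;
# the Celery broker configured in services.worker.celery_app must be reachable.
#
#   from services.worker.tasks.indexing import index_document
#
#   async_result = index_document.delay(
#       "doc-123",
#       "extracted full text of the deed",
#       {"persons": ["Jane Doe"], "addresses": []},
#       "acte_de_vente",
#       {},
#   )
#   print(async_result.get(timeout=120))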