""" Pipeline d'indexation dans AnythingLLM et OpenSearch """ import os import requests import logging from typing import Dict, Any, List logger = logging.getLogger(__name__) # Configuration des services ANYLLM_BASE_URL = os.getenv("ANYLLM_BASE_URL", "http://anythingllm:3001") ANYLLM_API_KEY = os.getenv("ANYLLM_API_KEY", "change_me") OPENSEARCH_URL = os.getenv("OPENSEARCH_URL", "http://opensearch:9200") def run(doc_id: str, ctx: dict): """ Indexation du document dans les systèmes de recherche """ logger.info(f"Indexation du document {doc_id}") try: # Récupération des données extracted_text = ctx.get("extracted_text", "") classification = ctx.get("classification", {}) extracted_data = ctx.get("extracted_data", {}) if not extracted_text: raise ValueError("Aucun texte extrait disponible pour l'indexation") # Indexation dans AnythingLLM _index_in_anythingllm(doc_id, extracted_text, classification, extracted_data) # Indexation dans OpenSearch _index_in_opensearch(doc_id, extracted_text, classification, extracted_data) # Métadonnées d'indexation index_meta = { "indexation_completed": True, "anythingllm_indexed": True, "opensearch_indexed": True, "text_length": len(extracted_text) } ctx["index_meta"] = index_meta logger.info(f"Indexation terminée pour le document {doc_id}") except Exception as e: logger.error(f"Erreur lors de l'indexation du document {doc_id}: {e}") raise def _index_in_anythingllm(doc_id: str, text: str, classification: Dict[str, Any], extracted_data: Dict[str, Any]): """ Indexation dans AnythingLLM """ try: # Détermination du workspace selon le type de document workspace = _get_anythingllm_workspace(classification.get("label", "document_inconnu")) # Préparation des chunks de texte chunks = _create_text_chunks(text, doc_id, classification, extracted_data) # Headers pour l'API headers = { "Authorization": f"Bearer {ANYLLM_API_KEY}", "Content-Type": "application/json" } # Indexation des chunks for i, chunk in enumerate(chunks): payload = { "documents": [chunk] } response = requests.post( f"{ANYLLM_BASE_URL}/api/workspaces/{workspace}/documents", headers=headers, json=payload, timeout=60 ) if response.status_code not in [200, 201]: logger.warning(f"Erreur lors de l'indexation du chunk {i} dans AnythingLLM: {response.status_code}") else: logger.info(f"Chunk {i} indexé dans AnythingLLM workspace {workspace}") except Exception as e: logger.error(f"Erreur lors de l'indexation dans AnythingLLM: {e}") raise def _index_in_opensearch(doc_id: str, text: str, classification: Dict[str, Any], extracted_data: Dict[str, Any]): """ Indexation dans OpenSearch """ try: from opensearchpy import OpenSearch # Configuration du client OpenSearch client = OpenSearch( hosts=[OPENSEARCH_URL], http_auth=("admin", os.getenv("OPENSEARCH_PASSWORD", "opensearch_pwd")), use_ssl=False, verify_certs=False ) # Création de l'index s'il n'existe pas index_name = "notariat-documents" if not client.indices.exists(index=index_name): _create_opensearch_index(client, index_name) # Préparation du document document = { "doc_id": doc_id, "text": text, "document_type": classification.get("label", "document_inconnu"), "confidence": classification.get("confidence", 0.0), "extracted_data": extracted_data, "timestamp": "now" } # Indexation response = client.index( index=index_name, id=doc_id, body=document ) logger.info(f"Document {doc_id} indexé dans OpenSearch: {response['result']}") except Exception as e: logger.error(f"Erreur lors de l'indexation dans OpenSearch: {e}") raise def _get_anythingllm_workspace(document_type: str) -> str: """ Détermination du workspace AnythingLLM selon le type de document """ workspace_mapping = { "acte_vente": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"), "acte_achat": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"), "donation": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"), "testament": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"), "succession": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"), "contrat_mariage": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"), "procuration": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"), "attestation": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"), "facture": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"), "document_inconnu": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes") } return workspace_mapping.get(document_type, "workspace_actes") def _create_text_chunks(text: str, doc_id: str, classification: Dict[str, Any], extracted_data: Dict[str, Any]) -> List[Dict[str, Any]]: """ Création de chunks de texte pour l'indexation """ chunk_size = 2000 # Taille optimale pour les embeddings overlap = 200 # Chevauchement entre chunks chunks = [] start = 0 while start < len(text): end = start + chunk_size # Ajustement pour ne pas couper un mot if end < len(text): while end > start and text[end] not in [' ', '\n', '\t']: end -= 1 chunk_text = text[start:end].strip() if chunk_text: chunk = { "text": chunk_text, "metadata": { "doc_id": doc_id, "document_type": classification.get("label", "document_inconnu"), "confidence": classification.get("confidence", 0.0), "chunk_index": len(chunks), "extracted_data": extracted_data } } chunks.append(chunk) start = end - overlap if end < len(text) else end return chunks def _create_opensearch_index(client, index_name: str): """ Création de l'index OpenSearch avec mapping """ mapping = { "mappings": { "properties": { "doc_id": {"type": "keyword"}, "text": {"type": "text", "analyzer": "french"}, "document_type": {"type": "keyword"}, "confidence": {"type": "float"}, "extracted_data": {"type": "object"}, "timestamp": {"type": "date"} } }, "settings": { "number_of_shards": 1, "number_of_replicas": 0, "analysis": { "analyzer": { "french": { "type": "custom", "tokenizer": "standard", "filter": ["lowercase", "french_stop", "french_stemmer"] } }, "filter": { "french_stop": { "type": "stop", "stopwords": "_french_" }, "french_stemmer": { "type": "stemmer", "language": "french" } } } } } client.indices.create(index=index_name, body=mapping) logger.info(f"Index OpenSearch {index_name} créé avec succès")