
- Complete notarial document processing infrastructure
- FastAPI API for ingestion and orchestration
- Celery pipelines for asynchronous processing (a minimal task-wiring sketch follows this list)
- Support for PDF, JPEG, PNG, TIFF and HEIC formats
- OCR with Tesseract and lexical correction
- Automatic document classification with Ollama
- Structured data extraction
- Indexing in AnythingLLM and OpenSearch
- Business checks and verification system
- PostgreSQL database for business data
- Object storage with MinIO
- Neo4j graph database
- Full-text search with OpenSearch
- Monitoring with Prometheus and Grafana
- Installation scripts for Debian
- Complete documentation
- Unit and performance tests
- systemd service for deployment
- Automated deployment scripts
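A minimal sketch of how an asynchronous stage such as the indexing module below could be wired as a Celery task. The broker URL, task name and import path are assumptions for illustration only, not taken from the repository:

from celery import Celery

app = Celery("notariat", broker="redis://redis:6379/0")  # assumed broker URL

@app.task(name="pipeline.index_document")
def index_document(ctx: dict, doc_id: str) -> dict:
    """Run the indexing stage on a context built by the previous pipeline stages."""
    from pipelines import indexing  # assumed import path for the module below
    indexing.run(doc_id, ctx)
    return ctx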
"""
|
|
Pipeline d'indexation dans AnythingLLM et OpenSearch
|
|
"""
|
|
import os
|
|
import requests
|
|
import logging
|
|
from typing import Dict, Any, List
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Configuration des services
|
|
ANYLLM_BASE_URL = os.getenv("ANYLLM_BASE_URL", "http://anythingllm:3001")
|
|
ANYLLM_API_KEY = os.getenv("ANYLLM_API_KEY", "change_me")
|
|
OPENSEARCH_URL = os.getenv("OPENSEARCH_URL", "http://opensearch:9200")
|
|
|
|
def run(doc_id: str, ctx: dict):
    """
    Index the document in the search systems.
    """
    logger.info(f"Indexing document {doc_id}")

    try:
        # Retrieve the data produced by the previous pipeline stages
        extracted_text = ctx.get("extracted_text", "")
        classification = ctx.get("classification", {})
        extracted_data = ctx.get("extracted_data", {})

        if not extracted_text:
            raise ValueError("No extracted text available for indexing")

        # Index in AnythingLLM
        _index_in_anythingllm(doc_id, extracted_text, classification, extracted_data)

        # Index in OpenSearch
        _index_in_opensearch(doc_id, extracted_text, classification, extracted_data)

        # Indexing metadata
        index_meta = {
            "indexation_completed": True,
            "anythingllm_indexed": True,
            "opensearch_indexed": True,
            "text_length": len(extracted_text)
        }

        ctx["index_meta"] = index_meta

        logger.info(f"Indexing finished for document {doc_id}")

    except Exception as e:
        logger.error(f"Error while indexing document {doc_id}: {e}")
        raise

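# Illustrative shape of the pipeline context consumed by run(); only the keys
# are taken from the code above, the values are invented for the example.
#
#   ctx = {
#       "extracted_text": "ACTE DE VENTE ...",
#       "classification": {"label": "acte_vente", "confidence": 0.92},
#       "extracted_data": {},
#   }
#   run("doc-123", ctx)
#   # afterwards, ctx["index_meta"] records what was indexed
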
def _index_in_anythingllm(doc_id: str, text: str, classification: Dict[str, Any], extracted_data: Dict[str, Any]):
    """
    Index the document in AnythingLLM.
    """
    try:
        # Pick the workspace according to the document type
        workspace = _get_anythingllm_workspace(classification.get("label", "document_inconnu"))

        # Prepare the text chunks
        chunks = _create_text_chunks(text, doc_id, classification, extracted_data)

        # API headers
        headers = {
            "Authorization": f"Bearer {ANYLLM_API_KEY}",
            "Content-Type": "application/json"
        }

        # Index the chunks one by one
        for i, chunk in enumerate(chunks):
            payload = {
                "documents": [chunk]
            }

            response = requests.post(
                f"{ANYLLM_BASE_URL}/api/workspaces/{workspace}/documents",
                headers=headers,
                json=payload,
                timeout=60
            )

            if response.status_code not in [200, 201]:
                logger.warning(f"Error indexing chunk {i} in AnythingLLM: {response.status_code}")
            else:
                logger.info(f"Chunk {i} indexed in AnythingLLM workspace {workspace}")

    except Exception as e:
        logger.error(f"Error while indexing in AnythingLLM: {e}")
        raise

def _index_in_opensearch(doc_id: str, text: str, classification: Dict[str, Any], extracted_data: Dict[str, Any]):
    """
    Index the document in OpenSearch.
    """
    try:
        from opensearchpy import OpenSearch
        from datetime import datetime, timezone

        # OpenSearch client configuration
        client = OpenSearch(
            hosts=[OPENSEARCH_URL],
            http_auth=("admin", os.getenv("OPENSEARCH_PASSWORD", "opensearch_pwd")),
            use_ssl=False,
            verify_certs=False
        )

        # Create the index if it does not exist yet
        index_name = "notariat-documents"
        if not client.indices.exists(index=index_name):
            _create_opensearch_index(client, index_name)

        # Prepare the document
        document = {
            "doc_id": doc_id,
            "text": text,
            "document_type": classification.get("label", "document_inconnu"),
            "confidence": classification.get("confidence", 0.0),
            "extracted_data": extracted_data,
            # The "date" mapping expects an actual date value, not the literal string "now"
            "timestamp": datetime.now(timezone.utc).isoformat()
        }

        # Index the document
        response = client.index(
            index=index_name,
            id=doc_id,
            body=document
        )

        logger.info(f"Document {doc_id} indexed in OpenSearch: {response['result']}")

    except Exception as e:
        logger.error(f"Error while indexing in OpenSearch: {e}")
        raise

def _get_anythingllm_workspace(document_type: str) -> str:
    """
    Pick the AnythingLLM workspace for a given document type.
    """
    # Note: every document type currently resolves to the same
    # ANYLLM_WORKSPACE_ACTES workspace.
    workspace_mapping = {
        "acte_vente": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"),
        "acte_achat": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"),
        "donation": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"),
        "testament": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"),
        "succession": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"),
        "contrat_mariage": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"),
        "procuration": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"),
        "attestation": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"),
        "facture": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"),
        "document_inconnu": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes")
    }

    return workspace_mapping.get(document_type, "workspace_actes")

def _create_text_chunks(text: str, doc_id: str, classification: Dict[str, Any], extracted_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Split the text into chunks for indexing.
    """
    chunk_size = 2000  # target size for the embeddings
    overlap = 200  # overlap between consecutive chunks

    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size

        # Move the boundary back so a word is not cut in the middle
        if end < len(text):
            while end > start and text[end] not in [' ', '\n', '\t']:
                end -= 1
            if end == start:
                # No whitespace in the window (one very long token):
                # fall back to a hard cut to avoid an infinite loop
                end = start + chunk_size

        chunk_text = text[start:end].strip()

        if chunk_text:
            chunk = {
                "text": chunk_text,
                "metadata": {
                    "doc_id": doc_id,
                    "document_type": classification.get("label", "document_inconnu"),
                    "confidence": classification.get("confidence", 0.0),
                    "chunk_index": len(chunks),
                    "extracted_data": extracted_data
                }
            }
            chunks.append(chunk)

        start = end - overlap if end < len(text) else end

    return chunks

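# Worked example of the chunking arithmetic above (values illustrative): with
# chunk_size = 2000 and overlap = 200, a 5,000-character text yields three
# chunks starting near offsets 0, ~1800 and ~3600 (each boundary nudged back
# to the previous whitespace), so consecutive chunks share roughly 200
# characters of context.
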
def _create_opensearch_index(client, index_name: str):
    """
    Create the OpenSearch index with its mapping.
    """
    mapping = {
        "mappings": {
            "properties": {
                "doc_id": {"type": "keyword"},
                "text": {"type": "text", "analyzer": "french"},
                "document_type": {"type": "keyword"},
                "confidence": {"type": "float"},
                "extracted_data": {"type": "object"},
                "timestamp": {"type": "date"}
            }
        },
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "analysis": {
                "analyzer": {
                    "french": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["lowercase", "french_stop", "french_stemmer"]
                    }
                },
                "filter": {
                    "french_stop": {
                        "type": "stop",
                        "stopwords": "_french_"
                    },
                    "french_stemmer": {
                        "type": "stemmer",
                        "language": "french"
                    }
                }
            }
        }
    }

    client.indices.create(index=index_name, body=mapping)
    logger.info(f"OpenSearch index {index_name} created successfully")
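
The index created above can then be queried directly. A minimal search sketch, reusing the same OPENSEARCH_URL and OPENSEARCH_PASSWORD environment variables and the "notariat-documents" index name from the pipeline; the query terms themselves are only an illustration:

import os
from opensearchpy import OpenSearch

client = OpenSearch(
    hosts=[os.getenv("OPENSEARCH_URL", "http://opensearch:9200")],
    http_auth=("admin", os.getenv("OPENSEARCH_PASSWORD", "opensearch_pwd")),
    use_ssl=False,
    verify_certs=False
)

# Full-text match on the French-analyzed "text" field, filtered by document type
results = client.search(
    index="notariat-documents",
    body={
        "query": {
            "bool": {
                "must": [{"match": {"text": "vente immobilière"}}],
                "filter": [{"term": {"document_type": "acte_vente"}}]
            }
        }
    }
)

for hit in results["hits"]["hits"]:
    print(hit["_id"], hit["_score"], hit["_source"]["document_type"])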