
- Complete notarial document processing infrastructure
- FastAPI API for ingestion and orchestration
- Celery pipelines for asynchronous processing (a minimal task-wiring sketch follows this list)
- Support for PDF, JPEG, PNG, TIFF and HEIC formats
- OCR with Tesseract and lexical correction
- Automatic document classification with Ollama
- Structured data extraction
- Indexing in AnythingLLM and OpenSearch
- Business checks and verification system
- PostgreSQL database for business data
- Object storage with MinIO
- Neo4j graph database
- Full-text search with OpenSearch
- Monitoring with Prometheus and Grafana
- Installation scripts for Debian
- Complete documentation
- Unit and performance tests
- systemd service for deployment
- Automated deployment scripts
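A minimal sketch of how an asynchronous stage such as the indexing module below could be wired as a Celery task. The broker URL, task name and import path are assumptions for illustration only, not taken from the repository:

from celery import Celery

app = Celery("notariat", broker="redis://redis:6379/0")  # assumed broker URL

@app.task(name="pipeline.index_document")
def index_document(ctx: dict, doc_id: str) -> dict:
    """Run the indexing stage on a context built by the previous pipeline stages."""
    from pipelines import indexing  # assumed import path for the module below
    indexing.run(doc_id, ctx)
    return ctx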
"""
|
|
Pipeline d'indexation dans AnythingLLM et OpenSearch
|
|
"""
|
|
import os
|
|
import requests
|
|
import logging
|
|
from typing import Dict, Any, List
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Configuration des services
|
|
ANYLLM_BASE_URL = os.getenv("ANYLLM_BASE_URL", "http://anythingllm:3001")
|
|
ANYLLM_API_KEY = os.getenv("ANYLLM_API_KEY", "change_me")
|
|
OPENSEARCH_URL = os.getenv("OPENSEARCH_URL", "http://opensearch:9200")
|
|
|
|
def run(doc_id: str, ctx: dict):
    """
    Index the document in the search systems.
    """
    logger.info(f"Indexing document {doc_id}")

    try:
        # Retrieve the data produced by the previous pipeline stages
        extracted_text = ctx.get("extracted_text", "")
        classification = ctx.get("classification", {})
        extracted_data = ctx.get("extracted_data", {})

        if not extracted_text:
            raise ValueError("No extracted text available for indexing")

        # Index in AnythingLLM
        _index_in_anythingllm(doc_id, extracted_text, classification, extracted_data)

        # Index in OpenSearch
        _index_in_opensearch(doc_id, extracted_text, classification, extracted_data)

        # Indexing metadata
        index_meta = {
            "indexation_completed": True,
            "anythingllm_indexed": True,
            "opensearch_indexed": True,
            "text_length": len(extracted_text)
        }

        ctx["index_meta"] = index_meta

        logger.info(f"Indexing finished for document {doc_id}")

    except Exception as e:
        logger.error(f"Error while indexing document {doc_id}: {e}")
        raise

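# Illustrative shape of the pipeline context consumed by run(); only the keys
# are taken from the code above, the values are invented for the example.
#
#   ctx = {
#       "extracted_text": "ACTE DE VENTE ...",
#       "classification": {"label": "acte_vente", "confidence": 0.92},
#       "extracted_data": {},
#   }
#   run("doc-123", ctx)
#   # afterwards, ctx["index_meta"] records what was indexed
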
def _index_in_anythingllm(doc_id: str, text: str, classification: Dict[str, Any], extracted_data: Dict[str, Any]):
    """
    Index the document in AnythingLLM.
    """
    try:
        # Pick the workspace according to the document type
        workspace = _get_anythingllm_workspace(classification.get("label", "document_inconnu"))

        # Prepare the text chunks
        chunks = _create_text_chunks(text, doc_id, classification, extracted_data)

        # API headers
        headers = {
            "Authorization": f"Bearer {ANYLLM_API_KEY}",
            "Content-Type": "application/json"
        }

        # Index the chunks one by one
        for i, chunk in enumerate(chunks):
            payload = {
                "documents": [chunk]
            }

            response = requests.post(
                f"{ANYLLM_BASE_URL}/api/workspaces/{workspace}/documents",
                headers=headers,
                json=payload,
                timeout=60
            )

            if response.status_code not in [200, 201]:
                logger.warning(f"Error indexing chunk {i} in AnythingLLM: {response.status_code}")
            else:
                logger.info(f"Chunk {i} indexed in AnythingLLM workspace {workspace}")

    except Exception as e:
        logger.error(f"Error while indexing in AnythingLLM: {e}")
        raise

def _index_in_opensearch(doc_id: str, text: str, classification: Dict[str, Any], extracted_data: Dict[str, Any]):
    """
    Index the document in OpenSearch.
    """
    try:
        from opensearchpy import OpenSearch
        from datetime import datetime, timezone

        # OpenSearch client configuration
        client = OpenSearch(
            hosts=[OPENSEARCH_URL],
            http_auth=("admin", os.getenv("OPENSEARCH_PASSWORD", "opensearch_pwd")),
            use_ssl=False,
            verify_certs=False
        )

        # Create the index if it does not exist yet
        index_name = "notariat-documents"
        if not client.indices.exists(index=index_name):
            _create_opensearch_index(client, index_name)

        # Prepare the document
        document = {
            "doc_id": doc_id,
            "text": text,
            "document_type": classification.get("label", "document_inconnu"),
            "confidence": classification.get("confidence", 0.0),
            "extracted_data": extracted_data,
            # The "date" mapping expects an actual date value, not the literal string "now"
            "timestamp": datetime.now(timezone.utc).isoformat()
        }

        # Index the document
        response = client.index(
            index=index_name,
            id=doc_id,
            body=document
        )

        logger.info(f"Document {doc_id} indexed in OpenSearch: {response['result']}")

    except Exception as e:
        logger.error(f"Error while indexing in OpenSearch: {e}")
        raise

def _get_anythingllm_workspace(document_type: str) -> str:
    """
    Pick the AnythingLLM workspace for a given document type.
    """
    # Note: every document type currently resolves to the same
    # ANYLLM_WORKSPACE_ACTES workspace.
    workspace_mapping = {
        "acte_vente": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"),
        "acte_achat": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"),
        "donation": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"),
        "testament": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"),
        "succession": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"),
        "contrat_mariage": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"),
        "procuration": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"),
        "attestation": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"),
        "facture": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes"),
        "document_inconnu": os.getenv("ANYLLM_WORKSPACE_ACTES", "workspace_actes")
    }

    return workspace_mapping.get(document_type, "workspace_actes")

def _create_text_chunks(text: str, doc_id: str, classification: Dict[str, Any], extracted_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Split the text into chunks for indexing.
    """
    chunk_size = 2000  # target size for the embeddings
    overlap = 200  # overlap between consecutive chunks

    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size

        # Move the boundary back so a word is not cut in the middle
        if end < len(text):
            while end > start and text[end] not in [' ', '\n', '\t']:
                end -= 1
            if end == start:
                # No whitespace in the window (one very long token):
                # fall back to a hard cut to avoid an infinite loop
                end = start + chunk_size

        chunk_text = text[start:end].strip()

        if chunk_text:
            chunk = {
                "text": chunk_text,
                "metadata": {
                    "doc_id": doc_id,
                    "document_type": classification.get("label", "document_inconnu"),
                    "confidence": classification.get("confidence", 0.0),
                    "chunk_index": len(chunks),
                    "extracted_data": extracted_data
                }
            }
            chunks.append(chunk)

        start = end - overlap if end < len(text) else end

    return chunks

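# Worked example of the chunking arithmetic above (values illustrative): with
# chunk_size = 2000 and overlap = 200, a 5,000-character text yields three
# chunks starting near offsets 0, ~1800 and ~3600 (each boundary nudged back
# to the previous whitespace), so consecutive chunks share roughly 200
# characters of context.
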
def _create_opensearch_index(client, index_name: str):
    """
    Create the OpenSearch index with its mapping.
    """
    mapping = {
        "mappings": {
            "properties": {
                "doc_id": {"type": "keyword"},
                "text": {"type": "text", "analyzer": "french"},
                "document_type": {"type": "keyword"},
                "confidence": {"type": "float"},
                "extracted_data": {"type": "object"},
                "timestamp": {"type": "date"}
            }
        },
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "analysis": {
                "analyzer": {
                    "french": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["lowercase", "french_stop", "french_stemmer"]
                    }
                },
                "filter": {
                    "french_stop": {
                        "type": "stop",
                        "stopwords": "_french_"
                    },
                    "french_stemmer": {
                        "type": "stemmer",
                        "language": "french"
                    }
                }
            }
        }
    }

    client.indices.create(index=index_name, body=mapping)
    logger.info(f"OpenSearch index {index_name} created successfully")
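
The index created above can then be queried directly. A minimal search sketch, reusing the same OPENSEARCH_URL and OPENSEARCH_PASSWORD environment variables and the "notariat-documents" index name from the pipeline; the query terms themselves are only an illustration:

import os
from opensearchpy import OpenSearch

client = OpenSearch(
    hosts=[os.getenv("OPENSEARCH_URL", "http://opensearch:9200")],
    http_auth=("admin", os.getenv("OPENSEARCH_PASSWORD", "opensearch_pwd")),
    use_ssl=False,
    verify_certs=False
)

# Full-text match on the French-analyzed "text" field, filtered by document type
results = client.search(
    index="notariat-documents",
    body={
        "query": {
            "bool": {
                "must": [{"match": {"text": "vente immobilière"}}],
                "filter": [{"term": {"document_type": "acte_vente"}}]
            }
        }
    }
)

for hit in results["hits"]["hits"]:
    print(hit["_id"], hit["_score"], hit["_source"]["document_type"])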