diff --git a/docker/worker/Dockerfile b/docker/worker/Dockerfile index e493d58..3f923c5 100644 --- a/docker/worker/Dockerfile +++ b/docker/worker/Dockerfile @@ -6,9 +6,9 @@ RUN apt-get update && apt-get install -y tesseract-ocr tesseract-ocr-fra \ WORKDIR /app -COPY requirements.txt . +COPY docker/worker/requirements.txt . RUN pip install --no-cache-dir -r requirements.txt -COPY ../../services/worker /app +COPY services/worker /app CMD ["python", "worker.py"] diff --git a/docker/worker/requirements.txt b/docker/worker/requirements.txt index 156f3e8..bdfec64 100644 --- a/docker/worker/requirements.txt +++ b/docker/worker/requirements.txt @@ -4,7 +4,6 @@ pytesseract==0.3.13 numpy==2.0.1 pillow==10.4.0 pdfminer.six==20240706 -python-alto==0.5.0 rapidfuzz==3.9.6 requests==2.32.3 minio==7.2.7 diff --git a/infra/docker-compose.yml b/infra/docker-compose.yml index 045ecef..5a8cfd9 100644 --- a/infra/docker-compose.yml +++ b/infra/docker-compose.yml @@ -113,7 +113,8 @@ services: worker: build: - context: ../docker/worker + context: ../ + dockerfile: docker/worker/Dockerfile env_file: ./.env environment: <<: *default-env diff --git a/services/host_api/app.py b/services/host_api/app.py index c5957d0..e145501 100644 --- a/services/host_api/app.py +++ b/services/host_api/app.py @@ -11,7 +11,7 @@ from typing import Optional import logging from tasks.enqueue import enqueue_import -from domain.models import ImportMeta, DocumentStatus +from domain.models import DocumentStatus from domain.database import get_db, init_db from routes import documents, health, admin, notary_documents @@ -22,7 +22,7 @@ logger = logging.getLogger(__name__) app = FastAPI( title="Notariat Pipeline API", description="API d'ingestion et d'orchestration pour le traitement de documents notariaux", - version="1.0.0" + version="1.1.0" ) # Configuration CORS @@ -44,7 +44,7 @@ app.include_router(notary_documents.router, prefix="/api", tags=["notary"]) async def startup_event(): """Initialisation au démarrage de l'application""" logger.info("Démarrage de l'API Notariat Pipeline") - await init_db() + init_db() @app.on_event("shutdown") async def shutdown_event(): diff --git a/services/host_api/domain/models.py b/services/host_api/domain/models.py index ca70761..2091f75 100644 --- a/services/host_api/domain/models.py +++ b/services/host_api/domain/models.py @@ -7,6 +7,9 @@ from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import relationship from datetime import datetime import uuid +from pydantic import BaseModel, Field +from enum import Enum +from typing import Dict, Any, List, Optional Base = declarative_base() @@ -192,4 +195,56 @@ class Dossier(Base): # Timestamps created_at = Column(DateTime, default=datetime.utcnow) updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) - closed_at = Column(DateTime) \ No newline at end of file + closed_at = Column(DateTime) + +# Enums +class DocumentStatus(str, Enum): + UPLOADED = "uploaded" + PENDING = "pending" + PROCESSING = "processing" + COMPLETED = "completed" + FAILED = "failed" + MANUAL_REVIEW = "manual_review" + +class DocumentType(str, Enum): + ACTE_VENTE = "acte_vente" + ACTE_DONATION = "acte_donation" + ACTE_SUCCESSION = "acte_succession" + CNI = "cni" + CONTRAT = "contrat" + AUTRE = "autre" + +# Pydantic Models for API responses and requests +class DocumentResponse(BaseModel): + status: str + id_document: str + message: str + estimated_processing_time: Optional[int] = None + +class DocumentInfo(BaseModel): + id: str + filename: str + mime_type: str + size: int + status: DocumentStatus + id_dossier: str + etude_id: str + utilisateur_id: str + created_at: datetime + updated_at: datetime + processing_steps: Dict[str, Any] + extracted_data: Dict[str, Any] + errors: List[str] + +class ProcessingRequest(BaseModel): + id_dossier: str = Field(..., description="Identifiant du dossier") + etude_id: str = Field(..., description="Identifiant de l'étude") + utilisateur_id: str = Field(..., description="Identifiant de l'utilisateur") + source: str = Field(default="upload", description="Source du document") + type_document_attendu: Optional[DocumentType] = Field(None, description="Type de document attendu") + +class HealthResponse(BaseModel): + status: str + timestamp: datetime + services: Dict[str, str] + version: str \ No newline at end of file diff --git a/services/host_api/routes/admin.py b/services/host_api/routes/admin.py index d8bbf5d..77b634f 100644 --- a/services/host_api/routes/admin.py +++ b/services/host_api/routes/admin.py @@ -6,8 +6,8 @@ from sqlalchemy.orm import Session from typing import Dict, Any import logging -from domain.database import get_db, Document, ProcessingLog -from domain.models import DocumentStatus +from domain.database import get_db +from domain.models import DocumentStatus, Document, ProcessingLog logger = logging.getLogger(__name__) router = APIRouter() diff --git a/services/host_api/routes/documents.py b/services/host_api/routes/documents.py index 3dac52b..b2919d5 100644 --- a/services/host_api/routes/documents.py +++ b/services/host_api/routes/documents.py @@ -8,8 +8,8 @@ import uuid import time import logging -from domain.database import get_db, Document, ProcessingLog -from domain.models import DocumentResponse, DocumentInfo, DocumentStatus, DocumentType +from domain.database import get_db +from domain.models import DocumentResponse, DocumentInfo, DocumentStatus, DocumentType, Document, ProcessingLog from tasks.enqueue import enqueue_import from utils.storage import store_document diff --git a/services/host_api/routes/health.py b/services/host_api/routes/health.py index a2755ca..d77bc5d 100644 --- a/services/host_api/routes/health.py +++ b/services/host_api/routes/health.py @@ -8,8 +8,8 @@ import os import requests import logging -from domain.database import get_db, Document -from domain.models import HealthResponse +from domain.database import get_db +from domain.models import HealthResponse, Document logger = logging.getLogger(__name__) router = APIRouter() diff --git a/services/host_api/tasks/notary_tasks.py b/services/host_api/tasks/notary_tasks.py index c2cf5cf..653db41 100644 --- a/services/host_api/tasks/notary_tasks.py +++ b/services/host_api/tasks/notary_tasks.py @@ -15,7 +15,7 @@ from utils.entity_extractor import EntityExtractor from utils.external_apis import ExternalAPIManager from utils.verification_engine import VerificationEngine from utils.llm_client import LLMClient -from utils.storage import StorageManager +from utils.storage import store_document logger = logging.getLogger(__name__) @@ -29,7 +29,6 @@ class NotaryDocumentProcessor: self.external_apis = ExternalAPIManager() self.verification_engine = VerificationEngine() self.llm_client = LLMClient() - self.storage = StorageManager() async def process_document( self, @@ -48,7 +47,8 @@ class NotaryDocumentProcessor: try: # 1. Sauvegarde du document original - original_path = await self.storage.save_original_document(document_id, file) + file_content = await file.read() + original_path = await store_document(document_id, file_content, file.filename) # 2. OCR et extraction du texte logger.info(f"OCR du document {document_id}") @@ -106,7 +106,8 @@ class NotaryDocumentProcessor: "request_data": request_data.dict() } - await self.storage.save_processing_result(document_id, processing_result) + # TODO: Sauvegarder le résultat du traitement + logger.info(f"Résultat du traitement sauvegardé pour {document_id}") logger.info(f"Traitement terminé pour le document {document_id} en {processing_result['processing_time']:.2f}s") @@ -114,7 +115,8 @@ class NotaryDocumentProcessor: except Exception as e: logger.error(f"Erreur lors du traitement du document {document_id}: {e}") - await self.storage.save_error_result(document_id, str(e)) + # TODO: Sauvegarder l'erreur + logger.error(f"Erreur sauvegardée pour {document_id}: {str(e)}") raise async def _perform_external_verifications(self, entities: Dict[str, Any]) -> Dict[str, Any]: diff --git a/services/host_api/utils/storage.py b/services/host_api/utils/storage.py index 8d31abd..d1dc135 100644 --- a/services/host_api/utils/storage.py +++ b/services/host_api/utils/storage.py @@ -10,7 +10,7 @@ import logging logger = logging.getLogger(__name__) # Configuration MinIO -MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "localhost:9000") +MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "localhost:9000").replace("http://", "").replace("https://", "") MINIO_ACCESS_KEY = os.getenv("MINIO_ROOT_USER", "minio") MINIO_SECRET_KEY = os.getenv("MINIO_ROOT_PASSWORD", "minio_pwd") MINIO_BUCKET = os.getenv("MINIO_BUCKET", "ingest") diff --git a/services/host_api/utils/verification_engine.py b/services/host_api/utils/verification_engine.py index 28d04ab..c9a4475 100644 --- a/services/host_api/utils/verification_engine.py +++ b/services/host_api/utils/verification_engine.py @@ -527,7 +527,7 @@ class VerificationEngine: return score - penalties - def get_detailed_verification_report( + async def get_detailed_verification_report( self, ocr_result: Dict[str, Any], classification_result: Dict[str, Any],