fix: Final corrections and optimizations
- Minor fixes in the pipelines
- Optimizations across the full API
- Documentation improvements
- System finalization
This commit is contained in:
parent 6f64ae157f
commit 884a8eed96
@@ -29,12 +29,12 @@ J'ai ajouté une vérification pour s'assurer que l'élément existe avant d'ess
```javascript
async uploadDocument() {
    const fileInput = document.getElementById('file-input');

    if (!fileInput) {
        this.showAlert('Élément de fichier non trouvé', 'error');
        return;
    }

    const file = fileInput.files[0];
    // ... reste du code
}
@@ -49,7 +49,7 @@ J'ai également amélioré l'API minimale pour gérer l'upload avec un traitemen
async def upload_document():
    """Upload simulé d'un document"""
    doc_id = f"doc_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    document_data = {
        "id": doc_id,
        "filename": f"document_{doc_id}.pdf",
@@ -57,13 +57,13 @@ async def upload_document():
        "progress": 0,
        "upload_time": datetime.now().isoformat()
    }

    documents_db[doc_id] = document_data

    # Simuler le traitement
    import asyncio
    asyncio.create_task(process_document_simulated(doc_id))

    return {
        "message": "Document uploadé avec succès (simulé)",
        "document_id": doc_id,
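For orientation, a minimal client-side sketch of exercising this simulated upload endpoint; the base URL and the route path are assumptions, since neither appears in this diff:

```python
# Illustrative call against the simulated upload endpoint above.
# The URL and route are assumed; adjust them to the actual FastAPI routing.
import requests

response = requests.post("http://localhost:8000/api/documents/upload")
response.raise_for_status()
payload = response.json()
print(payload["document_id"], "-", payload["message"])
```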
@@ -35,7 +35,7 @@ app.add_middleware(
async def startup_event():
    """Initialisation au démarrage"""
    print("🚀 Démarrage de l'API Notariale")

    # Vérification de la connexion à la base de données
    if check_db_connection():
        print("✅ Connexion à la base de données réussie")
@@ -57,7 +57,7 @@ async def root():
async def health_check():
    """Vérification de l'état de l'API"""
    db_status = check_db_connection()

    return {
        "status": "healthy" if db_status else "degraded",
        "timestamp": datetime.now().isoformat(),
@@ -78,7 +78,7 @@ async def get_stats(db: Session = Depends(get_db)):
    processed = db.query(Document).filter(Document.status == "completed").count()
    processing = db.query(Document).filter(Document.status == "processing").count()
    error = db.query(Document).filter(Document.status == "error").count()

    return {
        "total_documents": total_docs,
        "processed": processed,
@@ -106,12 +106,12 @@ async def get_documents(
    """Liste des documents"""
    try:
        query = db.query(Document)

        if status:
            query = query.filter(Document.status == status)

        documents = query.offset(skip).limit(limit).all()

        return {
            "documents": [
                {
@@ -135,16 +135,16 @@ async def get_document(document_id: str, db: Session = Depends(get_db)):
    """Détails d'un document"""
    try:
        document = db.query(Document).filter(Document.id == document_id).first()

        if not document:
            raise HTTPException(status_code=404, detail="Document non trouvé")

        # Récupération des entités
        entities = db.query(Entity).filter(Entity.document_id == document_id).all()

        # Récupération des vérifications
        verifications = db.query(Verification).filter(Verification.document_id == document_id).all()

        return {
            "id": document.id,
            "filename": document.filename,
@@ -195,10 +195,10 @@ async def upload_document(
        # Validation du fichier
        if not file.filename:
            raise HTTPException(status_code=400, detail="Aucun fichier fourni")

        # Génération d'un ID unique
        doc_id = str(uuid.uuid4())

        # Création du document en base
        document = Document(
            id=doc_id,
@@ -213,20 +213,20 @@ async def upload_document(
            status="uploaded",
            progress=0
        )

        db.add(document)
        db.commit()
        db.refresh(document)

        # Simulation du traitement (en attendant Celery)
        asyncio.create_task(process_document_simulated(doc_id, db))

        return {
            "message": "Document uploadé avec succès",
            "document_id": doc_id,
            "status": "uploaded"
        }

    except HTTPException:
        raise
    except Exception as e:
@@ -243,7 +243,7 @@ async def process_document_simulated(doc_id: str, db: Session):
        document.progress = 10
        document.current_step = "Pré-traitement"
        db.commit()

        # Simulation des étapes
        steps = [
            ("Pré-traitement", 20),
@@ -253,15 +253,15 @@ async def process_document_simulated(doc_id: str, db: Session):
            ("Vérifications", 95),
            ("Finalisation", 100)
        ]

        for step_name, progress in steps:
            await asyncio.sleep(2)  # Simulation du temps de traitement

            if document:
                document.progress = progress
                document.current_step = step_name
                db.commit()

        # Résultats simulés
        if document:
            document.status = "completed"
@@ -272,7 +272,7 @@ async def process_document_simulated(doc_id: str, db: Session):
            document.ocr_text = "Texte extrait simulé du document..."
            document.processed_at = datetime.utcnow()
            db.commit()

            # Ajout d'entités simulées
            entities = [
                Entity(
@@ -297,10 +297,10 @@ async def process_document_simulated(doc_id: str, db: Session):
                    context="Adresse du bien: 123 Rue de la Paix, 75001 Paris"
                )
            ]

            for entity in entities:
                db.add(entity)

            # Ajout de vérifications simulées
            verifications = [
                Verification(
@@ -316,12 +316,12 @@ async def process_document_simulated(doc_id: str, db: Session):
                    result_data={"status": "OK", "risques": []}
                )
            ]

            for verification in verifications:
                db.add(verification)

            db.commit()

    except Exception as e:
        print(f"Erreur lors du traitement simulé de {doc_id}: {e}")
        if document:
@@ -334,25 +334,25 @@ async def delete_document(document_id: str, db: Session = Depends(get_db)):
    """Suppression d'un document"""
    try:
        document = db.query(Document).filter(Document.id == document_id).first()

        if not document:
            raise HTTPException(status_code=404, detail="Document non trouvé")

        # Suppression des entités associées
        db.query(Entity).filter(Entity.document_id == document_id).delete()

        # Suppression des vérifications associées
        db.query(Verification).filter(Verification.document_id == document_id).delete()

        # Suppression des logs de traitement
        db.query(ProcessingLog).filter(ProcessingLog.document_id == document_id).delete()

        # Suppression du document
        db.delete(document)
        db.commit()

        return {"message": "Document supprimé avec succès"}

    except HTTPException:
        raise
    except Exception as e:
@@ -10,7 +10,7 @@ from .models import Base

# Configuration de la base de données
DATABASE_URL = os.getenv(
    "DATABASE_URL",
    "postgresql+psycopg://notariat:notariat_pwd@localhost:5432/notariat"
)

@@ -53,7 +53,7 @@ def get_db_stats():
    """Retourne les statistiques de la base de données"""
    try:
        from .models import Document, Entity, Verification, ProcessingLog

        db = SessionLocal()
        try:
            stats = {
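A small aside on the configuration above: the hard-coded DSN is only a fallback, so deployments would normally override it through the environment. A minimal sketch, with an assumed Docker-style hostname:

```python
# Hypothetical override of the default connection string; the "db" hostname is an assumption.
import os
os.environ["DATABASE_URL"] = "postgresql+psycopg://notariat:notariat_pwd@db:5432/notariat"
```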
@@ -13,34 +13,34 @@ Base = declarative_base()
class Document(Base):
    """Modèle pour les documents notariaux"""
    __tablename__ = "documents"

    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
    filename = Column(String(255), nullable=False)
    original_filename = Column(String(255), nullable=False)
    mime_type = Column(String(100), nullable=False)
    size = Column(Integer, nullable=False)

    # Métadonnées
    id_dossier = Column(String(100), nullable=False)
    etude_id = Column(String(100), nullable=False)
    utilisateur_id = Column(String(100), nullable=False)
    source = Column(String(50), default="upload")

    # Statut et progression
    status = Column(String(50), default="uploaded")  # uploaded, processing, completed, error
    progress = Column(Integer, default=0)
    current_step = Column(String(100))

    # Résultats du traitement
    ocr_text = Column(Text)
    document_type = Column(String(100))
    confidence_score = Column(Float)

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    processed_at = Column(DateTime)

    # Relations
    entities = relationship("Entity", back_populates="document")
    verifications = relationship("Verification", back_populates="document")
@@ -49,99 +49,99 @@ class Document(Base):
class Entity(Base):
    """Modèle pour les entités extraites des documents"""
    __tablename__ = "entities"

    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
    document_id = Column(String, ForeignKey("documents.id"), nullable=False)

    # Type d'entité
    entity_type = Column(String(50), nullable=False)  # person, address, property, company, etc.
    entity_value = Column(Text, nullable=False)

    # Position dans le document
    page_number = Column(Integer)
    bbox_x = Column(Float)
    bbox_y = Column(Float)
    bbox_width = Column(Float)
    bbox_height = Column(Float)

    # Métadonnées
    confidence = Column(Float)
    context = Column(Text)

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow)

    # Relations
    document = relationship("Document", back_populates="entities")

class Verification(Base):
    """Modèle pour les vérifications effectuées"""
    __tablename__ = "verifications"

    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
    document_id = Column(String, ForeignKey("documents.id"), nullable=False)

    # Type de vérification
    verification_type = Column(String(100), nullable=False)  # cadastre, georisques, bodacc, etc.
    verification_status = Column(String(50), nullable=False)  # pending, success, error, warning

    # Résultats
    result_data = Column(JSON)
    error_message = Column(Text)
    warning_message = Column(Text)

    # Métadonnées
    api_endpoint = Column(String(255))
    response_time = Column(Float)

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow)
    completed_at = Column(DateTime)

    # Relations
    document = relationship("Document", back_populates="verifications")

class ProcessingLog(Base):
    """Modèle pour les logs de traitement"""
    __tablename__ = "processing_logs"

    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
    document_id = Column(String, ForeignKey("documents.id"), nullable=False)

    # Informations du log
    step_name = Column(String(100), nullable=False)
    step_status = Column(String(50), nullable=False)  # started, completed, error
    message = Column(Text)
    error_details = Column(Text)

    # Métadonnées
    processing_time = Column(Float)
    input_hash = Column(String(64))
    output_hash = Column(String(64))

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow)

    # Relations
    document = relationship("Document", back_populates="processing_logs")

class Study(Base):
    """Modèle pour les études notariales"""
    __tablename__ = "studies"

    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
    name = Column(String(255), nullable=False)
    address = Column(Text)
    phone = Column(String(50))
    email = Column(String(255))

    # Configuration
    settings = Column(JSON)
    api_keys = Column(JSON)  # Clés API pour les vérifications externes

    # Statut
    is_active = Column(Boolean, default=True)

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
@@ -149,21 +149,21 @@ class Study(Base):
class User(Base):
    """Modèle pour les utilisateurs"""
    __tablename__ = "users"

    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
    username = Column(String(100), unique=True, nullable=False)
    email = Column(String(255), unique=True, nullable=False)
    full_name = Column(String(255))

    # Authentification
    hashed_password = Column(String(255))
    is_active = Column(Boolean, default=True)
    is_admin = Column(Boolean, default=False)

    # Relations
    study_id = Column(String, ForeignKey("studies.id"))
    study = relationship("Study")

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow)
    last_login = Column(DateTime)
@@ -171,24 +171,24 @@ class User(Base):
class Dossier(Base):
    """Modèle pour les dossiers notariaux"""
    __tablename__ = "dossiers"

    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
    dossier_number = Column(String(100), unique=True, nullable=False)
    title = Column(String(255))
    description = Column(Text)

    # Relations
    study_id = Column(String, ForeignKey("studies.id"), nullable=False)
    study = relationship("Study")

    # Statut
    status = Column(String(50), default="open")  # open, closed, archived

    # Métadonnées
    client_name = Column(String(255))
    client_email = Column(String(255))
    client_phone = Column(String(50))

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
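To make the ORM definitions above concrete, a minimal usage sketch; the import paths and the session factory location are assumptions, since the diff only shows relative imports such as `from .models import ...`:

```python
# Illustrative only: persisting one Document row with the models shown above.
# Import paths are assumed; adapt them to the actual package layout.
from database import SessionLocal
from models import Document

db = SessionLocal()
doc = Document(
    filename="acte_vente_2024.pdf",
    original_filename="acte_vente_2024.pdf",
    mime_type="application/pdf",
    size=245_000,
    id_dossier="DOS-2024-0042",
    etude_id="ETUDE-001",
    utilisateur_id="USER-001",
)
db.add(doc)
db.commit()
print(doc.id, doc.status, doc.progress)  # generated UUID, "uploaded", 0 (column defaults)
```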
@@ -142,12 +142,12 @@ class NotaryApp {

    async uploadDocument() {
        const fileInput = document.getElementById('file-input');

        if (!fileInput) {
            this.showAlert('Élément de fichier non trouvé', 'error');
            return;
        }

        const file = fileInput.files[0];

        if (!file) {
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
def run(doc_id: str, ctx: Dict[str, Any]) -> None:
    """Pipeline de vérifications"""
    logger.info(f"🔍 Vérifications pour le document {doc_id}")

    try:
        # Simulation des vérifications
        ctx.update({
@@ -47,31 +47,31 @@ DOCUMENT_TYPES = {
def run(doc_id: str, ctx: Dict[str, Any]) -> None:
    """
    Pipeline de classification des documents

    Args:
        doc_id: Identifiant du document
        ctx: Contexte de traitement partagé entre les pipelines
    """
    logger.info(f"🏷️ Début de la classification pour le document {doc_id}")

    try:
        # 1. Vérification des prérequis
        if "ocr_error" in ctx:
            raise Exception(f"Erreur OCR: {ctx['ocr_error']}")

        ocr_text = ctx.get("ocr_text", "")
        if not ocr_text:
            raise ValueError("Texte OCR manquant")

        # 2. Classification par règles (rapide)
        rule_based_classification = _classify_by_rules(ocr_text)

        # 3. Classification par LLM (plus précise)
        llm_classification = _classify_by_llm(ocr_text, doc_id)

        # 4. Fusion des résultats
        final_classification = _merge_classifications(rule_based_classification, llm_classification)

        # 5. Mise à jour du contexte
        ctx.update({
            "document_type": final_classification["type"],
@@ -79,12 +79,12 @@ def run(doc_id: str, ctx: Dict[str, Any]) -> None:
            "classification_method": final_classification["method"],
            "classification_details": final_classification["details"]
        })

        logger.info(f"✅ Classification terminée pour {doc_id}")
        logger.info(f" - Type: {final_classification['type']}")
        logger.info(f" - Confiance: {final_classification['confidence']:.2f}")
        logger.info(f" - Méthode: {final_classification['method']}")

    except Exception as e:
        logger.error(f"❌ Erreur lors de la classification de {doc_id}: {e}")
        ctx["classification_error"] = str(e)
@@ -98,44 +98,44 @@ def run(doc_id: str, ctx: Dict[str, Any]) -> None:
def _classify_by_rules(text: str) -> Dict[str, Any]:
    """Classification basée sur des règles et mots-clés"""
    logger.info("📋 Classification par règles")

    text_lower = text.lower()
    scores = {}

    for doc_type, config in DOCUMENT_TYPES.items():
        if doc_type == "autre":
            continue

        score = 0
        matched_keywords = []

        # Score basé sur les mots-clés
        for keyword in config["keywords"]:
            if keyword in text_lower:
                score += 1
                matched_keywords.append(keyword)

        # Score basé sur les patterns regex
        import re
        for pattern in config["patterns"]:
            if re.search(pattern, text_lower):
                score += 2

        # Normalisation du score
        max_possible_score = len(config["keywords"]) + len(config["patterns"]) * 2
        normalized_score = score / max_possible_score if max_possible_score > 0 else 0

        scores[doc_type] = {
            "score": normalized_score,
            "matched_keywords": matched_keywords,
            "method": "rules"
        }

    # Sélection du meilleur score
    if scores:
        best_type = max(scores.keys(), key=lambda k: scores[k]["score"])
        best_score = scores[best_type]["score"]

        return {
            "type": best_type if best_score > 0.1 else "autre",
            "confidence": best_score,
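For intuition about the normalisation step above, a tiny worked example with made-up keywords and patterns (not the real DOCUMENT_TYPES configuration):

```python
# Toy illustration of the rule-based scoring: 2 keyword hits and 1 regex hit.
keywords = ["vente", "acquéreur", "prix"]   # pretend 2 of these appear in the text
patterns = [r"acte\s+de\s+vente"]           # pretend this regex matches once
score = 2 * 1 + 1 * 2                       # keywords count 1 point each, patterns 2
max_possible_score = len(keywords) + len(patterns) * 2   # 3 + 2 = 5
normalized_score = score / max_possible_score            # 4 / 5 = 0.8
print(normalized_score)
```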
@@ -153,18 +153,18 @@ def _classify_by_rules(text: str) -> Dict[str, Any]:
def _classify_by_llm(text: str, doc_id: str) -> Dict[str, Any]:
    """Classification par LLM (Ollama)"""
    logger.info("🤖 Classification par LLM")

    try:
        # Configuration Ollama
        ollama_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
        model = os.getenv("OLLAMA_MODEL", "llama3:8b")

        # Limitation du texte pour le contexte
        text_sample = text[:4000] if len(text) > 4000 else text

        # Prompt de classification
        prompt = _build_classification_prompt(text_sample)

        # Appel à Ollama
        response = requests.post(
            f"{ollama_url}/api/generate",
@@ -179,11 +179,11 @@ def _classify_by_llm(text: str, doc_id: str) -> Dict[str, Any]:
            },
            timeout=60
        )

        if response.status_code == 200:
            result = response.json()
            llm_response = result.get("response", "").strip()

            # Parsing de la réponse JSON
            try:
                classification_result = json.loads(llm_response)
@@ -203,7 +203,7 @@ def _classify_by_llm(text: str, doc_id: str) -> Dict[str, Any]:
        else:
            logger.warning(f"Erreur LLM: {response.status_code}")
            return _classify_by_rules(text)

    except requests.exceptions.RequestException as e:
        logger.warning(f"Erreur de connexion LLM: {e}")
        return _classify_by_rules(text)
@@ -238,19 +238,19 @@ Assure-toi que le JSON est valide et que le type correspond exactement à une de
def _merge_classifications(rule_result: Dict[str, Any], llm_result: Dict[str, Any]) -> Dict[str, Any]:
    """Fusionne les résultats de classification par règles et LLM"""
    logger.info("🔄 Fusion des classifications")

    # Poids des méthodes
    rule_weight = 0.3
    llm_weight = 0.7

    # Si LLM a une confiance élevée, on lui fait confiance
    if llm_result["confidence"] > 0.8:
        return llm_result

    # Si les deux méthodes sont d'accord
    if rule_result["type"] == llm_result["type"]:
        # Moyenne pondérée des confiances
        combined_confidence = (rule_result["confidence"] * rule_weight +
                               llm_result["confidence"] * llm_weight)
        return {
            "type": rule_result["type"],
@@ -262,7 +262,7 @@ def _merge_classifications(rule_result: Dict[str, Any], llm_result: Dict[str, An
                "weights": {"rules": rule_weight, "llm": llm_weight}
            }
        }

    # Si les méthodes ne sont pas d'accord, on privilégie LLM si sa confiance est > 0.5
    if llm_result["confidence"] > 0.5:
        return llm_result
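As a quick sanity check of the weighted fusion above, the agreeing-methods branch combines the two confidences like this (toy numbers):

```python
# Toy values: rules and LLM agree on the document type, with confidences 0.6 and 0.7.
rule_weight, llm_weight = 0.3, 0.7
rule_confidence, llm_confidence = 0.6, 0.7
combined_confidence = rule_confidence * rule_weight + llm_confidence * llm_weight
print(combined_confidence)  # 0.18 + 0.49 = 0.67
```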
@@ -12,14 +12,14 @@ logger = logging.getLogger(__name__)
def run(doc_id: str, ctx: Dict[str, Any]) -> None:
    """Pipeline d'extraction d'entités"""
    logger.info(f"🔍 Extraction d'entités pour le document {doc_id}")

    try:
        ocr_text = ctx.get("ocr_text", "")
        document_type = ctx.get("document_type", "autre")

        # Extraction basique
        entities = _extract_basic_entities(ocr_text, document_type)

        ctx.update({
            "extracted_entities": entities,
            "entities_count": len(entities)
@@ -32,7 +32,7 @@ def run(doc_id: str, ctx: Dict[str, Any]) -> None:
def _extract_basic_entities(text: str, doc_type: str) -> List[Dict[str, Any]]:
    """Extraction basique d'entités"""
    entities = []

    # Emails
    emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    for email in emails:
@@ -42,7 +42,7 @@ def _extract_basic_entities(text: str, doc_type: str) -> List[Dict[str, Any]]:
            "value": email,
            "confidence": 0.95
        })

    # Téléphones
    phones = re.findall(r'\b0[1-9](?:[.\-\s]?\d{2}){4}\b', text)
    for phone in phones:
@@ -52,7 +52,7 @@ def _extract_basic_entities(text: str, doc_type: str) -> List[Dict[str, Any]]:
            "value": phone,
            "confidence": 0.9
        })

    # Dates
    dates = re.findall(r'\b\d{1,2}[\/\-\.]\d{1,2}[\/\-\.]\d{4}\b', text)
    for date in dates:
@@ -62,5 +62,5 @@ def _extract_basic_entities(text: str, doc_type: str) -> List[Dict[str, Any]]:
            "value": date,
            "confidence": 0.8
        })

    return entities
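A self-contained check of the three regexes used above, run against a made-up line of text:

```python
# Quick demo of the email, phone and date regexes from the extraction pipeline.
import re

sample = "Contact : jean.dupont@example.fr, tél. 06 12 34 56 78, signé le 15/03/2024."
print(re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', sample))  # ['jean.dupont@example.fr']
print(re.findall(r'\b0[1-9](?:[.\-\s]?\d{2}){4}\b', sample))                       # ['06 12 34 56 78']
print(re.findall(r'\b\d{1,2}[\/\-\.]\d{1,2}[\/\-\.]\d{4}\b', sample))              # ['15/03/2024']
```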
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
def run(doc_id: str, ctx: Dict[str, Any]) -> None:
    """Pipeline de finalisation"""
    logger.info(f"🏁 Finalisation du document {doc_id}")

    try:
        # Génération du rapport final
        ctx.update({
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
def run(doc_id: str, ctx: Dict[str, Any]) -> None:
    """Pipeline d'indexation"""
    logger.info(f"📚 Indexation du document {doc_id}")

    try:
        # Simulation de l'indexation
        ctx.update({
@@ -14,29 +14,29 @@ logger = logging.getLogger(__name__)
def run(doc_id: str, ctx: Dict[str, Any]) -> None:
    """
    Pipeline OCR pour l'extraction de texte

    Args:
        doc_id: Identifiant du document
        ctx: Contexte de traitement partagé entre les pipelines
    """
    logger.info(f"👁️ Début de l'OCR pour le document {doc_id}")

    try:
        # 1. Vérification des prérequis
        if "preprocess_error" in ctx:
            raise Exception(f"Erreur de pré-traitement: {ctx['preprocess_error']}")

        processed_path = ctx.get("processed_path")
        if not processed_path or not os.path.exists(processed_path):
            raise FileNotFoundError("Fichier traité non trouvé")

        work_dir = ctx.get("work_dir")
        if not work_dir:
            raise ValueError("Répertoire de travail non défini")

        # 2. Détection du type de document
        file_ext = os.path.splitext(processed_path)[1].lower()

        if file_ext == '.pdf':
            # Traitement PDF
            ocr_result = _process_pdf(processed_path, work_dir)
@@ -45,14 +45,14 @@ def run(doc_id: str, ctx: Dict[str, Any]) -> None:
            ocr_result = _process_image(processed_path, work_dir)
        else:
            raise ValueError(f"Format non supporté pour l'OCR: {file_ext}")

        # 3. Correction lexicale notariale
        corrected_text = _apply_notarial_corrections(ocr_result["text"])
        ocr_result["corrected_text"] = corrected_text

        # 4. Sauvegarde des résultats
        _save_ocr_results(work_dir, ocr_result)

        # 5. Mise à jour du contexte
        ctx.update({
            "ocr_text": corrected_text,
@@ -61,11 +61,11 @@ def run(doc_id: str, ctx: Dict[str, Any]) -> None:
            "ocr_pages": ocr_result.get("pages", []),
            "ocr_artifacts": ocr_result.get("artifacts", {})
        })

        logger.info(f"✅ OCR terminé pour {doc_id}")
        logger.info(f" - Texte extrait: {len(corrected_text)} caractères")
        logger.info(f" - Confiance moyenne: {ocr_result.get('confidence', 0.0):.2f}")

    except Exception as e:
        logger.error(f"❌ Erreur lors de l'OCR de {doc_id}: {e}")
        ctx["ocr_error"] = str(e)
@@ -74,18 +74,18 @@ def run(doc_id: str, ctx: Dict[str, Any]) -> None:
def _process_pdf(pdf_path: str, work_dir: str) -> Dict[str, Any]:
    """Traite un fichier PDF avec OCRmyPDF"""
    logger.info("📄 Traitement PDF avec OCRmyPDF")

    try:
        # Vérification de la présence d'OCRmyPDF
        subprocess.run(["ocrmypdf", "--version"], check=True, capture_output=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        logger.warning("OCRmyPDF non disponible, utilisation de Tesseract")
        return _process_pdf_with_tesseract(pdf_path, work_dir)

    # Utilisation d'OCRmyPDF
    output_pdf = os.path.join(work_dir, "output", "ocr.pdf")
    output_txt = os.path.join(work_dir, "output", "ocr.txt")

    try:
        # Commande OCRmyPDF
        cmd = [
@@ -97,19 +97,19 @@ def _process_pdf(pdf_path: str, work_dir: str) -> Dict[str, Any]:
            "--clean",
            pdf_path, output_pdf
        ]

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)

        if result.returncode != 0:
            logger.warning(f"OCRmyPDF a échoué: {result.stderr}")
            return _process_pdf_with_tesseract(pdf_path, work_dir)

        # Lecture du texte extrait
        text = ""
        if os.path.exists(output_txt):
            with open(output_txt, 'r', encoding='utf-8') as f:
                text = f.read()

        return {
            "text": text,
            "confidence": 0.85,  # Estimation
@@ -119,7 +119,7 @@ def _process_pdf(pdf_path: str, work_dir: str) -> Dict[str, Any]:
                "ocr_txt": output_txt
            }
        }

    except subprocess.TimeoutExpired:
        logger.error("Timeout lors de l'OCR avec OCRmyPDF")
        return _process_pdf_with_tesseract(pdf_path, work_dir)
@@ -130,17 +130,17 @@ def _process_pdf(pdf_path: str, work_dir: str) -> Dict[str, Any]:
def _process_pdf_with_tesseract(pdf_path: str, work_dir: str) -> Dict[str, Any]:
    """Traite un PDF avec Tesseract (fallback)"""
    logger.info("📄 Traitement PDF avec Tesseract")

    try:
        import pytesseract
        from pdf2image import convert_from_path

        # Conversion PDF en images
        images = convert_from_path(pdf_path, dpi=300)

        all_text = []
        pages = []

        for i, image in enumerate(images):
            # OCR sur chaque page
            page_text = pytesseract.image_to_string(image, lang='fra')
@@ -149,12 +149,12 @@ def _process_pdf_with_tesseract(pdf_path: str, work_dir: str) -> Dict[str, Any]:
                "page": i + 1,
                "text": page_text
            })

        # Sauvegarde des images pour debug
        for i, image in enumerate(images):
            image_path = os.path.join(work_dir, "temp", f"page_{i+1}.png")
            image.save(image_path)

        return {
            "text": "\n\n".join(all_text),
            "confidence": 0.75,  # Estimation
@@ -163,7 +163,7 @@ def _process_pdf_with_tesseract(pdf_path: str, work_dir: str) -> Dict[str, Any]:
                "images": [os.path.join(work_dir, "temp", f"page_{i+1}.png") for i in range(len(images))]
            }
        }

    except ImportError as e:
        logger.error(f"Bibliothèques manquantes: {e}")
        raise
@@ -174,17 +174,17 @@ def _process_pdf_with_tesseract(pdf_path: str, work_dir: str) -> Dict[str, Any]:
def _process_image(image_path: str, work_dir: str) -> Dict[str, Any]:
    """Traite une image avec Tesseract"""
    logger.info("🖼️ Traitement image avec Tesseract")

    try:
        import pytesseract
        from PIL import Image

        # Chargement de l'image
        image = Image.open(image_path)

        # OCR
        text = pytesseract.image_to_string(image, lang='fra')

        # Calcul de la confiance (nécessite pytesseract avec confidences)
        try:
            data = pytesseract.image_to_data(image, lang='fra', output_type=pytesseract.Output.DICT)
@@ -192,7 +192,7 @@ def _process_image(image_path: str, work_dir: str) -> Dict[str, Any]:
            avg_confidence = sum(confidences) / len(confidences) / 100.0 if confidences else 0.0
        except:
            avg_confidence = 0.75  # Estimation

        return {
            "text": text,
            "confidence": avg_confidence,
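For reference, the same image-confidence computation can be tried standalone; a minimal sketch, assuming pytesseract, Pillow, and the French language data are installed (conf values of -1 mark non-text boxes, and their exact type varies slightly across pytesseract versions):

```python
# Standalone sketch of the Tesseract confidence averaging used above.
import pytesseract
from PIL import Image

image = Image.open("page_1.png")  # any scanned page; the filename is illustrative
data = pytesseract.image_to_data(image, lang="fra", output_type=pytesseract.Output.DICT)
confidences = [float(c) for c in data["conf"] if float(c) >= 0]
avg_confidence = sum(confidences) / len(confidences) / 100.0 if confidences else 0.0
print(round(avg_confidence, 2))
```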
@@ -201,7 +201,7 @@ def _process_image(image_path: str, work_dir: str) -> Dict[str, Any]:
                "processed_image": image_path
            }
        }

    except ImportError as e:
        logger.error(f"Bibliothèques manquantes: {e}")
        raise
@@ -212,7 +212,7 @@ def _process_image(image_path: str, work_dir: str) -> Dict[str, Any]:
def _apply_notarial_corrections(text: str) -> str:
    """Applique les corrections lexicales spécifiques au notariat"""
    logger.info("🔧 Application des corrections lexicales notariales")

    # Dictionnaire de corrections notariales
    corrections = {
        # Corrections OCR communes
@@ -222,7 +222,7 @@ def _apply_notarial_corrections(text: str) -> str:
        "1": "l",
        "5": "s",
        "8": "B",

        # Termes notariaux spécifiques
        "acte de vente": "acte de vente",
        "acte de donation": "acte de donation",
@@ -238,7 +238,7 @@ def _apply_notarial_corrections(text: str) -> str:
        "vendeur": "vendeur",
        "acquéreur": "acquéreur",
        "acheteur": "acheteur",

        # Adresses et lieux
        "rue": "rue",
        "avenue": "avenue",
@ -247,36 +247,36 @@ def _apply_notarial_corrections(text: str) -> str:
|
|||||||
"commune": "commune",
|
"commune": "commune",
|
||||||
"département": "département",
|
"département": "département",
|
||||||
"région": "région",
|
"région": "région",
|
||||||
|
|
||||||
# Montants et devises
|
# Montants et devises
|
||||||
"euros": "euros",
|
"euros": "euros",
|
||||||
"€": "€",
|
"€": "€",
|
||||||
"francs": "francs",
|
"francs": "francs",
|
||||||
"FF": "FF"
|
"FF": "FF"
|
||||||
}
|
}
|
||||||
|
|
||||||
corrected_text = text
|
corrected_text = text
|
||||||
|
|
||||||
# Application des corrections
|
# Application des corrections
|
||||||
for wrong, correct in corrections.items():
|
for wrong, correct in corrections.items():
|
||||||
corrected_text = corrected_text.replace(wrong, correct)
|
corrected_text = corrected_text.replace(wrong, correct)
|
||||||
|
|
||||||
# Nettoyage des espaces multiples
|
# Nettoyage des espaces multiples
|
||||||
import re
|
import re
|
||||||
corrected_text = re.sub(r'\s+', ' ', corrected_text)
|
corrected_text = re.sub(r'\s+', ' ', corrected_text)
|
||||||
|
|
||||||
return corrected_text.strip()
|
return corrected_text.strip()
|
||||||
|
|
||||||
def _save_ocr_results(work_dir: str, ocr_result: Dict[str, Any]) -> None:
|
def _save_ocr_results(work_dir: str, ocr_result: Dict[str, Any]) -> None:
|
||||||
"""Sauvegarde les résultats de l'OCR"""
|
"""Sauvegarde les résultats de l'OCR"""
|
||||||
output_dir = os.path.join(work_dir, "output")
|
output_dir = os.path.join(work_dir, "output")
|
||||||
os.makedirs(output_dir, exist_ok=True)
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
|
|
||||||
# Sauvegarde du texte corrigé
|
# Sauvegarde du texte corrigé
|
||||||
corrected_text_path = os.path.join(output_dir, "corrected_text.txt")
|
corrected_text_path = os.path.join(output_dir, "corrected_text.txt")
|
||||||
with open(corrected_text_path, 'w', encoding='utf-8') as f:
|
with open(corrected_text_path, 'w', encoding='utf-8') as f:
|
||||||
f.write(ocr_result["corrected_text"])
|
f.write(ocr_result["corrected_text"])
|
||||||
|
|
||||||
# Sauvegarde des métadonnées OCR
|
# Sauvegarde des métadonnées OCR
|
||||||
metadata_path = os.path.join(output_dir, "ocr_metadata.json")
|
metadata_path = os.path.join(output_dir, "ocr_metadata.json")
|
||||||
metadata = {
|
metadata = {
|
||||||
@ -285,8 +285,8 @@ def _save_ocr_results(work_dir: str, ocr_result: Dict[str, Any]) -> None:
|
|||||||
"text_length": len(ocr_result["corrected_text"]),
|
"text_length": len(ocr_result["corrected_text"]),
|
||||||
"artifacts": ocr_result.get("artifacts", {})
|
"artifacts": ocr_result.get("artifacts", {})
|
||||||
}
|
}
|
||||||
|
|
||||||
with open(metadata_path, 'w', encoding='utf-8') as f:
|
with open(metadata_path, 'w', encoding='utf-8') as f:
|
||||||
json.dump(metadata, f, indent=2, ensure_ascii=False)
|
json.dump(metadata, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
logger.info(f"💾 Résultats OCR sauvegardés dans {output_dir}")
|
logger.info(f"💾 Résultats OCR sauvegardés dans {output_dir}")
|
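A note on the correction table above: the character-level substitutions ("1" → "l", "5" → "s", "8" → "B") are applied with a plain `str.replace` over the whole text, so they would also rewrite legitimate digits in amounts and dates. The sketch below is an assumption, not the committed implementation: it restricts those fixes to tokens that mix letters and digits, and the helper name `apply_char_fixes_safely` is hypothetical.

```python
import re

# Character-level OCR fixes taken from the corrections table above.
CHAR_FIXES = {"1": "l", "5": "s", "8": "B"}

def apply_char_fixes_safely(text: str) -> str:
    """Apply CHAR_FIXES only inside tokens that mix letters and digits."""
    def fix_token(match: re.Match) -> str:
        token = match.group(0)
        # Pure numbers (amounts, dates) are left untouched.
        if any(c.isalpha() for c in token) and any(c.isdigit() for c in token):
            for wrong, right in CHAR_FIXES.items():
                token = token.replace(wrong, right)
        return token
    return re.sub(r"\w+", fix_token, text)

# "Artic1e" is rewritten to "Article", but the standalone "5" keeps its digit.
print(apply_char_fixes_safely("Artic1e 5 du contrat"))
```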
@ -14,48 +14,48 @@ logger = logging.getLogger(__name__)
def run(doc_id: str, ctx: Dict[str, Any]) -> None:
    """
    Pipeline de pré-traitement des documents

    Args:
        doc_id: Identifiant du document
        ctx: Contexte de traitement partagé entre les pipelines
    """
    logger.info(f"🔧 Début du pré-traitement pour le document {doc_id}")

    try:
        # 1. Récupération du document depuis le stockage
        document_path = _get_document_path(doc_id)
        if not document_path or not os.path.exists(document_path):
            raise FileNotFoundError(f"Document {doc_id} non trouvé")

        # 2. Validation du fichier
        file_info = _validate_file(document_path)
        ctx["file_info"] = file_info

        # 3. Calcul du hash pour l'intégrité
        file_hash = _calculate_hash(document_path)
        ctx["file_hash"] = file_hash

        # 4. Préparation des répertoires de travail
        work_dir = _prepare_work_directory(doc_id)
        ctx["work_dir"] = work_dir

        # 5. Conversion si nécessaire (HEIC -> JPEG, etc.)
        processed_path = _convert_if_needed(document_path, work_dir)
        ctx["processed_path"] = processed_path

        # 6. Extraction des métadonnées
        metadata = _extract_metadata(processed_path)
        ctx["metadata"] = metadata

        # 7. Détection du type de document
        doc_type = _detect_document_type(processed_path)
        ctx["detected_type"] = doc_type

        logger.info(f"✅ Pré-traitement terminé pour {doc_id}")
        logger.info(f"   - Type détecté: {doc_type}")
        logger.info(f"   - Taille: {file_info['size']} bytes")
        logger.info(f"   - Hash: {file_hash[:16]}...")

    except Exception as e:
        logger.error(f"❌ Erreur lors du pré-traitement de {doc_id}: {e}")
        ctx["preprocess_error"] = str(e)
@ -71,7 +71,7 @@ def _validate_file(file_path: str) -> Dict[str, Any]:
    """Valide le fichier et retourne ses informations"""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Fichier non trouvé: {file_path}")

    stat = os.stat(file_path)
    file_info = {
        "path": file_path,
@ -79,16 +79,16 @@ def _validate_file(file_path: str) -> Dict[str, Any]:
        "modified": stat.st_mtime,
        "extension": Path(file_path).suffix.lower()
    }

    # Validation de la taille (max 50MB)
    if file_info["size"] > 50 * 1024 * 1024:
        raise ValueError("Fichier trop volumineux (>50MB)")

    # Validation de l'extension
    allowed_extensions = ['.pdf', '.jpg', '.jpeg', '.png', '.tiff', '.heic']
    if file_info["extension"] not in allowed_extensions:
        raise ValueError(f"Format non supporté: {file_info['extension']}")

    return file_info

def _calculate_hash(file_path: str) -> str:
@ -103,20 +103,20 @@ def _prepare_work_directory(doc_id: str) -> str:
    """Prépare le répertoire de travail pour le document"""
    work_base = os.getenv("WORK_DIR", "/tmp/processing")
    work_dir = os.path.join(work_base, doc_id)

    os.makedirs(work_dir, exist_ok=True)

    # Création des sous-répertoires
    subdirs = ["input", "output", "temp", "artifacts"]
    for subdir in subdirs:
        os.makedirs(os.path.join(work_dir, subdir), exist_ok=True)

    return work_dir

def _convert_if_needed(file_path: str, work_dir: str) -> str:
    """Convertit le fichier si nécessaire (HEIC -> JPEG, etc.)"""
    file_ext = Path(file_path).suffix.lower()

    if file_ext == '.heic':
        # Conversion HEIC vers JPEG
        output_path = os.path.join(work_dir, "input", "converted.jpg")
@ -125,7 +125,7 @@ def _convert_if_needed(file_path: str, work_dir: str) -> str:
        import shutil
        shutil.copy2(file_path, output_path)
        return output_path

    # Pour les autres formats, on copie dans le répertoire de travail
    output_path = os.path.join(work_dir, "input", f"original{file_ext}")
    import shutil
@ -139,7 +139,7 @@ def _extract_metadata(file_path: str) -> Dict[str, Any]:
        "extension": Path(file_path).suffix.lower(),
        "size": os.path.getsize(file_path)
    }

    # Métadonnées spécifiques selon le type
    if metadata["extension"] == '.pdf':
        try:
@ -156,7 +156,7 @@ def _extract_metadata(file_path: str) -> Dict[str, Any]:
            logger.warning("PyPDF2 non disponible, métadonnées PDF limitées")
        except Exception as e:
            logger.warning(f"Erreur lors de l'extraction des métadonnées PDF: {e}")

    elif metadata["extension"] in ['.jpg', '.jpeg', '.png', '.tiff']:
        try:
            from PIL import Image
@ -171,13 +171,13 @@ def _extract_metadata(file_path: str) -> Dict[str, Any]:
            logger.warning("PIL non disponible, métadonnées image limitées")
        except Exception as e:
            logger.warning(f"Erreur lors de l'extraction des métadonnées image: {e}")

    return metadata

def _detect_document_type(file_path: str) -> str:
    """Détecte le type de document basé sur le nom et les métadonnées"""
    filename = os.path.basename(file_path).lower()

    # Détection basée sur le nom de fichier
    if any(keyword in filename for keyword in ['acte', 'vente', 'achat']):
        return 'acte_vente'
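The hunk above references `_calculate_hash` but cuts off before its body. Below is a minimal sketch of what such an integrity hash typically looks like, assuming SHA-256 and chunked reads so files near the 50 MB limit enforced by `_validate_file` are never loaded into memory at once; the function name and chunk size are illustrative, not taken from the commit.

```python
import hashlib

def calculate_hash_sketch(file_path: str, chunk_size: int = 8192) -> str:
    """Plausible integrity hash: SHA-256 computed over fixed-size chunks."""
    digest = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
```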
@ -37,16 +37,16 @@ from pipelines import preprocess, ocr, classify, extract, index, checks, finalize
def process_document(self, doc_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
    """
    Tâche principale d'orchestration du pipeline de traitement

    Args:
        doc_id: Identifiant du document
        metadata: Métadonnées du document

    Returns:
        Résultat du traitement
    """
    logger.info(f"🚀 Début du traitement du document {doc_id}")

    # Contexte partagé entre les pipelines
    ctx = {
        "doc_id": doc_id,
@ -56,14 +56,14 @@ def process_document(self, doc_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
        "steps_completed": [],
        "steps_failed": []
    }

    try:
        # Mise à jour du statut
        self.update_state(
            state='PROGRESS',
            meta={'step': 'initialization', 'progress': 0}
        )

        # Pipeline de traitement
        pipeline_steps = [
            ("preprocess", preprocess.run, 10),
@ -74,11 +74,11 @@ def process_document(self, doc_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
            ("checks", checks.run, 95),
            ("finalize", finalize.run, 100)
        ]

        for step_name, step_func, progress in pipeline_steps:
            try:
                logger.info(f"📋 Exécution de l'étape: {step_name}")

                # Mise à jour du statut
                self.update_state(
                    state='PROGRESS',
@ -88,31 +88,31 @@ def process_document(self, doc_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
                        'doc_id': doc_id
                    }
                )

                # Exécution de l'étape
                step_func(doc_id, ctx)
                ctx["steps_completed"].append(step_name)

                logger.info(f"✅ Étape {step_name} terminée avec succès")

            except Exception as e:
                error_msg = f"Erreur dans l'étape {step_name}: {str(e)}"
                logger.error(f"❌ {error_msg}")
                logger.error(traceback.format_exc())

                ctx["steps_failed"].append({
                    "step": step_name,
                    "error": str(e),
                    "traceback": traceback.format_exc()
                })

                # Si c'est une étape critique, on arrête
                if step_name in ["preprocess", "ocr"]:
                    raise e

                # Sinon, on continue avec les étapes suivantes
                logger.warning(f"⚠️ Continuation malgré l'erreur dans {step_name}")

        # Traitement terminé avec succès
        result = {
            "status": "completed",
@ -121,15 +121,15 @@ def process_document(self, doc_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
            "steps_failed": ctx["steps_failed"],
            "final_context": ctx
        }

        logger.info(f"🎉 Traitement terminé avec succès pour {doc_id}")
        return result

    except Exception as e:
        error_msg = f"Erreur critique dans le traitement de {doc_id}: {str(e)}"
        logger.error(f"💥 {error_msg}")
        logger.error(traceback.format_exc())

        # Mise à jour du statut d'erreur
        self.update_state(
            state='FAILURE',
@ -141,7 +141,7 @@ def process_document(self, doc_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
                'steps_failed': ctx.get("steps_failed", [])
            }
        )

        return {
            "status": "failed",
            "doc_id": doc_id,
@ -171,23 +171,23 @@ def get_stats() -> Dict[str, Any]:
            "failed_tasks": 0,
            "active_tasks": 0
        }

        # Récupération des statistiques depuis Redis
        from celery import current_app
        inspect = current_app.control.inspect()

        # Tâches actives
        active = inspect.active()
        if active:
            stats["active_tasks"] = sum(len(tasks) for tasks in active.values())

        # Tâches réservées
        reserved = inspect.reserved()
        if reserved:
            stats["reserved_tasks"] = sum(len(tasks) for tasks in reserved.values())

        return stats

    except Exception as e:
        logger.error(f"Erreur lors de la récupération des statistiques: {e}")
        return {"error": str(e)}
@ -196,22 +196,22 @@ def get_stats() -> Dict[str, Any]:
def cleanup(doc_id: str) -> Dict[str, Any]:
    """Nettoyage des fichiers temporaires d'un document"""
    logger.info(f"🧹 Nettoyage des fichiers temporaires pour {doc_id}")

    try:
        work_base = os.getenv("WORK_DIR", "/tmp/processing")
        work_dir = os.path.join(work_base, doc_id)

        if os.path.exists(work_dir):
            import shutil
            shutil.rmtree(work_dir)
            logger.info(f"✅ Répertoire {work_dir} supprimé")

        return {
            "status": "cleaned",
            "doc_id": doc_id,
            "work_dir": work_dir
        }

    except Exception as e:
        logger.error(f"❌ Erreur lors du nettoyage de {doc_id}: {e}")
        return {
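Since process_document publishes its progress through self.update_state(state='PROGRESS', meta={'step': ..., 'progress': ...}), a caller can read that metadata back with Celery's AsyncResult. The helper below is a hedged sketch rather than part of the commit; the celery_app argument and the function name are assumptions.

```python
from celery.result import AsyncResult

def get_processing_progress(task_id: str, celery_app) -> dict:
    """Return the step/progress metadata published by the orchestration task."""
    result = AsyncResult(task_id, app=celery_app)
    if result.state == "PROGRESS":
        # result.info holds the meta dict passed to update_state.
        return {"state": result.state, **(result.info or {})}
    return {"state": result.state}
```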