""" Pipeline OCR pour l'extraction de texte des documents """ import os import tempfile import subprocess import json from typing import Dict, Any import logging logger = logging.getLogger(__name__) def run(doc_id: str, ctx: Dict[str, Any]) -> None: """ Pipeline OCR pour l'extraction de texte Args: doc_id: Identifiant du document ctx: Contexte de traitement partagé entre les pipelines """ logger.info(f"👁️ Début de l'OCR pour le document {doc_id}") try: # 1. Vérification des prérequis if "preprocess_error" in ctx: raise Exception(f"Erreur de pré-traitement: {ctx['preprocess_error']}") processed_path = ctx.get("processed_path") if not processed_path or not os.path.exists(processed_path): raise FileNotFoundError("Fichier traité non trouvé") work_dir = ctx.get("work_dir") if not work_dir: raise ValueError("Répertoire de travail non défini") # 2. Détection du type de document file_ext = os.path.splitext(processed_path)[1].lower() if file_ext == '.pdf': # Traitement PDF ocr_result = _process_pdf(processed_path, work_dir) elif file_ext in ['.jpg', '.jpeg', '.png', '.tiff']: # Traitement image ocr_result = _process_image(processed_path, work_dir) else: raise ValueError(f"Format non supporté pour l'OCR: {file_ext}") # 3. Correction lexicale notariale corrected_text = _apply_notarial_corrections(ocr_result["text"]) ocr_result["corrected_text"] = corrected_text # 4. Sauvegarde des résultats _save_ocr_results(work_dir, ocr_result) # 5. Mise à jour du contexte ctx.update({ "ocr_text": corrected_text, "ocr_raw_text": ocr_result["text"], "ocr_confidence": ocr_result.get("confidence", 0.0), "ocr_pages": ocr_result.get("pages", []), "ocr_artifacts": ocr_result.get("artifacts", {}) }) logger.info(f"✅ OCR terminé pour {doc_id}") logger.info(f" - Texte extrait: {len(corrected_text)} caractères") logger.info(f" - Confiance moyenne: {ocr_result.get('confidence', 0.0):.2f}") except Exception as e: logger.error(f"❌ Erreur lors de l'OCR de {doc_id}: {e}") ctx["ocr_error"] = str(e) raise def _process_pdf(pdf_path: str, work_dir: str) -> Dict[str, Any]: """Traite un fichier PDF avec OCRmyPDF""" logger.info("📄 Traitement PDF avec OCRmyPDF") try: # Vérification de la présence d'OCRmyPDF subprocess.run(["ocrmypdf", "--version"], check=True, capture_output=True) except (subprocess.CalledProcessError, FileNotFoundError): logger.warning("OCRmyPDF non disponible, utilisation de Tesseract") return _process_pdf_with_tesseract(pdf_path, work_dir) # Utilisation d'OCRmyPDF output_pdf = os.path.join(work_dir, "output", "ocr.pdf") output_txt = os.path.join(work_dir, "output", "ocr.txt") try: # Commande OCRmyPDF cmd = [ "ocrmypdf", "--sidecar", output_txt, "--output-type", "pdf", "--language", "fra", "--deskew", "--clean", pdf_path, output_pdf ] result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) if result.returncode != 0: logger.warning(f"OCRmyPDF a échoué: {result.stderr}") return _process_pdf_with_tesseract(pdf_path, work_dir) # Lecture du texte extrait text = "" if os.path.exists(output_txt): with open(output_txt, 'r', encoding='utf-8') as f: text = f.read() return { "text": text, "confidence": 0.85, # Estimation "pages": [{"page": 1, "text": text}], "artifacts": { "ocr_pdf": output_pdf, "ocr_txt": output_txt } } except subprocess.TimeoutExpired: logger.error("Timeout lors de l'OCR avec OCRmyPDF") return _process_pdf_with_tesseract(pdf_path, work_dir) except Exception as e: logger.error(f"Erreur OCRmyPDF: {e}") return _process_pdf_with_tesseract(pdf_path, work_dir) def _process_pdf_with_tesseract(pdf_path: str, work_dir: str) -> Dict[str, Any]: """Traite un PDF avec Tesseract (fallback)""" logger.info("📄 Traitement PDF avec Tesseract") try: import pytesseract from pdf2image import convert_from_path # Conversion PDF en images images = convert_from_path(pdf_path, dpi=300) all_text = [] pages = [] for i, image in enumerate(images): # OCR sur chaque page page_text = pytesseract.image_to_string(image, lang='fra') all_text.append(page_text) pages.append({ "page": i + 1, "text": page_text }) # Sauvegarde des images pour debug for i, image in enumerate(images): image_path = os.path.join(work_dir, "temp", f"page_{i+1}.png") image.save(image_path) return { "text": "\n\n".join(all_text), "confidence": 0.75, # Estimation "pages": pages, "artifacts": { "images": [os.path.join(work_dir, "temp", f"page_{i+1}.png") for i in range(len(images))] } } except ImportError as e: logger.error(f"Bibliothèques manquantes: {e}") raise except Exception as e: logger.error(f"Erreur Tesseract: {e}") raise def _process_image(image_path: str, work_dir: str) -> Dict[str, Any]: """Traite une image avec Tesseract""" logger.info("🖼️ Traitement image avec Tesseract") try: import pytesseract from PIL import Image # Chargement de l'image image = Image.open(image_path) # OCR text = pytesseract.image_to_string(image, lang='fra') # Calcul de la confiance (nécessite pytesseract avec confidences) try: data = pytesseract.image_to_data(image, lang='fra', output_type=pytesseract.Output.DICT) confidences = [int(conf) for conf in data['conf'] if int(conf) > 0] avg_confidence = sum(confidences) / len(confidences) / 100.0 if confidences else 0.0 except: avg_confidence = 0.75 # Estimation return { "text": text, "confidence": avg_confidence, "pages": [{"page": 1, "text": text}], "artifacts": { "processed_image": image_path } } except ImportError as e: logger.error(f"Bibliothèques manquantes: {e}") raise except Exception as e: logger.error(f"Erreur traitement image: {e}") raise def _apply_notarial_corrections(text: str) -> str: """Applique les corrections lexicales spécifiques au notariat""" logger.info("🔧 Application des corrections lexicales notariales") # Dictionnaire de corrections notariales corrections = { # Corrections OCR communes "rn": "m", "cl": "d", "0": "o", "1": "l", "5": "s", "8": "B", # Termes notariaux spécifiques "acte de vente": "acte de vente", "acte de donation": "acte de donation", "acte de succession": "acte de succession", "notaire": "notaire", "étude notariale": "étude notariale", "clause": "clause", "disposition": "disposition", "héritier": "héritier", "légataire": "légataire", "donataire": "donataire", "donateur": "donateur", "vendeur": "vendeur", "acquéreur": "acquéreur", "acheteur": "acheteur", # Adresses et lieux "rue": "rue", "avenue": "avenue", "boulevard": "boulevard", "place": "place", "commune": "commune", "département": "département", "région": "région", # Montants et devises "euros": "euros", "€": "€", "francs": "francs", "FF": "FF" } corrected_text = text # Application des corrections for wrong, correct in corrections.items(): corrected_text = corrected_text.replace(wrong, correct) # Nettoyage des espaces multiples import re corrected_text = re.sub(r'\s+', ' ', corrected_text) return corrected_text.strip() def _save_ocr_results(work_dir: str, ocr_result: Dict[str, Any]) -> None: """Sauvegarde les résultats de l'OCR""" output_dir = os.path.join(work_dir, "output") os.makedirs(output_dir, exist_ok=True) # Sauvegarde du texte corrigé corrected_text_path = os.path.join(output_dir, "corrected_text.txt") with open(corrected_text_path, 'w', encoding='utf-8') as f: f.write(ocr_result["corrected_text"]) # Sauvegarde des métadonnées OCR metadata_path = os.path.join(output_dir, "ocr_metadata.json") metadata = { "confidence": ocr_result.get("confidence", 0.0), "pages_count": len(ocr_result.get("pages", [])), "text_length": len(ocr_result["corrected_text"]), "artifacts": ocr_result.get("artifacts", {}) } with open(metadata_path, 'w', encoding='utf-8') as f: json.dump(metadata, f, indent=2, ensure_ascii=False) logger.info(f"💾 Résultats OCR sauvegardés dans {output_dir}")