From a1ac5ae66be308f6b3c50eeab3e8f14654b2ca89 Mon Sep 17 00:00:00 2001
From: 4NK <nicolas.cantu@pm.me>
Date: Fri, 3 Apr 2026 22:27:18 +0200
Subject: [PATCH] Add Chandra OCR service (datalab-to upstream submodule, CLI
 wrapper)

- Submodule services/chandra/upstream (shallow), run-chandra.sh, .env.example
- Docs: service-chandra, chandra-ocr-documents; link from PageIndex feature
- Index updates in docs/README, repo/README, services, system-architecture
---
 .gitmodules                                   |  4 ++
 docs/README.md                                |  2 +
 docs/features/chandra-ocr-documents.md        | 18 ++++++++
 docs/features/pageindex-semantic-documents.md |  2 +
 docs/repo/README.md                           |  1 +
 docs/repo/service-chandra.md                  | 24 ++++++++++
 docs/services.md                              |  2 +
 docs/system-architecture.md                   |  1 +
 services/chandra/.env.example                 | 10 ++++
 services/chandra/README.md                    | 46 +++++++++++++++++++
 services/chandra/run-chandra.sh               | 18 ++++++++
 services/chandra/upstream                     |  1 +
 12 files changed, 129 insertions(+)
 create mode 100644 docs/features/chandra-ocr-documents.md
 create mode 100644 docs/repo/service-chandra.md
 create mode 100644 services/chandra/.env.example
 create mode 100644 services/chandra/README.md
 create mode 100755 services/chandra/run-chandra.sh
 create mode 160000 services/chandra/upstream

diff --git a/.gitmodules b/.gitmodules
index ed8319c..35067ca 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,3 +6,7 @@
 	path = services/pageindex/upstream
 	url = https://github.com/VectifyAI/PageIndex.git
 	shallow = true
+[submodule "services/chandra/upstream"]
+	path = services/chandra/upstream
+	url = https://github.com/datalab-to/chandra.git
+	shallow = true
diff --git a/docs/README.md b/docs/README.md
index f3a9a19..fbb1973 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -25,6 +25,7 @@ Vue d’ensemble et index complet : **[repo/README.md](./repo/README.md)**. Règ
 | [repo/service-anythingllm-devtools.md](./repo/service-anythingllm-devtools.md) | Service HTTP AnythingLLM + devtools |
 | [repo/service-carbonyl.md](./repo/service-carbonyl.md) | Carbonyl (navigateur terminal), prévisualisation test |
 | [repo/service-pageindex.md](./repo/service-pageindex.md) | PageIndex (index vectorless, définition sémantique documents) |
+| [repo/service-chandra.md](./repo/service-chandra.md) | Chandra OCR (PDF / images structurés) |
 | [repo/extension-anythingllm-workspaces.md](./repo/extension-anythingllm-workspaces.md) | Extension AnythingLLM IDE (supprimée ; voir anythingllm-devtools) |
 
 Les fichiers **`README.md`** sous `services/*/`, `cron/`, `projects/`, etc. ne font que **renvoyer** vers ces pages.
@@ -62,6 +63,7 @@ Les fichiers **`README.md`** sous `services/*/`, `cron/`, `projects/`, etc. ne f
 | [anythingllm-workspaces.md](./anythingllm-workspaces.md) | Un workspace AnythingLLM par projet, synchronisation |
 | [features/carbonyl-terminal-browser.md](./features/carbonyl-terminal-browser.md) | Carbonyl, URL test dans `conf.json` |
 | [features/pageindex-semantic-documents.md](./features/pageindex-semantic-documents.md) | PageIndex, arbre sémantique PDF / Markdown |
+| [features/chandra-ocr-documents.md](./features/chandra-ocr-documents.md) | Chandra OCR, mise en page |
 | [ux-navigation-model.md](./ux-navigation-model.md) | Intentions, recherche, mode expert |
 
 ## Intégration dépôts
diff --git a/docs/features/chandra-ocr-documents.md b/docs/features/chandra-ocr-documents.md
new file mode 100644
index 0000000..3009ebd
--- /dev/null
+++ b/docs/features/chandra-ocr-documents.md
@@ -0,0 +1,18 @@
+# Chandra OCR — documents structurés
+
+## Contexte
+
+[Chandra OCR 2](https://github.com/datalab-to/chandra) (Datalab) produit des sorties **Markdown**, **HTML** et **JSON** à partir de PDF et d’images, en préservant la structure (tableaux, zones, légendes). Deux modes d’inférence sont documentés en amont : **vLLM** (serveur) et **Hugging Face** (local, dépendances plus lourdes).
+
+## Intégration smart_ide
+
+- Répertoire : **`services/chandra/`** avec sous-module **`upstream/`**.
+- Commande : **`./run-chandra.sh`** (délègue à **`chandra`** dans le venv **`upstream/.venv`** ou à **`uv run chandra`**).
+
+## Chaînage possible
+
+Sortie Markdown exploitable par **PageIndex** (`--md_path`) ou par des pipelines d’ingestion **AnythingLLM** / **docv**, selon les politiques de données du projet.
+
+## Documentation liée
+
+- [repo/service-chandra.md](../repo/service-chandra.md)
diff --git a/docs/features/pageindex-semantic-documents.md b/docs/features/pageindex-semantic-documents.md
index f171d0d..1b7917a 100644
--- a/docs/features/pageindex-semantic-documents.md
+++ b/docs/features/pageindex-semantic-documents.md
@@ -10,6 +10,8 @@
 - Lancement : **`./run-pageindex.sh`** depuis **`services/pageindex/`** (délègue à **`upstream/run_pageindex.py`**).
 - Dépendances Python : installer dans un **venv** sous **`upstream/`** (voir README du service).
 
+Pour des PDF scannés ou des mises en page complexes, une étape **OCR structuré** amont ([Chandra](https://github.com/datalab-to/chandra), service **`services/chandra/`**) peut produire du Markdown exploitable avant PageIndex.
+
 ## Complémentarité avec AnythingLLM
 
 **AnythingLLM** ([anythingllm-workspaces.md](../anythingllm-workspaces.md)) couvre la mémoire documentaire et le RAG par **ingestion / embeddings** dans des workspaces. **PageIndex** adresse une autre stratégie : **structure hiérarchique explicite** et parcours type « table des matières intelligente » pour des flux où la traçabilité des sections prime.
diff --git a/docs/repo/README.md b/docs/repo/README.md
index d0eca7c..f246dc3 100644
--- a/docs/repo/README.md
+++ b/docs/repo/README.md
@@ -39,6 +39,7 @@ Toute la documentation **opérationnelle** qui vivait auparavant sous des `READM
 | [script-anythingllm-pull-sync.md](./script-anythingllm-pull-sync.md) | Hook post-merge → upload AnythingLLM |
 | [service-carbonyl.md](./service-carbonyl.md) | Carbonyl (navigateur terminal), sous-module amont |
 | [service-pageindex.md](./service-pageindex.md) | PageIndex (index sémantique vectorless), sous-module amont |
+| [service-chandra.md](./service-chandra.md) | Chandra OCR, sous-module amont |
 | [extension-anythingllm-workspaces.md](./extension-anythingllm-workspaces.md) | Extension AnythingLLM IDE (supprimée ; anythingllm-devtools) |
 
 Les **spécifications** détaillées (contrats HTTP, sécurité, orchestration) restent dans [../API/README.md](../API/README.md) et [../features/](../features/).
diff --git a/docs/repo/service-chandra.md b/docs/repo/service-chandra.md
new file mode 100644
index 0000000..e3b9e7e
--- /dev/null
+++ b/docs/repo/service-chandra.md
@@ -0,0 +1,24 @@
+# Service Chandra OCR (`services/chandra/`)
+
+OCR et extraction **structurée** (PDF / images → Markdown, HTML, JSON avec mise en page) via le projet amont **[datalab-to/chandra](https://github.com/datalab-to/chandra)**.
+
+## Rôle dans smart_ide
+
+- **Numérisation** de documents complexes (tableaux, formulaires, manuscrits, math).
+- **Pas de listener HTTP** dans ce dépôt : CLI **`chandra`**, lancée par **`services/chandra/run-chandra.sh`** après installation dans **`upstream/`** (`uv sync` ou équivalent).
+
+## Licences
+
+- **Code** : Apache-2.0 (fichier `LICENSE` dans `upstream/`).
+- **Poids du modèle** : voir **`MODEL_LICENSE`** dans le sous-module amont et les conditions d’usage commercial décrites dans le [README Chandra](https://github.com/datalab-to/chandra/blob/master/README.md).
+
+## Exploitation
+
+Voir **[`services/chandra/README.md`](../../services/chandra/README.md)** et **[features/chandra-ocr-documents.md](../features/chandra-ocr-documents.md)**.
+
+Configuration : variables d’environnement ou **`upstream/local.env`** — gabarit **`services/chandra/.env.example`**.
+
+## Voir aussi
+
+- [service-pageindex.md](./service-pageindex.md) — index sémantique sur Markdown / PDF
+- [anythingllm-workspaces.md](../anythingllm-workspaces.md) — RAG par workspace
diff --git a/docs/services.md b/docs/services.md
index e8ac6e8..985803d 100644
--- a/docs/services.md
+++ b/docs/services.md
@@ -38,6 +38,8 @@ Services d’appoint sur **`127.0.0.1`** (souvent auth **Bearer**) : Git devtool
 
 **PageIndex** (`services/pageindex/`) n’est pas un listener HTTP : outil Python (sous-module [VectifyAI/PageIndex](https://github.com/VectifyAI/PageIndex)) pour produire un **index arborescent** sémantique sur PDF / Markdown, en complément du RAG **AnythingLLM** — [repo/service-pageindex.md](./repo/service-pageindex.md).
 
+**Chandra OCR** (`services/chandra/`) n’est pas un listener HTTP : CLI (sous-module [datalab-to/chandra](https://github.com/datalab-to/chandra)) pour **OCR** PDF / images vers Markdown, HTML, JSON avec layout — [repo/service-chandra.md](./repo/service-chandra.md).
+
 ## Documentation liée
 
 - [platform-target.md](./platform-target.md)  
diff --git a/docs/system-architecture.md b/docs/system-architecture.md
index 0aa1296..8d552bc 100644
--- a/docs/system-architecture.md
+++ b/docs/system-architecture.md
@@ -35,6 +35,7 @@ Conséquences :
 | `services/anythingllm-devtools/` | HTTP : AnythingLLM + repos-devtools + RAG initial (`.4nkaiignore`) — [API/anythingllm-devtools-api.md](./API/anythingllm-devtools-api.md) |
 | `services/carbonyl/` | Navigateur terminal Chromium ([Carbonyl](https://github.com/fathyb/carbonyl)) ; sous-module **`upstream/`** ; prévisualisation test — [repo/service-carbonyl.md](./repo/service-carbonyl.md) |
 | `services/pageindex/` | Index sémantique arborescent PDF/MD ([PageIndex](https://github.com/VectifyAI/PageIndex)) ; sous-module **`upstream/`** ; CLI — [repo/service-pageindex.md](./repo/service-pageindex.md) |
+| `services/chandra/` | OCR PDF/images → MD/HTML/JSON ([Chandra](https://github.com/datalab-to/chandra)) ; sous-module **`upstream/`** ; CLI — [repo/service-chandra.md](./repo/service-chandra.md) |
 | `scripts/` , `setup/` , `systemd/` | Installation hôte, scripts d’exploitation, unités utilisateur pour services |
 | `cron/` | Pull **Git** planifié des clones décrits par `projects/<id>/conf.json` (`project_path`) — [repo/cron-git-pull.md](./repo/cron-git-pull.md) |
 | `services/local-office/` | **API REST** Office (upload, commandes docx, stockage SQLite + fichiers) ; complément programmatique à ONLYOFFICE |
diff --git a/services/chandra/.env.example b/services/chandra/.env.example
new file mode 100644
index 0000000..e7fa238
--- /dev/null
+++ b/services/chandra/.env.example
@@ -0,0 +1,10 @@
+# Optional: copy to services/chandra/upstream/local.env (see upstream README).
+# Or export before running ./run-chandra.sh
+
+# MODEL_CHECKPOINT=datalab-to/chandra-ocr-2
+# MAX_OUTPUT_TOKENS=12384
+
+# vLLM (default inference path for lightweight pip install)
+# VLLM_API_BASE=http://localhost:8000/v1
+# VLLM_MODEL_NAME=chandra
+# VLLM_GPUS=0
diff --git a/services/chandra/README.md b/services/chandra/README.md
new file mode 100644
index 0000000..2e0f7f8
--- /dev/null
+++ b/services/chandra/README.md
@@ -0,0 +1,46 @@
+# Chandra OCR (amont)
+
+[Chandra OCR 2](https://github.com/datalab-to/chandra) convertit images et PDF en **Markdown**, **HTML** ou **JSON** en conservant la mise en page (tableaux, formulaires, écriture manuscrite, math). Code sous **Apache-2.0** ; les **poids du modèle** suivent une licence dédiée (**MODEL_LICENSE** dans `upstream/`) — voir le [dépôt amont](https://github.com/datalab-to/chandra).
+
+Ce répertoire **`services/chandra/`** contient :
+
+- **`upstream/`** : sous-module Git vers **datalab-to/chandra**.
+- **`run-chandra.sh`** : lance la CLI **`chandra`** depuis l’environnement installé dans **`upstream/`** (`uv` ou `.venv`).
+- **`.env.example`** : variables usuelles (vLLM, modèle) ; l’amont charge aussi **`local.env`** dans **`upstream/`** (non versionné).
+
+## Installation (une fois par poste)
+
+Depuis les sources du sous-module (recommandé ici) :
+
+```bash
+cd services/chandra/upstream
+uv sync
+# optionnel : modèle local Hugging Face (lourd)
+# uv sync --extra hf
+```
+
+Sans **`uv`** : créer un venv, puis `pip install -e ".[hf]"` ou `pip install -e .` selon le mode d’inférence (voir [README amont](https://github.com/datalab-to/chandra/blob/master/README.md)).
+
+**Inférence vLLM** (léger côté client si le serveur tourne ailleurs) : démarrer le serveur comme documenté en amont (`chandra_vllm` après installation du paquet).
+
+## Usage
+
+```bash
+cd services/chandra
+./run-chandra.sh input.pdf ./output --method vllm
+# ou --method hf si dépendances HF installées
+```
+
+Options CLI (`--page-range`, `--max-workers`, etc.) : même interface que la commande **`chandra`** amont.
+
+## Rôle dans smart_ide
+
+- **OCR / numérisation structurée** pour pipelines documentaires, en amont de **PageIndex** ([PageIndex](../pageindex/README.md)) ou d’**AnythingLLM** / **docv**.
+- **Pas de service HTTP** dans ce dépôt : exécution **CLI** (comme **`services/pageindex/`**).
+
+Documentation : [docs/repo/service-chandra.md](../../docs/repo/service-chandra.md), [docs/features/chandra-ocr-documents.md](../../docs/features/chandra-ocr-documents.md).
+
+## Ressources amont
+
+- Dépôt : [datalab-to/chandra](https://github.com/datalab-to/chandra)
+- Paquet PyPI : `chandra-ocr` (alternative à l’installation depuis **`upstream/`**)
diff --git a/services/chandra/run-chandra.sh b/services/chandra/run-chandra.sh
new file mode 100755
index 0000000..970dde0
--- /dev/null
+++ b/services/chandra/run-chandra.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+# Run Chandra OCR CLI from vendored upstream (PDF / images → md/html/json).
+set -euo pipefail
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+UP="${ROOT}/upstream"
+if [[ ! -d "${UP}/chandra" ]]; then
+	echo "Missing ${UP}/chandra — run: git submodule update --init services/chandra/upstream" >&2
+	exit 1
+fi
+# No cd into upstream: relative input/output paths must resolve from the caller's cwd.
+if [[ -x "${UP}/.venv/bin/chandra" ]]; then
+	exec "${UP}/.venv/bin/chandra" "$@"
+fi
+if command -v uv >/dev/null 2>&1; then
+	exec uv run --project "${UP}" chandra "$@"
+fi
+echo "Install first: cd ${UP} && uv sync   (or python -m venv .venv && .venv/bin/pip install -e .)" >&2
+exit 1
diff --git a/services/chandra/upstream b/services/chandra/upstream
new file mode 160000
index 0000000..01f86eb
--- /dev/null
+++ b/services/chandra/upstream
@@ -0,0 +1 @@
+Subproject commit 01f86ebd3a9eef07aa69734008a86541f206456e