# algo/applications/collatz/collatz_k_scripts/collatz_conjoncture_rationalize.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
collatz_conjoncture_rationalize.py

Deterministic, idempotent rationalization for
`applications/collatz/conjoncture_collatz.md`.

Goals:
- keep one canonical trunk (first H1 block)
- move duplicated trunks (other H1 blocks) into an annex file
- move the imported long block starting at the line "La raison mathématique n'est pas connue."
  up to the first "## Branche ..." heading into the annex file
- make generic repeated headings unique by numbering:
  - "## Conclusion de la section précédente" -> "... (CSP-XXX)"
  - "## Conclusion de l'étape" -> "... (CE-XXX)"

The transformation preserves all text by moving it to:
`applications/collatz/conjoncture_collatz_annexes.md`.
"""

from __future__ import annotations

import argparse
import re
from pathlib import Path


def _read_lines(path: Path) -> list[str]:
    """Return the content of *path* as a list of lines without terminators.

    The file is decoded as UTF-8 in strict mode, so invalid bytes raise
    ``UnicodeDecodeError`` instead of being replaced.
    """
    with path.open("r", encoding="utf-8", errors="strict") as handle:
        content = handle.read()
    return content.splitlines()


def _write_lines(path: Path, lines: list[str]) -> None:
    """Write *lines* to *path* as UTF-8, joined by newlines, with a final newline.

    Missing parent directories are created first. An existing file is
    overwritten.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    payload = "\n".join(lines)
    path.write_text(f"{payload}\n", encoding="utf-8")


def _find_line_eq(lines: list[str], needle: str) -> int | None:
    """Return the 1-based number of the first line equal to *needle*.

    Each line is compared after stripping surrounding whitespace; *needle*
    itself is used as given. Returns ``None`` when no line matches.
    """
    hits = (
        number
        for number, text in enumerate(lines, start=1)
        if text.strip() == needle
    )
    return next(hits, None)


def _find_lines_prefix(lines: list[str], prefix: str) -> list[int]:
    """Return the 1-based numbers of all lines that start with *prefix*.

    Lines are tested as-is (no stripping), so an indented occurrence of
    *prefix* does not count. The result is in increasing line order.
    """
    return [
        number
        for number, text in enumerate(lines, start=1)
        if text.startswith(prefix)
    ]


def _find_first_match(lines: list[str], pattern: str) -> int | None:
    """Return the 1-based number of the first line matching regex *pattern*.

    The pattern is applied with ``re.match`` semantics, i.e. anchored at the
    start of each line. Returns ``None`` when no line matches.
    """
    matches_at_start = re.compile(pattern).match
    for number, text in enumerate(lines, start=1):
        if matches_at_start(text) is not None:
            return number
    return None


def _slice(lines: list[str], start_line: int, end_line: int) -> list[str]:
    """Return lines *start_line* to *end_line*, both 1-based and inclusive.

    An *end_line* past the end of *lines* is silently truncated by the
    underlying list slice.

    Raises:
        ValueError: if *start_line* is below 1 or *end_line* is below
            *start_line*.
    """
    bounds_ok = start_line >= 1 and end_line >= start_line
    if not bounds_ok:
        raise ValueError("Invalid slice bounds")
    first_index = start_line - 1
    return lines[first_index:end_line]


def _replace_csp_ce_headings(lines: list[str]) -> list[str]:
    """Give every generic "Conclusion" heading a unique numbered suffix.

    Two kinds of bare headings are rewritten (compared after stripping
    surrounding whitespace):

    - ``## Conclusion de la section précédente`` gets `` (CSP-NNN)``;
    - ``## Conclusion de l'étape`` (ASCII or typographic apostrophe) gets
      `` (CE-NNN)`` and is normalized to the ASCII apostrophe.

    Headings that already carry a suffix are left untouched, which keeps the
    function idempotent. Numbering of new headings continues after the highest
    suffix already present in *lines*, so a bare heading added to a previously
    numbered document never receives an identifier that is already in use.

    Args:
        lines: Document lines without line terminators.

    Returns:
        A new list of lines; *lines* itself is not modified.
    """
    csp_title = "## Conclusion de la section précédente"
    ce_titles = ("## Conclusion de l'étape", "## Conclusion de l’étape")
    csp_numbered = re.compile(r"## Conclusion de la section précédente \(CSP-([0-9]+)\)")
    ce_numbered = re.compile(r"## Conclusion de l['’]étape \(CE-([0-9]+)\)")

    stripped = [line.strip() for line in lines]

    # Seed each counter with the highest number already used, so that freshly
    # numbered headings cannot collide with headings numbered by an earlier run.
    csp_idx = 0
    ce_idx = 0
    for text in stripped:
        csp_found = csp_numbered.fullmatch(text)
        if csp_found is not None:
            csp_idx = max(csp_idx, int(csp_found.group(1)))
            continue
        ce_found = ce_numbered.fullmatch(text)
        if ce_found is not None:
            ce_idx = max(ce_idx, int(ce_found.group(1)))

    out: list[str] = []
    for line, text in zip(lines, stripped):
        if text == csp_title:
            csp_idx += 1
            out.append(f"## Conclusion de la section précédente (CSP-{csp_idx:03d})")
        elif text in ce_titles:
            ce_idx += 1
            out.append(f"## Conclusion de l'étape (CE-{ce_idx:03d})")
        else:
            out.append(line)
    return out


def run(*, input_path: Path, annex_path: Path) -> None:
    """Rationalize *input_path* in place and write moved blocks to *annex_path*.

    The document is split at three kinds of landmarks:

    - lines starting with ``# Conjecture de Collatz:`` (H1 trunks; the first
      one is the canonical trunk);
    - the first line equal to "La raison mathématique n'est pas connue."
      (start of the imported block);
    - the first ``## Branche ...`` heading (everything from there to the end
      of the file is always kept in the main file).

    Every duplicated trunk and the imported block located between the
    canonical H1 and the first branch heading are moved to the annex file as
    consecutive, non-overlapping segments: each segment runs up to the line
    before the next segment, the last one up to the line before the first
    branch heading. The main file keeps the lines before the first segment,
    a short "Annexes" section listing what was moved, and the branch part.
    Generic "Conclusion" headings of the main file are then numbered.

    When there is nothing to move (already rationalized document, or no
    landmark between the canonical H1 and the first branch heading), only the
    heading numbering is applied and the annex file is left untouched, so
    re-running the script does not overwrite a previously written annex.

    Args:
        input_path: Markdown file to rationalize; overwritten in place.
        annex_path: Annex file; overwritten only when blocks are moved.

    Raises:
        ValueError: if the document is not already rationalized and either
            the canonical H1 or the first ``## Branche`` heading is missing.
    """
    lines = _read_lines(input_path)

    h1_conj = _find_lines_prefix(lines, "# Conjecture de Collatz:")
    vulg_start = _find_line_eq(lines, "La raison mathématique n'est pas connue.")
    branche_first = _find_first_match(lines, r"^##\s+Branche\b")

    # Already rationalized if: single H1 and no vulg marker before branches.
    if len(h1_conj) <= 1 and (vulg_start is None or (branche_first is not None and vulg_start > branche_first)):
        _write_lines(input_path, _replace_csp_ce_headings(lines))
        return

    if len(h1_conj) < 1:
        raise ValueError("Cannot find canonical H1 '# Conjecture de Collatz:'")
    if branche_first is None:
        raise ValueError("Cannot find first '## Branche ...' heading (needed to bound imported block)")

    # Start lines of the segments to move. Only landmarks strictly between the
    # canonical H1 and the first branch heading qualify: the main file keeps
    # everything from `branche_first` on, so moving later content would
    # duplicate it, and cutting before the canonical H1 would remove the trunk.
    canonical_h1 = h1_conj[0]
    segment_starts: list[tuple[int, str]] = [
        (h1_line, f"Duplicated trunk #{j}")
        for j, h1_line in enumerate(h1_conj[1:], start=1)
        if h1_line < branche_first
    ]
    if vulg_start is not None and canonical_h1 < vulg_start < branche_first:
        segment_starts.append((vulg_start, "Imported block (Futurs Accessibles / non-formal)"))
    segment_starts.sort()

    # Nothing to move: behave like the "already rationalized" path so that the
    # main file is not duplicated and an existing annex is not wiped.
    if not segment_starts:
        _write_lines(input_path, _replace_csp_ce_headings(lines))
        return

    # The canonical trunk stops right before the first moved segment.
    canonical_end = segment_starts[0][0] - 1

    # Collect moved blocks (by line ranges). Segments are contiguous, so every
    # line between the canonical trunk and the first branch heading lands in
    # exactly one annex entry and no text is lost.
    moved: list[tuple[str, int, int]] = []
    for k, (start, label) in enumerate(segment_starts):
        if k + 1 < len(segment_starts):
            end = segment_starts[k + 1][0] - 1
        else:
            end = branche_first - 1
        moved.append((label, start, end))

    # Build annex file (overwrite deterministically)
    annex: list[str] = []
    annex.append("**Auteur** : Équipe 4NK")
    annex.append("")
    annex.append("# Annexes — `conjoncture_collatz.md` (contenu déplacé)")
    annex.append("")
    annex.append("Ce fichier contient des blocs déplacés de `applications/collatz/conjoncture_collatz.md` lors de la rationalisation.")
    annex.append("")
    for idx, (label, start, end) in enumerate(moved, start=1):
        annex.append(f"## Annexe {idx} — {label} (lignes {start}–{end})")
        annex.append("")
        annex.extend(_slice(lines, start, end))
        annex.append("")
        annex.append("---")
        annex.append("")
    _write_lines(annex_path, annex)

    # Build new main file:
    # - keep the canonical block (1..canonical_end)
    # - insert a short section pointing to the annex file
    # - append from branche_first to EOF
    kept: list[str] = []
    kept.extend(_slice(lines, 1, canonical_end))
    kept.append("")
    kept.append("## Annexes (contenu déplacé)")
    kept.append("")
    kept.append("Des blocs ont été déplacés dans un fichier annexe afin de rendre le document indexable et d’éliminer les duplications de tronc.")
    kept.append("")
    kept.append(f"- annexes : `{annex_path.name}`")
    for idx, (label, start, end) in enumerate(moved, start=1):
        kept.append(f"  - Annexe {idx} : {label} (lignes {start}–{end})")
    kept.append("")
    kept.append("---")
    kept.append("")
    kept.extend(_slice(lines, branche_first, len(lines)))

    kept = _replace_csp_ce_headings(kept)
    _write_lines(input_path, kept)


def main() -> None:
    """Command-line entry point.

    Parses ``--input`` and ``--annex`` (both optional, with repository-relative
    defaults), resolves them to absolute paths and calls ``run``.
    """
    parser = argparse.ArgumentParser(
        description="Rationalize conjoncture_collatz.md into a canonical file + annexes"
    )
    # Flag -> (default value, help text); registered in this order.
    options = {
        "--input": (
            "applications/collatz/conjoncture_collatz.md",
            "Path to conjoncture_collatz.md",
        ),
        "--annex": (
            "applications/collatz/conjoncture_collatz_annexes.md",
            "Path to annex file to write (overwritten deterministically)",
        ),
    }
    for flag, (default_value, help_text) in options.items():
        parser.add_argument(flag, default=default_value, help=help_text)

    namespace = parser.parse_args()
    source_file = Path(namespace.input).resolve()
    annex_file = Path(namespace.annex).resolve()
    run(input_path=source_file, annex_path=annex_file)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()