algo/applications/collatz/collatz_k_scripts/collatz_conjoncture_rationalize.py
ncantu ab56157c05 collatz: rationalize conjoncture document into annexes
**Motivations:**
- Make `conjoncture_collatz.md` indexable and avoid duplicated trunks.

**Root causes:**
- Multiple full copies of the formal trunk and a large non-formal imported block prevented unambiguous references.

**Correctifs:**
- Move duplicated trunks and the imported non-formal block to `conjoncture_collatz_annexes.md`.
- Make generic repeated headings unique via deterministic numbering (CSP/CE).

**Evolutions:**
- Add deterministic audit + rationalization scripts and versioned audit artefacts.

**Pages affectées:**
- applications/collatz/conjoncture_collatz.md
- applications/collatz/conjoncture_collatz_annexes.md
- applications/collatz/collatz_k_scripts/collatz_conjoncture_audit.py
- applications/collatz/collatz_k_scripts/collatz_conjoncture_rationalize.py
- docs/artefacts/collatz/conjoncture_rationalisation/*
- docs/features/collatz_conjoncture_rationalization_tooling.md
- docs/collatz_conjoncture_collatz_cartographie.md
2026-03-09 04:56:32 +01:00

180 lines
6.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
collatz_conjoncture_rationalize.py
Deterministic, idempotent rationalization for
`applications/collatz/conjoncture_collatz.md`.
Goals:
- keep one canonical trunk (first H1 block)
- move duplicated trunks (other H1 blocks) into an annex file
- move the imported long block starting at the line "La raison mathématique n'est pas connue."
  up to (but not including) the first "## Branche ..." heading into the annex file
- make generic repeated headings unique by numbering:
  - "## Conclusion de la section précédente" -> "... (CSP-XXX)"
  - "## Conclusion de l'étape" -> "... (CE-XXX)"
The transformation preserves all text by moving it to:
`applications/collatz/conjoncture_collatz_annexes.md`.
"""
from __future__ import annotations
import argparse
import re
from pathlib import Path
def _read_lines(path: Path) -> list[str]:
    """Load *path* as strictly-decoded UTF-8 text and return its lines.

    The returned strings carry no line terminators. Bytes that are not
    valid UTF-8 raise ``UnicodeDecodeError`` (decoding errors are strict).
    """
    content = path.read_text(encoding="utf-8", errors="strict")
    return content.splitlines()
def _write_lines(path: Path, lines: list[str]) -> None:
    """Write *lines* to *path* as UTF-8 text, creating parent directories.

    Lines are joined with ``"\\n"`` and one trailing newline is always
    appended, so an empty list produces a file holding a single newline.
    """
    directory = path.parent
    directory.mkdir(parents=True, exist_ok=True)
    payload = "\n".join(lines) + "\n"
    path.write_text(payload, encoding="utf-8")
def _find_line_eq(lines: list[str], needle: str) -> int | None:
    """Return the 1-based number of the first line equal to *needle*.

    Each line is compared after stripping surrounding whitespace; *needle*
    itself is used as given. Returns ``None`` when no line matches.
    """
    hits = (
        number
        for number, text in enumerate(lines, start=1)
        if text.strip() == needle
    )
    return next(hits, None)
def _find_lines_prefix(lines: list[str], prefix: str) -> list[int]:
    """Return the 1-based numbers of every line that starts with *prefix*.

    Lines are tested as-is (no stripping), and the result is in ascending
    order. An empty list means no line matched.
    """
    return [
        number
        for number, text in enumerate(lines, start=1)
        if text.startswith(prefix)
    ]
def _find_first_match(lines: list[str], pattern: str) -> int | None:
    """Return the 1-based number of the first line matching *pattern*.

    The regular expression is applied with ``match``, i.e. anchored at the
    start of each line. Returns ``None`` when no line matches.
    """
    matches_at_start = re.compile(pattern).match
    hits = (
        number
        for number, text in enumerate(lines, start=1)
        if matches_at_start(text)
    )
    return next(hits, None)
def _slice(lines: list[str], start_line: int, end_line: int) -> list[str]:
    """Return lines ``start_line..end_line`` using 1-based, inclusive bounds.

    Raises:
        ValueError: if ``start_line`` is below 1 or ``end_line`` is below
            ``start_line``. An ``end_line`` past the end of *lines* is not
            an error; the result simply stops at the last line.
    """
    bounds_ok = 1 <= start_line <= end_line
    if not bounds_ok:
        raise ValueError("Invalid slice bounds")
    first_index = start_line - 1
    return lines[first_index:end_line]
def _replace_csp_ce_headings(lines: list[str]) -> list[str]:
    """Give every generic "Conclusion" heading a unique numbered suffix.

    A line whose stripped text is exactly
    ``## Conclusion de la section précédente`` becomes
    ``## Conclusion de la section précédente (CSP-NNN)``; a line whose
    stripped text is ``## Conclusion de l'étape`` (straight apostrophe,
    typographic apostrophe U+2019, or no apostrophe) becomes
    ``## Conclusion de l'étape (CE-NNN)``. All other lines are returned
    unchanged, including headings that already carry a suffix.

    Numbering continues after the highest ``CSP-``/``CE-`` number already
    present in *lines*, so re-running on a document that gained new generic
    headings never hands out an identifier that is already in use. On a
    document without numbered headings the counters start at 001.

    Args:
        lines: Document lines without line terminators.

    Returns:
        A new list of the same length as *lines*.
    """
    csp_title = "## Conclusion de la section précédente"
    ce_title = "## Conclusion de l'étape"
    # Accepted spellings of the generic CE heading; the output always uses
    # the straight-apostrophe form.
    ce_generic = (ce_title, "## Conclusion de l\u2019étape", "## Conclusion de létape")

    csp_numbered = re.compile(re.escape(csp_title) + r" \(CSP-([0-9]+)\)")
    ce_numbered = re.compile(re.escape(ce_title) + r" \(CE-([0-9]+)\)")

    stripped = [line.strip() for line in lines]

    def _highest(numbered: re.Pattern[str]) -> int:
        # Largest suffix already used in the document, 0 when there is none.
        found = (numbered.fullmatch(text) for text in stripped)
        return max((int(m.group(1)) for m in found if m), default=0)

    csp_idx = _highest(csp_numbered)
    ce_idx = _highest(ce_numbered)

    out: list[str] = []
    for line, text in zip(lines, stripped):
        if text == csp_title:
            csp_idx += 1
            out.append(f"{csp_title} (CSP-{csp_idx:03d})")
        elif text in ce_generic:
            ce_idx += 1
            out.append(f"{ce_title} (CE-{ce_idx:03d})")
        else:
            out.append(line)
    return out
def _plan_pre_branch_blocks(
    h1_lines: list[int],
    imported_start: int | None,
    branche_first: int,
) -> tuple[list[tuple[int, int]], list[tuple[str, int, int]]]:
    """Partition lines ``1..branche_first-1`` into kept and moved ranges.

    The zone is cut at every H1 line and at the imported-block marker that
    lie before ``branche_first``. Each segment runs up to the line before
    the next cut (or before ``branche_first`` for the last one), so every
    line of the zone belongs to exactly one range.

    Args:
        h1_lines: 1-based line numbers of the H1 headings, ascending; the
            first one is the canonical trunk.
        imported_start: 1-based line number of the imported-block marker,
            or ``None`` when there is none.
        branche_first: 1-based line number of the first "## Branche" heading.

    Returns:
        ``(kept, moved)`` where ``kept`` is a list of ``(start, end)``
        ranges that stay in the main file (text before the first cut and
        the canonical trunk) and ``moved`` is a list of
        ``(label, start, end)`` ranges destined for the annex, both in
        document order with 1-based inclusive bounds.
    """
    zone_end = branche_first - 1
    # (start line, annex label); a None label marks the canonical trunk.
    cuts: list[tuple[int, str | None]] = [
        (start, None if j == 0 else f"Duplicated trunk #{j}")
        for j, start in enumerate(h1_lines)
        if start <= zone_end
    ]
    if imported_start is not None and imported_start <= zone_end:
        cuts.append((imported_start, "Imported block (Futurs Accessibles / non-formal)"))
    cuts.sort(key=lambda cut: cut[0])

    kept: list[tuple[int, int]] = []
    moved: list[tuple[str, int, int]] = []
    first_cut = cuts[0][0] if cuts else branche_first
    if first_cut > 1:
        # Text preceding the first marker stays where it is.
        kept.append((1, first_cut - 1))
    next_starts = [start for start, _ in cuts[1:]] + [branche_first]
    for (start, label), next_start in zip(cuts, next_starts):
        end = next_start - 1
        if label is None:
            kept.append((start, end))
        else:
            moved.append((label, start, end))
    return kept, moved


def run(*, input_path: Path, annex_path: Path) -> None:
    """Rationalize the document at *input_path*, moving blocks to *annex_path*.

    The document is split at the first "## Branche" heading. In the part
    before it, the first "# Conjecture de Collatz:" block is kept, while
    every further such H1 block and the block starting at the line
    "La raison mathématique n'est pas connue." are written to the annex
    file. The main file is rewritten as: kept part, a short "Annexes"
    section listing what was moved, then everything from the first
    "## Branche" heading to the end. Generic "Conclusion" headings are
    numbered in the result.

    When the document has at most one H1 and no imported-block marker
    before the branches, or when nothing before the branches needs moving,
    only the heading numbering is applied and the annex file is left
    untouched.

    Args:
        input_path: Markdown file to rationalize; rewritten in place.
        annex_path: Annex file to write; overwritten when blocks are moved.

    Raises:
        ValueError: if blocks must be moved but the canonical H1 or the
            first "## Branche" heading cannot be found.
    """
    lines = _read_lines(input_path)
    h1_conj = _find_lines_prefix(lines, "# Conjecture de Collatz:")
    vulg_start = _find_line_eq(lines, "La raison mathématique n'est pas connue.")
    branche_first = _find_first_match(lines, r"^##\s+Branche\b")

    # Already rationalized if: single H1 and no vulg marker before branches.
    if len(h1_conj) <= 1 and (
        vulg_start is None or (branche_first is not None and vulg_start > branche_first)
    ):
        _write_lines(input_path, _replace_csp_ce_headings(lines))
        return

    if not h1_conj:
        raise ValueError("Cannot find canonical H1 '# Conjecture de Collatz:'")
    if branche_first is None:
        raise ValueError("Cannot find first '## Branche ...' heading (needed to bound imported block)")

    kept_ranges, moved = _plan_pre_branch_blocks(h1_conj, vulg_start, branche_first)
    if not moved:
        # Nothing before the branches needs relocating (e.g. extra H1 blocks
        # only appear after the first branch). Leave the layout and any
        # existing annex alone rather than overwrite it with an empty one.
        _write_lines(input_path, _replace_csp_ce_headings(lines))
        return

    # Build annex file (overwrite deterministically)
    annex: list[str] = [
        "**Auteur** : Équipe 4NK",
        "",
        "# Annexes — `conjoncture_collatz.md` (contenu déplacé)",
        "",
        "Ce fichier contient des blocs déplacés de `applications/collatz/conjoncture_collatz.md` lors de la rationalisation.",
        "",
    ]
    for idx, (label, start, end) in enumerate(moved, start=1):
        annex.append(f"## Annexe {idx} — {label} (lignes {start}-{end})")
        annex.append("")
        annex.extend(_slice(lines, start, end))
        annex.extend(["", "---", ""])
    _write_lines(annex_path, annex)

    # Build new main file: kept ranges, the annex index, then the branches.
    kept: list[str] = []
    for start, end in kept_ranges:
        kept.extend(_slice(lines, start, end))
    kept.extend(
        [
            "",
            "## Annexes (contenu déplacé)",
            "",
            "Des blocs ont été déplacés dans un fichier annexe afin de rendre le document indexable et d'éliminer les duplications de tronc.",
            "",
            f"- annexes : `{annex_path.name}`",
        ]
    )
    for idx, (label, start, end) in enumerate(moved, start=1):
        kept.append(f" - Annexe {idx} : {label} (lignes {start}-{end})")
    kept.extend(["", "---", ""])
    kept.extend(_slice(lines, branche_first, len(lines)))
    _write_lines(input_path, _replace_csp_ce_headings(kept))
def main() -> None:
    """Command-line entry point.

    Parses ``--input`` and ``--annex`` (both default to paths under
    ``applications/collatz/``), resolves them to absolute paths and hands
    them to ``run``.
    """
    parser = argparse.ArgumentParser(
        description="Rationalize conjoncture_collatz.md into a canonical file + annexes"
    )
    options = (
        (
            "--input",
            "applications/collatz/conjoncture_collatz.md",
            "Path to conjoncture_collatz.md",
        ),
        (
            "--annex",
            "applications/collatz/conjoncture_collatz_annexes.md",
            "Path to annex file to write (overwritten deterministically)",
        ),
    )
    for flag, default_value, help_text in options:
        parser.add_argument(flag, default=default_value, help=help_text)

    namespace = parser.parse_args()
    source_file = Path(namespace.input).resolve()
    annex_file = Path(namespace.annex).resolve()
    run(input_path=source_file, annex_path=annex_file)


if __name__ == "__main__":
    main()