algo/applications/collatz/collatz_k_scripts/collatz_conjoncture_audit.py
ncantu ab56157c05 collatz: rationalize conjoncture document into annexes
**Motivations:**
- Make `conjoncture_collatz.md` indexable and avoid duplicated trunks.

**Root causes:**
- Multiple full copies of the formal trunk and a large non-formal imported block prevented unambiguous references.

**Correctifs:**
- Move duplicated trunks and the imported non-formal block to `conjoncture_collatz_annexes.md`.
- Make generic repeated headings unique via deterministic numbering (CSP/CE).

**Evolutions:**
- Add deterministic audit + rationalization scripts and versioned audit artefacts.

**Pages affectées:**
- applications/collatz/conjoncture_collatz.md
- applications/collatz/conjoncture_collatz_annexes.md
- applications/collatz/collatz_k_scripts/collatz_conjoncture_audit.py
- applications/collatz/collatz_k_scripts/collatz_conjoncture_rationalize.py
- docs/artefacts/collatz/conjoncture_rationalisation/*
- docs/features/collatz_conjoncture_rationalization_tooling.md
- docs/collatz_conjoncture_collatz_cartographie.md
2026-03-09 04:56:32 +01:00

270 lines
9.0 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
collatz_conjoncture_audit.py
Deterministic audit tool for `applications/collatz/conjoncture_collatz.md`:
- detect duplicated heading titles (normalized)
- build a canonical TOC (by default: first H1 block)
- build a reference table mapping duplicate headings to a canonical occurrence
- detect key block boundaries used for rationalization (duplicate trunks and imported block)
Outputs (JSON + MD) are intended to be versioned under `docs/artefacts/`.
"""
from __future__ import annotations
import argparse
import json
import re
import unicodedata
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class Heading:
    """One markdown ATX heading occurrence in the audited document."""

    # line: 1-based line number where the heading appears in the file
    # level: number of leading '#' characters (1-6, enforced by the parsing regex)
    # title: heading text after the '#' marker, surrounding whitespace stripped
    line: int
    level: int
    title: str
def _read_lines(path: Path) -> list[str]:
return path.read_text(encoding="utf-8", errors="strict").splitlines()
def _strip_accents(s: str) -> str:
return "".join(ch for ch in unicodedata.normalize("NFKD", s) if not unicodedata.combining(ch))
def _norm_title(title: str) -> str:
    """Normalize a heading title for duplicate detection.

    Accent-free, lower-cased, with runs of whitespace collapsed to one space
    and leading/trailing whitespace removed.
    """
    lowered = _strip_accents(title).lower()
    collapsed = re.sub(r"\s+", " ", lowered)
    return collapsed.strip()
def _parse_headings(lines: list[str]) -> list[Heading]:
    """Extract markdown ATX headings from *lines* as `Heading` records.

    A heading is a line starting with 1-6 '#' characters followed by
    whitespace; line numbers are 1-based to match editor display.
    """
    # Hoisted out of the loop: one explicit compilation instead of a
    # per-iteration re-cache lookup inside re.match.
    heading_rx = re.compile(r"^(#{1,6})\s+(.*)$")
    out: list[Heading] = []
    for lineno, line in enumerate(lines, start=1):
        m = heading_rx.match(line)
        if m is None:
            continue
        out.append(Heading(line=lineno, level=len(m.group(1)), title=m.group(2).strip()))
    return out
def _find_line(lines: list[str], needle: str) -> int | None:
for i, line in enumerate(lines, start=1):
if line.strip() == needle:
return i
return None
def _find_heading_lines(lines: list[str], prefix: str) -> list[int]:
out: list[int] = []
for i, line in enumerate(lines, start=1):
if line.startswith(prefix):
out.append(i)
return out
def _first_heading_line_matching(headings: list[Heading], pattern: str) -> int | None:
rx = re.compile(pattern)
for h in headings:
if rx.match(f"{'#' * h.level} {h.title}"):
return h.line
return None
def _compute_duplicates(headings: list[Heading]) -> dict[str, object]:
    """Group headings by (level, normalized title) and summarize the duplicates.

    Returns a dict with:
    - groups_count: number of (level, title) groups occurring at least twice
    - top_groups: up to 30 groups, most frequent first (ties by level, title),
      each listing at most 10 occurrences
    - all_groups: every duplicated group sorted by (level, title), split into
      the first (canonical) occurrence and the remaining ones
    """
    by_key: dict[tuple[int, str], list[Heading]] = {}
    for heading in headings:
        key = (heading.level, _norm_title(heading.title))
        by_key.setdefault(key, []).append(heading)
    duplicated = {key: occs for key, occs in by_key.items() if len(occs) > 1}
    ranked = sorted(duplicated.items(), key=lambda kv: (-len(kv[1]), kv[0][0], kv[0][1]))
    top_view: list[dict[str, object]] = [
        {
            "level": level,
            "title_norm": norm,
            "count": len(occs),
            "occurrences": [{"line": o.line, "title": o.title} for o in occs[:10]],
        }
        for (level, norm), occs in ranked[:30]
    ]
    all_view = [
        {
            "level": level,
            "title_norm": norm,
            "count": len(occs),
            "first": {"line": occs[0].line, "title": occs[0].title},
            "others": [{"line": o.line, "title": o.title} for o in occs[1:]],
        }
        for (level, norm), occs in sorted(duplicated.items(), key=lambda kv: (kv[0][0], kv[0][1]))
    ]
    return {
        "groups_count": len(duplicated),
        "top_groups": top_view,
        "all_groups": all_view,
    }
def _canonical_toc(headings: list[Heading], canonical_end_line: int) -> list[dict[str, object]]:
toc: list[dict[str, object]] = []
for h in headings:
if h.line > canonical_end_line:
break
if h.level < 2:
continue
toc.append({"line": h.line, "level": h.level, "title": h.title})
return toc
def _write_json(path: Path, obj: object) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(obj, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
def _write_md(path: Path, lines: list[str]) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text("\n".join(lines) + "\n", encoding="utf-8")
def _build_renvois(dup: dict[str, object]) -> list[dict[str, object]]:
    """Build the cross-reference table: one entry per duplicated level-2 heading group."""
    renvois: list[dict[str, object]] = []
    for g in dup["all_groups"]:
        # Defensive filtering: only well-formed level-2 group dicts are kept.
        if not isinstance(g, dict):
            continue
        if g.get("level") != 2:
            continue
        first = g.get("first")
        others = g.get("others")
        if not isinstance(first, dict) or not isinstance(others, list):
            continue
        renvois.append(
            {
                "title_norm": g.get("title_norm"),
                "canonical": first,
                "duplicates": others,
                "count": g.get("count"),
            }
        )
    return renvois


def _render_audit_md(
    *,
    input_path: Path,
    out: dict[str, object],
    toc: list[dict[str, object]],
    renvois: list[dict[str, object]],
) -> list[str]:
    """Render the human-readable Markdown audit report as a list of lines."""
    b = out["boundaries"]
    md: list[str] = []
    md.append("**Auteur** : Équipe 4NK")
    md.append("")
    md.append("# Audit déterministe — `conjoncture_collatz.md`")
    md.append("")
    md.append("## Entrée")
    md.append("")
    md.append(f"- fichier : `{input_path}`")
    md.append("")
    md.append("## Statistiques")
    md.append("")
    md.append(f"- lignes : {out['stats']['lines']}")
    md.append(f"- headings total : {out['stats']['headings_total']}")
    md.append(f"- groupes de headings dupliqués (tous niveaux) : {out['duplicates']['groups_count']}")
    md.append("")
    md.append("## Bornes détectées (rationalisation)")
    md.append("")
    md.append(f"- occurrences `# Conjecture de Collatz:` : {b['h1_conjecture_lines']}")
    md.append(f"- canonical_end_line (premier bloc H1) : {b['canonical_end_line']}")
    md.append(f"- début bloc importé : {b['vulgarisation_start_line']}")
    md.append(f"- fin bloc importé : {b['vulgarisation_end_line']}")
    md.append(f"- première section `## Branche ...` : {b['first_branche_heading_line']}")
    md.append("")
    md.append("## Doublons — top groupes")
    md.append("")
    md.append("| level | count | title_norm | occurrences (first 10) |")
    md.append("| --- | --- | --- | --- |")
    for g in out["duplicates"]["top_groups"]:
        occ = g["occurrences"]
        occ_txt = ", ".join(f"L{o['line']}:{o['title']}" for o in occ)
        md.append(f"| {g['level']} | {g['count']} | `{g['title_norm']}` | {occ_txt} |")
    md.append("")
    md.append("## TOC canonique (bloc initial)")
    md.append("")
    # TOC rendering is capped at 120 entries to keep the artefact reviewable.
    for h in toc[:120]:
        indent = " " * max(0, h["level"] - 2)
        md.append(f"{indent}- L{h['line']} `{h['title']}`")
    if len(toc) > 120:
        md.append("")
        md.append(f"(TOC tronquée : {len(toc)} entrées)")
    md.append("")
    md.append("## Table de renvois (headings `##` dupliqués)")
    md.append("")
    md.append("Chaque ligne pointe vers une occurrence canonique (première) et liste les duplicats.")
    md.append("")
    md.append("| title_norm | canonical | duplicates (lines) |")
    md.append("| --- | --- | --- |")
    # Cross-reference table is capped at 200 rows x 30 duplicate lines each.
    for r in renvois[:200]:
        can = r["canonical"]
        dups = r["duplicates"]
        can_txt = f"L{can['line']}:{can['title']}"
        dup_lines = ", ".join(f"L{d['line']}" for d in dups[:30])
        more = "" if len(dups) <= 30 else f" (+{len(dups)-30})"
        md.append(f"| `{r['title_norm']}` | {can_txt} | {dup_lines}{more} |")
    if len(renvois) > 200:
        md.append("")
        md.append(f"(table tronquée : {len(renvois)} lignes)")
    return md


def run(*, input_path: Path, output_dir: Path) -> None:
    """Audit *input_path* and write JSON + Markdown artefacts to *output_dir*.

    Detects duplicated headings, derives the canonical TOC (first H1 block)
    and the block boundaries used by the rationalization script, then writes
    `audit_conjoncture.json` and `audit_conjoncture.md`.
    """
    lines = _read_lines(input_path)
    headings = _parse_headings(lines)
    # Boundary detection: duplicated H1 trunks and the imported non-formal block.
    h1_conj = _find_heading_lines(lines, "# Conjecture de Collatz:")
    vulg_start = _find_line(lines, "La raison mathématique n'est pas connue.")
    branche_first = _first_heading_line_matching(headings, r"^##\s+Branche\b")
    # The imported block ends just before the first `## Branche` heading, but
    # only when both anchors exist and appear in the expected order.
    vulg_end = (
        (branche_first - 1)
        if (vulg_start is not None and branche_first is not None and branche_first > vulg_start)
        else None
    )
    # Canonical block runs up to (not including) the second H1 copy; when the
    # trunk is not duplicated, the whole file is canonical.
    canonical_end = (h1_conj[1] - 1) if len(h1_conj) >= 2 else len(lines)
    dup = _compute_duplicates(headings)
    toc = _canonical_toc(headings, canonical_end)
    renvois = _build_renvois(dup)
    out = {
        "inputs": {"conjoncture_path": str(input_path)},
        "stats": {"lines": len(lines), "headings_total": len(headings)},
        "boundaries": {
            "h1_conjecture_lines": h1_conj,
            "vulgarisation_start_line": vulg_start,
            "vulgarisation_end_line": vulg_end,
            "first_branche_heading_line": branche_first,
            "canonical_end_line": canonical_end,
        },
        "duplicates": {"groups_count": dup["groups_count"], "top_groups": dup["top_groups"]},
        "canonical_toc": toc,
        "renvois_level2": renvois,
    }
    _write_json(output_dir / "audit_conjoncture.json", out)
    _write_md(
        output_dir / "audit_conjoncture.md",
        _render_audit_md(input_path=input_path, out=out, toc=toc, renvois=renvois),
    )
def main() -> None:
    """Command-line entry point: parse arguments and launch the audit."""
    default_input = "applications/collatz/conjoncture_collatz.md"
    default_output = "docs/artefacts/collatz/conjoncture_rationalisation"
    parser = argparse.ArgumentParser(
        description="Audit duplications + canonical TOC for conjoncture_collatz.md"
    )
    parser.add_argument("--input", default=default_input, help="Path to conjoncture_collatz.md")
    parser.add_argument(
        "--output-dir",
        default=default_output,
        help="Output directory for JSON/MD audit artefacts",
    )
    ns = parser.parse_args()
    run(input_path=Path(ns.input).resolve(), output_dir=Path(ns.output_dir).resolve())


if __name__ == "__main__":
    main()