**Motivations:** - Make `conjoncture_collatz.md` indexable and avoid duplicated trunks. **Root causes:** - Multiple full copies of the formal trunk and a large non-formal imported block prevented unambiguous references. **Fixes:** - Move duplicated trunks and the imported non-formal block to `conjoncture_collatz_annexes.md`. - Make generic repeated headings unique via deterministic numbering (CSP/CE). **Evolutions:** - Add deterministic audit + rationalization scripts and versioned audit artefacts. **Affected pages:** - applications/collatz/conjoncture_collatz.md - applications/collatz/conjoncture_collatz_annexes.md - applications/collatz/collatz_k_scripts/collatz_conjoncture_audit.py - applications/collatz/collatz_k_scripts/collatz_conjoncture_rationalize.py - docs/artefacts/collatz/conjoncture_rationalisation/* - docs/features/collatz_conjoncture_rationalization_tooling.md - docs/collatz_conjoncture_collatz_cartographie.md
270 lines · 9.0 KiB · Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
collatz_conjoncture_audit.py
|
|
|
|
Deterministic audit tool for `applications/collatz/conjoncture_collatz.md`:
|
|
- detect duplicated heading titles (normalized)
|
|
- build a canonical TOC (by default: first H1 block)
|
|
- build a reference table mapping duplicate headings to a canonical occurrence
|
|
- detect key block boundaries used for rationalization (duplicate trunks and imported block)
|
|
|
|
Outputs (JSON + MD) are intended to be versioned under `docs/artefacts/`.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import unicodedata
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
|
|
@dataclass(frozen=True)
class Heading:
    """One Markdown ATX heading occurrence in the audited file."""

    line: int  # 1-based line number where the heading appears
    level: int  # number of leading '#' characters (1-6)
    title: str  # heading text, with the '#' prefix and surrounding whitespace stripped
|
|
|
|
|
|
def _read_lines(path: Path) -> list[str]:
|
|
return path.read_text(encoding="utf-8", errors="strict").splitlines()
|
|
|
|
|
|
def _strip_accents(s: str) -> str:
|
|
return "".join(ch for ch in unicodedata.normalize("NFKD", s) if not unicodedata.combining(ch))
|
|
|
|
|
|
def _norm_title(title: str) -> str:
|
|
t = _strip_accents(title).lower()
|
|
t = re.sub(r"\s+", " ", t).strip()
|
|
return t
|
|
|
|
|
|
def _parse_headings(lines: list[str]) -> list[Heading]:
    """Scan *lines* for ATX headings ('#' .. '######') and return them in file order."""
    rx = re.compile(r"^(#{1,6})\s+(.*)$")
    found: list[Heading] = []
    for number, text in enumerate(lines, start=1):
        match = rx.match(text)
        if match is None:
            continue
        found.append(
            Heading(line=number, level=len(match.group(1)), title=match.group(2).strip())
        )
    return found
|
|
|
|
|
|
def _find_line(lines: list[str], needle: str) -> int | None:
|
|
for i, line in enumerate(lines, start=1):
|
|
if line.strip() == needle:
|
|
return i
|
|
return None
|
|
|
|
|
|
def _find_heading_lines(lines: list[str], prefix: str) -> list[int]:
|
|
out: list[int] = []
|
|
for i, line in enumerate(lines, start=1):
|
|
if line.startswith(prefix):
|
|
out.append(i)
|
|
return out
|
|
|
|
|
|
def _first_heading_line_matching(headings: list[Heading], pattern: str) -> int | None:
|
|
rx = re.compile(pattern)
|
|
for h in headings:
|
|
if rx.match(f"{'#' * h.level} {h.title}"):
|
|
return h.line
|
|
return None
|
|
|
|
|
|
def _compute_duplicates(headings: list[Heading]) -> dict[str, object]:
    """Group headings by (level, normalized title) and report groups with >= 2 occurrences.

    Returns a dict with:
    - ``groups_count``: number of duplicated groups (all levels);
    - ``top_groups``: the 30 largest groups, most occurrences first, each
      capped at 10 listed occurrences;
    - ``all_groups``: every duplicated group sorted by (level, normalized
      title), split into a canonical first occurrence and the remaining ones.
    """
    by_key: dict[tuple[int, str], list[Heading]] = {}
    for heading in headings:
        by_key.setdefault((heading.level, _norm_title(heading.title)), []).append(heading)

    duplicated = {key: occ for key, occ in by_key.items() if len(occ) >= 2}

    # Largest groups first; ties broken deterministically by level then title.
    ranked = sorted(duplicated.items(), key=lambda item: (-len(item[1]), item[0][0], item[0][1]))
    top_view: list[dict[str, object]] = [
        {
            "level": level,
            "title_norm": norm,
            "count": len(occ),
            "occurrences": [{"line": h.line, "title": h.title} for h in occ[:10]],
        }
        for (level, norm), occ in ranked[:30]
    ]

    ordered = sorted(duplicated.items(), key=lambda item: (item[0][0], item[0][1]))
    all_view: list[dict[str, object]] = [
        {
            "level": level,
            "title_norm": norm,
            "count": len(occ),
            "first": {"line": occ[0].line, "title": occ[0].title},
            "others": [{"line": h.line, "title": h.title} for h in occ[1:]],
        }
        for (level, norm), occ in ordered
    ]

    return {
        "groups_count": len(duplicated),
        "top_groups": top_view,
        "all_groups": all_view,
    }
|
|
|
|
|
|
def _canonical_toc(headings: list[Heading], canonical_end_line: int) -> list[dict[str, object]]:
|
|
toc: list[dict[str, object]] = []
|
|
for h in headings:
|
|
if h.line > canonical_end_line:
|
|
break
|
|
if h.level < 2:
|
|
continue
|
|
toc.append({"line": h.line, "level": h.level, "title": h.title})
|
|
return toc
|
|
|
|
|
|
def _write_json(path: Path, obj: object) -> None:
|
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
path.write_text(json.dumps(obj, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
|
|
|
|
|
|
def _write_md(path: Path, lines: list[str]) -> None:
|
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
path.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
|
|
|
|
def run(*, input_path: Path, output_dir: Path) -> None:
    """Audit *input_path* and write `audit_conjoncture.json` + `audit_conjoncture.md` under *output_dir*.

    Detects duplicated headings, computes the canonical block boundary (end of
    the first `# Conjecture de Collatz:` trunk), locates the imported
    non-formal ("vulgarisation") block, and builds a level-2 cross-reference
    ("renvois") table mapping each duplicated `##` heading to its canonical
    first occurrence.
    """
    lines = _read_lines(input_path)
    headings = _parse_headings(lines)

    # Boundary anchors: every H1 occurrence of the formal trunk, the first line
    # of the imported non-formal block, and the first `## Branche` section that
    # terminates it.
    h1_conj = _find_heading_lines(lines, "# Conjecture de Collatz:")
    vulg_start = _find_line(lines, "La raison mathématique n'est pas connue.")
    branche_first = _first_heading_line_matching(headings, r"^##\s+Branche\b")
    # The imported block ends just before the first `## Branche` heading, but
    # only when both anchors exist and appear in the expected order.
    vulg_end = (branche_first - 1) if (vulg_start is not None and branche_first is not None and branche_first > vulg_start) else None

    # Canonical block = first H1 trunk: it ends right before the second H1
    # occurrence, or at EOF when the trunk is not duplicated.
    canonical_end = (h1_conj[1] - 1) if len(h1_conj) >= 2 else len(lines)

    dup = _compute_duplicates(headings)
    toc = _canonical_toc(headings, canonical_end)

    # Level-2 cross-reference table: one entry per duplicated `##` heading,
    # pointing from the canonical (first) occurrence to the remaining ones.
    renvois: list[dict[str, object]] = []
    for g in dup["all_groups"]:
        # `dup` is typed dict[str, object]; narrow each group defensively
        # before indexing into it.
        if not isinstance(g, dict):
            continue
        lvl = g.get("level")
        if lvl != 2:
            continue
        first = g.get("first")
        others = g.get("others")
        if not isinstance(first, dict) or not isinstance(others, list):
            continue
        renvois.append(
            {
                "title_norm": g.get("title_norm"),
                "canonical": first,
                "duplicates": others,
                "count": g.get("count"),
            }
        )

    # Machine-readable audit artefact (versioned under docs/artefacts/).
    out = {
        "inputs": {"conjoncture_path": str(input_path)},
        "stats": {"lines": len(lines), "headings_total": len(headings)},
        "boundaries": {
            "h1_conjecture_lines": h1_conj,
            "vulgarisation_start_line": vulg_start,
            "vulgarisation_end_line": vulg_end,
            "first_branche_heading_line": branche_first,
            "canonical_end_line": canonical_end,
        },
        "duplicates": {"groups_count": dup["groups_count"], "top_groups": dup["top_groups"]},
        "canonical_toc": toc,
        "renvois_level2": renvois,
    }

    json_path = output_dir / "audit_conjoncture.json"
    md_path = output_dir / "audit_conjoncture.md"
    _write_json(json_path, out)

    # Human-readable Markdown companion report (French, matching the audited
    # document's language); built line by line, then written once at the end.
    md: list[str] = []
    md.append("**Auteur** : Équipe 4NK")
    md.append("")
    md.append("# Audit déterministe — `conjoncture_collatz.md`")
    md.append("")
    md.append("## Entrée")
    md.append("")
    md.append(f"- fichier : `{input_path}`")
    md.append("")
    md.append("## Statistiques")
    md.append("")
    md.append(f"- lignes : {out['stats']['lines']}")
    md.append(f"- headings total : {out['stats']['headings_total']}")
    md.append(f"- groupes de headings dupliqués (tous niveaux) : {out['duplicates']['groups_count']}")
    md.append("")
    md.append("## Bornes détectées (rationalisation)")
    md.append("")
    md.append(f"- occurrences `# Conjecture de Collatz:` : {h1_conj}")
    md.append(f"- canonical_end_line (premier bloc H1) : {canonical_end}")
    md.append(f"- début bloc importé : {vulg_start}")
    md.append(f"- fin bloc importé : {vulg_end}")
    md.append(f"- première section `## Branche ...` : {branche_first}")
    md.append("")
    md.append("## Doublons — top groupes")
    md.append("")
    md.append("| level | count | title_norm | occurrences (first 10) |")
    md.append("| --- | --- | --- | --- |")
    for g in out["duplicates"]["top_groups"]:
        occ = g["occurrences"]
        occ_txt = ", ".join(f"L{o['line']}:{o['title']}" for o in occ)
        md.append(f"| {g['level']} | {g['count']} | `{g['title_norm']}` | {occ_txt} |")
    md.append("")
    md.append("## TOC canonique (bloc initial)")
    md.append("")
    for h in toc[:120]:
        # Two spaces of indent per heading level below `##`.
        indent = " " * max(0, h["level"] - 2)
        md.append(f"{indent}- L{h['line']} `{h['title']}`")
    if len(toc) > 120:
        md.append("")
        md.append(f"(TOC tronquée : {len(toc)} entrées)")
    md.append("")
    md.append("## Table de renvois (headings `##` dupliqués)")
    md.append("")
    md.append("Chaque ligne pointe vers une occurrence canonique (première) et liste les duplicats.")
    md.append("")
    md.append("| title_norm | canonical | duplicates (lines) |")
    md.append("| --- | --- | --- |")
    for r in renvois[:200]:
        can = r["canonical"]
        dups = r["duplicates"]
        can_txt = f"L{can['line']}:{can['title']}"
        # Cap the duplicate list at 30 lines; note the overflow count if any.
        dup_lines = ", ".join(f"L{d['line']}" for d in dups[:30])
        more = "" if len(dups) <= 30 else f" (+{len(dups)-30})"
        md.append(f"| `{r['title_norm']}` | {can_txt} | {dup_lines}{more} |")
    if len(renvois) > 200:
        md.append("")
        md.append(f"(table tronquée : {len(renvois)} lignes)")
    _write_md(md_path, md)
|
|
|
|
|
|
def main() -> None:
    """CLI entry point: parse command-line options and run the audit."""
    parser = argparse.ArgumentParser(
        description="Audit duplications + canonical TOC for conjoncture_collatz.md"
    )
    parser.add_argument(
        "--input",
        default="applications/collatz/conjoncture_collatz.md",
        help="Path to conjoncture_collatz.md",
    )
    parser.add_argument(
        "--output-dir",
        default="docs/artefacts/collatz/conjoncture_rationalisation",
        help="Output directory for JSON/MD audit artefacts",
    )
    ns = parser.parse_args()
    run(input_path=Path(ns.input).resolve(), output_dir=Path(ns.output_dir).resolve())


if __name__ == "__main__":
    main()
|
|
|