#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
collatz_conjoncture_audit.py

Deterministic audit tool for `applications/collatz/conjoncture_collatz.md`:
- detect duplicated heading titles (normalized)
- build a canonical TOC (by default: first H1 block)
- build a reference table mapping duplicate headings to a canonical occurrence
- detect key block boundaries used for rationalization (duplicate trunks and imported block)

Outputs (JSON + MD) are intended to be versioned under `docs/artefacts/`.
"""

from __future__ import annotations

import argparse
import json
import re
import unicodedata
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class Heading:
    """A markdown ATX heading located in the audited file.

    Attributes:
        line: 1-based line number of the heading in the source file.
        level: number of leading `#` characters (1..6).
        title: heading text with the `#` prefix and surrounding whitespace stripped.
    """

    line: int
    level: int
    title: str


def _read_lines(path: Path) -> list[str]:
    """Read *path* as UTF-8 and return its lines (strict decoding, no newlines)."""
    return path.read_text(encoding="utf-8", errors="strict").splitlines()


def _strip_accents(s: str) -> str:
    """Return *s* with combining accents removed (NFKD decomposition)."""
    return "".join(ch for ch in unicodedata.normalize("NFKD", s) if not unicodedata.combining(ch))


def _norm_title(title: str) -> str:
    """Normalize a heading title for duplicate detection.

    Accents are stripped, case is folded to lowercase, and runs of
    whitespace are collapsed to a single space.
    """
    t = _strip_accents(title).lower()
    t = re.sub(r"\s+", " ", t).strip()
    return t


def _parse_headings(lines: list[str]) -> list[Heading]:
    """Extract all ATX headings (`#` .. `######`) from *lines*, in file order."""
    out: list[Heading] = []
    for i, line in enumerate(lines, start=1):
        m = re.match(r"^(#{1,6})\s+(.*)$", line)
        if not m:
            continue
        lvl = len(m.group(1))
        title = m.group(2).strip()
        out.append(Heading(line=i, level=lvl, title=title))
    return out


def _find_line(lines: list[str], needle: str) -> int | None:
    """Return the 1-based index of the first line equal to *needle* after stripping, or None."""
    for i, line in enumerate(lines, start=1):
        if line.strip() == needle:
            return i
    return None


def _find_heading_lines(lines: list[str], prefix: str) -> list[int]:
    """Return the 1-based indices of all lines starting with *prefix*."""
    out: list[int] = []
    for i, line in enumerate(lines, start=1):
        if line.startswith(prefix):
            out.append(i)
    return out


def _first_heading_line_matching(headings: list[Heading], pattern: str) -> int | None:
    """Return the line of the first heading whose reconstructed text matches *pattern*.

    The heading is reconstructed as `"#" * level + " " + title` before matching,
    so patterns may anchor on the `#` prefix (e.g. r"^##\\s+Branche\\b").
    """
    rx = re.compile(pattern)
    for h in headings:
        if rx.match(f"{'#' * h.level} {h.title}"):
            return h.line
    return None


def _compute_duplicates(headings: list[Heading]) -> dict[str, object]:
    """Group headings by (level, normalized title) and report duplicate groups.

    Returns a dict with:
    - "groups_count": number of (level, title) groups occurring >= 2 times
    - "top_groups": up to 30 largest groups (10 occurrences each), sorted by
      descending count, then level, then normalized title
    - "all_groups": every duplicate group sorted by (level, title), with the
      first occurrence singled out as canonical and the rest listed as others
    """
    groups: dict[tuple[int, str], list[Heading]] = {}
    for h in headings:
        key = (h.level, _norm_title(h.title))
        groups.setdefault(key, []).append(h)
    dup_groups = {k: v for k, v in groups.items() if len(v) >= 2}
    top = sorted(dup_groups.items(), key=lambda kv: (-len(kv[1]), kv[0][0], kv[0][1]))
    top_view: list[dict[str, object]] = []
    for (lvl, ntitle), occ in top[:30]:
        top_view.append(
            {
                "level": lvl,
                "title_norm": ntitle,
                "count": len(occ),
                "occurrences": [{"line": h.line, "title": h.title} for h in occ[:10]],
            }
        )
    return {
        "groups_count": len(dup_groups),
        "top_groups": top_view,
        "all_groups": [
            {
                "level": lvl,
                "title_norm": ntitle,
                "count": len(occ),
                "first": {"line": occ[0].line, "title": occ[0].title},
                "others": [{"line": h.line, "title": h.title} for h in occ[1:]],
            }
            for (lvl, ntitle), occ in sorted(dup_groups.items(), key=lambda kv: (kv[0][0], kv[0][1]))
        ],
    }


def _canonical_toc(headings: list[Heading], canonical_end_line: int) -> list[dict[str, object]]:
    """Build the TOC of the canonical block: headings of level >= 2 up to *canonical_end_line*."""
    toc: list[dict[str, object]] = []
    for h in headings:
        if h.line > canonical_end_line:
            break
        if h.level < 2:
            continue
        toc.append({"line": h.line, "level": h.level, "title": h.title})
    return toc


def _write_json(path: Path, obj: object) -> None:
    """Serialize *obj* as pretty-printed UTF-8 JSON at *path*, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(obj, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")


def _write_md(path: Path, lines: list[str]) -> None:
    """Write *lines* joined by newlines (with trailing newline) at *path*, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(lines) + "\n", encoding="utf-8")


def run(*, input_path: Path, output_dir: Path) -> None:
    """Audit *input_path* and write JSON + MD artefacts under *output_dir*.

    Boundaries detected:
    - occurrences of the `# Conjecture de Collatz:` H1 (duplicate trunks)
    - the imported "vulgarisation" block (start sentence .. first `## Branche` heading)
    - the canonical block end (line before the second H1, or EOF when unique)
    """
    lines = _read_lines(input_path)
    headings = _parse_headings(lines)

    h1_conj = _find_heading_lines(lines, "# Conjecture de Collatz:")
    vulg_start = _find_line(lines, "La raison mathématique n'est pas connue.")
    branche_first = _first_heading_line_matching(headings, r"^##\s+Branche\b")
    # The imported block ends just before the first `## Branche` heading, but only
    # when both markers exist and are ordered correctly.
    vulg_end = (
        (branche_first - 1)
        if (vulg_start is not None and branche_first is not None and branche_first > vulg_start)
        else None
    )
    # Canonical block = everything before the second duplicate H1 trunk; the
    # whole file when the H1 appears at most once.
    canonical_end = (h1_conj[1] - 1) if len(h1_conj) >= 2 else len(lines)

    dup = _compute_duplicates(headings)
    toc = _canonical_toc(headings, canonical_end)

    # Cross-reference table: only level-2 duplicate groups, each pointing at its
    # canonical (first) occurrence.
    renvois: list[dict[str, object]] = []
    for g in dup["all_groups"]:
        if not isinstance(g, dict):
            continue
        lvl = g.get("level")
        if lvl != 2:
            continue
        first = g.get("first")
        others = g.get("others")
        if not isinstance(first, dict) or not isinstance(others, list):
            continue
        renvois.append(
            {
                "title_norm": g.get("title_norm"),
                "canonical": first,
                "duplicates": others,
                "count": g.get("count"),
            }
        )

    out = {
        "inputs": {"conjoncture_path": str(input_path)},
        "stats": {"lines": len(lines), "headings_total": len(headings)},
        "boundaries": {
            "h1_conjecture_lines": h1_conj,
            "vulgarisation_start_line": vulg_start,
            "vulgarisation_end_line": vulg_end,
            "first_branche_heading_line": branche_first,
            "canonical_end_line": canonical_end,
        },
        "duplicates": {"groups_count": dup["groups_count"], "top_groups": dup["top_groups"]},
        "canonical_toc": toc,
        "renvois_level2": renvois,
    }

    json_path = output_dir / "audit_conjoncture.json"
    md_path = output_dir / "audit_conjoncture.md"
    _write_json(json_path, out)

    md: list[str] = []
    md.append("**Auteur** : Équipe 4NK")
    md.append("")
    md.append("# Audit déterministe — `conjoncture_collatz.md`")
    md.append("")
    md.append("## Entrée")
    md.append("")
    md.append(f"- fichier : `{input_path}`")
    md.append("")
    md.append("## Statistiques")
    md.append("")
    md.append(f"- lignes : {out['stats']['lines']}")
    md.append(f"- headings total : {out['stats']['headings_total']}")
    md.append(f"- groupes de headings dupliqués (tous niveaux) : {out['duplicates']['groups_count']}")
    md.append("")
    md.append("## Bornes détectées (rationalisation)")
    md.append("")
    md.append(f"- occurrences `# Conjecture de Collatz:` : {h1_conj}")
    md.append(f"- canonical_end_line (premier bloc H1) : {canonical_end}")
    md.append(f"- début bloc importé : {vulg_start}")
    md.append(f"- fin bloc importé : {vulg_end}")
    md.append(f"- première section `## Branche ...` : {branche_first}")
    md.append("")
    md.append("## Doublons — top groupes")
    md.append("")
    md.append("| level | count | title_norm | occurrences (first 10) |")
    md.append("| --- | --- | --- | --- |")
    for g in out["duplicates"]["top_groups"]:
        occ = g["occurrences"]
        occ_txt = ", ".join(f"L{o['line']}:{o['title']}" for o in occ)
        md.append(f"| {g['level']} | {g['count']} | `{g['title_norm']}` | {occ_txt} |")
    md.append("")
    md.append("## TOC canonique (bloc initial)")
    md.append("")
    for h in toc[:120]:
        indent = " " * max(0, h["level"] - 2)
        md.append(f"{indent}- L{h['line']} `{h['title']}`")
    if len(toc) > 120:
        md.append("")
        md.append(f"(TOC tronquée : {len(toc)} entrées)")
    md.append("")
    md.append("## Table de renvois (headings `##` dupliqués)")
    md.append("")
    md.append("Chaque ligne pointe vers une occurrence canonique (première) et liste les duplicats.")
    md.append("")
    md.append("| title_norm | canonical | duplicates (lines) |")
    md.append("| --- | --- | --- |")
    for r in renvois[:200]:
        can = r["canonical"]
        dups = r["duplicates"]
        can_txt = f"L{can['line']}:{can['title']}"
        dup_lines = ", ".join(f"L{d['line']}" for d in dups[:30])
        more = "" if len(dups) <= 30 else f" (+{len(dups)-30})"
        md.append(f"| `{r['title_norm']}` | {can_txt} | {dup_lines}{more} |")
    if len(renvois) > 200:
        md.append("")
        md.append(f"(table tronquée : {len(renvois)} lignes)")
    _write_md(md_path, md)


def main() -> None:
    """CLI entry point: parse `--input` / `--output-dir` and run the audit."""
    ap = argparse.ArgumentParser(description="Audit duplications + canonical TOC for conjoncture_collatz.md")
    ap.add_argument(
        "--input",
        default="applications/collatz/conjoncture_collatz.md",
        help="Path to conjoncture_collatz.md",
    )
    ap.add_argument(
        "--output-dir",
        default="docs/artefacts/collatz/conjoncture_rationalisation",
        help="Output directory for JSON/MD audit artefacts",
    )
    args = ap.parse_args()
    run(input_path=Path(args.input).resolve(), output_dir=Path(args.output_dir).resolve())


if __name__ == "__main__":
    main()