# algo/applications/collatz/collatz_k_scripts/md_to_audit_json.py

# -*- coding: utf-8 -*-
"""
md_to_audit_json.py

Parse audit_60_etats_B12_mod4096_horizon7.md and output audit_60_etats_B12_mod4096_horizon7.json
with residue_to_state mapping and state_table.
"""

from __future__ import annotations
import argparse
import json
import re
from pathlib import Path


# Columns of the state table whose cells are converted to int when possible.
_INT_COLUMNS = frozenset(
    ("État", "Somme A", "Effectif", "C7", "n7 mod 3", "n7 mod 2187")
)
# One cell of a markdown delimiter row, e.g. '---', ':---' or ':---:'.
_SEPARATOR_CELL = re.compile(r":?-+:?")


def _split_row(line: str) -> list[str]:
    """Split one markdown table line into stripped cell strings.

    Only a single leading and a single trailing pipe are removed, so an empty
    first or last cell ('||') is kept as '' instead of being swallowed.
    """
    body = line.strip()
    if body.startswith("|"):
        body = body[1:]
    if body.endswith("|"):
        body = body[:-1]
    return [cell.strip() for cell in body.split("|")]


def _is_separator_row(line: str) -> bool:
    """Return True when every cell of *line* is a delimiter such as '---'."""
    return all(_SEPARATOR_CELL.fullmatch(cell) for cell in _split_row(line))


def parse_state_table(text: str) -> list[dict]:
    """Parse the markdown table '| État | Mot (a0..a6) | ...' into a list of dicts.

    The table starts at the first line containing a pipe, 'État' and
    'Mot (a0..a6)', and ends at the first following line that does not start
    with a pipe.

    Args:
        text: Full markdown document.

    Returns:
        One dict per data row, keyed by the header cell texts. Cells of the
        columns listed in ``_INT_COLUMNS`` are converted to ``int`` when they
        parse as integers; every other cell stays a stripped string. Rows with
        fewer cells than the header are skipped. An empty list is returned
        when no table (or no data row) is found.
    """
    table_lines = []
    in_table = False
    for ln in text.splitlines():
        if not in_table:
            if "|" in ln and "État" in ln and "Mot (a0..a6)" in ln:
                in_table = True
            else:
                continue
        stripped = ln.strip()
        if not stripped.startswith("|"):
            break  # first non-table line ends the table
        # Only a true delimiter row is skipped; a data row that merely
        # contains '---' inside a cell must be kept.
        if _is_separator_row(stripped):
            continue
        table_lines.append(ln)

    if len(table_lines) < 2:
        return []

    header = _split_row(table_lines[0])
    rows = []
    for ln in table_lines[1:]:
        parts = _split_row(ln)
        if len(parts) < len(header):
            continue  # malformed / truncated row
        row = {}
        # zip() stops at the header length, ignoring any surplus cells.
        for h, val in zip(header, parts):
            if h in _INT_COLUMNS:
                try:
                    row[h] = int(val)
                except ValueError:
                    row[h] = val  # keep the raw text when it is not numeric
            else:
                row[h] = val
        rows.append(row)
    return rows


# Start of a '### État N' heading, anchored at any line start so that a
# heading on the very first line of the document is recognised too.
_STATE_HEADING = re.compile(r"^### État ", re.MULTILINE)
# State number immediately following the heading prefix.
_STATE_ID = re.compile(r"(\d+)\s")
# Residue list introduced by the 'Résidus (mod 4096), effectif N :' line; the
# capture spans digits, commas and whitespace (so it may cover several lines).
_RESIDUE_LIST = re.compile(
    r"Résidus \(mod 4096\), effectif \d+ :\s*\n\s*([\d,\s]+)"
)
_INTEGER = re.compile(r"\d+")


def parse_residues_by_state(text: str) -> dict[int, list[int]]:
    """Parse '### État N' sections and extract residues for each state.

    Args:
        text: Full markdown document.

    Returns:
        Mapping ``state_id -> residues`` in document order. Sections without a
        numeric id or without a residue list are skipped. If a state id occurs
        in several sections, the last section wins.
    """
    residue_by_state: dict[int, list[int]] = {}
    sections = _STATE_HEADING.split(text)
    for block in sections[1:]:  # sections[0] is the text before the first heading
        m = _STATE_ID.match(block)
        if not m:
            continue
        res_match = _RESIDUE_LIST.search(block)
        if not res_match:
            continue
        # Extract every integer rather than splitting on commas: a list that
        # wraps onto a new line without a trailing comma would otherwise
        # yield tokens like '3\n4' and make int() raise.
        residue_by_state[int(m.group(1))] = [
            int(tok) for tok in _INTEGER.findall(res_match.group(1))
        ]
    return residue_by_state


def build_residue_to_state(residue_by_state: dict[int, list[int]]) -> dict[str, int]:
    """Invert the per-state residue lists into a {str(residue): state_id} lookup.

    Keys are strings so the mapping can be dumped directly as a JSON object.
    When a residue is listed under several states, the state visited last in
    iteration order is the one kept.
    """
    return {
        str(residue): state
        for state, members in residue_by_state.items()
        for residue in members
    }


def main() -> None:
    """Command-line entry point: read the audit markdown and write its JSON form.

    The input and output paths come from ``--input``/``-i`` and
    ``--output``/``-o``. The JSON document holds two keys,
    ``residue_to_state`` and ``state_table``; a one-line summary is printed
    once the file has been written.
    """
    parser = argparse.ArgumentParser(description="Parse audit MD to JSON")
    parser.add_argument(
        "--input", "-i", default="audit_60_etats_B12_mod4096_horizon7.md"
    )
    parser.add_argument(
        "--output", "-o", default="audit_60_etats_B12_mod4096_horizon7.json"
    )
    options = parser.parse_args()

    markdown = Path(options.input).read_text(encoding="utf-8")
    table = parse_state_table(markdown)
    mapping = build_residue_to_state(parse_residues_by_state(markdown))

    # ensure_ascii=False keeps accented header names readable in the output.
    payload = json.dumps(
        {"residue_to_state": mapping, "state_table": table},
        indent=2,
        ensure_ascii=False,
    )
    Path(options.output).write_text(payload, encoding="utf-8")
    print(f"Wrote {options.output}: {len(mapping)} residues, {len(table)} states")


# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()