algo/applications/collatz/collatz_k_scripts/collatz_extract_universal_clauses.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
collatz_extract_universal_clauses.py

Option A (Lift(B12)-scoped): convert deterministic *local witnesses* from the C3
verification artefact into *universal clause candidates* (D/F) by lifting each
witnessed residue to the minimal modulus that fixes its prefix of valuations.

Inputs:
- docs/artefacts/collatz/c3_local_descent/verification_c3_local_descent.json

Outputs (versionable, no transcripts):
- docs/artefacts/collatz/universal_clauses/clauses_universelles.json
- docs/artefacts/collatz/universal_clauses/clauses_universelles.md

Notes:
- D_exact and F witnesses are lifted to modulus 2^(A+1) where A is the prefix sum.
- D_brother witnesses are preserved as *local* dependencies (mate_exact) at the
  domain palier; they are not asserted as standalone universal reductions here.
"""

from __future__ import annotations

import argparse
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Literal

from collatz_k_core import prefix_data


ClauseKind = Literal["D_exact", "F", "D_brother_local"]


def _read_json(path: Path) -> object:
    return json.loads(path.read_text(encoding="utf-8", errors="strict"))


def _display_path(path: Path, repo_root: Path) -> str:
    try:
        rel = path.resolve().relative_to(repo_root.resolve())
        return str(rel)
    except ValueError:
        return str(path)


@dataclass(frozen=True)
class ExtractedClause:
    kind: ClauseKind
    domain_palier: int
    source_n: int
    stable_modulus_power: int | None
    stable_residue_mod_2p: int | None
    # shared: prefix length + affine constants
    k_or_t: int | None
    A: int | None
    C: int | None
    word: tuple[int, ...] | None
    # D
    N0: int | None
    # F
    a: int | None
    y: int | None
    preimage_m: int | None
    DeltaF: int | None
    Nf: int | None
    source_csv: str | None
    # brother
    mate_exact: int | None

    def to_json(self) -> dict[str, object]:
        return {
            "kind": self.kind,
            "domain_palier": self.domain_palier,
            "source_n": self.source_n,
            "stable_modulus_power": self.stable_modulus_power,
            "stable_residue_mod_2p": self.stable_residue_mod_2p,
            "k_or_t": self.k_or_t,
            "A": self.A,
            "C": self.C,
            "word": list(self.word) if self.word is not None else None,
            "N0": self.N0,
            "a": self.a,
            "y": self.y,
            "preimage_m": self.preimage_m,
            "DeltaF": self.DeltaF,
            "Nf": self.Nf,
            "source_csv": self.source_csv,
            "mate_exact": self.mate_exact,
        }


def _lift_to_stable_residue(
    *,
    n0: int,
    base_palier: int,
    target_palier: int,
    prefix_len: int,
    expected_word: tuple[int, ...],
) -> int:
    """
    Deterministically lift a residue from 2^base_palier to 2^target_palier by choosing,
    at each step, the unique child (r or r+2^m) that preserves the expected prefix word.
    """
    if target_palier < base_palier:
        raise ValueError("target_palier must be >= base_palier")
    if prefix_len < 1:
        raise ValueError("prefix_len must be >= 1")
    r = n0
    for m in range(base_palier, target_palier):
        shift = 1 << m
        c0 = r
        c1 = r + shift
        w0 = prefix_data(c0, prefix_len).word
        w1 = prefix_data(c1, prefix_len).word
        ok0 = w0 == expected_word
        ok1 = w1 == expected_word
        if ok0 and not ok1:
            r = c0
        elif ok1 and not ok0:
            r = c1
        else:
            raise ValueError(
                "Ambiguous lift (expected exactly one child to preserve the prefix word): "
                f"m={m}, base_palier={base_palier}, target_palier={target_palier}, n0={n0}, ok0={ok0}, ok1={ok1}"
            )
    return r


def _format_int_list(values: list[int], per_line: int = 16) -> list[str]:
    lines: list[str] = []
    for i in range(0, len(values), per_line):
        lines.append(", ".join(str(x) for x in values[i : i + per_line]))
    return lines


def run(*, verification_json: Path, output_dir: Path, repo_root: Path) -> None:
    obj = _read_json(verification_json)
    if not isinstance(obj, dict):
        raise ValueError("Invalid C3 verification JSON: expected object")
    domain = obj.get("domain")
    records = obj.get("records")
    if not isinstance(domain, dict) or not isinstance(records, list):
        raise ValueError("Invalid C3 verification JSON: missing domain/records")
    palier = domain.get("palier")
    if not isinstance(palier, int):
        raise ValueError("Invalid C3 verification JSON: domain.palier must be int")

    extracted: list[ExtractedClause] = []
    for rec in records:
        if not isinstance(rec, dict):
            raise ValueError("Invalid C3 record: expected object")
        n = rec.get("n")
        kind = rec.get("kind")
        if not isinstance(n, int) or not isinstance(kind, str):
            raise ValueError("Invalid C3 record: missing n/kind")

        if kind == "D_exact":
            k = rec.get("k")
            A_k = rec.get("A_k")
            C_k = rec.get("C_k")
            N0 = rec.get("N0")
            if not all(isinstance(x, int) for x in [k, A_k, C_k, N0]):
                raise ValueError(f"Invalid D_exact record for n={n}: missing k/A_k/C_k/N0")
            pref = prefix_data(n, k)
            if pref.A != A_k or pref.C != C_k:
                raise ValueError(f"D_exact mismatch for n={n}: prefix_data differs from record")
            m_stab = A_k + 1
            r_stab = _lift_to_stable_residue(
                n0=n,
                base_palier=palier,
                target_palier=m_stab,
                prefix_len=k,
                expected_word=pref.word,
            )
            pref_stab = prefix_data(r_stab, k)
            if pref_stab.word != pref.word or pref_stab.A != pref.A or pref_stab.C != pref.C:
                raise ValueError(f"D_exact lift check failed for n={n} at modulus 2^{m_stab}")

            extracted.append(
                ExtractedClause(
                    kind="D_exact",
                    domain_palier=palier,
                    source_n=n,
                    stable_modulus_power=m_stab,
                    stable_residue_mod_2p=r_stab,
                    k_or_t=k,
                    A=A_k,
                    C=C_k,
                    word=pref.word,
                    N0=N0,
                    a=None,
                    y=None,
                    preimage_m=None,
                    DeltaF=None,
                    Nf=None,
                    source_csv=None,
                    mate_exact=None,
                )
            )
            continue

        if kind == "F":
            t = rec.get("t")
            a = rec.get("a")
            y = rec.get("y")
            preimage_m = rec.get("preimage_m")
            A_t = rec.get("A_t")
            C_t = rec.get("C_t")
            DeltaF = rec.get("DeltaF")
            Nf = rec.get("Nf")
            source_csv = rec.get("source_csv")
            ints_ok = all(isinstance(x, int) for x in [t, a, y, preimage_m, A_t, C_t, DeltaF, Nf])
            if not ints_ok or not isinstance(source_csv, str):
                raise ValueError(f"Invalid F record for n={n}: missing fields")
            pref = prefix_data(n, t)
            if pref.A != A_t or pref.C != C_t or pref.y != y:
                raise ValueError(f"F mismatch for n={n}: prefix_data differs from record")
            m_stab = A_t + 1
            r_stab = _lift_to_stable_residue(
                n0=n,
                base_palier=palier,
                target_palier=m_stab,
                prefix_len=t,
                expected_word=pref.word,
            )
            pref_stab = prefix_data(r_stab, t)
            if pref_stab.word != pref.word or pref_stab.A != pref.A or pref_stab.C != pref.C or pref_stab.y != pref.y:
                raise ValueError(f"F lift check failed for n={n} at modulus 2^{m_stab}")

            extracted.append(
                ExtractedClause(
                    kind="F",
                    domain_palier=palier,
                    source_n=n,
                    stable_modulus_power=m_stab,
                    stable_residue_mod_2p=r_stab,
                    k_or_t=t,
                    A=A_t,
                    C=C_t,
                    word=pref.word,
                    N0=None,
                    a=a,
                    y=y,
                    preimage_m=preimage_m,
                    DeltaF=DeltaF,
                    Nf=Nf,
                    source_csv=source_csv,
                    mate_exact=None,
                )
            )
            continue

        if kind == "D_brother":
            k = rec.get("k")
            A_k = rec.get("A_k")
            C_k = rec.get("C_k")
            mate = rec.get("mate_exact")
            if not all(isinstance(x, int) for x in [k, A_k, C_k, mate]):
                raise ValueError(f"Invalid D_brother record for n={n}: missing fields")
            pref = prefix_data(n, k)
            if pref.A != A_k or pref.C != C_k:
                raise ValueError(f"D_brother mismatch for n={n}: prefix_data differs from record")
            extracted.append(
                ExtractedClause(
                    kind="D_brother_local",
                    domain_palier=palier,
                    source_n=n,
                    stable_modulus_power=None,
                    stable_residue_mod_2p=None,
                    k_or_t=k,
                    A=A_k,
                    C=C_k,
                    word=pref.word,
                    N0=None,
                    a=None,
                    y=None,
                    preimage_m=None,
                    DeltaF=None,
                    Nf=None,
                    source_csv=None,
                    mate_exact=mate,
                )
            )
            continue

        raise ValueError(f"Unknown record kind in C3 verification JSON: {kind}")

    extracted = sorted(extracted, key=lambda c: (c.kind, c.source_n))
    max_stab = max([c.stable_modulus_power for c in extracted if c.stable_modulus_power is not None], default=None)

    out_json = output_dir / "clauses_universelles.json"
    out_md = output_dir / "clauses_universelles.md"
    output_dir.mkdir(parents=True, exist_ok=True)

    summary = {
        "inputs": {
            "verification_c3_local_descent_json": _display_path(verification_json, repo_root),
        },
        "domain": {"palier": palier, "max_stable_modulus_power": max_stab},
        "counts": {
            "total": len(extracted),
            "D_exact": sum(1 for c in extracted if c.kind == "D_exact"),
            "F": sum(1 for c in extracted if c.kind == "F"),
            "D_brother_local": sum(1 for c in extracted if c.kind == "D_brother_local"),
        },
        "clauses": [c.to_json() for c in extracted],
    }
    out_json.write_text(json.dumps(summary, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")

    # Markdown output (short, human-checkable)
    lines: list[str] = []
    lines.append("**Auteur** : Équipe 4NK")
    lines.append("")
    lines.append("# Clauses universelles extraites — Option A (Lift(B12))")
    lines.append("")
    lines.append("## Entrée")
    lines.append("")
    lines.append(f"- C3 vérification JSON : `{_display_path(verification_json, repo_root)}`")
    lines.append("")
    lines.append("## Domaine")
    lines.append("")
    lines.append(f"- palier (domaine des témoins) : 2^{palier}")
    if max_stab is not None:
        lines.append(f"- max(2^m_stable) observé : 2^{max_stab}")
    lines.append("")
    lines.append("## Compteurs")
    lines.append("")
    lines.append(f"- total : {summary['counts']['total']}")
    lines.append(f"- D_exact : {summary['counts']['D_exact']}")
    lines.append(f"- F : {summary['counts']['F']}")
    lines.append(f"- D_brother_local : {summary['counts']['D_brother_local']}")
    lines.append("")
    lines.append("## Table (extrait)")
    lines.append("")
    header = ["kind", "source_n", "m_stable", "residue_mod_2^m", "k/t", "A", "N0/Nf", "mate_exact"]
    lines.append("| " + " | ".join(header) + " |")
    lines.append("| " + " | ".join(["---"] * len(header)) + " |")
    for c in extracted:
        m_st = "" if c.stable_modulus_power is None else str(c.stable_modulus_power)
        r_st = "" if c.stable_residue_mod_2p is None else str(c.stable_residue_mod_2p)
        kt = "" if c.k_or_t is None else str(c.k_or_t)
        A = "" if c.A is None else str(c.A)
        if c.kind == "D_exact":
            thr = "" if c.N0 is None else f"N0={c.N0}"
        elif c.kind == "F":
            thr = "" if c.Nf is None else f"Nf={c.Nf}"
        else:
            thr = ""
        mate = "" if c.mate_exact is None else str(c.mate_exact)
        lines.append(f"| {c.kind} | {c.source_n} | {m_st} | {r_st} | {kt} | {A} | {thr} | {mate} |")
    lines.append("")
    lines.append("## Notes")
    lines.append("")
    lines.append("- `D_exact` et `F` sont relevées au module minimal \\(2^{A+1}\\) qui fige le mot de valuations du témoin.")
    lines.append("- `D_brother_local` encode une dépendance locale (palier du domaine) vers un `mate_exact` ; elle n’est pas utilisée ici comme clause universelle autonome.")
    lines.append("")
    out_md.write_text("\n".join(lines) + "\n", encoding="utf-8")


def main() -> None:
    ap = argparse.ArgumentParser(description="Extract universal clause candidates from deterministic C3 witnesses (Option A)")
    ap.add_argument(
        "--verification-json",
        default="docs/artefacts/collatz/c3_local_descent/verification_c3_local_descent.json",
        help="Path to deterministic C3 verification JSON",
    )
    ap.add_argument(
        "--output-dir",
        default="docs/artefacts/collatz/universal_clauses",
        help="Output directory for extracted clause artefacts (JSON + MD)",
    )
    ap.add_argument(
        "--repo-root",
        default="",
        help="Repository root used to display relative paths in outputs (defaults to current working directory)",
    )
    args = ap.parse_args()

    repo_root = Path(args.repo_root).resolve() if args.repo_root.strip() else Path.cwd().resolve()
    run(
        verification_json=Path(args.verification_json).resolve(),
        output_dir=Path(args.output_dir).resolve(),
        repo_root=repo_root,
    )


if __name__ == "__main__":
    main()