algo/applications/collatz/collatz_k_scripts/collatz_k_pipeline.py
Nicolas Cantu 14ed1de36b Pipeline Collatz aligné sur commandes.md et reprise après interruption
**Motivations:**
- Implémenter le workflow complet de démonstration Collatz (commandes.md)
- Permettre la reprise après interruption au palier D20

**Evolutions:**
- Scripts 01-12 et run-full-workflow alignés sur commandes.md sections 1-10
- collatz_recover_noyau.py : recréation de noyau_post_D20 à partir du CSV candidats
- Option --resume-from D20 dans collatz_k_pipeline pour reprendre sans recalculer D18-D19-F15
- Détection automatique : si candidats_D20 existe sans noyau_post_D20, récupération puis poursuite
- Filtres --cible=critique et --modulo dans collatz_fusion_pipeline
- ROOT par défaut = collatz_k_scripts (plus data/source vide)

**Pages affectées:**
- .gitignore (__pycache__, out/)
- applications/collatz/collatz_k_scripts/*.py
- applications/collatz/scripts/*.sh
- applications/collatz/scripts/README.md
2026-03-02 02:49:23 +01:00

586 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""
collatz_k_pipeline.py
Pipeline principale (reproduction des audits "après fusion" et des paquets D16/D17).
Entrées attendues:
- audit_60_etats_B12_mod4096_horizon7.json
- complétion_minorée_m15_vers_m16.md
- candidats_D10_palier2p17.md
Sorties:
- audits Markdown
- CSV exhaustifs
- Markdown listes exhaustives (bloc ```csv```)
"""
from __future__ import annotations
from pathlib import Path
import csv
import json
import re
import tempfile
from collections import Counter
from typing import List, Set, Dict, Tuple, Iterable
from collatz_k_core import A_k, prefix_data, N0_D
from collatz_k_utils import parse_markdown_table_to_rows, write_text
from collatz_k_fusion import build_fusion_clauses
def load_state_map_60(audit60_json_path: str) -> Tuple[Dict[int, int], Dict[int, str]]:
    """Load the 60-state audit JSON.

    Args:
        audit60_json_path: path to the audit JSON (audit_60_etats_B12_mod4096_horizon7.json).

    Returns:
        A pair (res_to_state, state_mot7):
        - res_to_state maps each residue mod 4096 to its state id
          (from the "residue_to_state" object);
        - state_mot7 maps each state id to its word "a0..a6"
          (from the "state_table" rows).
    """
    # Fix: dropped the redundant function-local `import json` that shadowed
    # the module-level import.
    data = json.loads(Path(audit60_json_path).read_text(encoding="utf-8"))
    res_to_state = {int(k): int(v) for k, v in data["residue_to_state"].items()}
    state_mot7: Dict[int, str] = {}
    for row in data["state_table"]:
        state_mot7[int(row["État"])] = row["Mot (a0..a6)"]
    return res_to_state, state_mot7
def build_R17_from_completion(completion_m15_to_m16_md: str) -> List[int]:
    """Lift the 'Parents both' residues of the completion file to R17 (no D10 subtraction)."""
    content = Path(completion_m15_to_m16_md).read_text(encoding="utf-8")
    match = re.search(r"### Parents « both ».*?\n(.*)\Z", content, flags=re.S)
    if match is None:
        raise ValueError("Section 'Parents both' introuvable")
    # Every integer after the heading is a parent residue mod 2^15.
    parents = sorted({int(tok) for tok in re.findall(r"\b\d+\b", match.group(1))})
    half15, half16 = 1 << 15, 1 << 16
    # Lift 2^15 -> 2^16 -> 2^17 by adding the new top bit to each class.
    lifted16 = set(parents) | {p + half15 for p in parents}
    lifted17 = lifted16 | {x + half16 for x in lifted16}
    return sorted(lifted17)
def rebuild_R17_after_full_D10(completion_m15_to_m16_md: str, candidats_D10_md: str) -> List[int]:
    """Rebuild R17 from the completion file, then remove the full D10 coverage.

    The raw R17 set is produced by build_R17_from_completion — the original
    body duplicated that parsing/lifting logic verbatim; it now delegates so
    there is a single source of truth. Two coverage passes are subtracted:
    - the (low, high) pairs listed in the candidats_D10 markdown table;
    - the classes with A_k(x, 10) == 16 together with their class lowered by 2^16.

    Returns:
        The sorted residual R17 list.
    """
    # Raw R17 (no D10 subtraction), reusing the single-source builder.
    R17 = set(build_R17_from_completion(completion_m15_to_m16_md))
    shift16 = 1 << 16
    # Coverage from the D10 candidates table (rows[0:2] presumably header/
    # separator of the markdown table — confirm in collatz_k_utils).
    rows = parse_markdown_table_to_rows(candidats_D10_md)
    cover175: Set[int] = set()
    for parts in rows[2:]:
        cover175.add(int(parts[0]))
        cover175.add(int(parts[1]))
    R17_after_175 = R17 - cover175
    # Second pass: classes reaching A_k == 16 at horizon 10, plus their
    # 2^16-lowered counterparts.
    A10_16_high = [x for x in R17_after_175 if A_k(x, 10) == 16]
    cover171: Set[int] = set()
    for x in A10_16_high:
        cover171.add(x)
        cover171.add(x - shift16)
    return sorted(R17_after_175 - cover171)
def lift_set(residues: Iterable[int], shift: int, count: int) -> List[int]:
    """Lift each residue r to [r, r+shift, ..., r+(count-1)*shift], preserving input order."""
    return [r + j * shift for r in residues for j in range(count)]
def csv_to_md_list(csv_path: str, md_path: str, title: str, intro: str) -> None:
    """Wrap a CSV file into a Markdown page: title, intro, then a ```csv``` block."""
    src = Path(csv_path)
    dst = Path(md_path)
    with src.open("r", encoding="utf-8") as fin, dst.open("w", encoding="utf-8") as fout:
        fout.writelines(
            [
                f"# {title}\n\n",
                "## Introduction\n\n",
                intro.strip() + "\n\n",
                "## Liste exhaustive\n\n",
                "```csv\n",
            ]
        )
        tail = ""
        for row in fin:
            fout.write(row)
            tail = row
        # Ensure the fence starts on its own line even if the CSV lacks a
        # trailing newline.
        if tail and not tail.endswith("\n"):
            fout.write("\n")
        fout.write("```\n")
def run_after_fusion_D16_D17(
    audit60_json: str,
    completion_m15_to_m16_md: str,
    candidats_D10_md: str,
    out_dir: str,
) -> None:
    """Reproduce the "after fusion" audits and the D16/D17 packets.

    Starting from R17 (completion minus full D10 coverage), successively
    applies the D11..D15 sieves, removes the classes hit by the fusion
    clauses F(11)/F(12)/F(14) at level 2^25, then extracts the D16 (2^27)
    and D17 (2^28) candidate packets. Writes CSV + Markdown audits into
    *out_dir* and the residual noyau to out_dir/noyaux/noyau_post_D17.json.

    NOTE(review): depends on the project helpers A_k / prefix_data / N0_D /
    build_fusion_clauses whose exact semantics live in collatz_k_core and
    collatz_k_fusion — confirm there.
    """
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    res_to_state, state_mot7 = load_state_map_60(audit60_json)
    # R17 after the full D10 pass
    R17_after_full = rebuild_R17_after_full_D10(completion_m15_to_m16_md, candidats_D10_md)
    # D11 (2^19): lift by 2^17 four times, candidates are A_k(n, 11) == 18,
    # removed together with their sisters (top bit 2^18 flipped).
    shift17 = 1 << 17
    shift18 = 1 << 18
    R19 = lift_set(R17_after_full, shift17, 4)
    cand11 = set([n for n in R19 if A_k(n, 11) == 18])
    cover11 = cand11 | {n ^ shift18 for n in cand11}
    R19_after = [n for n in R19 if n not in cover11]
    # D12 (2^21): same scheme, target A_k == 20.
    shift19 = 1 << 19
    shift20 = 1 << 20
    R21 = lift_set(R19_after, shift19, 4)
    cand12 = set([n for n in R21 if A_k(n, 12) == 20])
    cover12 = cand12 | {n ^ shift20 for n in cand12}
    R21_after = [n for n in R21 if n not in cover12]
    # D13 (2^22): single doubling (one new bit only).
    shift21 = 1 << 21
    R22 = list(R21_after) + [n + shift21 for n in R21_after]
    cand13 = set([n for n in R22 if A_k(n, 13) == 21])
    cover13 = cand13 | {n ^ shift21 for n in cand13}
    R22_after = [n for n in R22 if n not in cover13]
    # D14 (2^24): quadruple lift, target A_k == 23.
    shift22 = 1 << 22
    shift23 = 1 << 23
    R24 = lift_set(R22_after, shift22, 4)
    cand14 = set([n for n in R24 if A_k(n, 14) == 23])
    cover14 = cand14 | {n ^ shift23 for n in cand14}
    R24_after = [n for n in R24 if n not in cover14]
    # D15 (2^25): single doubling, target A_k == 24.
    shift24 = 1 << 24
    R25 = list(R24_after) + [n + shift24 for n in R24_after]
    cand15 = set([n for n in R25 if A_k(n, 15) == 24])
    cover15 = cand15 | {n ^ shift24 for n in cand15}
    R25_after = sorted([n for n in R25 if n not in cover15])
    # Fusion (t=11, 12, 14) at level 2^25.
    md_f11 = str(Path(out_dir) / "fusion_t11_palier2p25.md")
    csv_f11 = str(Path(out_dir) / "fusion_t11_palier2p25.csv")
    md_f12 = str(Path(out_dir) / "fusion_t12_palier2p25.md")
    csv_f12 = str(Path(out_dir) / "fusion_t12_palier2p25.csv")
    md_f14 = str(Path(out_dir) / "fusion_t14_palier2p25.md")
    csv_f14 = str(Path(out_dir) / "fusion_t14_palier2p25.csv")
    build_fusion_clauses(R25_after, 11, res_to_state, state_mot7, md_f11, csv_f11, 25)
    build_fusion_clauses(R25_after, 12, res_to_state, state_mot7, md_f12, csv_f12, 25)
    build_fusion_clauses(R25_after, 14, res_to_state, state_mot7, md_f14, csv_f14, 25)

    def load_hitset(csv_path: str) -> Set[int]:
        # Read the residue classes hit by a fusion CSV; an empty file yields
        # an empty set (csv.DictReader would fail on a missing header row).
        hs: Set[int] = set()
        p = Path(csv_path)
        if p.stat().st_size == 0:
            return hs
        with p.open("r", encoding="utf-8") as f:
            r = csv.DictReader(f)
            for row in r:
                hs.add(int(row["classe_mod_2^m"]))
        return hs

    # Drop every class covered by at least one of the three fusion clauses.
    unionF = load_hitset(csv_f11) | load_hitset(csv_f12) | load_hitset(csv_f14)
    R25_after_F = [n for n in R25_after if n not in unionF]
    # D16 after fusion (2^27): lift each surviving class four times by 2^25.
    shift25 = 1 << 25
    shift26 = 1 << 26
    k16 = 16
    A16_target = 26
    cand_D16: Set[int] = set()
    for r in R25_after_F:
        for j in range(4):
            n = r + j * shift25
            if A_k(n, k16) == A16_target:
                cand_D16.add(n)
    cover_D16 = cand_D16 | {n ^ shift26 for n in cand_D16}
    delta16 = (1 << 26) - (3**16)
    N0_dist = Counter()
    csv_d16 = str(Path(out_dir) / "candidats_D16_apres_fusion_palier2p27.csv")
    with Path(csv_d16).open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["classe_mod_2^27", "sœur", "mot_a0..a15", "A16", "C16", "delta", "N0", "U^16(n)", "etat_id", "base_mod_4096"])
        for n in sorted(cand_D16):
            pref = prefix_data(n, 16)
            N0 = N0_D(pref.C, pref.A, 16)
            N0_dist[N0] += 1
            w.writerow([n, n ^ shift26, " ".join(map(str, pref.word)), pref.A, pref.C, delta16, N0, pref.y, res_to_state[n % 4096], n % 4096])
    # D16 audit (minimal): invariant = max A16 over the uncovered lifts.
    maxA16_after = 0
    for r in R25_after_F:
        for j in range(4):
            n = r + j * shift25
            if n in cover_D16:
                continue
            A = A_k(n, k16)
            if A > maxA16_after:
                maxA16_after = A
    md_d16 = str(Path(out_dir) / "candidats_D16_apres_fusion_palier2p27_et_impact.md")
    write_text(
        md_d16,
        "\n".join(
            [
                "# Paquet D16 minimal après fusion (palier 2^27)",
                "",
                "## Introduction",
                "",
                "Audit D16 sur le noyau au palier 2^25 après fusion F(11)F(12)F(14).",
                "",
                "## Tailles",
                "",
                f"- noyau après D15 : {len(R25_after)}",
                f"- noyau après fusion : {len(R25_after_F)}",
                f"- relèvements 2^27 : {4 * len(R25_after_F)}",
                f"- candidats D16 : {len(cand_D16)}",
                f"- couverture (avec sœurs) : {len(cover_D16)}",
                f"- invariant max A16 après : {maxA16_after}",
                "",
                "## CSV exhaustif",
                "",
                f"- {Path(csv_d16).name}",
                "",
            ]
        )
        + "\n",
    )
    csv_to_md_list(
        csv_d16,
        str(Path(out_dir) / "candidats_D16_apres_fusion_palier2p27_liste_exhaustive.md"),
        "Liste exhaustive des clauses D16 après fusion (palier 2^27)",
        "Liste exhaustive (format CSV copiable).",
    )
    # D17 after fusion and D16 (2^28): a pair (low, low + 2^27) is kept when
    # either side reaches the A17 target.
    shift27 = 1 << 27
    k17 = 17
    A17_target = 27
    pair_low_set: Set[int] = set()
    for r in R25_after_F:
        for j in range(4):
            low = r + j * shift25
            if low in cover_D16:
                continue
            if A_k(low, k17) == A17_target or A_k(low + shift27, k17) == A17_target:
                pair_low_set.add(low)
    delta17 = (1 << 27) - (3**17)
    csv_d17 = str(Path(out_dir) / "candidats_D17_apres_fusion_palier2p28.csv")
    with Path(csv_d17).open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["classe_mod_2^28", "sœur", "côté", "mot_a0..a16", "A17", "C17", "delta", "N0", "U^17(n)", "etat_id", "base_mod_4096"])
        for low in sorted(pair_low_set):
            high = low + shift27
            # The representative is whichever side of the pair hits the target
            # (the low side wins when both do).
            rep = low if A_k(low, k17) == A17_target else high
            side = "basse" if rep == low else "haute"
            pref = prefix_data(rep, 17)
            N0 = N0_D(pref.C, pref.A, 17)
            w.writerow([rep, rep ^ shift27, side, " ".join(map(str, pref.word)), pref.A, pref.C, delta17, N0, pref.y, res_to_state[rep % 4096], rep % 4096])
    md_d17 = str(Path(out_dir) / "candidats_D17_apres_fusion_palier2p28_et_impact.md")
    write_text(
        md_d17,
        "\n".join(
            [
                "# Paquet D17 minimal après fusion (palier 2^28)",
                "",
                "## Introduction",
                "",
                "Audit D17 sur le domaine résiduel après fusion et après D16.",
                "",
                "## Tailles",
                "",
                f"- paires candidates D17 : {len(pair_low_set)}",
                "",
                "## CSV exhaustif",
                "",
                f"- {Path(csv_d17).name}",
                "",
            ]
        )
        + "\n",
    )
    csv_to_md_list(
        csv_d17,
        str(Path(out_dir) / "candidats_D17_apres_fusion_palier2p28_liste_exhaustive.md"),
        "Liste exhaustive des clauses D17 après fusion (palier 2^28)",
        "Liste exhaustive (format CSV copiable).",
    )
    # Residual noyau at 2^28: every uncovered lift minus the D17 pairs.
    cover_D17 = {low for low in pair_low_set} | {low + shift27 for low in pair_low_set}
    all_lifted_28: Set[int] = set()
    for r in R25_after_F:
        for j in range(4):
            low = r + j * shift25
            if low in cover_D16:
                continue
            all_lifted_28.add(low)
            all_lifted_28.add(low + shift27)
    R28_after_D17 = sorted(all_lifted_28 - cover_D17)
    # Persist the residual noyau so the extended pipeline (D18+) can start from it.
    noyau_path = Path(out_dir) / "noyaux" / "noyau_post_D17.json"
    noyau_path.parent.mkdir(parents=True, exist_ok=True)
    noyau_path.write_text(
        json.dumps({"noyau": R28_after_D17, "palier": 28}),
        encoding="utf-8",
    )
def run_extended_D18_to_D21(
    audit60_json: str,
    out_dir: str,
    noyau_post_D17_path: str | None = None,
    resume_from: str | None = None,
) -> None:
    """Continue from D17 to D18, D19, F15, D20, F16, D21. resume_from='D20' skips to D20.

    Args:
        audit60_json: path to the 60-state audit JSON (residue -> state map).
        out_dir: output root; noyaux/, candidats/ and certificats/ are created under it.
        noyau_post_D17_path: optional explicit noyau_post_D17.json
            (defaults to out_dir/noyaux/noyau_post_D17.json).
        resume_from: "D20" resumes from the post-F15 noyau, skipping D18/D19/F15.

    Raises:
        FileNotFoundError: when the required input noyau is missing.
    """
    # Pipeline helpers imported lazily here (presumably to keep module import
    # light / avoid cycles — confirm against the sibling modules).
    from collatz_fusion_pipeline import run_fusion_pipeline
    from collatz_scission import run_scission
    from collatz_update_noyau import run_update_noyau

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    (out / "noyaux").mkdir(exist_ok=True)
    (out / "candidats").mkdir(exist_ok=True)
    (out / "certificats").mkdir(exist_ok=True)
    if resume_from == "D20":
        # Resume path: the post-F15 noyau must already exist on disk.
        prev_noyau = str(out / "noyaux" / "noyau_post_F15.json")
        if not Path(prev_noyau).exists():
            raise FileNotFoundError(f"Resume D20 requires {prev_noyau}")
    else:
        noyau_d17 = noyau_post_D17_path or str(out / "noyaux" / "noyau_post_D17.json")
        if not Path(noyau_d17).exists():
            raise FileNotFoundError(f"Run full pipeline first to produce {noyau_d17}")
        prev_noyau = noyau_d17
    if resume_from != "D20":
        # D18 (2^30, A=29) then D19 (2^32, A=31), chaining the residual noyaux.
        for horizon, palier, valeur, label in [(18, 30, 29, "D18"), (19, 32, 31, "D19")]:
            run_single_palier(
                horizon=horizon,
                palier=palier,
                valeur=valeur,
                input_noyau=prev_noyau,
                output_csv=str(out / "candidats" / f"candidats_{label}_palier2p{palier}.csv"),
                audit60_json=audit60_json,
                output_noyau_path=str(out / "noyaux" / f"noyau_post_{label}.json"),
            )
            prev_noyau = str(out / "noyaux" / f"noyau_post_{label}.json")
        # F15 fusion at 2^32 (cible="critique"), then scission + noyau update.
        csv_f15 = str(out / "candidats" / "candidats_F15_palier2p32.csv")
        cert_f15 = str(out / "certificats" / "certificat_F15_palier2p32.json")
        run_fusion_pipeline(
            horizons=[15],
            palier=32,
            input_noyau=prev_noyau,
            output_csv=csv_f15,
            audit60_json=audit60_json,
            cible="critique",
        )
        run_scission(csv_f15, cert_f15)
        noyau_f15 = str(out / "noyaux" / "noyau_post_F15.json")
        run_update_noyau(cert_f15, prev_noyau, noyau_f15)
        prev_noyau = noyau_f15
    # D20 (2^34, A=32) with interruption recovery:
    # - noyau already present -> reuse it as-is;
    # - only the CSV present  -> rebuild the noyau from the candidates CSV;
    # - nothing present       -> full D20 computation.
    csv_d20 = str(out / "candidats" / "candidats_D20_palier2p34.csv")
    noyau_d20 = str(out / "noyaux" / "noyau_post_D20.json")
    if Path(noyau_d20).exists():
        print(f"Using existing {noyau_d20}")
    elif Path(csv_d20).exists():
        from collatz_recover_noyau import run_recover
        print("Recovering noyau_post_D20 from existing candidats CSV...")
        run_recover(
            previous_noyau=prev_noyau,
            candidats_csv=csv_d20,
            palier=34,
            output=noyau_d20,
            input_palier=32,
        )
    else:
        run_single_palier(
            horizon=20,
            palier=34,
            valeur=32,
            input_noyau=prev_noyau,
            output_csv=csv_d20,
            audit60_json=audit60_json,
            output_noyau_path=noyau_d20,
        )
    prev_noyau = noyau_d20
    # F16 fusion at 2^35 with modulo=9 (presumably a mod-9 class filter —
    # confirm in collatz_fusion_pipeline), then scission + noyau update.
    csv_f16 = str(out / "candidats" / "candidats_F16_palier2p35.csv")
    cert_f16 = str(out / "certificats" / "certificat_F16_palier2p35.json")
    run_fusion_pipeline(
        horizons=[16],
        palier=35,
        input_noyau=prev_noyau,
        output_csv=csv_f16,
        audit60_json=audit60_json,
        modulo=9,
    )
    run_scission(csv_f16, cert_f16)
    noyau_f16 = str(out / "noyaux" / "noyau_post_F16.json")
    run_update_noyau(cert_f16, prev_noyau, noyau_f16)
    prev_noyau = noyau_f16
    # Final step: D21 at 2^36 (A=34).
    run_single_palier(
        horizon=21,
        palier=36,
        valeur=34,
        input_noyau=prev_noyau,
        output_csv=str(out / "candidats" / "candidats_D21_palier2p36.csv"),
        audit60_json=audit60_json,
        output_noyau_path=str(out / "noyaux" / "noyau_post_D21.json"),
    )
def load_noyau(path: str) -> List[int]:
    """Read a noyau JSON file: either a bare list of residues or a dict holding one under a known key."""
    payload = json.loads(Path(path).read_text(encoding="utf-8"))
    if isinstance(payload, list):
        return [int(v) for v in payload]
    if isinstance(payload, dict):
        # Accept the historical key names, first match wins.
        for name in ("noyau", "residues", "uncovered", "R25_after", "R24_after"):
            value = payload.get(name)
            if isinstance(value, list):
                return [int(v) for v in value]
    raise ValueError(f"Noyau JSON: no residue list in {path}")
def run_single_palier(
    horizon: int,
    palier: int,
    valeur: int,
    input_noyau: str,
    output_csv: str,
    audit60_json: str,
    output_noyau_path: str | None = None,
) -> None:
    """
    Run a single palier: load noyau, lift to 2^palier, extract D_k candidates with A_k=valeur.

    Args:
        horizon: prefix length k of the D_k sieve.
        palier: target level m (classes are taken modulo 2^m).
        valeur: target value of A_k(n, horizon) selecting the candidates.
        input_noyau: JSON file with the residues to lift (see load_noyau).
        audit60_json: 60-state audit JSON used to tag each class with a state id.
        output_noyau_path: when given, also write the residual noyau
            (lifts minus candidates and their sisters) for the next palier.
    """
    residues = load_noyau(input_noyau)
    res_to_state, _ = load_state_map_60(audit60_json)
    # Infer the input level from the largest residue (0 for an empty noyau).
    max_r = max(residues) if residues else 0
    input_palier = max_r.bit_length() if max_r else 0
    curr_shift = 1 << (palier - 1)
    # Choose the lift step/count from the gap between input and target level.
    if palier == 17:
        # D10 special case: R17 residues are already at the target level
        # (see the --m15m16 path in main), so no lift.
        prev_shift = 1 << 16
        lift_count = 1
    elif palier - input_palier >= 2:
        # Multi-bit jump: enumerate all 2^(palier - input_palier) lifts.
        prev_shift = 1 << input_palier
        lift_count = 1 << (palier - input_palier)
    else:
        # Single doubling.
        prev_shift = 1 << (palier - 1)
        lift_count = 2
    lifted: List[int] = []
    for r in residues:
        for j in range(lift_count):
            lifted.append(r + j * prev_shift)
    cand = set(n for n in lifted if A_k(n, horizon) == valeur)
    # A candidate's "sister" differs only in the top bit 2^(palier-1).
    cover = cand | {n ^ curr_shift for n in cand}
    # delta = 2^valeur - 3^horizon, clamped to 0 when non-positive.
    delta = (1 << valeur) - (3**horizon) if (1 << valeur) > (3**horizon) else 0
    Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
    with Path(output_csv).open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        col_palier = f"classe_mod_2^{palier}"
        w.writerow([col_palier, "sœur", f"mot_a0..a{horizon-1}", f"A{horizon}", f"C{horizon}", "delta", "N0", f"U^{horizon}(n)", "etat_id", "base_mod_4096"])
        for n in sorted(cand):
            pref = prefix_data(n, horizon)
            # N0 is only meaningful for a positive delta; 0 otherwise.
            N0 = N0_D(pref.C, pref.A, horizon) if delta > 0 else 0
            base = n % 4096
            etat = res_to_state.get(base, 0)
            w.writerow([n, n ^ curr_shift, " ".join(map(str, pref.word)), pref.A, pref.C, delta, N0, pref.y, etat, base])
    if output_noyau_path:
        residual = sorted(set(lifted) - cover)
        Path(output_noyau_path).parent.mkdir(parents=True, exist_ok=True)
        Path(output_noyau_path).write_text(
            json.dumps({"noyau": residual, "palier": palier}),
            encoding="utf-8",
        )
        print(f"Wrote noyau: {output_noyau_path} ({len(residual)} residues)")
    print(f"Wrote {output_csv}: {len(cand)} candidates, palier 2^{palier}")
def main() -> None:
    """CLI entry point: full D16/D17 run, extended D18-D21 run, or a single palier."""
    import argparse

    ap = argparse.ArgumentParser(description="Collatz pipeline: full run or single palier")
    ap.add_argument("--audit60", help="Audit 60 états JSON (for full run)")
    ap.add_argument("--m15m16", help="Complétion m15→m16 MD (for full run)")
    ap.add_argument("--d10", help="Candidats D10 MD (for full run)")
    ap.add_argument("--out", help="Output directory (for full run)")
    ap.add_argument("--horizon", type=int, help="Horizon k for single palier")
    ap.add_argument("--palier", type=int, help="Palier m (2^m) for single palier")
    ap.add_argument("--seuil", default="A_min", help="Seuil type (A_min)")
    ap.add_argument("--valeur", type=int, help="A_k target value for single palier")
    ap.add_argument("--input-noyau", help="Input noyau JSON for single palier")
    ap.add_argument("--output", help="Output CSV for single palier")
    ap.add_argument("--output-noyau", help="Output residual noyau JSON for next palier")
    ap.add_argument("--parallel", action="store_true", help="Use parallel mode (placeholder)")
    ap.add_argument("--threads", type=int, default=1, help="Thread count (placeholder)")
    ap.add_argument("--extend", action="store_true", help="Run extended D18-D21 pipeline (requires noyau_post_D17)")
    ap.add_argument("--resume-from", help="Resume from step (e.g. D20) - skip earlier steps")
    args = ap.parse_args()
    if args.extend:
        # Extended mode (D18..D21); default paths are relative to this script.
        audit60 = args.audit60 or str(Path(__file__).parent / "audit_60_etats_B12_mod4096_horizon7.json")
        out_dir = args.out or str(Path(__file__).parent / "out")
        if not Path(audit60).exists():
            raise SystemExit(f"Audit60 not found: {audit60}")
        run_extended_D18_to_D21(
            audit60_json=audit60,
            out_dir=out_dir,
            resume_from=args.resume_from,
        )
        return
    if args.horizon is not None and args.palier is not None and args.valeur is not None and args.output:
        # Single-palier mode.
        audit60 = args.audit60 or str(Path(__file__).parent / "audit_60_etats_B12_mod4096_horizon7.json")
        if args.horizon == 10 and args.palier == 17 and args.m15m16:
            # D10 bootstrap: residues come from the completion markdown.
            residues = build_R17_from_completion(args.m15m16)
        elif args.input_noyau:
            residues = load_noyau(args.input_noyau)
        else:
            raise SystemExit("--input-noyau or --m15m16 (for D10) required for single palier mode")
        if not Path(audit60).exists():
            raise SystemExit(f"Audit60 not found: {audit60}")
        # Materialize the residues in a temp JSON so run_single_palier always
        # reads from a file path (only actually used in the D10 case below).
        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tf:
            json.dump(residues, tf)
            tmp_noyau = tf.name
        try:
            run_single_palier(
                horizon=args.horizon,
                palier=args.palier,
                valeur=args.valeur,
                input_noyau=tmp_noyau if args.horizon == 10 and args.palier == 17 else args.input_noyau,
                output_csv=args.output,
                audit60_json=audit60,
                output_noyau_path=args.output_noyau,
            )
        finally:
            # The temp file is only cleaned up in the D10 case where it was used.
            if args.horizon == 10 and args.palier == 17:
                Path(tmp_noyau).unlink(missing_ok=True)
    elif args.audit60 and args.m15m16 and args.d10 and args.out:
        # Full "after fusion" D16/D17 run.
        run_after_fusion_D16_D17(args.audit60, args.m15m16, args.d10, args.out)
    else:
        ap.error("Use either (--audit60 --m15m16 --d10 --out) or (--horizon --palier --valeur --input-noyau --output)")


if __name__ == "__main__":
    main()