algo/applications/collatz/collatz_k_scripts/collatz_analyze_prefix_diversity.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
collatz_analyze_prefix_diversity.py

Phase 4 experiment: for each no/no residue r modulo 2^m (m = 16 by default),
sample n = r + k*2^m for k = 0, ..., sample_size - 1 and count the distinct
prefix words of length k_prefix. If the number of distinct prefixes is bounded,
a disjunction of D_minor clauses (one per prefix) might be decidable.
"""

from __future__ import annotations

import argparse
import json
from pathlib import Path

from collatz_k_core import prefix_data


def _read_tracked_roots(path: Path) -> list[int]:
    """Parse a text file of tracked roots, one base-10 integer per line.

    Blank lines and ``#`` comments (whole-line or trailing) are ignored.
    Lines that do not parse as an integer are skipped silently, and only
    strictly positive odd values are kept. File order is preserved and
    duplicates are not removed.

    Args:
        path: File to read; decoded as strict UTF-8.

    Returns:
        The positive odd integers found, in order of appearance.
    """
    text = path.read_text(encoding="utf-8", errors="strict")
    kept: list[int] = []
    for raw in text.splitlines():
        # Everything from the first '#' onward is a comment; a line that is
        # only a comment (or only whitespace) leaves an empty payload.
        payload = raw.partition("#")[0].strip()
        if not payload:
            continue
        try:
            value = int(payload, 10)
        except ValueError:
            # Best-effort parsing: tolerate stray non-numeric lines.
            continue
        if value > 0 and value % 2 != 0:
            kept.append(value)
    return kept


def _collect_residues_no_no(tracked_roots: list[int], root_palier: int) -> list[int]:
    """Expand each tracked root into the pair ``r0`` and ``r0 + 2**root_palier``.

    Assuming each root is a residue modulo ``2**root_palier``, the pair is
    its two lifts modulo ``2**(root_palier + 1)``.

    Args:
        tracked_roots: Roots to expand, processed in order.
        root_palier: Exponent of the power of two added to form the second
            member of each pair.

    Returns:
        The expanded residues without duplicates, in first-seen order (for
        each root, the root itself comes before its shifted counterpart).
    """
    offset = 1 << root_palier
    # A dict preserves insertion order, so it doubles as an ordered set.
    ordered: dict[int, None] = {}
    for root in tracked_roots:
        ordered.setdefault(root, None)
        ordered.setdefault(root + offset, None)
    return list(ordered)


def _analyze_residue(
    residue: int,
    m: int,
    k_prefix: int,
    sample_size: int,
) -> dict[str, object]:
    """Collect the distinct prefix words seen along ``residue + k * 2**m``.

    For ``k`` in ``range(sample_size)`` the candidate ``n = residue + k * 2**m``
    is passed to ``prefix_data(n, k_prefix)`` and the ``word`` attribute of
    the result is recorded. Candidates that are non-positive or even are
    skipped.

    Args:
        residue: Starting value of the arithmetic progression.
        m: Exponent of the power-of-two step.
        k_prefix: Prefix length forwarded to ``prefix_data``.
        sample_size: Number of progression terms to consider (before the
            positive-odd filter).

    Returns:
        A JSON-serialisable summary containing the inputs, the number of
        distinct prefix words and the sorted words themselves (as lists).
    """
    modulus = 1 << m
    candidates = (residue + idx * modulus for idx in range(sample_size))
    # Only positive odd integers are analysed.
    words = {
        prefix_data(n, k_prefix).word
        for n in candidates
        if n > 0 and n % 2 != 0
    }
    return {
        "residue_mod_2p": residue,
        "k_prefix": k_prefix,
        "sample_size": sample_size,
        "distinct_prefix_count": len(words),
        "prefixes": [list(word) for word in sorted(words)],
    }


def run(
    tracked_roots_file: Path,
    m: int,
    k_prefix: int,
    sample_size: int,
    output_dir: Path,
    max_residues: int,
) -> None:
    """Run the prefix-diversity analysis and write the JSON report.

    Tracked roots are read from ``tracked_roots_file``, expanded with
    ``_collect_residues_no_no`` at palier ``m - 1``, truncated to
    ``max_residues`` entries and analysed one by one. The report is written
    to ``output_dir / prefix_diversity_m{m}_k{k_prefix}_sample{sample_size}.json``;
    ``output_dir`` is created if it does not exist.

    Args:
        tracked_roots_file: Text file listing the tracked roots.
        m: Exponent of the modulus ``2**m`` used for sampling.
        k_prefix: Prefix length passed on to the per-residue analysis.
        sample_size: Number of samples per residue.
        output_dir: Directory receiving the JSON report.
        max_residues: Upper bound on the number of residues analysed
            (applied as a slice ``[:max_residues]``).

    Returns:
        None. The result is the file written to ``output_dir``.
    """
    roots = _read_tracked_roots(tracked_roots_file)
    residues = _collect_residues_no_no(roots, m - 1)[:max_residues]

    rows: list[dict[str, object]] = [
        _analyze_residue(r, m, k_prefix, sample_size) for r in residues
    ]
    distinct_counts = [row["distinct_prefix_count"] for row in rows]

    obj = {
        "domain": {"m": m, "k_prefix": k_prefix, "sample_size": sample_size},
        "counts": {
            "residues_analyzed": len(residues),
            # default=0 keeps an empty run (no usable roots, or
            # max_residues == 0) from crashing with ValueError; the report
            # then shows residues_analyzed == 0 alongside zeroed extrema.
            "max_distinct_prefixes": max(distinct_counts, default=0),
            "min_distinct_prefixes": min(distinct_counts, default=0),
        },
        "rows": rows,
    }

    out_path = output_dir / f"prefix_diversity_m{m}_k{k_prefix}_sample{sample_size}.json"
    output_dir.mkdir(parents=True, exist_ok=True)
    out_path.write_text(
        json.dumps(obj, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )


def main() -> None:
    """Command-line entry point: parse arguments, resolve paths, call ``run``.

    When ``--tracked-roots-file`` or ``--output-dir`` is empty (or only
    whitespace), a default location under
    ``<repo-root>/docs/artefacts/collatz/refinement_K/palier2p15`` is used.
    """
    parser = argparse.ArgumentParser(
        description="Analyze prefix diversity for no/no residues (Phase 4 experiment)"
    )
    parser.add_argument("--tracked-roots-file", default="")
    parser.add_argument("--repo-root", default=".")
    parser.add_argument("--m", type=int, default=16)
    parser.add_argument(
        "--k-prefix",
        type=int,
        default=11,
        help="Prefix length (k-1 for D_minor at horizon k)",
    )
    parser.add_argument("--sample-size", type=int, default=256)
    parser.add_argument("--output-dir", default="")
    parser.add_argument("--max-residues", type=int, default=64)
    opts = parser.parse_args()

    repo_root = Path(opts.repo_root).resolve()
    # Both default locations live under the same artefact directory.
    artefact_base = (
        repo_root / "docs" / "artefacts" / "collatz" / "refinement_K" / "palier2p15"
    )

    if opts.tracked_roots_file.strip():
        tracked_roots_file = Path(opts.tracked_roots_file).resolve()
    else:
        tracked_roots_file = (
            artefact_base
            / "incremental_D_minor"
            / "tracked_roots_lb_any_top200_mod2p15_to2p18.txt"
        )

    if opts.output_dir.strip():
        output_dir = Path(opts.output_dir).resolve()
    else:
        output_dir = artefact_base / "phase4_prefix_diversity"

    # argparse already converted the numeric options via type=int.
    run(
        tracked_roots_file=tracked_roots_file,
        m=opts.m,
        k_prefix=opts.k_prefix,
        sample_size=opts.sample_size,
        output_dir=output_dir,
        max_residues=opts.max_residues,
    )


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()