diff --git a/applications/collatz/collatz_k_scripts/collatz_generate_local_h6_artefacts.py b/applications/collatz/collatz_k_scripts/collatz_generate_local_h6_artefacts.py index 522e409..ef0f35c 100644 --- a/applications/collatz/collatz_k_scripts/collatz_generate_local_h6_artefacts.py +++ b/applications/collatz/collatz_k_scripts/collatz_generate_local_h6_artefacts.py @@ -38,6 +38,7 @@ from pathlib import Path from collatz_k_core import A_k from collatz_k_fusion import build_fusion_clauses from collatz_k_pipeline import load_state_map_60 +from collatz_scission import run_scission @dataclass(frozen=True) @@ -58,14 +59,22 @@ class LocalH6StateResult: artefacts_dir: str -def _write_certificate_json(path: Path, clauses: set[int], covered: set[int], palier: int) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - obj = { - "clauses": sorted(clauses), - "covered": sorted(covered), - "palier": palier, - } - path.write_text(json.dumps(obj, indent=2), encoding="utf-8") +def _read_certificate_json(cert_path: Path) -> tuple[set[int], set[int], int]: + obj = json.loads(cert_path.read_text(encoding="utf-8")) + if not isinstance(obj, dict): + raise ValueError(f"Invalid certificate JSON (expected object): {cert_path}") + clauses_raw = obj.get("clauses", []) + covered_raw = obj.get("covered", []) + palier_raw = obj.get("palier", 0) + + if not isinstance(clauses_raw, list) or not all(isinstance(x, int) for x in clauses_raw): + raise ValueError(f"Invalid certificate JSON clauses list: {cert_path}") + if not isinstance(covered_raw, list) or not all(isinstance(x, int) for x in covered_raw): + raise ValueError(f"Invalid certificate JSON covered list: {cert_path}") + if not isinstance(palier_raw, int): + raise ValueError(f"Invalid certificate JSON palier: {cert_path}") + + return set(clauses_raw), set(covered_raw), palier_raw def _invert_residue_to_state(res_to_state: dict[int, int]) -> dict[int, list[int]]: @@ -110,13 +119,8 @@ def _write_candidates_d8_csv(path: Path, residues: 
list[int], palier: int) -> li def _covered_from_cert(cert_path: Path) -> set[int]: - obj = json.loads(cert_path.read_text(encoding="utf-8")) - if not isinstance(obj, dict) or "covered" not in obj: - raise ValueError(f"Invalid certificate JSON: {cert_path}") - covered = obj["covered"] - if not isinstance(covered, list) or not all(isinstance(x, int) for x in covered): - raise ValueError(f"Invalid certificate JSON covered list: {cert_path}") - return set(covered) + _, covered, _ = _read_certificate_json(cert_path) + return covered def _compute_fusion_rows_until_covered( @@ -293,9 +297,12 @@ def generate_for_state( d8_covered_set: set[int] = set() cert_paths: list[Path] = [] if cand_d8: - shift = 1 << (palier - 1) - d8_covered_set = set(cand_d8) | {n ^ shift for n in cand_d8} - _write_certificate_json(certificat_d8_path, set(cand_d8), d8_covered_set, palier) + run_scission(str(candidats_d8_path), str(certificat_d8_path)) + _, d8_covered_set, cert_palier = _read_certificate_json(certificat_d8_path) + if cert_palier != palier: + raise ValueError( + f"Unexpected D8 certificate palier for E{state_id}: got 2^{cert_palier}, expected 2^{palier}" + ) cert_paths.append(certificat_d8_path) uncovered_after_d8 = sorted(set(L) - d8_covered_set) @@ -322,16 +329,15 @@ def generate_for_state( fusion_covered_set: set[int] = set() fusion_clauses = 0 fusion_covered = 0 - fusion_hit_classes = { - int(row["classe_mod_2^m"]) - for row in fusion_rows_list - if "classe_mod_2^m" in row and str(row["classe_mod_2^m"]).strip() - } - if fusion_hit_classes and fusion_cert_path is not None: - fusion_covered_set = set(fusion_hit_classes) - _write_certificate_json(fusion_cert_path, set(fusion_hit_classes), fusion_covered_set, palier) - fusion_clauses = len(fusion_hit_classes) - fusion_covered = len(fusion_hit_classes) + if fusion_rows_list and fusion_cert_path is not None and fusion_csv_path is not None: + run_scission(str(fusion_csv_path), str(fusion_cert_path)) + fusion_clauses_set, 
fusion_covered_set, cert_palier = _read_certificate_json(fusion_cert_path) + if cert_palier != palier: + raise ValueError( + f"Unexpected fusion certificate palier for E{state_id}: got 2^{cert_palier}, expected 2^{palier}" + ) + fusion_clauses = len(fusion_clauses_set) + fusion_covered = len(fusion_covered_set) cert_paths.append(fusion_cert_path) union_covered = d8_covered_set | fusion_covered_set diff --git a/applications/collatz/collatz_k_scripts/collatz_scission.py b/applications/collatz/collatz_k_scripts/collatz_scission.py index aadeccb..ecce248 100644 --- a/applications/collatz/collatz_k_scripts/collatz_scission.py +++ b/applications/collatz/collatz_k_scripts/collatz_scission.py @@ -26,12 +26,54 @@ def _find_column(row: dict, *candidates: str) -> str | None: return None -def infer_palier(rows: list[dict], classe_col: str | None) -> int: - """Infer modulus power m from column name or max value.""" +def _try_parse_int(value: object) -> int | None: + if value is None: + return None + if isinstance(value, int): + return value + if isinstance(value, str): + s = value.strip() + if not s: + return None + try: + return int(s) + except ValueError: + return None + return None + + +def infer_palier(rows: list[dict], classe_col: str | None, csv_path: Path | None = None) -> int: + """ + Infer modulus power m. + + Priority order: + - explicit numeric column 'palier' (or 'm' used as exponent in some CSVs) + - class column name containing '2^' (e.g. 
'classe_mod_2^27') + - filename containing 'palier2p' + - fallback heuristic from max class value (legacy; not reliable when values are sparse) + """ + if rows: + pal_col = _find_column(rows[0], "palier") + if pal_col: + v = _try_parse_int(rows[0].get(pal_col)) + if v is not None and v > 0: + return v + m_col = _find_column(rows[0], "m", "modulus_power") + if m_col: + v = _try_parse_int(rows[0].get(m_col)) + if v is not None and v > 0: + return v + if classe_col and ("2^" in classe_col or "2^" in str(classe_col)): m = re.search(r"2\^(\d+)", classe_col) if m: return int(m.group(1)) + + if csv_path is not None: + m2 = re.search(r"palier2p(\d+)", str(csv_path)) + if m2: + return int(m2.group(1)) + if rows and classe_col: try: vals = [int(r.get(classe_col, 0) or 0) for r in rows if r.get(classe_col)] @@ -48,6 +90,9 @@ def infer_palier(rows: list[dict], classe_col: str | None) -> int: def run_scission(csv_path: str, out_json_path: str) -> None: """Read CSV, extract clauses and covered set, write JSON certificate.""" + out_path = Path(out_json_path) + out_path.parent.mkdir(parents=True, exist_ok=True) + rows: list[dict] = [] with Path(csv_path).open("r", encoding="utf-8") as f: reader = csv.DictReader(f) @@ -56,7 +101,7 @@ def run_scission(csv_path: str, out_json_path: str) -> None: if not rows: cert = {"clauses": [], "covered": [], "palier": 0} - Path(out_json_path).write_text(json.dumps(cert, indent=2), encoding="utf-8") + out_path.write_text(json.dumps(cert, indent=2), encoding="utf-8") print(f"Wrote {out_json_path} (empty)") return @@ -69,28 +114,32 @@ def run_scission(csv_path: str, out_json_path: str) -> None: for r in rows: if classe_col: try: - c = int(r.get(classe_col, 0) or 0) - clauses.append(c) - covered.add(c) + raw = r.get(classe_col) + if raw is not None and str(raw).strip(): + c = int(raw) + clauses.append(c) + covered.add(c) except (ValueError, TypeError): pass if soeur_col: try: - s = int(r.get(soeur_col, 0) or 0) - covered.add(s) + raw = 
r.get(soeur_col) + if raw is not None and str(raw).strip(): + s = int(raw) + covered.add(s) except (ValueError, TypeError): pass clauses = sorted(set(clauses)) covered_list = sorted(covered) - palier = infer_palier(rows, classe_col) + palier = infer_palier(rows, classe_col, csv_path=Path(csv_path)) cert = { "clauses": clauses, "covered": covered_list, "palier": palier, } - Path(out_json_path).write_text(json.dumps(cert, indent=2), encoding="utf-8") + out_path.write_text(json.dumps(cert, indent=2), encoding="utf-8") print(f"Wrote {out_json_path}: {len(clauses)} clauses, {len(covered_list)} covered, palier 2^{palier}") diff --git a/docs/fixKnowledge/collatz_scission_palier_inference_and_output_dirs.md b/docs/fixKnowledge/collatz_scission_palier_inference_and_output_dirs.md new file mode 100644 index 0000000..15afab0 --- /dev/null +++ b/docs/fixKnowledge/collatz_scission_palier_inference_and_output_dirs.md @@ -0,0 +1,45 @@ +# collatz_scission palier inference and output directory creation + +## Problem + +The helper script `applications/collatz/collatz_k_scripts/collatz_scission.py` can produce incorrect certificates in two cases: + +- **Incorrect `palier` inference**: when the CSV class column is named generically (e.g. `classe_mod_2^m`) and the covered classes are sparse / small (e.g. values \(<2^8\) while the target modulus is \(2^{13}\)), `palier` was inferred from the maximum class value. This yields a wrong modulus power. +- **Missing output directories**: `run_scission()` writes `out_json_path` without creating parent directories, which can raise `FileNotFoundError` when callers pass a new path under a non-existing folder. + +## Root cause + +- `infer_palier()` only supported: + - parsing `2^m` from the class column name, or + - a fallback heuristic based on the maximum covered residue value. + This heuristic is not reliable when the class column name does not encode the modulus power. +- `run_scission()` assumed the output directory exists. 
+ +## Corrective actions + +- **Prefer explicit palier columns**: + - If the CSV contains a `palier` column whose first-row value is a positive integer, use it. + - Otherwise, if the CSV contains an `m` / `modulus_power` column (used as exponent in some pipelines) whose first-row value is a positive integer, use it. +- **Fallback from filename**: if the class column name does not encode `2^<m>`, parse the exponent from a `palier2p<m>` token in the CSV path when available. +- **Keep legacy fallback**: keep the max-value heuristic as a last resort. +- **Create output directories**: ensure `out_json_path.parent` exists before writing. +- **Do not add spurious residue 0**: skip empty strings instead of coercing to 0 when parsing the class / sister columns. + +## Impact + +- Certificates generated via `collatz_scission.py` now carry a `palier` that matches the CSV’s intended modulus power when the CSV provides it (or when the filename encodes it). +- Callers can write certificates to new directories without pre-creating them. + +## Analysis modalities + +- For any certificate JSON, verify: + - `palier` matches the intended modulus power \(2^m\), + - `clauses` and `covered` sets do not contain a spurious `0`, + - directory creation does not fail when writing under a fresh path. + +## Deployment + +- No environment changes are required. +- The `infer_palier` / `run_scission` fix is local to: + - `applications/collatz/collatz_k_scripts/collatz_scission.py` +