From b17c6aa732d17558d947ad82787ddfee28f584a4 Mon Sep 17 00:00:00 2001
From: saymrwulf <mrwulf.cleaner@gmail.com>
Date: Mon, 4 May 2026 12:23:18 +0200
Subject: [PATCH] Treat failed watches as instrumentation backoff

---
 src/braiins_ratchet/guidance.py  | 19 +++++++++++
 src/braiins_ratchet/lifecycle.py | 56 +++++++++++++++++++++++++-------
 tests/test_guidance.py           | 20 ++++++++++++
 3 files changed, 83 insertions(+), 12 deletions(-)

diff --git a/src/braiins_ratchet/guidance.py b/src/braiins_ratchet/guidance.py
index 1c436fa..2c88e52 100644
--- a/src/braiins_ratchet/guidance.py
+++ b/src/braiins_ratchet/guidance.py
@@ -413,6 +413,8 @@ def _recent_completed_watch(latest_report: str | None, latest_market_timestamp:
     report_path = REPORTS_DIR.parent / latest_report
     if not report_path.name.startswith("run-") or not report_path.exists():
         return None
+    if _report_collected_samples(report_path) == 0:
+        return None
     market_dt = _parse_utc(latest_market_timestamp)
     if market_dt is not None and report_path.stat().st_mtime < market_dt.timestamp():
         return None
@@ -436,6 +438,23 @@ def _recent_completed_watch(latest_report: str | None, latest_market_timestamp:
     )
 
 
+def _report_collected_samples(report_path: Path) -> int | None:
+    try:
+        lines = report_path.read_text(encoding="utf-8").splitlines()
+    except OSError:
+        return None
+    for line in lines:
+        stripped = line.strip()
+        if not stripped.startswith("- collected_samples:"):
+            continue
+        _, _, value = stripped.partition(":")
+        try:
+            return int(value.strip())
+        except ValueError:
+            return None
+    return None
+
+
 def _research_stage(active_watch: str | None, completed_watch: CompletedWatch | None) -> str:
     if active_watch:
         return "watch running"
diff --git a/src/braiins_ratchet/lifecycle.py b/src/braiins_ratchet/lifecycle.py
index fa6941b..6df829f 100644
--- a/src/braiins_ratchet/lifecycle.py
+++ b/src/braiins_ratchet/lifecycle.py
@@ -10,12 +10,13 @@ from .config import AppConfig
 from .experiments import ACTIVE_WATCH, finish_experiment, start_experiment
 from .guidance import POST_WATCH_COOLDOWN_MINUTES, build_operator_cockpit, get_operator_state
 from .storage import connect, init_db
-from .watch_loop import run_watch_loop
+from .watch_loop import WatchLoopSummary, run_watch_loop
 
 
 DEFAULT_WATCH_CYCLES = 24
 DEFAULT_INTERVAL_SECONDS = 300
 MAX_CONSECUTIVE_CYCLE_FAILURES = 3
+ERROR_BACKOFF_MINUTES = 30
 
 
 @dataclass(frozen=True)
@@ -38,6 +39,15 @@ class ManualPosition:
     payload_json: str
 
 
+@dataclass(frozen=True)
+class WatchStageResult:
+    run_id: str
+    status: str
+    successful_cycles: int
+    failed_cycles: int
+    last_error: str | None
+
+
 def init_lifecycle_db(conn) -> None:
     init_db(conn)
     conn.executescript(
@@ -161,31 +171,47 @@ def run_supervisor(config: AppConfig, *, once: bool = False) -> int:
             phase = state.get("phase", "idle")
             next_action_utc = state.get("next_action_utc")
 
-        if phase == "cooldown" and next_action_utc:
+        if phase in {"cooldown", "error_backoff"} and next_action_utc:
             remaining = _seconds_until(next_action_utc)
             if remaining > 0:
-                _print_timer("Lifecycle cooldown", remaining)
+                _print_timer("Lifecycle cooldown" if phase == "cooldown" else "Instrumentation retry backoff", remaining)
                 if once:
                     return 0
                 _sleep_with_progress(remaining)
 
-        run_id = _run_watch_stage(config)
-        next_action = datetime.now(UTC) + timedelta(minutes=POST_WATCH_COOLDOWN_MINUTES)
+        watch = _run_watch_stage(config)
+        if watch.status == "failed" and watch.successful_cycles == 0:
+            next_action = datetime.now(UTC) + timedelta(minutes=ERROR_BACKOFF_MINUTES)
+            phase = "error_backoff"
+            message = "watch failed before collecting samples; retry backoff active"
+            event_type = "watch_failed_backoff"
+        else:
+            next_action = datetime.now(UTC) + timedelta(minutes=POST_WATCH_COOLDOWN_MINUTES)
+            phase = "cooldown"
+            message = "watch complete; cooldown active before next research stage"
+            event_type = "watch_completed"
         with connect() as conn:
             init_lifecycle_db(conn)
             _write_state(
                 conn,
                 {
-                    "phase": "cooldown",
+                    "phase": phase,
                     "next_action_utc": next_action.isoformat(timespec="seconds"),
-                    "last_run_id": run_id,
-                    "message": "watch complete; cooldown active before next research stage",
+                    "last_run_id": watch.run_id,
+                    "message": message,
                 },
             )
             _record_event(
                 conn,
-                "watch_completed",
-                {"run_id": run_id, "next_action_utc": next_action.isoformat(timespec="seconds")},
+                event_type,
+                {
+                    "run_id": watch.run_id,
+                    "status": watch.status,
+                    "successful_cycles": watch.successful_cycles,
+                    "failed_cycles": watch.failed_cycles,
+                    "last_error": watch.last_error,
+                    "next_action_utc": next_action.isoformat(timespec="seconds"),
+                },
             )
             print(
                 build_operator_cockpit(
@@ -244,7 +270,7 @@ def recover_stale_active_watch(conn) -> str | None:
     return report_path
 
 
-def _run_watch_stage(config: AppConfig) -> str:
+def _run_watch_stage(config: AppConfig) -> WatchStageResult:
     experiment = start_experiment(
         DEFAULT_WATCH_CYCLES,
         DEFAULT_INTERVAL_SECONDS,
@@ -309,7 +335,13 @@ def _run_watch_stage(config: AppConfig) -> str:
                 "last_error": summary.last_error,
             },
         )
-    return experiment.run_id
+    return WatchStageResult(
+        run_id=experiment.run_id,
+        status=summary.status,
+        successful_cycles=summary.successful_cycles,
+        failed_cycles=summary.failed_cycles,
+        last_error=summary.last_error,
+    )
 
 
 def _print_cycle_result(index: int, total: int, result) -> None:
diff --git a/tests/test_guidance.py b/tests/test_guidance.py
index 0d4c69b..761ce74 100644
--- a/tests/test_guidance.py
+++ b/tests/test_guidance.py
@@ -1,6 +1,8 @@
 from decimal import Decimal
 from datetime import UTC, datetime
+from pathlib import Path
 import sqlite3
+from tempfile import TemporaryDirectory
 import unittest
 from unittest.mock import patch
 
@@ -11,6 +13,7 @@ from braiins_ratchet.guidance import (
     _active_watch_status_lines,
     _do_this_now,
     _pathway_forecast,
+    _recent_completed_watch,
     build_operator_cockpit,
 )
 from braiins_ratchet.models import CandidateOrder, MarketSnapshot, OceanSnapshot, StrategyProposal
@@ -195,6 +198,23 @@ class GuidanceTests(unittest.TestCase):
         self.assertIn("Active watch ETA: 2026-04-29T12:48:06+02:00", text)
         self.assertIn("Active watch remaining: about 90 minutes", text)
 
+    def test_zero_sample_failed_report_is_not_treated_as_cooldown_evidence(self) -> None:
+        with TemporaryDirectory() as tmp:
+            reports = Path(tmp) / "reports"
+            reports.mkdir()
+            report = reports / "run-failed.md"
+            report.write_text(
+                "# run-failed\n\n"
+                "## Run Summary\n\n"
+                "- collected_samples: 0\n",
+                encoding="utf-8",
+            )
+
+            with patch("braiins_ratchet.guidance.REPORTS_DIR", reports):
+                completed = _recent_completed_watch("reports/run-failed.md", None)
+
+        self.assertIsNone(completed)
+
 
 def _completed_watch(age_minutes: int) -> CompletedWatch:
     return CompletedWatch(