Treat failed watches as instrumentation backoff

2026-05-14 20:37:52 +00:00 · 2026-05-04 12:23:18 +02:00 · 2026-05-04 12:23:18 +02:00 · b17c6aa732
commit b17c6aa732
parent c741fbf850
3 changed files with 83 additions and 12 deletions
--- a/src/braiins_ratchet/guidance.py
+++ b/src/braiins_ratchet/guidance.py
@ -413,6 +413,8 @@ def _recent_completed_watch(latest_report: str | None, latest_market_timestamp:
    report_path = REPORTS_DIR.parent / latest_report
    if not report_path.name.startswith("run-") or not report_path.exists():
        return None
+    if _report_collected_samples(report_path) == 0:
+        return None
    market_dt = _parse_utc(latest_market_timestamp)
    if market_dt is not None and report_path.stat().st_mtime < market_dt.timestamp():
        return None
@ -436,6 +438,23 @@ def _recent_completed_watch(latest_report: str | None, latest_market_timestamp:
    )


+def _report_collected_samples(report_path: Path) -> int | None:
+    try:
+        lines = report_path.read_text(encoding="utf-8").splitlines()
+    except OSError:
+        return None
+    for line in lines:
+        stripped = line.strip()
+        if not stripped.startswith("- collected_samples:"):
+            continue
+        _, _, value = stripped.partition(":")
+        try:
+            return int(value.strip())
+        except ValueError:
+            return None
+    return None
+
+
 def _research_stage(active_watch: str | None, completed_watch: CompletedWatch | None) -> str:
    if active_watch:
        return "watch running"
--- a/src/braiins_ratchet/lifecycle.py
+++ b/src/braiins_ratchet/lifecycle.py
@ -10,12 +10,13 @@ from .config import AppConfig
 from .experiments import ACTIVE_WATCH, finish_experiment, start_experiment
 from .guidance import POST_WATCH_COOLDOWN_MINUTES, build_operator_cockpit, get_operator_state
 from .storage import connect, init_db
-from .watch_loop import run_watch_loop
+from .watch_loop import WatchLoopSummary, run_watch_loop


 DEFAULT_WATCH_CYCLES = 24
 DEFAULT_INTERVAL_SECONDS = 300
 MAX_CONSECUTIVE_CYCLE_FAILURES = 3
+ERROR_BACKOFF_MINUTES = 30


@dataclass(frozen=True)
@ -38,6 +39,15 @@ class ManualPosition:
    payload_json: str


+@dataclass(frozen=True)
+class WatchStageResult:
+    run_id: str
+    status: str
+    successful_cycles: int
+    failed_cycles: int
+    last_error: str | None
+
+
 def init_lifecycle_db(conn) -> None:
    init_db(conn)
    conn.executescript(
@ -161,31 +171,47 @@ def run_supervisor(config: AppConfig, *, once: bool = False) -> int:
            phase = state.get("phase", "idle")
            next_action_utc = state.get("next_action_utc")

-        if phase == "cooldown" and next_action_utc:
+        if phase in {"cooldown", "error_backoff"} and next_action_utc:
            remaining = _seconds_until(next_action_utc)
            if remaining > 0:
-                _print_timer("Lifecycle cooldown", remaining)
+                _print_timer("Lifecycle cooldown" if phase == "cooldown" else "Instrumentation retry backoff", remaining)
                if once:
                    return 0
                _sleep_with_progress(remaining)

-        run_id = _run_watch_stage(config)
-        next_action = datetime.now(UTC) + timedelta(minutes=POST_WATCH_COOLDOWN_MINUTES)
+        watch = _run_watch_stage(config)
+        if watch.status == "failed" and watch.successful_cycles == 0:
+            next_action = datetime.now(UTC) + timedelta(minutes=ERROR_BACKOFF_MINUTES)
+            phase = "error_backoff"
+            message = "watch failed before collecting samples; retry backoff active"
+            event_type = "watch_failed_backoff"
+        else:
+            next_action = datetime.now(UTC) + timedelta(minutes=POST_WATCH_COOLDOWN_MINUTES)
+            phase = "cooldown"
+            message = "watch complete; cooldown active before next research stage"
+            event_type = "watch_completed"
        with connect() as conn:
            init_lifecycle_db(conn)
            _write_state(
                conn,
                {
-                    "phase": "cooldown",
+                    "phase": phase,
                    "next_action_utc": next_action.isoformat(timespec="seconds"),
-                    "last_run_id": run_id,
-                    "message": "watch complete; cooldown active before next research stage",
+                    "last_run_id": watch.run_id,
+                    "message": message,
                },
            )
            _record_event(
                conn,
-                "watch_completed",
-                {"run_id": run_id, "next_action_utc": next_action.isoformat(timespec="seconds")},
+                event_type,
+                {
+                    "run_id": watch.run_id,
+                    "status": watch.status,
+                    "successful_cycles": watch.successful_cycles,
+                    "failed_cycles": watch.failed_cycles,
+                    "last_error": watch.last_error,
+                    "next_action_utc": next_action.isoformat(timespec="seconds"),
+                },
            )
            print(
                build_operator_cockpit(
@ -244,7 +270,7 @@ def recover_stale_active_watch(conn) -> str | None:
    return report_path


-def _run_watch_stage(config: AppConfig) -> str:
+def _run_watch_stage(config: AppConfig) -> WatchStageResult:
    experiment = start_experiment(
        DEFAULT_WATCH_CYCLES,
        DEFAULT_INTERVAL_SECONDS,
@ -309,7 +335,13 @@ def _run_watch_stage(config: AppConfig) -> str:
                "last_error": summary.last_error,
            },
        )
-    return experiment.run_id
+    return WatchStageResult(
+        run_id=experiment.run_id,
+        status=summary.status,
+        successful_cycles=summary.successful_cycles,
+        failed_cycles=summary.failed_cycles,
+        last_error=summary.last_error,
+    )


 def _print_cycle_result(index: int, total: int, result) -> None:
--- a/tests/test_guidance.py
+++ b/tests/test_guidance.py
@ -1,6 +1,8 @@
 from decimal import Decimal
 from datetime import UTC, datetime
+from pathlib import Path
 import sqlite3
+from tempfile import TemporaryDirectory
 import unittest
 from unittest.mock import patch

@ -11,6 +13,7 @@ from braiins_ratchet.guidance import (
    _active_watch_status_lines,
    _do_this_now,
    _pathway_forecast,
+    _recent_completed_watch,
    build_operator_cockpit,
 )
 from braiins_ratchet.models import CandidateOrder, MarketSnapshot, OceanSnapshot, StrategyProposal
@ -195,6 +198,23 @@ class GuidanceTests(unittest.TestCase):
        self.assertIn("Active watch ETA: 2026-04-29T12:48:06+02:00", text)
        self.assertIn("Active watch remaining: about 90 minutes", text)

+    def test_zero_sample_failed_report_is_not_treated_as_cooldown_evidence(self) -> None:
+        with TemporaryDirectory() as tmp:
+            reports = Path(tmp) / "reports"
+            reports.mkdir()
+            report = reports / "run-failed.md"
+            report.write_text(
+                "# run-failed\n\n"
+                "## Run Summary\n\n"
+                "- collected_samples: 0\n",
+                encoding="utf-8",
+            )
+
+            with patch("braiins_ratchet.guidance.REPORTS_DIR", reports):
+                completed = _recent_completed_watch("reports/run-failed.md", None)
+
+        self.assertIsNone(completed)
+

 def _completed_watch(age_minutes: int) -> CompletedWatch:
    return CompletedWatch(