From b17c6aa732d17558d947ad82787ddfee28f584a4 Mon Sep 17 00:00:00 2001 From: saymrwulf Date: Mon, 4 May 2026 12:23:18 +0200 Subject: [PATCH] Treat failed watches as instrumentation backoff --- src/braiins_ratchet/guidance.py | 19 +++++++++++ src/braiins_ratchet/lifecycle.py | 56 +++++++++++++++++++++++++------- tests/test_guidance.py | 20 ++++++++++++ 3 files changed, 83 insertions(+), 12 deletions(-) diff --git a/src/braiins_ratchet/guidance.py b/src/braiins_ratchet/guidance.py index 1c436fa..2c88e52 100644 --- a/src/braiins_ratchet/guidance.py +++ b/src/braiins_ratchet/guidance.py @@ -413,6 +413,8 @@ def _recent_completed_watch(latest_report: str | None, latest_market_timestamp: report_path = REPORTS_DIR.parent / latest_report if not report_path.name.startswith("run-") or not report_path.exists(): return None + if _report_collected_samples(report_path) == 0: + return None market_dt = _parse_utc(latest_market_timestamp) if market_dt is not None and report_path.stat().st_mtime < market_dt.timestamp(): return None @@ -436,6 +438,23 @@ def _recent_completed_watch(latest_report: str | None, latest_market_timestamp: ) +def _report_collected_samples(report_path: Path) -> int | None: + try: + lines = report_path.read_text(encoding="utf-8").splitlines() + except OSError: + return None + for line in lines: + stripped = line.strip() + if not stripped.startswith("- collected_samples:"): + continue + _, _, value = stripped.partition(":") + try: + return int(value.strip()) + except ValueError: + return None + return None + + def _research_stage(active_watch: str | None, completed_watch: CompletedWatch | None) -> str: if active_watch: return "watch running" diff --git a/src/braiins_ratchet/lifecycle.py b/src/braiins_ratchet/lifecycle.py index fa6941b..6df829f 100644 --- a/src/braiins_ratchet/lifecycle.py +++ b/src/braiins_ratchet/lifecycle.py @@ -10,12 +10,13 @@ from .config import AppConfig from .experiments import ACTIVE_WATCH, finish_experiment, start_experiment from .guidance import POST_WATCH_COOLDOWN_MINUTES, build_operator_cockpit, get_operator_state from .storage import connect, init_db -from .watch_loop import run_watch_loop +from .watch_loop import WatchLoopSummary, run_watch_loop DEFAULT_WATCH_CYCLES = 24 DEFAULT_INTERVAL_SECONDS = 300 MAX_CONSECUTIVE_CYCLE_FAILURES = 3 +ERROR_BACKOFF_MINUTES = 30 @dataclass(frozen=True) @@ -38,6 +39,15 @@ class ManualPosition: payload_json: str +@dataclass(frozen=True) +class WatchStageResult: + run_id: str + status: str + successful_cycles: int + failed_cycles: int + last_error: str | None + + def init_lifecycle_db(conn) -> None: init_db(conn) conn.executescript( @@ -161,31 +171,47 @@ def run_supervisor(config: AppConfig, *, once: bool = False) -> int: phase = state.get("phase", "idle") next_action_utc = state.get("next_action_utc") - if phase == "cooldown" and next_action_utc: + if phase in {"cooldown", "error_backoff"} and next_action_utc: remaining = _seconds_until(next_action_utc) if remaining > 0: - _print_timer("Lifecycle cooldown", remaining) + _print_timer("Lifecycle cooldown" if phase == "cooldown" else "Instrumentation retry backoff", remaining) if once: return 0 _sleep_with_progress(remaining) - run_id = _run_watch_stage(config) - next_action = datetime.now(UTC) + timedelta(minutes=POST_WATCH_COOLDOWN_MINUTES) + watch = _run_watch_stage(config) + if watch.status == "failed" and watch.successful_cycles == 0: + next_action = datetime.now(UTC) + timedelta(minutes=ERROR_BACKOFF_MINUTES) + phase = "error_backoff" + message = "watch failed before collecting samples; retry backoff active" + event_type = "watch_failed_backoff" + else: + next_action = datetime.now(UTC) + timedelta(minutes=POST_WATCH_COOLDOWN_MINUTES) + phase = "cooldown" + message = "watch complete; cooldown active before next research stage" + event_type = "watch_completed" with connect() as conn: init_lifecycle_db(conn) _write_state( conn, { - "phase": "cooldown", + "phase": phase, "next_action_utc": next_action.isoformat(timespec="seconds"), - "last_run_id": run_id, - "message": "watch complete; cooldown active before next research stage", + "last_run_id": watch.run_id, + "message": message, }, ) _record_event( conn, - "watch_completed", - {"run_id": run_id, "next_action_utc": next_action.isoformat(timespec="seconds")}, + event_type, + { + "run_id": watch.run_id, + "status": watch.status, + "successful_cycles": watch.successful_cycles, + "failed_cycles": watch.failed_cycles, + "last_error": watch.last_error, + "next_action_utc": next_action.isoformat(timespec="seconds"), + }, ) print( build_operator_cockpit( @@ -244,7 +270,7 @@ def recover_stale_active_watch(conn) -> str | None: return report_path -def _run_watch_stage(config: AppConfig) -> str: +def _run_watch_stage(config: AppConfig) -> WatchStageResult: experiment = start_experiment( DEFAULT_WATCH_CYCLES, DEFAULT_INTERVAL_SECONDS, @@ -309,7 +335,13 @@ def _run_watch_stage(config: AppConfig) -> str: "last_error": summary.last_error, }, ) - return experiment.run_id + return WatchStageResult( + run_id=experiment.run_id, + status=summary.status, + successful_cycles=summary.successful_cycles, + failed_cycles=summary.failed_cycles, + last_error=summary.last_error, + ) def _print_cycle_result(index: int, total: int, result) -> None: diff --git a/tests/test_guidance.py b/tests/test_guidance.py index 0d4c69b..761ce74 100644 --- a/tests/test_guidance.py +++ b/tests/test_guidance.py @@ -1,6 +1,8 @@ from decimal import Decimal from datetime import UTC, datetime +from pathlib import Path import sqlite3 +from tempfile import TemporaryDirectory import unittest from unittest.mock import patch @@ -11,6 +13,7 @@ from braiins_ratchet.guidance import ( _active_watch_status_lines, _do_this_now, _pathway_forecast, + _recent_completed_watch, build_operator_cockpit, ) from braiins_ratchet.models import CandidateOrder, MarketSnapshot, OceanSnapshot, StrategyProposal @@ -195,6 +198,23 @@ class GuidanceTests(unittest.TestCase): self.assertIn("Active watch ETA: 2026-04-29T12:48:06+02:00", text) self.assertIn("Active watch remaining: about 90 minutes", text) + def test_zero_sample_failed_report_is_not_treated_as_cooldown_evidence(self) -> None: + with TemporaryDirectory() as tmp: + reports = Path(tmp) / "reports" + reports.mkdir() + report = reports / "run-failed.md" + report.write_text( + "# run-failed\n\n" + "## Run Summary\n\n" + "- collected_samples: 0\n", + encoding="utf-8", + ) + + with patch("braiins_ratchet.guidance.REPORTS_DIR", reports): + completed = _recent_completed_watch("reports/run-failed.md", None) + + self.assertIsNone(completed) + def _completed_watch(age_minutes: int) -> CompletedWatch: return CompletedWatch(