Treat failed watches as instrumentation backoff

This commit is contained in:
saymrwulf 2026-05-04 12:23:18 +02:00
parent c741fbf850
commit b17c6aa732
3 changed files with 83 additions and 12 deletions

View file

@ -413,6 +413,8 @@ def _recent_completed_watch(latest_report: str | None, latest_market_timestamp:
report_path = REPORTS_DIR.parent / latest_report report_path = REPORTS_DIR.parent / latest_report
if not report_path.name.startswith("run-") or not report_path.exists(): if not report_path.name.startswith("run-") or not report_path.exists():
return None return None
if _report_collected_samples(report_path) == 0:
return None
market_dt = _parse_utc(latest_market_timestamp) market_dt = _parse_utc(latest_market_timestamp)
if market_dt is not None and report_path.stat().st_mtime < market_dt.timestamp(): if market_dt is not None and report_path.stat().st_mtime < market_dt.timestamp():
return None return None
@ -436,6 +438,23 @@ def _recent_completed_watch(latest_report: str | None, latest_market_timestamp:
) )
def _report_collected_samples(report_path: Path) -> int | None:
try:
lines = report_path.read_text(encoding="utf-8").splitlines()
except OSError:
return None
for line in lines:
stripped = line.strip()
if not stripped.startswith("- collected_samples:"):
continue
_, _, value = stripped.partition(":")
try:
return int(value.strip())
except ValueError:
return None
return None
def _research_stage(active_watch: str | None, completed_watch: CompletedWatch | None) -> str: def _research_stage(active_watch: str | None, completed_watch: CompletedWatch | None) -> str:
if active_watch: if active_watch:
return "watch running" return "watch running"

View file

@ -10,12 +10,13 @@ from .config import AppConfig
from .experiments import ACTIVE_WATCH, finish_experiment, start_experiment from .experiments import ACTIVE_WATCH, finish_experiment, start_experiment
from .guidance import POST_WATCH_COOLDOWN_MINUTES, build_operator_cockpit, get_operator_state from .guidance import POST_WATCH_COOLDOWN_MINUTES, build_operator_cockpit, get_operator_state
from .storage import connect, init_db from .storage import connect, init_db
from .watch_loop import run_watch_loop from .watch_loop import WatchLoopSummary, run_watch_loop
DEFAULT_WATCH_CYCLES = 24 DEFAULT_WATCH_CYCLES = 24
DEFAULT_INTERVAL_SECONDS = 300 DEFAULT_INTERVAL_SECONDS = 300
MAX_CONSECUTIVE_CYCLE_FAILURES = 3 MAX_CONSECUTIVE_CYCLE_FAILURES = 3
ERROR_BACKOFF_MINUTES = 30
@dataclass(frozen=True) @dataclass(frozen=True)
@ -38,6 +39,15 @@ class ManualPosition:
payload_json: str payload_json: str
@dataclass(frozen=True)
class WatchStageResult:
run_id: str
status: str
successful_cycles: int
failed_cycles: int
last_error: str | None
def init_lifecycle_db(conn) -> None: def init_lifecycle_db(conn) -> None:
init_db(conn) init_db(conn)
conn.executescript( conn.executescript(
@ -161,31 +171,47 @@ def run_supervisor(config: AppConfig, *, once: bool = False) -> int:
phase = state.get("phase", "idle") phase = state.get("phase", "idle")
next_action_utc = state.get("next_action_utc") next_action_utc = state.get("next_action_utc")
if phase == "cooldown" and next_action_utc: if phase in {"cooldown", "error_backoff"} and next_action_utc:
remaining = _seconds_until(next_action_utc) remaining = _seconds_until(next_action_utc)
if remaining > 0: if remaining > 0:
_print_timer("Lifecycle cooldown", remaining) _print_timer("Lifecycle cooldown" if phase == "cooldown" else "Instrumentation retry backoff", remaining)
if once: if once:
return 0 return 0
_sleep_with_progress(remaining) _sleep_with_progress(remaining)
run_id = _run_watch_stage(config) watch = _run_watch_stage(config)
if watch.status == "failed" and watch.successful_cycles == 0:
next_action = datetime.now(UTC) + timedelta(minutes=ERROR_BACKOFF_MINUTES)
phase = "error_backoff"
message = "watch failed before collecting samples; retry backoff active"
event_type = "watch_failed_backoff"
else:
next_action = datetime.now(UTC) + timedelta(minutes=POST_WATCH_COOLDOWN_MINUTES) next_action = datetime.now(UTC) + timedelta(minutes=POST_WATCH_COOLDOWN_MINUTES)
phase = "cooldown"
message = "watch complete; cooldown active before next research stage"
event_type = "watch_completed"
with connect() as conn: with connect() as conn:
init_lifecycle_db(conn) init_lifecycle_db(conn)
_write_state( _write_state(
conn, conn,
{ {
"phase": "cooldown", "phase": phase,
"next_action_utc": next_action.isoformat(timespec="seconds"), "next_action_utc": next_action.isoformat(timespec="seconds"),
"last_run_id": run_id, "last_run_id": watch.run_id,
"message": "watch complete; cooldown active before next research stage", "message": message,
}, },
) )
_record_event( _record_event(
conn, conn,
"watch_completed", event_type,
{"run_id": run_id, "next_action_utc": next_action.isoformat(timespec="seconds")}, {
"run_id": watch.run_id,
"status": watch.status,
"successful_cycles": watch.successful_cycles,
"failed_cycles": watch.failed_cycles,
"last_error": watch.last_error,
"next_action_utc": next_action.isoformat(timespec="seconds"),
},
) )
print( print(
build_operator_cockpit( build_operator_cockpit(
@ -244,7 +270,7 @@ def recover_stale_active_watch(conn) -> str | None:
return report_path return report_path
def _run_watch_stage(config: AppConfig) -> str: def _run_watch_stage(config: AppConfig) -> WatchStageResult:
experiment = start_experiment( experiment = start_experiment(
DEFAULT_WATCH_CYCLES, DEFAULT_WATCH_CYCLES,
DEFAULT_INTERVAL_SECONDS, DEFAULT_INTERVAL_SECONDS,
@ -309,7 +335,13 @@ def _run_watch_stage(config: AppConfig) -> str:
"last_error": summary.last_error, "last_error": summary.last_error,
}, },
) )
return experiment.run_id return WatchStageResult(
run_id=experiment.run_id,
status=summary.status,
successful_cycles=summary.successful_cycles,
failed_cycles=summary.failed_cycles,
last_error=summary.last_error,
)
def _print_cycle_result(index: int, total: int, result) -> None: def _print_cycle_result(index: int, total: int, result) -> None:

View file

@ -1,6 +1,8 @@
from decimal import Decimal from decimal import Decimal
from datetime import UTC, datetime from datetime import UTC, datetime
from pathlib import Path
import sqlite3 import sqlite3
from tempfile import TemporaryDirectory
import unittest import unittest
from unittest.mock import patch from unittest.mock import patch
@ -11,6 +13,7 @@ from braiins_ratchet.guidance import (
_active_watch_status_lines, _active_watch_status_lines,
_do_this_now, _do_this_now,
_pathway_forecast, _pathway_forecast,
_recent_completed_watch,
build_operator_cockpit, build_operator_cockpit,
) )
from braiins_ratchet.models import CandidateOrder, MarketSnapshot, OceanSnapshot, StrategyProposal from braiins_ratchet.models import CandidateOrder, MarketSnapshot, OceanSnapshot, StrategyProposal
@ -195,6 +198,23 @@ class GuidanceTests(unittest.TestCase):
self.assertIn("Active watch ETA: 2026-04-29T12:48:06+02:00", text) self.assertIn("Active watch ETA: 2026-04-29T12:48:06+02:00", text)
self.assertIn("Active watch remaining: about 90 minutes", text) self.assertIn("Active watch remaining: about 90 minutes", text)
def test_zero_sample_failed_report_is_not_treated_as_cooldown_evidence(self) -> None:
with TemporaryDirectory() as tmp:
reports = Path(tmp) / "reports"
reports.mkdir()
report = reports / "run-failed.md"
report.write_text(
"# run-failed\n\n"
"## Run Summary\n\n"
"- collected_samples: 0\n",
encoding="utf-8",
)
with patch("braiins_ratchet.guidance.REPORTS_DIR", reports):
completed = _recent_completed_watch("reports/run-failed.md", None)
self.assertIsNone(completed)
def _completed_watch(age_minutes: int) -> CompletedWatch: def _completed_watch(age_minutes: int) -> CompletedWatch:
return CompletedWatch( return CompletedWatch(