Treat failed watches as instrumentation backoff

This commit is contained in:
saymrwulf 2026-05-04 12:23:18 +02:00
parent c741fbf850
commit b17c6aa732
3 changed files with 83 additions and 12 deletions

View file

@ -413,6 +413,8 @@ def _recent_completed_watch(latest_report: str | None, latest_market_timestamp:
report_path = REPORTS_DIR.parent / latest_report
if not report_path.name.startswith("run-") or not report_path.exists():
return None
if _report_collected_samples(report_path) == 0:
return None
market_dt = _parse_utc(latest_market_timestamp)
if market_dt is not None and report_path.stat().st_mtime < market_dt.timestamp():
return None
@ -436,6 +438,23 @@ def _recent_completed_watch(latest_report: str | None, latest_market_timestamp:
)
def _report_collected_samples(report_path: Path) -> int | None:
try:
lines = report_path.read_text(encoding="utf-8").splitlines()
except OSError:
return None
for line in lines:
stripped = line.strip()
if not stripped.startswith("- collected_samples:"):
continue
_, _, value = stripped.partition(":")
try:
return int(value.strip())
except ValueError:
return None
return None
def _research_stage(active_watch: str | None, completed_watch: CompletedWatch | None) -> str:
if active_watch:
return "watch running"

View file

@ -10,12 +10,13 @@ from .config import AppConfig
from .experiments import ACTIVE_WATCH, finish_experiment, start_experiment
from .guidance import POST_WATCH_COOLDOWN_MINUTES, build_operator_cockpit, get_operator_state
from .storage import connect, init_db
from .watch_loop import run_watch_loop
from .watch_loop import WatchLoopSummary, run_watch_loop
DEFAULT_WATCH_CYCLES = 24
DEFAULT_INTERVAL_SECONDS = 300
MAX_CONSECUTIVE_CYCLE_FAILURES = 3
ERROR_BACKOFF_MINUTES = 30
@dataclass(frozen=True)
@ -38,6 +39,15 @@ class ManualPosition:
payload_json: str
@dataclass(frozen=True)
class WatchStageResult:
run_id: str
status: str
successful_cycles: int
failed_cycles: int
last_error: str | None
def init_lifecycle_db(conn) -> None:
init_db(conn)
conn.executescript(
@ -161,31 +171,47 @@ def run_supervisor(config: AppConfig, *, once: bool = False) -> int:
phase = state.get("phase", "idle")
next_action_utc = state.get("next_action_utc")
if phase == "cooldown" and next_action_utc:
if phase in {"cooldown", "error_backoff"} and next_action_utc:
remaining = _seconds_until(next_action_utc)
if remaining > 0:
_print_timer("Lifecycle cooldown", remaining)
_print_timer("Lifecycle cooldown" if phase == "cooldown" else "Instrumentation retry backoff", remaining)
if once:
return 0
_sleep_with_progress(remaining)
run_id = _run_watch_stage(config)
next_action = datetime.now(UTC) + timedelta(minutes=POST_WATCH_COOLDOWN_MINUTES)
watch = _run_watch_stage(config)
if watch.status == "failed" and watch.successful_cycles == 0:
next_action = datetime.now(UTC) + timedelta(minutes=ERROR_BACKOFF_MINUTES)
phase = "error_backoff"
message = "watch failed before collecting samples; retry backoff active"
event_type = "watch_failed_backoff"
else:
next_action = datetime.now(UTC) + timedelta(minutes=POST_WATCH_COOLDOWN_MINUTES)
phase = "cooldown"
message = "watch complete; cooldown active before next research stage"
event_type = "watch_completed"
with connect() as conn:
init_lifecycle_db(conn)
_write_state(
conn,
{
"phase": "cooldown",
"phase": phase,
"next_action_utc": next_action.isoformat(timespec="seconds"),
"last_run_id": run_id,
"message": "watch complete; cooldown active before next research stage",
"last_run_id": watch.run_id,
"message": message,
},
)
_record_event(
conn,
"watch_completed",
{"run_id": run_id, "next_action_utc": next_action.isoformat(timespec="seconds")},
event_type,
{
"run_id": watch.run_id,
"status": watch.status,
"successful_cycles": watch.successful_cycles,
"failed_cycles": watch.failed_cycles,
"last_error": watch.last_error,
"next_action_utc": next_action.isoformat(timespec="seconds"),
},
)
print(
build_operator_cockpit(
@ -244,7 +270,7 @@ def recover_stale_active_watch(conn) -> str | None:
return report_path
def _run_watch_stage(config: AppConfig) -> str:
def _run_watch_stage(config: AppConfig) -> WatchStageResult:
experiment = start_experiment(
DEFAULT_WATCH_CYCLES,
DEFAULT_INTERVAL_SECONDS,
@ -309,7 +335,13 @@ def _run_watch_stage(config: AppConfig) -> str:
"last_error": summary.last_error,
},
)
return experiment.run_id
return WatchStageResult(
run_id=experiment.run_id,
status=summary.status,
successful_cycles=summary.successful_cycles,
failed_cycles=summary.failed_cycles,
last_error=summary.last_error,
)
def _print_cycle_result(index: int, total: int, result) -> None:

View file

@ -1,6 +1,8 @@
from decimal import Decimal
from datetime import UTC, datetime
from pathlib import Path
import sqlite3
from tempfile import TemporaryDirectory
import unittest
from unittest.mock import patch
@ -11,6 +13,7 @@ from braiins_ratchet.guidance import (
_active_watch_status_lines,
_do_this_now,
_pathway_forecast,
_recent_completed_watch,
build_operator_cockpit,
)
from braiins_ratchet.models import CandidateOrder, MarketSnapshot, OceanSnapshot, StrategyProposal
@ -195,6 +198,23 @@ class GuidanceTests(unittest.TestCase):
self.assertIn("Active watch ETA: 2026-04-29T12:48:06+02:00", text)
self.assertIn("Active watch remaining: about 90 minutes", text)
def test_zero_sample_failed_report_is_not_treated_as_cooldown_evidence(self) -> None:
with TemporaryDirectory() as tmp:
reports = Path(tmp) / "reports"
reports.mkdir()
report = reports / "run-failed.md"
report.write_text(
"# run-failed\n\n"
"## Run Summary\n\n"
"- collected_samples: 0\n",
encoding="utf-8",
)
with patch("braiins_ratchet.guidance.REPORTS_DIR", reports):
completed = _recent_completed_watch("reports/run-failed.md", None)
self.assertIsNone(completed)
def _completed_watch(age_minutes: int) -> CompletedWatch:
return CompletedWatch(