mirror of
https://github.com/saymrwulf/BraiinsRatchet.git
synced 2026-05-14 20:37:52 +00:00
Treat failed watches as instrumentation backoff
This commit is contained in:
parent
c741fbf850
commit
b17c6aa732
3 changed files with 83 additions and 12 deletions
|
|
@ -413,6 +413,8 @@ def _recent_completed_watch(latest_report: str | None, latest_market_timestamp:
|
|||
report_path = REPORTS_DIR.parent / latest_report
|
||||
if not report_path.name.startswith("run-") or not report_path.exists():
|
||||
return None
|
||||
if _report_collected_samples(report_path) == 0:
|
||||
return None
|
||||
market_dt = _parse_utc(latest_market_timestamp)
|
||||
if market_dt is not None and report_path.stat().st_mtime < market_dt.timestamp():
|
||||
return None
|
||||
|
|
@ -436,6 +438,23 @@ def _recent_completed_watch(latest_report: str | None, latest_market_timestamp:
|
|||
)
|
||||
|
||||
|
||||
def _report_collected_samples(report_path: Path) -> int | None:
|
||||
try:
|
||||
lines = report_path.read_text(encoding="utf-8").splitlines()
|
||||
except OSError:
|
||||
return None
|
||||
for line in lines:
|
||||
stripped = line.strip()
|
||||
if not stripped.startswith("- collected_samples:"):
|
||||
continue
|
||||
_, _, value = stripped.partition(":")
|
||||
try:
|
||||
return int(value.strip())
|
||||
except ValueError:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def _research_stage(active_watch: str | None, completed_watch: CompletedWatch | None) -> str:
|
||||
if active_watch:
|
||||
return "watch running"
|
||||
|
|
|
|||
|
|
@ -10,12 +10,13 @@ from .config import AppConfig
|
|||
from .experiments import ACTIVE_WATCH, finish_experiment, start_experiment
|
||||
from .guidance import POST_WATCH_COOLDOWN_MINUTES, build_operator_cockpit, get_operator_state
|
||||
from .storage import connect, init_db
|
||||
from .watch_loop import run_watch_loop
|
||||
from .watch_loop import WatchLoopSummary, run_watch_loop
|
||||
|
||||
|
||||
DEFAULT_WATCH_CYCLES = 24
|
||||
DEFAULT_INTERVAL_SECONDS = 300
|
||||
MAX_CONSECUTIVE_CYCLE_FAILURES = 3
|
||||
ERROR_BACKOFF_MINUTES = 30
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
|
|
@ -38,6 +39,15 @@ class ManualPosition:
|
|||
payload_json: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class WatchStageResult:
|
||||
run_id: str
|
||||
status: str
|
||||
successful_cycles: int
|
||||
failed_cycles: int
|
||||
last_error: str | None
|
||||
|
||||
|
||||
def init_lifecycle_db(conn) -> None:
|
||||
init_db(conn)
|
||||
conn.executescript(
|
||||
|
|
@ -161,31 +171,47 @@ def run_supervisor(config: AppConfig, *, once: bool = False) -> int:
|
|||
phase = state.get("phase", "idle")
|
||||
next_action_utc = state.get("next_action_utc")
|
||||
|
||||
if phase == "cooldown" and next_action_utc:
|
||||
if phase in {"cooldown", "error_backoff"} and next_action_utc:
|
||||
remaining = _seconds_until(next_action_utc)
|
||||
if remaining > 0:
|
||||
_print_timer("Lifecycle cooldown", remaining)
|
||||
_print_timer("Lifecycle cooldown" if phase == "cooldown" else "Instrumentation retry backoff", remaining)
|
||||
if once:
|
||||
return 0
|
||||
_sleep_with_progress(remaining)
|
||||
|
||||
run_id = _run_watch_stage(config)
|
||||
next_action = datetime.now(UTC) + timedelta(minutes=POST_WATCH_COOLDOWN_MINUTES)
|
||||
watch = _run_watch_stage(config)
|
||||
if watch.status == "failed" and watch.successful_cycles == 0:
|
||||
next_action = datetime.now(UTC) + timedelta(minutes=ERROR_BACKOFF_MINUTES)
|
||||
phase = "error_backoff"
|
||||
message = "watch failed before collecting samples; retry backoff active"
|
||||
event_type = "watch_failed_backoff"
|
||||
else:
|
||||
next_action = datetime.now(UTC) + timedelta(minutes=POST_WATCH_COOLDOWN_MINUTES)
|
||||
phase = "cooldown"
|
||||
message = "watch complete; cooldown active before next research stage"
|
||||
event_type = "watch_completed"
|
||||
with connect() as conn:
|
||||
init_lifecycle_db(conn)
|
||||
_write_state(
|
||||
conn,
|
||||
{
|
||||
"phase": "cooldown",
|
||||
"phase": phase,
|
||||
"next_action_utc": next_action.isoformat(timespec="seconds"),
|
||||
"last_run_id": run_id,
|
||||
"message": "watch complete; cooldown active before next research stage",
|
||||
"last_run_id": watch.run_id,
|
||||
"message": message,
|
||||
},
|
||||
)
|
||||
_record_event(
|
||||
conn,
|
||||
"watch_completed",
|
||||
{"run_id": run_id, "next_action_utc": next_action.isoformat(timespec="seconds")},
|
||||
event_type,
|
||||
{
|
||||
"run_id": watch.run_id,
|
||||
"status": watch.status,
|
||||
"successful_cycles": watch.successful_cycles,
|
||||
"failed_cycles": watch.failed_cycles,
|
||||
"last_error": watch.last_error,
|
||||
"next_action_utc": next_action.isoformat(timespec="seconds"),
|
||||
},
|
||||
)
|
||||
print(
|
||||
build_operator_cockpit(
|
||||
|
|
@ -244,7 +270,7 @@ def recover_stale_active_watch(conn) -> str | None:
|
|||
return report_path
|
||||
|
||||
|
||||
def _run_watch_stage(config: AppConfig) -> str:
|
||||
def _run_watch_stage(config: AppConfig) -> WatchStageResult:
|
||||
experiment = start_experiment(
|
||||
DEFAULT_WATCH_CYCLES,
|
||||
DEFAULT_INTERVAL_SECONDS,
|
||||
|
|
@ -309,7 +335,13 @@ def _run_watch_stage(config: AppConfig) -> str:
|
|||
"last_error": summary.last_error,
|
||||
},
|
||||
)
|
||||
return experiment.run_id
|
||||
return WatchStageResult(
|
||||
run_id=experiment.run_id,
|
||||
status=summary.status,
|
||||
successful_cycles=summary.successful_cycles,
|
||||
failed_cycles=summary.failed_cycles,
|
||||
last_error=summary.last_error,
|
||||
)
|
||||
|
||||
|
||||
def _print_cycle_result(index: int, total: int, result) -> None:
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
from decimal import Decimal
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
import sqlite3
|
||||
from tempfile import TemporaryDirectory
|
||||
import unittest
|
||||
from unittest.mock import patch
|
||||
|
||||
|
|
@ -11,6 +13,7 @@ from braiins_ratchet.guidance import (
|
|||
_active_watch_status_lines,
|
||||
_do_this_now,
|
||||
_pathway_forecast,
|
||||
_recent_completed_watch,
|
||||
build_operator_cockpit,
|
||||
)
|
||||
from braiins_ratchet.models import CandidateOrder, MarketSnapshot, OceanSnapshot, StrategyProposal
|
||||
|
|
@ -195,6 +198,23 @@ class GuidanceTests(unittest.TestCase):
|
|||
self.assertIn("Active watch ETA: 2026-04-29T12:48:06+02:00", text)
|
||||
self.assertIn("Active watch remaining: about 90 minutes", text)
|
||||
|
||||
def test_zero_sample_failed_report_is_not_treated_as_cooldown_evidence(self) -> None:
    """A run report listing zero collected samples must not yield a completed watch."""
    report_body = (
        "# run-failed\n\n"
        "## Run Summary\n\n"
        "- collected_samples: 0\n"
    )
    with TemporaryDirectory() as workdir:
        reports_dir = Path(workdir) / "reports"
        reports_dir.mkdir()
        (reports_dir / "run-failed.md").write_text(report_body, encoding="utf-8")

        # Point the guidance module at the temporary reports directory so the
        # relative report name below resolves to the file written above.
        with patch("braiins_ratchet.guidance.REPORTS_DIR", reports_dir):
            outcome = _recent_completed_watch("reports/run-failed.md", None)

        self.assertIsNone(outcome)
|
||||
|
||||
|
||||
def _completed_watch(age_minutes: int) -> CompletedWatch:
|
||||
return CompletedWatch(
|
||||
|
|
|
|||
Loading…
Reference in a new issue