mirror of
https://github.com/saymrwulf/BraiinsRatchet.git
synced 2026-05-14 20:37:52 +00:00
Treat failed watches as instrumentation backoff
This commit is contained in:
parent
c741fbf850
commit
b17c6aa732
3 changed files with 83 additions and 12 deletions
|
|
@ -413,6 +413,8 @@ def _recent_completed_watch(latest_report: str | None, latest_market_timestamp:
|
||||||
report_path = REPORTS_DIR.parent / latest_report
|
report_path = REPORTS_DIR.parent / latest_report
|
||||||
if not report_path.name.startswith("run-") or not report_path.exists():
|
if not report_path.name.startswith("run-") or not report_path.exists():
|
||||||
return None
|
return None
|
||||||
|
if _report_collected_samples(report_path) == 0:
|
||||||
|
return None
|
||||||
market_dt = _parse_utc(latest_market_timestamp)
|
market_dt = _parse_utc(latest_market_timestamp)
|
||||||
if market_dt is not None and report_path.stat().st_mtime < market_dt.timestamp():
|
if market_dt is not None and report_path.stat().st_mtime < market_dt.timestamp():
|
||||||
return None
|
return None
|
||||||
|
|
@ -436,6 +438,23 @@ def _recent_completed_watch(latest_report: str | None, latest_market_timestamp:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _report_collected_samples(report_path: Path) -> int | None:
|
||||||
|
try:
|
||||||
|
lines = report_path.read_text(encoding="utf-8").splitlines()
|
||||||
|
except OSError:
|
||||||
|
return None
|
||||||
|
for line in lines:
|
||||||
|
stripped = line.strip()
|
||||||
|
if not stripped.startswith("- collected_samples:"):
|
||||||
|
continue
|
||||||
|
_, _, value = stripped.partition(":")
|
||||||
|
try:
|
||||||
|
return int(value.strip())
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _research_stage(active_watch: str | None, completed_watch: CompletedWatch | None) -> str:
|
def _research_stage(active_watch: str | None, completed_watch: CompletedWatch | None) -> str:
|
||||||
if active_watch:
|
if active_watch:
|
||||||
return "watch running"
|
return "watch running"
|
||||||
|
|
|
||||||
|
|
@ -10,12 +10,13 @@ from .config import AppConfig
|
||||||
from .experiments import ACTIVE_WATCH, finish_experiment, start_experiment
|
from .experiments import ACTIVE_WATCH, finish_experiment, start_experiment
|
||||||
from .guidance import POST_WATCH_COOLDOWN_MINUTES, build_operator_cockpit, get_operator_state
|
from .guidance import POST_WATCH_COOLDOWN_MINUTES, build_operator_cockpit, get_operator_state
|
||||||
from .storage import connect, init_db
|
from .storage import connect, init_db
|
||||||
from .watch_loop import run_watch_loop
|
from .watch_loop import WatchLoopSummary, run_watch_loop
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_WATCH_CYCLES = 24
|
DEFAULT_WATCH_CYCLES = 24
|
||||||
DEFAULT_INTERVAL_SECONDS = 300
|
DEFAULT_INTERVAL_SECONDS = 300
|
||||||
MAX_CONSECUTIVE_CYCLE_FAILURES = 3
|
MAX_CONSECUTIVE_CYCLE_FAILURES = 3
|
||||||
|
ERROR_BACKOFF_MINUTES = 30
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
|
|
@ -38,6 +39,15 @@ class ManualPosition:
|
||||||
payload_json: str
|
payload_json: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class WatchStageResult:
|
||||||
|
run_id: str
|
||||||
|
status: str
|
||||||
|
successful_cycles: int
|
||||||
|
failed_cycles: int
|
||||||
|
last_error: str | None
|
||||||
|
|
||||||
|
|
||||||
def init_lifecycle_db(conn) -> None:
|
def init_lifecycle_db(conn) -> None:
|
||||||
init_db(conn)
|
init_db(conn)
|
||||||
conn.executescript(
|
conn.executescript(
|
||||||
|
|
@ -161,31 +171,47 @@ def run_supervisor(config: AppConfig, *, once: bool = False) -> int:
|
||||||
phase = state.get("phase", "idle")
|
phase = state.get("phase", "idle")
|
||||||
next_action_utc = state.get("next_action_utc")
|
next_action_utc = state.get("next_action_utc")
|
||||||
|
|
||||||
if phase == "cooldown" and next_action_utc:
|
if phase in {"cooldown", "error_backoff"} and next_action_utc:
|
||||||
remaining = _seconds_until(next_action_utc)
|
remaining = _seconds_until(next_action_utc)
|
||||||
if remaining > 0:
|
if remaining > 0:
|
||||||
_print_timer("Lifecycle cooldown", remaining)
|
_print_timer("Lifecycle cooldown" if phase == "cooldown" else "Instrumentation retry backoff", remaining)
|
||||||
if once:
|
if once:
|
||||||
return 0
|
return 0
|
||||||
_sleep_with_progress(remaining)
|
_sleep_with_progress(remaining)
|
||||||
|
|
||||||
run_id = _run_watch_stage(config)
|
watch = _run_watch_stage(config)
|
||||||
|
if watch.status == "failed" and watch.successful_cycles == 0:
|
||||||
|
next_action = datetime.now(UTC) + timedelta(minutes=ERROR_BACKOFF_MINUTES)
|
||||||
|
phase = "error_backoff"
|
||||||
|
message = "watch failed before collecting samples; retry backoff active"
|
||||||
|
event_type = "watch_failed_backoff"
|
||||||
|
else:
|
||||||
next_action = datetime.now(UTC) + timedelta(minutes=POST_WATCH_COOLDOWN_MINUTES)
|
next_action = datetime.now(UTC) + timedelta(minutes=POST_WATCH_COOLDOWN_MINUTES)
|
||||||
|
phase = "cooldown"
|
||||||
|
message = "watch complete; cooldown active before next research stage"
|
||||||
|
event_type = "watch_completed"
|
||||||
with connect() as conn:
|
with connect() as conn:
|
||||||
init_lifecycle_db(conn)
|
init_lifecycle_db(conn)
|
||||||
_write_state(
|
_write_state(
|
||||||
conn,
|
conn,
|
||||||
{
|
{
|
||||||
"phase": "cooldown",
|
"phase": phase,
|
||||||
"next_action_utc": next_action.isoformat(timespec="seconds"),
|
"next_action_utc": next_action.isoformat(timespec="seconds"),
|
||||||
"last_run_id": run_id,
|
"last_run_id": watch.run_id,
|
||||||
"message": "watch complete; cooldown active before next research stage",
|
"message": message,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
_record_event(
|
_record_event(
|
||||||
conn,
|
conn,
|
||||||
"watch_completed",
|
event_type,
|
||||||
{"run_id": run_id, "next_action_utc": next_action.isoformat(timespec="seconds")},
|
{
|
||||||
|
"run_id": watch.run_id,
|
||||||
|
"status": watch.status,
|
||||||
|
"successful_cycles": watch.successful_cycles,
|
||||||
|
"failed_cycles": watch.failed_cycles,
|
||||||
|
"last_error": watch.last_error,
|
||||||
|
"next_action_utc": next_action.isoformat(timespec="seconds"),
|
||||||
|
},
|
||||||
)
|
)
|
||||||
print(
|
print(
|
||||||
build_operator_cockpit(
|
build_operator_cockpit(
|
||||||
|
|
@ -244,7 +270,7 @@ def recover_stale_active_watch(conn) -> str | None:
|
||||||
return report_path
|
return report_path
|
||||||
|
|
||||||
|
|
||||||
def _run_watch_stage(config: AppConfig) -> str:
|
def _run_watch_stage(config: AppConfig) -> WatchStageResult:
|
||||||
experiment = start_experiment(
|
experiment = start_experiment(
|
||||||
DEFAULT_WATCH_CYCLES,
|
DEFAULT_WATCH_CYCLES,
|
||||||
DEFAULT_INTERVAL_SECONDS,
|
DEFAULT_INTERVAL_SECONDS,
|
||||||
|
|
@ -309,7 +335,13 @@ def _run_watch_stage(config: AppConfig) -> str:
|
||||||
"last_error": summary.last_error,
|
"last_error": summary.last_error,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
return experiment.run_id
|
return WatchStageResult(
|
||||||
|
run_id=experiment.run_id,
|
||||||
|
status=summary.status,
|
||||||
|
successful_cycles=summary.successful_cycles,
|
||||||
|
failed_cycles=summary.failed_cycles,
|
||||||
|
last_error=summary.last_error,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _print_cycle_result(index: int, total: int, result) -> None:
|
def _print_cycle_result(index: int, total: int, result) -> None:
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,8 @@
|
||||||
from decimal import Decimal
|
from decimal import Decimal
|
||||||
from datetime import UTC, datetime
|
from datetime import UTC, datetime
|
||||||
|
from pathlib import Path
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
from tempfile import TemporaryDirectory
|
||||||
import unittest
|
import unittest
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
|
@ -11,6 +13,7 @@ from braiins_ratchet.guidance import (
|
||||||
_active_watch_status_lines,
|
_active_watch_status_lines,
|
||||||
_do_this_now,
|
_do_this_now,
|
||||||
_pathway_forecast,
|
_pathway_forecast,
|
||||||
|
_recent_completed_watch,
|
||||||
build_operator_cockpit,
|
build_operator_cockpit,
|
||||||
)
|
)
|
||||||
from braiins_ratchet.models import CandidateOrder, MarketSnapshot, OceanSnapshot, StrategyProposal
|
from braiins_ratchet.models import CandidateOrder, MarketSnapshot, OceanSnapshot, StrategyProposal
|
||||||
|
|
@ -195,6 +198,23 @@ class GuidanceTests(unittest.TestCase):
|
||||||
self.assertIn("Active watch ETA: 2026-04-29T12:48:06+02:00", text)
|
self.assertIn("Active watch ETA: 2026-04-29T12:48:06+02:00", text)
|
||||||
self.assertIn("Active watch remaining: about 90 minutes", text)
|
self.assertIn("Active watch remaining: about 90 minutes", text)
|
||||||
|
|
||||||
|
def test_zero_sample_failed_report_is_not_treated_as_cooldown_evidence(self) -> None:
|
||||||
|
with TemporaryDirectory() as tmp:
|
||||||
|
reports = Path(tmp) / "reports"
|
||||||
|
reports.mkdir()
|
||||||
|
report = reports / "run-failed.md"
|
||||||
|
report.write_text(
|
||||||
|
"# run-failed\n\n"
|
||||||
|
"## Run Summary\n\n"
|
||||||
|
"- collected_samples: 0\n",
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
|
||||||
|
with patch("braiins_ratchet.guidance.REPORTS_DIR", reports):
|
||||||
|
completed = _recent_completed_watch("reports/run-failed.md", None)
|
||||||
|
|
||||||
|
self.assertIsNone(completed)
|
||||||
|
|
||||||
|
|
||||||
def _completed_watch(age_minutes: int) -> CompletedWatch:
|
def _completed_watch(age_minutes: int) -> CompletedWatch:
|
||||||
return CompletedWatch(
|
return CompletedWatch(
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue