autoresearch-quantum/tests/test_harness.py
saymrwulf 5d0c28f939 Harden teaching layer, add notebook execution tests, fix repo hygiene
Quality fixes:
- Add deprecation warnings to 5 silent no-op legacy wrappers in assess.py
- Remove dead code in tracker.py score_by_section (unused first loop)
- Remove unused variable in assess.py _check_order
- Fix .gitignore: add progress JSONs, checkpoints, .coverage, .DS_Store, LaTeX aux
- Fix "all three plans" → "all four plans" in learning_objectives.md
- Add teaching/ package to README project tree
- Add compendium to README paper tree

Testing:
- Add 43 unit tests for teaching/assess.py and tracker.py (quiz, predict_choice,
  reflect, order, checkpoint_summary, legacy wrapper deprecation warnings,
  tracker scoring, persistence, mastery calculation)
- Add notebook execution test suite (nbclient): all 11 notebooks execute without
  errors in a fresh kernel, structural validation (valid JSON, has code cells,
  has assessments, section parameters, learning objectives document)
- Overall test count: 185 passing (was 107), coverage: 85% (was ~25% in tests)

Toolchain:
- Add pytest-cov, ruff, nbclient, nbformat to dev dependencies
- Add ruff config (E, F, W, I, UP, B, SIM rules)
- Add coverage config with term-missing output
- Fix all ruff lint issues across src/ and tests/ (import sorting, unused imports)
- Fix Plan D notebook paths (configs/rungs → ../../configs/rungs)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-15 15:34:37 +02:00

455 lines
17 KiB
Python
Raw Permalink Blame History

from __future__ import annotations
from pathlib import Path
from qiskit.quantum_info import Statevector
from autoresearch_quantum.codes.four_two_two import STABILIZERS, encoded_magic_statevector
from autoresearch_quantum.execution.local import LocalCheapExecutor
from autoresearch_quantum.execution.transfer import TransferEvaluator
from autoresearch_quantum.experiments.encoded_magic_state import build_circuit_bundle
from autoresearch_quantum.lessons.feedback import (
build_lesson_feedback,
extract_search_rules,
narrow_search_space,
)
from autoresearch_quantum.models import (
CostWeights,
ExperimentSpec,
HardwareConfig,
LessonFeedback,
QualityWeights,
RungConfig,
RungProgress,
ScoreConfig,
SearchRule,
SearchSpaceConfig,
TierPolicyConfig,
TransferReport,
)
from autoresearch_quantum.persistence.store import ResearchStore
from autoresearch_quantum.ratchet.runner import AutoresearchHarness
from autoresearch_quantum.scoring.score import factory_throughput_score
from autoresearch_quantum.search.challengers import generate_neighbor_challengers
from autoresearch_quantum.search.strategies import (
LessonGuided,
RandomCombo,
default_composite,
)
def _test_rung(search_dimensions: dict[str, list[object]] | None = None) -> RungConfig:
    """Build a small, hardware-free rung-1 ``RungConfig`` for the tests.

    Args:
        search_dimensions: Search-space dimensions to use. When falsy
            (``None`` or empty), a single ``verification`` dimension with
            the values ``"both"`` and ``"z_only"`` is used instead.

    Returns:
        A rung configuration with a 64-shot bootstrap spec on
        ``fake_brisbane``, hardware disabled, and a step budget and
        patience of one.
    """
    bootstrap = ExperimentSpec(
        rung=1,
        target_backend="fake_brisbane",
        noise_backend="fake_brisbane",
        shots=64,
        repeats=1,
    )
    space = SearchSpaceConfig(
        dimensions=search_dimensions or {"verification": ["both", "z_only"]},
        max_challengers_per_step=4,
    )
    # Zero margins and a zero hardware budget keep everything on the simulator tier.
    policy = TierPolicyConfig(
        cheap_margin=0.0,
        confirmation_margin=0.0,
        cheap_shots=64,
        expensive_shots=128,
        cheap_repeats=1,
        expensive_repeats=1,
        promote_top_k=1,
        enable_hardware=False,
        confirm_incumbent_on_hardware=False,
        hardware_budget=0,
    )
    cheap_weights = QualityWeights(
        ideal_fidelity=0.2,
        noisy_fidelity=0.3,
        logical_witness=0.3,
        codespace_rate=0.1,
        stability_score=0.05,
        spectator_alignment=0.05,
    )
    expensive_weights = QualityWeights(
        logical_witness=0.6,
        codespace_rate=0.2,
        stability_score=0.1,
        spectator_alignment=0.1,
    )
    costs = CostWeights(
        two_qubit_count=0.05,
        depth=0.01,
        shot_count=0.0001,
        runtime_estimate=0.01,
        queue_cost_proxy=0.0,
    )
    scoring = ScoreConfig(
        cheap_quality=cheap_weights,
        expensive_quality=expensive_weights,
        cost_weights=costs,
    )
    return RungConfig(
        rung=1,
        name="test",
        description="test rung",
        objective="test objective",
        bootstrap_incumbent=bootstrap,
        search_space=space,
        tier_policy=policy,
        score=scoring,
        step_budget=1,
        patience=1,
        hardware=HardwareConfig(),
    )
# ── Original tests ──────────────────────────────────────────────────────────
def test_encoded_target_state_satisfies_stabilizers() -> None:
    """The encoded magic state has expectation value +1 for every stabilizer."""
    target = encoded_magic_statevector()
    assert isinstance(target, Statevector)
    for operator in STABILIZERS.values():
        deviation = abs(target.expectation_value(operator) - 1.0)
        assert deviation < 1e-8
def test_circuit_bundle_contains_expected_contexts() -> None:
    """The circuit bundle exposes the three witness contexts plus acceptance."""
    circuits = build_circuit_bundle(ExperimentSpec(rung=1))
    expected_contexts = {"logical_x", "logical_y", "spectator_z"}
    assert set(circuits.witness_circuits) == expected_contexts
    for context, witness in circuits.witness_circuits.items():
        # Each witness circuit is tagged with its own key and its operator.
        assert witness.metadata["context"] == context
        assert "logical_operator" in witness.metadata
    assert circuits.acceptance.metadata["context"] == "acceptance"
def test_local_executor_produces_score() -> None:
    """The local cheap executor yields a positive score and bounded metrics."""
    config = _test_rung()
    executor = LocalCheapExecutor()
    outcome = executor.evaluate(config.bootstrap_incumbent, config)
    assert outcome.score > 0.0
    assert 0.0 <= outcome.metrics.acceptance_rate <= 1.0
    # A missing (falsy) witness is treated as 0.0 for the range check.
    witness = outcome.metrics.logical_magic_witness or 0.0
    assert 0.0 <= witness <= 1.0
def test_neighbor_challengers_mutate_single_dimension() -> None:
    """Each neighbor challenger differs from the incumbent in exactly one field."""
    base_spec = ExperimentSpec(rung=1)
    space = SearchSpaceConfig(
        dimensions={
            "verification": ["both", "z_only"],
            "seed_style": ["h_p", "ry_rz"],
        },
        max_challengers_per_step=8,
    )
    neighbors = generate_neighbor_challengers(base_spec, space)
    assert len(neighbors) == 2
    for neighbor in neighbors:
        differing = []
        for name in base_spec.__dataclass_fields__:
            if getattr(base_spec, name) != getattr(neighbor.spec, name):
                differing.append(name)
        assert len(differing) == 1
def test_ratchet_step_persists_incumbent_and_step(tmp_path: Path) -> None:
    """One ratchet step writes the incumbent file and at least one step record."""
    dimensions = {
        "verification": ["both", "z_only"],
        "postselection": ["all_measured", "z_only"],
    }
    config = _test_rung(dimensions)
    runner = AutoresearchHarness(ResearchStore(tmp_path))
    result = runner.run_ratchet_step(config, allow_hardware=False)
    assert result.step_index == 1
    rung_dir = tmp_path / "rung_1"
    assert (rung_dir / "incumbent.json").exists()
    step_files = list((rung_dir / "ratchet_steps").glob("*.json"))
    assert step_files
# ── New tests: challenger strategies ────────────────────────────────────────
def test_neighbor_walk_respects_history() -> None:
    """Challengers whose fingerprints are already in the history are not re-emitted."""
    base_spec = ExperimentSpec(rung=1)
    space = SearchSpaceConfig(
        dimensions={"verification": ["both", "z_only"], "seed_style": ["h_p", "ry_rz"]},
        max_challengers_per_step=8,
    )
    # Pass 1: collect the fingerprint of every neighbor.
    first_pass = generate_neighbor_challengers(base_spec, space)
    seen = set()
    for candidate in first_pass:
        seen.add(candidate.spec.fingerprint())
    # Pass 2: with all of them in the history, nothing new may come back.
    second_pass = generate_neighbor_challengers(base_spec, space, history=seen)
    assert len(second_pass) == 0
def test_random_combo_generates_multi_axis_mutations() -> None:
    """``RandomCombo`` yields at least one challenger that changes several fields.

    The multi-axis check is probabilistic; with 10 candidates over three
    dimensions a failure is extremely unlikely.
    """
    base_spec = ExperimentSpec(rung=1)
    space = SearchSpaceConfig(
        dimensions={
            "verification": ["both", "z_only", "x_only"],
            "seed_style": ["h_p", "ry_rz", "u_magic"],
            "optimization_level": [1, 2, 3],
        },
        max_challengers_per_step=10,
    )

    def _changed_field_count(candidate) -> int:
        # Number of dataclass fields on which the candidate differs from the incumbent.
        count = 0
        for name in base_spec.__dataclass_fields__:
            if getattr(base_spec, name) != getattr(candidate.spec, name):
                count += 1
        return count

    generator = RandomCombo(num_candidates=10, max_mutations=3)
    candidates = generator.generate(base_spec, space, set())
    assert len(candidates) > 0
    # At least one challenger should mutate more than one dimension.
    multi_axis = [cand for cand in candidates if _changed_field_count(cand) > 1]
    assert len(multi_axis) > 0
def test_lesson_guided_uses_rules() -> None:
    """``LessonGuided`` honours a ``fix`` rule: every challenger uses ``ry_rz``."""
    base_spec = ExperimentSpec(rung=1)
    space = SearchSpaceConfig(
        dimensions={
            "verification": ["both", "z_only", "x_only"],
            "seed_style": ["h_p", "ry_rz", "u_magic"],
        },
        max_challengers_per_step=8,
    )
    prefer_z_only = SearchRule("verification", "prefer", "z_only", 0.8, "top performer")
    avoid_h_p = SearchRule("seed_style", "avoid", "h_p", 0.6, "consistently poor")
    fix_ry_rz = SearchRule("seed_style", "fix", "ry_rz", 0.9, "all top-K use this")
    lessons = LessonFeedback(
        rung=1,
        rules=[prefer_z_only, avoid_h_p, fix_ry_rz],
        narrowed_dimensions={},
        best_spec_fields={},
    )
    generator = LessonGuided(num_candidates=6)
    candidates = generator.generate(base_spec, space, set(), [lessons])
    assert len(candidates) > 0
    # The fix rule pins seed_style, so no candidate may deviate from it.
    for candidate in candidates:
        assert candidate.spec.seed_style == "ry_rz"
def test_composite_generator_combines_strategies() -> None:
    """The default composite returns between 1 and 8 challengers for this search space."""
    base_spec = ExperimentSpec(rung=1)
    space = SearchSpaceConfig(
        dimensions={
            "verification": ["both", "z_only", "x_only"],
            "seed_style": ["h_p", "ry_rz", "u_magic"],
            "optimization_level": [1, 2, 3],
        },
        max_challengers_per_step=8,
    )
    generator = default_composite(has_lessons=False)
    candidates = generator.generate(base_spec, space, set())
    produced = len(candidates)
    assert produced > 0
    assert produced <= 8
# ── New tests: lesson feedback ───────────────────────────────────────────────
def test_extract_search_rules_prefer_and_avoid() -> None:
    """Rule extraction prefers the high-scoring value and avoids the low-scoring one."""
    space = SearchSpaceConfig(
        dimensions={"verification": ["both", "z_only"]},
        max_challengers_per_step=4,
    )
    # (verification value, final score): z_only clearly outperforms both.
    observations = [
        ("z_only", 0.8),
        ("z_only", 0.85),
        ("z_only", 0.82),
        ("both", 0.5),
        ("both", 0.55),
        ("both", 0.52),
    ]
    records = [
        {"spec": {"verification": value}, "final_score": score}
        for value, score in observations
    ]
    extracted = extract_search_rules(records, space)
    triples = {(rule.dimension, rule.action, rule.value) for rule in extracted}
    assert ("verification", "prefer", "z_only") in triples
    assert ("verification", "avoid", "both") in triples
def test_narrow_search_space_removes_avoided() -> None:
    """Narrowing drops an avoided value and collapses a fixed dimension to one value."""
    space = SearchSpaceConfig(
        dimensions={
            "verification": ["both", "z_only", "x_only"],
            "seed_style": ["h_p", "ry_rz", "u_magic"],
        },
        max_challengers_per_step=8,
    )
    avoid_x_only = SearchRule("verification", "avoid", "x_only", 0.5, "poor")
    fix_ry_rz = SearchRule("seed_style", "fix", "ry_rz", 0.6, "best")
    result = narrow_search_space(space, [avoid_x_only, fix_ry_rz])
    assert "x_only" not in result.dimensions["verification"]
    assert result.dimensions["seed_style"] == ["ry_rz"]
def test_build_lesson_feedback_end_to_end() -> None:
    """Feedback built from records carries the rung, some rules and the best value."""
    space = SearchSpaceConfig(
        dimensions={"verification": ["both", "z_only"]},
        max_challengers_per_step=4,
    )
    # (verification value, final score) pairs fed to the feedback builder.
    observations = [
        ("z_only", 0.8),
        ("z_only", 0.85),
        ("both", 0.5),
        ("both", 0.55),
    ]
    records = [
        {"spec": {"verification": value}, "final_score": score}
        for value, score in observations
    ]
    result = build_lesson_feedback(1, records, space)
    assert result.rung == 1
    assert len(result.rules) > 0
    assert result.best_spec_fields["verification"] == "z_only"
# ── New tests: factory score ────────────────────────────────────────────────
def test_factory_throughput_score_produces_metrics() -> None:
    """The factory throughput score is positive and records factory metrics.

    Scores a hand-built ``EvaluationMetrics`` on the ``"cheap"`` tier and
    checks that score, quality and cost are all positive and that a
    ``factory_metrics`` entry is written into ``metrics.extra``.
    """
    import math

    from autoresearch_quantum.models import EvaluationMetrics

    metrics = EvaluationMetrics(
        ideal_encoded_fidelity=0.95,
        noisy_encoded_fidelity=0.85,
        logical_magic_witness=0.80,
        acceptance_rate=0.70,
        codespace_rate=0.65,
        stability_score=0.90,
        two_qubit_count=30,
        depth=50,
        shot_count=1024,
    )
    config = ScoreConfig(
        name="factory_throughput",
        cheap_quality=QualityWeights(
            noisy_fidelity=0.3,
            logical_witness=0.4,
            codespace_rate=0.2,
            stability_score=0.1,
        ),
    )
    score, quality, cost = factory_throughput_score(metrics, "cheap", config)
    assert score > 0.0
    assert quality > 0.0
    assert cost > 0.0
    assert "factory_metrics" in metrics.extra
    fm = metrics.extra["factory_metrics"]
    # Compare floats with a tolerance: exact equality would break as soon as
    # the value is derived through arithmetic instead of copied verbatim.
    assert math.isclose(fm["accepted_states_per_shot"], 0.70)
    assert fm["throughput_proxy"] > 0.0
def test_score_registry_has_factory() -> None:
    """The score registry contains a ``factory_throughput`` entry."""
    from autoresearch_quantum.scoring.score import SCORE_REGISTRY

    expected_name = "factory_throughput"
    assert expected_name in SCORE_REGISTRY
# ── New tests: transfer evaluation ───────────────────────────────────────────
def test_transfer_evaluator_runs_across_backends() -> None:
    """Transfer evaluation over one backend returns a scored ``TransferReport``."""
    config = _test_rung()
    backends = ["fake_brisbane"]  # A single backend keeps the test fast.
    outcome = TransferEvaluator().evaluate_across_backends(
        config.bootstrap_incumbent,
        backends,
        config,
    )
    assert isinstance(outcome, TransferReport)
    assert outcome.transfer_score > 0.0
    assert "fake_brisbane" in outcome.per_backend_scores
# ── New tests: persistence (progress, feedback) ──────────────────────────────
def test_save_and_load_progress(tmp_path: Path) -> None:
    """A saved ``RungProgress`` can be loaded back with the same field values."""
    repository = ResearchStore(tmp_path)
    original = RungProgress(
        rung=1,
        steps_completed=2,
        patience_remaining=1,
        current_incumbent_id="r1-incumbent-abc123",
        completed=False,
    )
    repository.save_progress(original)
    restored = repository.load_progress(1)
    assert restored is not None
    assert restored.steps_completed == 2
    assert restored.current_incumbent_id == "r1-incumbent-abc123"
    assert not restored.completed
def test_save_and_load_lesson_feedback(tmp_path: Path) -> None:
    """A saved ``LessonFeedback`` can be loaded back with its single rule intact."""
    repository = ResearchStore(tmp_path)
    prefer_z_only = SearchRule("verification", "prefer", "z_only", 0.8, "good")
    original = LessonFeedback(
        rung=1,
        rules=[prefer_z_only],
        narrowed_dimensions={"verification": ["z_only"]},
        best_spec_fields={"verification": "z_only"},
    )
    repository.save_lesson_feedback(original)
    restored = repository.load_lesson_feedback(1)
    assert restored is not None
    assert len(restored.rules) == 1
    restored_rule = restored.rules[0]
    assert restored_rule.dimension == "verification"
    assert restored_rule.action == "prefer"
# ── New tests: resumability in harness ──────────────────────────────────────
def test_run_rung_saves_progress(tmp_path: Path) -> None:
    """Running a rung stores a progress record that is marked completed."""
    config = _test_rung({"verification": ["both", "z_only"]})
    repository = ResearchStore(tmp_path)
    runner = AutoresearchHarness(repository)
    # Only the step list is inspected here; lesson and feedback are ignored.
    executed_steps, _lesson, _feedback = runner.run_rung(config, allow_hardware=False)
    assert len(executed_steps) >= 1
    saved = repository.load_progress(1)
    assert saved is not None
    assert saved.completed
def test_run_rung_returns_lesson_and_feedback(tmp_path: Path) -> None:
    """``run_rung`` returns a lesson and a ``LessonFeedback`` for rung 1."""
    config = _test_rung({"verification": ["both", "z_only"]})
    runner = AutoresearchHarness(ResearchStore(tmp_path))
    # The step list is not needed for this check.
    _steps, rung_lesson, rung_feedback = runner.run_rung(config, allow_hardware=False)
    assert rung_lesson.rung == 1
    assert isinstance(rung_feedback, LessonFeedback)
    assert rung_feedback.rung == 1
# ── New tests: cross-rung propagation ────────────────────────────────────────
def test_run_ratchet_propagates_winner(tmp_path: Path) -> None:
    """A two-rung ratchet returns a (lesson, feedback) pair for each rung.

    Also checks that the harness has accumulated two lessons afterwards.
    """
    first_rung = _test_rung({"verification": ["both", "z_only"]})
    second_bootstrap = ExperimentSpec(
        rung=2,
        target_backend="fake_brisbane",
        noise_backend="fake_brisbane",
        shots=64,
        repeats=1,
    )
    second_space = SearchSpaceConfig(
        dimensions={"verification": ["both", "z_only"]},
        max_challengers_per_step=2,
    )
    # Rung 2 reuses rung 1's tier policy and scoring configuration.
    second_rung = RungConfig(
        rung=2,
        name="test rung 2",
        description="test rung 2",
        objective="test objective 2",
        bootstrap_incumbent=second_bootstrap,
        search_space=second_space,
        tier_policy=first_rung.tier_policy,
        score=first_rung.score,
        step_budget=1,
        patience=1,
        hardware=HardwareConfig(),
    )
    repository = ResearchStore(tmp_path)
    runner = AutoresearchHarness(repository)
    outcomes = runner.run_ratchet([first_rung, second_rung], allow_hardware=False)
    assert len(outcomes) == 2
    # Every rung must report both a lesson and a feedback object.
    for rung_lesson, rung_feedback in outcomes:
        assert rung_lesson is not None
        assert isinstance(rung_feedback, LessonFeedback)
    # The harness should hold one accumulated lesson per rung.
    assert len(runner._accumulated_lessons) == 2
# ── New tests: seed determinism fix ─────────────────────────────────────────
def test_different_specs_get_different_seeds() -> None:
    """Specs with different fingerprints hash to different seeds.

    The seed is computed here as the first 8 hex digits of the SHA-256 of
    ``"<fingerprint>-0"``.
    """
    import hashlib

    def _seed_for(spec) -> int:
        # Interpret the first 8 hex digits of the digest as an integer seed.
        payload = f"{spec.fingerprint()}-0".encode()
        digest = hashlib.sha256(payload).hexdigest()
        return int(digest[:8], 16)

    first_spec = ExperimentSpec(rung=1, verification="both")
    second_spec = ExperimentSpec(rung=1, verification="z_only")
    first_seed = _seed_for(first_spec)
    second_seed = _seed_for(second_spec)
    assert first_seed != second_seed