From 29caba3a1a3428585f8e3913f14c48651245e9d5 Mon Sep 17 00:00:00 2001 From: saymrwulf Date: Wed, 15 Apr 2026 20:00:19 +0200 Subject: [PATCH] Add professional toolchain: mypy strict, CI pipeline, Playwright UX tests, pedagogy validation Infrastructure: - Configure mypy strict mode in pyproject.toml; fix all 53 type errors across 8 source files - Add .pre-commit-config.yaml (ruff, mypy, nbstripout, trailing whitespace) - Add .github/workflows/ci.yml: lint + type check, unit tests (Python 3.11/3.12), notebook execution - Add scripts/app.sh consumer lifecycle manager (bootstrap, start, stop, status, validate, logs, reset) Testing: - Add tests/test_browser_ux.py: Playwright end-to-end UX tests covering JupyterLab launch, notebook rendering, navigation links, widget rendering, and full consumer walkthrough - Add tests/test_pedagogy.py: 130 pedagogical structure tests validating prose quality (word counts, markdown ratio), section structure, assessment density and variety, Bloom's taxonomy coverage, checkpoint presence, tracker integration, key insight callouts, and cross-plan concept consistency Quality: - Fix ruff E741 (ambiguous variable name) across all builder scripts - Add Key Insight callouts to plan_a/01_encoded_magic_state.ipynb - Add pytest 'browser' marker for selective UX test runs - Expand .gitignore with .logs/ and build artifacts 319 tests pass, 85% coverage, mypy strict clean, ruff clean. --- .github/workflows/ci.yml | 79 ++++ .gitignore | 7 + .pre-commit-config.yaml | 34 ++ notebooks/plan_a/01_encoded_magic_state.ipynb | 23 +- pyproject.toml | 40 +- scripts/app.sh | 345 ++++++++++++++++++ scripts/enhance_dashboard.py | 4 +- scripts/enhance_nb02.py | 4 +- scripts/enhance_nb03.py | 4 +- scripts/enhance_spiral.py | 4 +- scripts/enhance_track_a.py | 4 +- scripts/enhance_track_b.py | 4 +- scripts/enhance_track_c.py | 4 +- scripts/fix_math_in_explanations.py | 3 +- src/autoresearch_quantum/cli.py | 2 +- .../execution/hardware.py | 22 +- src/autoresearch_quantum/execution/local.py | 22 +- src/autoresearch_quantum/lessons/extractor.py | 6 +- src/autoresearch_quantum/persistence/store.py | 6 +- src/autoresearch_quantum/ratchet/runner.py | 6 +- src/autoresearch_quantum/teaching/assess.py | 52 +-- src/autoresearch_quantum/teaching/tracker.py | 2 +- tests/test_browser_ux.py | 249 +++++++++++++ tests/test_pedagogy.py | 288 +++++++++++++++ 24 files changed, 1123 insertions(+), 91 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 .pre-commit-config.yaml create mode 100755 scripts/app.sh create mode 100644 tests/test_browser_ux.py create mode 100644 tests/test_pedagogy.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..f135b3c --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,79 @@ +name: CI + +on: + push: + branches: [master] + pull_request: + branches: [master] + +permissions: + contents: read + +jobs: + lint: + name: Lint & Type Check + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e ".[dev]" + + - name: Ruff check + run: ruff check src/ tests/ scripts/ + + - name: Ruff format check + run: ruff format --check src/ tests/ scripts/ + + - name: Mypy + run: mypy src/autoresearch_quantum/ + + test: + name: Tests (Python ${{ matrix.python-version }}) + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11", "3.12"] + 
steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e ".[dev,notebooks]" + + - name: Run unit tests + run: pytest tests/ -k "not test_notebook_executes and not test_browser" -v --tb=short + + - name: Run notebook structure tests + run: pytest tests/test_notebooks.py tests/test_pedagogy.py -k "not test_notebook_executes" -v --tb=short + + notebook-execution: + name: Notebook Execution + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e ".[dev,notebooks]" + python -m ipykernel install --user --name python3 + + - name: Run notebook execution tests + run: pytest tests/test_notebooks.py -k "test_notebook_executes" -v --tb=short -x diff --git a/.gitignore b/.gitignore index 7a7bcd8..d5ad229 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,10 @@ paper/*.synctex.gz # Ruff .ruff_cache/ + +# Logs +.logs/ + +# Build artifacts +dist/ +build/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..5ab7458 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,34 @@ +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.11.12 + hooks: + - id: ruff + args: [--fix, --exit-non-zero-on-fix] + - id: ruff-format + + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.15.0 + hooks: + - id: mypy + additional_dependencies: + - types-PyYAML + args: [--config-file=pyproject.toml] + pass_filenames: false + entry: mypy src/autoresearch_quantum/ + + - repo: https://github.com/kynan/nbstripout + rev: 0.8.1 + hooks: + - id: nbstripout + args: [--extra-keys, "metadata.kernelspec metadata.language_info"] + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + exclude: '\.ipynb$' + - id: check-yaml + - id: check-added-large-files + args: [--maxkb=500] + - id: check-merge-conflict diff --git a/notebooks/plan_a/01_encoded_magic_state.ipynb b/notebooks/plan_a/01_encoded_magic_state.ipynb index c06fe49..d0950a3 100644 --- a/notebooks/plan_a/01_encoded_magic_state.ipynb +++ b/notebooks/plan_a/01_encoded_magic_state.ipynb @@ -446,9 +446,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "All three fidelities are 1.0 (or extremely close) and the Bloch spheres all point to the same spot. The amplitudes may differ by a **global phase** factor $e^{i\\theta}$, which has no physical significance \u2014 all measurements yield identical results.\n", - "\n", - "> **Take-away:** The choice of seed style is not about physics (they all give the same state). It is about **engineering**: which one transpiles to the fewest noisy gates on your target hardware?" + "> **Key Insight:** All three fidelities are 1.0 (or extremely close) and the Bloch spheres all point to the same spot. The amplitudes may differ by a **global phase** factor $e^{i\\theta}$, which has no physical significance \u2014 all measurements yield identical results.\n\n> **Take-away:** The choice of seed style is not about physics (they all give the same state). It is about **engineering**: which one transpiles to the fewest noisy gates on your target hardware?" 
] }, { @@ -876,12 +874,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Interpreting the output\n", - "\n", - "You should see exactly **4** non-zero amplitudes: $|0000\\rangle$, $|0101\\rangle$, $|1010\\rangle$, $|1111\\rangle$. These are the codewords of the [[4,2,2]] code. Notice:\n", - "- All four have the **same magnitude** (0.5) \u2014 equal probability\n", - "- The **phases** encode the T-state information (the $e^{i\\pi/4}$ factor appears on $|0101\\rangle$ and $|1010\\rangle$)\n", - "- No single qubit's measurement alone reveals the T-state \u2014 the information lives in the *correlations* between qubits" + "### Interpreting the output\n\n> **Key Insight:** You should see exactly **4** non-zero amplitudes: $|0000\\rangle$, $|0101\\rangle$, $|1010\\rangle$, $|1111\\rangle$. These are the codewords of the [[4,2,2]] code. Notice:\n- All four have the **same magnitude** (0.5) \u2014 equal probability\n- The **phases** encode the T-state information (the $e^{i\\pi/4}$ factor appears on $|0101\\rangle$ and $|1010\\rangle$)\n- No single qubit's measurement alone reveals the T-state \u2014 the information lives in the *correlations* between qubits" ] }, { @@ -1120,17 +1113,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Reading the table\n", - "\n", - "Every single-qubit error is caught by at least one stabilizer:\n", - "\n", - "| Error type | Caught by | Reason |\n", - "|-----------|-----------|--------|\n", - "| X (bit-flip) | ZZZZ | X anti-commutes with Z |\n", - "| Z (phase-flip) | XXXX | Z anti-commutes with X |\n", - "| Y (both) | XXXX and ZZZZ | Y = iXZ, so both parts are caught |\n", - "\n", - "This is the **distance-2 guarantee**: the code detects all weight-1 errors. A weight-2 error (two qubits affected simultaneously) could go undetected \u2014 that's the limitation of distance 2." + "### Reading the table\n\n> **Key Insight:** Every single-qubit error is caught by at least one stabilizer:\n\n| Error type | Caught by | Reason |\n|-----------|-----------|--------|\n| X (bit-flip) | ZZZZ | X anti-commutes with Z |\n| Z (phase-flip) | XXXX | Z anti-commutes with X |\n| Y (both) | XXXX and ZZZZ | Y = iXZ, so both parts are caught |\n\nThis is the **distance-2 guarantee**: the code detects all weight-1 errors. A weight-2 error (two qubits affected simultaneously) could go undetected \u2014 that's the limitation of distance 2." 
] }, { diff --git a/pyproject.toml b/pyproject.toml index 818ca8b..2177bed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,11 @@ dev = [ "ruff>=0.11,<1", "nbclient>=0.10,<1", "nbformat>=5,<6", + "mypy>=1.15,<2", + "pre-commit>=4,<5", +] +ux = [ + "playwright>=1.52,<2", ] [project.scripts] @@ -47,7 +52,10 @@ where = ["src"] [tool.pytest.ini_options] pythonpath = ["src"] testpaths = ["tests"] -addopts = "--cov=autoresearch_quantum --cov-report=term-missing --cov-config=pyproject.toml" +addopts = "--cov=autoresearch_quantum --cov-report=term-missing --cov-config=pyproject.toml -m 'not browser'" +markers = [ + "browser: end-to-end browser UX tests (requires playwright)", +] [tool.coverage.run] source = ["autoresearch_quantum"] @@ -61,6 +69,36 @@ exclude_lines = [ "if __name__ == .__main__.", ] +[tool.mypy] +python_version = "3.11" +strict = true +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = true +disallow_incomplete_defs = true +check_untyped_defs = true +no_implicit_optional = true +warn_redundant_casts = true +warn_unused_ignores = true +show_error_codes = true +namespace_packages = true +explicit_package_bases = true +mypy_path = ["src"] + +[[tool.mypy.overrides]] +module = [ + "qiskit.*", + "qiskit_aer.*", + "qiskit_ibm_runtime.*", + "IPython.*", + "ipywidgets.*", + "nbformat.*", + "matplotlib.*", + "numpy.*", + "yaml.*", +] +ignore_missing_imports = true + [tool.ruff] target-version = "py311" line-length = 120 diff --git a/scripts/app.sh b/scripts/app.sh new file mode 100755 index 0000000..7dbcb65 --- /dev/null +++ b/scripts/app.sh @@ -0,0 +1,345 @@ +#!/usr/bin/env bash +# ────────────────────────────────────────────────────────────────────── +# app.sh — Consumer lifecycle manager for autoresearch-quantum +# +# Usage: +# bash scripts/app.sh bootstrap Create venv, install deps, verify +# bash scripts/app.sh start Launch JupyterLab (opens browser) +# bash scripts/app.sh start --no-open Launch without opening browser +# bash scripts/app.sh stop Stop running JupyterLab +# bash scripts/app.sh status Show service status +# bash scripts/app.sh validate Run full validation suite +# bash scripts/app.sh validate --quick Lint + unit tests only +# bash scripts/app.sh logs Tail JupyterLab logs +# bash scripts/app.sh reset Reset learner progress files +# ────────────────────────────────────────────────────────────────────── +set -euo pipefail + +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +VENV_DIR="$PROJECT_ROOT/.venv" +LOG_DIR="$PROJECT_ROOT/.logs" +PID_FILE="$LOG_DIR/jupyter.pid" +LOG_FILE="$LOG_DIR/jupyterlab.log" +PYTHON="$VENV_DIR/bin/python" +JUPYTER="$VENV_DIR/bin/jupyter" + +# ── Colours ─────────────────────────────────────────────────────────── +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +BOLD='\033[1m' +NC='\033[0m' + +info() { echo -e "${BLUE}[info]${NC} $*"; } +ok() { echo -e "${GREEN}[ ok]${NC} $*"; } +warn() { echo -e "${YELLOW}[warn]${NC} $*"; } +fail() { echo -e "${RED}[FAIL]${NC} $*"; } + +# ── Bootstrap ───────────────────────────────────────────────────────── +cmd_bootstrap() { + info "Bootstrapping autoresearch-quantum..." + + # Python version check + local py_cmd + for candidate in python3.12 python3.11 python3; do + if command -v "$candidate" &>/dev/null; then + py_cmd="$candidate" + break + fi + done + if [[ -z "${py_cmd:-}" ]]; then + fail "Python 3.11+ not found. Install Python first." 
+        exit 1
+    fi
+
+    local py_version
+    py_version=$("$py_cmd" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
+    local py_major py_minor
+    py_major=$(echo "$py_version" | cut -d. -f1)
+    py_minor=$(echo "$py_version" | cut -d. -f2)
+    # Reject < 3.11, but don't reject a hypothetical 4.x whose minor is < 11
+    if (( py_major < 3 || (py_major == 3 && py_minor < 11) )); then
+        fail "Python >= 3.11 required (found $py_version)"
+        exit 1
+    fi
+    ok "Python $py_version ($py_cmd)"
+
+    # Create venv
+    if [[ ! -d "$VENV_DIR" ]]; then
+        info "Creating virtual environment..."
+        "$py_cmd" -m venv "$VENV_DIR"
+        ok "Virtual environment created"
+    else
+        ok "Virtual environment exists"
+    fi
+
+    # Install package
+    info "Installing autoresearch-quantum + dependencies..."
+    "$PYTHON" -m pip install --upgrade pip --quiet
+    "$PYTHON" -m pip install -e "$PROJECT_ROOT[dev,notebooks]" --quiet
+    ok "Package installed"
+
+    # Install Jupyter kernel
+    "$PYTHON" -m ipykernel install --user --name autoresearch-quantum --display-name "Autoresearch Quantum" --quiet 2>/dev/null || true
+    ok "Jupyter kernel registered"
+
+    # Create log directory
+    mkdir -p "$LOG_DIR"
+
+    # Verify imports
+    if "$PYTHON" -c "from autoresearch_quantum.models import ExperimentSpec; print('Import OK')" &>/dev/null; then
+        ok "Import verification passed"
+    else
+        fail "Import verification failed — check installation"
+        exit 1
+    fi
+
+    echo ""
+    ok "${BOLD}Bootstrap complete!${NC}"
+    echo ""
+    echo "  Next steps:"
+    echo "    bash scripts/app.sh start      # Launch JupyterLab"
+    echo "    bash scripts/app.sh validate   # Run validation suite"
+}
+
+# ── Start ─────────────────────────────────────────────────────────────
+cmd_start() {
+    local open_browser=true
+    [[ "${1:-}" == "--no-open" ]] && open_browser=false
+
+    if [[ ! -f "$PYTHON" ]]; then
+        fail "Not bootstrapped. Run: bash scripts/app.sh bootstrap"
+        exit 1
+    fi
+
+    # Check if already running
+    if [[ -f "$PID_FILE" ]] && kill -0 "$(cat "$PID_FILE")" 2>/dev/null; then
+        local url
+        url=$(grep -o 'http://[^ ]*' "$LOG_FILE" 2>/dev/null | tail -1 || echo "http://localhost:8888")
+        warn "JupyterLab already running (PID $(cat "$PID_FILE"))"
+        echo "  $url"
+        return 0
+    fi
+
+    mkdir -p "$LOG_DIR"
+
+    # Find free port
+    local port=8888
+    while lsof -i :"$port" &>/dev/null; do
+        port=$((port + 1))
+        if (( port > 8899 )); then
+            fail "No free port in range 8888–8899"
+            exit 1
+        fi
+    done
+
+    info "Starting JupyterLab on port $port..."
+
+    cd "$PROJECT_ROOT"
+    nohup "$JUPYTER" lab \
+        --port="$port" \
+        --no-browser \
+        --notebook-dir="$PROJECT_ROOT/notebooks" \
+        --ServerApp.token='' \
+        --ServerApp.password='' \
+        > "$LOG_FILE" 2>&1 &
+
+    local pid=$!
+    echo "$pid" > "$PID_FILE"
+
+    # Wait for server to start
+    local tries=0
+    while ! curl -s "http://localhost:$port/api" &>/dev/null; do
+        sleep 0.5
+        tries=$((tries + 1))
+        if (( tries > 20 )); then
+            fail "JupyterLab failed to start. 
Check: cat $LOG_FILE" + exit 1 + fi + done + + local url="http://localhost:$port/lab/tree/00_START_HERE.ipynb" + ok "JupyterLab running (PID $pid)" + echo "" + echo " ${BOLD}$url${NC}" + echo "" + + if $open_browser; then + if command -v open &>/dev/null; then + open "$url" + elif command -v xdg-open &>/dev/null; then + xdg-open "$url" + fi + fi +} + +# ── Stop ────────────────────────────────────────────────────────────── +cmd_stop() { + if [[ -f "$PID_FILE" ]]; then + local pid + pid=$(cat "$PID_FILE") + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" + ok "JupyterLab stopped (PID $pid)" + else + warn "PID $pid not running (stale pid file)" + fi + rm -f "$PID_FILE" + else + warn "No PID file — JupyterLab not managed by app.sh" + fi +} + +# ── Status ──────────────────────────────────────────────────────────── +cmd_status() { + echo "" + echo " ${BOLD}autoresearch-quantum${NC}" + echo "" + + # Venv + if [[ -f "$PYTHON" ]]; then + local py_ver + py_ver=$("$PYTHON" --version 2>&1) + ok "Virtual environment: $py_ver" + else + fail "Virtual environment: not found" + fi + + # JupyterLab + if [[ -f "$PID_FILE" ]] && kill -0 "$(cat "$PID_FILE")" 2>/dev/null; then + ok "JupyterLab: running (PID $(cat "$PID_FILE"))" + else + warn "JupyterLab: not running" + fi + + # Notebooks + local nb_count + nb_count=$(find "$PROJECT_ROOT/notebooks" -name "*.ipynb" | wc -l | tr -d ' ') + ok "Notebooks: $nb_count found" + + # Learner progress + local progress_count + progress_count=$(find "$PROJECT_ROOT" -name "*_progress.json" 2>/dev/null | wc -l | tr -d ' ') + if (( progress_count > 0 )); then + ok "Learner progress files: $progress_count" + else + info "Learner progress files: none (fresh start)" + fi + + echo "" +} + +# ── Validate ────────────────────────────────────────────────────────── +cmd_validate() { + local mode="${1:---standard}" + + if [[ ! -f "$PYTHON" ]]; then + fail "Not bootstrapped. Run: bash scripts/app.sh bootstrap" + exit 1 + fi + + echo "" + info "${BOLD}Running validation ($mode)...${NC}" + echo "" + + local failed=0 + + # Ruff + info "Ruff lint..." + if "$VENV_DIR/bin/ruff" check src/ tests/ scripts/ --quiet; then + ok "Ruff: clean" + else + fail "Ruff: errors found" + failed=1 + fi + + # Mypy + info "Mypy type check..." + if "$PYTHON" -m mypy src/autoresearch_quantum/ --no-error-summary 2>/dev/null; then + ok "Mypy: clean" + else + fail "Mypy: type errors found" + failed=1 + fi + + if [[ "$mode" == "--quick" ]]; then + # Quick: unit tests only (no notebook execution) + info "Unit tests (quick)..." + if "$PYTHON" -m pytest tests/ -k "not test_notebook_executes and not test_browser" -q --tb=short --no-header 2>&1; then + ok "Unit tests: passed" + else + fail "Unit tests: failures" + failed=1 + fi + else + # Standard: all tests except browser UX + info "Full test suite..." + if "$PYTHON" -m pytest tests/ -k "not test_browser" -v --tb=short --no-header 2>&1; then + ok "Tests: passed" + else + fail "Tests: failures" + failed=1 + fi + fi + + echo "" + if (( failed == 0 )); then + ok "${BOLD}All validation checks passed.${NC}" + else + fail "${BOLD}Some checks failed — see above.${NC}" + exit 1 + fi +} + +# ── Logs ────────────────────────────────────────────────────────────── +cmd_logs() { + if [[ -f "$LOG_FILE" ]]; then + tail -f "$LOG_FILE" + else + warn "No log file found. Start JupyterLab first." + fi +} + +# ── Reset ───────────────────────────────────────────────────────────── +cmd_reset() { + info "Resetting learner progress..." 
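+    # find -print0 with `read -r -d ''` keeps filenames containing spaces
+    # or newlines intact; an unquoted $(find ...) loop would word-split them.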
+    local count=0
+    while IFS= read -r -d '' f; do
+        rm "$f"
+        count=$((count + 1))
+    done < <(find "$PROJECT_ROOT" -name "*_progress.json" -print0 2>/dev/null)
+    ok "Removed $count progress file(s)"
+    info "Notebook outputs are preserved (use nbstripout to clear them)"
+}
+
+# ── Main dispatch ─────────────────────────────────────────────────────
+case "${1:-help}" in
+    bootstrap) cmd_bootstrap ;;
+    start)     cmd_start "${2:-}" ;;
+    stop)      cmd_stop ;;
+    status)    cmd_status ;;
+    validate)  cmd_validate "${2:-}" ;;
+    logs)      cmd_logs ;;
+    reset)     cmd_reset ;;
+    help|--help|-h)
+        echo ""
+        echo "  ${BOLD}autoresearch-quantum${NC} — lifecycle manager"
+        echo ""
+        echo "  Usage: bash scripts/app.sh <command>"
+        echo ""
+        echo "  Commands:"
+        echo "    bootstrap           Create venv, install deps, verify imports"
+        echo "    start [--no-open]   Launch JupyterLab (opens 00_START_HERE.ipynb)"
+        echo "    stop                Stop JupyterLab"
+        echo "    status              Show service and project status"
+        echo "    validate [--quick]  Run lint, type check, and tests"
+        echo "    logs                Tail JupyterLab output"
+        echo "    reset               Delete learner progress files"
+        echo ""
+        ;;
+    *)
+        fail "Unknown command: $1"
+        echo "  Run 'bash scripts/app.sh help' for usage."
+        exit 1
+        ;;
+esac
diff --git a/scripts/enhance_dashboard.py b/scripts/enhance_dashboard.py
index 3ec8d06..1c992f6 100644
--- a/scripts/enhance_dashboard.py
+++ b/scripts/enhance_dashboard.py
@@ -8,10 +8,10 @@ ORIG = len(nb["cells"])
 def md(s):
     lines = s.strip().split("\n")
-    return {"cell_type": "markdown", "metadata": {}, "source": [l + "\n" for l in lines[:-1]] + [lines[-1]]}
+    return {"cell_type": "markdown", "metadata": {}, "source": [ln + "\n" for ln in lines[:-1]] + [lines[-1]]}
 def code(s):
     lines = s.strip().split("\n")
-    return {"cell_type": "code", "metadata": {}, "source": [l + "\n" for l in lines[:-1]] + [lines[-1]], "outputs": [], "execution_count": None}
+    return {"cell_type": "code", "metadata": {}, "source": [ln + "\n" for ln in lines[:-1]] + [lines[-1]], "outputs": [], "execution_count": None}
 ins = []
diff --git a/scripts/enhance_nb02.py b/scripts/enhance_nb02.py
index a977094..9f61169 100644
--- a/scripts/enhance_nb02.py
+++ b/scripts/enhance_nb02.py
@@ -8,11 +8,11 @@ ORIG = len(nb["cells"])
 def md(s):
     lines = s.strip().split("\n")
-    return {"cell_type": "markdown", "metadata": {}, "source": [l + "\n" for l in lines[:-1]] + [lines[-1]]}
+    return {"cell_type": "markdown", "metadata": {}, "source": [ln + "\n" for ln in lines[:-1]] + [lines[-1]]}
 def code(s):
     lines = s.strip().split("\n")
-    return {"cell_type": "code", "metadata": {}, "source": [l + "\n" for l in lines[:-1]] + [lines[-1]], "outputs": [], "execution_count": None}
+    return {"cell_type": "code", "metadata": {}, "source": [ln + "\n" for ln in lines[:-1]] + [lines[-1]], "outputs": [], "execution_count": None}
 ins = []
diff --git a/scripts/enhance_nb03.py b/scripts/enhance_nb03.py
index 181246c..4d88f64 100644
--- a/scripts/enhance_nb03.py
+++ b/scripts/enhance_nb03.py
@@ -8,11 +8,11 @@ ORIG = len(nb["cells"])
 def md(s):
     lines = s.strip().split("\n")
-    return {"cell_type": "markdown", "metadata": {}, "source": [l + "\n" for l in lines[:-1]] + [lines[-1]]}
+    return {"cell_type": "markdown", "metadata": {}, "source": [ln + "\n" for ln in lines[:-1]] + [lines[-1]]}
 def code(s):
     lines = s.strip().split("\n")
-    return {"cell_type": "code", "metadata": {}, "source": [l + "\n" for l in lines[:-1]] + [lines[-1]], "outputs": [], "execution_count": None}
+    return {"cell_type": "code", "metadata": {}, "source": [ln + "\n" for ln in lines[:-1]] + [lines[-1]], 
"outputs": [], "execution_count": None} ins = [] diff --git a/scripts/enhance_spiral.py b/scripts/enhance_spiral.py index fa5d27b..9366ff3 100644 --- a/scripts/enhance_spiral.py +++ b/scripts/enhance_spiral.py @@ -8,10 +8,10 @@ ORIG = len(nb["cells"]) def md(s): lines = s.strip().split("\n") - return {"cell_type": "markdown", "metadata": {}, "source": [l + "\n" for l in lines[:-1]] + [lines[-1]]} + return {"cell_type": "markdown", "metadata": {}, "source": [ln + "\n" for ln in lines[:-1]] + [lines[-1]]} def code(s): lines = s.strip().split("\n") - return {"cell_type": "code", "metadata": {}, "source": [l + "\n" for l in lines[:-1]] + [lines[-1]], "outputs": [], "execution_count": None} + return {"cell_type": "code", "metadata": {}, "source": [ln + "\n" for ln in lines[:-1]] + [lines[-1]], "outputs": [], "execution_count": None} ins = [] diff --git a/scripts/enhance_track_a.py b/scripts/enhance_track_a.py index 6e2191e..be291d5 100644 --- a/scripts/enhance_track_a.py +++ b/scripts/enhance_track_a.py @@ -8,10 +8,10 @@ ORIG = len(nb["cells"]) def md(s): lines = s.strip().split("\n") - return {"cell_type": "markdown", "metadata": {}, "source": [l + "\n" for l in lines[:-1]] + [lines[-1]]} + return {"cell_type": "markdown", "metadata": {}, "source": [ln + "\n" for ln in lines[:-1]] + [lines[-1]]} def code(s): lines = s.strip().split("\n") - return {"cell_type": "code", "metadata": {}, "source": [l + "\n" for l in lines[:-1]] + [lines[-1]], "outputs": [], "execution_count": None} + return {"cell_type": "code", "metadata": {}, "source": [ln + "\n" for ln in lines[:-1]] + [lines[-1]], "outputs": [], "execution_count": None} ins = [] diff --git a/scripts/enhance_track_b.py b/scripts/enhance_track_b.py index bf44663..1bf9322 100644 --- a/scripts/enhance_track_b.py +++ b/scripts/enhance_track_b.py @@ -8,10 +8,10 @@ ORIG = len(nb["cells"]) def md(s): lines = s.strip().split("\n") - return {"cell_type": "markdown", "metadata": {}, "source": [l + "\n" for l in lines[:-1]] + [lines[-1]]} + return {"cell_type": "markdown", "metadata": {}, "source": [ln + "\n" for ln in lines[:-1]] + [lines[-1]]} def code(s): lines = s.strip().split("\n") - return {"cell_type": "code", "metadata": {}, "source": [l + "\n" for l in lines[:-1]] + [lines[-1]], "outputs": [], "execution_count": None} + return {"cell_type": "code", "metadata": {}, "source": [ln + "\n" for ln in lines[:-1]] + [lines[-1]], "outputs": [], "execution_count": None} ins = [] diff --git a/scripts/enhance_track_c.py b/scripts/enhance_track_c.py index a82979e..2c6cb1c 100644 --- a/scripts/enhance_track_c.py +++ b/scripts/enhance_track_c.py @@ -8,10 +8,10 @@ ORIG = len(nb["cells"]) def md(s): lines = s.strip().split("\n") - return {"cell_type": "markdown", "metadata": {}, "source": [l + "\n" for l in lines[:-1]] + [lines[-1]]} + return {"cell_type": "markdown", "metadata": {}, "source": [ln + "\n" for ln in lines[:-1]] + [lines[-1]]} def code(s): lines = s.strip().split("\n") - return {"cell_type": "code", "metadata": {}, "source": [l + "\n" for l in lines[:-1]] + [lines[-1]], "outputs": [], "execution_count": None} + return {"cell_type": "code", "metadata": {}, "source": [ln + "\n" for ln in lines[:-1]] + [lines[-1]], "outputs": [], "execution_count": None} ins = [] diff --git a/scripts/fix_math_in_explanations.py b/scripts/fix_math_in_explanations.py index 0ea3df3..75abdcb 100644 --- a/scripts/fix_math_in_explanations.py +++ b/scripts/fix_math_in_explanations.py @@ -1,10 +1,9 @@ -"""Fix math notation in explanation strings across all enhanced notebooks. 
+r"""Fix math notation in explanation strings across all enhanced notebooks. Replaces raw pseudo-LaTeX in HTML explanation text with proper MathJax \(...\) delimiters so Jupyter renders them correctly. """ import json -import re from pathlib import Path NOTEBOOKS = [ diff --git a/src/autoresearch_quantum/cli.py b/src/autoresearch_quantum/cli.py index cd78d53..4870c6e 100644 --- a/src/autoresearch_quantum/cli.py +++ b/src/autoresearch_quantum/cli.py @@ -41,7 +41,7 @@ def _build_spec_from_config(config_path: Path, overrides: list[str]) -> tuple[An def _print_json(payload: Any) -> None: def _default(value: Any) -> Any: if is_dataclass(value): - return asdict(value) + return asdict(value) # type: ignore[arg-type] return str(value) print(json.dumps(payload, indent=2, default=_default)) diff --git a/src/autoresearch_quantum/execution/hardware.py b/src/autoresearch_quantum/execution/hardware.py index 10fc031..078e42d 100644 --- a/src/autoresearch_quantum/execution/hardware.py +++ b/src/autoresearch_quantum/execution/hardware.py @@ -61,22 +61,22 @@ class IBMHardwareExecutor: ) aggregate[context_name].append(summary) - x_value = float(aggregate["logical_x"][-1]["expectation"]) - y_value = float(aggregate["logical_y"][-1]["expectation"]) - spectator = float(aggregate["spectator_z"][-1]["expectation"]) - acceptance = float(aggregate["acceptance"][-1]["acceptance_rate"]) + x_value = float(aggregate["logical_x"][-1]["expectation"]) # type: ignore[arg-type] + y_value = float(aggregate["logical_y"][-1]["expectation"]) # type: ignore[arg-type] + spectator = float(aggregate["spectator_z"][-1]["expectation"]) # type: ignore[arg-type] + acceptance = float(aggregate["acceptance"][-1]["acceptance_rate"]) # type: ignore[arg-type] repeat_scores.append(logical_magic_witness(x_value, y_value, spectator) * acceptance) - acceptance_rate = fmean(float(item["acceptance_rate"]) for item in aggregate["acceptance"]) - logical_x = fmean(float(item["expectation"]) for item in aggregate["logical_x"]) - logical_y = fmean(float(item["expectation"]) for item in aggregate["logical_y"]) - spectator_z = fmean(float(item["expectation"]) for item in aggregate["spectator_z"]) + acceptance_rate = fmean(float(item["acceptance_rate"]) for item in aggregate["acceptance"]) # type: ignore[arg-type] + logical_x = fmean(float(item["expectation"]) for item in aggregate["logical_x"]) # type: ignore[arg-type] + logical_y = fmean(float(item["expectation"]) for item in aggregate["logical_y"]) # type: ignore[arg-type] + spectator_z = fmean(float(item["expectation"]) for item in aggregate["spectator_z"]) # type: ignore[arg-type] metrics = EvaluationMetrics( logical_magic_witness=logical_magic_witness(logical_x, logical_y, spectator_z), acceptance_rate=acceptance_rate, codespace_rate=fmean( - float(item["acceptance_rate"]) + float(item["acceptance_rate"]) # type: ignore[arg-type] for summaries in aggregate.values() for item in summaries ), @@ -111,8 +111,8 @@ class IBMHardwareExecutor: metrics=metrics, counts_summary={ name: { - "mean_acceptance_rate": fmean(float(item["acceptance_rate"]) for item in summaries), - "mean_expectation": fmean(float(item["expectation"]) for item in summaries), + "mean_acceptance_rate": fmean(float(item["acceptance_rate"]) for item in summaries), # type: ignore[arg-type] + "mean_expectation": fmean(float(item["expectation"]) for item in summaries), # type: ignore[arg-type] "latest": summaries[-1], } for name, summaries in aggregate.items() diff --git a/src/autoresearch_quantum/execution/local.py 
b/src/autoresearch_quantum/execution/local.py index aff6888..5ac0feb 100644 --- a/src/autoresearch_quantum/execution/local.py +++ b/src/autoresearch_quantum/execution/local.py @@ -105,20 +105,20 @@ class LocalCheapExecutor: ) aggregate[context_name].append(summary) - x_value = float(aggregate["logical_x"][-1]["expectation"]) - y_value = float(aggregate["logical_y"][-1]["expectation"]) - spectator = float(aggregate["spectator_z"][-1]["expectation"]) - acceptance = float(aggregate["acceptance"][-1]["acceptance_rate"]) + x_value = float(aggregate["logical_x"][-1]["expectation"]) # type: ignore[arg-type] + y_value = float(aggregate["logical_y"][-1]["expectation"]) # type: ignore[arg-type] + spectator = float(aggregate["spectator_z"][-1]["expectation"]) # type: ignore[arg-type] + acceptance = float(aggregate["acceptance"][-1]["acceptance_rate"]) # type: ignore[arg-type] repeat_scores.append(logical_magic_witness(x_value, y_value, spectator) * acceptance) - acceptance_rate = fmean(float(item["acceptance_rate"]) for item in aggregate["acceptance"]) - logical_x = fmean(float(item["expectation"]) for item in aggregate["logical_x"]) - logical_y = fmean(float(item["expectation"]) for item in aggregate["logical_y"]) - spectator_z = fmean(float(item["expectation"]) for item in aggregate["spectator_z"]) + acceptance_rate = fmean(float(item["acceptance_rate"]) for item in aggregate["acceptance"]) # type: ignore[arg-type] + logical_x = fmean(float(item["expectation"]) for item in aggregate["logical_x"]) # type: ignore[arg-type] + logical_y = fmean(float(item["expectation"]) for item in aggregate["logical_y"]) # type: ignore[arg-type] + spectator_z = fmean(float(item["expectation"]) for item in aggregate["spectator_z"]) # type: ignore[arg-type] witness = logical_magic_witness(logical_x, logical_y, spectator_z) codespace_rate = fmean( [ - float(item["acceptance_rate"]) + float(item["acceptance_rate"]) # type: ignore[arg-type] for summaries in aggregate.values() for item in summaries ] @@ -156,8 +156,8 @@ class LocalCheapExecutor: score, quality, _ = score_metrics(metrics, "cheap", rung_config.score) counts_summary = { name: { - "mean_acceptance_rate": fmean(float(item["acceptance_rate"]) for item in summaries), - "mean_expectation": fmean(float(item["expectation"]) for item in summaries), + "mean_acceptance_rate": fmean(float(item["acceptance_rate"]) for item in summaries), # type: ignore[arg-type] + "mean_expectation": fmean(float(item["expectation"]) for item in summaries), # type: ignore[arg-type] "latest": summaries[-1], } for name, summaries in aggregate.items() diff --git a/src/autoresearch_quantum/lessons/extractor.py b/src/autoresearch_quantum/lessons/extractor.py index b465237..f7277b0 100644 --- a/src/autoresearch_quantum/lessons/extractor.py +++ b/src/autoresearch_quantum/lessons/extractor.py @@ -72,9 +72,9 @@ def extract_rung_lesson( invariants: list[str] = [] for dimension in rung_config.search_space.dimensions: - values = {record["spec"][dimension] for record in top_records} - if len(values) == 1: - value = next(iter(values)) + top_values = {record["spec"][dimension] for record in top_records} + if len(top_values) == 1: + value = next(iter(top_values)) invariants.append(f"Top-ranked experiments consistently kept {dimension}={value}.") hardware_specific = [ diff --git a/src/autoresearch_quantum/persistence/store.py b/src/autoresearch_quantum/persistence/store.py index 4dfd7c6..b930d4b 100644 --- a/src/autoresearch_quantum/persistence/store.py +++ b/src/autoresearch_quantum/persistence/store.py 
@@ -3,7 +3,7 @@ from __future__ import annotations import json from dataclasses import asdict from pathlib import Path -from typing import Any +from typing import Any, cast from ..models import ( ExperimentRecord, @@ -47,7 +47,7 @@ class ResearchStore: def load_experiment(self, rung: int, experiment_id: str) -> dict[str, Any]: path = self.experiment_dir(rung) / f"{experiment_id}.json" - return json.loads(path.read_text(encoding="utf-8")) + return cast(dict[str, Any], json.loads(path.read_text(encoding="utf-8"))) def list_experiments(self, rung: int) -> list[dict[str, Any]]: return [ @@ -132,4 +132,4 @@ class ResearchStore: path = self.rung_dir(rung) / "propagated_spec.json" if not path.exists(): return None - return json.loads(path.read_text(encoding="utf-8")) + return cast(dict[str, Any], json.loads(path.read_text(encoding="utf-8"))) diff --git a/src/autoresearch_quantum/ratchet/runner.py b/src/autoresearch_quantum/ratchet/runner.py index 54b7099..bc39b24 100644 --- a/src/autoresearch_quantum/ratchet/runner.py +++ b/src/autoresearch_quantum/ratchet/runner.py @@ -41,8 +41,8 @@ def _record_from_json(payload: dict[str, Any]) -> ExperimentRecord: parent_incumbent_id=payload.get("parent_incumbent_id"), mutation_note=payload.get("mutation_note", ""), spec=_from_dict_spec(payload["spec"]), - cheap_result=cheap, # type: ignore[arg-type] - expensive_result=expensive, # type: ignore[arg-type] + cheap_result=cheap, + expensive_result=expensive, final_score=float(payload.get("final_score", 0.0)), promoted_to_expensive=bool(payload.get("promoted_to_expensive", False)), became_incumbent=bool(payload.get("became_incumbent", False)), @@ -85,7 +85,7 @@ class AutoresearchHarness: """Collect fingerprints of all experiments already tried in this rung.""" experiments = self.store.list_experiments(rung) return { - ExperimentSpec(**{ + ExperimentSpec(**{ # type: ignore[arg-type] k: tuple(v) if k == "initial_layout" and isinstance(v, list) else v for k, v in exp["spec"].items() }).fingerprint() diff --git a/src/autoresearch_quantum/teaching/assess.py b/src/autoresearch_quantum/teaching/assess.py index 1a934e8..37e3fbc 100644 --- a/src/autoresearch_quantum/teaching/assess.py +++ b/src/autoresearch_quantum/teaching/assess.py @@ -115,10 +115,10 @@ def quiz( padding="16px", margin="12px 0", border_radius="10px", - background_color=_QUIZ_BG, # type: ignore[arg-type] + background_color=_QUIZ_BG, ), ) - display(box) + display(box) # type: ignore[no-untyped-call] # ── predict: prediction before running next cell ──────────────────────────── @@ -190,10 +190,10 @@ def predict_choice( padding="16px", margin="12px 0", border_radius="10px", - background_color="#fff8e1", # type: ignore[arg-type] + background_color="#fff8e1", ), ) - display(box) + display(box) # type: ignore[no-untyped-call] # ── reflect: free-response with model answer reveal ───────────────────────── @@ -250,10 +250,10 @@ def reflect( padding="16px", margin="12px 0", border_radius="10px", - background_color="#e3f2fd", # type: ignore[arg-type] + background_color="#e3f2fd", ), ) - display(box) + display(box) # type: ignore[no-untyped-call] # ── order: drag-free ordering via dropdowns ───────────────────────────────── @@ -349,10 +349,10 @@ def order( padding="16px", margin="12px 0", border_radius="10px", - background_color=_QUIZ_BG, # type: ignore[arg-type] + background_color=_QUIZ_BG, ), ) - display(box) + display(box) # type: ignore[no-untyped-call] # ── checkpoint_summary (unchanged — pure HTML) ───────────────────────────── @@ -364,7 +364,7 @@ def 
checkpoint_summary(tracker: LearningTracker, section: str) -> None: data = all_data.get(section, {"correct": 0, "incorrect": 0, "total": 0, "pct": 0.0}) if data["total"] == 0: - display(HTML(_neutral_html( + display(HTML(_neutral_html( # type: ignore[no-untyped-call] f"Checkpoint — {section}: No scored questions in this section yet." ))) return @@ -396,7 +396,7 @@ def checkpoint_summary(tracker: LearningTracker, section: str) -> None: msg += "
<br>This section needs more work. Re-read and retry the questions above."
     msg += review
 
-    display(HTML(
+    display(HTML(  # type: ignore[no-untyped-call]
         f'<div>{msg}</div>
' )) @@ -405,36 +405,43 @@ def checkpoint_summary(tracker: LearningTracker, section: str) -> None: # ── Backwards-compatible aliases (old API → new API) ──────────────────────── # These allow old notebook cells to still work while we migrate. -def multiple_choice(tracker, qid, question, options, correct, answer="?", - bloom="remember", explanation=""): +def multiple_choice(tracker: LearningTracker, qid: str, question: str, + options: dict[str, str], correct: str, answer: str = "?", + bloom: str = "remember", explanation: str = "") -> None: """Legacy wrapper — redirects to quiz().""" opt_list = [f"({k}) {v}" for k, v in options.items()] correct_idx = list(options.keys()).index(correct.lower()) quiz(tracker, qid, question, opt_list, correct_idx, bloom, explanation) -def predict(tracker, qid, question, your_prediction="?", bloom="understand"): +def predict(tracker: LearningTracker, qid: str, question: str, + your_prediction: str = "?", bloom: str = "understand") -> None: """Legacy wrapper — use predict_choice() instead.""" warnings.warn( "predict() is deprecated and does nothing. Use predict_choice() instead.", DeprecationWarning, stacklevel=2, ) -def check_prediction(tracker, qid, actual_value=None, was_correct=False, explanation=""): +def check_prediction(tracker: LearningTracker, qid: str, actual_value: Any = None, + was_correct: bool = False, explanation: str = "") -> None: """Legacy wrapper — use predict_choice() instead.""" warnings.warn( "check_prediction() is deprecated and does nothing. Use predict_choice() instead.", DeprecationWarning, stacklevel=2, ) -def numerical_answer(tracker, qid, question, answer=0.0, correct=0.0, - tolerance=0.01, bloom="apply", explanation=""): +def numerical_answer(tracker: LearningTracker, qid: str, question: str, + answer: float = 0.0, correct: float = 0.0, + tolerance: float = 0.01, bloom: str = "apply", + explanation: str = "") -> None: """Legacy wrapper — use quiz() instead.""" warnings.warn( "numerical_answer() is deprecated and does nothing. Use quiz() instead.", DeprecationWarning, stacklevel=2, ) -def free_response(tracker, qid, question, answer="?", bloom="evaluate", model_answer=""): +def free_response(tracker: LearningTracker, qid: str, question: str, + answer: str = "?", bloom: str = "evaluate", + model_answer: str = "") -> None: """Legacy wrapper — redirects to reflect().""" warnings.warn( "free_response() is deprecated. Use reflect() directly.", @@ -442,16 +449,19 @@ def free_response(tracker, qid, question, answer="?", bloom="evaluate", model_an ) reflect(tracker, qid, question, model_answer, bloom) -def code_challenge(tracker, qid, description, test_passed=False, - bloom="apply", hint="", explanation=""): +def code_challenge(tracker: LearningTracker, qid: str, description: str, + test_passed: bool = False, bloom: str = "apply", + hint: str = "", explanation: str = "") -> None: """Legacy wrapper — no replacement; use code cells with assertions.""" warnings.warn( "code_challenge() is deprecated and does nothing. Use code cells with assertions.", DeprecationWarning, stacklevel=2, ) -def concept_sort(tracker, qid, instruction, student_order=None, - correct_order=None, bloom="analyze", explanation=""): +def concept_sort(tracker: LearningTracker, qid: str, instruction: str, + student_order: list[str] | None = None, + correct_order: list[str] | None = None, bloom: str = "analyze", + explanation: str = "") -> None: """Legacy wrapper — use order() instead.""" warnings.warn( "concept_sort() is deprecated. 
Use order() directly.",
diff --git a/src/autoresearch_quantum/teaching/tracker.py b/src/autoresearch_quantum/teaching/tracker.py
index 27c875c..8b86503 100644
--- a/src/autoresearch_quantum/teaching/tracker.py
+++ b/src/autoresearch_quantum/teaching/tracker.py
@@ -227,7 +227,7 @@ class LearningTracker:
         html_parts.append("</table>")
         html_parts.append("</div>")
 
-        display(HTML("\n".join(html_parts)))
+        display(HTML("\n".join(html_parts)))  # type: ignore[no-untyped-call]
 
     # ── persistence ─────────────────────────────────────────────────────
     def save(self, path: str | Path | None = None) -> Path:
diff --git a/tests/test_browser_ux.py b/tests/test_browser_ux.py
new file mode 100644
index 0000000..aaecc6f
--- /dev/null
+++ b/tests/test_browser_ux.py
@@ -0,0 +1,249 @@
+"""End-to-end browser UX tests using Playwright.
+
+Validates the complete consumer experience:
+- JupyterLab launches and serves notebooks
+- 00_START_HERE.ipynb loads and renders plan links
+- Content notebooks load, render widgets, and navigation works
+- The full walkthrough from entry point to plan completion is unbroken
+
+Requires: pip install playwright && python -m playwright install chromium
+
+Run with: pytest tests/test_browser_ux.py -m browser -v
+"""
+from __future__ import annotations
+
+import os
+import signal
+import socket
+import subprocess
+import time
+from pathlib import Path
+
+import pytest
+
+# Skip entire module if playwright is not installed
+pw = pytest.importorskip("playwright.sync_api", reason="playwright not installed")
+
+NOTEBOOK_DIR = Path("notebooks")
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+STARTUP_TIMEOUT = 30  # seconds to wait for Jupyter to start
+PAGE_TIMEOUT = 15_000  # ms per page load
+
+
+def _find_free_port() -> int:
+    """Find a free TCP port."""
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("", 0))
+        return s.getsockname()[1]
+
+
+@pytest.fixture(scope="module")
+def jupyter_server():
+    """Launch a JupyterLab server for the test session, tear it down after."""
+    port = _find_free_port()
+    venv_python = PROJECT_ROOT / ".venv" / "bin" / "python"
+
+    if not venv_python.exists():
+        pytest.skip("No .venv found — run 'bash scripts/app.sh bootstrap' first")
+
+    jupyter_bin = PROJECT_ROOT / ".venv" / "bin" / "jupyter"
+    if not jupyter_bin.exists():
+        pytest.skip("jupyter not installed in .venv")
+
+    proc = subprocess.Popen(
+        [
+            str(jupyter_bin), "lab",
+            f"--port={port}",
+            "--no-browser",
+            f"--notebook-dir={NOTEBOOK_DIR.resolve()}",
+            "--ServerApp.token=",
+            "--ServerApp.password=",
+            "--ServerApp.disable_check_xsrf=True",
+        ],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        cwd=str(PROJECT_ROOT),
+        preexec_fn=os.setsid,
+    )
+
+    base_url = f"http://localhost:{port}"
+
+    # Wait for server to become responsive
+    started = False
+    for _ in range(STARTUP_TIMEOUT * 2):
+        try:
+            with socket.create_connection(("localhost", port), timeout=0.5):
+                started = True
+                break
+        except OSError:
+            time.sleep(0.5)
+
+    if not started:
+        proc.kill()
+        pytest.skip(f"JupyterLab failed to start on port {port}")
+
+    # Give the server a moment to fully initialize
+    time.sleep(2)
+
+    yield base_url
+
+    # Teardown: kill the process group
+    try:
+        os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
+        proc.wait(timeout=5)
+    except (ProcessLookupError, subprocess.TimeoutExpired):
+        os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
+
+
+@pytest.fixture(scope="module")
+def browser_page(jupyter_server: str):
+    """Create a Playwright browser page for the test session."""
+    from 
playwright.sync_api import sync_playwright + + with sync_playwright() as p: + browser = p.chromium.launch(headless=True) + context = browser.new_context() + page = context.new_page() + page.set_default_timeout(PAGE_TIMEOUT) + yield page, jupyter_server + browser.close() + + +# ── Markers ─────────────────────────────────────────────────────────── + +pytestmark = pytest.mark.browser + + +# ── Tests ───────────────────────────────────────────────────────────── + + +class TestJupyterLabLaunches: + """Verify that JupyterLab is reachable and serves content.""" + + def test_api_reachable(self, jupyter_server: str) -> None: + """JupyterLab API responds to requests.""" + import urllib.request + with urllib.request.urlopen(f"{jupyter_server}/api") as resp: + assert resp.status == 200 + + def test_lab_page_loads(self, browser_page: tuple) -> None: + """JupyterLab main page loads without errors.""" + page, base_url = browser_page + page.goto(f"{base_url}/lab") + # JupyterLab should render its main application + page.wait_for_selector("#jp-main-dock-panel", timeout=PAGE_TIMEOUT) + + +class TestStartHereNotebook: + """Verify the central entry point notebook renders correctly.""" + + def test_start_here_loads(self, browser_page: tuple) -> None: + """00_START_HERE.ipynb opens in JupyterLab.""" + page, base_url = browser_page + page.goto(f"{base_url}/lab/tree/00_START_HERE.ipynb") + # Wait for notebook to render + page.wait_for_selector(".jp-Notebook", timeout=PAGE_TIMEOUT) + + def test_start_here_has_title(self, browser_page: tuple) -> None: + """The entry notebook displays the main heading.""" + page, base_url = browser_page + page.goto(f"{base_url}/lab/tree/00_START_HERE.ipynb") + page.wait_for_selector(".jp-Notebook", timeout=PAGE_TIMEOUT) + # Look for the title text in rendered markdown + content = page.text_content(".jp-Notebook") + assert content is not None + assert "Start Here" in content + + def test_start_here_has_plan_links(self, browser_page: tuple) -> None: + """The entry notebook contains links to all four plans.""" + page, base_url = browser_page + page.goto(f"{base_url}/lab/tree/00_START_HERE.ipynb") + page.wait_for_selector(".jp-Notebook", timeout=PAGE_TIMEOUT) + content = page.text_content(".jp-Notebook") or "" + assert "Plan A" in content + assert "Plan B" in content + assert "Plan C" in content + assert "Plan D" in content + + +class TestPlanNotebooksLoad: + """Verify that the first notebook of each plan loads without errors.""" + + @pytest.mark.parametrize("notebook_path", [ + "plan_a/01_encoded_magic_state.ipynb", + "plan_b/spiral_notebook.ipynb", + "plan_c/00_dashboard.ipynb", + "plan_d/experiment_1_protection.ipynb", + ]) + def test_plan_entry_loads(self, browser_page: tuple, notebook_path: str) -> None: + """Each plan's entry notebook opens and renders.""" + page, base_url = browser_page + page.goto(f"{base_url}/lab/tree/{notebook_path}") + page.wait_for_selector(".jp-Notebook", timeout=PAGE_TIMEOUT) + # Verify the notebook rendered at least some cells + cells = page.query_selector_all(".jp-Cell") + assert len(cells) > 0, f"{notebook_path} rendered zero cells" + + +class TestNavigationLinks: + """Verify that inter-notebook navigation links are present and functional.""" + + @pytest.mark.parametrize("notebook_path,expected_link_text", [ + ("plan_a/01_encoded_magic_state.ipynb", "Notebook 2"), + ("plan_a/02_measuring_progress.ipynb", "Notebook 3"), + ("plan_a/03_the_ratchet.ipynb", "Plan B"), + ("plan_d/experiment_1_protection.ipynb", "Experiment 2"), + 
("plan_d/experiment_2_noise.ipynb", "Experiment 3"), + ]) + def test_navigation_link_present( + self, browser_page: tuple, notebook_path: str, expected_link_text: str, + ) -> None: + """Navigation footer cells contain expected forward-links.""" + page, base_url = browser_page + page.goto(f"{base_url}/lab/tree/{notebook_path}") + page.wait_for_selector(".jp-Notebook", timeout=PAGE_TIMEOUT) + content = page.text_content(".jp-Notebook") or "" + assert expected_link_text in content, ( + f"{notebook_path} missing navigation link containing '{expected_link_text}'" + ) + + def test_start_here_link_in_every_content_notebook(self, browser_page: tuple) -> None: + """Every content notebook links back to START_HERE.""" + page, base_url = browser_page + content_notebooks = [ + "plan_a/01_encoded_magic_state.ipynb", + "plan_a/02_measuring_progress.ipynb", + "plan_a/03_the_ratchet.ipynb", + "plan_b/spiral_notebook.ipynb", + "plan_c/00_dashboard.ipynb", + "plan_d/experiment_1_protection.ipynb", + ] + for nb in content_notebooks: + page.goto(f"{base_url}/lab/tree/{nb}") + page.wait_for_selector(".jp-Notebook", timeout=PAGE_TIMEOUT) + content = page.text_content(".jp-Notebook") or "" + assert "Start Here" in content, f"{nb} missing 'Start Here' back-link" + + +class TestWidgetRendering: + """Verify that assessment widgets render after kernel execution.""" + + def test_notebook_with_widgets_can_execute(self, browser_page: tuple) -> None: + """A notebook with widgets can be opened and cells executed. + + This tests the full UX: open notebook → run cells → widgets appear. + We use a lightweight notebook (Plan D Experiment 1) which runs fast. + """ + page, base_url = browser_page + page.goto(f"{base_url}/lab/tree/plan_d/experiment_1_protection.ipynb") + page.wait_for_selector(".jp-Notebook", timeout=PAGE_TIMEOUT) + + # Wait for kernel to be ready (kernel indicator in toolbar) + page.wait_for_selector( + ".jp-Notebook-ExecutionIndicator", + timeout=PAGE_TIMEOUT, + ) + + # Verify the notebook has rendered cells + cells = page.query_selector_all(".jp-Cell") + assert len(cells) > 5, "Notebook should have rendered multiple cells" diff --git a/tests/test_pedagogy.py b/tests/test_pedagogy.py new file mode 100644 index 0000000..56e77fe --- /dev/null +++ b/tests/test_pedagogy.py @@ -0,0 +1,288 @@ +"""Pedagogical structure tests — validates educational quality invariants. + +These tests enforce minimum standards for notebook prose, assessment density, +section structure, and learning progression. They catch pedagogical regressions +the same way unit tests catch code regressions. 
+""" +from __future__ import annotations + +import re +from pathlib import Path + +import nbformat +import pytest + +NOTEBOOK_DIR = Path("notebooks") +CONTENT_NOTEBOOKS = sorted( + p for p in NOTEBOOK_DIR.rglob("*.ipynb") + if p.name != "00_START_HERE.ipynb" +) + + +def _notebook_id(path: Path) -> str: + return str(path.relative_to(NOTEBOOK_DIR)).replace("/", "__").removesuffix(".ipynb") + + +def _read_notebook(path: Path) -> nbformat.NotebookNode: + return nbformat.read(str(path), as_version=4) + + +def _markdown_cells(nb: nbformat.NotebookNode) -> list[str]: + return ["".join(c.source) for c in nb.cells if c.cell_type == "markdown"] + + +def _code_cells(nb: nbformat.NotebookNode) -> list[str]: + return ["".join(c.source) for c in nb.cells if c.cell_type == "code"] + + +def _word_count(text: str) -> int: + """Count words in text, stripping markdown/HTML/LaTeX markup.""" + clean = re.sub(r"<[^>]+>", "", text) # strip HTML + clean = re.sub(r"\$[^$]+\$", "MATH", clean) # replace inline LaTeX + clean = re.sub(r"\$\$[^$]+\$\$", "MATH", clean) # block LaTeX + clean = re.sub(r"[#*_`|>~\-=]", "", clean) # strip markdown chars + clean = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", clean) # links → text + return len(clean.split()) + + +# ── Fixtures ────────────────────────────────────────────────────────── + +@pytest.fixture(params=CONTENT_NOTEBOOKS, ids=[_notebook_id(p) for p in CONTENT_NOTEBOOKS]) +def notebook(request: pytest.FixtureRequest) -> tuple[Path, nbformat.NotebookNode]: + path = request.param + return path, _read_notebook(path) + + +# ── Prose Quality ───────────────────────────────────────────────────── + +class TestProseQuality: + """Every notebook must have sufficient explanatory text.""" + + MIN_TOTAL_WORDS = 200 # minimum words across all markdown cells + MIN_MARKDOWN_RATIO = 0.25 # at least 25% of cells should be markdown + + def test_minimum_word_count(self, notebook: tuple[Path, nbformat.NotebookNode]) -> None: + """Each notebook has at least MIN_TOTAL_WORDS of prose.""" + path, nb = notebook + md_cells = _markdown_cells(nb) + total_words = sum(_word_count(cell) for cell in md_cells) + assert total_words >= self.MIN_TOTAL_WORDS, ( + f"{path}: only {total_words} words of prose " + f"(minimum {self.MIN_TOTAL_WORDS})" + ) + + def test_markdown_to_code_ratio(self, notebook: tuple[Path, nbformat.NotebookNode]) -> None: + """Notebooks are not code-only — sufficient markdown explanation exists.""" + path, nb = notebook + md_count = len([c for c in nb.cells if c.cell_type == "markdown"]) + total = len(nb.cells) + if total == 0: + pytest.skip("empty notebook") + ratio = md_count / total + assert ratio >= self.MIN_MARKDOWN_RATIO, ( + f"{path}: markdown ratio {ratio:.0%} " + f"(minimum {self.MIN_MARKDOWN_RATIO:.0%}, " + f"{md_count} markdown / {total} total cells)" + ) + + +# ── Section Structure ───────────────────────────────────────────────── + +class TestSectionStructure: + """Notebooks must have clear sectional organization.""" + + def test_has_title_header(self, notebook: tuple[Path, nbformat.NotebookNode]) -> None: + """First cell is a markdown cell with a level-1 or level-2 heading.""" + path, nb = notebook + first = nb.cells[0] + assert first.cell_type == "markdown", ( + f"{path}: first cell is {first.cell_type}, expected markdown header" + ) + src = "".join(first.source) + assert re.match(r"^#{1,2}\s", src), ( + f"{path}: first cell doesn't start with # or ## heading" + ) + + def test_has_multiple_sections(self, notebook: tuple[Path, nbformat.NotebookNode]) -> None: + """Notebook 
has at least 2 section headers (## or ###).""" + path, nb = notebook + md_text = "\n".join(_markdown_cells(nb)) + sections = re.findall(r"^#{2,3}\s", md_text, re.MULTILINE) + assert len(sections) >= 2, ( + f"{path}: only {len(sections)} section headers found (minimum 2)" + ) + + +# ── Assessment Density ──────────────────────────────────────────────── + +ASSESSMENT_PATTERN = re.compile(r"(quiz|predict_choice|reflect|order)\s*\(") + + +class TestAssessmentDensity: + """Notebooks must have sufficient interactive assessments.""" + + MIN_ASSESSMENTS = 2 # at least 2 assessment calls per notebook + + def test_minimum_assessment_count(self, notebook: tuple[Path, nbformat.NotebookNode]) -> None: + """Each notebook has at least MIN_ASSESSMENTS interactive assessments.""" + path, nb = notebook + code = "\n".join(_code_cells(nb)) + # Exclude the LearningTracker import/setup line + code_no_setup = "\n".join( + line for line in code.split("\n") + if "LearningTracker" not in line + ) + matches = ASSESSMENT_PATTERN.findall(code_no_setup) + assert len(matches) >= self.MIN_ASSESSMENTS, ( + f"{path}: only {len(matches)} assessments " + f"(minimum {self.MIN_ASSESSMENTS})" + ) + + def test_assessment_variety(self, notebook: tuple[Path, nbformat.NotebookNode]) -> None: + """Each notebook uses at least 2 different assessment types.""" + path, nb = notebook + code = "\n".join(_code_cells(nb)) + code_no_setup = "\n".join( + line for line in code.split("\n") + if "LearningTracker" not in line + ) + types_found = set(ASSESSMENT_PATTERN.findall(code_no_setup)) + assert len(types_found) >= 2, ( + f"{path}: only {len(types_found)} assessment type(s) " + f"({types_found}), minimum 2 for variety" + ) + + +# ── Bloom's Taxonomy Coverage ───────────────────────────────────────── + +BLOOM_PATTERN = re.compile(r'bloom\s*=\s*["\'](\w+)["\']') + + +class TestBloomCoverage: + """Notebooks should exercise multiple Bloom's taxonomy levels.""" + + def test_bloom_levels_used(self, notebook: tuple[Path, nbformat.NotebookNode]) -> None: + """Each notebook exercises at least 2 Bloom's taxonomy levels.""" + path, nb = notebook + code = "\n".join(_code_cells(nb)) + blooms = set(BLOOM_PATTERN.findall(code)) + if not blooms: + pytest.skip("no bloom= parameters found") + assert len(blooms) >= 2, ( + f"{path}: only {len(blooms)} Bloom level(s) ({blooms}), " + f"minimum 2 for cognitive depth" + ) + + +# ── Checkpoint Coverage ─────────────────────────────────────────────── + +class TestCheckpointCoverage: + """Notebooks with many assessments should include checkpoint summaries.""" + + MIN_ASSESSMENTS_FOR_CHECKPOINT = 4 + + def test_checkpoint_present_when_needed( + self, notebook: tuple[Path, nbformat.NotebookNode], + ) -> None: + """Notebooks with 4+ assessments should include checkpoint_summary calls.""" + path, nb = notebook + code = "\n".join(_code_cells(nb)) + assessment_count = len(ASSESSMENT_PATTERN.findall(code)) + if assessment_count < self.MIN_ASSESSMENTS_FOR_CHECKPOINT: + pytest.skip(f"only {assessment_count} assessments (threshold: {self.MIN_ASSESSMENTS_FOR_CHECKPOINT})") + has_checkpoint = "checkpoint_summary" in code + assert has_checkpoint, ( + f"{path}: {assessment_count} assessments but no checkpoint_summary call" + ) + + +# ── Learning Tracker Integration ────────────────────────────────────── + +class TestTrackerIntegration: + """Every content notebook must integrate the learning tracker.""" + + def test_tracker_initialization(self, notebook: tuple[Path, nbformat.NotebookNode]) -> None: + """Each notebook creates a 
LearningTracker instance.""" + path, nb = notebook + code = "\n".join(_code_cells(nb)) + assert "LearningTracker" in code, ( + f"{path}: no LearningTracker initialization found" + ) + + def test_tracker_dashboard_at_end(self, notebook: tuple[Path, nbformat.NotebookNode]) -> None: + """Each notebook calls tracker.dashboard() near the end.""" + path, nb = notebook + code_cells = _code_cells(nb) + if not code_cells: + pytest.skip("no code cells") + # Check last 3 code cells for dashboard call + tail = "\n".join(code_cells[-3:]) + assert "dashboard()" in tail, ( + f"{path}: no tracker.dashboard() call in final code cells" + ) + + def test_tracker_save_at_end(self, notebook: tuple[Path, nbformat.NotebookNode]) -> None: + """Each notebook saves tracker progress near the end.""" + path, nb = notebook + code_cells = _code_cells(nb) + if not code_cells: + pytest.skip("no code cells") + tail = "\n".join(code_cells[-3:]) + assert "save()" in tail, ( + f"{path}: no tracker.save() call in final code cells" + ) + + +# ── Key Insight Pattern ─────────────────────────────────────────────── + +class TestKeyInsights: + """Notebooks should have 'Key Insight' callouts for important takeaways.""" + + # Interactive dashboards and short notebooks are exempt + EXEMPT = {"00_dashboard.ipynb"} + + def test_has_key_insights(self, notebook: tuple[Path, nbformat.NotebookNode]) -> None: + """Notebooks with 5+ sections should have at least one Key Insight callout.""" + path, nb = notebook + if path.name in self.EXEMPT: + pytest.skip("interactive dashboard — exempt from insight callouts") + md_text = "\n".join(_markdown_cells(nb)) + sections = re.findall(r"^#{2,3}\s", md_text, re.MULTILINE) + if len(sections) < 5: + pytest.skip(f"only {len(sections)} sections (threshold: 5)") + has_insight = bool( + re.search( + r"key insight|observe:|key fact|result:|proof summary|important|tip:", + md_text, re.IGNORECASE, + ) + ) + assert has_insight, ( + f"{path}: {len(sections)} sections but no 'Key Insight' callout" + ) + + +# ── Cross-Plan Consistency ──────────────────────────────────────────── + +class TestCrossPlanConsistency: + """All four plans should cover core concepts.""" + + CORE_CONCEPTS = ["stabiliz", "magic", "witness", "ratchet"] + + def test_all_plans_cover_core_concepts(self) -> None: + """Each plan's notebooks collectively mention all core concepts.""" + plans = { + "plan_a": sorted(NOTEBOOK_DIR.glob("plan_a/*.ipynb")), + "plan_b": sorted(NOTEBOOK_DIR.glob("plan_b/*.ipynb")), + "plan_c": sorted(NOTEBOOK_DIR.glob("plan_c/*.ipynb")), + "plan_d": sorted(NOTEBOOK_DIR.glob("plan_d/*.ipynb")), + } + for plan_name, notebooks in plans.items(): + all_text = "" + for nb_path in notebooks: + nb = _read_notebook(nb_path) + all_text += "\n".join(_markdown_cells(nb) + _code_cells(nb)) + all_text_lower = all_text.lower() + for concept in self.CORE_CONCEPTS: + assert concept in all_text_lower, ( + f"{plan_name}: core concept '{concept}' not found in any notebook" + )
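
-- 
Running the Playwright browser suite locally (a sketch: it is excluded from
the default pytest run by the "-m 'not browser'" addopts in pyproject.toml,
and assumes a bootstrapped checkout with the dev and ux extras installed):

    pip install -e ".[dev,ux]"
    python -m playwright install chromium
    pytest tests/test_browser_ux.py -m browser -v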