alpha-arena/tests/test_debate.py
oho 081772387b feat: Add Phase 2 Multi-Agent Debate System
Implement a comprehensive debate framework for improved trading decisions:

- Add 6 debate personas: Optimist, Pessimist, Fundamentalist, Technician,
  Devil's Advocate, and Neutral
- Implement structured debate rounds with position-taking and rebuttals
- Add weighted consensus mechanism for aggregating agent opinions
- Add confidence calibration tracking to improve accuracy over time
- Include comprehensive test suite (38 tests)

New components:
- src/agents/debate.py - Complete debate system implementation
  - DebateAgent: Individual agent with persona-specific prompts
  - DebateOrchestrator: Coordinates multi-agent debates
  - ConfidenceCalibrator: Tracks prediction accuracy
- tests/test_debate.py - Full test coverage

The debate system helps avoid groupthink by having agents with different
analytical biases challenge each other's positions before reaching consensus.
2026-01-12 23:23:50 +01:00


"""Tests for the multi-agent debate system."""
from datetime import datetime
from decimal import Decimal
from unittest.mock import MagicMock

import pytest

from src.agents.debate import (
    DebatePersona,
    DebatePosition,
    DebateRound,
    DebateResult,
    DebateAgent,
    DebateOrchestrator,
    ConfidenceCalibrator,
    get_confidence_calibrator,
)
from src.core.types import MarketState, OrderBook, PriceLevel
from src.llm.base import LLMResponse
# =============================================================================
# Test Fixtures
# =============================================================================
@pytest.fixture
def mock_market():
    """Create a mock market state for testing."""
    return MarketState(
        market_id="test_market_123",
        condition_id="cond_123",
        question="Will Bitcoin exceed $100,000 by end of 2025?",
        category="crypto",
        end_date=datetime(2025, 12, 31),
        yes_token_id="yes_123",
        no_token_id="no_123",
        yes_price=Decimal("0.45"),
        no_price=Decimal("0.55"),
        volume_24h=Decimal("50000"),
        liquidity=Decimal("100000"),
        yes_book=OrderBook(
            token_id="yes_123",
            bids=[PriceLevel(price=Decimal("0.44"), size=Decimal("1000"))],
            asks=[PriceLevel(price=Decimal("0.46"), size=Decimal("1000"))],
        ),
        no_book=OrderBook(
            token_id="no_123",
            bids=[PriceLevel(price=Decimal("0.54"), size=Decimal("1000"))],
            asks=[PriceLevel(price=Decimal("0.56"), size=Decimal("1000"))],
        ),
        indicators={"rsi": 55, "macd_signal": 0.02},
        sentiment={"score": 0.3, "label": "bullish"},
    )


@pytest.fixture
def mock_llm_provider():
    """Create a mock LLM provider."""
    provider = MagicMock()
    provider.default_model = "test-model"

    async def mock_complete(*args, **kwargs):
        return LLMResponse(
            content=(
                '{"direction": "yes", "fair_value": 0.55, "confidence": 0.7, '
                '"reasoning": "Test reasoning", "key_arguments": ["arg1", "arg2"], '
                '"counter_arguments": ["counter1"], "risk_factors": ["risk1"]}'
            ),
            model="test-model",
            provider="test",
            tokens_input=100,
            tokens_output=50,
            latency_ms=100,
            cost_estimate=Decimal("0.001"),
            finish_reason="stop",
        )

    provider.complete = mock_complete
    return provider


@pytest.fixture
def mock_providers(mock_llm_provider):
    """Create mock providers for all personas."""
    return {
        DebatePersona.OPTIMIST: mock_llm_provider,
        DebatePersona.PESSIMIST: mock_llm_provider,
        DebatePersona.FUNDAMENTALIST: mock_llm_provider,
        DebatePersona.TECHNICIAN: mock_llm_provider,
        DebatePersona.DEVILS_ADVOCATE: mock_llm_provider,
    }
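

# Convenience factory for the DebatePosition boilerplate repeated in the tests
# below. This helper is illustrative only and not part of src.agents.debate; it
# assumes nothing beyond the DebatePosition keyword signature already exercised
# throughout this file. Tests below still construct positions explicitly.
def make_position(
    persona=DebatePersona.OPTIMIST,
    direction="yes",
    fair_value=Decimal("0.6"),
    confidence=Decimal("0.7"),
    **overrides,
):
    defaults = dict(
        persona=persona,
        model="test",
        direction=direction,
        fair_value=fair_value,
        confidence=confidence,
        reasoning="",
        key_arguments=[],
        counter_arguments=[],
        risk_factors=[],
    )
    defaults.update(overrides)
    return DebatePosition(**defaults)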
# =============================================================================
# DebatePersona Tests
# =============================================================================
class TestDebatePersona:
    """Tests for DebatePersona enum."""

    def test_all_personas_exist(self):
        assert DebatePersona.OPTIMIST.value == "optimist"
        assert DebatePersona.PESSIMIST.value == "pessimist"
        assert DebatePersona.FUNDAMENTALIST.value == "fundamentalist"
        assert DebatePersona.TECHNICIAN.value == "technician"
        assert DebatePersona.DEVILS_ADVOCATE.value == "devils_advocate"
        assert DebatePersona.NEUTRAL.value == "neutral"

    def test_persona_count(self):
        assert len(DebatePersona) == 6
# =============================================================================
# DebatePosition Tests
# =============================================================================
class TestDebatePosition:
    """Tests for DebatePosition dataclass."""

    @pytest.fixture
    def position(self):
        return DebatePosition(
            persona=DebatePersona.OPTIMIST,
            model="test-model",
            direction="yes",
            fair_value=Decimal("0.65"),
            confidence=Decimal("0.8"),
            reasoning="Strong bullish case based on momentum",
            key_arguments=["arg1", "arg2"],
            counter_arguments=["counter1"],
            risk_factors=["risk1", "risk2"],
        )

    def test_position_creation(self, position):
        assert position.persona == DebatePersona.OPTIMIST
        assert position.direction == "yes"
        assert position.fair_value == Decimal("0.65")
        assert position.confidence == Decimal("0.8")

    def test_position_to_dict(self, position):
        d = position.to_dict()
        assert d["persona"] == "optimist"
        assert d["direction"] == "yes"
        assert d["fair_value"] == 0.65
        assert d["confidence"] == 0.8
        assert "timestamp" in d
        assert len(d["key_arguments"]) == 2
# =============================================================================
# DebateRound Tests
# =============================================================================
class TestDebateRound:
    """Tests for DebateRound dataclass."""

    @pytest.fixture
    def debate_round(self):
        positions = [
            DebatePosition(
                persona=DebatePersona.OPTIMIST,
                model="test",
                direction="yes",
                fair_value=Decimal("0.6"),
                confidence=Decimal("0.7"),
                reasoning="Bullish",
                key_arguments=["arg1"],
                counter_arguments=[],
                risk_factors=[],
            ),
            DebatePosition(
                persona=DebatePersona.PESSIMIST,
                model="test",
                direction="no",
                fair_value=Decimal("0.4"),
                confidence=Decimal("0.6"),
                reasoning="Bearish",
                key_arguments=["arg2"],
                counter_arguments=[],
                risk_factors=[],
            ),
        ]
        return DebateRound(
            round_number=1,
            positions=positions,
            rebuttals={DebatePersona.OPTIMIST: "My rebuttal"},
            consensus_direction="yes",
            consensus_confidence=Decimal("0.55"),
            disagreement_level=Decimal("0.4"),
        )

    def test_round_creation(self, debate_round):
        assert debate_round.round_number == 1
        assert len(debate_round.positions) == 2
        assert debate_round.consensus_direction == "yes"

    def test_round_to_dict(self, debate_round):
        d = debate_round.to_dict()
        assert d["round_number"] == 1
        assert len(d["positions"]) == 2
        assert "optimist" in d["rebuttals"]
        assert d["consensus_confidence"] == 0.55
# =============================================================================
# DebateResult Tests
# =============================================================================
class TestDebateResult:
    """Tests for DebateResult dataclass."""

    @pytest.fixture
    def debate_result(self):
        return DebateResult(
            market_id="test_123",
            market_question="Will X happen?",
            rounds=[],
            final_direction="yes",
            final_confidence=Decimal("0.75"),
            final_fair_value=Decimal("0.6"),
            consensus_strength=Decimal("0.8"),
            key_bull_arguments=["bull1", "bull2"],
            key_bear_arguments=["bear1"],
            unresolved_concerns=["concern1"],
            recommendation="BUY YES",
            total_rounds=2,
            total_tokens=1000,
            total_latency_ms=500,
        )

    def test_result_creation(self, debate_result):
        assert debate_result.market_id == "test_123"
        assert debate_result.final_direction == "yes"
        assert debate_result.final_confidence == Decimal("0.75")

    def test_result_to_dict(self, debate_result):
        d = debate_result.to_dict()
        assert d["market_id"] == "test_123"
        assert d["final_direction"] == "yes"
        assert d["recommendation"] == "BUY YES"
        assert d["total_rounds"] == 2
# =============================================================================
# DebateAgent Tests
# =============================================================================
class TestDebateAgent:
    """Tests for DebateAgent."""

    @pytest.fixture
    def agent(self, mock_llm_provider):
        return DebateAgent(
            persona=DebatePersona.FUNDAMENTALIST,
            provider=mock_llm_provider,
            weight=Decimal("1.0"),
        )

    def test_agent_creation(self, agent):
        assert agent.persona == DebatePersona.FUNDAMENTALIST
        assert agent.weight == Decimal("1.0")

    def test_persona_prompts_exist(self):
        for persona in DebatePersona:
            assert persona in DebateAgent.PERSONA_PROMPTS

    def test_system_prompt_contains_persona(self, agent):
        prompt = agent.system_prompt
        assert "FUNDAMENTALIST" in prompt
        assert "JSON" in prompt

    @pytest.mark.asyncio
    async def test_analyze(self, agent, mock_market):
        position = await agent.analyze(mock_market, "Test context")
        assert isinstance(position, DebatePosition)
        assert position.persona == DebatePersona.FUNDAMENTALIST
        assert position.direction == "yes"
        assert position.fair_value == Decimal("0.55")
        assert position.confidence == Decimal("0.7")

    @pytest.mark.asyncio
    async def test_analyze_with_previous_positions(self, agent, mock_market):
        prev_position = DebatePosition(
            persona=DebatePersona.OPTIMIST,
            model="test",
            direction="yes",
            fair_value=Decimal("0.7"),
            confidence=Decimal("0.8"),
            reasoning="Previous position",
            key_arguments=["prev_arg"],
            counter_arguments=[],
            risk_factors=[],
        )
        position = await agent.analyze(mock_market, "Context", [prev_position])
        assert isinstance(position, DebatePosition)

    @pytest.mark.asyncio
    async def test_rebut(self, agent, mock_market):
        own_position = DebatePosition(
            persona=DebatePersona.FUNDAMENTALIST,
            model="test",
            direction="yes",
            fair_value=Decimal("0.55"),
            confidence=Decimal("0.7"),
            reasoning="My position",
            key_arguments=["arg1"],
            counter_arguments=[],
            risk_factors=[],
        )
        other_position = DebatePosition(
            persona=DebatePersona.PESSIMIST,
            model="test",
            direction="no",
            fair_value=Decimal("0.4"),
            confidence=Decimal("0.6"),
            reasoning="Their position",
            key_arguments=["counter_arg"],
            counter_arguments=[],
            risk_factors=[],
        )
        rebuttal = await agent.rebut(mock_market, own_position, [other_position])
        assert isinstance(rebuttal, str)

    def test_parse_response_valid_json(self, agent):
        content = (
            '{"direction": "no", "fair_value": 0.3, "confidence": 0.6, '
            '"reasoning": "Test", "key_arguments": [], "counter_arguments": [], '
            '"risk_factors": []}'
        )
        position = agent._parse_response(content)
        assert position.direction == "no"
        assert position.fair_value == Decimal("0.3")
        assert position.confidence == Decimal("0.6")

    def test_parse_response_markdown_json(self, agent):
        content = (
            '```json\n{"direction": "yes", "fair_value": 0.5, "confidence": 0.5, '
            '"reasoning": "Test", "key_arguments": [], "counter_arguments": [], '
            '"risk_factors": []}\n```'
        )
        position = agent._parse_response(content)
        assert position.direction == "yes"

    def test_parse_response_invalid_json(self, agent):
        content = "This is not JSON at all"
        position = agent._parse_response(content)
        assert position.direction == "abstain"
        assert position.confidence == Decimal("0.3")
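

# For reference: the markdown case above corresponds to stripping a ```json ... ```
# wrapper before parsing. A minimal sketch of that pattern, illustrative only;
# the actual _parse_response implementation may differ.
def example_strip_json_fence(content: str) -> str:
    text = content.strip()
    if text.startswith("```"):
        # Drop the opening fence (with optional language tag), then the closing fence
        text = text.split("\n", 1)[1] if "\n" in text else ""
        if text.endswith("```"):
            text = text[:-3].strip()
    return text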
# =============================================================================
# DebateOrchestrator Tests
# =============================================================================
class TestDebateOrchestrator:
    """Tests for DebateOrchestrator."""

    @pytest.fixture
    def orchestrator(self, mock_providers):
        return DebateOrchestrator(
            providers=mock_providers,
            max_rounds=2,
            consensus_threshold=Decimal("0.7"),
        )

    def test_orchestrator_creation(self, orchestrator):
        assert orchestrator.max_rounds == 2
        assert orchestrator.consensus_threshold == Decimal("0.7")
        assert len(orchestrator.agents) == 5

    def test_default_weights(self, orchestrator):
        assert DebatePersona.FUNDAMENTALIST in orchestrator.weights
        assert orchestrator.weights[DebatePersona.FUNDAMENTALIST] == Decimal("0.25")
        assert orchestrator.weights[DebatePersona.DEVILS_ADVOCATE] == Decimal("0.10")

    def test_calculate_consensus_all_yes(self, orchestrator):
        positions = [
            DebatePosition(
                persona=DebatePersona.OPTIMIST,
                model="test",
                direction="yes",
                fair_value=Decimal("0.6"),
                confidence=Decimal("0.8"),
                reasoning="",
                key_arguments=[],
                counter_arguments=[],
                risk_factors=[],
            ),
            DebatePosition(
                persona=DebatePersona.FUNDAMENTALIST,
                model="test",
                direction="yes",
                fair_value=Decimal("0.65"),
                confidence=Decimal("0.9"),
                reasoning="",
                key_arguments=[],
                counter_arguments=[],
                risk_factors=[],
            ),
        ]
        direction, confidence, disagreement = orchestrator._calculate_consensus(positions)
        assert direction == "yes"
        assert confidence > Decimal("0.5")
        assert disagreement < Decimal("0.5")

    def test_calculate_consensus_mixed(self, orchestrator):
        positions = [
            DebatePosition(
                persona=DebatePersona.OPTIMIST,
                model="test",
                direction="yes",
                fair_value=Decimal("0.7"),
                confidence=Decimal("0.8"),
                reasoning="",
                key_arguments=[],
                counter_arguments=[],
                risk_factors=[],
            ),
            DebatePosition(
                persona=DebatePersona.PESSIMIST,
                model="test",
                direction="no",
                fair_value=Decimal("0.3"),
                confidence=Decimal("0.8"),
                reasoning="",
                key_arguments=[],
                counter_arguments=[],
                risk_factors=[],
            ),
        ]
        direction, confidence, disagreement = orchestrator._calculate_consensus(positions)
        # With equal weights and confidence, should have high disagreement
        assert disagreement > Decimal("0.3")

    def test_calculate_consensus_empty(self, orchestrator):
        direction, confidence, disagreement = orchestrator._calculate_consensus([])
        assert direction == "abstain"
        assert confidence == Decimal("0")
        assert disagreement == Decimal("1")

    def test_synthesize_result_empty_rounds(self, orchestrator, mock_market):
        result = orchestrator._synthesize_result(
            market=mock_market,
            rounds=[],
            total_tokens=0,
            total_latency_ms=0,
        )
        assert result.final_direction == "abstain"
        assert result.final_confidence == Decimal("0")
        assert "Insufficient" in result.recommendation

    @pytest.mark.asyncio
    async def test_debate_flow(self, orchestrator, mock_market):
        result = await orchestrator.debate(
            market=mock_market,
            context="Test context",
        )
        assert isinstance(result, DebateResult)
        assert result.market_id == mock_market.market_id
        assert result.total_rounds >= 1
        assert len(result.rounds) >= 1

    @pytest.mark.asyncio
    async def test_debate_with_specific_personas(self, orchestrator, mock_market):
        result = await orchestrator.debate(
            market=mock_market,
            context="Test context",
            required_personas=[DebatePersona.OPTIMIST, DebatePersona.PESSIMIST],
        )
        assert isinstance(result, DebateResult)

    @pytest.mark.asyncio
    async def test_debate_no_agents_raises(self, mock_market):
        orchestrator = DebateOrchestrator(providers={}, max_rounds=1)
        with pytest.raises(ValueError, match="No debate agents available"):
            await orchestrator.debate(mock_market)
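

# For orientation: a minimal sketch of how a confidence-weighted vote over
# positions could be aggregated. This is NOT the implementation in
# src.agents.debate (the exact formula is internal to _calculate_consensus);
# it only illustrates the idea the consensus tests above exercise, using the
# default persona weights asserted in test_default_weights. The fallback
# weight of 0.2 is an assumption made for this sketch.
def example_weighted_vote(positions, weights):
    yes_score = sum(
        weights.get(p.persona, Decimal("0.2")) * p.confidence
        for p in positions
        if p.direction == "yes"
    )
    no_score = sum(
        weights.get(p.persona, Decimal("0.2")) * p.confidence
        for p in positions
        if p.direction == "no"
    )
    if yes_score == no_score:
        return "abstain"
    return "yes" if yes_score > no_score else "no"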
# =============================================================================
# ConfidenceCalibrator Tests
# =============================================================================
class TestConfidenceCalibrator:
    """Tests for ConfidenceCalibrator."""

    @pytest.fixture
    def calibrator(self):
        return ConfidenceCalibrator()

    def test_calibrator_init(self, calibrator):
        assert calibrator._predictions == []
        assert calibrator._calibration_scores == {}

    def test_record_prediction(self, calibrator):
        calibrator.record_prediction(
            market_id="test_123",
            persona=DebatePersona.OPTIMIST,
            predicted_prob=Decimal("0.7"),
            confidence=Decimal("0.8"),
            timestamp=datetime.utcnow(),
        )
        assert len(calibrator._predictions) == 1
        assert calibrator._predictions[0]["market_id"] == "test_123"
        assert calibrator._predictions[0]["outcome"] is None

    def test_record_outcome(self, calibrator):
        calibrator.record_prediction(
            market_id="test_123",
            persona=DebatePersona.OPTIMIST,
            predicted_prob=Decimal("0.7"),
            confidence=Decimal("0.8"),
            timestamp=datetime.utcnow(),
        )
        calibrator.record_outcome("test_123", True)
        assert calibrator._predictions[0]["outcome"] is True
        assert DebatePersona.OPTIMIST in calibrator._calibration_scores

    def test_calibration_after_multiple_outcomes(self, calibrator):
        # Record multiple predictions
        for i in range(5):
            calibrator.record_prediction(
                market_id=f"market_{i}",
                persona=DebatePersona.FUNDAMENTALIST,
                predicted_prob=Decimal("0.7"),
                confidence=Decimal("0.8"),
                timestamp=datetime.utcnow(),
            )
            # Alternate outcomes
            calibrator.record_outcome(f"market_{i}", i % 2 == 0)
        scores = calibrator._calibration_scores[DebatePersona.FUNDAMENTALIST]
        assert scores["total_predictions"] == 5
        assert scores["brier_score"] > Decimal("0")

    def test_get_calibration_adjustment_no_data(self, calibrator):
        adjustment = calibrator.get_calibration_adjustment(DebatePersona.OPTIMIST)
        assert adjustment == Decimal("1.0")

    def test_get_calibration_adjustment_insufficient_data(self, calibrator):
        # Add fewer than 10 predictions
        for i in range(5):
            calibrator.record_prediction(
                market_id=f"market_{i}",
                persona=DebatePersona.OPTIMIST,
                predicted_prob=Decimal("0.9"),
                confidence=Decimal("0.9"),
                timestamp=datetime.utcnow(),
            )
            calibrator.record_outcome(f"market_{i}", False)
        adjustment = calibrator.get_calibration_adjustment(DebatePersona.OPTIMIST)
        assert adjustment == Decimal("1.0")  # Not enough data

    def test_overconfidence_adjustment(self, calibrator):
        # Simulate overconfident predictions (high confidence, wrong outcomes)
        for i in range(15):
            calibrator.record_prediction(
                market_id=f"market_{i}",
                persona=DebatePersona.OPTIMIST,
                predicted_prob=Decimal("0.9"),
                confidence=Decimal("0.9"),
                timestamp=datetime.utcnow(),
            )
            # Most outcomes are wrong
            calibrator.record_outcome(f"market_{i}", i < 3)
        adjustment = calibrator.get_calibration_adjustment(DebatePersona.OPTIMIST)
        assert adjustment < Decimal("1.0")  # Should reduce confidence

    def test_get_calibration_report(self, calibrator):
        calibrator.record_prediction(
            market_id="test_1",
            persona=DebatePersona.FUNDAMENTALIST,
            predicted_prob=Decimal("0.6"),
            confidence=Decimal("0.7"),
            timestamp=datetime.utcnow(),
        )
        calibrator.record_outcome("test_1", True)
        report = calibrator.get_calibration_report()
        assert report["total_predictions"] == 1
        assert report["pending_predictions"] == 0
        assert "fundamentalist" in report["by_persona"]


class TestGetConfidenceCalibrator:
    """Tests for the get_confidence_calibrator singleton."""

    def test_returns_same_instance(self):
        cal1 = get_confidence_calibrator()
        cal2 = get_confidence_calibrator()
        assert cal1 is cal2
# =============================================================================
# Integration Tests
# =============================================================================
class TestDebateIntegration:
    """Integration tests for the complete debate system."""

    @pytest.mark.asyncio
    async def test_full_debate_flow(self, mock_market, mock_providers):
        """Test a complete debate from start to finish."""
        orchestrator = DebateOrchestrator(
            providers=mock_providers,
            max_rounds=2,
            consensus_threshold=Decimal("0.6"),
        )
        result = await orchestrator.debate(
            market=mock_market,
            context=(
                "Bitcoin has been showing strong momentum. "
                "Recent ETF approvals have increased institutional interest."
            ),
        )
        # Verify result structure
        assert result.market_id == mock_market.market_id
        assert result.final_direction in ["yes", "no", "abstain"]
        assert Decimal("0") <= result.final_confidence <= Decimal("1")
        assert Decimal("0") <= result.final_fair_value <= Decimal("1")
        assert len(result.rounds) >= 1
        assert result.total_tokens > 0

    @pytest.mark.asyncio
    async def test_debate_with_calibration(self, mock_market, mock_providers):
        """Test debate with confidence calibration tracking."""
        orchestrator = DebateOrchestrator(
            providers=mock_providers,
            max_rounds=1,
        )
        calibrator = ConfidenceCalibrator()

        # Run debate
        result = await orchestrator.debate(mock_market, "Test")

        # Record predictions from each position (debate_round avoids shadowing
        # the round() builtin)
        for debate_round in result.rounds:
            for position in debate_round.positions:
                calibrator.record_prediction(
                    market_id=result.market_id,
                    persona=position.persona,
                    predicted_prob=position.fair_value,
                    confidence=position.confidence,
                    timestamp=position.timestamp,
                )

        # Simulate outcome
        calibrator.record_outcome(result.market_id, True)

        # Check report
        report = calibrator.get_calibration_report()
        assert report["total_predictions"] > 0

    @pytest.mark.asyncio
    async def test_devils_advocate_opposes_consensus(self, mock_market):
        """Test that the devil's advocate argues against the consensus."""
        # Create providers with different responses
        bullish_provider = MagicMock()
        bullish_provider.default_model = "bullish-model"

        async def bullish_complete(*args, **kwargs):
            return LLMResponse(
                content=(
                    '{"direction": "yes", "fair_value": 0.7, "confidence": 0.8, '
                    '"reasoning": "Bullish!", "key_arguments": ["bull"], '
                    '"counter_arguments": [], "risk_factors": []}'
                ),
                model="bullish",
                provider="test",
                tokens_input=100,
                tokens_output=50,
                latency_ms=100,
                cost_estimate=Decimal("0.001"),
                finish_reason="stop",
            )

        bullish_provider.complete = bullish_complete

        # The devil's advocate provider is mocked to return a contrarian NO position
        da_provider = MagicMock()
        da_provider.default_model = "da-model"

        async def da_complete(*args, **kwargs):
            return LLMResponse(
                content=(
                    '{"direction": "no", "fair_value": 0.35, "confidence": 0.6, '
                    '"reasoning": "Contrarian view", "key_arguments": ["risk"], '
                    '"counter_arguments": [], "risk_factors": []}'
                ),
                model="da",
                provider="test",
                tokens_input=100,
                tokens_output=50,
                latency_ms=100,
                cost_estimate=Decimal("0.001"),
                finish_reason="stop",
            )

        da_provider.complete = da_complete

        orchestrator = DebateOrchestrator(
            providers={
                DebatePersona.OPTIMIST: bullish_provider,
                DebatePersona.DEVILS_ADVOCATE: da_provider,
            },
            max_rounds=1,
        )
        result = await orchestrator.debate(mock_market, "Test")

        # Check that we have mixed positions: both yes and no should appear
        directions = set()
        for debate_round in result.rounds:
            for pos in debate_round.positions:
                directions.add(pos.direction)
        assert "yes" in directions
        assert "no" in directions