"""
|
|
Integration tests for Phase 2 autonomy features.
|
|
Tests autonomous tool invocation, proactive monitoring, actions, and pattern learning.
|
|
"""
import asyncio
import json
import os
import sys

# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Override self-state file path for testing
os.environ["SELF_STATE_FILE"] = "/tmp/test_self_state.json"

from autonomy.tools.decision_engine import ToolDecisionEngine
from autonomy.tools.orchestrator import ToolOrchestrator
from autonomy.proactive.monitor import ProactiveMonitor
from autonomy.actions.autonomous_actions import AutonomousActionManager
from autonomy.learning.pattern_learner import PatternLearner
from autonomy.self.state import load_self_state, get_self_state_instance


async def test_tool_decision_engine():
    """Test autonomous tool decision making."""
    print("\n" + "="*60)
    print("TEST 1: Tool Decision Engine")
    print("="*60)

    engine = ToolDecisionEngine()

    # Test 1a: Memory reference detection
    result = await engine.analyze_tool_needs(
        user_prompt="What did we discuss earlier about Python?",
        monologue={"intent": "clarification", "consult_executive": False},
        context_state={},
        available_tools=["RAG", "WEB", "WEATHER"]
    )

    assert result["should_invoke_tools"], "Should invoke tools for memory reference"
    assert any(t["tool"] == "RAG" for t in result["tools_to_invoke"]), "Should recommend RAG"
    assert result["confidence"] > 0.8, f"Confidence should be high for clear memory reference: {result['confidence']}"

    print(f" ✓ Memory reference detection passed")
    print(f" Tools: {[t['tool'] for t in result['tools_to_invoke']]}")
    print(f" Confidence: {result['confidence']:.2f}")

    # Test 1b: Web search detection
    result = await engine.analyze_tool_needs(
        user_prompt="What's the latest news about AI developments?",
        monologue={"intent": "information_seeking", "consult_executive": False},
        context_state={},
        available_tools=["RAG", "WEB", "WEATHER"]
    )

    assert result["should_invoke_tools"], "Should invoke tools for current info request"
    assert any(t["tool"] == "WEB" for t in result["tools_to_invoke"]), "Should recommend WEB"

    print(f" ✓ Web search detection passed")
    print(f" Tools: {[t['tool'] for t in result['tools_to_invoke']]}")

    # Test 1c: Weather detection
    result = await engine.analyze_tool_needs(
        user_prompt="What's the weather like today in Boston?",
        monologue={"intent": "information_seeking", "consult_executive": False},
        context_state={},
        available_tools=["RAG", "WEB", "WEATHER"]
    )

    assert result["should_invoke_tools"], "Should invoke tools for weather query"
    assert any(t["tool"] == "WEATHER" for t in result["tools_to_invoke"]), "Should recommend WEATHER"

    print(f" ✓ Weather detection passed")

    # Test 1d: Proactive RAG for complex queries
    result = await engine.analyze_tool_needs(
        user_prompt="Design a microservices architecture",
        monologue={"intent": "technical_implementation", "consult_executive": True},
        context_state={},
        available_tools=["RAG", "WEB", "CODEBRAIN"]
    )

    assert result["should_invoke_tools"], "Should proactively invoke tools for complex queries"
    rag_tools = [t for t in result["tools_to_invoke"] if t["tool"] == "RAG"]
    assert len(rag_tools) > 0, "Should include proactive RAG"

    print(f" ✓ Proactive RAG detection passed")
    print(f" Reason: {rag_tools[0]['reason']}")

    print("\n✓ Tool Decision Engine tests passed\n")
    return result


async def test_tool_orchestrator():
    """Test tool orchestration (mock mode)."""
    print("\n" + "="*60)
    print("TEST 2: Tool Orchestrator (Mock Mode)")
    print("="*60)

    orchestrator = ToolOrchestrator(tool_timeout=5)

    # Since actual tools may not be available, test the orchestrator structure
    print(f" Available tools: {list(orchestrator.available_tools.keys())}")

    # Test with tools_to_invoke (will fail gracefully if tools unavailable)
    tools_to_invoke = [
        {"tool": "RAG", "query": "test query", "reason": "testing", "priority": 0.9}
    ]

    result = await orchestrator.execute_tools(
        tools_to_invoke=tools_to_invoke,
        context_state={"session_id": "test"}
    )

    assert "results" in result, "Should return results dict"
    assert "execution_summary" in result, "Should return execution summary"

    summary = result["execution_summary"]
    assert "tools_invoked" in summary, "Summary should include tools_invoked"
    assert "total_time_ms" in summary, "Summary should include timing"

    print(f" ✓ Orchestrator structure valid")
    print(f" Summary: {summary}")

    # Test result formatting
    formatted = orchestrator.format_results_for_context(result)
    assert isinstance(formatted, str), "Should format results as string"

    print(f" ✓ Result formatting works")
    print(f" Formatted length: {len(formatted)} chars")

    print("\n✓ Tool Orchestrator tests passed\n")
    return result


async def test_proactive_monitor():
    """Test proactive monitoring and suggestions."""
    print("\n" + "="*60)
    print("TEST 3: Proactive Monitor")
    print("="*60)

    monitor = ProactiveMonitor(min_priority=0.6)

    # Test 3a: Long silence detection
    context_state = {
        "message_count": 5,
        "minutes_since_last_msg": 35  # > 30 minutes
    }

    self_state = load_self_state()

    suggestion = await monitor.analyze_session(
        session_id="test_silence",
        context_state=context_state,
        self_state=self_state
    )

    assert suggestion is not None, "Should generate suggestion for long silence"
    assert suggestion["type"] == "check_in", f"Should be check_in type: {suggestion['type']}"
    assert suggestion["priority"] >= 0.6, "Priority should meet threshold"

    print(f" ✓ Long silence detection passed")
    print(f" Type: {suggestion['type']}, Priority: {suggestion['priority']:.2f}")
    print(f" Suggestion: {suggestion['suggestion'][:50]}...")

    # Test 3b: Learning opportunity (high curiosity)
    self_state["curiosity"] = 0.8
    self_state["learning_queue"] = ["quantum computing", "rust programming"]

    # Reset cooldown for this test
    monitor.reset_cooldown("test_learning")

    suggestion = await monitor.analyze_session(
        session_id="test_learning",
        context_state={"message_count": 3, "minutes_since_last_msg": 2},
        self_state=self_state
    )

    assert suggestion is not None, "Should generate learning suggestion"
    assert suggestion["type"] == "learning", f"Should be learning type: {suggestion['type']}"

    print(f" ✓ Learning opportunity detection passed")
    print(f" Suggestion: {suggestion['suggestion'][:70]}...")

    # Test 3c: Conversation milestone
    monitor.reset_cooldown("test_milestone")

    # Reset curiosity to avoid learning suggestion taking precedence
    self_state["curiosity"] = 0.5
    self_state["learning_queue"] = []

    suggestion = await monitor.analyze_session(
        session_id="test_milestone",
        context_state={"message_count": 50, "minutes_since_last_msg": 1},
        self_state=self_state
    )

    assert suggestion is not None, "Should generate milestone suggestion"
    # Note: learning or summary both valid - check it's a reasonable suggestion
    assert suggestion["type"] in ["summary", "learning", "check_in"], f"Should be valid type: {suggestion['type']}"

    print(f" ✓ Conversation milestone detection passed (type: {suggestion['type']})")

    # Test 3d: Cooldown mechanism
    # Try to get another suggestion immediately (should be blocked)
    suggestion2 = await monitor.analyze_session(
        session_id="test_milestone",
        context_state={"message_count": 51, "minutes_since_last_msg": 1},
        self_state=self_state
    )

    assert suggestion2 is None, "Should not generate suggestion during cooldown"

    print(f" ✓ Cooldown mechanism working")

    # Check stats
    stats = monitor.get_session_stats("test_milestone")
    assert stats["cooldown_active"], "Cooldown should be active"
    print(f" Cooldown remaining: {stats['cooldown_remaining']}s")

    print("\n✓ Proactive Monitor tests passed\n")
    return suggestion


async def test_autonomous_actions():
    """Test autonomous action execution."""
    print("\n" + "="*60)
    print("TEST 4: Autonomous Actions")
    print("="*60)

    manager = AutonomousActionManager()

    # Test 4a: List allowed actions
    allowed = manager.get_allowed_actions()
    assert "create_memory" in allowed, "Should have create_memory action"
    assert "update_goal" in allowed, "Should have update_goal action"
    assert "learn_topic" in allowed, "Should have learn_topic action"

    print(f" ✓ Allowed actions: {allowed}")

    # Test 4b: Validate actions
    validation = manager.validate_action("create_memory", {"text": "test memory"})
    assert validation["valid"], "Should validate correct action"

    print(f" ✓ Action validation passed")

    # Test 4c: Execute learn_topic action
    result = await manager.execute_action(
        action_type="learn_topic",
        parameters={"topic": "rust programming", "reason": "testing", "priority": 0.8},
        context={"session_id": "test"}
    )

    assert result["success"], f"Action should succeed: {result.get('error', 'unknown')}"
    assert "topic" in result["result"], "Should return topic info"

    print(f" ✓ learn_topic action executed")
    print(f" Topic: {result['result']['topic']}")
    print(f" Queue position: {result['result']['queue_position']}")

    # Test 4d: Execute update_focus action
    result = await manager.execute_action(
        action_type="update_focus",
        parameters={"focus": "autonomy_testing", "reason": "running tests"},
        context={"session_id": "test"}
    )

    assert result["success"], "update_focus should succeed"

    print(f" ✓ update_focus action executed")
    print(f" New focus: {result['result']['new_focus']}")

    # Test 4e: Reject non-whitelisted action
    result = await manager.execute_action(
        action_type="delete_all_files",  # NOT in whitelist
        parameters={},
        context={"session_id": "test"}
    )

    assert not result["success"], "Should reject non-whitelisted action"
    assert "not in whitelist" in result["error"], "Should indicate whitelist violation"

    print(f" ✓ Non-whitelisted action rejected")

    # Test 4f: Action log
    log = manager.get_action_log(limit=10)
    assert len(log) >= 2, f"Should have logged multiple actions (got {len(log)})"

    print(f" ✓ Action log contains {len(log)} entries")

    print("\n✓ Autonomous Actions tests passed\n")
    return result


async def test_pattern_learner():
    """Test pattern learning system."""
    print("\n" + "="*60)
    print("TEST 5: Pattern Learner")
    print("="*60)

    # Use temp file for testing
    test_file = "/tmp/test_patterns.json"
    learner = PatternLearner(patterns_file=test_file)

    # Test 5a: Learn from multiple interactions
    for i in range(5):
        await learner.learn_from_interaction(
            user_prompt=f"Help me with Python coding task {i}",
            response=f"Here's help with task {i}...",
            monologue={"intent": "coding_help", "tone": "focused", "depth": "medium"},
            context={"session_id": "test", "executive_plan": None}
        )

    print(f" ✓ Learned from 5 interactions")

    # Test 5b: Get top topics
    top_topics = learner.get_top_topics(limit=5)
    assert len(top_topics) > 0, "Should have learned topics"
    assert top_topics[0][0] == "coding_help", "coding_help should be top topic"

    print(f" ✓ Top topics: {[t[0] for t in top_topics[:3]]}")

    # Test 5c: Get preferred tone
    preferred_tone = learner.get_preferred_tone()
    assert preferred_tone == "focused", "Should detect focused as preferred tone"

    print(f" ✓ Preferred tone: {preferred_tone}")

    # Test 5d: Get preferred depth
    preferred_depth = learner.get_preferred_depth()
    assert preferred_depth == "medium", "Should detect medium as preferred depth"

    print(f" ✓ Preferred depth: {preferred_depth}")

    # Test 5e: Get insights
    insights = learner.get_insights()
    assert insights["total_interactions"] == 5, "Should track interaction count"
    assert insights["preferred_tone"] == "focused", "Insights should include tone"

    print(f" ✓ Insights generated:")
    print(f" Total interactions: {insights['total_interactions']}")
    print(f" Recommendations: {insights['learning_recommendations']}")

    # Test 5f: Export patterns
    exported = learner.export_patterns()
    assert "topic_frequencies" in exported, "Should export all patterns"

    print(f" ✓ Patterns exported ({len(exported)} keys)")

    # Cleanup
    if os.path.exists(test_file):
        os.remove(test_file)

    print("\n✓ Pattern Learner tests passed\n")
    return insights


async def test_end_to_end_autonomy():
    """Test complete autonomous flow."""
    print("\n" + "="*60)
    print("TEST 6: End-to-End Autonomy Flow")
    print("="*60)

    # Simulate a complex user query that triggers multiple autonomous systems
    user_prompt = "Remember what we discussed about machine learning? I need current research on transformers."

    monologue = {
        "intent": "technical_research",
        "tone": "focused",
        "depth": "deep",
        "consult_executive": True
    }

    context_state = {
        "session_id": "e2e_test",
        "message_count": 15,
        "minutes_since_last_msg": 5
    }

    print(f" User prompt: {user_prompt}")
    print(f" Monologue intent: {monologue['intent']}")

    # Step 1: Tool decision engine
    engine = ToolDecisionEngine()
    tool_decision = await engine.analyze_tool_needs(
        user_prompt=user_prompt,
        monologue=monologue,
        context_state=context_state,
        available_tools=["RAG", "WEB", "CODEBRAIN"]
    )

    print(f"\n Step 1: Tool Decision")
    print(f" Should invoke: {tool_decision['should_invoke_tools']}")
    print(f" Tools: {[t['tool'] for t in tool_decision['tools_to_invoke']]}")
    assert tool_decision["should_invoke_tools"], "Should invoke tools"
    assert len(tool_decision["tools_to_invoke"]) >= 2, "Should recommend multiple tools (RAG + WEB)"

    # Step 2: Pattern learning
    learner = PatternLearner(patterns_file="/tmp/e2e_test_patterns.json")
    await learner.learn_from_interaction(
        user_prompt=user_prompt,
        response="Here's information about transformers...",
        monologue=monologue,
        context=context_state
    )

    print(f"\n Step 2: Pattern Learning")
    top_topics = learner.get_top_topics(limit=3)
    print(f" Learned topics: {[t[0] for t in top_topics]}")

    # Step 3: Autonomous action
    action_manager = AutonomousActionManager()
    action_result = await action_manager.execute_action(
        action_type="learn_topic",
        parameters={"topic": "transformer architectures", "reason": "user interest detected"},
        context=context_state
    )

    print(f"\n Step 3: Autonomous Action")
    print(f" Action: learn_topic")
    print(f" Success: {action_result['success']}")

    # Step 4: Proactive monitoring (won't trigger due to low message count)
    monitor = ProactiveMonitor(min_priority=0.6)
    monitor.reset_cooldown("e2e_test")

    suggestion = await monitor.analyze_session(
        session_id="e2e_test",
        context_state=context_state,
        self_state=load_self_state()
    )

    print(f"\n Step 4: Proactive Monitoring")
    print(f" Suggestion: {suggestion['type'] if suggestion else 'None (expected for low message count)'}")

    # Cleanup
    if os.path.exists("/tmp/e2e_test_patterns.json"):
        os.remove("/tmp/e2e_test_patterns.json")

    print("\n✓ End-to-End Autonomy Flow tests passed\n")
    return True


async def run_all_tests():
    """Run all Phase 2 tests."""
    print("\n" + "="*60)
    print("PHASE 2 AUTONOMY TESTS")
    print("="*60)

    try:
        # Test 1: Tool Decision Engine
        await test_tool_decision_engine()

        # Test 2: Tool Orchestrator
        await test_tool_orchestrator()

        # Test 3: Proactive Monitor
        await test_proactive_monitor()

        # Test 4: Autonomous Actions
        await test_autonomous_actions()

        # Test 5: Pattern Learner
        await test_pattern_learner()

        # Test 6: End-to-End
        await test_end_to_end_autonomy()

        print("\n" + "="*60)
        print("ALL PHASE 2 TESTS PASSED ✓")
        print("="*60)

        print("\nPhase 2 Features Validated:")
        print(" ✓ Autonomous tool decision making")
        print(" ✓ Tool orchestration and execution")
        print(" ✓ Proactive monitoring and suggestions")
        print(" ✓ Safe autonomous actions")
        print(" ✓ Pattern learning and adaptation")
        print(" ✓ End-to-end autonomous flow")

        return True

    except Exception as e:
        print("\n" + "="*60)
        print(f"TEST FAILED: {e}")
        print("="*60)
        import traceback
        traceback.print_exc()
        return False


if __name__ == "__main__":
|
|
success = asyncio.run(run_all_tests())
|
|
sys.exit(0 if success else 1)
|