autonomy phase 2
cortex/tests/test_autonomy_phase2.py (new file, +495 lines)
@@ -0,0 +1,495 @@
"""
Integration tests for Phase 2 autonomy features.
Tests autonomous tool invocation, proactive monitoring, actions, and pattern learning.
"""

import asyncio
import json
import sys
import os

# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Override self-state file path for testing
os.environ["SELF_STATE_FILE"] = "/tmp/test_self_state.json"

from autonomy.tools.decision_engine import ToolDecisionEngine
from autonomy.tools.orchestrator import ToolOrchestrator
from autonomy.proactive.monitor import ProactiveMonitor
from autonomy.actions.autonomous_actions import AutonomousActionManager
from autonomy.learning.pattern_learner import PatternLearner
from autonomy.self.state import load_self_state, get_self_state_instance

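# Note: this is a plain asyncio script rather than a pytest module; run it
# directly with `python cortex/tests/test_autonomy_phase2.py` and it exits
# nonzero on failure (see the __main__ block at the bottom).
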
async def test_tool_decision_engine():
    """Test autonomous tool decision making."""
    print("\n" + "="*60)
    print("TEST 1: Tool Decision Engine")
    print("="*60)

    engine = ToolDecisionEngine()

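    # analyze_tool_needs is expected to return a dict of roughly this shape
    # (inferred from the assertions below, not from a documented contract):
    #   {"should_invoke_tools": bool,
    #    "tools_to_invoke": [{"tool": str, "reason": str, "priority": float, ...}],
    #    "confidence": float}
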
    # Test 1a: Memory reference detection
    result = await engine.analyze_tool_needs(
        user_prompt="What did we discuss earlier about Python?",
        monologue={"intent": "clarification", "consult_executive": False},
        context_state={},
        available_tools=["RAG", "WEB", "WEATHER"]
    )

    assert result["should_invoke_tools"], "Should invoke tools for memory reference"
    assert any(t["tool"] == "RAG" for t in result["tools_to_invoke"]), "Should recommend RAG"
    assert result["confidence"] > 0.8, f"Confidence should be high for clear memory reference: {result['confidence']}"

    print("  ✓ Memory reference detection passed")
    print(f"    Tools: {[t['tool'] for t in result['tools_to_invoke']]}")
    print(f"    Confidence: {result['confidence']:.2f}")

    # Test 1b: Web search detection
    result = await engine.analyze_tool_needs(
        user_prompt="What's the latest news about AI developments?",
        monologue={"intent": "information_seeking", "consult_executive": False},
        context_state={},
        available_tools=["RAG", "WEB", "WEATHER"]
    )

    assert result["should_invoke_tools"], "Should invoke tools for current info request"
    assert any(t["tool"] == "WEB" for t in result["tools_to_invoke"]), "Should recommend WEB"

    print("  ✓ Web search detection passed")
    print(f"    Tools: {[t['tool'] for t in result['tools_to_invoke']]}")

    # Test 1c: Weather detection
    result = await engine.analyze_tool_needs(
        user_prompt="What's the weather like today in Boston?",
        monologue={"intent": "information_seeking", "consult_executive": False},
        context_state={},
        available_tools=["RAG", "WEB", "WEATHER"]
    )

    assert result["should_invoke_tools"], "Should invoke tools for weather query"
    assert any(t["tool"] == "WEATHER" for t in result["tools_to_invoke"]), "Should recommend WEATHER"

    print("  ✓ Weather detection passed")

    # Test 1d: Proactive RAG for complex queries
    result = await engine.analyze_tool_needs(
        user_prompt="Design a microservices architecture",
        monologue={"intent": "technical_implementation", "consult_executive": True},
        context_state={},
        available_tools=["RAG", "WEB", "CODEBRAIN"]
    )

    assert result["should_invoke_tools"], "Should proactively invoke tools for complex queries"
    rag_tools = [t for t in result["tools_to_invoke"] if t["tool"] == "RAG"]
    assert len(rag_tools) > 0, "Should include proactive RAG"

    print("  ✓ Proactive RAG detection passed")
    print(f"    Reason: {rag_tools[0]['reason']}")

    print("\n✓ Tool Decision Engine tests passed\n")
    return result


async def test_tool_orchestrator():
    """Test tool orchestration (mock mode)."""
    print("\n" + "="*60)
    print("TEST 2: Tool Orchestrator (Mock Mode)")
    print("="*60)

    orchestrator = ToolOrchestrator(tool_timeout=5)

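    # execute_tools is expected to return a dict with "results" plus an
    # "execution_summary" containing "tools_invoked" and "total_time_ms"
    # (inferred from the assertions below).
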
    # Since actual tools may not be available, test the orchestrator structure
    print(f"  Available tools: {list(orchestrator.available_tools.keys())}")

    # Test with tools_to_invoke (will fail gracefully if tools unavailable)
    tools_to_invoke = [
        {"tool": "RAG", "query": "test query", "reason": "testing", "priority": 0.9}
    ]

    result = await orchestrator.execute_tools(
        tools_to_invoke=tools_to_invoke,
        context_state={"session_id": "test"}
    )

    assert "results" in result, "Should return results dict"
    assert "execution_summary" in result, "Should return execution summary"

    summary = result["execution_summary"]
    assert "tools_invoked" in summary, "Summary should include tools_invoked"
    assert "total_time_ms" in summary, "Summary should include timing"

    print("  ✓ Orchestrator structure valid")
    print(f"    Summary: {summary}")

    # Test result formatting
    formatted = orchestrator.format_results_for_context(result)
    assert isinstance(formatted, str), "Should format results as string"

    print("  ✓ Result formatting works")
    print(f"    Formatted length: {len(formatted)} chars")

    print("\n✓ Tool Orchestrator tests passed\n")
    return result


async def test_proactive_monitor():
    """Test proactive monitoring and suggestions."""
    print("\n" + "="*60)
    print("TEST 3: Proactive Monitor")
    print("="*60)

    monitor = ProactiveMonitor(min_priority=0.6)

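    # analyze_session returns None (nothing to suggest, or cooldown active) or
    # a suggestion dict of roughly this shape, inferred from the assertions below:
    #   {"type": "check_in" | "learning" | "summary", "priority": float, "suggestion": str}
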
    # Test 3a: Long silence detection
    context_state = {
        "message_count": 5,
        "minutes_since_last_msg": 35  # > 30 minutes
    }

    self_state = load_self_state()

    suggestion = await monitor.analyze_session(
        session_id="test_silence",
        context_state=context_state,
        self_state=self_state
    )

    assert suggestion is not None, "Should generate suggestion for long silence"
    assert suggestion["type"] == "check_in", f"Should be check_in type: {suggestion['type']}"
    assert suggestion["priority"] >= 0.6, "Priority should meet threshold"

    print("  ✓ Long silence detection passed")
    print(f"    Type: {suggestion['type']}, Priority: {suggestion['priority']:.2f}")
    print(f"    Suggestion: {suggestion['suggestion'][:50]}...")

    # Test 3b: Learning opportunity (high curiosity)
    self_state["curiosity"] = 0.8
    self_state["learning_queue"] = ["quantum computing", "rust programming"]

    # Reset cooldown for this test
    monitor.reset_cooldown("test_learning")

    suggestion = await monitor.analyze_session(
        session_id="test_learning",
        context_state={"message_count": 3, "minutes_since_last_msg": 2},
        self_state=self_state
    )

    assert suggestion is not None, "Should generate learning suggestion"
    assert suggestion["type"] == "learning", f"Should be learning type: {suggestion['type']}"

    print("  ✓ Learning opportunity detection passed")
    print(f"    Suggestion: {suggestion['suggestion'][:70]}...")

    # Test 3c: Conversation milestone
    monitor.reset_cooldown("test_milestone")

    # Reset curiosity to avoid learning suggestion taking precedence
    self_state["curiosity"] = 0.5
    self_state["learning_queue"] = []

    suggestion = await monitor.analyze_session(
        session_id="test_milestone",
        context_state={"message_count": 50, "minutes_since_last_msg": 1},
        self_state=self_state
    )

    assert suggestion is not None, "Should generate milestone suggestion"
    # Note: learning and summary are both valid here, so only check that the
    # suggestion type is reasonable
    assert suggestion["type"] in ["summary", "learning", "check_in"], f"Should be valid type: {suggestion['type']}"

    print(f"  ✓ Conversation milestone detection passed (type: {suggestion['type']})")

    # Test 3d: Cooldown mechanism
    # Try to get another suggestion immediately (should be blocked)
    suggestion2 = await monitor.analyze_session(
        session_id="test_milestone",
        context_state={"message_count": 51, "minutes_since_last_msg": 1},
        self_state=self_state
    )

    assert suggestion2 is None, "Should not generate suggestion during cooldown"

    print("  ✓ Cooldown mechanism working")

    # Check stats
    stats = monitor.get_session_stats("test_milestone")
    assert stats["cooldown_active"], "Cooldown should be active"
    print(f"    Cooldown remaining: {stats['cooldown_remaining']}s")

    print("\n✓ Proactive Monitor tests passed\n")
    return suggestion


async def test_autonomous_actions():
    """Test autonomous action execution."""
    print("\n" + "="*60)
    print("TEST 4: Autonomous Actions")
    print("="*60)

    manager = AutonomousActionManager()

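    # execute_action is expected to return a dict of roughly the shape
    # {"success": bool, "result": dict, "error": str} (inferred from the
    # assertions below; "error" is only checked on the failure path).
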
    # Test 4a: List allowed actions
    allowed = manager.get_allowed_actions()
    assert "create_memory" in allowed, "Should have create_memory action"
    assert "update_goal" in allowed, "Should have update_goal action"
    assert "learn_topic" in allowed, "Should have learn_topic action"

    print(f"  ✓ Allowed actions: {allowed}")

    # Test 4b: Validate actions
    validation = manager.validate_action("create_memory", {"text": "test memory"})
    assert validation["valid"], "Should validate correct action"

    print("  ✓ Action validation passed")

    # Test 4c: Execute learn_topic action
    result = await manager.execute_action(
        action_type="learn_topic",
        parameters={"topic": "rust programming", "reason": "testing", "priority": 0.8},
        context={"session_id": "test"}
    )

    assert result["success"], f"Action should succeed: {result.get('error', 'unknown')}"
    assert "topic" in result["result"], "Should return topic info"

    print("  ✓ learn_topic action executed")
    print(f"    Topic: {result['result']['topic']}")
    print(f"    Queue position: {result['result']['queue_position']}")

    # Test 4d: Execute update_focus action
    result = await manager.execute_action(
        action_type="update_focus",
        parameters={"focus": "autonomy_testing", "reason": "running tests"},
        context={"session_id": "test"}
    )

    assert result["success"], "update_focus should succeed"

    print("  ✓ update_focus action executed")
    print(f"    New focus: {result['result']['new_focus']}")

    # Test 4e: Reject non-whitelisted action
    result = await manager.execute_action(
        action_type="delete_all_files",  # NOT in whitelist
        parameters={},
        context={"session_id": "test"}
    )

    assert not result["success"], "Should reject non-whitelisted action"
    assert "not in whitelist" in result["error"], "Should indicate whitelist violation"

    print("  ✓ Non-whitelisted action rejected")

    # Test 4f: Action log
    log = manager.get_action_log(limit=10)
    assert len(log) >= 2, f"Should have logged multiple actions (got {len(log)})"

    print(f"  ✓ Action log contains {len(log)} entries")

    print("\n✓ Autonomous Actions tests passed\n")
    return result


async def test_pattern_learner():
    """Test pattern learning system."""
    print("\n" + "="*60)
    print("TEST 5: Pattern Learner")
    print("="*60)

    # Use temp file for testing
    test_file = "/tmp/test_patterns.json"
    learner = PatternLearner(patterns_file=test_file)

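    # get_top_topics is assumed to return (topic, score) pairs sorted by
    # frequency; only the first element is asserted on below, and the second
    # is presumably an occurrence count.
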
    # Test 5a: Learn from multiple interactions
    for i in range(5):
        await learner.learn_from_interaction(
            user_prompt=f"Help me with Python coding task {i}",
            response=f"Here's help with task {i}...",
            monologue={"intent": "coding_help", "tone": "focused", "depth": "medium"},
            context={"session_id": "test", "executive_plan": None}
        )

    print("  ✓ Learned from 5 interactions")

    # Test 5b: Get top topics
    top_topics = learner.get_top_topics(limit=5)
    assert len(top_topics) > 0, "Should have learned topics"
    assert top_topics[0][0] == "coding_help", "coding_help should be top topic"

    print(f"  ✓ Top topics: {[t[0] for t in top_topics[:3]]}")

    # Test 5c: Get preferred tone
    preferred_tone = learner.get_preferred_tone()
    assert preferred_tone == "focused", "Should detect focused as preferred tone"

    print(f"  ✓ Preferred tone: {preferred_tone}")

    # Test 5d: Get preferred depth
    preferred_depth = learner.get_preferred_depth()
    assert preferred_depth == "medium", "Should detect medium as preferred depth"

    print(f"  ✓ Preferred depth: {preferred_depth}")

    # Test 5e: Get insights
    insights = learner.get_insights()
    assert insights["total_interactions"] == 5, "Should track interaction count"
    assert insights["preferred_tone"] == "focused", "Insights should include tone"

    print("  ✓ Insights generated:")
    print(f"    Total interactions: {insights['total_interactions']}")
    print(f"    Recommendations: {insights['learning_recommendations']}")

    # Test 5f: Export patterns
    exported = learner.export_patterns()
    assert "topic_frequencies" in exported, "Should export all patterns"

    print(f"  ✓ Patterns exported ({len(exported)} keys)")

    # Cleanup
    if os.path.exists(test_file):
        os.remove(test_file)

    print("\n✓ Pattern Learner tests passed\n")
    return insights


async def test_end_to_end_autonomy():
    """Test complete autonomous flow."""
    print("\n" + "="*60)
    print("TEST 6: End-to-End Autonomy Flow")
    print("="*60)

    # Simulate a complex user query that triggers multiple autonomous systems
    user_prompt = "Remember what we discussed about machine learning? I need current research on transformers."

    monologue = {
        "intent": "technical_research",
        "tone": "focused",
        "depth": "deep",
        "consult_executive": True
    }

    context_state = {
        "session_id": "e2e_test",
        "message_count": 15,
        "minutes_since_last_msg": 5
    }

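    # The monologue dict stands in for the cognitive layer's per-turn output
    # (intent/tone/depth plus an executive-consultation flag), matching the
    # shape used in the earlier tests.
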
    print(f"  User prompt: {user_prompt}")
    print(f"  Monologue intent: {monologue['intent']}")

    # Step 1: Tool decision engine
    engine = ToolDecisionEngine()
    tool_decision = await engine.analyze_tool_needs(
        user_prompt=user_prompt,
        monologue=monologue,
        context_state=context_state,
        available_tools=["RAG", "WEB", "CODEBRAIN"]
    )

    print("\n  Step 1: Tool Decision")
    print(f"    Should invoke: {tool_decision['should_invoke_tools']}")
    print(f"    Tools: {[t['tool'] for t in tool_decision['tools_to_invoke']]}")
    assert tool_decision["should_invoke_tools"], "Should invoke tools"
    assert len(tool_decision["tools_to_invoke"]) >= 2, "Should recommend multiple tools (RAG + WEB)"

    # Step 2: Pattern learning
    learner = PatternLearner(patterns_file="/tmp/e2e_test_patterns.json")
    await learner.learn_from_interaction(
        user_prompt=user_prompt,
        response="Here's information about transformers...",
        monologue=monologue,
        context=context_state
    )

    print("\n  Step 2: Pattern Learning")
    top_topics = learner.get_top_topics(limit=3)
    print(f"    Learned topics: {[t[0] for t in top_topics]}")

    # Step 3: Autonomous action
    action_manager = AutonomousActionManager()
    action_result = await action_manager.execute_action(
        action_type="learn_topic",
        parameters={"topic": "transformer architectures", "reason": "user interest detected"},
        context=context_state
    )

    print("\n  Step 3: Autonomous Action")
    print("    Action: learn_topic")
    print(f"    Success: {action_result['success']}")

    # Step 4: Proactive monitoring (won't trigger due to low message count)
    monitor = ProactiveMonitor(min_priority=0.6)
    monitor.reset_cooldown("e2e_test")

    suggestion = await monitor.analyze_session(
        session_id="e2e_test",
        context_state=context_state,
        self_state=load_self_state()
    )

    print("\n  Step 4: Proactive Monitoring")
    print(f"    Suggestion: {suggestion['type'] if suggestion else 'None (expected for low message count)'}")

    # Cleanup
    if os.path.exists("/tmp/e2e_test_patterns.json"):
        os.remove("/tmp/e2e_test_patterns.json")

    print("\n✓ End-to-End Autonomy Flow tests passed\n")
    return True


async def run_all_tests():
    """Run all Phase 2 tests."""
    print("\n" + "="*60)
    print("PHASE 2 AUTONOMY TESTS")
    print("="*60)

    try:
        # Test 1: Tool Decision Engine
        await test_tool_decision_engine()

        # Test 2: Tool Orchestrator
        await test_tool_orchestrator()

        # Test 3: Proactive Monitor
        await test_proactive_monitor()

        # Test 4: Autonomous Actions
        await test_autonomous_actions()

        # Test 5: Pattern Learner
        await test_pattern_learner()

        # Test 6: End-to-End
        await test_end_to_end_autonomy()

        print("\n" + "="*60)
        print("ALL PHASE 2 TESTS PASSED ✓")
        print("="*60)

        print("\nPhase 2 Features Validated:")
        print("  ✓ Autonomous tool decision making")
        print("  ✓ Tool orchestration and execution")
        print("  ✓ Proactive monitoring and suggestions")
        print("  ✓ Safe autonomous actions")
        print("  ✓ Pattern learning and adaptation")
        print("  ✓ End-to-end autonomous flow")

        return True

    except Exception as e:
        print("\n" + "="*60)
        print(f"TEST FAILED: {e}")
        print("="*60)
        import traceback
        traceback.print_exc()
        return False

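# To run these under pytest instead, a thin wrapper would work (a sketch,
# assuming the pytest-asyncio plugin is available):
#
#     import pytest
#
#     @pytest.mark.asyncio
#     async def test_phase2_autonomy():
#         assert await run_all_tests()
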

if __name__ == "__main__":
    success = asyncio.run(run_all_tests())
    sys.exit(0 if success else 1)