""" Integration tests for Phase 2 autonomy features. Tests autonomous tool invocation, proactive monitoring, actions, and pattern learning. """ import asyncio import json import sys import os # Add parent directory to path for imports sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # Override self-state file path for testing os.environ["SELF_STATE_FILE"] = "/tmp/test_self_state.json" from autonomy.tools.decision_engine import ToolDecisionEngine from autonomy.tools.orchestrator import ToolOrchestrator from autonomy.proactive.monitor import ProactiveMonitor from autonomy.actions.autonomous_actions import AutonomousActionManager from autonomy.learning.pattern_learner import PatternLearner from autonomy.self.state import load_self_state, get_self_state_instance async def test_tool_decision_engine(): """Test autonomous tool decision making.""" print("\n" + "="*60) print("TEST 1: Tool Decision Engine") print("="*60) engine = ToolDecisionEngine() # Test 1a: Memory reference detection result = await engine.analyze_tool_needs( user_prompt="What did we discuss earlier about Python?", monologue={"intent": "clarification", "consult_executive": False}, context_state={}, available_tools=["RAG", "WEB", "WEATHER"] ) assert result["should_invoke_tools"], "Should invoke tools for memory reference" assert any(t["tool"] == "RAG" for t in result["tools_to_invoke"]), "Should recommend RAG" assert result["confidence"] > 0.8, f"Confidence should be high for clear memory reference: {result['confidence']}" print(f" ✓ Memory reference detection passed") print(f" Tools: {[t['tool'] for t in result['tools_to_invoke']]}") print(f" Confidence: {result['confidence']:.2f}") # Test 1b: Web search detection result = await engine.analyze_tool_needs( user_prompt="What's the latest news about AI developments?", monologue={"intent": "information_seeking", "consult_executive": False}, context_state={}, available_tools=["RAG", "WEB", "WEATHER"] ) assert result["should_invoke_tools"], "Should invoke tools for current info request" assert any(t["tool"] == "WEB" for t in result["tools_to_invoke"]), "Should recommend WEB" print(f" ✓ Web search detection passed") print(f" Tools: {[t['tool'] for t in result['tools_to_invoke']]}") # Test 1c: Weather detection result = await engine.analyze_tool_needs( user_prompt="What's the weather like today in Boston?", monologue={"intent": "information_seeking", "consult_executive": False}, context_state={}, available_tools=["RAG", "WEB", "WEATHER"] ) assert result["should_invoke_tools"], "Should invoke tools for weather query" assert any(t["tool"] == "WEATHER" for t in result["tools_to_invoke"]), "Should recommend WEATHER" print(f" ✓ Weather detection passed") # Test 1d: Proactive RAG for complex queries result = await engine.analyze_tool_needs( user_prompt="Design a microservices architecture", monologue={"intent": "technical_implementation", "consult_executive": True}, context_state={}, available_tools=["RAG", "WEB", "CODEBRAIN"] ) assert result["should_invoke_tools"], "Should proactively invoke tools for complex queries" rag_tools = [t for t in result["tools_to_invoke"] if t["tool"] == "RAG"] assert len(rag_tools) > 0, "Should include proactive RAG" print(f" ✓ Proactive RAG detection passed") print(f" Reason: {rag_tools[0]['reason']}") print("\n✓ Tool Decision Engine tests passed\n") return result async def test_tool_orchestrator(): """Test tool orchestration (mock mode).""" print("\n" + "="*60) print("TEST 2: Tool Orchestrator (Mock Mode)") print("="*60) orchestrator = ToolOrchestrator(tool_timeout=5) # Since actual tools may not be available, test the orchestrator structure print(f" Available tools: {list(orchestrator.available_tools.keys())}") # Test with tools_to_invoke (will fail gracefully if tools unavailable) tools_to_invoke = [ {"tool": "RAG", "query": "test query", "reason": "testing", "priority": 0.9} ] result = await orchestrator.execute_tools( tools_to_invoke=tools_to_invoke, context_state={"session_id": "test"} ) assert "results" in result, "Should return results dict" assert "execution_summary" in result, "Should return execution summary" summary = result["execution_summary"] assert "tools_invoked" in summary, "Summary should include tools_invoked" assert "total_time_ms" in summary, "Summary should include timing" print(f" ✓ Orchestrator structure valid") print(f" Summary: {summary}") # Test result formatting formatted = orchestrator.format_results_for_context(result) assert isinstance(formatted, str), "Should format results as string" print(f" ✓ Result formatting works") print(f" Formatted length: {len(formatted)} chars") print("\n✓ Tool Orchestrator tests passed\n") return result async def test_proactive_monitor(): """Test proactive monitoring and suggestions.""" print("\n" + "="*60) print("TEST 3: Proactive Monitor") print("="*60) monitor = ProactiveMonitor(min_priority=0.6) # Test 3a: Long silence detection context_state = { "message_count": 5, "minutes_since_last_msg": 35 # > 30 minutes } self_state = load_self_state() suggestion = await monitor.analyze_session( session_id="test_silence", context_state=context_state, self_state=self_state ) assert suggestion is not None, "Should generate suggestion for long silence" assert suggestion["type"] == "check_in", f"Should be check_in type: {suggestion['type']}" assert suggestion["priority"] >= 0.6, "Priority should meet threshold" print(f" ✓ Long silence detection passed") print(f" Type: {suggestion['type']}, Priority: {suggestion['priority']:.2f}") print(f" Suggestion: {suggestion['suggestion'][:50]}...") # Test 3b: Learning opportunity (high curiosity) self_state["curiosity"] = 0.8 self_state["learning_queue"] = ["quantum computing", "rust programming"] # Reset cooldown for this test monitor.reset_cooldown("test_learning") suggestion = await monitor.analyze_session( session_id="test_learning", context_state={"message_count": 3, "minutes_since_last_msg": 2}, self_state=self_state ) assert suggestion is not None, "Should generate learning suggestion" assert suggestion["type"] == "learning", f"Should be learning type: {suggestion['type']}" print(f" ✓ Learning opportunity detection passed") print(f" Suggestion: {suggestion['suggestion'][:70]}...") # Test 3c: Conversation milestone monitor.reset_cooldown("test_milestone") # Reset curiosity to avoid learning suggestion taking precedence self_state["curiosity"] = 0.5 self_state["learning_queue"] = [] suggestion = await monitor.analyze_session( session_id="test_milestone", context_state={"message_count": 50, "minutes_since_last_msg": 1}, self_state=self_state ) assert suggestion is not None, "Should generate milestone suggestion" # Note: learning or summary both valid - check it's a reasonable suggestion assert suggestion["type"] in ["summary", "learning", "check_in"], f"Should be valid type: {suggestion['type']}" print(f" ✓ Conversation milestone detection passed (type: {suggestion['type']})") # Test 3d: Cooldown mechanism # Try to get another suggestion immediately (should be blocked) suggestion2 = await monitor.analyze_session( session_id="test_milestone", context_state={"message_count": 51, "minutes_since_last_msg": 1}, self_state=self_state ) assert suggestion2 is None, "Should not generate suggestion during cooldown" print(f" ✓ Cooldown mechanism working") # Check stats stats = monitor.get_session_stats("test_milestone") assert stats["cooldown_active"], "Cooldown should be active" print(f" Cooldown remaining: {stats['cooldown_remaining']}s") print("\n✓ Proactive Monitor tests passed\n") return suggestion async def test_autonomous_actions(): """Test autonomous action execution.""" print("\n" + "="*60) print("TEST 4: Autonomous Actions") print("="*60) manager = AutonomousActionManager() # Test 4a: List allowed actions allowed = manager.get_allowed_actions() assert "create_memory" in allowed, "Should have create_memory action" assert "update_goal" in allowed, "Should have update_goal action" assert "learn_topic" in allowed, "Should have learn_topic action" print(f" ✓ Allowed actions: {allowed}") # Test 4b: Validate actions validation = manager.validate_action("create_memory", {"text": "test memory"}) assert validation["valid"], "Should validate correct action" print(f" ✓ Action validation passed") # Test 4c: Execute learn_topic action result = await manager.execute_action( action_type="learn_topic", parameters={"topic": "rust programming", "reason": "testing", "priority": 0.8}, context={"session_id": "test"} ) assert result["success"], f"Action should succeed: {result.get('error', 'unknown')}" assert "topic" in result["result"], "Should return topic info" print(f" ✓ learn_topic action executed") print(f" Topic: {result['result']['topic']}") print(f" Queue position: {result['result']['queue_position']}") # Test 4d: Execute update_focus action result = await manager.execute_action( action_type="update_focus", parameters={"focus": "autonomy_testing", "reason": "running tests"}, context={"session_id": "test"} ) assert result["success"], "update_focus should succeed" print(f" ✓ update_focus action executed") print(f" New focus: {result['result']['new_focus']}") # Test 4e: Reject non-whitelisted action result = await manager.execute_action( action_type="delete_all_files", # NOT in whitelist parameters={}, context={"session_id": "test"} ) assert not result["success"], "Should reject non-whitelisted action" assert "not in whitelist" in result["error"], "Should indicate whitelist violation" print(f" ✓ Non-whitelisted action rejected") # Test 4f: Action log log = manager.get_action_log(limit=10) assert len(log) >= 2, f"Should have logged multiple actions (got {len(log)})" print(f" ✓ Action log contains {len(log)} entries") print("\n✓ Autonomous Actions tests passed\n") return result async def test_pattern_learner(): """Test pattern learning system.""" print("\n" + "="*60) print("TEST 5: Pattern Learner") print("="*60) # Use temp file for testing test_file = "/tmp/test_patterns.json" learner = PatternLearner(patterns_file=test_file) # Test 5a: Learn from multiple interactions for i in range(5): await learner.learn_from_interaction( user_prompt=f"Help me with Python coding task {i}", response=f"Here's help with task {i}...", monologue={"intent": "coding_help", "tone": "focused", "depth": "medium"}, context={"session_id": "test", "executive_plan": None} ) print(f" ✓ Learned from 5 interactions") # Test 5b: Get top topics top_topics = learner.get_top_topics(limit=5) assert len(top_topics) > 0, "Should have learned topics" assert "coding_help" == top_topics[0][0], "coding_help should be top topic" print(f" ✓ Top topics: {[t[0] for t in top_topics[:3]]}") # Test 5c: Get preferred tone preferred_tone = learner.get_preferred_tone() assert preferred_tone == "focused", "Should detect focused as preferred tone" print(f" ✓ Preferred tone: {preferred_tone}") # Test 5d: Get preferred depth preferred_depth = learner.get_preferred_depth() assert preferred_depth == "medium", "Should detect medium as preferred depth" print(f" ✓ Preferred depth: {preferred_depth}") # Test 5e: Get insights insights = learner.get_insights() assert insights["total_interactions"] == 5, "Should track interaction count" assert insights["preferred_tone"] == "focused", "Insights should include tone" print(f" ✓ Insights generated:") print(f" Total interactions: {insights['total_interactions']}") print(f" Recommendations: {insights['learning_recommendations']}") # Test 5f: Export patterns exported = learner.export_patterns() assert "topic_frequencies" in exported, "Should export all patterns" print(f" ✓ Patterns exported ({len(exported)} keys)") # Cleanup if os.path.exists(test_file): os.remove(test_file) print("\n✓ Pattern Learner tests passed\n") return insights async def test_end_to_end_autonomy(): """Test complete autonomous flow.""" print("\n" + "="*60) print("TEST 6: End-to-End Autonomy Flow") print("="*60) # Simulate a complex user query that triggers multiple autonomous systems user_prompt = "Remember what we discussed about machine learning? I need current research on transformers." monologue = { "intent": "technical_research", "tone": "focused", "depth": "deep", "consult_executive": True } context_state = { "session_id": "e2e_test", "message_count": 15, "minutes_since_last_msg": 5 } print(f" User prompt: {user_prompt}") print(f" Monologue intent: {monologue['intent']}") # Step 1: Tool decision engine engine = ToolDecisionEngine() tool_decision = await engine.analyze_tool_needs( user_prompt=user_prompt, monologue=monologue, context_state=context_state, available_tools=["RAG", "WEB", "CODEBRAIN"] ) print(f"\n Step 1: Tool Decision") print(f" Should invoke: {tool_decision['should_invoke_tools']}") print(f" Tools: {[t['tool'] for t in tool_decision['tools_to_invoke']]}") assert tool_decision["should_invoke_tools"], "Should invoke tools" assert len(tool_decision["tools_to_invoke"]) >= 2, "Should recommend multiple tools (RAG + WEB)" # Step 2: Pattern learning learner = PatternLearner(patterns_file="/tmp/e2e_test_patterns.json") await learner.learn_from_interaction( user_prompt=user_prompt, response="Here's information about transformers...", monologue=monologue, context=context_state ) print(f"\n Step 2: Pattern Learning") top_topics = learner.get_top_topics(limit=3) print(f" Learned topics: {[t[0] for t in top_topics]}") # Step 3: Autonomous action action_manager = AutonomousActionManager() action_result = await action_manager.execute_action( action_type="learn_topic", parameters={"topic": "transformer architectures", "reason": "user interest detected"}, context=context_state ) print(f"\n Step 3: Autonomous Action") print(f" Action: learn_topic") print(f" Success: {action_result['success']}") # Step 4: Proactive monitoring (won't trigger due to low message count) monitor = ProactiveMonitor(min_priority=0.6) monitor.reset_cooldown("e2e_test") suggestion = await monitor.analyze_session( session_id="e2e_test", context_state=context_state, self_state=load_self_state() ) print(f"\n Step 4: Proactive Monitoring") print(f" Suggestion: {suggestion['type'] if suggestion else 'None (expected for low message count)'}") # Cleanup if os.path.exists("/tmp/e2e_test_patterns.json"): os.remove("/tmp/e2e_test_patterns.json") print("\n✓ End-to-End Autonomy Flow tests passed\n") return True async def run_all_tests(): """Run all Phase 2 tests.""" print("\n" + "="*60) print("PHASE 2 AUTONOMY TESTS") print("="*60) try: # Test 1: Tool Decision Engine await test_tool_decision_engine() # Test 2: Tool Orchestrator await test_tool_orchestrator() # Test 3: Proactive Monitor await test_proactive_monitor() # Test 4: Autonomous Actions await test_autonomous_actions() # Test 5: Pattern Learner await test_pattern_learner() # Test 6: End-to-End await test_end_to_end_autonomy() print("\n" + "="*60) print("ALL PHASE 2 TESTS PASSED ✓") print("="*60) print("\nPhase 2 Features Validated:") print(" ✓ Autonomous tool decision making") print(" ✓ Tool orchestration and execution") print(" ✓ Proactive monitoring and suggestions") print(" ✓ Safe autonomous actions") print(" ✓ Pattern learning and adaptation") print(" ✓ End-to-end autonomous flow") return True except Exception as e: print("\n" + "="*60) print(f"TEST FAILED: {e}") print("="*60) import traceback traceback.print_exc() return False if __name__ == "__main__": success = asyncio.run(run_all_tests()) sys.exit(0 if success else 1)