autonomy phase 2

serversdwn
2025-12-14 14:43:08 -05:00
parent 49f792f20c
commit 193bf814ec
12 changed files with 2258 additions and 4 deletions


@@ -0,0 +1,495 @@
"""
Integration tests for Phase 2 autonomy features.
Tests autonomous tool invocation, proactive monitoring, actions, and pattern learning.
"""
import asyncio
import json
import sys
import os
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Override the self-state file path for testing; set before the autonomy
# imports so any import-time reads pick up the test path
os.environ["SELF_STATE_FILE"] = "/tmp/test_self_state.json"
from autonomy.tools.decision_engine import ToolDecisionEngine
from autonomy.tools.orchestrator import ToolOrchestrator
from autonomy.proactive.monitor import ProactiveMonitor
from autonomy.actions.autonomous_actions import AutonomousActionManager
from autonomy.learning.pattern_learner import PatternLearner
from autonomy.self.state import load_self_state, get_self_state_instance


async def test_tool_decision_engine():
"""Test autonomous tool decision making."""
print("\n" + "="*60)
print("TEST 1: Tool Decision Engine")
print("="*60)
engine = ToolDecisionEngine()
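# Expected result shape for analyze_tool_needs(), inferred from the assertions
# below rather than from the engine's own docs: a dict with
# "should_invoke_tools" (bool), "confidence" (0.0-1.0), and "tools_to_invoke",
# a list of entries carrying at least "tool" and "reason" (and presumably the
# "query"/"priority" fields the orchestrator consumes in Test 2).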
# Test 1a: Memory reference detection
result = await engine.analyze_tool_needs(
user_prompt="What did we discuss earlier about Python?",
monologue={"intent": "clarification", "consult_executive": False},
context_state={},
available_tools=["RAG", "WEB", "WEATHER"]
)
assert result["should_invoke_tools"], "Should invoke tools for memory reference"
assert any(t["tool"] == "RAG" for t in result["tools_to_invoke"]), "Should recommend RAG"
assert result["confidence"] > 0.8, f"Confidence should be high for clear memory reference: {result['confidence']}"
print(f" ✓ Memory reference detection passed")
print(f" Tools: {[t['tool'] for t in result['tools_to_invoke']]}")
print(f" Confidence: {result['confidence']:.2f}")
# Test 1b: Web search detection
result = await engine.analyze_tool_needs(
user_prompt="What's the latest news about AI developments?",
monologue={"intent": "information_seeking", "consult_executive": False},
context_state={},
available_tools=["RAG", "WEB", "WEATHER"]
)
assert result["should_invoke_tools"], "Should invoke tools for current info request"
assert any(t["tool"] == "WEB" for t in result["tools_to_invoke"]), "Should recommend WEB"
print(f" ✓ Web search detection passed")
print(f" Tools: {[t['tool'] for t in result['tools_to_invoke']]}")
# Test 1c: Weather detection
result = await engine.analyze_tool_needs(
user_prompt="What's the weather like today in Boston?",
monologue={"intent": "information_seeking", "consult_executive": False},
context_state={},
available_tools=["RAG", "WEB", "WEATHER"]
)
assert result["should_invoke_tools"], "Should invoke tools for weather query"
assert any(t["tool"] == "WEATHER" for t in result["tools_to_invoke"]), "Should recommend WEATHER"
print(f" ✓ Weather detection passed")
# Test 1d: Proactive RAG for complex queries
result = await engine.analyze_tool_needs(
user_prompt="Design a microservices architecture",
monologue={"intent": "technical_implementation", "consult_executive": True},
context_state={},
available_tools=["RAG", "WEB", "CODEBRAIN"]
)
assert result["should_invoke_tools"], "Should proactively invoke tools for complex queries"
rag_tools = [t for t in result["tools_to_invoke"] if t["tool"] == "RAG"]
assert len(rag_tools) > 0, "Should include proactive RAG"
print(f" ✓ Proactive RAG detection passed")
print(f" Reason: {rag_tools[0]['reason']}")
print("\n✓ Tool Decision Engine tests passed\n")
return result


async def test_tool_orchestrator():
"""Test tool orchestration (mock mode)."""
print("\n" + "="*60)
print("TEST 2: Tool Orchestrator (Mock Mode)")
print("="*60)
orchestrator = ToolOrchestrator(tool_timeout=5)
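# execute_tools() is expected to return {"results": ..., "execution_summary": ...},
# where the summary carries "tools_invoked" and "total_time_ms" (asserted below).
# tool_timeout is assumed to be in seconds.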
# Since actual tools may not be available, test the orchestrator structure
print(f" Available tools: {list(orchestrator.available_tools.keys())}")
# Test with tools_to_invoke (will fail gracefully if tools unavailable)
tools_to_invoke = [
{"tool": "RAG", "query": "test query", "reason": "testing", "priority": 0.9}
]
result = await orchestrator.execute_tools(
tools_to_invoke=tools_to_invoke,
context_state={"session_id": "test"}
)
assert "results" in result, "Should return results dict"
assert "execution_summary" in result, "Should return execution summary"
summary = result["execution_summary"]
assert "tools_invoked" in summary, "Summary should include tools_invoked"
assert "total_time_ms" in summary, "Summary should include timing"
print(f" ✓ Orchestrator structure valid")
print(f" Summary: {summary}")
# Test result formatting
formatted = orchestrator.format_results_for_context(result)
assert isinstance(formatted, str), "Should format results as string"
print(f" ✓ Result formatting works")
print(f" Formatted length: {len(formatted)} chars")
print("\n✓ Tool Orchestrator tests passed\n")
return result


async def test_proactive_monitor():
"""Test proactive monitoring and suggestions."""
print("\n" + "="*60)
print("TEST 3: Proactive Monitor")
print("="*60)
monitor = ProactiveMonitor(min_priority=0.6)
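# analyze_session() returns None when nothing is worth surfacing (or a cooldown
# is active), otherwise a dict with at least "type", "priority", and a
# "suggestion" string; min_priority presumably sets the floor a suggestion's
# priority must clear.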
# Test 3a: Long silence detection
context_state = {
"message_count": 5,
"minutes_since_last_msg": 35 # > 30 minutes
}
self_state = load_self_state()
suggestion = await monitor.analyze_session(
session_id="test_silence",
context_state=context_state,
self_state=self_state
)
assert suggestion is not None, "Should generate suggestion for long silence"
assert suggestion["type"] == "check_in", f"Should be check_in type: {suggestion['type']}"
assert suggestion["priority"] >= 0.6, "Priority should meet threshold"
print(f" ✓ Long silence detection passed")
print(f" Type: {suggestion['type']}, Priority: {suggestion['priority']:.2f}")
print(f" Suggestion: {suggestion['suggestion'][:50]}...")
# Test 3b: Learning opportunity (high curiosity)
self_state["curiosity"] = 0.8
self_state["learning_queue"] = ["quantum computing", "rust programming"]
# Reset cooldown for this test
monitor.reset_cooldown("test_learning")
suggestion = await monitor.analyze_session(
session_id="test_learning",
context_state={"message_count": 3, "minutes_since_last_msg": 2},
self_state=self_state
)
assert suggestion is not None, "Should generate learning suggestion"
assert suggestion["type"] == "learning", f"Should be learning type: {suggestion['type']}"
print(f" ✓ Learning opportunity detection passed")
print(f" Suggestion: {suggestion['suggestion'][:70]}...")
# Test 3c: Conversation milestone
monitor.reset_cooldown("test_milestone")
# Reset curiosity so the learning suggestion doesn't take precedence
self_state["curiosity"] = 0.5
self_state["learning_queue"] = []
suggestion = await monitor.analyze_session(
session_id="test_milestone",
context_state={"message_count": 50, "minutes_since_last_msg": 1},
self_state=self_state
)
assert suggestion is not None, "Should generate milestone suggestion"
# Note: either learning or summary is valid here; just check the type is reasonable
assert suggestion["type"] in ["summary", "learning", "check_in"], f"Should be valid type: {suggestion['type']}"
print(f" ✓ Conversation milestone detection passed (type: {suggestion['type']})")
# Test 3d: Cooldown mechanism
# Try to get another suggestion immediately (should be blocked)
suggestion2 = await monitor.analyze_session(
session_id="test_milestone",
context_state={"message_count": 51, "minutes_since_last_msg": 1},
self_state=self_state
)
assert suggestion2 is None, "Should not generate suggestion during cooldown"
print(f" ✓ Cooldown mechanism working")
# Check stats
stats = monitor.get_session_stats("test_milestone")
assert stats["cooldown_active"], "Cooldown should be active"
print(f" Cooldown remaining: {stats['cooldown_remaining']}s")
print("\n✓ Proactive Monitor tests passed\n")
return suggestion


async def test_autonomous_actions():
"""Test autonomous action execution."""
print("\n" + "="*60)
print("TEST 4: Autonomous Actions")
print("="*60)
manager = AutonomousActionManager()
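# execute_action() results take the form {"success": bool, "result": {...}} on
# success and {"success": False, "error": "..."} on rejection (see Test 4e);
# only whitelisted action types are allowed to run.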
# Test 4a: List allowed actions
allowed = manager.get_allowed_actions()
assert "create_memory" in allowed, "Should have create_memory action"
assert "update_goal" in allowed, "Should have update_goal action"
assert "learn_topic" in allowed, "Should have learn_topic action"
print(f" ✓ Allowed actions: {allowed}")
# Test 4b: Validate actions
validation = manager.validate_action("create_memory", {"text": "test memory"})
assert validation["valid"], "Should validate correct action"
print(f" ✓ Action validation passed")
# Test 4c: Execute learn_topic action
result = await manager.execute_action(
action_type="learn_topic",
parameters={"topic": "rust programming", "reason": "testing", "priority": 0.8},
context={"session_id": "test"}
)
assert result["success"], f"Action should succeed: {result.get('error', 'unknown')}"
assert "topic" in result["result"], "Should return topic info"
print(f" ✓ learn_topic action executed")
print(f" Topic: {result['result']['topic']}")
print(f" Queue position: {result['result']['queue_position']}")
# Test 4d: Execute update_focus action
result = await manager.execute_action(
action_type="update_focus",
parameters={"focus": "autonomy_testing", "reason": "running tests"},
context={"session_id": "test"}
)
assert result["success"], "update_focus should succeed"
print(f" ✓ update_focus action executed")
print(f" New focus: {result['result']['new_focus']}")
# Test 4e: Reject non-whitelisted action
result = await manager.execute_action(
action_type="delete_all_files", # NOT in whitelist
parameters={},
context={"session_id": "test"}
)
assert not result["success"], "Should reject non-whitelisted action"
assert "not in whitelist" in result["error"], "Should indicate whitelist violation"
print(f" ✓ Non-whitelisted action rejected")
# Test 4f: Action log
log = manager.get_action_log(limit=10)
assert len(log) >= 2, f"Should have logged multiple actions (got {len(log)})"
print(f" ✓ Action log contains {len(log)} entries")
print("\n✓ Autonomous Actions tests passed\n")
return result


async def test_pattern_learner():
"""Test pattern learning system."""
print("\n" + "="*60)
print("TEST 5: Pattern Learner")
print("="*60)
# Use temp file for testing
test_file = "/tmp/test_patterns.json"
learner = PatternLearner(patterns_file=test_file)
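# PatternLearner aggregates topic frequencies plus tone/depth preferences from
# each interaction and (judging by the cleanup below) persists them to
# patterns_file; the getters exercised here read those aggregates back out.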
# Test 5a: Learn from multiple interactions
for i in range(5):
await learner.learn_from_interaction(
user_prompt=f"Help me with Python coding task {i}",
response=f"Here's help with task {i}...",
monologue={"intent": "coding_help", "tone": "focused", "depth": "medium"},
context={"session_id": "test", "executive_plan": None}
)
print(f" ✓ Learned from 5 interactions")
# Test 5b: Get top topics
top_topics = learner.get_top_topics(limit=5)
assert len(top_topics) > 0, "Should have learned topics"
assert "coding_help" == top_topics[0][0], "coding_help should be top topic"
print(f" ✓ Top topics: {[t[0] for t in top_topics[:3]]}")
# Test 5c: Get preferred tone
preferred_tone = learner.get_preferred_tone()
assert preferred_tone == "focused", "Should detect focused as preferred tone"
print(f" ✓ Preferred tone: {preferred_tone}")
# Test 5d: Get preferred depth
preferred_depth = learner.get_preferred_depth()
assert preferred_depth == "medium", "Should detect medium as preferred depth"
print(f" ✓ Preferred depth: {preferred_depth}")
# Test 5e: Get insights
insights = learner.get_insights()
assert insights["total_interactions"] == 5, "Should track interaction count"
assert insights["preferred_tone"] == "focused", "Insights should include tone"
print(f" ✓ Insights generated:")
print(f" Total interactions: {insights['total_interactions']}")
print(f" Recommendations: {insights['learning_recommendations']}")
# Test 5f: Export patterns
exported = learner.export_patterns()
assert "topic_frequencies" in exported, "Should export all patterns"
print(f" ✓ Patterns exported ({len(exported)} keys)")
# Cleanup
if os.path.exists(test_file):
os.remove(test_file)
print("\n✓ Pattern Learner tests passed\n")
return insights


async def test_end_to_end_autonomy():
"""Test complete autonomous flow."""
print("\n" + "="*60)
print("TEST 6: End-to-End Autonomy Flow")
print("="*60)
# Simulate a complex user query that triggers multiple autonomous systems
user_prompt = "Remember what we discussed about machine learning? I need current research on transformers."
monologue = {
"intent": "technical_research",
"tone": "focused",
"depth": "deep",
"consult_executive": True
}
context_state = {
"session_id": "e2e_test",
"message_count": 15,
"minutes_since_last_msg": 5
}
print(f" User prompt: {user_prompt}")
print(f" Monologue intent: {monologue['intent']}")
# Step 1: Tool decision engine
engine = ToolDecisionEngine()
tool_decision = await engine.analyze_tool_needs(
user_prompt=user_prompt,
monologue=monologue,
context_state=context_state,
available_tools=["RAG", "WEB", "CODEBRAIN"]
)
print(f"\n Step 1: Tool Decision")
print(f" Should invoke: {tool_decision['should_invoke_tools']}")
print(f" Tools: {[t['tool'] for t in tool_decision['tools_to_invoke']]}")
assert tool_decision["should_invoke_tools"], "Should invoke tools"
assert len(tool_decision["tools_to_invoke"]) >= 2, "Should recommend multiple tools (RAG + WEB)"
# Step 2: Pattern learning
learner = PatternLearner(patterns_file="/tmp/e2e_test_patterns.json")
await learner.learn_from_interaction(
user_prompt=user_prompt,
response="Here's information about transformers...",
monologue=monologue,
context=context_state
)
print(f"\n Step 2: Pattern Learning")
top_topics = learner.get_top_topics(limit=3)
print(f" Learned topics: {[t[0] for t in top_topics]}")
# Step 3: Autonomous action
action_manager = AutonomousActionManager()
action_result = await action_manager.execute_action(
action_type="learn_topic",
parameters={"topic": "transformer architectures", "reason": "user interest detected"},
context=context_state
)
print(f"\n Step 3: Autonomous Action")
print(f" Action: learn_topic")
print(f" Success: {action_result['success']}")
# Step 4: Proactive monitoring (won't trigger due to low message count)
monitor = ProactiveMonitor(min_priority=0.6)
monitor.reset_cooldown("e2e_test")
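# With only 15 messages and 5 minutes of silence, no trigger condition should
# fire, so a None suggestion is the expected outcome here (printed below).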
suggestion = await monitor.analyze_session(
session_id="e2e_test",
context_state=context_state,
self_state=load_self_state()
)
print(f"\n Step 4: Proactive Monitoring")
print(f" Suggestion: {suggestion['type'] if suggestion else 'None (expected for low message count)'}")
# Cleanup
if os.path.exists("/tmp/e2e_test_patterns.json"):
os.remove("/tmp/e2e_test_patterns.json")
print("\n✓ End-to-End Autonomy Flow tests passed\n")
return True


async def run_all_tests():
"""Run all Phase 2 tests."""
print("\n" + "="*60)
print("PHASE 2 AUTONOMY TESTS")
print("="*60)
try:
# Test 1: Tool Decision Engine
await test_tool_decision_engine()
# Test 2: Tool Orchestrator
await test_tool_orchestrator()
# Test 3: Proactive Monitor
await test_proactive_monitor()
# Test 4: Autonomous Actions
await test_autonomous_actions()
# Test 5: Pattern Learner
await test_pattern_learner()
# Test 6: End-to-End
await test_end_to_end_autonomy()
print("\n" + "="*60)
print("ALL PHASE 2 TESTS PASSED ✓")
print("="*60)
print("\nPhase 2 Features Validated:")
print(" ✓ Autonomous tool decision making")
print(" ✓ Tool orchestration and execution")
print(" ✓ Proactive monitoring and suggestions")
print(" ✓ Safe autonomous actions")
print(" ✓ Pattern learning and adaptation")
print(" ✓ End-to-end autonomous flow")
return True
except Exception as e:
print("\n" + "="*60)
print(f"TEST FAILED: {e}")
print("="*60)
import traceback
traceback.print_exc()
return False


if __name__ == "__main__":
success = asyncio.run(run_all_tests())
sys.exit(0 if success else 1)