"""
|
|
Integration tests for Phase 2 autonomy features.
|
|
Tests autonomous tool invocation, proactive monitoring, actions, and pattern learning.
|
|
"""
import asyncio
import json
import os
import sys

# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Override self-state file path for testing
os.environ["SELF_STATE_FILE"] = "/tmp/test_self_state.json"

from autonomy.tools.decision_engine import ToolDecisionEngine
from autonomy.tools.orchestrator import ToolOrchestrator
from autonomy.proactive.monitor import ProactiveMonitor
from autonomy.actions.autonomous_actions import AutonomousActionManager
from autonomy.learning.pattern_learner import PatternLearner
from autonomy.self.state import load_self_state, get_self_state_instance


async def test_tool_decision_engine():
    """Test autonomous tool decision making."""
    print("\n" + "="*60)
    print("TEST 1: Tool Decision Engine")
    print("="*60)

    engine = ToolDecisionEngine()

    # Test 1a: Memory reference detection
    result = await engine.analyze_tool_needs(
        user_prompt="What did we discuss earlier about Python?",
        monologue={"intent": "clarification", "consult_executive": False},
        context_state={},
        available_tools=["RAG", "WEB", "WEATHER"]
    )

    assert result["should_invoke_tools"], "Should invoke tools for memory reference"
    assert any(t["tool"] == "RAG" for t in result["tools_to_invoke"]), "Should recommend RAG"
    assert result["confidence"] > 0.8, f"Confidence should be high for clear memory reference: {result['confidence']}"

    print(f" ✓ Memory reference detection passed")
    print(f" Tools: {[t['tool'] for t in result['tools_to_invoke']]}")
    print(f" Confidence: {result['confidence']:.2f}")

    # Test 1b: Web search detection
    result = await engine.analyze_tool_needs(
        user_prompt="What's the latest news about AI developments?",
        monologue={"intent": "information_seeking", "consult_executive": False},
        context_state={},
        available_tools=["RAG", "WEB", "WEATHER"]
    )

    assert result["should_invoke_tools"], "Should invoke tools for current info request"
    assert any(t["tool"] == "WEB" for t in result["tools_to_invoke"]), "Should recommend WEB"

    print(f" ✓ Web search detection passed")
    print(f" Tools: {[t['tool'] for t in result['tools_to_invoke']]}")

    # Test 1c: Weather detection
    result = await engine.analyze_tool_needs(
        user_prompt="What's the weather like today in Boston?",
        monologue={"intent": "information_seeking", "consult_executive": False},
        context_state={},
        available_tools=["RAG", "WEB", "WEATHER"]
    )

    assert result["should_invoke_tools"], "Should invoke tools for weather query"
    assert any(t["tool"] == "WEATHER" for t in result["tools_to_invoke"]), "Should recommend WEATHER"

    print(f" ✓ Weather detection passed")

    # Test 1d: Proactive RAG for complex queries
    result = await engine.analyze_tool_needs(
        user_prompt="Design a microservices architecture",
        monologue={"intent": "technical_implementation", "consult_executive": True},
        context_state={},
        available_tools=["RAG", "WEB", "CODEBRAIN"]
    )

    assert result["should_invoke_tools"], "Should proactively invoke tools for complex queries"
    rag_tools = [t for t in result["tools_to_invoke"] if t["tool"] == "RAG"]
    assert len(rag_tools) > 0, "Should include proactive RAG"

    print(f" ✓ Proactive RAG detection passed")
    print(f" Reason: {rag_tools[0]['reason']}")

    print("\n✓ Tool Decision Engine tests passed\n")
    return result


async def test_tool_orchestrator():
    """Test tool orchestration (mock mode)."""
    print("\n" + "="*60)
    print("TEST 2: Tool Orchestrator (Mock Mode)")
    print("="*60)

    orchestrator = ToolOrchestrator(tool_timeout=5)

    # Since actual tools may not be available, test the orchestrator structure
    print(f" Available tools: {list(orchestrator.available_tools.keys())}")

    # Test with tools_to_invoke (will fail gracefully if tools unavailable)
    tools_to_invoke = [
        {"tool": "RAG", "query": "test query", "reason": "testing", "priority": 0.9}
    ]

    result = await orchestrator.execute_tools(
        tools_to_invoke=tools_to_invoke,
        context_state={"session_id": "test"}
    )

    assert "results" in result, "Should return results dict"
    assert "execution_summary" in result, "Should return execution summary"

    summary = result["execution_summary"]
    assert "tools_invoked" in summary, "Summary should include tools_invoked"
    assert "total_time_ms" in summary, "Summary should include timing"

    print(f" ✓ Orchestrator structure valid")
    print(f" Summary: {summary}")

    # Test result formatting
    formatted = orchestrator.format_results_for_context(result)
    assert isinstance(formatted, str), "Should format results as string"

    print(f" ✓ Result formatting works")
    print(f" Formatted length: {len(formatted)} chars")

    print("\n✓ Tool Orchestrator tests passed\n")
    return result


async def test_proactive_monitor():
    """Test proactive monitoring and suggestions."""
    print("\n" + "="*60)
    print("TEST 3: Proactive Monitor")
    print("="*60)

    monitor = ProactiveMonitor(min_priority=0.6)

    # Test 3a: Long silence detection
    context_state = {
        "message_count": 5,
        "minutes_since_last_msg": 35  # > 30 minutes
    }

    self_state = load_self_state()

    suggestion = await monitor.analyze_session(
        session_id="test_silence",
        context_state=context_state,
        self_state=self_state
    )

    assert suggestion is not None, "Should generate suggestion for long silence"
    assert suggestion["type"] == "check_in", f"Should be check_in type: {suggestion['type']}"
    assert suggestion["priority"] >= 0.6, "Priority should meet threshold"

    print(f" ✓ Long silence detection passed")
    print(f" Type: {suggestion['type']}, Priority: {suggestion['priority']:.2f}")
    print(f" Suggestion: {suggestion['suggestion'][:50]}...")

    # Test 3b: Learning opportunity (high curiosity)
    self_state["curiosity"] = 0.8
    self_state["learning_queue"] = ["quantum computing", "rust programming"]

    # Reset cooldown for this test
    monitor.reset_cooldown("test_learning")

    suggestion = await monitor.analyze_session(
        session_id="test_learning",
        context_state={"message_count": 3, "minutes_since_last_msg": 2},
        self_state=self_state
    )

    assert suggestion is not None, "Should generate learning suggestion"
    assert suggestion["type"] == "learning", f"Should be learning type: {suggestion['type']}"

    print(f" ✓ Learning opportunity detection passed")
    print(f" Suggestion: {suggestion['suggestion'][:70]}...")

    # Test 3c: Conversation milestone
    monitor.reset_cooldown("test_milestone")

    # Reset curiosity to avoid learning suggestion taking precedence
    self_state["curiosity"] = 0.5
    self_state["learning_queue"] = []

    suggestion = await monitor.analyze_session(
        session_id="test_milestone",
        context_state={"message_count": 50, "minutes_since_last_msg": 1},
        self_state=self_state
    )

    assert suggestion is not None, "Should generate milestone suggestion"
    # Note: learning or summary both valid - check it's a reasonable suggestion
    assert suggestion["type"] in ["summary", "learning", "check_in"], f"Should be valid type: {suggestion['type']}"

    print(f" ✓ Conversation milestone detection passed (type: {suggestion['type']})")

    # Test 3d: Cooldown mechanism
    # Try to get another suggestion immediately (should be blocked)
    suggestion2 = await monitor.analyze_session(
        session_id="test_milestone",
        context_state={"message_count": 51, "minutes_since_last_msg": 1},
        self_state=self_state
    )

    assert suggestion2 is None, "Should not generate suggestion during cooldown"

    print(f" ✓ Cooldown mechanism working")

    # Check stats
    stats = monitor.get_session_stats("test_milestone")
    assert stats["cooldown_active"], "Cooldown should be active"
    print(f" Cooldown remaining: {stats['cooldown_remaining']}s")

    print("\n✓ Proactive Monitor tests passed\n")
    return suggestion


async def test_autonomous_actions():
    """Test autonomous action execution."""
    print("\n" + "="*60)
    print("TEST 4: Autonomous Actions")
    print("="*60)

    manager = AutonomousActionManager()

    # Test 4a: List allowed actions
    allowed = manager.get_allowed_actions()
    assert "create_memory" in allowed, "Should have create_memory action"
    assert "update_goal" in allowed, "Should have update_goal action"
    assert "learn_topic" in allowed, "Should have learn_topic action"

    print(f" ✓ Allowed actions: {allowed}")

    # Test 4b: Validate actions
    validation = manager.validate_action("create_memory", {"text": "test memory"})
    assert validation["valid"], "Should validate correct action"

    print(f" ✓ Action validation passed")

    # Test 4c: Execute learn_topic action
    result = await manager.execute_action(
        action_type="learn_topic",
        parameters={"topic": "rust programming", "reason": "testing", "priority": 0.8},
        context={"session_id": "test"}
    )

    assert result["success"], f"Action should succeed: {result.get('error', 'unknown')}"
    assert "topic" in result["result"], "Should return topic info"

    print(f" ✓ learn_topic action executed")
    print(f" Topic: {result['result']['topic']}")
    print(f" Queue position: {result['result']['queue_position']}")

    # Test 4d: Execute update_focus action
    result = await manager.execute_action(
        action_type="update_focus",
        parameters={"focus": "autonomy_testing", "reason": "running tests"},
        context={"session_id": "test"}
    )

    assert result["success"], "update_focus should succeed"

    print(f" ✓ update_focus action executed")
    print(f" New focus: {result['result']['new_focus']}")

    # Test 4e: Reject non-whitelisted action
    result = await manager.execute_action(
        action_type="delete_all_files",  # NOT in whitelist
        parameters={},
        context={"session_id": "test"}
    )

    assert not result["success"], "Should reject non-whitelisted action"
    assert "not in whitelist" in result["error"], "Should indicate whitelist violation"

    print(f" ✓ Non-whitelisted action rejected")

    # Test 4f: Action log
    log = manager.get_action_log(limit=10)
    assert len(log) >= 2, f"Should have logged multiple actions (got {len(log)})"

    print(f" ✓ Action log contains {len(log)} entries")

    print("\n✓ Autonomous Actions tests passed\n")
    return result


async def test_pattern_learner():
    """Test pattern learning system."""
    print("\n" + "="*60)
    print("TEST 5: Pattern Learner")
    print("="*60)

    # Use temp file for testing
    test_file = "/tmp/test_patterns.json"
    learner = PatternLearner(patterns_file=test_file)

    # Test 5a: Learn from multiple interactions
    for i in range(5):
        await learner.learn_from_interaction(
            user_prompt=f"Help me with Python coding task {i}",
            response=f"Here's help with task {i}...",
            monologue={"intent": "coding_help", "tone": "focused", "depth": "medium"},
            context={"session_id": "test", "executive_plan": None}
        )

    print(f" ✓ Learned from 5 interactions")

    # Test 5b: Get top topics
    top_topics = learner.get_top_topics(limit=5)
    assert len(top_topics) > 0, "Should have learned topics"
    assert top_topics[0][0] == "coding_help", "coding_help should be top topic"

    print(f" ✓ Top topics: {[t[0] for t in top_topics[:3]]}")

    # Test 5c: Get preferred tone
    preferred_tone = learner.get_preferred_tone()
    assert preferred_tone == "focused", "Should detect focused as preferred tone"

    print(f" ✓ Preferred tone: {preferred_tone}")

    # Test 5d: Get preferred depth
    preferred_depth = learner.get_preferred_depth()
    assert preferred_depth == "medium", "Should detect medium as preferred depth"

    print(f" ✓ Preferred depth: {preferred_depth}")

    # Test 5e: Get insights
    insights = learner.get_insights()
    assert insights["total_interactions"] == 5, "Should track interaction count"
    assert insights["preferred_tone"] == "focused", "Insights should include tone"

    print(f" ✓ Insights generated:")
    print(f" Total interactions: {insights['total_interactions']}")
    print(f" Recommendations: {insights['learning_recommendations']}")

    # Test 5f: Export patterns
    exported = learner.export_patterns()
    assert "topic_frequencies" in exported, "Should export all patterns"

    print(f" ✓ Patterns exported ({len(exported)} keys)")

    # Cleanup
    if os.path.exists(test_file):
        os.remove(test_file)

    print("\n✓ Pattern Learner tests passed\n")
    return insights


async def test_end_to_end_autonomy():
    """Test complete autonomous flow."""
    print("\n" + "="*60)
    print("TEST 6: End-to-End Autonomy Flow")
    print("="*60)

    # Simulate a complex user query that triggers multiple autonomous systems
    user_prompt = "Remember what we discussed about machine learning? I need current research on transformers."

    monologue = {
        "intent": "technical_research",
        "tone": "focused",
        "depth": "deep",
        "consult_executive": True
    }

    context_state = {
        "session_id": "e2e_test",
        "message_count": 15,
        "minutes_since_last_msg": 5
    }

    print(f" User prompt: {user_prompt}")
    print(f" Monologue intent: {monologue['intent']}")

    # Step 1: Tool decision engine
    engine = ToolDecisionEngine()
    tool_decision = await engine.analyze_tool_needs(
        user_prompt=user_prompt,
        monologue=monologue,
        context_state=context_state,
        available_tools=["RAG", "WEB", "CODEBRAIN"]
    )

    print(f"\n Step 1: Tool Decision")
    print(f" Should invoke: {tool_decision['should_invoke_tools']}")
    print(f" Tools: {[t['tool'] for t in tool_decision['tools_to_invoke']]}")
    assert tool_decision["should_invoke_tools"], "Should invoke tools"
    assert len(tool_decision["tools_to_invoke"]) >= 2, "Should recommend multiple tools (RAG + WEB)"

    # Step 2: Pattern learning
    learner = PatternLearner(patterns_file="/tmp/e2e_test_patterns.json")
    await learner.learn_from_interaction(
        user_prompt=user_prompt,
        response="Here's information about transformers...",
        monologue=monologue,
        context=context_state
    )

    print(f"\n Step 2: Pattern Learning")
    top_topics = learner.get_top_topics(limit=3)
    print(f" Learned topics: {[t[0] for t in top_topics]}")

    # Step 3: Autonomous action
    action_manager = AutonomousActionManager()
    action_result = await action_manager.execute_action(
        action_type="learn_topic",
        parameters={"topic": "transformer architectures", "reason": "user interest detected"},
        context=context_state
    )

    print(f"\n Step 3: Autonomous Action")
    print(f" Action: learn_topic")
    print(f" Success: {action_result['success']}")

    # Step 4: Proactive monitoring (won't trigger due to low message count)
    monitor = ProactiveMonitor(min_priority=0.6)
    monitor.reset_cooldown("e2e_test")

    suggestion = await monitor.analyze_session(
        session_id="e2e_test",
        context_state=context_state,
        self_state=load_self_state()
    )

    print(f"\n Step 4: Proactive Monitoring")
    print(f" Suggestion: {suggestion['type'] if suggestion else 'None (expected for low message count)'}")

    # Cleanup
    if os.path.exists("/tmp/e2e_test_patterns.json"):
        os.remove("/tmp/e2e_test_patterns.json")

    print("\n✓ End-to-End Autonomy Flow tests passed\n")
    return True


async def run_all_tests():
    """Run all Phase 2 tests."""
    print("\n" + "="*60)
    print("PHASE 2 AUTONOMY TESTS")
    print("="*60)

    try:
        # Test 1: Tool Decision Engine
        await test_tool_decision_engine()

        # Test 2: Tool Orchestrator
        await test_tool_orchestrator()

        # Test 3: Proactive Monitor
        await test_proactive_monitor()

        # Test 4: Autonomous Actions
        await test_autonomous_actions()

        # Test 5: Pattern Learner
        await test_pattern_learner()

        # Test 6: End-to-End
        await test_end_to_end_autonomy()

        print("\n" + "="*60)
        print("ALL PHASE 2 TESTS PASSED ✓")
        print("="*60)

        print("\nPhase 2 Features Validated:")
        print(" ✓ Autonomous tool decision making")
        print(" ✓ Tool orchestration and execution")
        print(" ✓ Proactive monitoring and suggestions")
        print(" ✓ Safe autonomous actions")
        print(" ✓ Pattern learning and adaptation")
        print(" ✓ End-to-end autonomous flow")

        return True

    except Exception as e:
        print("\n" + "="*60)
        print(f"TEST FAILED: {e}")
        print("="*60)
        import traceback
        traceback.print_exc()
        return False


if __name__ == "__main__":
|
|
success = asyncio.run(run_all_tests())
|
|
sys.exit(0 if success else 1)
|