"""
性能基准测试

测试不同MCTS实现的性能对比
"""

import time
import torch
import pytest
from game import Game2048
from torch_mcts import TorchMCTS


class TestPerformanceBenchmark:
    """Throughput benchmarks for ``TorchMCTS`` on CPU, auto and CUDA devices.

    Each test runs a fixed number of simulations from a 3x3 ``Game2048``
    position and asserts either a minimum simulations-per-second figure or a
    bound on the ratio between two configurations.
    """

    @pytest.fixture
    def game(self):
        """Return a fresh 3x3 ``Game2048`` built with ``seed=42``."""
        return Game2048(height=3, width=3, seed=42)

    @staticmethod
    def _timed_search(mcts, game, simulations, sync_cuda=False):
        """Run ``mcts.search`` once and measure its throughput.

        Args:
            mcts: Searcher exposing ``search(game, simulations)``.
            game: Position handed to the search.
            simulations: Number of simulations requested.
            sync_cuda: When true, ``torch.cuda.synchronize()`` is called right
                before the clock starts and right before it stops, as the
                CUDA tests require.

        Returns:
            ``(action, stats, sims_per_second)`` where the first two items are
            whatever ``mcts.search`` returned.
        """
        if sync_cuda:
            torch.cuda.synchronize()
        # perf_counter() is monotonic and high resolution; time.time() can be
        # adjusted by the system and may be too coarse for short intervals.
        started = time.perf_counter()
        action, stats = mcts.search(game, simulations)
        if sync_cuda:
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - started
        return action, stats, simulations / elapsed

    def test_cpu_mcts_performance(self, game):
        """CPU search must exceed 500 simulations/second and pick a valid move."""
        mcts = TorchMCTS(
            c_param=1.414,
            max_simulation_depth=50,
            device="cpu"
        )

        action, _, speed = self._timed_search(mcts, game, 2000)

        # The CPU searcher should meet the baseline throughput requirement.
        assert speed > 500, f"CPU MCTS性能过低: {speed:.1f} 模拟/秒"
        assert action in game.get_valid_moves()

    def test_auto_device_mcts_performance(self, game):
        """Auto-selected device must exceed 100 simulations/second."""
        mcts = TorchMCTS(
            c_param=1.414,
            max_simulation_depth=50,
            device="auto"
        )

        try:
            action, _, speed = self._timed_search(mcts, game, 2000)

            # Whatever device was picked, throughput should be reasonable.
            assert speed > 100, f"自动设备MCTS性能过低: {speed:.1f} 模拟/秒"
            assert action in game.get_valid_moves()
        finally:
            # Run the cleanup even when an assertion above fails, so a failing
            # benchmark does not leave GPU memory held for later tests.
            if mcts.device.type == "cuda":
                del mcts
                torch.cuda.empty_cache()

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA不可用")
    def test_gpu_mcts_performance(self, game):
        """CUDA search must exceed 200 simulations/second and pick a valid move."""
        gpu_mcts = TorchMCTS(
            max_simulation_depth=50,
            batch_size=8192,
            device="cuda"
        )

        try:
            action, _, speed = self._timed_search(
                gpu_mcts, game, 5000, sync_cuda=True
            )

            assert speed > 200, f"GPU MCTS性能过低: {speed:.1f} 模拟/秒"
            assert action in game.get_valid_moves()
        finally:
            # Release the searcher and cached CUDA blocks on failure as well.
            del gpu_mcts
            torch.cuda.empty_cache()

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA不可用")
    def test_performance_comparison(self, game):
        """Compare CPU and CUDA throughput on copies of the same position."""
        simulations = 3000

        # CPU run
        cpu_mcts = TorchMCTS(c_param=1.414, max_simulation_depth=50, device="cpu")
        _, _, cpu_speed = self._timed_search(cpu_mcts, game.copy(), simulations)
        del cpu_mcts

        # GPU run
        gpu_mcts = TorchMCTS(max_simulation_depth=50, batch_size=8192, device="cuda")
        try:
            _, _, gpu_speed = self._timed_search(
                gpu_mcts, game.copy(), simulations, sync_cuda=True
            )

            speedup = gpu_speed / cpu_speed
            print("\n性能对比:")
            print(f"  CPU: {cpu_speed:.1f} 模拟/秒")
            print(f"  GPU: {gpu_speed:.1f} 模拟/秒")
            print(f"  加速比: {speedup:.1f}x")

            # Deliberately lenient: only fail when the GPU is more than 10x
            # slower than the CPU.
            assert speedup > 0.1, f"GPU性能严重低于CPU: {speedup:.2f}x"
        finally:
            del gpu_mcts
            torch.cuda.empty_cache()

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA不可用")
    def test_batch_size_scaling(self):
        """Throughput across batch sizes must stay within a 10x spread."""
        game = Game2048(height=3, width=3, seed=42)
        simulations = 2000

        results = {}
        for batch_size in (1024, 4096, 16384):
            gpu_mcts = TorchMCTS(
                max_simulation_depth=50,
                batch_size=batch_size,
                device="cuda"
            )
            try:
                _, _, results[batch_size] = self._timed_search(
                    gpu_mcts, game.copy(), simulations, sync_cuda=True
                )
            finally:
                # Free this configuration's memory before building the next.
                del gpu_mcts
                torch.cuda.empty_cache()

        # Report before asserting so the per-batch numbers are part of the
        # captured output when the assertion fails.
        print("\n批次大小性能测试:")
        for batch_size, speed in results.items():
            print(f"  {batch_size:,}: {speed:.1f} 模拟/秒")

        # Fastest and slowest configuration should differ by less than 10x.
        speed_ratio = max(results.values()) / min(results.values())
        assert speed_ratio < 10, f"批次大小性能差异过大: {speed_ratio:.2f}"


def test_memory_efficiency():
    """Check that one large-batch CUDA search uses less than 500 MB of GPU memory.

    Runs 10000 simulations with ``batch_size=32768`` and compares the peak
    allocated CUDA memory against the amount allocated before the searcher was
    created. Also prints simulations/second per MB, using the
    ``'sims_per_second'`` entry of the stats returned by ``search``.
    Skipped when CUDA is not available.
    """
    if not torch.cuda.is_available():
        pytest.skip("CUDA不可用")

    game = Game2048(height=3, width=3, seed=42)

    torch.cuda.empty_cache()
    # max_memory_allocated() reports the peak since the start of the process
    # (or the last reset). Reset it here so the figure reflects this test only
    # and not allocations made by earlier tests in the same session.
    torch.cuda.reset_peak_memory_stats()
    initial_memory = torch.cuda.memory_allocated()

    gpu_mcts = TorchMCTS(
        max_simulation_depth=50,
        batch_size=32768,
        device="cuda"
    )

    try:
        # Run the search
        _, stats = gpu_mcts.search(game, 10000)

        peak_memory = torch.cuda.max_memory_allocated()
        memory_used = (peak_memory - initial_memory) / 1e6  # MB

        # Memory usage should stay within a reasonable bound.
        assert memory_used < 500, f"GPU内存使用过多: {memory_used:.1f} MB"

        # Throughput per MB; guarded so a zero reading cannot divide by zero.
        speed = stats['sims_per_second']
        memory_efficiency = speed / memory_used if memory_used > 0 else 0

        print("\n内存效率测试:")
        print(f"  内存使用: {memory_used:.1f} MB")
        print(f"  模拟速度: {speed:.1f} 模拟/秒")
        print(f"  内存效率: {memory_efficiency:.1f} 模拟/秒/MB")
    finally:
        # Release the searcher and cached CUDA blocks even if the test fails.
        del gpu_mcts
        torch.cuda.empty_cache()


if __name__ == "__main__":
    # pytest.main() returns the session's exit code; hand it to the interpreter
    # so running this file as a script exits non-zero when a test fails.
    raise SystemExit(pytest.main([__file__, "-v"]))