"""
Deep2048 快速基准测试工具

自动测试不同配置的性能，找出最优的线程数和参数设置
"""

import time
import torch
import multiprocessing as mp
from typing import Dict, List, Tuple, Optional
import json
from pathlib import Path
import argparse

from game import Game2048
from mcts import PureMCTS
from training_data import TrainingDataManager


class QuickBenchmark:
    """快速基准测试工具"""
    
    def __init__(self, output_dir: str = "results/benchmark"):
        """
        初始化基准测试
        
        Args:
            output_dir: 结果输出目录
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        
        # 系统信息
        self.cpu_count = mp.cpu_count()
        self.cuda_available = torch.cuda.is_available()
        
        print(f"系统信息:")
        print(f"  CPU核心数: {self.cpu_count}")
        print(f"  CUDA可用: {self.cuda_available}")
        if self.cuda_available:
            print(f"  CUDA设备: {torch.cuda.get_device_name()}")
    
    def test_thread_performance(self, simulations: int = 200) -> Dict[int, Dict]:
        """
        测试不同线程数的性能
        
        Args:
            simulations: 每次测试的模拟次数
            
        Returns:
            线程数 -> 性能指标的字典
        """
        print(f"\n=== 线程性能测试 ({simulations} 模拟) ===")
        
        # 测试的线程数配置
        thread_configs = [1, 2, 4]
        if self.cpu_count >= 8:
            thread_configs.append(8)
        if self.cpu_count >= 16:
            thread_configs.append(16)
        
        results = {}
        
        for num_threads in thread_configs:
            print(f"\n测试 {num_threads} 线程...")
            
            # 创建MCTS
            mcts = PureMCTS(
                c_param=1.414,
                max_simulation_depth=80,
                num_threads=num_threads
            )
            
            # 运行多次测试取平均值
            times = []
            scores = []
            
            for run in range(3):  # 3次运行
                game = Game2048(height=3, width=3, seed=42 + run)
                
                start_time = time.time()
                best_action, root = mcts.search(game, simulations)
                elapsed_time = time.time() - start_time
                
                times.append(elapsed_time)
                if root:
                    # 计算平均子节点价值作为质量指标
                    avg_value = sum(child.average_value for child in root.children.values()) / len(root.children) if root.children else 0
                    scores.append(avg_value)
                else:
                    scores.append(0)
            
            # 计算统计指标
            avg_time = sum(times) / len(times)
            avg_score = sum(scores) / len(scores)
            sims_per_sec = simulations / avg_time
            
            # 计算效率（每核心每秒模拟数）
            efficiency = sims_per_sec / num_threads
            
            # 计算相对于单线程的加速比
            if num_threads == 1:
                baseline_speed = sims_per_sec
                speedup = 1.0
            else:
                speedup = sims_per_sec / baseline_speed if 'baseline_speed' in locals() else 1.0
            
            results[num_threads] = {
                'avg_time': avg_time,
                'sims_per_sec': sims_per_sec,
                'efficiency': efficiency,
                'speedup': speedup,
                'avg_score': avg_score,
                'times': times
            }
            
            print(f"  平均时间: {avg_time:.3f}秒")
            print(f"  模拟速度: {sims_per_sec:.1f} 次/秒")
            print(f"  效率: {efficiency:.1f} 模拟/秒/核心")
            print(f"  加速比: {speedup:.2f}x")
        
        return results
    
    def test_simulation_depth(self, num_threads: int = None) -> Dict[int, Dict]:
        """
        测试不同模拟深度的影响
        
        Args:
            num_threads: 线程数，None表示使用最优线程数
            
        Returns:
            深度 -> 性能指标的字典
        """
        if num_threads is None:
            num_threads = min(4, self.cpu_count)
        
        print(f"\n=== 模拟深度测试 ({num_threads} 线程) ===")
        
        depths = [50, 80, 120, 200]
        results = {}
        
        for depth in depths:
            print(f"\n测试深度 {depth}...")
            
            mcts = PureMCTS(
                c_param=1.414,
                max_simulation_depth=depth,
                num_threads=num_threads
            )
            
            game = Game2048(height=3, width=3, seed=42)
            
            start_time = time.time()
            best_action, root = mcts.search(game, 150)  # 固定模拟次数
            elapsed_time = time.time() - start_time
            
            sims_per_sec = 150 / elapsed_time
            avg_value = sum(child.average_value for child in root.children.values()) / len(root.children) if root and root.children else 0
            
            results[depth] = {
                'time': elapsed_time,
                'sims_per_sec': sims_per_sec,
                'avg_value': avg_value
            }
            
            print(f"  时间: {elapsed_time:.3f}秒")
            print(f"  速度: {sims_per_sec:.1f} 次/秒")
            print(f"  平均价值: {avg_value:.1f}")
        
        return results
    
    def test_board_sizes(self, num_threads: int = None) -> Dict[str, Dict]:
        """
        测试不同棋盘大小的性能
        
        Args:
            num_threads: 线程数
            
        Returns:
            棋盘大小 -> 性能指标的字典
        """
        if num_threads is None:
            num_threads = min(4, self.cpu_count)
        
        print(f"\n=== 棋盘大小测试 ({num_threads} 线程) ===")
        
        board_sizes = [(3, 3), (4, 4), (3, 4), (4, 3)]
        results = {}
        
        for height, width in board_sizes:
            size_key = f"{height}x{width}"
            print(f"\n测试 {size_key} 棋盘...")
            
            mcts = PureMCTS(
                c_param=1.414,
                max_simulation_depth=80,
                num_threads=num_threads
            )
            
            game = Game2048(height=height, width=width, seed=42)
            
            start_time = time.time()
            best_action, root = mcts.search(game, 100)
            elapsed_time = time.time() - start_time
            
            sims_per_sec = 100 / elapsed_time
            valid_moves = len(game.get_valid_moves())
            
            results[size_key] = {
                'time': elapsed_time,
                'sims_per_sec': sims_per_sec,
                'valid_moves': valid_moves,
                'board_cells': height * width
            }
            
            print(f"  时间: {elapsed_time:.3f}秒")
            print(f"  速度: {sims_per_sec:.1f} 次/秒")
            print(f"  有效动作: {valid_moves}")
        
        return results
    
    def find_optimal_config(self) -> Dict:
        """
        找到最优配置
        
        Returns:
            最优配置字典
        """
        print("\n=== 寻找最优配置 ===")
        
        # 测试线程性能
        thread_results = self.test_thread_performance(200)
        
        # 找到最优线程数（基于效率和绝对速度的平衡）
        best_thread_score = 0
        best_threads = 1
        
        for threads, result in thread_results.items():
            # 综合评分：速度 * 0.7 + 效率 * 0.3
            score = result['sims_per_sec'] * 0.7 + result['efficiency'] * 0.3
            if score > best_thread_score:
                best_thread_score = score
                best_threads = threads
        
        print(f"\n最优线程数: {best_threads}")
        print(f"  速度: {thread_results[best_threads]['sims_per_sec']:.1f} 模拟/秒")
        print(f"  效率: {thread_results[best_threads]['efficiency']:.1f} 模拟/秒/核心")
        print(f"  加速比: {thread_results[best_threads]['speedup']:.2f}x")
        
        # 测试其他参数
        depth_results = self.test_simulation_depth(best_threads)
        board_results = self.test_board_sizes(best_threads)
        
        # 推荐配置
        optimal_config = {
            'recommended_threads': best_threads,
            'recommended_depth': 80,  # 平衡性能和质量
            'recommended_board_size': (3, 3),  # L0阶段推荐
            'performance_summary': {
                'best_speed': thread_results[best_threads]['sims_per_sec'],
                'best_efficiency': thread_results[best_threads]['efficiency'],
                'speedup': thread_results[best_threads]['speedup']
            },
            'system_info': {
                'cpu_cores': self.cpu_count,
                'cuda_available': self.cuda_available
            }
        }
        
        return optimal_config
    
    def run_full_benchmark(self) -> Dict:
        """运行完整基准测试"""
        print("Deep2048 快速基准测试")
        print("=" * 50)
        
        start_time = time.time()
        
        # 运行所有测试
        results = {
            'timestamp': time.time(),
            'system_info': {
                'cpu_cores': self.cpu_count,
                'cuda_available': self.cuda_available
            },
            'thread_performance': self.test_thread_performance(200),
            'optimal_config': self.find_optimal_config()
        }
        
        total_time = time.time() - start_time
        results['benchmark_time'] = total_time
        
        # 保存结果
        result_file = self.output_dir / f"benchmark_results_{int(time.time())}.json"
        with open(result_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        
        print(f"\n基准测试完成! 用时: {total_time:.1f}秒")
        print(f"结果已保存到: {result_file}")
        
        return results
    
    def print_recommendations(self, results: Dict):
        """打印配置推荐"""
        config = results['optimal_config']
        
        print("\n" + "=" * 50)
        print("🚀 性能优化推荐")
        print("=" * 50)
        
        print(f"推荐线程数: {config['recommended_threads']}")
        print(f"推荐模拟深度: {config['recommended_depth']}")
        print(f"推荐棋盘大小: {config['recommended_board_size']}")
        
        print(f"\n预期性能:")
        print(f"  模拟速度: {config['performance_summary']['best_speed']:.1f} 次/秒")
        print(f"  CPU效率: {config['performance_summary']['best_efficiency']:.1f} 模拟/秒/核心")
        print(f"  多线程加速: {config['performance_summary']['speedup']:.2f}x")
        
        print(f"\n配置示例:")
        print(f"```python")
        print(f"mcts = PureMCTS(")
        print(f"    c_param=1.414,")
        print(f"    max_simulation_depth={config['recommended_depth']},")
        print(f"    num_threads={config['recommended_threads']}")
        print(f")")
        print(f"```")


def main():
    """Command-line entry point.

    Parses ``--output/-o`` (result directory) and ``--quick``. With
    ``--quick`` only the thread benchmark runs (100 simulations) and the
    fastest thread count is printed; otherwise the full benchmark runs and
    its recommendations are printed.
    """
    cli = argparse.ArgumentParser(description="Deep2048快速基准测试")
    cli.add_argument("--output", "-o", default="results/benchmark", help="输出目录")
    cli.add_argument("--quick", action="store_true", help="快速测试模式")
    options = cli.parse_args()

    bench = QuickBenchmark(options.output)

    if not options.quick:
        # Full benchmark: run everything, then print the recommendation.
        full_results = bench.run_full_benchmark()
        bench.print_recommendations(full_results)
        return

    # Quick mode: thread benchmark only, ranked by raw throughput.
    print("快速测试模式")
    per_thread = bench.test_thread_performance(100)

    fastest = max(per_thread, key=lambda count: per_thread[count]['sims_per_sec'])
    print(f"\n快速推荐: 使用 {fastest} 线程")
    print(f"预期速度: {per_thread[fastest]['sims_per_sec']:.1f} 模拟/秒")


# Run the benchmark CLI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()