"""Chapter 12, Example 9: Win Rate evaluation.

Corresponds to documentation section 12.4.4, "Win Rate evaluation".

This example shows how to use Win Rate evaluation to judge the quality of
generated AIME problems by comparing them pairwise against real AIME problems:

- Win Rate = 50%: generated quality is on par with the real problems (ideal)
- Win Rate > 50%: generated problems are preferred over the real ones
  (possibly evaluation bias)
- Win Rate < 50%: generated quality is below the real problems (needs work)
"""

import json
import os
import sys

# Number of pairwise comparisons; used both in the progress message and in the
# evaluator call so the two can never disagree.
NUM_COMPARISONS = 20

# Where the raw evaluation results are written as JSON.
OUTPUT_FILE = "./evaluation_results/win_rate_results.json"

# 1. Generated problems to evaluate.
generated_problems = [
    {
        "problem_id": "generated_001",
        "problem": "Find the number of positive integers $n$ such that $n^2 + 19n + 92$ is a perfect square.",
        # For n >= 1, (n+9)^2 < n^2 + 19n + 92 < (n+10)^2, so the value lies
        # strictly between consecutive squares: there are no solutions.
        "answer": "0",
    },
    {
        "problem_id": "generated_002",
        "problem": "In triangle $ABC$, $AB = 13$, $BC = 14$, and $CA = 15$. Find the area of the triangle.",
        "answer": "84",
    },
    {
        "problem_id": "generated_003",
        "problem": "How many positive integers less than 1000 are divisible by 7 but not by 11?",
        # floor(999/7) - floor(999/77) = 142 - 12 = 130.
        "answer": "130",
    },
]


def assess_quality(win_rate: float) -> str:
    """Return the one-line quality verdict printed for ``win_rate``.

    Args:
        win_rate: Fraction of comparisons won by the generated problems,
            as a number in [0, 1] (it is compared against 0.25-0.55 bands).

    Returns:
        The verdict message for the band ``win_rate`` falls into.
    """
    # A win rate clearly above 50% is not "poor": per the module docstring it
    # means the judge prefers generated problems, which may indicate bias.
    if win_rate > 0.55:
        return "⚠️ 偏高 - 生成题目胜率高于真题,可能存在评估偏差"
    if win_rate >= 0.45:
        return "✅ 优秀 - 生成质量接近AIME真题水平"
    if win_rate >= 0.35:
        return "⚠️ 良好 - 生成质量可用,但略低于真题"
    if win_rate >= 0.25:
        return "⚠️ 一般 - 生成质量一般,需要改进"
    return "❌ 较差 - 生成质量差,需要大幅改进"


def print_banner(title: str, leading_blank: bool = True) -> None:
    """Print ``title`` framed by two 60-character ``=`` rules."""
    rule = "=" * 60
    print("\n" + rule if leading_blank else rule)
    print(title)
    print(rule)


def print_summary(results: dict) -> None:
    """Print the rates, the raw counts and the quality verdict.

    Reads the keys ``win_rate``, ``tie_rate``, ``loss_rate``,
    ``total_comparisons``, ``wins``, ``ties`` and ``losses`` from ``results``.
    """
    print_banner("评估结果")
    print(f"\nWin Rate: {results['win_rate']:.2%}")
    print(f"Tie Rate: {results['tie_rate']:.2%}")
    print(f"Loss Rate: {results['loss_rate']:.2%}")

    print("\n详细统计:")
    print(f" 总对比数: {results['total_comparisons']}")
    print(f" 生成题目胜: {results['wins']}")
    print(f" 平局: {results['ties']}")
    print(f" 真题胜: {results['losses']}")

    print("\n质量评估:")
    print(assess_quality(results['win_rate']))


def print_comparisons(results: dict, limit: int = 5) -> None:
    """Print the first ``limit`` entries of ``results['comparisons']``.

    Each entry is read via the keys ``generated_problem``,
    ``reference_problem``, ``result`` and, when present, ``reason``.
    Problem texts are truncated to 60 characters and reasons to 100.
    """
    print_banner(f"对比详情(前{limit}个)")
    for index, comparison in enumerate(results['comparisons'][:limit], 1):
        print(f"\n对比 {index}:")
        print(f" 生成题目: {comparison['generated_problem'][:60]}...")
        print(f" 真题: {comparison['reference_problem'][:60]}...")
        print(f" 结果: {comparison['result']}")
        if 'reason' in comparison:
            print(f" 理由: {comparison['reason'][:100]}...")


def save_results(results: dict, output_file: str) -> None:
    """Write ``results`` to ``output_file`` as indented UTF-8 JSON.

    The parent directory is created if needed.
    """
    directory = os.path.dirname(output_file)
    if directory:  # os.makedirs("") raises, so skip for bare file names
        os.makedirs(directory, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n✅ 评估结果已保存到 {output_file}")


def main() -> None:
    """Run the Win Rate evaluation end to end and report the results."""
    # Make the sibling HelloAgents checkout importable. This and the import
    # are done here rather than at module level so that importing this file
    # neither mutates sys.path nor requires HelloAgents to be installed.
    sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "HelloAgents"))
    from hello_agents import HelloAgentsLLM
    from hello_agents.evaluation import WinRateEvaluator, AIDataset

    # 2. Load the reference dataset (real AIME problems).
    print_banner("Win Rate评估", leading_blank=False)
    print("\n加载参考数据集...")
    dataset = AIDataset()
    reference_problems = dataset.load()
    print(f"✅ 已加载 {len(reference_problems)} 道AIME真题")

    # 3. Create the Win Rate evaluator.
    llm = HelloAgentsLLM(model_name="gpt-4o")
    evaluator = WinRateEvaluator(
        llm=llm,
        reference_problems=reference_problems,
    )

    # 4. Run the Win Rate evaluation.
    print("\n开始Win Rate评估...")
    print(f" 生成题目数: {len(generated_problems)}")
    print(f" 对比数量: {NUM_COMPARISONS}")
    results = evaluator.evaluate(
        generated_problems=generated_problems,
        num_comparisons=NUM_COMPARISONS,
    )

    # 5-6. Show the results and the quality verdict.
    print_summary(results)

    # 7. Show a few individual comparisons.
    print_comparisons(results, limit=5)

    # 8. Persist the evaluation results.
    save_results(results, OUTPUT_FILE)


if __name__ == "__main__":
    main()

# Sample output (verbatim program output, hence not translated):
# ============================================================
# Win Rate评估
# ============================================================
#
# 加载参考数据集...
# ✅ 已加载 963 道AIME真题
#
# 开始Win Rate评估...
#  生成题目数: 3
#  对比数量: 20
#
# Win Rate评估: 100%|██████████| 20/20 [01:00<00:00, 3.01s/对比]
#
# ============================================================
# 评估结果
# ============================================================
#
# Win Rate: 45.00%
# Tie Rate: 10.00%
# Loss Rate: 45.00%
#
# 详细统计:
#  总对比数: 20
#  生成题目胜: 9
#  平局: 2
#  真题胜: 9
#
# 质量评估:
# ✅ 优秀 - 生成质量接近AIME真题水平
#
# ============================================================
# 对比详情(前5个)
# ============================================================
#
# 对比 1:
#  生成题目: Find the number of positive integers $n$ such that $n^2 + 19...
#  真题: Let $N$ be the number of consecutive $0$'s at the right end...
#  结果: generated
#  理由: The generated problem has a clearer problem statement and a mo...
#
# 对比 2:
#  生成题目: In triangle $ABC$, $AB = 13$, $BC = 14$, and $CA = 15$. F...
#  真题: Find the number of ordered pairs $(m,n)$ of positive integers...
#  结果: reference
#  理由: The reference problem is more challenging and requires deeper...
#
# ...
#
# ✅ 评估结果已保存到 ./evaluation_results/win_rate_results.json