# Source: hello-agents/code/chapter12/09_data_generation_win_rate.py

"""
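Chapter 12, Example 9: Win Rate evaluation.

Corresponds to documentation section 12.4.4, "Win Rate evaluation".

This example shows how to use Win Rate evaluation to assess the quality of
generated AIME problems. Win Rate evaluation judges generation quality by
comparing generated problems against real AIME problems:
- Win Rate = 50%: generated quality is on par with the real problems (the ideal case)
- Win Rate > 50%: generated quality beats the real problems (possibly evaluation bias)
- Win Rate < 50%: generated quality is below the real problems (needs improvement)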
"""
import sys
import os
import json
# Add the sibling HelloAgents directory to the import path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "HelloAgents"))
from hello_agents import HelloAgentsLLM
from hello_agents.evaluation import WinRateEvaluator, AIDataset
# 1. Prepare the generated problems to be evaluated.
# Each record holds an id, the problem statement (with inline LaTeX math) and
# the expected answer as a string.
generated_problems = [
    {
        "problem_id": "generated_001",
        "problem": "Find the number of positive integers $n$ such that $n^2 + 19n + 92$ is a perfect square.",
        # For every n >= 1: (n + 9)^2 < n^2 + 19n + 92 < (n + 10)^2, so the
        # expression is never a perfect square and the count is 0.
        "answer": "0",
    },
    {
        "problem_id": "generated_002",
        "problem": "In triangle $ABC$, $AB = 13$, $BC = 14$, and $CA = 15$. Find the area of the triangle.",
        # Heron's formula: s = 21, area = sqrt(21 * 8 * 7 * 6) = 84.
        "answer": "84",
    },
    {
        "problem_id": "generated_003",
        "problem": "How many positive integers less than 1000 are divisible by 7 but not by 11?",
        # floor(999 / 7) - floor(999 / 77) = 142 - 12 = 130.
        "answer": "130",
    },
]
# 2. Load the reference dataset (the real AIME problems to compare against).
banner = "=" * 60
print(banner)
print("Win Rate评估")
print(banner)
print("\n加载参考数据集...")
dataset = AIDataset()
reference_problems = dataset.load()
loaded_count = len(reference_problems)
print(f"✅ 已加载 {loaded_count} 道AIME真题")
# 3. Build the Win Rate evaluator from an LLM client and the reference problems.
llm = HelloAgentsLLM(model_name="gpt-4o")
evaluator_kwargs = {
    "llm": llm,
    "reference_problems": reference_problems,
}
evaluator = WinRateEvaluator(**evaluator_kwargs)
# 4. Run the Win Rate evaluation.
# The comparison count lives in one variable so the progress message and the
# evaluator call cannot drift apart.
num_comparisons = 20
print("\n开始Win Rate评估...")
print(f" 生成题目数: {len(generated_problems)}")
print(f" 对比数量: {num_comparisons}")
results = evaluator.evaluate(
    generated_problems=generated_problems,
    num_comparisons=num_comparisons,
)
# 5. Show the evaluation results: the three rates as percentages, then the
# raw counts behind them.
separator = "=" * 60
print(f"\n{separator}")
print("评估结果")
print(separator)

rate_rows = (
    ("\nWin Rate", "win_rate"),
    ("Tie Rate", "tie_rate"),
    ("Loss Rate", "loss_rate"),
)
for label, key in rate_rows:
    print(f"{label}: {results[key]:.2%}")

print("\n详细统计:")
count_rows = (
    (" 总对比数", "total_comparisons"),
    (" 生成题目胜", "wins"),
    (" 平局", "ties"),
    (" 真题胜", "losses"),
)
for label, key in count_rows:
    print(f"{label}: {results[key]}")
# 6. Quality assessment: map the win rate onto a verdict.
# A rate close to 50% means the generated problems hold up against the
# reference problems. A rate clearly above 50% gets its own verdict instead of
# falling through to "poor": it means the generated problems were preferred,
# which may point to evaluation bias rather than poor quality.
print("\n质量评估:")
win_rate = results['win_rate']
if win_rate > 0.55:
    print("⚠️ 偏高 - 生成题目胜率高于真题,可能存在评估偏差")
elif win_rate >= 0.45:
    print("✅ 优秀 - 生成质量接近AIME真题水平")
elif win_rate >= 0.35:
    print("⚠️ 良好 - 生成质量可用,但略低于真题")
elif win_rate >= 0.25:
    print("⚠️ 一般 - 生成质量一般,需要改进")
else:
    print("❌ 较差 - 生成质量差,需要大幅改进")
# 7. Show the first five comparison records, truncating long text fields.
rule = "=" * 60
print("\n" + rule)
print("对比详情前5个")
print(rule)
preview = results['comparisons'][:5]
for index, entry in enumerate(preview, start=1):
    print(f"\n对比 {index}:")
    generated_snippet = entry['generated_problem'][:60]
    print(f" 生成题目: {generated_snippet}...")
    reference_snippet = entry['reference_problem'][:60]
    print(f" 真题: {reference_snippet}...")
    print(f" 结果: {entry['result']}")
    # The reason line is only printed when the record carries a 'reason' key.
    if 'reason' not in entry:
        continue
    print(f" 理由: {entry['reason'][:100]}...")
# 8. Persist the evaluation results as UTF-8 JSON, creating the output
# directory first if it does not exist yet.
output_file = "./evaluation_results/win_rate_results.json"
output_dir = os.path.dirname(output_file)
os.makedirs(output_dir, exist_ok=True)
with open(output_file, mode='w', encoding='utf-8') as results_file:
    json.dump(results, results_file, ensure_ascii=False, indent=2)
print(f"\n✅ 评估结果已保存到 {output_file}")
# Example output from a run:
# ============================================================
# Win Rate评估
# ============================================================
#
# 加载参考数据集...
# ✅ 已加载 963 道AIME真题
#
# 开始Win Rate评估...
# 生成题目数: 3
# 对比数量: 20
#
# Win Rate评估: 100%|██████████| 20/20 [01:00<00:00, 3.01s/对比]
#
# ============================================================
# 评估结果
# ============================================================
#
# Win Rate: 45.00%
# Tie Rate: 10.00%
# Loss Rate: 45.00%
#
# 详细统计:
# 总对比数: 20
# 生成题目胜: 9
# 平局: 2
# 真题胜: 9
#
# 质量评估:
# ✅ 优秀 - 生成质量接近AIME真题水平
#
# ============================================================
# 对比详情前5个
# ============================================================
#
# 对比 1:
# 生成题目: Find the number of positive integers $n$ such that $n^2 + 19...
# 真题: Let $N$ be the number of consecutive $0$'s at the right end...
# 结果: generated
# 理由: The generated problem has a clearer problem statement and a mo...
#
# 对比 2:
# 生成题目: In triangle $ABC$, $AB = 13$, $BC = 14$, and $CA = 15$. F...
# 真题: Find the number of ordered pairs $(m,n)$ of positive integers...
# 结果: reference
# 理由: The reference problem is more challenging and requires deeper...
#
# ...
#
# ✅ 评估结果已保存到 ./evaluation_results/win_rate_results.json