You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

315 lines
11 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

"""
完整评估流程
运行完整的数据生成和评估流程:
1. 生成AIME题目
2. LLM Judge评估
3. Win Rate评估
4. 生成综合报告
运行方法:
python data_generation/run_complete_evaluation.py 30 3.0
参数:
- 30: 生成题目数量
- 3.0: 每次生成之间的延迟(秒)
说明:
- 使用AIME 2025年真题作为参考
- 数据集来源: math-ai/aime25 (JSONL格式)
"""
import json
import os
import sys
import traceback
from collections import Counter
from datetime import datetime
from typing import Optional

from hello_agents import SimpleAgent, HelloAgentsLLM
from hello_agents.tools import LLMJudgeTool, WinRateTool

from aime_generator import AIMEGenerator
def run_complete_evaluation(
    num_problems: int = 30,
    delay_seconds: float = 3.0
):
    """Run the whole pipeline: generate problems, evaluate them, write a report.

    Steps, in order:

    1. Generate problems with ``AIMEGenerator.generate_and_save`` into
       ``data_generation/generated_data``.
    2. Evaluate the generated file with ``LLMJudgeTool`` and ``WinRateTool``
       (both called with ``reference_year=2025`` and ``judge_model="gpt-4o"``),
       writing into a timestamped directory under
       ``data_generation/evaluation_results``.
    3. If at least one evaluation produced a result, write
       ``comprehensive_report.md`` into that directory.

    Steps 2 and 3 are best-effort: a failure is printed together with its
    traceback and the corresponding entry of the returned dict is ``None``.
    A failure in step 1 propagates to the caller.

    Args:
        num_problems: Number of problems to generate. Also used as
            ``max_samples`` for the LLM judge and, capped at 20, as
            ``num_comparisons`` for the win-rate evaluation.
        delay_seconds: Delay between generations, forwarded to
            ``AIMEGenerator`` (intended to avoid API rate limits).

    Returns:
        A dict with the keys ``generated_data_path``, ``llm_judge_result``,
        ``win_rate_result`` and ``comprehensive_report_path``.
    """
    banner = "=" * 80

    print("\n" + banner)
    print("🚀 AIME数据生成与评估完整流程")
    print(banner)
    print("\n配置信息:")
    print(f" - 生成题目数量: {num_problems}")
    print(f" - API延迟: {delay_seconds}秒/题")
    print(" - 生成参考数据: TianHongZXY/aime-1983-2025 (900+道题)")
    print(" - 评估参考: AIME 2025真题")

    # ========== Step 1: generate AIME problems ==========
    print("\n" + banner)
    print("📝 步骤1: 生成AIME题目")
    print(banner)

    generator = AIMEGenerator(delay_seconds=delay_seconds)
    generated_data_path = generator.generate_and_save(
        num_problems=num_problems,
        output_dir="data_generation/generated_data"
    )
    print(f"\n✅ 步骤1完成生成数据保存在: {generated_data_path}")

    # ========== Step 2: evaluation ==========
    # Every run gets its own timestamped results directory.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    evaluation_dir = f"data_generation/evaluation_results/{timestamp}"
    llm_judge_dir = os.path.join(evaluation_dir, "llm_judge")
    win_rate_dir = os.path.join(evaluation_dir, "win_rate")
    os.makedirs(evaluation_dir, exist_ok=True)
    os.makedirs(llm_judge_dir, exist_ok=True)
    os.makedirs(win_rate_dir, exist_ok=True)

    # A single LLM instance is shared by both evaluation tools.
    llm = HelloAgentsLLM()

    # ========== Step 2.1: LLM Judge ==========
    print("\n🎯 步骤2.1: LLM Judge评估 (vs AIME 2025)")
    llm_judge_result = None
    try:
        llm_judge_tool = LLMJudgeTool(llm=llm)
        parsed_judge = json.loads(llm_judge_tool.run({
            "generated_data_path": generated_data_path,
            "reference_year": 2025,
            "max_samples": num_problems,
            "output_dir": llm_judge_dir,
            "judge_model": "gpt-4o"
        }))
        # Format the summary inside the try block: a result that lacks the
        # expected metrics is treated as a failed evaluation instead of being
        # kept and breaking the report step later.
        judge_metrics = parsed_judge['metrics']
        judge_summary = [
            f" 平均总分: {judge_metrics['average_total_score']:.2f}/5.0",
            f" 通过率: {judge_metrics['pass_rate']:.2%}",
        ]
    except Exception as e:
        print(f"\n❌ LLM Judge评估失败: {e}")
        traceback.print_exc()
    else:
        llm_judge_result = parsed_judge
        print("\n✅ LLM Judge评估完成")
        print("\n".join(judge_summary))

    # ========== Step 2.2: Win Rate ==========
    print("\n🏆 步骤2.2: Win Rate评估 (vs AIME 2025)")
    win_rate_result = None
    try:
        win_rate_tool = WinRateTool(llm=llm)
        parsed_win_rate = json.loads(win_rate_tool.run({
            "generated_data_path": generated_data_path,
            "reference_year": 2025,
            "num_comparisons": min(num_problems, 20),  # at most 20 comparisons
            "output_dir": win_rate_dir,
            "judge_model": "gpt-4o"
        }))
        win_rate_summary = f" Win Rate: {parsed_win_rate['metrics']['win_rate']:.2%}"
    except Exception as e:
        print(f"\n❌ Win Rate评估失败: {e}")
        traceback.print_exc()
    else:
        win_rate_result = parsed_win_rate
        print("\n✅ Win Rate评估完成")
        print(win_rate_summary)

    # ========== Step 3: comprehensive report ==========
    comprehensive_report_path = None
    if llm_judge_result or win_rate_result:
        print("\n" + banner)
        print("📊 步骤3: 生成综合报告")
        print(banner)

        report_path = os.path.join(evaluation_dir, "comprehensive_report.md")
        # Guarded like the evaluation steps so that a report failure does not
        # discard the paths and results gathered so far.
        try:
            report = generate_comprehensive_report(
                generated_data_path,
                llm_judge_result,
                win_rate_result
            )
            with open(report_path, 'w', encoding='utf-8') as f:
                f.write(report)
        except Exception as e:
            print(f"\n❌ 综合报告生成失败: {e}")
            traceback.print_exc()
        else:
            comprehensive_report_path = report_path
            print(f"\n✅ 综合报告已保存: {comprehensive_report_path}")

    # ========== Done ==========
    print("\n" + banner)
    print("🎉 完整评估流程完成!")
    print(banner)

    print("\n📁 输出文件:")
    print(f" - 生成数据: {generated_data_path}")
    print(f" - 评估结果目录: {evaluation_dir}")
    if llm_judge_result:
        print(f" - LLM Judge报告: {llm_judge_result.get('report_file', 'N/A')}")
    if win_rate_result:
        print(f" - Win Rate报告: {win_rate_result.get('report_file', 'N/A')}")
    if comprehensive_report_path:
        print(f" - 综合报告: {comprehensive_report_path}")

    print("\n💡 下一步:")
    if comprehensive_report_path:
        print(f" 1. 查看综合报告: {comprehensive_report_path}")
    print(f" 2. 运行人工验证: python data_generation/human_verification_ui.py {generated_data_path}")

    return {
        "generated_data_path": generated_data_path,
        "llm_judge_result": llm_judge_result,
        "win_rate_result": win_rate_result,
        "comprehensive_report_path": comprehensive_report_path
    }
def _topic_table_rows(generated_data) -> list:
    """Return the Markdown topic-distribution table, most frequent topic first."""
    total = len(generated_data)
    topic_counts = Counter(item.get('topic', 'Unknown') for item in generated_data)
    rows = [
        "| 主题 | 数量 | 占比 |",
        "|------|------|------|",
    ]
    # most_common() sorts by count, descending; equal counts keep the order in
    # which the topics were first seen.
    rows.extend(
        f"| {topic} | {count} | {count / total * 100:.1f}% |"
        for topic, count in topic_counts.most_common()
    )
    return rows


def _llm_judge_blocks(metrics: dict) -> list:
    """Return the Markdown blocks of the LLM Judge section (section 3)."""
    dimensions = metrics['dimension_averages']
    return [
        ["## 3. LLM Judge评估结果"],
        ["**总体评分**:"],
        [
            f"- 平均总分: {metrics['average_total_score']:.2f}/5.0",
            f"- 通过率: {metrics['pass_rate']:.2%}",
            f"- 优秀率: {metrics['excellent_rate']:.2%}",
        ],
        ["**各维度评分**:"],
        [
            "| 维度 | 平均分 |",
            "|------|--------|",
            f"| 正确性 | {dimensions['correctness']:.2f}/5.0 |",
            f"| 清晰度 | {dimensions['clarity']:.2f}/5.0 |",
            f"| 难度匹配 | {dimensions['difficulty_match']:.2f}/5.0 |",
            f"| 完整性 | {dimensions['completeness']:.2f}/5.0 |",
        ],
    ]


def _win_rate_blocks(metrics: dict) -> list:
    """Return the Markdown blocks of the Win Rate section (section 4)."""
    return [
        ["## 4. Win Rate评估结果"],
        ["**胜率统计**:"],
        [
            f"- Win Rate: {metrics['win_rate']:.2%}",
            f"- Loss Rate: {metrics['loss_rate']:.2%}",
            f"- Tie Rate: {metrics['tie_rate']:.2%}",
        ],
        ["**对比次数**:"],
        [
            f"- 总对比次数: {metrics['total_comparisons']}",
            f"- 胜出次数: {metrics['wins']}",
            f"- 失败次数: {metrics['losses']}",
            f"- 平局次数: {metrics['ties']}",
        ],
    ]


def _conclusion_blocks(avg_score: float, win_rate: float) -> list:
    """Return the overall verdict plus the two headline metrics (section 5)."""
    if avg_score >= 4.5 and win_rate >= 0.48:
        verdict = "✅ **结论**: 生成数据质量**优秀**达到或超过AIME真题水平。"
    elif avg_score >= 4.0 and win_rate >= 0.45:
        verdict = "✅ **结论**: 生成数据质量**良好**接近AIME真题水平。"
    else:
        verdict = "⚠️ **结论**: 生成数据质量**需要改进**与AIME真题仍有差距。"
    return [
        [verdict],
        ["**整体指标**:"],
        [
            f"- LLM Judge得分: {avg_score:.2f}/5.0",
            f"- Win Rate: {win_rate:.2%}",
        ],
    ]


def _suggestion_lines(avg_score: float) -> list:
    """Return the improvement suggestions for the given judge score (section 6)."""
    if avg_score >= 4.5:
        return [
            "- ✅ 继续保持当前的生成策略",
            "- ✅ 可以考虑增加生成数量",
        ]
    if avg_score >= 4.0:
        return [
            "- 🔄 优化题目生成的提示词",
            "- 🔄 增加质量过滤步骤",
        ]
    return [
        "- ⚠️ 需要重新设计生成提示词",
        "- ⚠️ 考虑使用更强的生成模型",
        "- ⚠️ 增加人工审核环节",
    ]


def generate_comprehensive_report(
    generated_data_path: str,
    llm_judge_result: Optional[dict],
    win_rate_result: Optional[dict]
) -> str:
    """Build the Markdown text of the comprehensive evaluation report.

    The report always contains the basic information, the topic distribution
    of the generated problems and the next-step instructions. The LLM Judge
    section, the Win Rate section and the suggestions are included only when
    the corresponding result is given; the overall conclusion needs both.

    Markdown blocks (headings, paragraphs, lists, tables) are separated by
    blank lines so that a bold label or a table following a list is not
    absorbed into the last list item.

    Args:
        generated_data_path: Path to the JSON file with the generated
            problems. It is iterated as a sequence of dicts and each item's
            ``topic`` key is read (``'Unknown'`` when absent).
        llm_judge_result: Parsed LLM Judge result, or ``None`` if that
            evaluation is unavailable. Read via ``['metrics']``.
        win_rate_result: Parsed Win Rate result, or ``None`` if that
            evaluation is unavailable. Read via ``['metrics']``.

    Returns:
        The complete report as one Markdown string ending with a newline.

    Raises:
        OSError: If the generated-data file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a provided result lacks one of the metrics used here.
    """
    with open(generated_data_path, 'r', encoding='utf-8') as f:
        generated_data = json.load(f)

    # Taken once so that the header and the footer always show the same time.
    generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Each entry is one Markdown block (a list of lines); blocks are joined
    # with a blank line at the end.
    blocks = [
        ["# AIME数据生成与评估综合报告"],
        ["## 1. 基本信息"],
        [
            f"- **生成时间**: {generated_at}",
            f"- **生成题目数量**: {len(generated_data)}",
            "- **参考AIME年份**: 2025",
            f"- **生成数据路径**: {generated_data_path}",
        ],
        ["## 2. 数据生成统计"],
        ["### 主题分布"],
        _topic_table_rows(generated_data),
    ]

    if llm_judge_result:
        blocks.extend(_llm_judge_blocks(llm_judge_result['metrics']))

    if win_rate_result:
        blocks.extend(_win_rate_blocks(win_rate_result['metrics']))

    blocks.append(["## 5. 综合结论"])
    if llm_judge_result and win_rate_result:
        blocks.extend(_conclusion_blocks(
            llm_judge_result['metrics']['average_total_score'],
            win_rate_result['metrics']['win_rate'],
        ))

    blocks.append(["## 6. 改进建议"])
    if llm_judge_result:
        blocks.append(
            _suggestion_lines(llm_judge_result['metrics']['average_total_score'])
        )

    blocks.extend([
        ["## 7. 下一步行动"],
        [
            "1. **人工验证**: 运行人工验证界面,对生成的题目进行人工审核",
            # Indented to the item's content column so the fenced block stays
            # inside list item 1.
            "   ```bash",
            f"   python data_generation/human_verification_ui.py {generated_data_path}",
            "   ```",
        ],
        ["2. **质量筛选**: 根据评估结果筛选高质量题目"],
        ["3. **迭代优化**: 根据评估反馈优化生成策略"],
        ["---"],
        [f"*报告生成时间: {generated_at}*"],
    ])

    return "\n\n".join("\n".join(block) for block in blocks) + "\n"
def _print_usage():
    """Print the command-line usage text."""
    print("用法: python run_complete_evaluation.py <num_problems> [delay_seconds]")
    print("\n说明:")
    print(" - 使用AIME 2025年真题作为参考")
    print(" - 数据集来源: math-ai/aime25 (JSONL格式)")
    print("\n示例:")
    print("python run_complete_evaluation.py 30 3.0")


def main(argv=None):
    """Command-line entry point: parse the arguments and run the pipeline.

    Expected arguments: ``<num_problems> [delay_seconds]``. ``delay_seconds``
    defaults to 3.0; any further arguments are ignored.

    Args:
        argv: Argument list without the program name. Defaults to
            ``sys.argv[1:]`` when ``None``.

    Raises:
        SystemExit: With status 1, after printing the usage text, when
            ``num_problems`` is missing, an argument is not numeric,
            ``num_problems`` is not positive or ``delay_seconds`` is negative.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        _print_usage()
        sys.exit(1)

    # Parse the command-line arguments; a malformed number yields the usage
    # text instead of a raw traceback.
    try:
        num_problems = int(args[0])
        delay_seconds = float(args[1]) if len(args) > 1 else 3.0
    except ValueError as e:
        print(f"❌ 参数错误: {e}\n")
        _print_usage()
        sys.exit(1)

    if num_problems < 1 or delay_seconds < 0:
        print("❌ 参数错误: num_problems 必须是正整数, delay_seconds 不能为负数\n")
        _print_usage()
        sys.exit(1)

    # Run the full evaluation.
    run_complete_evaluation(
        num_problems=num_problems,
        delay_seconds=delay_seconds
    )
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()