# NOTE: removed Gitea repository-page chrome (topic-limit text, "168 lines / 5.0 KiB"
# banner, and the ambiguous-Unicode warning) that was accidentally captured together
# with this source file when it was copied from the web view.

"""
Chapter 12, Example 8: LLM Judge evaluation.

Corresponds to section 12.4.3 (LLM Judge evaluation) of the book.
This example shows how to use an LLM Judge to assess the quality of
generated AIME problems.

The LLM Judge scores each problem along 4 dimensions:
1. Correctness  - whether the problem and its answer are correct
2. Clarity      - whether the problem statement is clearly worded
3. Difficulty   - whether the difficulty matches AIME level
4. Completeness - whether the problem is complete
"""
import sys
import os
import json
# 添加HelloAgents路径
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "HelloAgents"))
from hello_agents import HelloAgentsLLM
from hello_agents.evaluation import LLMJudge
# 1. The generated problems to be judged. Each entry carries an id, the
#    problem statement, the final answer, and a (truncated) solution sketch.
generated_problems = [
    {
        "problem_id": "generated_001",
        "problem": "Find the number of positive integers $n$ such that $n^2 + 19n + 92$ is a perfect square.",
        "answer": "4",
        "solution": "Let $n^2 + 19n + 92 = m^2$ for some positive integer $m$...",
    },
    {
        "problem_id": "generated_002",
        "problem": "In triangle $ABC$, $AB = 13$, $BC = 14$, and $CA = 15$. Find the area of the triangle.",
        "answer": "84",
        "solution": "Using Heron's formula, $s = (13+14+15)/2 = 21$...",
    },
]
# 2. Create the LLM Judge evaluator, backed by a GPT-4o client.
llm = HelloAgentsLLM(model_name="gpt-4o")
judge = LLMJudge(llm=llm)
# 3. Evaluate each generated problem with the LLM Judge.
#    (Fix: the pasted source had lost the loop-body indentation, which made
#    this a SyntaxError; the structure is restored here.)
print("="*60)
print("LLM Judge评估")
print("="*60)
all_scores = []  # one result dict per problem; consumed by the summary below
for i, problem in enumerate(generated_problems, 1):
    print(f"\n评估题目 {i}/{len(generated_problems)}")
    print(f"题目ID: {problem['problem_id']}")
    # Score one problem. The result is expected to contain the four
    # dimension scores plus 'average_score' and a free-form 'feedback'
    # string — TODO confirm against LLMJudge.evaluate_single's contract.
    result = judge.evaluate_single(problem)
    # Display the per-dimension scores (each on a /5 scale).
    print(f"\n评估结果:")
    print(f" 正确性: {result['correctness']}/5")
    print(f" 清晰度: {result['clarity']}/5")
    print(f" 难度匹配: {result['difficulty_match']}/5")
    print(f" 完整性: {result['completeness']}/5")
    print(f" 平均分: {result['average_score']:.2f}/5")
    print(f"\n评语:")
    print(f" {result['feedback']}")
    all_scores.append(result)
# 4. Aggregate the per-problem scores into overall statistics.
print("\n" + "="*60)
print("总体统计")
print("="*60)

def _mean_of(key):
    # Average of one score dimension across all evaluated problems.
    return sum(score[key] for score in all_scores) / len(all_scores)

avg_correctness = _mean_of('correctness')
avg_clarity = _mean_of('clarity')
avg_difficulty = _mean_of('difficulty_match')
avg_completeness = _mean_of('completeness')
avg_overall = _mean_of('average_score')

print(f"\n平均分:")
print(f" 正确性: {avg_correctness:.2f}/5")
print(f" 清晰度: {avg_clarity:.2f}/5")
print(f" 难度匹配: {avg_difficulty:.2f}/5")
print(f" 完整性: {avg_completeness:.2f}/5")
print(f" 总体平均: {avg_overall:.2f}/5")
# 5. Map the overall average score to a qualitative verdict.
#    (Fix: the pasted source had lost the if/elif branch indentation, which
#    made this a SyntaxError; the structure is restored here.)
print(f"\n质量评估:")
if avg_overall >= 4.0:
    print("✅ 优秀 - 题目质量很高,可以直接使用")
elif avg_overall >= 3.0:
    print("⚠️ 良好 - 题目质量可用,建议人工审核")
elif avg_overall >= 2.0:
    print("⚠️ 一般 - 题目质量一般,需要大幅改进")
else:
    print("❌ 较差 - 题目质量差,需要重新生成")
# 6. Persist the problems, per-problem scores, and aggregate statistics as JSON.
#    (Fix: the pasted source had lost the `with`-block indentation, which made
#    this a SyntaxError; the structure is restored here.)
output_file = "./evaluation_results/llm_judge_results.json"
# Ensure the output directory exists before writing.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps Chinese feedback text readable in the file.
    json.dump({
        'problems': generated_problems,
        'scores': all_scores,
        'statistics': {
            'avg_correctness': avg_correctness,
            'avg_clarity': avg_clarity,
            'avg_difficulty': avg_difficulty,
            'avg_completeness': avg_completeness,
            'avg_overall': avg_overall
        }
    }, f, indent=2, ensure_ascii=False)
print(f"\n✅ 评估结果已保存到 {output_file}")
# 运行输出示例:
# ============================================================
# LLM Judge评估
# ============================================================
#
# 评估题目 1/2
# 题目ID: generated_001
#
# 评估结果:
# 正确性: 5/5
# 清晰度: 4/5
# 难度匹配: 5/5
# 完整性: 5/5
# 平均分: 4.75/5
#
# 评语:
# This is an excellent AIME-level problem. The problem is well-posed,
# the solution is correct, and the difficulty is appropriate.
#
# 评估题目 2/2
# 题目ID: generated_002
#
# 评估结果:
# 正确性: 5/5
# 清晰度: 5/5
# 难度匹配: 3/5
# 完整性: 5/5
# 平均分: 4.50/5
#
# 评语:
# The problem is correct and clear, but the difficulty is slightly
# below AIME level. Consider adding more complexity.
#
# ============================================================
# 总体统计
# ============================================================
#
# 平均分:
# 正确性: 5.00/5
# 清晰度: 4.50/5
# 难度匹配: 4.00/5
# 完整性: 5.00/5
# 总体平均: 4.62/5
#
# 质量评估:
# ✅ 优秀 - 题目质量很高,可以直接使用
#
# ✅ 评估结果已保存到 ./evaluation_results/llm_judge_results.json