""" 第十二章示例8:LLM Judge评估 对应文档:12.4.3 LLM Judge评估 这个示例展示如何使用LLM Judge评估生成的AIME题目质量。 LLM Judge从4个维度评估题目质量: 1. 正确性(Correctness):题目和答案是否正确 2. 清晰度(Clarity):题目表述是否清晰 3. 难度匹配(Difficulty Match):难度是否符合AIME水平 4. 完整性(Completeness):题目是否完整 """ import sys import os import json # 添加HelloAgents路径 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "HelloAgents")) from hello_agents import HelloAgentsLLM from hello_agents.evaluation import LLMJudge # 1. 准备生成的题目数据 generated_problems = [ { "problem_id": "generated_001", "problem": "Find the number of positive integers $n$ such that $n^2 + 19n + 92$ is a perfect square.", "answer": "4", "solution": "Let $n^2 + 19n + 92 = m^2$ for some positive integer $m$..." }, { "problem_id": "generated_002", "problem": "In triangle $ABC$, $AB = 13$, $BC = 14$, and $CA = 15$. Find the area of the triangle.", "answer": "84", "solution": "Using Heron's formula, $s = (13+14+15)/2 = 21$..." } ] # 2. 创建LLM Judge评估器 llm = HelloAgentsLLM(model_name="gpt-4o") judge = LLMJudge(llm=llm) # 3. 评估每道题目 print("="*60) print("LLM Judge评估") print("="*60) all_scores = [] for i, problem in enumerate(generated_problems, 1): print(f"\n评估题目 {i}/{len(generated_problems)}") print(f"题目ID: {problem['problem_id']}") # 评估单道题目 result = judge.evaluate_single(problem) # 显示评估结果 print(f"\n评估结果:") print(f" 正确性: {result['correctness']}/5") print(f" 清晰度: {result['clarity']}/5") print(f" 难度匹配: {result['difficulty_match']}/5") print(f" 完整性: {result['completeness']}/5") print(f" 平均分: {result['average_score']:.2f}/5") print(f"\n评语:") print(f" {result['feedback']}") all_scores.append(result) # 4. 计算总体统计 print("\n" + "="*60) print("总体统计") print("="*60) avg_correctness = sum(s['correctness'] for s in all_scores) / len(all_scores) avg_clarity = sum(s['clarity'] for s in all_scores) / len(all_scores) avg_difficulty = sum(s['difficulty_match'] for s in all_scores) / len(all_scores) avg_completeness = sum(s['completeness'] for s in all_scores) / len(all_scores) avg_overall = sum(s['average_score'] for s in all_scores) / len(all_scores) print(f"\n平均分:") print(f" 正确性: {avg_correctness:.2f}/5") print(f" 清晰度: {avg_clarity:.2f}/5") print(f" 难度匹配: {avg_difficulty:.2f}/5") print(f" 完整性: {avg_completeness:.2f}/5") print(f" 总体平均: {avg_overall:.2f}/5") # 5. 质量评估 print(f"\n质量评估:") if avg_overall >= 4.0: print("✅ 优秀 - 题目质量很高,可以直接使用") elif avg_overall >= 3.0: print("⚠️ 良好 - 题目质量可用,建议人工审核") elif avg_overall >= 2.0: print("⚠️ 一般 - 题目质量一般,需要大幅改进") else: print("❌ 较差 - 题目质量差,需要重新生成") # 6. 保存评估结果 output_file = "./evaluation_results/llm_judge_results.json" os.makedirs(os.path.dirname(output_file), exist_ok=True) with open(output_file, 'w', encoding='utf-8') as f: json.dump({ 'problems': generated_problems, 'scores': all_scores, 'statistics': { 'avg_correctness': avg_correctness, 'avg_clarity': avg_clarity, 'avg_difficulty': avg_difficulty, 'avg_completeness': avg_completeness, 'avg_overall': avg_overall } }, f, indent=2, ensure_ascii=False) print(f"\n✅ 评估结果已保存到 {output_file}") # 运行输出示例: # ============================================================ # LLM Judge评估 # ============================================================ # # 评估题目 1/2 # 题目ID: generated_001 # # 评估结果: # 正确性: 5/5 # 清晰度: 4/5 # 难度匹配: 5/5 # 完整性: 5/5 # 平均分: 4.75/5 # # 评语: # This is an excellent AIME-level problem. The problem is well-posed, # the solution is correct, and the difficulty is appropriate. 
#
# Evaluating problem 2/2
# Problem ID: generated_002
#
# Evaluation result:
#   Correctness: 5/5
#   Clarity: 5/5
#   Difficulty match: 3/5
#   Completeness: 5/5
#   Average score: 4.50/5
#
# Feedback:
#   The problem is correct and clear, but the difficulty is slightly
#   below AIME level. Consider adding more complexity.
#
# ============================================================
# Overall Statistics
# ============================================================
#
# Average scores:
#   Correctness: 5.00/5
#   Clarity: 4.50/5
#   Difficulty match: 4.00/5
#   Completeness: 5.00/5
#   Overall average: 4.62/5
#
# Quality assessment:
# ✅ Excellent - problem quality is high and can be used directly
#
# ✅ Evaluation results saved to ./evaluation_results/llm_judge_results.json
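
# ------------------------------------------------------------------
# Optional follow-up (not shown in the sample output above): a minimal
# sketch of how the saved results could be used to keep only problems
# whose average LLM Judge score clears a quality bar. It relies solely
# on the JSON structure written in step 6; the 4.0 cutoff mirrors the
# "Excellent" threshold from step 5 and is an assumption, not something
# required by HelloAgents.

with open(output_file, 'r', encoding='utf-8') as f:
    saved = json.load(f)

# Pair each problem with its score entry and keep the high-scoring ones
accepted = [
    p for p, s in zip(saved['problems'], saved['scores'])
    if s['average_score'] >= 4.0
]
print(f"\n{len(accepted)}/{len(saved['problems'])} problems passed the 4.0 quality threshold")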