"""
Chapter 12, Example 6: GAIA evaluation best practices

Corresponding documentation: 12.3.9 GAIA evaluation best practices

This example demonstrates best practices for GAIA evaluation, including:
1. Tiered evaluation
2. Quick small-sample testing
3. Interpreting results
"""

import os

from hello_agents import SimpleAgent, HelloAgentsLLM
from hello_agents.tools import GAIAEvaluationTool

# Official GAIA system prompt. It instructs the model to end its reply with
# "FINAL ANSWER: ..." and constrains how numbers, strings and lists are written.
# The five sentences are separated by single newlines; stray "|" filler lines
# that had leaked into the literal were removed so they are not sent to the model.
GAIA_SYSTEM_PROMPT = """You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""

# Create the agent under evaluation: a SimpleAgent backed by a
# default-constructed HelloAgentsLLM and driven by the GAIA system prompt.
llm = HelloAgentsLLM()
agent = SimpleAgent(
    name="TestAgent",
    llm=llm,
    system_prompt=GAIA_SYSTEM_PROMPT,
)

# Create the evaluation tool used by every section below.
gaia_tool = GAIAEvaluationTool()

# ============================================================
# Best practice 1: tiered evaluation
# ============================================================
print("="*60)
print("最佳实践1:分级评估")
print("="*60)

# Step 1: evaluate Level 1 (simple tasks).
print("\n第一步:评估Level 1(简单任务)")
results_l1 = gaia_tool.run(agent, level=1, max_samples=10)
print(f"Level 1精确匹配率: {results_l1['exact_match_rate']:.2%}")

# Step 2: move on to Level 2 only when Level 1 exceeds 60% exact match.
# NOTE: results_l2 / results_l3 are intentionally left undefined when a level
# is skipped; later sections probe for their existence by name.
if results_l1['exact_match_rate'] > 0.6:
    print("\n第二步:评估Level 2(中等任务)")
    results_l2 = gaia_tool.run(agent, level=2, max_samples=10)
    print(f"Level 2精确匹配率: {results_l2['exact_match_rate']:.2%}")

    # Step 3: move on to Level 3 only when Level 2 exceeds 40% exact match.
    if results_l2['exact_match_rate'] > 0.4:
        print("\n第三步:评估Level 3(困难任务)")
        results_l3 = gaia_tool.run(agent, level=3, max_samples=10)
        print(f"Level 3精确匹配率: {results_l3['exact_match_rate']:.2%}")
    else:
        print("\n⚠️ Level 2表现不佳,建议先优化后再评估Level 3")
else:
    print("\n⚠️ Level 1表现不佳,建议先优化后再评估更高级别")

# ============================================================
# Best practice 2: quick small-sample testing
# ============================================================
print("\n" + "="*60)
print("最佳实践2:小样本快速测试")
print("="*60)

# Quick smoke test: two samples per difficulty level.
for level in [1, 2, 3]:
    print(f"\n快速测试 Level {level}:")
    results = gaia_tool.run(agent, level=level, max_samples=2)
    print(f" 精确匹配率: {results['exact_match_rate']:.2%}")

# ============================================================
# Best practice 3: interpreting results
# ============================================================
print("\n" + "="*60)
print("最佳实践3:结果解读")
print("="*60)

# Per-level rubric used by interpret_results:
#   level -> (excellent threshold, good threshold,
#             excellent message, good message, improvement suggestions)
_GAIA_RUBRICS = {
    1: (
        0.6,
        0.4,
        "✅ 优秀 - 基础能力扎实",
        "⚠️ 良好 - 基础能力可用",
        (
            " - 检查系统提示词是否包含GAIA官方格式要求",
            " - 检查答案提取逻辑是否正确",
            " - 检查LLM模型是否足够强大",
        ),
    ),
    2: (
        0.4,
        0.2,
        "✅ 优秀 - 中等任务能力强",
        "⚠️ 良好 - 中等任务能力可用",
        (
            " - 增强多步推理能力",
            " - 增加工具使用能力",
            " - 优化推理链的构建",
        ),
    ),
    3: (
        0.2,
        0.1,
        "✅ 优秀 - 复杂任务能力强",
        "⚠️ 良好 - 复杂任务能力可用",
        (
            " - 增强复杂推理能力",
            " - 增加长上下文处理能力",
            " - 优化工具链的组合使用",
        ),
    ),
}


def interpret_results(level, exact_match_rate):
    """Print a human-readable interpretation of one GAIA level's result.

    Always prints a header with the level and the exact-match rate formatted
    as a percentage. For levels 1, 2 and 3 it then prints a verdict:
    "excellent" when the rate reaches the level's upper threshold, "good" when
    it reaches the lower threshold, otherwise "poor" followed by a list of
    improvement suggestions. Thresholds (excellent / good) are 0.6 / 0.4 for
    level 1, 0.4 / 0.2 for level 2 and 0.2 / 0.1 for level 3.

    Args:
        level: GAIA difficulty level. Values other than 1, 2 or 3 only get
            the header; no verdict is printed.
        exact_match_rate: Exact-match rate as a fraction (e.g. 0.35 for 35%).

    Returns:
        None. All output goes to stdout.
    """
    print(f"\nLevel {level} 结果解读:")
    print(f"精确匹配率: {exact_match_rate:.2%}")

    rubric = _GAIA_RUBRICS.get(level)
    if rubric is None:
        # Unknown level: nothing beyond the header.
        return

    excellent_at, good_at, excellent_msg, good_msg, suggestions = rubric
    if exact_match_rate >= excellent_at:
        print(excellent_msg)
    elif exact_match_rate >= good_at:
        print(good_msg)
    else:
        # NOTE(review): the source had lost its indentation; the suggestions
        # are assumed to belong to the "poor" branch only - confirm.
        print("❌ 较差 - 需要改进")
        print("建议:")
        for tip in suggestions:
            print(tip)

# Interpret the results of whichever levels were actually evaluated in the
# tiered run. A results_lN name exists only if that level was run, so each
# call is guarded by a name lookup (at module scope locals() is the module
# namespace).
if 'results_l1' in locals():
    interpret_results(1, results_l1['exact_match_rate'])
if 'results_l2' in locals():
    interpret_results(2, results_l2['exact_match_rate'])
if 'results_l3' in locals():
    interpret_results(3, results_l3['exact_match_rate'])

# ============================================================
# Difficulty progression analysis
# ============================================================
print("\n" + "="*60)
print("难度递进分析")
print("="*60)

# Compare adjacent levels: the exact-match rate is expected to drop as the
# level rises. Each comparison runs only when both levels were evaluated.
if 'results_l1' in locals() and 'results_l2' in locals():
    if results_l1['exact_match_rate'] > results_l2['exact_match_rate']:
        print("✅ 正常递进:Level 1 > Level 2")
    else:
        print("⚠️ 异常情况:Level 2 >= Level 1(可能是数据集偏差或智能体特性)")

if 'results_l2' in locals() and 'results_l3' in locals():
    if results_l2['exact_match_rate'] > results_l3['exact_match_rate']:
        print("✅ 正常递进:Level 2 > Level 3")
    else:
        print("⚠️ 异常情况:Level 3 >= Level 2(可能是数据集偏差或智能体特性)")