|
|
"""
|
|
|
第十二章示例5:GAIA快速开始
|
|
|
|
|
|
对应文档:12.3.5 在HelloAgents中实现GAIA评估 - 方式1
|
|
|
|
|
|
这是最简单的GAIA评估方式,一行代码完成评估。
|
|
|
|
|
|
重要提示:
|
|
|
1. GAIA是受限数据集,需要先在HuggingFace上申请访问权限
|
|
|
2. 需要设置HF_TOKEN环境变量
|
|
|
3. 必须使用GAIA官方系统提示词
|
|
|
"""
|
|
|
|
|
|
import os
|
|
|
from hello_agents import SimpleAgent, HelloAgentsLLM
|
|
|
from hello_agents.tools import GAIAEvaluationTool
|
|
|
|
|
|
# GAIA官方系统提示词(必须使用)
|
|
|
GAIA_SYSTEM_PROMPT = """You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
|
|
|
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
|
|
|
If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
|
|
|
If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
|
|
|
If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""
|
|
|
|
|
|
# 1. 设置HuggingFace Token(如果还没设置)
|
|
|
# os.environ["HF_TOKEN"] = "your_huggingface_token_here"
|
|
|
|
|
|
# 2. 创建智能体(必须使用GAIA官方系统提示词)
|
|
|
llm = HelloAgentsLLM()
|
|
|
agent = SimpleAgent(
|
|
|
name="TestAgent",
|
|
|
llm=llm,
|
|
|
system_prompt=GAIA_SYSTEM_PROMPT # 必须使用官方提示词
|
|
|
)
|
|
|
|
|
|
# 3. 创建GAIA评估工具
|
|
|
gaia_tool = GAIAEvaluationTool()
|
|
|
|
|
|
# 4. 运行评估
|
|
|
results = gaia_tool.run(
|
|
|
agent=agent,
|
|
|
level=1, # 评估级别(1=简单,2=中等,3=困难)
|
|
|
max_samples=2, # 评估样本数(0表示全部)
|
|
|
export_results=True, # 导出结果到GAIA官方格式
|
|
|
generate_report=True # 生成详细报告
|
|
|
)
|
|
|
|
|
|
# 5. 查看结果
|
|
|
print(f"\n评估结果:")
|
|
|
print(f"精确匹配率: {results['exact_match_rate']:.2%}")
|
|
|
print(f"部分匹配率: {results['partial_match_rate']:.2%}")
|
|
|
print(f"正确数: {results['correct_samples']}/{results['total_samples']}")
|
|
|
|
|
|
# 运行输出示例:
|
|
|
# ============================================================
|
|
|
# GAIA一键评估
|
|
|
# ============================================================
|
|
|
#
|
|
|
# 配置:
|
|
|
# 智能体: TestAgent
|
|
|
# 级别: Level 1
|
|
|
# 样本数: 2
|
|
|
#
|
|
|
# ✅ GAIA数据集加载完成
|
|
|
# 数据源: gaia-benchmark/GAIA
|
|
|
# 分割: validation
|
|
|
# 级别: 1
|
|
|
# 样本数: 2
|
|
|
#
|
|
|
# 评估进度: 100%|██████████| 2/2 [00:10<00:00, 5.23s/样本]
|
|
|
#
|
|
|
# ✅ 评估完成
|
|
|
# 总样本数: 2
|
|
|
# 正确样本数: 2
|
|
|
# 精确匹配率: 100.00%
|
|
|
# 部分匹配率: 100.00%
|
|
|
#
|
|
|
# ✅ 结果已导出到 ./evaluation_results/gaia_submission.json
|
|
|
# ✅ 报告已生成到 ./evaluation_results/gaia_report.md
|
|
|
#
|
|
|
# 评估结果:
|
|
|
# 精确匹配率: 100.00%
|
|
|
# 部分匹配率: 100.00%
|
|
|
# 正确数: 2/2
|
|
|
|