You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

86 lines
2.9 KiB
Python

6 months ago
"""
第十二章示例5GAIA快速开始
对应文档12.3.5 在HelloAgents中实现GAIA评估 - 方式1
这是最简单的GAIA评估方式一行代码完成评估
重要提示
1. GAIA是受限数据集需要先在HuggingFace上申请访问权限
2. 需要设置HF_TOKEN环境变量
3. 必须使用GAIA官方系统提示词
"""
import os
from hello_agents import SimpleAgent, HelloAgentsLLM
from hello_agents.tools import GAIAEvaluationTool
# GAIA官方系统提示词必须使用
GAIA_SYSTEM_PROMPT = """You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""
# 1. 设置HuggingFace Token如果还没设置
# os.environ["HF_TOKEN"] = "your_huggingface_token_here"
# 2. 创建智能体必须使用GAIA官方系统提示词
llm = HelloAgentsLLM()
agent = SimpleAgent(
name="TestAgent",
llm=llm,
system_prompt=GAIA_SYSTEM_PROMPT # 必须使用官方提示词
)
# 3. 创建GAIA评估工具
gaia_tool = GAIAEvaluationTool()
# 4. 运行评估
results = gaia_tool.run(
agent=agent,
level=1, # 评估级别1=简单2=中等3=困难)
max_samples=2, # 评估样本数0表示全部
export_results=True, # 导出结果到GAIA官方格式
generate_report=True # 生成详细报告
)
# 5. 查看结果
print(f"\n评估结果:")
print(f"精确匹配率: {results['exact_match_rate']:.2%}")
print(f"部分匹配率: {results['partial_match_rate']:.2%}")
print(f"正确数: {results['correct_samples']}/{results['total_samples']}")
# 运行输出示例:
# ============================================================
# GAIA一键评估
# ============================================================
#
# 配置:
# 智能体: TestAgent
# 级别: Level 1
# 样本数: 2
#
# ✅ GAIA数据集加载完成
# 数据源: gaia-benchmark/GAIA
# 分割: validation
# 级别: 1
# 样本数: 2
#
# 评估进度: 100%|██████████| 2/2 [00:10<00:00, 5.23s/样本]
#
# ✅ 评估完成
# 总样本数: 2
# 正确样本数: 2
# 精确匹配率: 100.00%
# 部分匹配率: 100.00%
#
# ✅ 结果已导出到 ./evaluation_results/gaia_submission.json
# ✅ 报告已生成到 ./evaluation_results/gaia_report.md
#
# 评估结果:
# 精确匹配率: 100.00%
# 部分匹配率: 100.00%
# 正确数: 2/2