You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

86 lines
2.9 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

"""
第十二章示例5GAIA快速开始
对应文档12.3.5 在HelloAgents中实现GAIA评估 - 方式1
这是最简单的GAIA评估方式一行代码完成评估。
重要提示:
1. GAIA是受限数据集需要先在HuggingFace上申请访问权限
2. 需要设置HF_TOKEN环境变量
3. 必须使用GAIA官方系统提示词
"""
import os
from hello_agents import SimpleAgent, HelloAgentsLLM
from hello_agents.tools import GAIAEvaluationTool
# Official GAIA system prompt (its use is mandatory for this evaluation).
# The text is assembled from five paragraphs joined by newlines; keep the
# wording exactly as written, including its unusual grammar.
GAIA_SYSTEM_PROMPT = "\n".join(
    (
        # Answer template the model must finish with.
        "You are a general AI assistant. I will ask you a question. "
        "Report your thoughts, and finish your answer with the following "
        "template: FINAL ANSWER: [YOUR FINAL ANSWER].",
        # Allowed answer shapes.
        "YOUR FINAL ANSWER should be a number OR as few words as possible "
        "OR a comma separated list of numbers and/or strings.",
        # Formatting rules for numeric answers.
        "If you are asked for a number, don't use comma to write your number "
        "neither use units such as $ or percent sign unless specified otherwise.",
        # Formatting rules for string answers.
        "If you are asked for a string, don't use articles, neither "
        "abbreviations (e.g. for cities), and write the digits in plain text "
        "unless specified otherwise.",
        # Formatting rules for list answers.
        "If you are asked for a comma separated list, apply the above rules "
        "depending of whether the element to be put in the list is a number "
        "or a string.",
    )
)
# 1. Set the HuggingFace token here if it has not been set already.
# os.environ["HF_TOKEN"] = "your_huggingface_token_here"

# 2. Create the agent to be evaluated. It must be configured with the
#    official GAIA system prompt.
llm = HelloAgentsLLM()
agent = SimpleAgent(
    name="TestAgent",
    llm=llm,
    system_prompt=GAIA_SYSTEM_PROMPT,  # the official prompt is required
)
# 3. Create the GAIA evaluation tool.
gaia_tool = GAIAEvaluationTool()

# 4. Run the evaluation against the agent built above.
results = gaia_tool.run(
    agent=agent,
    level=1,  # difficulty level: 1 = easy, 2 = medium, 3 = hard
    max_samples=2,  # number of samples to evaluate; 0 means all of them
    export_results=True,  # export the results in the official GAIA format
    generate_report=True,  # generate a detailed report
)
# 5. Show the results: a header line, the two match rates formatted as
#    percentages with two decimals, then the correct/total sample counts.
print("\n评估结果:")
for metric_label, metric_key in (
    ("精确匹配率", "exact_match_rate"),
    ("部分匹配率", "partial_match_rate"),
):
    # Each rate is looked up and printed in turn, in the original order.
    print(f"{metric_label}: {results[metric_key]:.2%}")
print(f"正确数: {results['correct_samples']}/{results['total_samples']}")
# Sample output from a run:
# ============================================================
# GAIA一键评估
# ============================================================
#
# 配置:
# 智能体: TestAgent
# 级别: Level 1
# 样本数: 2
#
# ✅ GAIA数据集加载完成
# 数据源: gaia-benchmark/GAIA
# 分割: validation
# 级别: 1
# 样本数: 2
#
# 评估进度: 100%|██████████| 2/2 [00:10<00:00, 5.23s/样本]
#
# ✅ 评估完成
# 总样本数: 2
# 正确样本数: 2
# 精确匹配率: 100.00%
# 部分匹配率: 100.00%
#
# ✅ 结果已导出到 ./evaluation_results/gaia_submission.json
# ✅ 报告已生成到 ./evaluation_results/gaia_report.md
#
# 评估结果:
# 精确匹配率: 100.00%
# 部分匹配率: 100.00%
# 正确数: 2/2