You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

294 lines
9.0 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

"""
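Chapter 12: one-click BFCL evaluation script.

This script runs the complete BFCL evaluation workflow:
1. Check that the BFCL data is available (clone instructions are printed if it is missing)
2. Run the HelloAgents evaluation
3. Export the results in BFCL format
4. Invoke the official BFCL evaluation tool
5. Display the evaluation results

Usage:
    python examples/04_run_bfcl_evaluation.py

Optional arguments:
    --category: evaluation category (default: simple_python)
    --samples: number of samples (default: 5; 0 means all samples)
    --model-name: model name (default: Qwen/Qwen3-8B; must be a model supported by BFCL)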
"""
import argparse
import json
import subprocess
import sys
from pathlib import Path
from typing import Optional
# Put the project root (two levels above this file) at the front of sys.path so
# the hello_agents imports that follow are looked up there first.
# resolve() makes the root absolute even when __file__ is a relative path, so
# the result does not depend on the directory the script was launched from.
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root))
from hello_agents import SimpleAgent, HelloAgentsLLM
from hello_agents.evaluation import BFCLDataset, BFCLEvaluator
# System prompt for function calling.
# It tells the model to reply with nothing but a JSON array of
# {"name": ..., "arguments": {...}} objects (an empty array when no call is
# needed). run_evaluation passes it to SimpleAgent as `system_prompt`.
# The text is sent to the model verbatim, so it is runtime data, not a comment.
FUNCTION_CALLING_SYSTEM_PROMPT = """你是一个专业的函数调用助手。
你的任务是:根据用户的问题和提供的函数定义,生成正确的函数调用。
输出格式要求:
1. 必须是纯JSON格式不要添加任何解释文字
2. 使用JSON数组格式[{"name": "函数名", "arguments": {"参数名": "参数值"}}]
3. 如果需要调用多个函数,在数组中添加多个对象
4. 如果不需要调用函数,返回空数组:[]
示例:
用户问题:查询北京的天气
可用函数get_weather(city: str)
正确输出:[{"name": "get_weather", "arguments": {"city": "北京"}}]
注意:
- 只输出JSON不要添加"好的""我来帮你"等额外文字
- 参数值必须与函数定义的类型匹配
- 参数名必须与函数定义完全一致
"""
def check_bfcl_data(bfcl_data_dir: Path) -> bool:
    """Check that the BFCL data directory is available.

    When the directory is missing, instructions for cloning the BFCL
    repository are printed.

    Args:
        bfcl_data_dir: Expected location of the BFCL data directory.

    Returns:
        True if ``bfcl_data_dir`` is an existing directory, False otherwise.
    """
    # is_dir() rather than exists(): a regular file sitting at this path is
    # not a usable data directory and must be reported as missing too.
    if bfcl_data_dir.is_dir():
        return True

    print(f"\n❌ BFCL数据目录不存在: {bfcl_data_dir}")
    print("\n请先克隆BFCL仓库")
    print(" git clone --depth 1 https://github.com/ShishirPatil/gorilla.git temp_gorilla")
    return False
def run_evaluation(category: str, max_samples: int, model_name: str) -> Optional[dict]:
    """Run the HelloAgents evaluation on one BFCL category.

    Args:
        category: BFCL test category to load and evaluate.
        max_samples: Number of samples to evaluate; zero or a negative value
            means "all samples".
        model_name: Name given to the agent under evaluation.

    Returns:
        The results dict returned by ``BFCLEvaluator.evaluate``, or None when
        the BFCL data directory is not available.
    """
    print("\n" + "=" * 60)
    print("步骤1: 运行HelloAgents评估")
    print("=" * 60)

    # Location of the BFCL data inside the cloned gorilla repository.
    bfcl_data_dir = (
        project_root / "temp_gorilla" / "berkeley-function-call-leaderboard" / "bfcl_eval" / "data"
    )
    if not check_bfcl_data(bfcl_data_dir):
        return None

    # Load the dataset.
    print("\n📚 加载BFCL数据集...")
    dataset = BFCLDataset(bfcl_data_dir=str(bfcl_data_dir), category=category)

    # Build the agent under test.
    print("\n🤖 创建智能体...")
    llm = HelloAgentsLLM()
    agent = SimpleAgent(
        name=model_name,
        llm=llm,
        system_prompt=FUNCTION_CALLING_SYSTEM_PROMPT,
        enable_tool_calling=False,
    )
    print(f" 智能体: {model_name}")
    print(f" LLM: {llm.provider}")

    evaluator = BFCLEvaluator(dataset=dataset, category=category)

    # A non-positive sample count is forwarded to the evaluator as None
    # ("no limit"), so there is a single evaluate() call for both cases.
    print("\n🔄 开始评估...")
    sample_limit = max_samples if max_samples > 0 else None
    if sample_limit is None:
        print(" 样本数量: 全部")
    else:
        print(f" 样本数量: {sample_limit}")
    results = evaluator.evaluate(agent, max_samples=sample_limit)

    # Summarize the outcome.
    print("\n📊 评估结果:")
    print(f" 准确率: {results['overall_accuracy']:.2%}")
    print(f" 正确数: {results['correct_samples']}/{results['total_samples']}")
    return results
def export_bfcl_format(results: dict, category: str, model_name: str) -> Path:
    """Export evaluation results to a BFCL-format JSON file.

    Args:
        results: Evaluation results to hand to the evaluator's exporter.
        category: BFCL test category; it selects the dataset and is part of
            the output file name.
        model_name: Accepted for signature parity with the other steps; this
            function does not use it.

    Returns:
        Path of the file passed to ``export_to_bfcl_format``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("步骤2: 导出BFCL格式结果")
    print(rule)

    # Make sure the destination directory exists before anything else.
    export_dir = project_root / "evaluation_results" / "bfcl_official"
    export_dir.mkdir(parents=True, exist_ok=True)
    export_path = export_dir / f"BFCL_v4_{category}_result.json"

    # A fresh evaluator is built here only to reach its export method.
    data_root = (
        project_root / "temp_gorilla" / "berkeley-function-call-leaderboard" / "bfcl_eval" / "data"
    )
    exporter = BFCLEvaluator(
        dataset=BFCLDataset(bfcl_data_dir=str(data_root), category=category),
        category=category,
    )
    exporter.export_to_bfcl_format(results, export_path)
    return export_path
def copy_to_bfcl_result_dir(source_file: Path, model_name: str, category: str) -> Path:
    """Copy an exported result file into the BFCL result directory.

    The file is placed at ``<project_root>/result/<model>/BFCL_v4_<category>_result.json``.

    Args:
        source_file: The exported result file to copy.
        model_name: Model name; any "/" is replaced with "_" to form the
            directory name.
        category: BFCL test category, used in the target file name.

    Returns:
        Path of the copied file.
    """
    import shutil

    rule = "=" * 60
    print(f"\n{rule}")
    print("步骤3: 准备BFCL官方评估")
    print(rule)

    # Per the original note, BFCL replaces "/" in model names with "_",
    # so the directory name has to follow the same rule.
    model_dir = project_root / "result" / model_name.replace("/", "_")
    model_dir.mkdir(parents=True, exist_ok=True)

    destination = model_dir / f"BFCL_v4_{category}_result.json"
    shutil.copy(source_file, destination)

    print("\n✅ 结果文件已复制到:")
    print(f" {destination}")
    return destination
def run_bfcl_official_eval(model_name: str, category: str) -> bool:
    """Run the official ``bfcl evaluate`` command for one model and category.

    The command is executed with the project root as its working directory
    and its output is captured and printed afterwards.

    Args:
        model_name: Value passed to ``--model``.
        category: Value passed to ``--test-category``.

    Returns:
        True if the command exited with status 0; False if it failed, the
        ``bfcl`` executable was not found, or any other error occurred.
    """
    import os

    print("\n" + "=" * 60)
    print("步骤4: 运行BFCL官方评估")
    print("=" * 60)

    try:
        # Inherited by the child process so a Python-based tool runs in UTF-8 mode.
        os.environ['PYTHONUTF8'] = '1'

        cmd = [
            "bfcl", "evaluate",
            "--model", model_name,
            "--test-category", category,
            "--partial-eval",
        ]
        print(f"\n🔄 运行命令: {' '.join(cmd)}")

        # errors='replace': a stray non-UTF-8 byte in the tool's output must
        # not raise UnicodeDecodeError and be mistaken for a failed evaluation.
        result = subprocess.run(
            cmd,
            cwd=str(project_root),
            capture_output=True,
            text=True,
            encoding='utf-8',
            errors='replace',
        )

        # Show whatever the tool printed.
        if result.stdout:
            print(result.stdout)

        if result.returncode != 0:
            print("\n❌ BFCL评估失败:")
            if result.stderr:
                print(result.stderr)
            return False
        return True
    except FileNotFoundError:
        # The bfcl executable is not on PATH.
        print("\n❌ 未找到bfcl命令")
        print(" 请先安装: pip install bfcl-eval")
        return False
    except Exception as e:
        # Best-effort step: report the problem and let the caller continue.
        print(f"\n❌ 运行BFCL评估时出错: {e}")
        return False
def show_results(model_name: str, category: str):
    """Print the score files found under ``<project_root>/score``.

    The summary CSV is printed in full if present. If the per-model score
    file exists, its first line is parsed as JSON and the accuracy and
    correct/total counts are printed.

    Args:
        model_name: Model name; any "/" is replaced with "_" to locate the
            model's score directory.
        category: BFCL test category, used in the score file name.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("步骤5: 展示评估结果")
    print(rule)

    score_root = project_root / "score"

    # Summary CSV.
    csv_file = score_root / "data_non_live.csv"
    if not csv_file.exists():
        print(f"\n⚠️ 未找到评估结果文件: {csv_file}")
    else:
        print("\n📊 评估结果汇总:")
        print(csv_file.read_text(encoding='utf-8'))

    # Detailed per-model score file; nothing more to show if it is absent.
    model_dir_name = model_name.replace("/", "_")
    score_file = score_root / model_dir_name / "non_live" / f"BFCL_v4_{category}_score.json"
    if not score_file.exists():
        return

    print("\n📝 详细评分文件:")
    print(f" {score_file}")

    # Only the first line is read; it is parsed as the JSON summary record.
    with score_file.open('r', encoding='utf-8') as handle:
        summary = json.loads(handle.readline())

    print("\n🎯 最终结果:")
    print(f" 准确率: {summary['accuracy']:.2%}")
    print(f" 正确数: {summary['correct_count']}/{summary['total_count']}")
def main():
    """Parse the command-line options and run the five pipeline steps in order.

    Returns early if the HelloAgents evaluation yields no results or if the
    official BFCL evaluation fails.
    """
    parser = argparse.ArgumentParser(description="BFCL一键评估脚本")
    parser.add_argument("--category", default="simple_python", help="评估类别")
    parser.add_argument("--samples", type=int, default=5, help="样本数量0表示全部")
    parser.add_argument(
        "--model-name",
        default="Qwen/Qwen3-8B",
        help="模型名称必须是BFCL支持的模型运行'bfcl models'查看)",
    )
    args = parser.parse_args()
    category, samples, model_name = args.category, args.samples, args.model_name

    rule = "=" * 60
    print(rule)
    print("BFCL一键评估脚本")
    print(rule)

    # A non-positive sample count is displayed as "all".
    sample_label = samples if samples > 0 else '全部'
    print("\n配置:")
    print(f" 评估类别: {category}")
    print(f" 样本数量: {sample_label}")
    print(f" 模型名称: {model_name}")

    # Step 1: run the evaluation; stop when there is nothing to export.
    results = run_evaluation(category, samples, model_name)
    if not results:
        return

    # Step 2: export in BFCL format.
    exported = export_bfcl_format(results, category, model_name)

    # Step 3: copy into the BFCL result directory.
    copy_to_bfcl_result_dir(exported, model_name, category)

    # Step 4: run the official BFCL evaluation.
    official_ok = run_bfcl_official_eval(model_name, category)
    if not official_ok:
        print("\n⚠️ BFCL官方评估失败但HelloAgents评估已完成")
        return

    # Step 5: show the results.
    show_results(model_name, category)

    print("\n" + rule)
    print("✅ 评估完成!")
    print(rule)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()