|
|
"""
|
|
|
第十二章:BFCL一键评估脚本
|
|
|
|
|
|
本脚本提供完整的BFCL评估流程:
|
|
|
1. 自动检查和准备BFCL数据
|
|
|
2. 运行HelloAgents评估
|
|
|
3. 导出BFCL格式结果
|
|
|
4. 调用BFCL官方评估工具
|
|
|
5. 展示评估结果
|
|
|
|
|
|
使用方法:
|
|
|
python examples/04_run_bfcl_evaluation.py
|
|
|
|
|
|
可选参数:
|
|
|
--category: 评估类别(默认:simple_python)
|
|
|
--samples: 样本数量(默认:5,设为0表示全部)
|
|
|
--model-name: 模型名称(默认:HelloAgents)
|
|
|
"""
|
|
|
|
|
|
import sys
|
|
|
import subprocess
|
|
|
from pathlib import Path
|
|
|
import argparse
|
|
|
import json
|
|
|
|
|
|
# 添加项目路径
|
|
|
project_root = Path(__file__).parent.parent
|
|
|
sys.path.insert(0, str(project_root))
|
|
|
|
|
|
from hello_agents import SimpleAgent, HelloAgentsLLM
|
|
|
from hello_agents.evaluation import BFCLDataset, BFCLEvaluator
|
|
|
|
|
|
|
|
|
# 函数调用系统提示词
|
|
|
FUNCTION_CALLING_SYSTEM_PROMPT = """你是一个专业的函数调用助手。
|
|
|
|
|
|
你的任务是:根据用户的问题和提供的函数定义,生成正确的函数调用。
|
|
|
|
|
|
输出格式要求:
|
|
|
1. 必须是纯JSON格式,不要添加任何解释文字
|
|
|
2. 使用JSON数组格式:[{"name": "函数名", "arguments": {"参数名": "参数值"}}]
|
|
|
3. 如果需要调用多个函数,在数组中添加多个对象
|
|
|
4. 如果不需要调用函数,返回空数组:[]
|
|
|
|
|
|
示例:
|
|
|
用户问题:查询北京的天气
|
|
|
可用函数:get_weather(city: str)
|
|
|
正确输出:[{"name": "get_weather", "arguments": {"city": "北京"}}]
|
|
|
|
|
|
注意:
|
|
|
- 只输出JSON,不要添加"好的"、"我来帮你"等额外文字
|
|
|
- 参数值必须与函数定义的类型匹配
|
|
|
- 参数名必须与函数定义完全一致
|
|
|
"""
|
|
|
|
|
|
|
|
|
def check_bfcl_data(bfcl_data_dir: Path) -> bool:
|
|
|
"""检查BFCL数据是否存在"""
|
|
|
if not bfcl_data_dir.exists():
|
|
|
print(f"\n❌ BFCL数据目录不存在: {bfcl_data_dir}")
|
|
|
print(f"\n请先克隆BFCL仓库:")
|
|
|
print(f" git clone --depth 1 https://github.com/ShishirPatil/gorilla.git temp_gorilla")
|
|
|
return False
|
|
|
return True
|
|
|
|
|
|
|
|
|
def run_evaluation(category: str, max_samples: int, model_name: str) -> dict:
|
|
|
"""运行HelloAgents评估"""
|
|
|
print("\n" + "="*60)
|
|
|
print("步骤1: 运行HelloAgents评估")
|
|
|
print("="*60)
|
|
|
|
|
|
# BFCL数据目录
|
|
|
bfcl_data_dir = project_root / "temp_gorilla" / "berkeley-function-call-leaderboard" / "bfcl_eval" / "data"
|
|
|
|
|
|
# 检查数据
|
|
|
if not check_bfcl_data(bfcl_data_dir):
|
|
|
return None
|
|
|
|
|
|
# 加载数据集
|
|
|
print(f"\n📚 加载BFCL数据集...")
|
|
|
dataset = BFCLDataset(bfcl_data_dir=str(bfcl_data_dir), category=category)
|
|
|
|
|
|
# 创建智能体
|
|
|
print(f"\n🤖 创建智能体...")
|
|
|
llm = HelloAgentsLLM()
|
|
|
agent = SimpleAgent(
|
|
|
name=model_name,
|
|
|
llm=llm,
|
|
|
system_prompt=FUNCTION_CALLING_SYSTEM_PROMPT,
|
|
|
enable_tool_calling=False
|
|
|
)
|
|
|
print(f" 智能体: {model_name}")
|
|
|
print(f" LLM: {llm.provider}")
|
|
|
|
|
|
# 创建评估器
|
|
|
evaluator = BFCLEvaluator(dataset=dataset, category=category)
|
|
|
|
|
|
# 运行评估(传递max_samples参数)
|
|
|
print(f"\n🔄 开始评估...")
|
|
|
if max_samples > 0:
|
|
|
print(f" 样本数量: {max_samples}")
|
|
|
results = evaluator.evaluate(agent, max_samples=max_samples)
|
|
|
else:
|
|
|
print(f" 样本数量: 全部")
|
|
|
results = evaluator.evaluate(agent, max_samples=None)
|
|
|
|
|
|
# 显示结果
|
|
|
print(f"\n📊 评估结果:")
|
|
|
print(f" 准确率: {results['overall_accuracy']:.2%}")
|
|
|
print(f" 正确数: {results['correct_samples']}/{results['total_samples']}")
|
|
|
|
|
|
return results
|
|
|
|
|
|
|
|
|
def export_bfcl_format(results: dict, category: str, model_name: str) -> Path:
|
|
|
"""导出BFCL格式结果"""
|
|
|
print("\n" + "="*60)
|
|
|
print("步骤2: 导出BFCL格式结果")
|
|
|
print("="*60)
|
|
|
|
|
|
# 输出目录
|
|
|
output_dir = project_root / "evaluation_results" / "bfcl_official"
|
|
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
# 输出文件
|
|
|
output_file = output_dir / f"BFCL_v4_{category}_result.json"
|
|
|
|
|
|
# 创建评估器(用于导出)
|
|
|
bfcl_data_dir = project_root / "temp_gorilla" / "berkeley-function-call-leaderboard" / "bfcl_eval" / "data"
|
|
|
dataset = BFCLDataset(bfcl_data_dir=str(bfcl_data_dir), category=category)
|
|
|
evaluator = BFCLEvaluator(dataset=dataset, category=category)
|
|
|
|
|
|
# 导出
|
|
|
evaluator.export_to_bfcl_format(results, output_file)
|
|
|
|
|
|
return output_file
|
|
|
|
|
|
|
|
|
def copy_to_bfcl_result_dir(source_file: Path, model_name: str, category: str) -> Path:
|
|
|
"""复制结果文件到BFCL结果目录"""
|
|
|
print("\n" + "="*60)
|
|
|
print("步骤3: 准备BFCL官方评估")
|
|
|
print("="*60)
|
|
|
|
|
|
# BFCL结果目录
|
|
|
# 注意:BFCL会将模型名中的"/"替换为"_"
|
|
|
safe_model_name = model_name.replace("/", "_")
|
|
|
result_dir = project_root / "result" / safe_model_name
|
|
|
result_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
# 目标文件
|
|
|
target_file = result_dir / f"BFCL_v4_{category}_result.json"
|
|
|
|
|
|
# 复制文件
|
|
|
import shutil
|
|
|
shutil.copy(source_file, target_file)
|
|
|
|
|
|
print(f"\n✅ 结果文件已复制到:")
|
|
|
print(f" {target_file}")
|
|
|
|
|
|
return target_file
|
|
|
|
|
|
|
|
|
def run_bfcl_official_eval(model_name: str, category: str) -> bool:
|
|
|
"""运行BFCL官方评估"""
|
|
|
print("\n" + "="*60)
|
|
|
print("步骤4: 运行BFCL官方评估")
|
|
|
print("="*60)
|
|
|
|
|
|
try:
|
|
|
# 设置环境变量
|
|
|
import os
|
|
|
os.environ['PYTHONUTF8'] = '1'
|
|
|
|
|
|
# 运行BFCL评估
|
|
|
cmd = [
|
|
|
"bfcl", "evaluate",
|
|
|
"--model", model_name,
|
|
|
"--test-category", category,
|
|
|
"--partial-eval"
|
|
|
]
|
|
|
|
|
|
print(f"\n🔄 运行命令: {' '.join(cmd)}")
|
|
|
|
|
|
result = subprocess.run(
|
|
|
cmd,
|
|
|
cwd=str(project_root),
|
|
|
capture_output=True,
|
|
|
text=True,
|
|
|
encoding='utf-8'
|
|
|
)
|
|
|
|
|
|
# 显示输出
|
|
|
if result.stdout:
|
|
|
print(result.stdout)
|
|
|
|
|
|
if result.returncode != 0:
|
|
|
print(f"\n❌ BFCL评估失败:")
|
|
|
if result.stderr:
|
|
|
print(result.stderr)
|
|
|
return False
|
|
|
|
|
|
return True
|
|
|
|
|
|
except FileNotFoundError:
|
|
|
print("\n❌ 未找到bfcl命令")
|
|
|
print(" 请先安装: pip install bfcl-eval")
|
|
|
return False
|
|
|
except Exception as e:
|
|
|
print(f"\n❌ 运行BFCL评估时出错: {e}")
|
|
|
return False
|
|
|
|
|
|
|
|
|
def show_results(model_name: str, category: str):
|
|
|
"""展示评估结果"""
|
|
|
print("\n" + "="*60)
|
|
|
print("步骤5: 展示评估结果")
|
|
|
print("="*60)
|
|
|
|
|
|
# CSV文件
|
|
|
csv_file = project_root / "score" / "data_non_live.csv"
|
|
|
|
|
|
if csv_file.exists():
|
|
|
print(f"\n📊 评估结果汇总:")
|
|
|
with open(csv_file, 'r', encoding='utf-8') as f:
|
|
|
content = f.read()
|
|
|
print(content)
|
|
|
else:
|
|
|
print(f"\n⚠️ 未找到评估结果文件: {csv_file}")
|
|
|
|
|
|
# 详细评分文件
|
|
|
safe_model_name = model_name.replace("/", "_")
|
|
|
score_file = project_root / "score" / safe_model_name / "non_live" / f"BFCL_v4_{category}_score.json"
|
|
|
|
|
|
if score_file.exists():
|
|
|
print(f"\n📝 详细评分文件:")
|
|
|
print(f" {score_file}")
|
|
|
|
|
|
# 读取并显示准确率
|
|
|
with open(score_file, 'r', encoding='utf-8') as f:
|
|
|
first_line = f.readline()
|
|
|
summary = json.loads(first_line)
|
|
|
print(f"\n🎯 最终结果:")
|
|
|
print(f" 准确率: {summary['accuracy']:.2%}")
|
|
|
print(f" 正确数: {summary['correct_count']}/{summary['total_count']}")
|
|
|
|
|
|
|
|
|
def main():
|
|
|
"""主函数"""
|
|
|
parser = argparse.ArgumentParser(description="BFCL一键评估脚本")
|
|
|
parser.add_argument("--category", default="simple_python", help="评估类别")
|
|
|
parser.add_argument("--samples", type=int, default=5, help="样本数量(0表示全部)")
|
|
|
parser.add_argument("--model-name", default="Qwen/Qwen3-8B",
|
|
|
help="模型名称(必须是BFCL支持的模型,运行'bfcl models'查看)")
|
|
|
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
print("="*60)
|
|
|
print("BFCL一键评估脚本")
|
|
|
print("="*60)
|
|
|
print(f"\n配置:")
|
|
|
print(f" 评估类别: {args.category}")
|
|
|
print(f" 样本数量: {args.samples if args.samples > 0 else '全部'}")
|
|
|
print(f" 模型名称: {args.model_name}")
|
|
|
|
|
|
# 步骤1: 运行评估
|
|
|
results = run_evaluation(args.category, args.samples, args.model_name)
|
|
|
if not results:
|
|
|
return
|
|
|
|
|
|
# 步骤2: 导出BFCL格式
|
|
|
output_file = export_bfcl_format(results, args.category, args.model_name)
|
|
|
|
|
|
# 步骤3: 复制到BFCL结果目录
|
|
|
copy_to_bfcl_result_dir(output_file, args.model_name, args.category)
|
|
|
|
|
|
# 步骤4: 运行BFCL官方评估
|
|
|
if not run_bfcl_official_eval(args.model_name, args.category):
|
|
|
print("\n⚠️ BFCL官方评估失败,但HelloAgents评估已完成")
|
|
|
return
|
|
|
|
|
|
# 步骤5: 展示结果
|
|
|
show_results(args.model_name, args.category)
|
|
|
|
|
|
print("\n" + "="*60)
|
|
|
print("✅ 评估完成!")
|
|
|
print("="*60)
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
main()
|
|
|
|