# hello-agents/code/chapter12/04_run_bfcl_evaluation.py

"""
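Chapter 12: one-click BFCL evaluation script

This script runs the complete BFCL evaluation workflow:
1. Check that the BFCL data is present (clone instructions are printed if it is missing)
2. Run the HelloAgents evaluation
3. Export the results in BFCL format
4. Invoke the official BFCL evaluation tool
5. Display the evaluation results

Usage:
    python examples/04_run_bfcl_evaluation.py

Optional arguments:
    --category: evaluation category (default: simple_python)
    --samples: number of samples (default: 5; 0 means all samples)
    --model-name: model name (default: Qwen/Qwen3-8B; must be a model supported by BFCL)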
"""

import argparse
import json
import subprocess
import sys
from pathlib import Path
from typing import Optional

# Put the project root (the parent of this script's directory) at the front
# of sys.path. Presumably this is what lets the `hello_agents` imports below
# resolve without installing the package -- confirm against the repo layout.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from hello_agents import SimpleAgent, HelloAgentsLLM
from hello_agents.evaluation import BFCLDataset, BFCLEvaluator


# System prompt for function calling. It instructs the model to answer with a
# bare JSON array of {"name": ..., "arguments": {...}} objects (an empty array
# when no call is needed) and no explanatory text.
FUNCTION_CALLING_SYSTEM_PROMPT = """你是一个专业的函数调用助手。

你的任务是：根据用户的问题和提供的函数定义，生成正确的函数调用。

输出格式要求：
1. 必须是纯JSON格式，不要添加任何解释文字
2. 使用JSON数组格式：[{"name": "函数名", "arguments": {"参数名": "参数值"}}]
3. 如果需要调用多个函数，在数组中添加多个对象
4. 如果不需要调用函数，返回空数组：[]

示例：
用户问题：查询北京的天气
可用函数：get_weather(city: str)
正确输出：[{"name": "get_weather", "arguments": {"city": "北京"}}]

注意：
- 只输出JSON，不要添加"好的"、"我来帮你"等额外文字
- 参数值必须与函数定义的类型匹配
- 参数名必须与函数定义完全一致
"""


def check_bfcl_data(bfcl_data_dir: Path) -> bool:
    """Report whether the BFCL data directory is available.

    Args:
        bfcl_data_dir: Expected location of the BFCL data directory.

    Returns:
        True if ``bfcl_data_dir`` is an existing directory. Otherwise the
        clone instructions are printed and False is returned.
    """
    # is_dir() rather than exists(): a regular file sitting at this path is
    # not a usable data directory either.
    if bfcl_data_dir.is_dir():
        return True

    print(f"\n❌ BFCL数据目录不存在: {bfcl_data_dir}")
    print("\n请先克隆BFCL仓库：")
    print("   git clone --depth 1 https://github.com/ShishirPatil/gorilla.git temp_gorilla")
    return False


def run_evaluation(category: str, max_samples: int, model_name: str) -> Optional[dict]:
    """Run the HelloAgents evaluation on one BFCL category (step 1).

    Args:
        category: BFCL test category used for both the dataset and the
            evaluator.
        max_samples: Number of samples to evaluate. Any value <= 0 makes the
            evaluator run with ``max_samples=None`` (reported as "all").
        model_name: Name given to the agent.

    Returns:
        The results returned by ``BFCLEvaluator.evaluate``, or None when the
        BFCL data directory is missing.
    """
    print("\n" + "="*60)
    print("步骤1: 运行HelloAgents评估")
    print("="*60)

    # Location of the BFCL data inside the cloned gorilla repository.
    bfcl_data_dir = project_root / "temp_gorilla" / "berkeley-function-call-leaderboard" / "bfcl_eval" / "data"

    # Nothing can be evaluated without the data; the check prints instructions.
    if not check_bfcl_data(bfcl_data_dir):
        return None

    print("\n📚 加载BFCL数据集...")
    dataset = BFCLDataset(bfcl_data_dir=str(bfcl_data_dir), category=category)

    print("\n🤖 创建智能体...")
    llm = HelloAgentsLLM()
    agent = SimpleAgent(
        name=model_name,
        llm=llm,
        system_prompt=FUNCTION_CALLING_SYSTEM_PROMPT,
        enable_tool_calling=False
    )
    print(f"   智能体: {model_name}")
    print(f"   LLM: {llm.provider}")

    evaluator = BFCLEvaluator(dataset=dataset, category=category)

    print("\n🔄 开始评估...")
    # Non-positive sample counts mean "no limit", which the evaluator takes as None.
    sample_limit = max_samples if max_samples > 0 else None
    print(f"   样本数量: {sample_limit if sample_limit is not None else '全部'}")
    results = evaluator.evaluate(agent, max_samples=sample_limit)

    print("\n📊 评估结果:")
    print(f"   准确率: {results['overall_accuracy']:.2%}")
    print(f"   正确数: {results['correct_samples']}/{results['total_samples']}")

    return results


def export_bfcl_format(results: dict, category: str, model_name: str) -> Path:
    """Export the evaluation results to a BFCL-format JSON file (step 2).

    Args:
        results: Evaluation results handed to
            ``BFCLEvaluator.export_to_bfcl_format``.
        category: BFCL test category; used to load the dataset and to name
            the output file.
        model_name: Not used by this function.

    Returns:
        The path passed to the exporter:
        ``<project_root>/evaluation_results/bfcl_official/BFCL_v4_<category>_result.json``.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("步骤2: 导出BFCL格式结果")
    print(rule)

    # Create the destination directory before anything is exported.
    export_dir = project_root.joinpath("evaluation_results", "bfcl_official")
    export_dir.mkdir(parents=True, exist_ok=True)
    result_path = export_dir / f"BFCL_v4_{category}_result.json"

    # A fresh evaluator over the same category is built only to do the export.
    data_root = project_root.joinpath(
        "temp_gorilla", "berkeley-function-call-leaderboard", "bfcl_eval", "data"
    )
    exporter = BFCLEvaluator(
        dataset=BFCLDataset(bfcl_data_dir=str(data_root), category=category),
        category=category,
    )
    exporter.export_to_bfcl_format(results, result_path)

    return result_path


def copy_to_bfcl_result_dir(source_file: Path, model_name: str, category: str) -> Path:
    """Copy the exported result file into the BFCL result directory (step 3).

    The copy is written to
    ``<project_root>/result/<model>/BFCL_v4_<category>_result.json``, where
    ``<model>`` is ``model_name`` with every "/" replaced by "_".

    Args:
        source_file: File to copy.
        model_name: Model name that determines the result sub-directory.
        category: BFCL test category used in the target file name.

    Returns:
        Path of the copied file.
    """
    import shutil

    rule = "=" * 60
    print(f"\n{rule}")
    print("步骤3: 准备BFCL官方评估")
    print(rule)

    # Note: BFCL replaces "/" in model names with "_".
    model_dir = project_root / "result" / "_".join(model_name.split("/"))
    model_dir.mkdir(parents=True, exist_ok=True)

    destination = model_dir / f"BFCL_v4_{category}_result.json"
    shutil.copy(source_file, destination)

    print(f"\n✅ 结果文件已复制到:\n   {destination}")

    return destination


def run_bfcl_official_eval(model_name: str, category: str) -> bool:
    """Run the official ``bfcl evaluate`` command (step 4).

    The command runs with ``project_root`` as its working directory and its
    captured stdout is printed; stderr is printed only on failure.

    Args:
        model_name: Value passed to ``--model``.
        category: Value passed to ``--test-category``.

    Returns:
        True if the command exits with status 0. False if it exits non-zero,
        if the ``bfcl`` executable cannot be found, or if any other exception
        is raised while running it.
    """
    import os

    print("\n" + "="*60)
    print("步骤4: 运行BFCL官方评估")
    print("="*60)

    try:
        # Enable UTF-8 mode for the child process only, rather than mutating
        # this process's environment.
        child_env = {**os.environ, 'PYTHONUTF8': '1'}

        cmd = [
            "bfcl", "evaluate",
            "--model", model_name,
            "--test-category", category,
            "--partial-eval"
        ]

        print(f"\n🔄 运行命令: {' '.join(cmd)}")

        result = subprocess.run(
            cmd,
            cwd=str(project_root),
            env=child_env,
            capture_output=True,
            text=True,
            encoding='utf-8',
            # Undecodable bytes in the tool's output must not turn a finished
            # run into a reported failure.
            errors='replace'
        )

        if result.stdout:
            print(result.stdout)

        if result.returncode != 0:
            print("\n❌ BFCL评估失败:")
            if result.stderr:
                print(result.stderr)
            return False

        return True

    except FileNotFoundError:
        print("\n❌ 未找到bfcl命令")
        print("   请先安装: pip install bfcl-eval")
        return False
    except Exception as e:
        # Best-effort boundary: this step must not crash the script, because
        # the HelloAgents evaluation has already completed by this point.
        print(f"\n❌ 运行BFCL评估时出错: {e}")
        return False


def show_results(model_name: str, category: str):
    """Print the score files written under ``<project_root>/score`` (step 5).

    Prints the whole ``data_non_live.csv`` summary if it exists, then the
    accuracy figures parsed from the first line of
    ``score/<model>/non_live/BFCL_v4_<category>_score.json``. A missing or
    unparsable file produces a warning instead of an exception.

    Args:
        model_name: Model name; "/" is replaced by "_" to form the score
            sub-directory.
        category: BFCL test category used in the score file name.
    """
    print("\n" + "="*60)
    print("步骤5: 展示评估结果")
    print("="*60)

    # Summary CSV
    csv_file = project_root / "score" / "data_non_live.csv"

    if csv_file.exists():
        print("\n📊 评估结果汇总:")
        with open(csv_file, 'r', encoding='utf-8') as f:
            print(f.read())
    else:
        print(f"\n⚠️ 未找到评估结果文件: {csv_file}")

    # Detailed per-category score file
    safe_model_name = model_name.replace("/", "_")
    score_file = project_root / "score" / safe_model_name / "non_live" / f"BFCL_v4_{category}_score.json"

    if not score_file.exists():
        # Report the missing file the same way the CSV branch does.
        print(f"\n⚠️ 未找到详细评分文件: {score_file}")
        return

    print("\n📝 详细评分文件:")
    print(f"   {score_file}")

    # Only the first line is parsed; it is expected to be a JSON object
    # holding the accuracy summary.
    try:
        with open(score_file, 'r', encoding='utf-8') as f:
            summary = json.loads(f.readline())
        accuracy_line = f"   准确率: {summary['accuracy']:.2%}"
        count_line = f"   正确数: {summary['correct_count']}/{summary['total_count']}"
    except (ValueError, KeyError, TypeError) as e:
        # Empty file, invalid JSON, missing keys or non-numeric accuracy:
        # warn rather than crash once the evaluation itself has finished.
        print(f"\n⚠️ 无法解析评分文件: {e}")
        return

    print("\n🎯 最终结果:")
    print(accuracy_line)
    print(count_line)


def main():
    """Parse the command-line options and run the five evaluation steps.

    Returns early when step 1 yields no results, or when the official BFCL
    evaluation in step 4 reports failure.
    """
    cli = argparse.ArgumentParser(description="BFCL一键评估脚本")
    cli.add_argument("--category", default="simple_python", help="评估类别")
    cli.add_argument("--samples", type=int, default=5, help="样本数量（0表示全部）")
    cli.add_argument(
        "--model-name",
        default="Qwen/Qwen3-8B",
        help="模型名称（必须是BFCL支持的模型，运行'bfcl models'查看）",
    )
    opts = cli.parse_args()
    category, samples, model_name = opts.category, opts.samples, opts.model_name

    # Banner with the effective configuration.
    rule = "=" * 60
    sample_label = samples if samples > 0 else '全部'
    print(rule)
    print("BFCL一键评估脚本")
    print(rule)
    print("\n配置:")
    print(f"   评估类别: {category}")
    print(f"   样本数量: {sample_label}")
    print(f"   模型名称: {model_name}")

    # Step 1: run the HelloAgents evaluation; stop if it produced nothing.
    results = run_evaluation(category, samples, model_name)
    if not results:
        return

    # Steps 2-3: export in BFCL format, then copy into the BFCL result directory.
    exported = export_bfcl_format(results, category, model_name)
    copy_to_bfcl_result_dir(exported, model_name, category)

    # Step 4: official BFCL evaluation.
    official_ok = run_bfcl_official_eval(model_name, category)
    if not official_ok:
        print("\n⚠️ BFCL官方评估失败，但HelloAgents评估已完成")
        return

    # Step 5: display the scores.
    show_results(model_name, category)

    print(f"\n{rule}")
    print("✅ 评估完成！")
    print(rule)


# Run the evaluation workflow only when this file is executed as a script.
if __name__ == "__main__":
    main()