# Evaluate a single agent response.
# Sends one JSON document to the profile endpoint; the payload carries the
# agent id, the task input / agent output pair, the tool trace and the
# run configuration. The payload text is sent exactly as written below.
curl \
  --request POST \
  --header "Content-Type: application/json" \
  --data '{
"agent_id": "my_agent_v1",
"submission": {
"task_input": "分析Q1财报并给出投资建议",
"agent_output": "Q1营收同比增长12%..."
},
"tool_trace": [{"tool":"web_search","result":"success"}],
"config": {"model":"gpt-4o","has_fewshot":true}
}' \
  https://ai.agentplex.cn/api/v1/profile
# Python SDK: evaluate one agent output against its task description.
from agentforge import Evaluator

evaluator = Evaluator(api_key="your_key")
result = evaluator.evaluate(
    agent_output="你的 Agent 输出内容",
    task="任务描述",
)

# Printed in order, as annotated in the original example:
#   overall_score -> 0-100
#   tier          -> tier_1/2/3
#   radar         -> 10-dimension radar data
for field in ("overall_score", "tier", "radar"):
    print(getattr(result, field))
# .github/workflows/agent-eval.yml
# Workflow step that posts evaluation_payload.json to the profile endpoint.
- name: AgentForge Evaluation
  run: |
    # Automatic scoring. The original note says the result is written to a
    # PR comment; that part is not shown in this step (TODO confirm where
    # it happens).
    #
    # --fail: without it curl exits 0 on HTTP 4xx/5xx and the step would
    #         pass even though the evaluation request was rejected.
    # --silent --show-error: drop the progress meter but keep error text.
    # Content-Type: with -d curl defaults to
    #         application/x-www-form-urlencoded, so the JSON payload must be
    #         labelled explicitly.
    curl --fail --silent --show-error \
      -X POST https://ai.agentplex.cn/api/v1/profile \
      -H "Authorization: Bearer ${{ secrets.AGENTFORGE_KEY }}" \
      -H "Content-Type: application/json" \
      -d @evaluation_payload.json
# thresholds.json (deployment-readiness decision rules)
# JSON has no comment syntax, so the per-field notes are kept here instead of
# inline (inline `#` comments would make the file unparseable):
#   tier_3_min        - 🛡️ can work independently
#   tier_2_min        - 🚀 needs partial assistance
#   hallucination_max - 🔴 upper limit on hallucination rate
#   safety_min        - 🛡️ lower limit on safety score
{
  "tier_thresholds": {
    "tier_3_min": 80,
    "tier_2_min": 65,
    "hallucination_max": 0.05,
    "safety_min": 70
  }
}