
关于
LLM 智能体的测试与基准评估,包括行为测试和性能度量。
name: agent-evaluation
description: LLM 智能体的测试与基准评估,包括行为测试、能力评估、可靠性指标和生产监控——即使顶级智能体在真实世界基准测试中的表现也不到 50%
risk: safe
source: vibeship-spawner-skills (Apache 2.0)
date_added: 2026-02-27
智能体评估
LLM 智能体的测试与基准评估,包括行为测试、能力评估、可靠性指标和生产监控——即使顶级智能体在真实世界基准测试中的表现也不到 50%
能力范围
- 智能体测试
- 基准设计
- 能力评估
- 可靠性指标
- 回归测试
前置条件
- 知识要求:测试方法论、统计分析基础、LLM 行为模式
- 推荐技能:autonomous-agents、multi-agent-orchestration
- 必需技能:testing-fundamentals、llm-fundamentals
适用范围
- 不涵盖:模型训练评估(损失、困惑度)、公平性和偏见测试、用户体验测试
- 边界:聚焦于智能体能力和可靠性,涵盖功能测试和行为测试
生态系统
主要工具
- AgentBench - LLM 智能体多环境基准测试(ICLR 2024)
- τ-bench (Tau-bench) - Sierra 的真实世界智能体基准测试
- ToolEmu - 智能体工具使用的风险行为检测
- Langsmith - LLM 追踪和评估平台
替代方案
- Braintrust - LLM 评估与监控平台;适用场景:需要与生产监控集成
- PromptFoo - 提示词测试框架;适用场景:聚焦于提示词级别的评估
已弃用
- 仅手动测试
模式
统计测试评估
多次运行测试并分析结果分布
适用场景:评估随机性智能体行为
/**
 * Outcome of a single execution of one test case against the agent.
 */
interface TestResult {
  /** Identifier of the test case this run belongs to. */
  testId: string;

  /** Identifier of this particular run. */
  runId: string;

  /** Whether the run counted as a pass. */
  passed: boolean;

  /** Score in the range 0-1, allowing partial credit. */
  score: number;

  /** Latency of the run (milliseconds, going by the field name). */
  latencyMs: number;

  /** Token count recorded for the run. */
  tokensUsed: number;

  /** Output recorded for the run. */
  output: string;

  /** Behaviors the test expected to see. */
  expectedBehaviors: string[];

  /** Behaviors actually recorded; compared across runs to measure consistency. */
  actualBehaviors: string[];
}
/**
 * Aggregate statistics computed over a collection of test runs.
 */
interface StatisticalAnalysis {
  /** Fraction of runs that passed. */
  passRate: number;

  /** Lower and upper bound of the 95% confidence interval for `passRate`. */
  confidence95: [number, number];

  /** Mean of the per-run scores. */
  meanScore: number;

  /** Standard deviation of the per-run scores. */
  stdDevScore: number;

  /** Mean of the per-run latencies. */
  meanLatency: number;

  /** 95th-percentile latency across the runs. */
  p95Latency: number;

  /** How similar the observed behaviors are across runs (1 = identical). */
  behaviorConsistency: number;
}
/**
 * Evaluates a non-deterministic agent by running every test case several
 * times and summarising the outcomes statistically instead of trusting a
 * single run.
 *
 * NOTE(review): `runTest`, `groupByTest`, `identifyConcerns`,
 * `generateRecommendations`, `mean`, `stdDev` and `percentile` are invoked on
 * `this`, and the `Agent`, `TestCase` and `EvaluationReport` types are used,
 * but none of them are defined in this excerpt — confirm where they live.
 */
class StatisticalEvaluator {
  /** Number of times each test case is executed. */
  private readonly minRuns = 10;
  /** Confidence level that the hard-coded z-score in `analyzeResults` corresponds to. */
  private readonly confidenceLevel = 0.95;

  /**
   * Runs every test in `testSuite` `minRuns` times against `agent` and
   * builds a report with overall and per-test statistics.
   *
   * Runs are awaited one after another, so results are collected in
   * suite order, then run order.
   *
   * @param agent - The agent under evaluation.
   * @param testSuite - The test cases to execute.
   * @returns The overall analysis, the per-test analyses, and the concerns
   *   and recommendations derived from the per-test analyses.
   */
  async evaluateAgent(
    agent: Agent,
    testSuite: TestCase[]
  ): Promise<EvaluationReport> {
    const results: TestResult[] = [];

    // Run each test multiple times
    for (const test of testSuite) {
      for (let run = 0; run < this.minRuns; run++) {
        const result = await this.runTest(agent, test, run);
        results.push(result);
      }
    }

    // Analyze by test
    const byTest = this.groupByTest(results);
    const testAnalyses = new Map<string, StatisticalAnalysis>();
    for (const [testId, testResults] of byTest) {
      testAnalyses.set(testId, this.analyzeResults(testResults));
    }

    // Overall analysis
    const overall = this.analyzeResults(results);

    return {
      overall,
      byTest: testAnalyses,
      concerns: this.identifyConcerns(testAnalyses),
      recommendations: this.generateRecommendations(testAnalyses)
    };
  }

  /**
   * Summarises a set of runs: pass rate with a 95% confidence interval,
   * score and latency statistics, and behavior consistency.
   *
   * For an empty `results` array the pass rate is reported as 0 with the
   * uninformative interval [0, 1] instead of NaN (0 / 0).
   *
   * @param results - The runs to summarise.
   * @returns The aggregated statistics.
   */
  private analyzeResults(results: TestResult[]): StatisticalAnalysis {
    const total = results.length;
    const passCount = results.filter(r => r.passed).length;
    // Guard the division: with no runs, 0 / 0 would yield NaN.
    const passRate = total === 0 ? 0 : passCount / total;

    // Normal-approximation interval: passRate ± z * standard error, clamped
    // to [0, 1]. Note that the standard error is 0 when passRate is exactly
    // 0 or 1, so the interval collapses to a single point in those cases.
    const z = 1.96; // 95% confidence
    let confidence95: [number, number] = [0, 1]; // no data: nothing is ruled out
    if (total > 0) {
      const se = Math.sqrt((passRate * (1 - passRate)) / total);
      confidence95 = [
        Math.max(0, passRate - z * se),
        Math.min(1, passRate + z * se)
      ];
    }

    const scores = results.map(r => r.score);
    const latencies = results.map(r => r.latencyMs);

    return {
      passRate,
      confidence95,
      meanScore: this.mean(scores),
      stdDevScore: this.stdDev(scores),
      meanLatency: this.mean(latencies),
      p95Latency: this.percentile(latencies, 95),
      behaviorConsistency: this.calculateConsistency(results)
    };
  }

  /**
   * Measures how consistent the observed behaviors are across runs as the
   * mean, over every pair of runs, of |intersection| / |union| of their
   * `actualBehaviors` sets.
   *
   * @param results - The runs to compare.
   * @returns A value in [0, 1]; 1 when there are fewer than two runs.
   */
  private calculateConsistency(results: TestResult[]): number {
    // With fewer than two runs there is nothing to disagree with.
    if (results.length < 2) return 1;

    const behaviorSets = results.map(r => new Set(r.actualBehaviors));

    let consistencySum = 0;
    let comparisons = 0;

    behaviorSets.forEach((current, index) => {
      for (const other of behaviorSets.slice(index + 1)) {
        let shared = 0;
        for (const behavior of current) {
          if (other.has(behavior)) shared++;
        }
        const unionSize = new Set([...current, ...other]).size;
        // Two runs that both recorded no behaviors are identical; treat the
        // pair as fully consistent instead of computing 0 / 0 = NaN.
        consistencySum += unionSize === 0 ? 1 : shared / unionSize;
        comparisons++;
      }
    });

    return consistencySum / comparisons;
  }
}
兼容工具
Claude Code、Cursor
标签
AI与机器学习