LLM Model Evaluation
Human → AI — Benchmark multiple models, compare metrics, generate recommendation.
6 nodes · 7 edges
Node types: agent · cli · system
Visual
Load Evaluation Dataset (system)
↓ parallel → Evaluate Claude
↓ parallel → Evaluate GPT-4
↓ parallel → Evaluate Gemini
Evaluate Claude (agent)
↓ parallel → Compare Results
Evaluate GPT-4 (agent)
↓ parallel → Compare Results
Evaluate Gemini (agent)
↓ parallel → Compare Results
Compare Results (system)
Accuracy, latency, cost per 1K tokens.
↓ sequential → Generate Recommendation
Generate Recommendation (agent)
uc-model-evaluation.osop.yaml
---
# OSOP workflow definition: benchmark three LLMs in parallel against one
# dataset, join the results in a comparison step, then produce a
# recommendation. 6 nodes, 7 edges — matches the visual summary above.
osop_version: "1.0"
id: "model-eval"
name: "LLM Model Evaluation"
description: "Benchmark multiple models, compare metrics, generate recommendation."

# Node types used here: "system" (deterministic step) and "agent" with
# subtype "llm" (model-driven step) — NOTE(review): semantics assumed from
# the legend above; confirm against the OSOP spec.
nodes:
- id: "prepare"
  type: "system"
  name: "Load Evaluation Dataset"
- id: "eval_claude"
  type: "agent"
  subtype: "llm"
  name: "Evaluate Claude"
- id: "eval_gpt"
  type: "agent"
  subtype: "llm"
  name: "Evaluate GPT-4"
- id: "eval_gemini"
  type: "agent"
  subtype: "llm"
  name: "Evaluate Gemini"
- id: "compare"
  type: "system"
  name: "Compare Results"
  description: "Accuracy, latency, cost per 1K tokens."
- id: "recommend"
  type: "agent"
  subtype: "llm"
  name: "Generate Recommendation"

# Fan-out: prepare → three parallel evaluations.
# Fan-in: all three evaluations → compare (parallel join).
# Final step runs sequentially after the join.
edges:
- from: "prepare"
  to: "eval_claude"
  mode: "parallel"
- from: "prepare"
  to: "eval_gpt"
  mode: "parallel"
- from: "prepare"
  to: "eval_gemini"
  mode: "parallel"
- from: "eval_claude"
  to: "compare"
  mode: "parallel"
- from: "eval_gpt"
  to: "compare"
  mode: "parallel"
- from: "eval_gemini"
  to: "compare"
  mode: "parallel"
- from: "compare"
  to: "recommend"
  mode: "sequential"