# 提示詞工程管線 (Prompt Engineering Pipeline)
# Category: AI/ML — 7 nodes · 7 edges
# Source file: ex-prompt-engineering.osop.yaml
# Prompt Engineering Pipeline
# Write, test, evaluate, iterate, and deploy production prompts
---
# Quoted so the schema version stays the string "2.0" rather than the float 2.0.
osop_version: "2.0"
id: prompt-engineering
name: "提示詞工程管線"  # "Prompt Engineering Pipeline"

nodes:
  # Human entry point: a prompt engineer specifies the task before any
  # automation runs; every downstream node consumes one of these outputs.
  - id: define_task
    type: human
    purpose: Define the task requirements, expected behavior, and evaluation criteria
    role: prompt_engineer
    # golden_dataset_path/target_model feed test_prompt; eval_criteria feeds evaluate_results
    outputs: [task_spec, eval_criteria, golden_dataset_path, target_model]
    explain: |
      Task spec includes: input format, expected output format, edge cases,
      safety constraints, and performance targets (accuracy, latency, cost).

  # Agent node: drafts the first prompt version from the human-authored spec.
  - id: write_prompt
    type: agent
    purpose: Draft initial prompt with system instructions, few-shot examples, and output schema
    runtime:
      provider: anthropic
      model: claude-sonnet-4-20250514
      config:
        # Higher temperature than the analysis nodes (0.3): drafting benefits from variety.
        temperature: 0.7
        system_prompt: |
          You are a prompt engineering assistant. Given a task spec,
          write a production-quality prompt with clear instructions,
          structured output format, and edge case handling.
    inputs: [task_spec]
    # prompt_version is interpolated into file paths by test_prompt/evaluate_results
    outputs: [prompt_text, prompt_version]
    timeout_sec: 30

  # CLI node: runs the candidate prompt over the golden dataset.
  - id: test_prompt
    type: cli
    purpose: Run the prompt against the golden evaluation dataset and collect responses
    runtime:
      command: |
        python run_eval.py \
          --prompt-file prompts/${prompt_version}.txt \
          --dataset ${golden_dataset_path} \
          --model ${target_model} \
          --concurrency 10 \
          --output results/${prompt_version}/
    # prompt_version added: the command interpolates ${prompt_version} but it
    # was previously not declared as an input.
    inputs: [prompt_text, prompt_version, golden_dataset_path, target_model]
    outputs: [raw_results, total_cost, avg_latency_ms]
    timeout_sec: 600
    retry_policy:
      max_retries: 2
      backoff_sec: 10
    security:
      credentials: [ANTHROPIC_API_KEY]

  # CLI node: scores the raw eval responses with automated metrics plus an
  # LLM-as-judge pass.
  - id: evaluate_results
    type: cli
    purpose: Score results using automated metrics and LLM-as-judge evaluation
    runtime:
      command: |
        python evaluate_prompt.py \
          --results results/${prompt_version}/ \
          --criteria ${eval_criteria} \
          --judge-model claude-haiku-4-20250414 \
          --metrics accuracy,consistency,safety,format_compliance \
          --output scores/${prompt_version}.json
    # prompt_version added: the command interpolates ${prompt_version} but it
    # was previously not declared as an input.
    inputs: [raw_results, eval_criteria, prompt_version]
    # The four score outputs drive the conditional deploy/iterate edges.
    outputs: [accuracy_score, consistency_score, safety_score, format_compliance, failure_cases]
    timeout_sec: 300
    security:
      credentials: [ANTHROPIC_API_KEY]

  # Agent node: root-causes the evaluation failures; runs only when the
  # conditional edge from evaluate_results fires.
  - id: review_failures
    type: agent
    purpose: Analyze failure cases and suggest specific prompt improvements
    runtime:
      provider: anthropic
      model: claude-sonnet-4-20250514
      config:
        # Low temperature: analysis should be deterministic-ish, not creative.
        temperature: 0.3
        system_prompt: |
          Analyze these prompt evaluation failures. For each failure,
          identify the root cause and suggest a specific prompt modification.
    inputs: [failure_cases, prompt_text]
    # improvement_suggestions is consumed by iterate_prompt downstream
    outputs: [improvement_suggestions, root_cause_analysis]
    timeout_sec: 30

  # Agent node: rewrites the prompt using the review suggestions, then loops
  # back to test_prompt via the loop edge.
  - id: iterate_prompt
    type: agent
    purpose: Apply improvement suggestions to create an updated prompt version
    runtime:
      provider: anthropic
      model: claude-sonnet-4-20250514
      config:
        temperature: 0.3
        # Added for consistency: every other agent node pins a system prompt.
        system_prompt: |
          Apply the provided improvement suggestions to the prompt.
          Preserve behavior that already works; change only what the
          suggestions require. Emit the full updated prompt, a new version
          identifier, and a concise changelog.
    inputs: [prompt_text, improvement_suggestions]
    # prompt_text/prompt_version added so the loop edge back to test_prompt
    # (which consumes exactly those names) receives the updated prompt;
    # updated_prompt_text is kept for backward compatibility.
    outputs: [prompt_text, prompt_version, updated_prompt_text, changelog]
    timeout_sec: 30

  # API node: publishes the approved prompt; reached only when the deploy
  # gate condition on the edge from evaluate_results passes.
  - id: deploy_prompt
    type: api
    purpose: Publish approved prompt version to the prompt management system
    runtime:
      endpoint: /api/v1/prompts
      method: PUT
      # NOTE(review): base URL has no scheme-port/path beyond the host; the
      # engine presumably joins url + endpoint — confirm against osop docs.
      url: https://prompt-registry.internal
    # accuracy_score is passed along, presumably recorded as deployment metadata
    inputs: [prompt_text, prompt_version, accuracy_score]
    outputs: [deployed_version, deployment_url]
    security:
      auth: bearer_token
      secret_ref: PROMPT_REGISTRY_TOKEN

edges:
  # Linear authoring path: spec -> draft -> eval run -> scoring.
  - from: define_task
    to: write_prompt
    mode: sequential

  - from: write_prompt
    to: test_prompt
    mode: sequential

  - from: test_prompt
    to: evaluate_results
    mode: sequential

  # Failure branch. format_compliance added so this condition is the exact
  # complement of the deploy gate below — previously a run with passing
  # accuracy and safety but format_compliance < 0.98 matched neither
  # conditional edge and the pipeline dead-ended.
  - from: evaluate_results
    to: review_failures
    mode: conditional
    condition: "accuracy_score < 0.95 || safety_score < 1.0 || format_compliance < 0.98"

  # Deploy gate: all three metrics must pass.
  - from: evaluate_results
    to: deploy_prompt
    mode: conditional
    condition: "accuracy_score >= 0.95 && safety_score >= 1.0 && format_compliance >= 0.98"

  - from: review_failures
    to: iterate_prompt
    mode: sequential

  # Loop condition widened to match the failure branch: previously a safety
  # or format failure with passing accuracy reached iterate_prompt but the
  # loop back to test_prompt never fired.
  - from: iterate_prompt
    to: test_prompt
    mode: loop
    when: "accuracy_score < 0.95 || safety_score < 1.0 || format_compliance < 0.98"
    label: "Re-test with improved prompt"