Testing Your Prompts

Validate prompt quality with automated testing.

Testing During Development

Use Ollama for fast, free local testing:

import { createProvider, ChainOfThought } from "@mzhub/promptc";

// Free local testing
const testProvider = createProvider("ollama", {
  defaultModel: "llama3.2"
});

// Production provider
const prodProvider = createProvider("openai", {
  apiKey: process.env.OPENAI_API_KEY
});

// Use testProvider during development
const program = new ChainOfThought(schema, testProvider);
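
To avoid editing code when you move between environments, you can pick the provider from an environment variable instead. A minimal sketch (the NODE_ENV convention is an assumption, not something @mzhub/promptc requires):

// Choose a provider based on the environment
const provider =
  process.env.NODE_ENV === "production"
    ? prodProvider
    : testProvider;

const program = new ChainOfThought(schema, provider);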

Unit Testing with Vitest

tests/extractor.test.js
import { describe, it, expect } from "vitest";
import { extractNames } from "../src/services/extractor.js";

describe("extractNames", () => {
  it("should extract single name", async () => {
    const names = await extractNames("Bill Gates is a philanthropist.");
    expect(names).toContain("Bill Gates");
  });

  it("should extract multiple names", async () => {
    const names = await extractNames(
      "Steve Jobs and Tim Cook led Apple."
    );
    expect(names).toContain("Steve Jobs");
    expect(names).toContain("Tim Cook");
  });

  it("should return empty array for no names", async () => {
    const names = await extractNames("The weather is nice today.");
    expect(names).toEqual([]);
  });
});
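
These tests call a real model, so they can exceed Vitest's default 5-second test timeout. Pass a longer per-test timeout where needed:

it("should extract single name", async () => {
  const names = await extractNames("Bill Gates is a philanthropist.");
  expect(names).toContain("Bill Gates");
}, 30_000); // per-test timeout in milliseconds
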
API Costs
Running tests against real APIs costs money. Consider mocking for unit tests and using real APIs only for integration tests.

Mocking LLM Responses

tests/mock-provider.js
// Create a mock provider for testing
function createMockProvider(responses) {
  let callIndex = 0;
  
  return {
    name: "mock",
    defaultModel: "mock-model",
    async complete({ prompt }) {
      const response = responses[callIndex % responses.length];
      callIndex++;
      return {
        content: JSON.stringify(response),
        usage: { inputTokens: 10, outputTokens: 20 }
      };
    }
  };
}

// Usage in tests (Predict comes from the same package)
import { Predict } from "@mzhub/promptc";

const mockProvider = createMockProvider([
  { names: ["Alice", "Bob"] },
  { names: [] },
  { names: ["Charlie"] }
]);

const program = new Predict(schema, mockProvider);
const result = await program.run({ text: "test" });
expect(result.result.names).toEqual(["Alice", "Bob"]);
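
Because the mock cycles through its responses in order, you can also script multi-call scenarios, for example asserting how your code handles an empty extraction. A sketch, reusing the same schema as above:

it("handles an empty extraction on the second call", async () => {
  const mock = createMockProvider([{ names: ["Alice", "Bob"] }, { names: [] }]);
  const program = new Predict(schema, mock);

  await program.run({ text: "first" });
  const second = await program.run({ text: "second" });

  expect(second.result.names).toEqual([]);
});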

Eval Sets

Maintain a test dataset to measure prompt quality:

tests/eval-set.json
[
  {
    "input": { "text": "Elon Musk runs Tesla." },
    "expected": { "names": ["Elon Musk"] }
  },
  {
    "input": { "text": "No names here." },
    "expected": { "names": [] }
  },
  {
    "input": { "text": "Meeting with Sarah and John." },
    "expected": { "names": ["Sarah", "John"] }
  }
]
tests/eval.test.js
import { describe, it, expect } from "vitest";
import { arrayOverlap } from "@mzhub/promptc";
import evalSet from "./eval-set.json";
import { extractNames } from "../src/services/extractor.js";

describe("Eval Set", () => {
  const evaluator = arrayOverlap();
  
  it("should pass evaluation set", async () => {
    let totalScore = 0;
    
    for (const { input, expected } of evalSet) {
      const result = await extractNames(input.text);
      const score = evaluator(result, expected.names);
      totalScore += score;
    }
    
    const avgScore = totalScore / evalSet.length;
    console.log(`Average score: ${(avgScore * 100).toFixed(1)}%`);
    
    // Require an average score of at least 80%
    expect(avgScore).toBeGreaterThanOrEqual(0.8);
  });
});
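
An aggregate score can hide which examples fail. When the average drops, log per-item scores to find the culprits (a sketch using the same evaluator):

for (const { input, expected } of evalSet) {
  const result = await extractNames(input.text);
  const score = evaluator(result, expected.names);
  if (score < 1) {
    console.log(`Low score (${score.toFixed(2)}) for: "${input.text}"`);
    console.log(`  expected ${JSON.stringify(expected.names)}, got ${JSON.stringify(result)}`);
  }
}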

Regression Testing

Ensure prompt updates don't break existing functionality:

scripts/regression-test.js
import { readFileSync } from "fs";

// Load the eval set the same way as the prompt files (avoids relying on
// JSON import support in plain Node scripts)
const evalSet = JSON.parse(
  readFileSync("tests/eval-set.json", "utf-8")
);

async function runRegressionTest() {
  // Load current production prompt
  const currentPrompt = JSON.parse(
    readFileSync("prompts/current.json", "utf-8")
  );
  
  // Load new candidate prompt
  const newPrompt = JSON.parse(
    readFileSync("prompts/candidate.json", "utf-8")
  );
  
  const currentScore = await evaluatePrompt(currentPrompt, evalSet);
  const newScore = await evaluatePrompt(newPrompt, evalSet);
  
  console.log(`Current: ${currentScore.toFixed(2)}`);
  console.log(`New: ${newScore.toFixed(2)}`);
  
  if (newScore < currentScore * 0.95) {
    console.error("❌ Regression detected! New prompt is >5% worse.");
    process.exit(1);
  }
  
  console.log("✅ No regression detected.");
}

runRegressionTest();
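
The script assumes an evaluatePrompt helper that scores a prompt against the eval set. A minimal sketch, reusing the arrayOverlap evaluator from earlier; how a saved prompt JSON becomes a runnable program depends on your setup, so loadProgram below is a hypothetical placeholder:

import { arrayOverlap } from "@mzhub/promptc";

async function evaluatePrompt(promptJson, evalSet) {
  const program = loadProgram(promptJson); // hypothetical: builds a program from saved prompt JSON
  const evaluator = arrayOverlap();

  let total = 0;
  for (const { input, expected } of evalSet) {
    const { result } = await program.run(input);
    total += evaluator(result.names, expected.names);
  }
  return total / evalSet.length;
}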

CI Integration

.github/workflows/test.yml
name: Test Prompts

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: '20'
      
      - name: Install dependencies
        run: npm ci
      
      # Use mock provider for unit tests
      - name: Unit tests
        run: npm test
      
      # Use real API for integration tests (on main only)
      - name: Integration tests
        if: github.ref == 'refs/heads/main'
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: npm run test:integration
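
The workflow assumes npm test runs only the mocked unit tests, while npm run test:integration is a separate script that hits real APIs. One way to wire that up is a directory convention plus two Vitest configs (a sketch; the tests/integration/ layout and config names are assumptions):

vitest.config.js
// Used by "npm test" (vitest run): unit tests only
import { defineConfig } from "vitest/config";

export default defineConfig({
  test: {
    exclude: ["node_modules/**", "tests/integration/**"]
  }
});

vitest.integration.config.js
// Used by "npm run test:integration"
// (e.g. "vitest run --config vitest.integration.config.js")
import { defineConfig } from "vitest/config";

export default defineConfig({
  test: {
    include: ["tests/integration/**/*.test.js"],
    testTimeout: 60_000 // real API calls are slow
  }
});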