Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 143 additions & 21 deletions eval/eval.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { parseArgs } from 'node:util';
import * as v from 'valibot';
import * as p from '@clack/prompts';
import { claudeCodeCli } from './lib/agents/claude-code-cli.ts';
import * as path from 'node:path';
import * as fs from 'node:fs/promises';
Expand All @@ -18,7 +19,9 @@ import { x } from 'tinyexec';
const Args = v.pipe(
v.object({
values: v.object({
agent: v.union([v.literal('claude-code'), v.literal('copilot')]),
agent: v.optional(
v.union([v.literal('claude-code'), v.literal('copilot')]),
),
verbose: v.boolean(),
}),
positionals: v.array(v.string()),
Expand All @@ -29,11 +32,11 @@ const Args = v.pipe(
})),
);

const args = v.parse(
const parsedArgs = v.parse(
Args,
parseArgs({
options: {
agent: { type: 'string', default: 'claude-code', short: 'a' },
agent: { type: 'string', short: 'a' },
verbose: { type: 'boolean', default: false, short: 'v' },
},
strict: false,
Expand All @@ -42,6 +45,76 @@ const args = v.parse(
}),
);

// Display intro
p.intro('🧪 Storybook MCP Evaluations');

// Get available eval directories: every sub-directory of `<cwd>/evals`
// becomes one selectable option whose value and label are the directory name.
const evalsDir = path.join(process.cwd(), 'evals');
const availableEvals = await fs.readdir(evalsDir, { withFileTypes: true });
const evalOptions = availableEvals
  .filter((dirent) => dirent.isDirectory())
  .map((dirent) => ({
    value: dirent.name,
    label: dirent.name,
  }));

/**
 * Tells the user the run was cancelled and terminates the process with
 * exit code 0.
 */
function abortRun(): never {
  p.cancel('Operation cancelled.');
  process.exit(0);
}

// Prompt for missing arguments
// Anything already supplied on the command line is returned untouched; only
// the missing values trigger an interactive prompt.
const promptResults = await p.group(
  {
    evals: async () => {
      if (parsedArgs.evals.length > 0) {
        return parsedArgs.evals;
      }

      const selectedEvals = await p.multiselect({
        message: 'Select evaluations to run:',
        options: evalOptions,
        required: true,
      });

      return p.isCancel(selectedEvals)
        ? abortRun()
        : (selectedEvals as string[]);
    },
    agent: async () => {
      if (parsedArgs.agent) {
        return parsedArgs.agent;
      }

      const selectedAgent = await p.select({
        message: 'Select agent to use:',
        options: [
          { value: 'claude-code', label: 'Claude Code CLI' },
          { value: 'copilot', label: 'GitHub Copilot', disabled: true },
        ],
      });

      return p.isCancel(selectedAgent)
        ? abortRun()
        : (selectedAgent as 'claude-code' | 'copilot');
    },
    // Never prompted for: the flag value from the command line is used as-is.
    verbose: async () => parsedArgs.verbose,
  },
  {
    onCancel: () => abortRun(),
  },
);

// Resolved run configuration, combining CLI arguments and prompt answers.
const args = {
  agent: promptResults.agent,
  verbose: promptResults.verbose,
  evals: promptResults.evals,
};

const evalDirsToPaths = Object.fromEntries(
args.evals.map((evalDir) => [
evalDir,
Expand All @@ -50,15 +123,20 @@ const evalDirsToPaths = Object.fromEntries(
);

// Validate that all eval directories exist
const s = p.spinner();
s.start('Validating eval directories');
for (const evalPath of Object.values(evalDirsToPaths)) {
const dirExists = await fs
.access(evalPath)
.then(() => true)
.catch(() => false);
if (!dirExists) {
throw new TypeError(`Eval directory does not exist: ${evalPath}`);
s.stop('Validation failed');
p.log.error(`Eval directory does not exist: ${evalPath}`);
process.exit(1);
}
}
s.stop('All eval directories validated');

let agent;

Expand All @@ -69,6 +147,8 @@ switch (args.agent) {
}
}

p.log.info(`Running ${args.evals.length} evaluation(s) with ${args.agent}`);

await Promise.all(
Object.entries(evalDirsToPaths).map(async ([evalDir, evalPath]) => {
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
Expand All @@ -88,13 +168,15 @@ await Promise.all(
verbose: args.verbose,
};

console.group(`Running ${evalDir} with ${args.agent}...`);
p.log.step(`\n${evalDir}: Starting evaluation`);

console.log('Setting up experiment...');
await setupExperiment(experimentArgs)
const setupSpinner = p.spinner();
setupSpinner.start('Setting up experiment');
await setupExperiment(experimentArgs);
setupSpinner.stop('Experiment set up');


console.log(`Executing prompt with ${args.agent}...`);
const agentSpinner = p.spinner();
agentSpinner.start(`Executing prompt with ${args.agent}`);
const prompt = await fs.readFile(path.join(evalPath, 'prompt.md'), 'utf8');
const enhancedPrompt = dedent`${prompt}
<constraints>
Expand All @@ -105,20 +187,46 @@ await Promise.all(
env: process.env,
...experimentArgs,
});
agentSpinner.stop(
`Agent completed (${promptResult.turns} turns, ${promptResult.duration}s, $${promptResult.cost})`,
);

console.log('Setting up evaluations...');
const evalSetupSpinner = p.spinner();
evalSetupSpinner.start('Setting up evaluations');
await setupEvaluations(experimentArgs);
evalSetupSpinner.stop('Evaluations set up');

console.log('Starting evaluation...');
const [buildSuccess, typeCheckSuccess, lintSuccess, { tests, a11y }] =
await Promise.all([
build(experimentArgs),
checkTypes(experimentArgs),
runESLint(experimentArgs),
testStories(experimentArgs),
saveEnvironment(experimentArgs, args.agent),
]);
/**
 * Runs one evaluation step behind a spinner and returns the step's result.
 *
 * `p.tasks` cannot be used to collect results: it resolves to `void` and
 * treats whatever a task returns as the spinner's stop message, so
 * destructuring its return value fails at runtime. This helper keeps the
 * same sequential, one-spinner-per-step output while preserving each
 * step's typed return value.
 *
 * @param title - Text shown while the step runs and once it finishes.
 * @param run - The step to execute.
 * @returns Whatever `run` resolves to.
 * @throws Re-throws any error from `run` after stopping the spinner.
 */
const runStep = async <T>(title: string, run: () => Promise<T>): Promise<T> => {
  const stepSpinner = p.spinner();
  stepSpinner.start(title);
  try {
    const result = await run();
    stepSpinner.stop(title);
    return result;
  } catch (error) {
    // Do not leave the spinner animating when the step blows up.
    stepSpinner.stop(`${title} failed`);
    throw error;
  }
};

// Steps run one after another, in the same order the task list declared them.
const buildSuccess = await runStep('Building project', async () =>
  build(experimentArgs),
);
const typeCheckSuccess = await runStep('Type checking', async () =>
  checkTypes(experimentArgs),
);
const lintSuccess = await runStep('Linting code', async () =>
  runESLint(experimentArgs),
);
const { tests, a11y } = await runStep('Testing stories', async () =>
  testStories(experimentArgs),
);
// Result intentionally unused; this step is run only for its side effect.
await runStep('Saving environment', async () =>
  saveEnvironment(experimentArgs, args.agent),
);

const prettierSpinner = p.spinner();
prettierSpinner.start('Formatting results');
await x('pnpm', ['exec', 'prettier', '--write', resultsPath]);
prettierSpinner.stop('Results formatted');

const summary = {
...promptResult,
Expand All @@ -132,7 +240,21 @@ await Promise.all(
path.join(resultsPath, 'summary.json'),
JSON.stringify(summary, null, 2),
);
console.log('Evaluation complete. Summary:');
console.log(JSON.stringify(summary, null, 2));

// Log summary with styled output
// One line per check, followed by the agent's duration, cost and turn count.
const statusIcon = (passed: unknown): string => (passed ? '✅' : '❌');

p.log.success(`${evalDir}: Evaluation complete`);
p.log.info('Summary:');
p.log.message(` Build: ${statusIcon(buildSuccess)}`);
p.log.message(` Type Check: ${statusIcon(typeCheckSuccess)}`);
p.log.message(` Lint: ${statusIcon(lintSuccess)}`);
p.log.message(` Tests: ${statusIcon(tests)}`);
p.log.message(` A11y: ${statusIcon(a11y)}`);
p.log.message(
  ` Duration: ${promptResult.duration}s (API: ${promptResult.durationApi}s, Wall: ${promptResult.durationWall}s)`,
);
p.log.message(` Cost: $${promptResult.cost}`);
p.log.message(` Turns: ${promptResult.turns}`);
}),
);

p.outro('✨ All evaluations complete!');
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp-eval-project-template</title>
</head>
<body>
<div id="root"></div>
<script type="module" src="/src/main.tsx"></script>
</body>
</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"name": "mcp-eval-project-template",
"version": "0.0.0",
"private": true,
"type": "module",
"scripts": {
"build": "vite build",
"dev": "vite",
"lint": "eslint .",
"preview": "vite preview",
"test": "vitest",
"storybook": "storybook dev --port 6006",
"typecheck": "tsc --noEmit --project ./tsconfig.app.json"
},
"dependencies": {
"react": "^19.1.1",
"react-dom": "^19.1.1"
},
"devDependencies": {
"@eslint/js": "^9.36.0",
"@types/node": "^24.6.0",
"@types/react": "^19.1.16",
"@types/react-dom": "^19.1.9",
"@vitejs/plugin-react-swc": "^4.1.0",
"eslint": "^9.36.0",
"eslint-plugin-react-hooks": "^5.2.0",
"eslint-plugin-react-refresh": "^0.4.22",
"globals": "^16.4.0",
"typescript": "~5.9.3",
"typescript-eslint": "^8.45.0",
"vite": "^7.1.12"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import { StrictMode } from 'react'
import { createRoot } from 'react-dom/client'

// Entry point of the template app: mounts a placeholder component into the
// `#root` element declared in index.html.
const rootElement = document.getElementById('root')

// Fail with an explicit message instead of asserting non-null with `!`.
if (rootElement === null) {
  throw new Error('Root element "#root" was not found in the document')
}

createRoot(rootElement).render(
  <StrictMode>
    <div>Hello World</div>
  </StrictMode>,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"compilerOptions": {
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo",
"target": "ES2022",
"useDefineForClassFields": true,
"lib": ["ES2022", "DOM", "DOM.Iterable"],
"module": "ESNext",
"types": ["vite/client"],
"typeRoots": ["./node_modules/@types/", "./node_modules"],
"skipLibCheck": true,

/* Bundler mode */
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"verbatimModuleSyntax": false,
"moduleDetection": "force",
"noEmit": true,
"jsx": "react-jsx",

/* Linting */
"strict": true,
"noUnusedLocals": false,
"noUnusedParameters": false,
"erasableSyntaxOnly": true,
"noFallthroughCasesInSwitch": true,
"noUncheckedSideEffectImports": true
},
"include": ["src"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"files": [],
"references": [
{ "path": "./tsconfig.app.json" },
{ "path": "./tsconfig.node.json" }
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"compilerOptions": {
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo",
"target": "ES2023",
"lib": ["ES2023"],
"module": "ESNext",
"types": ["node"],
"skipLibCheck": true,

/* Bundler mode */
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"verbatimModuleSyntax": true,
"moduleDetection": "force",
"noEmit": true,

/* Linting */
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"erasableSyntaxOnly": true,
"noFallthroughCasesInSwitch": true,
"noUncheckedSideEffectImports": true
},
"include": ["vite.config.ts"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import { defineConfig } from 'vite'
import react from '@vitejs/plugin-react-swc'

// Vite configuration for the eval project template.
// Reference: https://vite.dev/config/

// React support via the SWC-based plugin.
const plugins = [react()]

// Dependencies listed here are always included in Vite's dependency pre-bundling.
const optimizeDeps = {
  include: ['react/jsx-dev-runtime'],
}

export default defineConfig({ plugins, optimizeDeps })