diff --git a/eval/eval.ts b/eval/eval.ts index 03f0b72..cceefc7 100644 --- a/eval/eval.ts +++ b/eval/eval.ts @@ -1,5 +1,6 @@ import { parseArgs } from 'node:util'; import * as v from 'valibot'; +import * as p from '@clack/prompts'; import { claudeCodeCli } from './lib/agents/claude-code-cli.ts'; import * as path from 'node:path'; import * as fs from 'node:fs/promises'; @@ -18,7 +19,9 @@ import { x } from 'tinyexec'; const Args = v.pipe( v.object({ values: v.object({ - agent: v.union([v.literal('claude-code'), v.literal('copilot')]), + agent: v.optional( + v.union([v.literal('claude-code'), v.literal('copilot')]), + ), verbose: v.boolean(), }), positionals: v.array(v.string()), @@ -29,11 +32,11 @@ const Args = v.pipe( })), ); -const args = v.parse( +const parsedArgs = v.parse( Args, parseArgs({ options: { - agent: { type: 'string', default: 'claude-code', short: 'a' }, + agent: { type: 'string', short: 'a' }, verbose: { type: 'boolean', default: false, short: 'v' }, }, strict: false, @@ -42,6 +45,76 @@ const args = v.parse( }), ); +// Display intro +p.intro('🧪 Storybook MCP Evaluations'); + +// Get available eval directories +const evalsDir = path.join(process.cwd(), 'evals'); +const availableEvals = await fs.readdir(evalsDir, { withFileTypes: true }); +const evalOptions = availableEvals + .filter((dirent) => dirent.isDirectory()) + .map((dirent) => ({ + value: dirent.name, + label: dirent.name, + })); + +// Prompt for missing arguments +const promptResults = await p.group( + { + evals: async () => { + if (parsedArgs.evals.length > 0) { + return parsedArgs.evals; + } + + const result = await p.multiselect({ + message: 'Select evaluations to run:', + options: evalOptions, + required: true, + }); + + if (p.isCancel(result)) { + p.cancel('Operation cancelled.'); + process.exit(0); + } + + return result as string[]; + }, + agent: async () => { + if (parsedArgs.agent) { + return parsedArgs.agent; + } + + const result = await p.select({ + message: 'Select agent to 
use:', + options: [ + { value: 'claude-code', label: 'Claude Code CLI' }, + { value: 'copilot', label: 'GitHub Copilot', disabled: true }, + ], + }); + + if (p.isCancel(result)) { + p.cancel('Operation cancelled.'); + process.exit(0); + } + + return result as 'claude-code' | 'copilot'; + }, + verbose: async () => parsedArgs.verbose, + }, + { + onCancel: () => { + p.cancel('Operation cancelled.'); + process.exit(0); + }, + }, +); + +const args = { + agent: promptResults.agent, + verbose: promptResults.verbose, + evals: promptResults.evals, +}; + const evalDirsToPaths = Object.fromEntries( args.evals.map((evalDir) => [ evalDir, @@ -50,15 +123,20 @@ const evalDirsToPaths = Object.fromEntries( ); // Validate that all eval directories exist +const s = p.spinner(); +s.start('Validating eval directories'); for (const evalPath of Object.values(evalDirsToPaths)) { const dirExists = await fs .access(evalPath) .then(() => true) .catch(() => false); if (!dirExists) { - throw new TypeError(`Eval directory does not exist: ${evalPath}`); + s.stop('Validation failed'); + p.log.error(`Eval directory does not exist: ${evalPath}`); + process.exit(1); } } +s.stop('All eval directories validated'); let agent; @@ -69,6 +147,8 @@ switch (args.agent) { } } +p.log.info(`Running ${args.evals.length} evaluation(s) with ${args.agent}`); + await Promise.all( Object.entries(evalDirsToPaths).map(async ([evalDir, evalPath]) => { const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); @@ -88,13 +168,15 @@ await Promise.all( verbose: args.verbose, }; - console.group(`Running ${evalDir} with ${args.agent}...`); + p.log.step(`\n${evalDir}: Starting evaluation`); - console.log('Setting up experiment...'); - await setupExperiment(experimentArgs) + const setupSpinner = p.spinner(); + setupSpinner.start('Setting up experiment'); + await setupExperiment(experimentArgs); + setupSpinner.stop('Experiment set up'); - - console.log(`Executing prompt with ${args.agent}...`); + const agentSpinner = 
p.spinner(); + agentSpinner.start(`Executing prompt with ${args.agent}`); const prompt = await fs.readFile(path.join(evalPath, 'prompt.md'), 'utf8'); const enhancedPrompt = dedent`${prompt} @@ -105,20 +187,46 @@ await Promise.all( env: process.env, ...experimentArgs, }); + agentSpinner.stop( + `Agent completed (${promptResult.turns} turns, ${promptResult.duration}s, $${promptResult.cost})`, + ); - console.log('Setting up evaluations...'); + const evalSetupSpinner = p.spinner(); + evalSetupSpinner.start('Setting up evaluations'); await setupEvaluations(experimentArgs); + evalSetupSpinner.stop('Evaluations set up'); - console.log('Starting evaluation...'); - const [buildSuccess, typeCheckSuccess, lintSuccess, { tests, a11y }] = - await Promise.all([ - build(experimentArgs), - checkTypes(experimentArgs), - runESLint(experimentArgs), - testStories(experimentArgs), - saveEnvironment(experimentArgs, args.agent), - ]); + const evaluationResults = await p.tasks([ + { + title: 'Building project', + task: async () => await build(experimentArgs), + }, + { + title: 'Type checking', + task: async () => await checkTypes(experimentArgs), + }, + { + title: 'Linting code', + task: async () => await runESLint(experimentArgs), + }, + { + title: 'Testing stories', + task: async () => await testStories(experimentArgs), + }, + { + title: 'Saving environment', + task: async () => await saveEnvironment(experimentArgs, args.agent), + }, + ]); + + const [buildSuccess, typeCheckSuccess, lintSuccess, testsResult] = + evaluationResults; + const { tests, a11y } = testsResult as { tests: boolean; a11y: boolean }; + + const prettierSpinner = p.spinner(); + prettierSpinner.start('Formatting results'); await x('pnpm', ['exec', 'prettier', '--write', resultsPath]); + prettierSpinner.stop('Results formatted'); const summary = { ...promptResult, @@ -132,7 +240,21 @@ await Promise.all( path.join(resultsPath, 'summary.json'), JSON.stringify(summary, null, 2), ); - console.log('Evaluation complete. 
Summary:'); - console.log(JSON.stringify(summary, null, 2)); + + // Log summary with styled output + p.log.success(`${evalDir}: Evaluation complete`); + p.log.info('Summary:'); + p.log.message(` Build: ${buildSuccess ? '✅' : '❌'}`); + p.log.message(` Type Check: ${typeCheckSuccess ? '✅' : '❌'}`); + p.log.message(` Lint: ${lintSuccess ? '✅' : '❌'}`); + p.log.message(` Tests: ${tests ? '✅' : '❌'}`); + p.log.message(` A11y: ${a11y ? '✅' : '❌'}`); + p.log.message( + ` Duration: ${promptResult.duration}s (API: ${promptResult.durationApi}s, Wall: ${promptResult.durationWall}s)`, + ); + p.log.message(` Cost: $${promptResult.cost}`); + p.log.message(` Turns: ${promptResult.turns}`); }), ); + +p.outro('✨ All evaluations complete!'); diff --git a/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/index.html b/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/index.html new file mode 100644 index 0000000..38b0160 --- /dev/null +++ b/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/index.html @@ -0,0 +1,13 @@ + + + + + + + mcp-eval-project-template + + +
+ + + diff --git a/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/package.json b/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/package.json new file mode 100644 index 0000000..9927795 --- /dev/null +++ b/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/package.json @@ -0,0 +1,33 @@ +{ + "name": "mcp-eval-project-template", + "version": "0.0.0", + "private": true, + "type": "module", + "scripts": { + "build": "vite build", + "dev": "vite", + "lint": "eslint .", + "preview": "vite preview", + "test": "vitest", + "storybook": "storybook dev --port 6006", + "typecheck": "tsc --noEmit --project ./tsconfig.app.json" + }, + "dependencies": { + "react": "^19.1.1", + "react-dom": "^19.1.1" + }, + "devDependencies": { + "@eslint/js": "^9.36.0", + "@types/node": "^24.6.0", + "@types/react": "^19.1.16", + "@types/react-dom": "^19.1.9", + "@vitejs/plugin-react-swc": "^4.1.0", + "eslint": "^9.36.0", + "eslint-plugin-react-hooks": "^5.2.0", + "eslint-plugin-react-refresh": "^0.4.22", + "globals": "^16.4.0", + "typescript": "~5.9.3", + "typescript-eslint": "^8.45.0", + "vite": "^7.1.12" + } +} diff --git a/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/src/main.tsx b/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/src/main.tsx new file mode 100644 index 0000000..4cfe17b --- /dev/null +++ b/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/src/main.tsx @@ -0,0 +1,10 @@ +import { StrictMode } from 'react' +import { createRoot } from 'react-dom/client' + +createRoot(document.getElementById('root')!).render( + +
Hello World
+
, +) + +const foo: string = 2; diff --git a/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/tsconfig.app.json b/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/tsconfig.app.json new file mode 100644 index 0000000..d7b4361 --- /dev/null +++ b/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/tsconfig.app.json @@ -0,0 +1,29 @@ +{ + "compilerOptions": { + "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo", + "target": "ES2022", + "useDefineForClassFields": true, + "lib": ["ES2022", "DOM", "DOM.Iterable"], + "module": "ESNext", + "types": ["vite/client"], + "typeRoots": ["./node_modules/@types/", "./node_modules"], + "skipLibCheck": true, + + /* Bundler mode */ + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "verbatimModuleSyntax": false, + "moduleDetection": "force", + "noEmit": true, + "jsx": "react-jsx", + + /* Linting */ + "strict": true, + "noUnusedLocals": false, + "noUnusedParameters": false, + "erasableSyntaxOnly": true, + "noFallthroughCasesInSwitch": true, + "noUncheckedSideEffectImports": true + }, + "include": ["src"] +} diff --git a/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/tsconfig.json b/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/tsconfig.json new file mode 100644 index 0000000..1ffef60 --- /dev/null +++ b/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/tsconfig.json @@ -0,0 +1,7 @@ +{ + "files": [], + "references": [ + { "path": "./tsconfig.app.json" }, + { "path": "./tsconfig.node.json" } + ] +} diff --git a/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/tsconfig.node.json 
b/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/tsconfig.node.json new file mode 100644 index 0000000..8a67f62 --- /dev/null +++ b/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/tsconfig.node.json @@ -0,0 +1,26 @@ +{ + "compilerOptions": { + "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo", + "target": "ES2023", + "lib": ["ES2023"], + "module": "ESNext", + "types": ["node"], + "skipLibCheck": true, + + /* Bundler mode */ + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "verbatimModuleSyntax": true, + "moduleDetection": "force", + "noEmit": true, + + /* Linting */ + "strict": true, + "noUnusedLocals": true, + "noUnusedParameters": true, + "erasableSyntaxOnly": true, + "noFallthroughCasesInSwitch": true, + "noUncheckedSideEffectImports": true + }, + "include": ["vite.config.ts"] +} diff --git a/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/vite.config.ts b/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/vite.config.ts new file mode 100644 index 0000000..4612a94 --- /dev/null +++ b/eval/evals/101-flight-booking-plain/experiments/claude-code-no-context-2025-11-04T16-38-20-439Z/project/vite.config.ts @@ -0,0 +1,10 @@ +import { defineConfig } from 'vite' +import react from '@vitejs/plugin-react-swc' + +// https://vite.dev/config/ +export default defineConfig({ + plugins: [react()], + optimizeDeps: { + include: ['react/jsx-dev-runtime'] + } +})