Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Add voice-to-text transcription feature
  • Loading branch information
LakshmanTurlapati committed Nov 3, 2025
commit ce16c8640fb0b52c086e6145b7f3ccf694213b8b
135 changes: 132 additions & 3 deletions packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import {
dim,
fg,
} from "@opentui/core"
import { createEffect, createMemo, Match, Switch, type JSX, onMount, batch } from "solid-js"
import { createEffect, createMemo, Match, Switch, Show, type JSX, onMount, batch, createSignal } from "solid-js"
import { useLocal } from "@tui/context/local"
import { useTheme } from "@tui/context/theme"
import { SplitBorder } from "@tui/component/border"
Expand All @@ -28,6 +28,9 @@ import { useExit } from "../../context/exit"
import { Clipboard } from "../../util/clipboard"
import type { FilePart } from "@opencode-ai/sdk"
import { TuiEvent } from "../../event"
import { Audio } from "@/util/audio"
import { Whisper } from "@/util/whisper"
import { useToast } from "../../ui/toast"

export type PromptProps = {
sessionID?: string
Expand Down Expand Up @@ -61,6 +64,20 @@ export function Prompt(props: PromptProps) {
const command = useCommandDialog()
const renderer = useRenderer()
const { theme, syntax } = useTheme()
const toast = useToast()

const [recording, setRecording] = createSignal<Audio.RecordingSession | null>(null)
const [recordingAvailable, setRecordingAvailable] = createSignal(false)
const [whisperConfigured, setWhisperConfigured] = createSignal(false)

onMount(async () => {
const available = await Audio.checkRecordingAvailable()
console.log("Recording available:", available)
setRecordingAvailable(available)

// Whisper uses OpenAI provider key - always available if OpenAI is configured
setWhisperConfigured(true)
})

const textareaKeybindings = createMemo(() => {
const newlineBindings = keybind.all.input_newline || []
Expand Down Expand Up @@ -156,6 +173,16 @@ export function Prompt(props: PromptProps) {
}
},
},
{
title: recording() ? "Stop notation" : "Start notation",
value: "prompt.voice",
disabled: !recordingAvailable(),
category: "Prompt",
onSelect: (dialog) => {
toggleRecording()
dialog.clear()
},
},
]
})

Expand Down Expand Up @@ -456,6 +483,70 @@ export function Prompt(props: PromptProps) {
return
}

/**
 * Toggle the voice recording session.
 *
 * When a recording is active: stop it, send the captured audio to Whisper,
 * and insert the transcribed text into the prompt input. When idle: start a
 * new recording session. All outcomes are surfaced through toasts; errors
 * never propagate to the caller.
 *
 * Note: no console.log here — stdout writes from inside the TUI corrupt the
 * rendered screen; diagnostics belong in the Log service or toasts.
 */
async function toggleRecording() {
  const current = recording()

  if (current) {
    try {
      toast.show({
        message: "Processing audio...",
        variant: "info",
      })

      const audioBlob = await current.stop()
      setRecording(null)

      const text = await Whisper.transcribe(audioBlob)

      if (text && text.trim()) {
        // Trailing space so the user can keep typing immediately.
        input.insertText(text + " ")

        toast.show({
          message: "Transcription complete",
          variant: "success",
        })
      } else {
        toast.show({
          message: "No text detected in audio",
          variant: "warning",
        })
      }
    } catch (error) {
      // Drop the dead session so the button returns to its idle state.
      setRecording(null)

      if (error instanceof Whisper.ConfigError) {
        toast.show({
          message: "OpenAI not configured. Run: opencode auth login",
          variant: "error",
        })
      } else {
        toast.show({
          message: `Recording failed: ${error instanceof Error ? error.message : String(error)}`,
          variant: "error",
        })
      }
    }
  } else {
    try {
      const session = await Audio.startRecording()
      setRecording(session)
      toast.show({
        message: "Recording... Click mic to stop",
        variant: "info",
      })
    } catch (error) {
      toast.show({
        message: `Failed to start recording: ${error instanceof Error ? error.message : String(error)}`,
        variant: "error",
      })
    }
  }
}

return (
<>
<Autocomplete
Expand Down Expand Up @@ -688,10 +779,48 @@ export function Prompt(props: PromptProps) {
</box>
<box
backgroundColor={theme.backgroundElement}
width={1}
justifyContent="center"
alignItems="center"
></box>
>
<Show when={recordingAvailable()}>
<Switch>
<Match when={recording()}>
<box
paddingLeft={1}
paddingRight={1}
marginRight={1}
height={1}
backgroundColor={theme.error}
onMouseDown={(e: MouseEvent) => {
e.preventDefault()
toggleRecording()
}}
>
<text fg={theme.background} attributes={TextAttributes.BOLD}>
Stop
</text>
</box>
</Match>
<Match when={!recording()}>
<box
paddingLeft={1}
paddingRight={1}
marginRight={1}
height={1}
backgroundColor={theme.primary}
onMouseDown={(e: MouseEvent) => {
e.preventDefault()
toggleRecording()
}}
>
<text fg={theme.background} attributes={TextAttributes.BOLD}>
Notate
</text>
</box>
</Match>
</Switch>
</Show>
</box>
</box>
<box flexDirection="row" justifyContent="space-between">
<text flexShrink={0} wrapMode="none" fg={theme.text}>
Expand Down
7 changes: 7 additions & 0 deletions packages/opencode/src/config/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,13 @@ export namespace Config {
)
.optional()
.describe("Custom provider configurations and model overrides"),
whisper: z
.object({
model: z.string().optional().default("whisper-1").describe("Whisper model to use (default: whisper-1)"),
baseURL: z.string().optional().default("https://api.openai.com/v1").describe("OpenAI API base URL (default: https://api.openai.com/v1)"),
})
.optional()
.describe("OpenAI Whisper configuration for voice input (uses OpenAI provider API key from 'opencode auth login')"),
mcp: z
.record(z.string(), Mcp)
.optional()
Expand Down
117 changes: 117 additions & 0 deletions packages/opencode/src/util/audio.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import { Log } from "./log"
import { $ } from "bun"
import fs from "fs/promises"
import path from "path"
import os from "os"

export namespace Audio {
  const log = Log.create({ service: "audio" })

  export interface RecordingSession {
    // Stop the recorder and return the captured audio as a WAV Blob.
    stop: () => Promise<Blob>
    isRecording: boolean
  }

  /**
   * Start capturing microphone audio to a temporary WAV file.
   *
   * Spawns a platform recorder (sox on macOS, arecord on Linux) and returns a
   * session handle. `stop()` kills the recorder, reads the WAV file into a
   * Blob, and removes the temp directory.
   *
   * @throws Error when the platform has no supported recorder
   */
  export async function startRecording(): Promise<RecordingSession> {
    const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "opencode-audio-"))
    const audioFile = path.join(tempDir, "recording.wav")
    const platform = process.platform

    log.info("starting audio recording", { platform, audioFile })

    let proc: ReturnType<typeof Bun.spawn>
    if (platform === "darwin") {
      // macOS - sox records from the default device until the process is killed
      proc = Bun.spawn(["sox", "-d", "-t", "wav", audioFile], {
        stdout: "ignore",
        stderr: "ignore",
      })
    } else if (platform === "linux") {
      // Linux - arecord (ALSA); -c 1 -r 16000 overrides the "cd" preset to mono 16kHz
      proc = Bun.spawn(["arecord", "-f", "cd", "-c", "1", "-r", "16000", audioFile], {
        stdout: "ignore",
        stderr: "ignore",
      })
    } else {
      throw new Error("Audio recording not supported on this platform")
    }

    // Give the recorder time to start and create the output file
    await new Promise((resolve) => setTimeout(resolve, 1000))

    log.info("recording started", { pid: proc.pid })

    let isRecording = true
    let stopped: Promise<Blob> | undefined

    // Actual teardown: kill recorder, wait for the file to be flushed,
    // read it into a Blob, then clean up the temp directory.
    async function doStop(): Promise<Blob> {
      log.info("stopping audio recording", { audioFile, pid: proc.pid })

      try {
        proc.kill()
        await proc.exited
        log.info("recording process exited")
      } catch (e) {
        log.error("error killing recording process", { error: e })
      }

      // Give the recorder time to flush and finalize the WAV header.
      // NOTE(review): fixed 2s delay is a heuristic — presumably enough for
      // sox/arecord to exit cleanly; confirm on slow machines.
      await new Promise((resolve) => setTimeout(resolve, 2000))

      const file = Bun.file(audioFile)

      if (!(await file.exists())) {
        log.error("audio file does not exist", { audioFile })
        throw new Error("Recording file not found")
      }

      // size is a plain number property — no await needed
      const size = file.size
      log.info("audio file info", { audioFile, size })

      if (size === 0) {
        log.error("audio file is empty", { audioFile })
        throw new Error("Recording file is empty")
      }

      const arrayBuffer = await file.arrayBuffer()
      const blob = new Blob([arrayBuffer], { type: "audio/wav" })

      log.info("created blob", { blobSize: blob.size, blobType: blob.type })

      // Best-effort cleanup
      await fs.rm(tempDir, { recursive: true, force: true }).catch(() => {})

      return blob
    }

    return {
      stop() {
        // Idempotent: a second (or concurrent) stop() shares the same teardown
        // instead of killing a dead process and re-reading a deleted file.
        if (!stopped) {
          isRecording = false
          stopped = doStop()
        }
        return stopped
      },
      get isRecording() {
        return isRecording
      },
    }
  }

  /**
   * Whether a supported command-line recorder is installed:
   * sox on macOS, arecord on Linux. Other platforms always report false.
   */
  export async function checkRecordingAvailable(): Promise<boolean> {
    try {
      if (process.platform === "darwin") {
        const result = await $`which sox`.nothrow().quiet()
        return result.exitCode === 0
      }
      if (process.platform === "linux") {
        const result = await $`which arecord`.nothrow().quiet()
        return result.exitCode === 0
      }
    } catch (error) {
      log.error("error checking recording availability", { error })
    }
    return false
  }
}

62 changes: 62 additions & 0 deletions packages/opencode/src/util/whisper.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import { Config } from "../config/config"
import { Auth } from "../auth"
import { Log } from "./log"
import { NamedError } from "./error"

export namespace Whisper {
const log = Log.create({ service: "whisper" })

// Thrown when no OpenAI API key is available for transcription; the prompt
// UI matches on this type to show an auth-specific error toast.
export class ConfigError extends NamedError {
constructor() {
super("WhisperConfigError", "OpenAI API key not configured. Run: opencode auth login")
}
}

/**
 * Transcribe recorded audio via the OpenAI Whisper API.
 *
 * Uses the OpenAI provider API key stored by `opencode auth login`; the
 * model and endpoint can be overridden via the `whisper` config section.
 *
 * @param audioBlob WAV audio captured by Audio.startRecording
 * @returns the transcribed text
 * @throws ConfigError when no OpenAI API key is configured
 * @throws Error when the API call fails or returns no text
 */
export async function transcribe(audioBlob: Blob): Promise<string> {
  // Get OpenAI API key from auth (same as provider)
  const auth = await Auth.get("openai")
  const config = await Config.get()

  // Only the stored OpenAI auth entry carries a key. The whisper config
  // schema declares model/baseURL but no apiKey field, so the previous
  // `config.whisper?.apiKey` fallback was a type error and, because zod
  // strips undeclared keys, always undefined at runtime anyway.
  const apiKey = auth?.type === "api" ? auth.key : undefined

  if (!apiKey) {
    log.error("no OpenAI API key found in auth")
    throw new ConfigError()
  }

  const baseURL = config.whisper?.baseURL ?? "https://api.openai.com/v1"
  const model = config.whisper?.model ?? "whisper-1"

  const formData = new FormData()
  formData.append("file", audioBlob, "recording.wav")
  formData.append("model", model)

  log.info("transcribing audio", { model, size: audioBlob.size, type: audioBlob.type })

  const response = await fetch(`${baseURL}/audio/transcriptions`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
    },
    body: formData,
  })

  if (!response.ok) {
    const error = await response.text()
    log.error("transcription failed", { status: response.status, error })
    throw new Error(`Whisper API error (${response.status}): ${error}`)
  }

  // Narrow the untyped JSON body to the one field we read
  const result = (await response.json()) as { text?: string }
  log.info("transcription complete", { text: result.text })

  if (!result.text) {
    log.error("no text in response", { result })
    throw new Error("No transcription text returned")
  }

  return result.text
}
}