Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Add voice-to-text transcription feature
  • Loading branch information
LakshmanTurlapati committed Nov 3, 2025
commit ce16c8640fb0b52c086e6145b7f3ccf694213b8b
135 changes: 132 additions & 3 deletions packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import {
dim,
fg,
} from "@opentui/core"
import { createEffect, createMemo, Match, Switch, type JSX, onMount, batch } from "solid-js"
import { createEffect, createMemo, Match, Switch, Show, type JSX, onMount, batch, createSignal } from "solid-js"
import { useLocal } from "@tui/context/local"
import { useTheme } from "@tui/context/theme"
import { SplitBorder } from "@tui/component/border"
Expand All @@ -28,6 +28,9 @@ import { useExit } from "../../context/exit"
import { Clipboard } from "../../util/clipboard"
import type { FilePart } from "@opencode-ai/sdk"
import { TuiEvent } from "../../event"
import { Audio } from "@/util/audio"
import { Whisper } from "@/util/whisper"
import { useToast } from "../../ui/toast"

export type PromptProps = {
sessionID?: string
Expand Down Expand Up @@ -61,6 +64,20 @@ export function Prompt(props: PromptProps) {
const command = useCommandDialog()
const renderer = useRenderer()
const { theme, syntax } = useTheme()
const toast = useToast()

const [recording, setRecording] = createSignal<Audio.RecordingSession | null>(null)
const [recordingAvailable, setRecordingAvailable] = createSignal(false)
const [whisperConfigured, setWhisperConfigured] = createSignal(false)

onMount(async () => {
const available = await Audio.checkRecordingAvailable()
console.log("Recording available:", available)
setRecordingAvailable(available)

// Whisper uses OpenAI provider key - always available if OpenAI is configured
setWhisperConfigured(true)
})

const textareaKeybindings = createMemo(() => {
const newlineBindings = keybind.all.input_newline || []
Expand Down Expand Up @@ -156,6 +173,16 @@ export function Prompt(props: PromptProps) {
}
},
},
{
title: recording() ? "Stop notation" : "Start notation",
value: "prompt.voice",
disabled: !recordingAvailable(),
category: "Prompt",
onSelect: (dialog) => {
toggleRecording()
dialog.clear()
},
},
]
})

Expand Down Expand Up @@ -456,6 +483,70 @@ export function Prompt(props: PromptProps) {
return
}

/**
 * Toggle the voice recording session.
 *
 * When a recording is active: stop it, send the captured audio to Whisper,
 * and insert the transcribed text into the prompt input. When idle: start a
 * new recording session. All outcomes are surfaced through toasts; errors
 * never propagate to the caller.
 *
 * Note: no console.log here — stdout writes from inside the TUI corrupt the
 * rendered screen; diagnostics belong in the Log service or toasts.
 */
async function toggleRecording() {
  const current = recording()

  if (current) {
    try {
      toast.show({
        message: "Processing audio...",
        variant: "info",
      })

      const audioBlob = await current.stop()
      setRecording(null)

      const text = await Whisper.transcribe(audioBlob)

      if (text && text.trim()) {
        // Trailing space so the user can keep typing immediately.
        input.insertText(text + " ")

        toast.show({
          message: "Transcription complete",
          variant: "success",
        })
      } else {
        toast.show({
          message: "No text detected in audio",
          variant: "warning",
        })
      }
    } catch (error) {
      // Drop the dead session so the button returns to its idle state.
      setRecording(null)

      if (error instanceof Whisper.ConfigError) {
        toast.show({
          message: "OpenAI not configured. Run: opencode auth login",
          variant: "error",
        })
      } else {
        toast.show({
          message: `Recording failed: ${error instanceof Error ? error.message : String(error)}`,
          variant: "error",
        })
      }
    }
  } else {
    try {
      const session = await Audio.startRecording()
      setRecording(session)
      toast.show({
        message: "Recording... Click mic to stop",
        variant: "info",
      })
    } catch (error) {
      toast.show({
        message: `Failed to start recording: ${error instanceof Error ? error.message : String(error)}`,
        variant: "error",
      })
    }
  }
}

return (
<>
<Autocomplete
Expand Down Expand Up @@ -688,10 +779,48 @@ export function Prompt(props: PromptProps) {
</box>
<box
backgroundColor={theme.backgroundElement}
width={1}
justifyContent="center"
alignItems="center"
></box>
>
<Show when={recordingAvailable()}>
<Switch>
<Match when={recording()}>
<box
paddingLeft={1}
paddingRight={1}
marginRight={1}
height={1}
backgroundColor={theme.error}
onMouseDown={(e: MouseEvent) => {
e.preventDefault()
toggleRecording()
}}
>
<text fg={theme.background} attributes={TextAttributes.BOLD}>
Stop
</text>
</box>
</Match>
<Match when={!recording()}>
<box
paddingLeft={1}
paddingRight={1}
marginRight={1}
height={1}
backgroundColor={theme.primary}
onMouseDown={(e: MouseEvent) => {
e.preventDefault()
toggleRecording()
}}
>
<text fg={theme.background} attributes={TextAttributes.BOLD}>
Notate
</text>
</box>
</Match>
</Switch>
</Show>
</box>
</box>
<box flexDirection="row" justifyContent="space-between">
<text flexShrink={0} wrapMode="none" fg={theme.text}>
Expand Down
7 changes: 7 additions & 0 deletions packages/opencode/src/config/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,13 @@ export namespace Config {
)
.optional()
.describe("Custom provider configurations and model overrides"),
whisper: z
.object({
model: z.string().optional().default("whisper-1").describe("Whisper model to use (default: whisper-1)"),
baseURL: z.string().optional().default("https://api.openai.com/v1").describe("OpenAI API base URL (default: https://api.openai.com/v1)"),
})
.optional()
.describe("OpenAI Whisper configuration for voice input (uses OpenAI provider API key from 'opencode auth login')"),
mcp: z
.record(z.string(), Mcp)
.optional()
Expand Down
117 changes: 117 additions & 0 deletions packages/opencode/src/util/audio.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import { Log } from "./log"
import { $ } from "bun"
import fs from "fs/promises"
import path from "path"
import os from "os"

export namespace Audio {
  const log = Log.create({ service: "audio" })

  export interface RecordingSession {
    // Stop the recorder and return the captured audio as a WAV Blob.
    stop: () => Promise<Blob>
    isRecording: boolean
  }

  /**
   * Start capturing microphone audio to a temporary WAV file.
   *
   * Spawns a platform recorder (sox on macOS, arecord on Linux) and returns a
   * session handle. `stop()` kills the recorder, reads the WAV file into a
   * Blob, and removes the temp directory.
   *
   * @throws Error when the platform has no supported recorder
   */
  export async function startRecording(): Promise<RecordingSession> {
    const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "opencode-audio-"))
    const audioFile = path.join(tempDir, "recording.wav")
    const platform = process.platform

    log.info("starting audio recording", { platform, audioFile })

    let proc: ReturnType<typeof Bun.spawn>
    if (platform === "darwin") {
      // macOS - sox records from the default device until the process is killed
      proc = Bun.spawn(["sox", "-d", "-t", "wav", audioFile], {
        stdout: "ignore",
        stderr: "ignore",
      })
    } else if (platform === "linux") {
      // Linux - arecord (ALSA); -c 1 -r 16000 overrides the "cd" preset to mono 16kHz
      proc = Bun.spawn(["arecord", "-f", "cd", "-c", "1", "-r", "16000", audioFile], {
        stdout: "ignore",
        stderr: "ignore",
      })
    } else {
      throw new Error("Audio recording not supported on this platform")
    }

    // Give the recorder time to start and create the output file
    await new Promise((resolve) => setTimeout(resolve, 1000))

    log.info("recording started", { pid: proc.pid })

    let isRecording = true
    let stopped: Promise<Blob> | undefined

    // Actual teardown: kill recorder, wait for the file to be flushed,
    // read it into a Blob, then clean up the temp directory.
    async function doStop(): Promise<Blob> {
      log.info("stopping audio recording", { audioFile, pid: proc.pid })

      try {
        proc.kill()
        await proc.exited
        log.info("recording process exited")
      } catch (e) {
        log.error("error killing recording process", { error: e })
      }

      // Give the recorder time to flush and finalize the WAV header.
      // NOTE(review): fixed 2s delay is a heuristic — presumably enough for
      // sox/arecord to exit cleanly; confirm on slow machines.
      await new Promise((resolve) => setTimeout(resolve, 2000))

      const file = Bun.file(audioFile)

      if (!(await file.exists())) {
        log.error("audio file does not exist", { audioFile })
        throw new Error("Recording file not found")
      }

      // size is a plain number property — no await needed
      const size = file.size
      log.info("audio file info", { audioFile, size })

      if (size === 0) {
        log.error("audio file is empty", { audioFile })
        throw new Error("Recording file is empty")
      }

      const arrayBuffer = await file.arrayBuffer()
      const blob = new Blob([arrayBuffer], { type: "audio/wav" })

      log.info("created blob", { blobSize: blob.size, blobType: blob.type })

      // Best-effort cleanup
      await fs.rm(tempDir, { recursive: true, force: true }).catch(() => {})

      return blob
    }

    return {
      stop() {
        // Idempotent: a second (or concurrent) stop() shares the same teardown
        // instead of killing a dead process and re-reading a deleted file.
        if (!stopped) {
          isRecording = false
          stopped = doStop()
        }
        return stopped
      },
      get isRecording() {
        return isRecording
      },
    }
  }

  /**
   * Whether a supported command-line recorder is installed:
   * sox on macOS, arecord on Linux. Other platforms always report false.
   */
  export async function checkRecordingAvailable(): Promise<boolean> {
    try {
      if (process.platform === "darwin") {
        const result = await $`which sox`.nothrow().quiet()
        return result.exitCode === 0
      }
      if (process.platform === "linux") {
        const result = await $`which arecord`.nothrow().quiet()
        return result.exitCode === 0
      }
    } catch (error) {
      log.error("error checking recording availability", { error })
    }
    return false
  }
}

62 changes: 62 additions & 0 deletions packages/opencode/src/util/whisper.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import { Config } from "../config/config"
import { Auth } from "../auth"
import { Log } from "./log"
import { NamedError } from "./error"

export namespace Whisper {
const log = Log.create({ service: "whisper" })

// Thrown when no OpenAI API key is available for transcription; the prompt
// UI matches on this type to show an auth-specific error toast.
export class ConfigError extends NamedError {
constructor() {
super("WhisperConfigError", "OpenAI API key not configured. Run: opencode auth login")
}
}

/**
 * Transcribe recorded audio via the OpenAI Whisper API.
 *
 * Uses the OpenAI provider API key stored by `opencode auth login`; the
 * model and endpoint can be overridden via the `whisper` config section.
 *
 * @param audioBlob WAV audio captured by Audio.startRecording
 * @returns the transcribed text
 * @throws ConfigError when no OpenAI API key is configured
 * @throws Error when the API call fails or returns no text
 */
export async function transcribe(audioBlob: Blob): Promise<string> {
  // Get OpenAI API key from auth (same as provider)
  const auth = await Auth.get("openai")
  const config = await Config.get()

  // Only the stored OpenAI auth entry carries a key. The whisper config
  // schema declares model/baseURL but no apiKey field, so the previous
  // `config.whisper?.apiKey` fallback was a type error and, because zod
  // strips undeclared keys, always undefined at runtime anyway.
  const apiKey = auth?.type === "api" ? auth.key : undefined

  if (!apiKey) {
    log.error("no OpenAI API key found in auth")
    throw new ConfigError()
  }

  const baseURL = config.whisper?.baseURL ?? "https://api.openai.com/v1"
  const model = config.whisper?.model ?? "whisper-1"

  const formData = new FormData()
  formData.append("file", audioBlob, "recording.wav")
  formData.append("model", model)

  log.info("transcribing audio", { model, size: audioBlob.size, type: audioBlob.type })

  const response = await fetch(`${baseURL}/audio/transcriptions`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
    },
    body: formData,
  })

  if (!response.ok) {
    const error = await response.text()
    log.error("transcription failed", { status: response.status, error })
    throw new Error(`Whisper API error (${response.status}): ${error}`)
  }

  // Narrow the untyped JSON body to the one field we read
  const result = (await response.json()) as { text?: string }
  log.info("transcription complete", { text: result.text })

  if (!result.text) {
    log.error("no text in response", { result })
    throw new Error("No transcription text returned")
  }

  return result.text
}
}