8 changes: 8 additions & 0 deletions package-lock.json

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions package.json
@@ -41,6 +41,7 @@
     "@eslint/js": "^9.17.0",
     "@tailwindcss/postcss": "^4.1.11",
     "@tailwindcss/vite": "^4.1.11",
+    "@types/dom-speech-recognition": "^0.0.6",
     "@types/node": "^22.13.1",
     "@types/react": "^18.3.18",
     "@types/react-dom": "^18.3.5",
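Review note: @types/dom-speech-recognition supplies the ambient Web Speech API declarations (window.SpeechRecognition, window.webkitSpeechRecognition, SpeechRecognitionEvent, and so on) that the new SpeechToText.tsx below compiles against; the standard DOM lib does not declare these, so the window accesses would otherwise fail type-checking.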
46 changes: 45 additions & 1 deletion src/components/ChatInput.tsx
@@ -1,7 +1,13 @@
 import { memo, useCallback, useEffect, useMemo } from 'react';
 import toast from 'react-hot-toast';
 import { useTranslation } from 'react-i18next';
-import { LuArrowUp, LuPaperclip, LuSquare } from 'react-icons/lu';
+import {
+  LuArrowUp,
+  LuCircleStop,
+  LuMic,
+  LuPaperclip,
+  LuSquare,
+} from 'react-icons/lu';
 import { TbAdjustmentsHorizontal } from 'react-icons/tb';
 import { useNavigate } from 'react-router';
 import { useChatContext } from '../context/chat';
@@ -10,6 +16,10 @@ import { useFileUpload } from '../hooks/useFileUpload';
 import { MessageExtra } from '../types';
 import { classNames, cleanCurrentUrl } from '../utils';
 import { DropzoneArea } from './DropzoneArea';
+import SpeechToText, {
+  IS_SPEECH_RECOGNITION_SUPPORTED,
+  SpeechRecordCallback,
+} from './SpeechToText';
 
 /**
  * If the current URL contains "?m=...", prefill the message input with the value.
@@ -50,6 +60,11 @@ export const ChatInput = memo(
       stopGenerating(convId);
     }, [convId, stopGenerating]);
 
+    const handleRecord: SpeechRecordCallback = useCallback(
+      (text: string) => textarea.setValue(text),
+      [textarea]
+    );
+
     const sendNewMessage = async () => {
       const lastInpMsg = textarea.value();
       if (lastInpMsg.trim().length === 0) {
@@ -145,6 +160,35 @@
           </div>
 
           <div className="flex items-center">
+            {IS_SPEECH_RECOGNITION_SUPPORTED && !isPending && (
+              <SpeechToText onRecord={handleRecord}>
+                {({ isRecording, startRecording, stopRecording }) => (
+                  <>
+                    {!isRecording && (
+                      <button
+                        className="btn btn-ghost w-8 h-8 p-0 rounded-full mr-2"
+                        onClick={startRecording}
+                        title="Record"
+                        aria-label="Start Recording"
+                      >
+                        <LuMic className="h-5 w-5" />
+                      </button>
+                    )}
+                    {isRecording && (
+                      <button
+                        className="btn btn-ghost w-8 h-8 p-0 rounded-full mr-2"
+                        onClick={stopRecording}
+                        title="Stop"
+                        aria-label="Stop Recording"
+                      >
+                        <LuCircleStop className="h-5 w-5" />
+                      </button>
+                    )}
+                  </>
+                )}
+              </SpeechToText>
+            )}
+
             {isPending && (
               <button
                 className="btn btn-neutral w-8 h-8 p-0 rounded-full"
167 changes: 167 additions & 0 deletions src/components/SpeechToText.tsx
@@ -0,0 +1,167 @@
+import {
+  forwardRef,
+  Fragment,
+  ReactNode,
+  useCallback,
+  useEffect,
+  useImperativeHandle,
+  useRef,
+  useState,
+} from 'react';
+
+export type SpeechRecordCallback = (text: string) => void;
+
+const SpeechRecognition =
+  typeof window === 'undefined'
+    ? undefined
+    : window.SpeechRecognition || window.webkitSpeechRecognition;
+
+export const IS_SPEECH_RECOGNITION_SUPPORTED = !!SpeechRecognition;
+
+interface SpeechToTextProps {
+  lang?: string;
+  continuous?: boolean;
+  interimResults?: boolean;
+  onRecord?: SpeechRecordCallback;
+}
+
+interface SpeechToTextState {
+  isRecording: boolean;
+  transcript: string;
+  startRecording: () => void;
+  stopRecording: () => void;
+}
+
+const useSpeechToText = ({
+  lang,
+  continuous = true,
+  interimResults = true,
+  onRecord,
+}: SpeechToTextProps): SpeechToTextState => {
+  const [isRecording, setIsRecording] = useState<boolean>(false);
+  const [transcript, setTranscript] = useState<string>('');
+  const recognitionRef = useRef<SpeechRecognition | null>(null);
+  const stoppedManuallyRef = useRef<boolean>(false);
+  const onRecordRef = useRef<SpeechRecordCallback | undefined>(onRecord);
+  const finalTranscriptRef = useRef<string>('');
+
+  useEffect(() => {
+    onRecordRef.current = onRecord;
+  }, [onRecord]);
+
+  useEffect(() => {
+    if (!IS_SPEECH_RECOGNITION_SUPPORTED) {
+      console.error('Speech Recognition is not supported in this browser.');
+      return;
+    }
+
+    const recognition = new SpeechRecognition!();
+    recognition.continuous = continuous;
+    recognition.interimResults = interimResults;
+    recognition.lang =
+      lang || navigator.languages?.[0] || navigator.language || 'en-US';
+
+    recognition.onstart = () => {
+      setIsRecording(true);
+    };
+    recognition.onresult = (event: SpeechRecognitionEvent) => {
+      if (!event?.results) return;
+
+      for (let i = event.resultIndex; i < event.results.length; i++) {
+        const result = event.results[i];
+        const { isFinal, length } = result;
+        if (length <= 0) continue;
+        const { transcript, confidence } = result[0];
+        const fullTranscript = [finalTranscriptRef.current, transcript].join(
+          ' '
+        );
+        setTranscript(fullTranscript);
+        onRecordRef.current?.(fullTranscript);
+        if (isFinal && confidence > 0) {
+          finalTranscriptRef.current += transcript;
+        }
+      }
+    };
+    recognition.onerror = (event: SpeechRecognitionErrorEvent) => {
+      console.warn('Speech recognition error:', event);
+      setIsRecording(false);
+    };
+    recognition.onend = () => {
+      setIsRecording(false);
+      // Automatically restart if not stopped manually
+      if (continuous && !stoppedManuallyRef.current) {
+        try {
+          recognition.start();
+        } catch (error) {
+          console.error('Error restarting speech recognition:', error);
+        }
+      }
+    };
+
+    recognitionRef.current = recognition;
+
+    return () => {
+      if (!recognitionRef.current) return;
+
+      recognitionRef.current.onresult = null;
+      recognitionRef.current.onend = null;
+      recognitionRef.current.onerror = null;
+      recognitionRef.current.onstart = null;
+      recognitionRef.current.stop();
+      recognitionRef.current = null;
+    };
+  }, [lang, continuous, interimResults]);
+
+  const startRecording = useCallback(() => {
+    const recognition = recognitionRef.current;
+    if (recognition && !isRecording) {
+      setTranscript('');
+      finalTranscriptRef.current = '';
+      stoppedManuallyRef.current = false;
+      try {
+        recognition.start();
+      } catch (error) {
+        console.error('Failed to start recording:', error);
+        setIsRecording(false);
+      }
+    }
+  }, [isRecording]);
+
+  const stopRecording = useCallback(() => {
+    const recognition = recognitionRef.current;
+    if (recognition && isRecording) {
+      stoppedManuallyRef.current = true;
+      try {
+        recognition.stop();
+      } catch (error) {
+        console.error('Failed to stop recording:', error);
+        setIsRecording(false);
+      }
+    }
+  }, [isRecording]);
+
+  return {
+    isRecording,
+    transcript,
+    startRecording,
+    stopRecording,
+  };
+};
+
+const SpeechToText = forwardRef<
+  SpeechToTextState,
+  SpeechToTextProps & { children: (props: SpeechToTextState) => ReactNode }
+>(({ children, lang, continuous, interimResults, onRecord }, ref) => {
+  const speechToText = useSpeechToText({
+    lang,
+    continuous,
+    interimResults,
+    onRecord,
+  });
+
+  useImperativeHandle(ref, () => speechToText, [speechToText]);
+
+  return <Fragment>{children(speechToText)}</Fragment>;
+});
+
+export default SpeechToText;
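
Review note: besides the render-prop API that ChatInput.tsx uses, the component forwards its full state (isRecording, transcript, startRecording, stopRecording) through a ref via useImperativeHandle, although nothing in this PR consumes it. A minimal sketch of driving recording through that ref; the VoiceNote component and its wiring are hypothetical, not part of this diff:

// Hypothetical consumer: toggles recording via the forwarded ref instead of
// the startRecording/stopRecording render-prop callbacks. Not part of this PR.
import { useRef, type ElementRef } from 'react';
import SpeechToText, {
  IS_SPEECH_RECOGNITION_SUPPORTED,
} from './SpeechToText';

function VoiceNote() {
  // ElementRef extracts the SpeechToTextState handle exposed by useImperativeHandle.
  const sttRef = useRef<ElementRef<typeof SpeechToText>>(null);

  if (!IS_SPEECH_RECOGNITION_SUPPORTED) return null;

  return (
    <SpeechToText ref={sttRef} onRecord={(text) => console.log(text)}>
      {({ isRecording }) => (
        <button
          onClick={() =>
            isRecording
              ? sttRef.current?.stopRecording()
              : sttRef.current?.startRecording()
          }
        >
          {isRecording ? 'Stop' : 'Record'}
        </button>
      )}
    </SpeechToText>
  );
}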
7 changes: 4 additions & 3 deletions src/components/TextToSpeech.tsx
@@ -25,7 +25,7 @@ const popularLanguages = [
   'ar',
 ];
 
-export const IS_SPEECH_SYNTHESIS_SUPPORTED = !!speechSynthesis || false;
+export const IS_SPEECH_SYNTHESIS_SUPPORTED = !!window.speechSynthesis;
 export const getSpeechSynthesisVoices = () =>
   speechSynthesis
     ?.getVoices()
@@ -93,7 +93,7 @@ const useTextToSpeech = ({
       utteranceRef.current.onerror = null;
     }
 
-    const utterance = new SpeechSynthesisUtterance(text);
+    const utterance = new window.SpeechSynthesisUtterance(text);
 
     utterance.voice = voice;
     utterance.pitch = pitch;
@@ -109,7 +109,8 @@
       setIsPlaying(false);
     };
 
-    utterance.onerror = () => {
+    utterance.onerror = (event) => {
+      console.error('Speech synthesis error: ', event.error);
       setIsPlaying(false);
     };
 
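
Review note: the rewritten guard drops the redundant "|| false" and reads the API off window instead of referencing the bare speechSynthesis global (which throws a ReferenceError in environments where the global is missing entirely). It still assumes window exists, unlike the "typeof window === 'undefined'" check that SpeechToText.tsx uses. A fully environment-safe sketch of the same constant, mirroring that pattern; illustrative only, not what this PR ships:

// Illustrative SSR-safe variant, following the guard style in SpeechToText.tsx.
export const IS_SPEECH_SYNTHESIS_SUPPORTED =
  typeof window !== 'undefined' && !!window.speechSynthesis;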