remove fallback audio chat task type, register the provider only if the server task type is available, handle the case when the chat endpoint does not return audio

Signed-off-by: Julien Veyssier <[email protected]>
nextcloud · julien-nc · Jul 10, 2025 · Jul 1, 2025 · Jul 3, 2025 · Jul 3, 2025
commit 47b9c867b088ab82b7cfe49d1d45071fb0781be8
diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php
@@ -10,7 +10,6 @@
 use OCA\OpenAi\Capabilities;
 use OCA\OpenAi\OldProcessing\Translation\TranslationProvider as OldTranslationProvider;
 use OCA\OpenAi\TaskProcessing\AudioToAudioChatProvider;
-use OCA\OpenAi\TaskProcessing\AudioToAudioChatTaskType;
 use OCA\OpenAi\TaskProcessing\AudioToTextProvider;
 use OCA\OpenAi\TaskProcessing\ChangeToneProvider;
 use OCA\OpenAi\TaskProcessing\ChangeToneTaskType;
@@ -144,8 +143,9 @@ public function register(IRegistrationContext $context): void {
 				&& $this->appConfig->getValueString(Application::APP_ID, 'tts_provider_enabled', '1') === '1'
 			)
 		) {
-			$context->registerTaskProcessingTaskType(AudioToAudioChatTaskType::class);
-			$context->registerTaskProcessingProvider(AudioToAudioChatProvider::class);
+			if (class_exists('OCP\\TaskProcessing\\TaskTypes\\AudioToAudioChat')) {
+				$context->registerTaskProcessingProvider(AudioToAudioChatProvider::class);
+			}
 		}
 
 		$context->registerCapability(Capabilities::class);

diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php
@@ -19,6 +19,7 @@
 use OCP\TaskProcessing\ISynchronousProvider;
 use OCP\TaskProcessing\ShapeDescriptor;
 use OCP\TaskProcessing\ShapeEnumValue;
+use OCP\TaskProcessing\TaskTypes\AudioToAudioChat;
 use Psr\Log\LoggerInterface;
 use RuntimeException;
 
@@ -42,10 +43,7 @@ public function getName(): string {
 	}
 
 	public function getTaskTypeId(): string {
-		if (class_exists('OCP\\TaskProcessing\\TaskTypes\\AudioToAudioChat')) {
-			return \OCP\TaskProcessing\TaskTypes\AudioToAudioChat::ID;
-		}
-		return AudioToAudioChatTaskType::ID;
+		return AudioToAudioChat::ID;
 	}
 
 	public function getExpectedRuntime(): int {
@@ -129,13 +127,7 @@ public function getOutputShapeEnumValues(): array {
 	}
 
 	public function getOptionalOutputShape(): array {
-		return [
-			'input_transcript' => new ShapeDescriptor(
-				$this->l->t('Input transcript'),
-				$this->l->t('Transcription of the input audio'),
-				EShapeType::Text,
-			),
-		];
+		return [];
 	}
 
 	public function getOptionalOutputShapeEnumValues(): array {
@@ -203,22 +195,44 @@ public function process(?string $userId, array $input, callable $reportProgress)
 				'modalities' => ['text', 'audio'],
 				'audio' => ['voice' => $outputVoice, 'format' => 'mp3'],
 			];
+			$systemPrompt .= ' Producing text responses will break the user interface. Important: You have multimodal voice capability, and you use voice exclusively to respond.';
 			$completion = $this->openAiAPIService->createChatCompletion(
 				$userId, $llmModel, null, $systemPrompt, $history, 1, 1000,
 				$extraParams, null, null, $b64Audio,
 			);
 			$message = array_pop($completion['audio_messages']);
+			// TODO find a way to force the model to answer with audio when there is only text in the history
+			// https://community.openai.com/t/gpt-4o-audio-preview-responds-in-text-not-audio/1006486/5
+			if ($message === null) {
+				// no audio, TTS the text message
+				try {
+					$textResponse = array_pop($completion['messages']);
+					$apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $textResponse, $ttsModel, $outputVoice, $speed);
+					if (!isset($apiResponse['body'])) {
+						$this->logger->warning($serviceName . ' text to speech generation failed: no speech returned');
+						throw new RuntimeException($serviceName . ' text to speech generation failed: no speech returned');
+					}
+					$output = $apiResponse['body'];
+				} catch (\Exception $e) {
+					$this->logger->warning($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]);
+					throw new RuntimeException($serviceName . ' text to speech generation failed with: ' . $e->getMessage());
+				}
+			} else {
+				$output = base64_decode($message['audio']['data']);
+				$textResponse = $message['audio']['transcript'];
+			}
 			$result = [
-				'output' => base64_decode($message['audio']['data']),
-				'output_transcript' => $message['audio']['transcript'],
+				'output' => $output,
+				'output_transcript' => $textResponse,
 			];
 
 			// we still want the input transcription
 			try {
 				$inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel);
 				$result['input_transcript'] = $inputTranscription;
 			} catch (Exception $e) {
-				$this->logger->warning($serviceName . ' transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
+				$this->logger->warning($serviceName . ' audio input transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
+				throw new RuntimeException($serviceName . ' audio input transcription failed with: ' . $e->getMessage());
 			}
 
 			return $result;

diff --git a/lib/TaskProcessing/AudioToAudioChatTaskType.php b/lib/TaskProcessing/AudioToAudioChatTaskType.php