fix(web): detect multimodal capability via tags instead of model_type

Because model_type records only a model's primary category (e.g., 'chat'), it cannot express auxiliary capabilities such as image understanding. The agent form now checks whether the model's tags include 'IMAGE2TEXT' instead, so the "Visual Input File" field is also shown for versatile multimodal models like gpt-5.2-pro.
infiniflow · KevinHuSh · Feb 2, 2026 · Feb 2, 2026 · Feb 2, 2026 · Feb 2, 2026
commit 8f9251969481a78a98a2d2929cd6ec8fbdeaf505
diff --git a/web/src/pages/agent/form/agent-form/index.tsx b/web/src/pages/agent/form/agent-form/index.tsx
@@ -162,7 +162,7 @@ function AgentForm({ node }: INextOperatorForm) {
         <FormWrapper>
           {isSubAgent && <DescriptionField></DescriptionField>}
           <LargeModelFormField showSpeech2TextModel></LargeModelFormField>
-          {findLlmByUuid(llmId)?.model_type === LlmModelType.Image2text && (
+          {findLlmByUuid(llmId)?.tags?.includes('IMAGE2TEXT') && (
             <QueryVariable
               name="visual_files_var"
               label="Visual Input File"