-
Notifications
You must be signed in to change notification settings - Fork 954
Add support for image and audio transcription for Gemini, Anthropic, Mistral and Ollama. #1828
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add support for image and audio transcription for Gemini, Anthropic, Mistral and Ollama. #1828
Changes from all commits
02b1778
09fbf22
57195fb
8e5f14d
d57d188
6260f9e
13c034e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,4 @@ | ||
| """Adapter for Generic API LLM provider API""" | ||
| """Adapter for Gemini API LLM provider""" | ||
|
|
||
| import litellm | ||
| import instructor | ||
|
|
@@ -8,13 +8,9 @@ | |
| from litellm.exceptions import ContentPolicyViolationError | ||
| from instructor.core import InstructorRetryException | ||
|
|
||
| from cognee.infrastructure.llm.exceptions import ContentPolicyFilterError | ||
| from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import ( | ||
| LLMInterface, | ||
| ) | ||
| import logging | ||
| from cognee.shared.rate_limiting import llm_rate_limiter_context_manager | ||
| from cognee.shared.logging_utils import get_logger | ||
|
|
||
| from tenacity import ( | ||
| retry, | ||
| stop_after_delay, | ||
|
|
@@ -23,55 +19,65 @@ | |
| before_sleep_log, | ||
| ) | ||
|
|
||
| from cognee.infrastructure.llm.exceptions import ContentPolicyFilterError | ||
| from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.generic_llm_api.adapter import ( | ||
| GenericAPIAdapter, | ||
| ) | ||
| from cognee.shared.logging_utils import get_logger | ||
| from cognee.modules.observability.get_observe import get_observe | ||
|
|
||
| logger = get_logger() | ||
| observe = get_observe() | ||
|
|
||
|
|
||
| class GeminiAdapter(LLMInterface): | ||
| class GeminiAdapter(GenericAPIAdapter): | ||
| """ | ||
| Adapter for Gemini API LLM provider. | ||
| This class initializes the API adapter with necessary credentials and configurations for | ||
| interacting with the gemini LLM models. It provides methods for creating structured outputs | ||
| based on user input and system prompts. | ||
| based on user input and system prompts, as well as multimodal processing capabilities. | ||
| Public methods: | ||
| - acreate_structured_output(text_input: str, system_prompt: str, response_model: | ||
| Type[BaseModel]) -> BaseModel | ||
| - acreate_structured_output(text_input: str, system_prompt: str, response_model: Type[BaseModel]) -> BaseModel | ||
| - create_transcript(input) -> BaseModel: Transcribe audio files to text | ||
| - transcribe_image(input) -> BaseModel: Inherited from GenericAPIAdapter | ||
| """ | ||
|
|
||
| name: str | ||
| model: str | ||
| api_key: str | ||
| default_instructor_mode = "json_mode" | ||
|
|
||
| def __init__( | ||
| self, | ||
| endpoint, | ||
| api_key: str, | ||
| model: str, | ||
| api_version: str, | ||
| max_completion_tokens: int, | ||
| endpoint: str = None, | ||
| api_version: str = None, | ||
| transcription_model: str = None, | ||
| instructor_mode: str = None, | ||
| fallback_model: str = None, | ||
| fallback_api_key: str = None, | ||
| fallback_endpoint: str = None, | ||
| ): | ||
| self.model = model | ||
| self.api_key = api_key | ||
| self.endpoint = endpoint | ||
| self.api_version = api_version | ||
| self.max_completion_tokens = max_completion_tokens | ||
|
|
||
| self.fallback_model = fallback_model | ||
| self.fallback_api_key = fallback_api_key | ||
| self.fallback_endpoint = fallback_endpoint | ||
|
|
||
| super().__init__( | ||
| api_key=api_key, | ||
| model=model, | ||
| max_completion_tokens=max_completion_tokens, | ||
| name="Gemini", | ||
| endpoint=endpoint, | ||
| api_version=api_version, | ||
| transcription_model=transcription_model, | ||
| fallback_model=fallback_model, | ||
| fallback_api_key=fallback_api_key, | ||
| fallback_endpoint=fallback_endpoint, | ||
| ) | ||
|
Comment on lines
49
to
+73
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion | 🟠 Major — Add missing `image_transcribe_model` parameter. Two issues: (1) the `__init__` signature omits the `image_transcribe_model` parameter that should be forwarded to `super().__init__()`; (2) the `__init__` is undocumented.
Apply this diff to add the missing parameter: def __init__(
self,
api_key: str,
model: str,
max_completion_tokens: int,
endpoint: str = None,
api_version: str = None,
transcription_model: str = None,
+ image_transcribe_model: str = None,
instructor_mode: str = None,
fallback_model: str = None,
fallback_api_key: str = None,
fallback_endpoint: str = None,
):
+ """
+ Initialize the Gemini adapter with API credentials and model configurations.
+
+ Parameters:
+ -----------
+ api_key: Gemini API key
+ model: Primary model identifier for structured output generation
+ max_completion_tokens: Maximum tokens for completion responses
+ endpoint: Optional custom API endpoint
+ api_version: Optional API version
+ transcription_model: Optional model for audio transcription (defaults to main model)
+ image_transcribe_model: Optional model for image transcription (defaults to main model)
+ instructor_mode: Mode for instructor client (defaults to "json_mode")
+ fallback_model: Fallback model for content policy violations
+ fallback_api_key: API key for fallback model
+ fallback_endpoint: Endpoint for fallback model
+ """
super().__init__(
api_key=api_key,
model=model,
max_completion_tokens=max_completion_tokens,
name="Gemini",
endpoint=endpoint,
api_version=api_version,
transcription_model=transcription_model,
+ image_transcribe_model=image_transcribe_model,
fallback_model=fallback_model,
fallback_api_key=fallback_api_key,
fallback_endpoint=fallback_endpoint,
)As per coding guidelines, undocumented function definitions are considered incomplete. |
||
| self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode | ||
|
|
||
| self.aclient = instructor.from_litellm( | ||
| litellm.acompletion, mode=instructor.Mode(self.instructor_mode) | ||
| ) | ||
|
|
||
| @observe(as_type="generation") | ||
| @retry( | ||
| stop=stop_after_delay(128), | ||
| wait=wait_exponential_jitter(8, 128), | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 167
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 1814
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 368
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 9839
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 12569
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 859
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 19540
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 18772
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 2918
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 1117
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 2373
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 12294
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 3498
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 17733
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 4132
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 2734
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 25949
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 7159
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 22727
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 5023
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 19507
🏁 Script executed:
Repository: topoteretes/cognee
Length of output: 20137
Override `create_transcript` with explicit error handling for the Anthropic audio limitation. `AnthropicAdapter` inherits `create_transcript` from `GenericAPIAdapter`, but Anthropic's API does not support audio transcription. Override this method to raise `NotImplementedError` with a clear message directing users to external transcription services. For `transcribe_image`, verify the inherited implementation works correctly with Anthropic's vision API by testing with base64-encoded images.