Commit adc20ca

Update kwargs handling for provider models (google#138)
* Fix kwargs passthrough for all providers

  Resolves initialization order bug where provider kwargs were lost.
  Runtime kwargs now properly override stored defaults.

  - Add merge_kwargs() to base class
  - Filter None values to prevent SDK errors
  - Support provider-specific params (stop, top_p, top_logprobs)
  - Add test coverage

* Update OpenAI JSON mode to not require fence output
1 parent 6375011 commit adc20ca
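
The initialization-order issue mentioned in the commit message can be read off the provider diffs below: the base __init__ now initializes _extra_kwargs, so a subclass that stored its kwargs before calling super().__init__() would have them wiped. A minimal sketch of that failure mode (class names are illustrative, not from the repo):

class Base:
  def __init__(self):
    self._extra_kwargs = {}  # base class resets the dict

class Provider(Base):
  def __init__(self, **kwargs):
    self._extra_kwargs = kwargs  # stored before super().__init__() ...
    super().__init__()  # ... and clobbered by the base reset

assert Provider(top_p=0.9)._extra_kwargs == {}  # kwargs silently lost

Both provider diffs below move the self._extra_kwargs assignment to after super().__init__() so the stored kwargs survive.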

File tree

6 files changed (+613, -112 lines)

langextract/inference.py

Lines changed: 19 additions & 1 deletion
@@ -20,7 +20,7 @@
 import enum
 import json
 import textwrap
-from typing import Any
+from typing import Any, Mapping

 from absl import logging
 from typing_extensions import deprecated
@@ -73,6 +73,7 @@ def __init__(self, constraint: schema.Constraint = schema.Constraint()):
     self._constraint = constraint
     self._schema: schema.BaseSchema | None = None
     self._fence_output_override: bool | None = None
+    self._extra_kwargs: dict[str, Any] = {}

   @classmethod
   def get_schema_class(cls) -> type[schema.BaseSchema] | None:
@@ -116,6 +117,23 @@ def requires_fence_output(self) -> bool:
       return True
     return not self._schema.supports_strict_mode

+  def merge_kwargs(
+      self, runtime_kwargs: Mapping[str, Any] | None = None
+  ) -> dict[str, Any]:
+    """Merge stored extra kwargs with runtime kwargs.
+
+    Runtime kwargs take precedence over stored kwargs.
+
+    Args:
+      runtime_kwargs: Kwargs provided at inference time, or None.
+
+    Returns:
+      Merged kwargs dictionary.
+    """
+    base = getattr(self, '_extra_kwargs', {}) or {}
+    incoming = dict(runtime_kwargs or {})
+    return {**base, **incoming}
+
   @abc.abstractmethod
   def infer(
       self, batch_prompts: Sequence[str], **kwargs
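
For reference, a plain-dict sketch of the precedence rule merge_kwargs implements (values are illustrative, not from the tests): kwargs stored on the model act as defaults, and kwargs passed at inference time win on conflict.

# Illustrative only: mirrors merge_kwargs() above with plain dicts.
stored = {'temperature': 0.5, 'top_p': 0.9}  # captured at construction
runtime = {'temperature': 0.1}  # passed to infer()
merged = {**stored, **runtime}  # runtime takes precedence
assert merged == {'temperature': 0.1, 'top_p': 0.9}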

langextract/providers/gemini.py

Lines changed: 41 additions & 26 deletions
@@ -19,17 +19,27 @@

 import concurrent.futures
 import dataclasses
-from typing import Any, Iterator, Sequence
+from typing import Any, Final, Iterator, Sequence

 from langextract import data
 from langextract import exceptions
 from langextract import inference
 from langextract import schema
 from langextract.providers import registry

+_API_CONFIG_KEYS: Final[set[str]] = {
+    'response_mime_type',
+    'response_schema',
+    'safety_settings',
+    'system_instruction',
+    'tools',
+    'stop_sequences',
+    'candidate_count',
+}
+

 @registry.register(
-    r'^gemini',  # gemini-2.5-flash, gemini-2.5-pro, etc.
+    r'^gemini',
     priority=10,
 )
 @dataclasses.dataclass(init=False)
@@ -109,37 +119,31 @@ def __init__(
     self.temperature = temperature
     self.max_workers = max_workers
     self.fence_output = fence_output
-    api_config_keys = {
-        'response_schema',
-        'response_mime_type',
-        'tools',
-        'safety_settings',
-        'stop_sequences',
-        'candidate_count',
-        'system_instruction',
-    }
-    self._extra_kwargs = {
-        k: v for k, v in (kwargs or {}).items() if k in api_config_keys
-    }

     if not self.api_key:
-      raise exceptions.InferenceConfigError('API key not provided for Gemini.')
+      raise exceptions.InferenceConfigError('API key not provided.')

     self._client = genai.Client(api_key=self.api_key)

     super().__init__(
         constraint=schema.Constraint(constraint_type=schema.ConstraintType.NONE)
     )
+    self._extra_kwargs = {
+        k: v for k, v in (kwargs or {}).items() if k in _API_CONFIG_KEYS
+    }

   def _process_single_prompt(
       self, prompt: str, config: dict
   ) -> inference.ScoredOutput:
     """Process a single prompt and return a ScoredOutput."""
     try:
-      if self._extra_kwargs:
-        config.update(self._extra_kwargs)
+      # Apply stored kwargs that weren't already set in config
+      for key, value in self._extra_kwargs.items():
+        if key not in config and value is not None:
+          config[key] = value
+
       if self.gemini_schema:
-        # Gemini structured output only supports JSON
+        # Structured output requires JSON format
         if self.format_type != data.FormatType.JSON:
           raise exceptions.InferenceConfigError(
               'Gemini structured output only supports JSON format. '
@@ -149,7 +153,7 @@ def _process_single_prompt(
         config.setdefault('response_schema', self.gemini_schema.schema_dict)

       response = self._client.models.generate_content(
-          model=self.model_id, contents=prompt, config=config  # type: ignore[arg-type]
+          model=self.model_id, contents=prompt, config=config
       )

       return inference.ScoredOutput(score=1.0, output=response.text)
@@ -171,15 +175,26 @@ def infer(
     Yields:
       Lists of ScoredOutputs.
     """
+    merged_kwargs = self.merge_kwargs(kwargs)
+
     config = {
-        'temperature': kwargs.get('temperature', self.temperature),
+        'temperature': merged_kwargs.get('temperature', self.temperature),
     }
-    if 'max_output_tokens' in kwargs:
-      config['max_output_tokens'] = kwargs['max_output_tokens']
-    if 'top_p' in kwargs:
-      config['top_p'] = kwargs['top_p']
-    if 'top_k' in kwargs:
-      config['top_k'] = kwargs['top_k']
+    if 'max_output_tokens' in merged_kwargs:
+      config['max_output_tokens'] = merged_kwargs['max_output_tokens']
+    if 'top_p' in merged_kwargs:
+      config['top_p'] = merged_kwargs['top_p']
+    if 'top_k' in merged_kwargs:
+      config['top_k'] = merged_kwargs['top_k']
+
+    handled_keys = {'temperature', 'max_output_tokens', 'top_p', 'top_k'}
+    for key, value in merged_kwargs.items():
+      if (
+          key not in handled_keys
+          and key in _API_CONFIG_KEYS
+          and value is not None
+      ):
+        config[key] = value

     # Use parallel processing for batches larger than 1
     if len(batch_prompts) > 1 and self.max_workers > 1:
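
A small sketch of the fill-in behavior that replaces config.update() in _process_single_prompt (values are illustrative): stored kwargs only fill keys missing from the request config, and None values are skipped so they never reach the SDK.

# Illustrative only: same filtering logic as the loop above.
stored = {'safety_settings': None, 'candidate_count': 2, 'tools': None}
config = {'temperature': 0.0, 'candidate_count': 1}
for key, value in stored.items():
  if key not in config and value is not None:
    config[key] = value
assert config == {'temperature': 0.0, 'candidate_count': 1}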

langextract/providers/ollama.py

Lines changed: 43 additions & 15 deletions
@@ -83,7 +83,12 @@
 from langextract import schema
 from langextract.providers import registry

+# Ollama defaults
 _OLLAMA_DEFAULT_MODEL_URL = 'http://localhost:11434'
+_DEFAULT_TEMPERATURE = 0.8
+_DEFAULT_TIMEOUT = 30
+_DEFAULT_KEEP_ALIVE = 5 * 60  # 5 minutes
+_DEFAULT_NUM_CTX = 2048


 @registry.register(
@@ -193,8 +198,8 @@ def __init__(
     self._model_url = base_url or model_url or _OLLAMA_DEFAULT_MODEL_URL
     self.format_type = format_type
     self._constraint = constraint
-    self._extra_kwargs = kwargs or {}
     super().__init__(constraint=constraint)
+    self._extra_kwargs = kwargs or {}

   def infer(
       self, batch_prompts: Sequence[str], **kwargs
@@ -208,6 +213,8 @@ def infer(
     Yields:
       Lists of ScoredOutputs.
     """
+    combined_kwargs = self.merge_kwargs(kwargs)
+
     for prompt in batch_prompts:
       try:
         response = self._ollama_query(
@@ -217,7 +224,7 @@
             if self.format_type == data.FormatType.JSON
             else 'yaml',
             model_url=self._model_url,
-            **kwargs,
+            **combined_kwargs,
         )
         # No score for Ollama. Default to 1.0
         yield [inference.ScoredOutput(score=1.0, output=response['response'])]
@@ -230,18 +237,20 @@ def _ollama_query(
       self,
       prompt: str,
       model: str | None = None,
-      temperature: float = 0.8,
+      temperature: float | None = None,
       seed: int | None = None,
       top_k: int | None = None,
+      top_p: float | None = None,
       max_output_tokens: int | None = None,
       structured_output_format: str | None = None,
       system: str = '',
       raw: bool = False,
       model_url: str | None = None,
-      timeout: int = 30,
-      keep_alive: int = 5 * 60,
+      timeout: int | None = None,
+      keep_alive: int | None = None,
       num_threads: int | None = None,
-      num_ctx: int = 2048,
+      num_ctx: int | None = None,
+      stop: str | list[str] | None = None,
       **kwargs,  # pylint: disable=unused-argument
   ) -> Mapping[str, Any]:
     """Sends a prompt to an Ollama model and returns the generated response.
@@ -257,6 +266,7 @@ def _ollama_query(
         output.
       seed: Seed for reproducible generation. If None, random seed is used.
       top_k: The top-K parameter for sampling.
+      top_p: The top-P (nucleus) sampling parameter.
       max_output_tokens: Maximum tokens to generate. If None, the model's
         default is used.
       structured_output_format: If set to "json" or a JSON schema dict, requests
@@ -272,6 +282,7 @@ def _ollama_query(
         heuristic.
       num_ctx: Number of context tokens allowed. If None, uses model's default
         or config.
+      stop: Stop sequences to halt generation. Can be a string or list of strings.
       **kwargs: Additional parameters passed through.

     Returns:
@@ -291,19 +302,30 @@ def _ollama_query(
         'json' if self.format_type == data.FormatType.JSON else 'yaml'
     )

-    options: dict[str, Any] = {'keep_alive': keep_alive}
-    if seed:
+    options: dict[str, Any] = {}
+    if keep_alive is not None:
+      options['keep_alive'] = keep_alive
+    else:
+      options['keep_alive'] = _DEFAULT_KEEP_ALIVE
+
+    if seed is not None:
       options['seed'] = seed
-    if temperature:
+    if temperature is not None:
       options['temperature'] = temperature
-    if top_k:
+    else:
+      options['temperature'] = _DEFAULT_TEMPERATURE
+    if top_k is not None:
       options['top_k'] = top_k
-    if num_threads:
+    if top_p is not None:
+      options['top_p'] = top_p
+    if num_threads is not None:
       options['num_thread'] = num_threads
-    if max_output_tokens:
+    if max_output_tokens is not None:
       options['num_predict'] = max_output_tokens
-    if num_ctx:
+    if num_ctx is not None:
       options['num_ctx'] = num_ctx
+    else:
+      options['num_ctx'] = _DEFAULT_NUM_CTX

     api_url = model_url + '/api/generate'

@@ -317,6 +339,12 @@ def _ollama_query(
         'options': options,
     }

+    # Add stop sequences if provided (top-level in Ollama API)
+    if stop is not None:
+      payload['stop'] = stop
+
+    request_timeout = timeout if timeout is not None else _DEFAULT_TIMEOUT
+
     try:
       response = self._requests.post(
           api_url,
@@ -325,12 +353,12 @@ def _ollama_query(
               'Accept': 'application/json',
           },
           json=payload,
-          timeout=timeout,
+          timeout=request_timeout,
       )
     except self._requests.exceptions.RequestException as e:
       if isinstance(e, self._requests.exceptions.ReadTimeout):
         msg = (
-            f'Ollama Model timed out (timeout={timeout},'
+            f'Ollama Model timed out (timeout={request_timeout},'
             f' num_threads={num_threads})'
         )
         raise exceptions.InferenceRuntimeError(
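
A hedged usage sketch of the new pass-through parameters; the class name and constructor arguments below are assumptions for illustration and are not shown in this diff.

# Hypothetical usage; constructor arguments are assumed, not taken from this commit.
model = OllamaLanguageModel(model_id='gemma2:2b', temperature=0.2)
# stop and top_p are new pass-through parameters; kwargs given to infer()
# override any kwargs stored at construction via merge_kwargs().
outputs = list(model.infer(['List three colors.'], stop=['\n\n'], top_p=0.9))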
