Default MAX_STRING_LENGTH too aggressive (#328)

lukehinds · web-flow · commit 18841b52b139 · 2025-09-03T18:00:41.000+01:00
* Default MAX_STRING_LENGTH too aggressive

We had a MAX_STRING_LENGTH of 1000, which was truncating deep
research reports; changed it to the more sensible 100,000 characters (~100KB).

Signed-off-by: Luke Hinds <lukehinds@gmail.com>

* Don't use non type base int

Signed-off-by: Luke Hinds <lukehinds@gmail.com>

* Bump versions to v0.7.5

Signed-off-by: Luke Hinds <lukehinds@gmail.com>

---------

Signed-off-by: Luke Hinds <lukehinds@gmail.com>
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -155,6 +155,10 @@ src/agent/
 - **Authentication**: Never bypass authentication checks; use UnifiedAuthenticationManager for consistent auth handling
 - **Plugin Security**: Plugins must declare required scopes; use allowlist-based validation for plugin loading
 - **Audit Logging**: Log all security events (authentication, authorization, access denials) with appropriate risk levels
+- **Function Argument Sanitization**: Configure `max_string_length` and `sanitization_enabled` in security config to control how function arguments are sanitized:
+  - `max_string_length: 100000` - Default limit of 100,000 characters (~100KB) for string arguments (avoids truncating large file content)
+  - `max_string_length: -1` - Disable string length limits entirely (use with caution)
+  - `sanitization_enabled: false` - Disable all argument sanitization (not recommended for production)
 
 ## Task Completion Workflow
 
@@ -263,6 +267,10 @@ security:
     files:write: ["files:read"]
     api:admin: ["api:write", "api:read"]
     api:write: ["api:read"]
+  
+  # Function argument sanitization settings
+  sanitization_enabled: true   # Enable function argument sanitization
+  max_string_length: 100000    # Max string length in chars (100KB default, -1 = unlimited)
 ```
 
 **Important**: Each function requires specific scopes. If a user's API key doesn't have the required scope (either directly or through hierarchy inheritance), access will be denied with a 403 error.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "agentup"
-version = "0.7.4"
+version = "0.7.5"
 description = "Create AI agents with all the trappings, out of the box."
 readme = "README.md"
 requires-python = ">=3.11"
diff --git a/src/agent/config/model.py b/src/agent/config/model.py
@@ -313,6 +313,14 @@ class SecurityConfig(BaseModel):
     )
     scope_hierarchy: dict[str, list[str]] = Field(default_factory=dict, description="Scope hierarchy configuration")
 
+    # Function argument sanitization settings
+    max_string_length: int = Field(
+        default=100000,
+        description="Maximum allowed string length in function arguments (in characters). Set to -1 to disable limit.",
+        ge=-1,
+    )
+    sanitization_enabled: bool = Field(default=True, description="Enable function argument sanitization for security")
+
     @field_validator("scope_hierarchy", mode="before")
     @classmethod
     def validate_scope_hierarchy(cls, v):
diff --git a/src/agent/core/function_executor.py b/src/agent/core/function_executor.py
@@ -174,7 +174,11 @@ async def execute_function_call(self, function_name: str, arguments: dict[str, A
             audit_logger.log_configuration_error(
                 "function_validation",
                 "function_validation_failed",
-                {"correlation_id": correlation_id, "error_type": "ValueError", "function_name": function_name},
+                {
+                    "correlation_id": correlation_id,
+                    "error_type": "ValueError",
+                    "function_name": function_name,
+                },
             )
             return f"Invalid request format [ref:{correlation_id}]"
 
@@ -211,7 +215,26 @@ async def execute_function_call(self, function_name: str, arguments: dict[str, A
 
     def _sanitize_function_arguments(self, arguments: dict[str, Any], correlation_id: str) -> dict[str, Any]:
         """Sanitize function arguments to prevent injection attacks."""
-        MAX_STRING_LENGTH = 1000
+        # Get configuration for security settings
+        try:
+            from agent.config import get_config
+
+            config = get_config()
+            security_config = config.agent_config.security
+
+            # Check if sanitization is disabled
+            if not security_config.sanitization_enabled:
+                logger.debug(f"Function argument sanitization disabled [corr:{correlation_id}]")
+                return arguments
+
+            # Get configurable string length limit
+            max_string_length = security_config.max_string_length
+            # A value of -1 indicates no limit. This is handled in the truncation logic below.
+
+        except Exception as e:
+            logger.warning(f"Failed to load security config, using defaults [corr:{correlation_id}]: {e}")
+            max_string_length = 100000  # Fallback to 100KB
+
         MAX_NESTED_DEPTH = 5
         ALLOWED_TYPES = (str, int, float, bool, list, dict, type(None))
 
@@ -222,12 +245,20 @@ def _sanitize_value(value, depth=0):
 
             if not isinstance(value, ALLOWED_TYPES):
                 logger.warning(f"Disallowed argument type: {type(value)} [corr:{correlation_id}]")
-                return str(value)[:MAX_STRING_LENGTH]
+                sanitized_str = str(value)
+                if max_string_length != -1:
+                    return sanitized_str[:max_string_length]
+                return sanitized_str
 
             if isinstance(value, str):
                 # Sanitize string length and remove potential control characters
                 sanitized = "".join(char for char in value if ord(char) >= 32 or char in "\t\n\r")
-                return sanitized[:MAX_STRING_LENGTH]
+                if max_string_length != -1 and len(sanitized) > max_string_length:
+                    logger.debug(
+                        f"String truncated from {len(sanitized)} to {max_string_length} chars [corr:{correlation_id}]"
+                    )
+                    return sanitized[:max_string_length]
+                return sanitized
 
             elif isinstance(value, list):
                 if len(value) > 100:  # Limit array size
@@ -296,7 +327,10 @@ async def _execute_with_state_management(self, handler, task, function_name: str
 
     async def _apply_ai_middleware(self, handler, function_name: str):
         try:
-            from agent.middleware import execute_ai_function_with_middleware, get_ai_compatible_middleware
+            from agent.middleware import (
+                execute_ai_function_with_middleware,
+                get_ai_compatible_middleware,
+            )
 
             # Check if there's any AI-compatible middleware to apply
             ai_middleware = get_ai_compatible_middleware()
diff --git a/src/agent/templates/config/agentup.yml.j2 b/src/agent/templates/config/agentup.yml.j2
@@ -212,6 +212,10 @@ security:
 {% if has_mcp %}
     weather:admin: ["alerts:read", "weather:read"]
 {% endif %}
+  
+  # Function argument sanitization settings
+  sanitization_enabled: {{ sanitization_enabled | default(true) }}  # Enable function argument sanitization
+  max_string_length: {{ max_string_length | default(100000) }}      # Max string length in chars (100KB default, -1 = unlimited)
 {% endif %}
 
 {%- if ai_provider_config %}
diff --git a/uv.lock b/uv.lock