Skip to content

Commit 1da3574

Browse files
authored
New GenerationConfig parameter for generate routes (#47)
- deprecates `json_schema` and `detail` parameters
1 parent e97f2cd commit 1da3574

File tree

7 files changed

+118
-139
lines changed

7 files changed

+118
-139
lines changed

README.md

Lines changed: 6 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -48,55 +48,23 @@ The package provides optional features that can be installed based on your needs
4848
### Basic Usage
4949

5050
```python
51+
from PIL import Image
5152
from vlmrun.client import VLMRun
53+
from vlmrun.client.types import GenerationConfig
5254
from vlmrun.hub.schemas.document.invoice import Invoice
5355

5456
# Initialize the client
5557
client = VLMRun(api_key="your-api-key")
5658

5759
# Process an image
60+
image: Image.Image = Image.open("image.jpg")
5861
response = client.image.generate(
59-
image="https://example.com/invoice.jpg",
62+
images=[image],
6063
model="vlm-1",
6164
domain="document.invoice",
62-
json_schema=Invoice.model_json_schema(),
65+
config=GenerationConfig(json_schema=Invoice.model_json_schema()),
6366
)
64-
```
65-
66-
### Image Utilities
67-
68-
```python
69-
from vlmrun.common.image import encode_image
70-
from vlmrun.common.utils import download_image
71-
from PIL import Image
72-
73-
# Convert image to base64 or binary
74-
image = Image.open("image.jpg")
75-
base64_str = encode_image(image, format="PNG") # or format="JPEG"
76-
binary_data = encode_image(image, format="binary")
77-
78-
# Download image from URL
79-
image = download_image("https://example.com/image.jpg")
80-
```
81-
82-
</details>
83-
84-
## 📂 Directory Structure
85-
86-
```bash
87-
vlmrun/
88-
├── client/ # Client implementation
89-
│ ├── client.py # Main VLMRun class
90-
│ ├── base_requestor.py # Low-level request logic
91-
│ ├── files.py # File operations
92-
│ ├── models.py # Model operations
93-
│ ├── finetune.py # Fine-tuning operations
94-
│ └── types.py # Type definitions
95-
├── common/ # Common utilities
96-
│ ├── auth.py # Authentication utilities
97-
│ └── image.py # Image processing utilities
98-
└── types/ # Type definitions
99-
└── abstract.py # Abstract base classes
67+
print(response)
10068
```
10169

10270
## 🔗 Quick Links

tests/cli/test_cli_generate.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,30 @@
11
"""Test generate subcommand."""
22

3+
from pathlib import Path
4+
35
from vlmrun.cli.cli import app
6+
from vlmrun.common.utils import download_artifact
47

58

69
def test_generate_image(runner, mock_client, tmp_path):
710
"""Test generate image command."""
8-
test_image = tmp_path / "test.jpg"
9-
test_image.write_bytes(b"test image data")
10-
result = runner.invoke(app, ["generate", "image", str(test_image)])
11-
assert result.exit_code == 0
12-
13-
14-
def test_generate_video(runner, mock_client, tmp_path):
15-
"""Test generate video command."""
16-
result = runner.invoke(app, ["generate", "video", "test prompt"])
11+
path: Path = download_artifact(
12+
"https://storage.googleapis.com/vlm-data-public-prod/hub/examples/document.invoice/invoice_1.jpg",
13+
format="file",
14+
)
15+
result = runner.invoke(
16+
app, ["generate", "image", str(path), "--domain", "document.invoice"]
17+
)
1718
assert result.exit_code == 0
1819

1920

2021
def test_generate_document(runner, mock_client, tmp_path):
2122
"""Test generate document command."""
22-
result = runner.invoke(app, ["generate", "document", "test prompt"])
23+
path: Path = download_artifact(
24+
"https://storage.googleapis.com/vlm-data-public-prod/hub/examples/document.bank-statement/lending_bankstatement.pdf",
25+
format="file",
26+
)
27+
result = runner.invoke(
28+
app, ["generate", "document", str(path), "--domain", "document.bank-statement"]
29+
)
2330
assert result.exit_code == 0

vlmrun/cli/_cli/generate.py

Lines changed: 22 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
"""Generation API commands."""
22

33
from pathlib import Path
4-
from typing import Optional
54

65
import typer
6+
from PIL import Image
77
from rich import print as rprint
88

99
from vlmrun.client import VLMRun
10+
from vlmrun.client.types import PredictionResponse
1011

11-
app = typer.Typer(help="Generation operations")
12+
app = typer.Typer(help="Generation operations", no_args_is_help=True)
1213

1314

1415
@app.command()
@@ -17,60 +18,34 @@ def image(
1718
image: Path = typer.Argument(
1819
..., help="Input image file", exists=True, readable=True
1920
),
20-
output: Optional[Path] = typer.Option(None, help="Output file path"),
21+
domain: str = typer.Option(
22+
..., help="Domain to use for generation (e.g. `document.invoice`)"
23+
),
2124
) -> None:
2225
"""Generate an image."""
2326
client: VLMRun = ctx.obj
24-
response = client.image.generate(images=[image], model="vlm-1", domain="image")
25-
if output and response and hasattr(response, "response"):
26-
if isinstance(response.response, bytes):
27-
output.write_bytes(response.response)
28-
rprint(f"Image saved to {output}")
29-
else:
30-
rprint("Error: Response does not contain valid image data")
31-
else:
32-
rprint("Image data generated (use --output to save to file)")
27+
if not Path(image).is_file():
28+
raise typer.Abort(f"Image file does not exist: {image}")
3329

34-
35-
@app.command()
36-
def video(
37-
ctx: typer.Context,
38-
prompt: str = typer.Argument(..., help="Video generation prompt"),
39-
output: Optional[Path] = typer.Option(None, help="Output file path"),
40-
) -> None:
41-
"""Generate a video."""
42-
client: VLMRun = ctx.obj
43-
response = client.video.generate(
44-
file_or_url=prompt, model="vlm-1", domain="video" # Using prompt as input text
45-
)
46-
if output and response and hasattr(response, "response"):
47-
if isinstance(response.response, bytes):
48-
output.write_bytes(response.response)
49-
rprint(f"Video saved to {output}")
50-
else:
51-
rprint("Error: Response does not contain valid video data")
52-
else:
53-
rprint("Video data generated (use --output to save to file)")
30+
img: Image.Image = Image.open(image)
31+
response: PredictionResponse = client.image.generate(images=[img], domain=domain)
32+
rprint(response)
5433

5534

5635
@app.command()
5736
def document(
5837
ctx: typer.Context,
59-
prompt: str = typer.Argument(..., help="Document generation prompt"),
60-
output: Optional[Path] = typer.Option(None, help="Output file path"),
38+
path: Path = typer.Argument(
39+
..., help="Path to the document file", exists=True, readable=True
40+
),
41+
domain: str = typer.Option(
42+
..., help="Domain to use for generation (e.g. `document.invoice`)"
43+
),
6144
) -> None:
6245
"""Generate a document."""
6346
client: VLMRun = ctx.obj
64-
response = client.document.generate(
65-
file_or_url=prompt, # Using prompt as input text
66-
model="vlm-1",
67-
domain="document",
68-
)
69-
if output and response and hasattr(response, "response"):
70-
if isinstance(response.response, bytes):
71-
output.write_bytes(response.response)
72-
rprint(f"Document saved to {output}")
73-
else:
74-
rprint("Error: Response does not contain valid document data")
75-
else:
76-
rprint("Document data generated (use --output to save to file)")
47+
if not Path(path).is_file():
48+
raise typer.Abort(f"Document file does not exist: {path}")
49+
50+
response = client.document.generate(file=path, domain=domain)
51+
rprint(response)

vlmrun/cli/cli.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
name="vlmrun",
2121
help="CLI for VLM Run (https://app.vlm.run)",
2222
add_completion=True,
23+
no_args_is_help=True,
2324
)
2425

2526

vlmrun/client/predictions.py

Lines changed: 56 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,12 @@
1010
from vlmrun.common.image import encode_image
1111
from vlmrun.client.base_requestor import APIRequestor
1212
from vlmrun.types.abstract import VLMRunProtocol
13-
from vlmrun.client.types import PredictionResponse, FileResponse
13+
from vlmrun.client.types import (
14+
PredictionResponse,
15+
FileResponse,
16+
GenerationConfig,
17+
RequestMetadata,
18+
)
1419

1520

1621
class Predictions:
@@ -82,23 +87,19 @@ class ImagePredictions(Predictions):
8287
def generate(
8388
self,
8489
images: list[Path | Image.Image],
85-
model: str,
8690
domain: str,
87-
json_schema: dict | None = None,
88-
detail: str = "auto",
8991
batch: bool = False,
90-
metadata: dict = {},
92+
metadata: RequestMetadata | None = None,
93+
config: GenerationConfig | None = None,
9194
callback_url: str | None = None,
9295
) -> PredictionResponse:
9396
"""Generate a document prediction.
9497
9598
Args:
9699
images: List of images to generate predictions from
97-
model: Model to use for prediction
98100
domain: Domain to use for prediction
99-
json_schema: JSON schema to use for prediction
100-
detail: Detail level for prediction
101101
batch: Whether to run prediction in batch mode
102+
config: GenerationConfig to use for prediction
102103
metadata: Metadata to include in prediction
103104
callback_url: URL to call when prediction is complete
104105
@@ -117,18 +118,20 @@ def generate(
117118
else:
118119
raise ValueError("Image must be a path or a PIL Image")
119120

121+
additional_kwargs = {}
122+
if config:
123+
additional_kwargs["config"] = config.model_dump()
124+
if metadata:
125+
additional_kwargs["metadata"] = metadata.model_dump()
120126
response, status_code, headers = self._requestor.request(
121127
method="POST",
122128
url="image/generate",
123129
data={
124130
"image": encode_image(images[0], format="JPEG"),
125-
"model": model,
126131
"domain": domain,
127-
"json_schema": json_schema,
128-
"detail": detail,
129132
"batch": batch,
130-
"metadata": metadata,
131133
"callback_url": callback_url,
134+
**additional_kwargs,
132135
},
133136
)
134137
if not isinstance(response, dict):
@@ -144,64 +147,75 @@ class _FilePredictions(Predictions):
144147

145148
def generate(
146149
self,
147-
file_or_url: str | Path,
148-
model: str,
149-
domain: str,
150-
json_schema: dict | None = None,
151-
detail: str = "auto",
150+
file: Path | str | None = None,
151+
url: str | None = None,
152+
domain: str | None = None,
152153
batch: bool = False,
153-
metadata: dict = {},
154+
config: GenerationConfig | None = GenerationConfig(),
155+
metadata: RequestMetadata | None = RequestMetadata(),
154156
callback_url: str | None = None,
155157
) -> PredictionResponse:
156158
"""Generate a document prediction.
157159
158160
Args:
159-
file_or_url: File (pathlib.Path) or file_id or URL to generate prediction from
160-
model: Model to use for prediction
161+
file: File (pathlib.Path) or file_id to generate prediction from
162+
url: URL to generate prediction from
161163
domain: Domain to use for prediction
162-
json_schema: JSON schema to use for prediction
163-
detail: Detail level for prediction
164164
batch: Whether to run prediction in batch mode
165+
config: GenerationConfig to use for prediction
165166
metadata: Metadata to include in prediction
166167
callback_url: URL to call when prediction is complete
167168
168169
Returns:
169170
PredictionResponse: Prediction response
170171
"""
171172
is_url = False
172-
if isinstance(file_or_url, Path):
173-
logger.debug(
174-
f"Uploading file [path={file_or_url}, size={file_or_url.stat().st_size / 1024 / 1024:.2f} MB] to VLM Run"
175-
)
176-
upload_response, _, _ = self._client.files.upload(
177-
file=file_or_url, purpose="assistants"
178-
)
179-
if not isinstance(upload_response, dict):
180-
raise TypeError("Expected dict response")
181-
response = FileResponse(**upload_response)
182-
logger.debug(
183-
f"Uploaded file [file_id={response.id}, name={response.filename}]"
184-
)
185-
file_or_url = response.id
186-
elif isinstance(file_or_url, str):
187-
is_url = str(file_or_url).startswith(("http://", "https://"))
173+
if not file and not url:
174+
raise ValueError("Either `file` or `url` must be provided")
175+
if file and url:
176+
raise ValueError("Only one of `file` or `url` can be provided")
177+
if file:
178+
if isinstance(file, Path) or (
179+
isinstance(file, str) and Path(file).suffix
180+
):
181+
logger.debug(
182+
f"Uploading file [path={file}, size={file.stat().st_size / 1024 / 1024:.2f} MB] to VLM Run"
183+
)
184+
response: FileResponse = self._client.files.upload(
185+
file=Path(file), purpose="assistants"
186+
)
187+
logger.debug(
188+
f"Uploaded file [file_id={response.id}, name={response.filename}]"
189+
)
190+
file_or_url = response.id
191+
elif isinstance(file, str):
192+
logger.debug(f"Using file_id [file_id={file}]")
193+
assert not Path(file).suffix, "File must not have an extension"
194+
file_or_url = file
195+
else:
196+
raise ValueError("File must be a pathlib.Path or a string")
197+
elif url:
198+
is_url = True
199+
file_or_url = url
188200
else:
189201
raise ValueError(
190202
"File or URL must be a pathlib.Path, str, or AnyHttpUrl"
191203
)
192204

205+
additional_kwargs = {}
206+
if config:
207+
additional_kwargs["config"] = config.model_dump()
208+
if metadata:
209+
additional_kwargs["metadata"] = metadata.model_dump()
193210
response, status_code, headers = self._requestor.request(
194211
method="POST",
195212
url=f"{route}/generate",
196213
data={
197214
"url" if is_url else "file_id": file_or_url,
198-
"model": model,
199215
"domain": domain,
200-
"json_schema": json_schema,
201-
"detail": detail,
202216
"batch": batch,
203-
"metadata": metadata,
204217
"callback_url": callback_url,
218+
**additional_kwargs,
205219
},
206220
)
207221
if not isinstance(response, dict):

0 commit comments

Comments
 (0)