Commit 0ca4843

Fix: Pass project parameter correctly to storage.Client in Gemini Batch API (#286)
Fixes #285: Batch API failing with "Required parameter: project" error.

The project and location parameters from language_model_params were not being passed through to _submit_file(), causing storage.Client to be created without a project context.

Changes:
- Updated _submit_file() to accept project and location parameters
- Modified _process_batch() to pass these parameters through
- Added a test to verify the project parameter is correctly passed
- Updated tests to include the required batch config parameters
1 parent 0c1af87 commit 0ca4843

File tree

2 files changed: +131, -11 lines
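
For context on the failure mode: google-cloud-storage resolves the project either from the constructor argument or from the ambient environment. The snippet below is a minimal illustration of why forwarding an explicit project matters; make_storage_client and the project value are illustrative, not code from this repository.

# Illustrative sketch, not repository code: the helper name and project
# value are assumptions used to show the failure mode behind #285.
from google.cloud import storage


def make_storage_client(project: str | None = None) -> storage.Client:
  # With project=None, storage.Client falls back to ambient configuration
  # (e.g. GOOGLE_CLOUD_PROJECT or gcloud defaults); when nothing is
  # configured, the GCS calls fail with a missing-project error such as
  # the "Required parameter: project" message reported in #285.
  return storage.Client(project=project)


# Passing the project through explicitly, as this commit does for the
# batch path, removes the dependency on ambient configuration.
client = make_storage_client(project="my-gcp-project")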

langextract/providers/gemini_batch.py

Lines changed: 16 additions & 2 deletions

@@ -300,6 +300,8 @@ def _submit_file(
     requests: Sequence[dict],
     display: str,
     retention_days: int | None,
+    project: str | None = None,
+    location: str | None = None,
 ) -> genai.types.BatchJob:
   """Submit a file-based batch job to Vertex AI using GCS storage.

@@ -317,6 +319,10 @@ def _submit_file(
     display: Display name for the batch job, used for identification and
       as part of the GCS blob name.
     retention_days: Days to keep GCS data. If set, applies lifecycle rule.
+    project: Optional GCP project ID. If not provided, will attempt to
+      determine from client or environment.
+    location: Optional GCP region/location. If not provided, will attempt to
+      determine from client or use default.

   Returns:
     BatchJob object that can be polled for completion status.
@@ -336,7 +342,7 @@ def _submit_file(
       line = {"key": f"{_KEY_IDX}{idx}", "request": req}
       f.write(json.dumps(line, ensure_ascii=False) + "\n")

-  project, location = _get_project_location(client)
+  project, location = _get_project_location(client, project, location)
   bucket_name = _get_bucket_name(project, location)
   blob_name = f"batch-input/{display}-{uuid.uuid4().hex}.jsonl"

@@ -807,7 +813,15 @@ def _process_batch(
       )
       for p in batch_prompts
   ]
-  job = _submit_file(client, model_id, requests, display, cfg.retention_days)
+  job = _submit_file(
+      client,
+      model_id,
+      requests,
+      display,
+      cfg.retention_days,
+      project,
+      location,
+  )
   if cfg.on_job_create:
     try:
       cfg.on_job_create(job)
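
The body of _get_project_location is not part of this diff, so how it merges the new explicit arguments with values inferred from the client is not shown here. The following is a hedged sketch of one plausible shape, consistent with the updated call site; it is not the repository's implementation, and the fallback chain, environment variable, and default region are assumptions.

import os


def _get_project_location(
    client,
    project: str | None = None,
    location: str | None = None,
) -> tuple[str, str]:
  # Assumed behavior: explicit arguments win, then client attributes,
  # then the environment; fail early if no project can be determined.
  project = (
      project
      or getattr(client, "project", None)
      or os.environ.get("GOOGLE_CLOUD_PROJECT")
  )
  location = location or getattr(client, "location", None) or "us-central1"
  if not project:
    raise ValueError("Required parameter: project")
  return project, location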

tests/test_gemini_batch_api.py

Lines changed: 115 additions & 9 deletions

@@ -102,7 +102,13 @@ def test_batch_routing_vertex(self, mock_client_cls):
         vertexai=True,
         project="test-project",
         location=gb._DEFAULT_LOCATION,
-        batch={"enabled": True, "threshold": 2, "poll_interval": 1},
+        batch={
+            "enabled": True,
+            "threshold": 2,
+            "poll_interval": 1,
+            "enable_caching": False,
+            "retention_days": None,
+        },
     )
     prompts = ["p1", "p2"]
     outs = list(model.infer(prompts))
@@ -156,7 +162,12 @@ def test_realtime_when_below_threshold(self, mock_client_cls):
         vertexai=True,
         project="p",
         location="l",
-        batch={"enabled": True, "threshold": 10},
+        batch={
+            "enabled": True,
+            "threshold": 10,
+            "enable_caching": False,
+            "retention_days": None,
+        },
     )
     outs = list(model.infer(["hello"]))

@@ -195,7 +206,12 @@ def test_batch_with_schema(self, mock_client_cls):
         project="p",
         location="l",
         gemini_schema=mock_schema,
-        batch={"enabled": True, "threshold": 1},
+        batch={
+            "enabled": True,
+            "threshold": 1,
+            "enable_caching": False,
+            "retention_days": None,
+        },
     )

     # Mock _submit_file to verify the request payload contains the schema.
@@ -207,7 +223,7 @@ def test_batch_with_schema(self, mock_client_cls):
     self.assertLen(outs, 1)
     self.assertEqual(outs[0][0].output, '{"name":"test"}')

-    # Verify _submit_file was called with correct arguments.
+    # Verify _submit_file was called with project and location parameters.
     mock_submit.assert_called_with(
         mock_client,
         "gemini-2.5-flash",
@@ -222,6 +238,9 @@ def test_batch_with_schema(self, mock_client_cls):
             },
         }],
         mock.ANY,  # Display name contains timestamp/random.
+        None,  # retention_days
+        "p",  # project
+        "l",  # location
     )

     self.assertEqual(model.gemini_schema.schema_dict, mock_schema.schema_dict)
@@ -238,7 +257,12 @@ def test_batch_error_handling(self, mock_client_cls):
         vertexai=True,
         project="p",
         location="l",
-        batch={"enabled": True, "threshold": 1},
+        batch={
+            "enabled": True,
+            "threshold": 1,
+            "enable_caching": False,
+            "retention_days": None,
+        },
     )

     with self.assertRaisesRegex(Exception, "Gemini Batch API error"):
@@ -273,7 +297,12 @@ def test_file_based_ordering(self, mock_client_cls):
         vertexai=True,
         project="p",
         location="l",
-        batch={"enabled": True, "threshold": 1},
+        batch={
+            "enabled": True,
+            "threshold": 1,
+            "enable_caching": False,
+            "retention_days": None,
+        },
     )

     results = list(model.infer(prompts))
@@ -354,6 +383,8 @@ def list_blobs_side_effect(prefix=None):
             "enabled": True,
             "threshold": 1,
             "max_prompts_per_job": max_prompts_per_job,
+            "enable_caching": False,
+            "retention_days": None,
         },
     )

@@ -386,7 +417,12 @@ def test_batch_item_error(self, mock_client_cls):
         vertexai=True,
         project="p",
         location="l",
-        batch={"enabled": True, "threshold": 1},
+        batch={
+            "enabled": True,
+            "threshold": 1,
+            "enable_caching": False,
+            "retention_days": None,
+        },
     )

     with self.assertRaisesRegex(Exception, "Batch item error"):
@@ -420,7 +456,12 @@ def test_empty_prompts_fast_path(self, mock_client_cls):
         prompts=[],
         schema_dict=None,
         gen_config={},
-        cfg=gb.BatchConfig(enabled=True, poll_interval=1),
+        cfg=gb.BatchConfig(
+            enabled=True,
+            poll_interval=1,
+            enable_caching=False,
+            retention_days=None,
+        ),
     )
     self.assertEqual(outs, [])

@@ -443,7 +484,13 @@ def test_file_pad_to_expected_count(self, mock_client_cls):
     mock_client.batches.create.return_value = job
     mock_client.batches.get.return_value = job

-    cfg = gb.BatchConfig(enabled=True, threshold=1, poll_interval=1)
+    cfg = gb.BatchConfig(
+        enabled=True,
+        threshold=1,
+        poll_interval=1,
+        enable_caching=False,
+        retention_days=None,
+    )
     outs = gb.infer_batch(
         client=mock_client,
         model_id="m",
@@ -481,6 +528,7 @@ def test_cache_hit_skips_inference(self, mock_client_cls):
         enabled=True,
         threshold=1,
         enable_caching=True,
+        retention_days=None,
     )

     outs = gb.infer_batch(
@@ -541,6 +589,7 @@ def get_blob(name):
         enabled=True,
         threshold=1,
         enable_caching=True,
+        retention_days=None,
     )

     outs = gb.infer_batch(
@@ -566,6 +615,63 @@ def get_blob(name):
         upload_calls, "Should have uploaded new_response to cache"
     )

+  @mock.patch.object(genai, "Client", autospec=True)
+  @mock.patch.dict("os.environ", {}, clear=True)
+  def test_project_passed_to_storage_client(self, mock_client_cls):
+    """Test that project parameter is passed to storage.Client constructor."""
+    mock_client = mock_client_cls.return_value
+    mock_client.vertexai = True
+    if hasattr(mock_client, "project"):
+      del mock_client.project
+
+    self.mock_storage_client.create_bucket.return_value = self.mock_bucket
+
+    output_blob = mock.create_autospec(gb.storage.Blob, instance=True)
+    output_blob.name = f"output{gb._EXT_JSONL}"
+    output_blob.open.return_value.__enter__.return_value = io.StringIO(
+        _create_batch_response(0, {"result": "ok"})
+    )
+    self.mock_bucket.list_blobs.return_value = [output_blob]
+
+    mock_client.batches.create.return_value = create_mock_batch_job()
+    mock_client.batches.get.return_value = create_mock_batch_job()
+
+    # Create model with specific project and location
+    test_project = "test-project-123"
+    test_location = "us-central1"
+
+    model = gemini.GeminiLanguageModel(
+        model_id="gemini-2.5-flash",
+        vertexai=True,
+        project=test_project,
+        location=test_location,
+        batch={
+            "enabled": True,
+            "threshold": 1,
+            "poll_interval": 0.1,
+            "enable_caching": False,
+            "retention_days": None,
+        },
+    )
+
+    list(model.infer(["test prompt"]))
+
+    # Verify storage.Client was called with the correct project parameter.
+    storage_calls = self.mock_storage_cls.call_args_list
+
+    project_calls = [
+        call
+        for call in storage_calls
+        if call.kwargs.get("project") == test_project
+    ]
+
+    self.assertGreaterEqual(
+        len(project_calls),
+        1,
+        f"storage.Client should be called with project={test_project}, "
+        f"but was called with: {[call.kwargs for call in storage_calls]}",
+    )
+
   def test_cache_hashing_stability(self):
     """Test that hash is stable for same inputs."""
     cache = gb.GCSBatchCache("b")
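
With the fix, a caller that supplies an explicit project and location when constructing the provider (as the new test does) no longer depends on ambient gcloud configuration for the batch path. A usage sketch, assuming the constructor parameters exercised in the tests above; the import path and the project/location values are placeholders, not verified against the released package.

# Usage sketch under the assumptions stated above.
from langextract.providers import gemini

model = gemini.GeminiLanguageModel(
    model_id="gemini-2.5-flash",
    vertexai=True,
    project="my-gcp-project",  # forwarded through _submit_file to storage.Client
    location="us-central1",
    batch={
        "enabled": True,
        "threshold": 1,
        "poll_interval": 1,
        "enable_caching": False,
        "retention_days": None,
    },
)
results = list(model.infer(["some prompt"]))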
