ai-dynamo · blarson-b10 · Sep 9, 2025 · Sep 9, 2025 · Sep 9, 2025 · coderabbitai
@@ -14,6 +14,8 @@
 // limitations under the License.
 
 use super::context::{callable_accepts_kwarg, Context};
+use dynamo_llm::protocols::DataStream;
+use dynamo_runtime::engine::AsyncEngineContext;
 use pyo3::prelude::*;
 use pyo3::types::{PyDict, PyModule};
 use pyo3::{PyAny, PyErr};
@@ -73,7 +75,7 @@ pub fn add_to_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
 /// ```
 #[pyclass]
 #[derive(Clone)]
-pub struct PythonAsyncEngine(PythonServerStreamingEngine);
+pub struct PythonAsyncEngine(pub PythonServerStreamingEngine);
 
 #[pymethods]
 impl PythonAsyncEngine {
@@ -135,31 +137,16 @@ impl PythonServerStreamingEngine {
             has_context,
         }
     }
-}
 
-#[derive(Debug, thiserror::Error)]
-enum ResponseProcessingError {
-    #[error("python exception: {0}")]
-    PythonException(String),
-
-    #[error("python generator exit: {0}")]
-    PyGeneratorExit(String),
-
-    #[error("deserialize error: {0}")]
-    DeserializeError(String),
-
-    #[error("gil offload error: {0}")]
-    OffloadError(String),
-}
-
-#[async_trait]
-impl<Req, Resp> AsyncEngine<SingleIn<Req>, ManyOut<Annotated<Resp>>, Error>
-    for PythonServerStreamingEngine
-where
-    Req: Data + Serialize,
-    Resp: Data + for<'de> Deserialize<'de>,
-{
-    async fn generate(&self, request: SingleIn<Req>) -> Result<ManyOut<Annotated<Resp>>, Error> {
+    /// Generate the response in parts.
+    pub async fn generate_in_parts<Req, Resp>(
+        &self,
+        request: SingleIn<Req>,
+    ) -> Result<(DataStream<Annotated<Resp>>, Arc<dyn AsyncEngineContext>), Error>
+    where
+        Req: Data + Serialize,
+        Resp: Data + for<'de> Deserialize<'de>,
+    {
         // Create a context
         let (request, context) = request.transfer(());
         let ctx = context.context();
@@ -290,8 +277,36 @@ where
         });
 
         let stream = ReceiverStream::new(rx);
+        let context = context.context();
+        Ok((Box::pin(stream), context))
+    }
+}
+
+#[derive(Debug, thiserror::Error)]
+enum ResponseProcessingError { // Error categories; every variant wraps a String that is interpolated into its #[error] display text.
+    #[error("python exception: {0}")]
+    PythonException(String), // NOTE(review): construction sites are outside this hunk; presumably an exception raised by the Python side
+
+    #[error("python generator exit: {0}")]
+    PyGeneratorExit(String),
+
+    #[error("deserialize error: {0}")]
+    DeserializeError(String),
 
-        Ok(ResponseStream::new(Box::pin(stream), context.context()))
+    #[error("gil offload error: {0}")]
+    OffloadError(String),
+}
+
+#[async_trait]
+impl<Req, Resp> AsyncEngine<SingleIn<Req>, ManyOut<Annotated<Resp>>, Error>
+    for PythonServerStreamingEngine
+where
+    Req: Data + Serialize,
+    Resp: Data + for<'de> Deserialize<'de>,
+{
+    async fn generate(&self, request: SingleIn<Req>) -> Result<ManyOut<Annotated<Resp>>, Error> { // Trait entry point: delegates to generate_in_parts and wraps the returned parts in a ResponseStream.
+        let (stream, context) = self.generate_in_parts(request).await?; // `?` propagates the helper's error to the caller
+        Ok(ResponseStream::new(Box::pin(stream), context)) // NOTE(review): generate_in_parts already returns a Box::pin'd stream; the revisions below pass it through without boxing again
     }
-    async fn generate(&self, request: SingleIn<Req>) -> Result<ManyOut<Annotated<Resp>>, Error> {
-        let (stream, context) = self.generate_in_parts(request).await?;
-        Ok(ResponseStream::new(Box::pin(stream), context))
-    }
+    async fn generate(&self, request: SingleIn<Req>) -> Result<ManyOut<Annotated<Resp>>, Error> {
+        let (stream, context) = self.generate_in_parts(request).await?;
+        Ok(ResponseStream::new(stream, context))
+    }
-    async fn generate(&self, request: SingleIn<Req>) -> Result<ManyOut<Annotated<Resp>>, Error> {
-        let (stream, context) = self.generate_in_parts(request).await?;
-        Ok(ResponseStream::new(Box::pin(stream), context))
-    }
+    async fn generate(&self, request: SingleIn<Req>) -> Result<ManyOut<Annotated<Resp>>, Error> {
+        let (stream, context) = self.generate_in_parts(request).await?;
+        Ok(ResponseStream::new(stream, context))
+    }
 }
 

@@ -177,8 +177,29 @@ where
     Resp: Data + for<'de> Deserialize<'de>,
 {
     async fn generate(&self, request: SingleIn<Req>) -> Result<ManyOut<Annotated<Resp>>, Error> {
-        match self.0.generate(request).await {
-            Ok(res) => Ok(res),
+        match self.0 .0.generate_in_parts(request).await {
+            Ok((mut stream, context)) => {
+                let request_id = context.id().to_string();
+                let first_item = match futures::StreamExt::next(&mut stream).await {
+                    // TODO - item may still contain an Annotated error. How do we want to handle that?
+                    // TODO - should we be returning an HttpError here?
+                    Some(item) => item,
+                    None => {
+                        let error_msg = "python async generator stream ended before processing started";
+                        tracing::warn!(request_id, error_msg);
+                        return Err(Error::new(std::io::Error::new(
+                            std::io::ErrorKind::UnexpectedEof,
+                            error_msg,
+                        )));
+                    }
+                };
+
+                // Create a new stream that yields the first item followed by the rest of the original stream
+                let once_stream = futures::stream::once(async { first_item });
+                let stream = futures::StreamExt::chain(once_stream, stream);
+
+                Ok(ResponseStream::new(Box::pin(stream), context))
+            }
-        match self.0 .0.generate_in_parts(request).await {
-            Ok((mut stream, context)) => {
-                let request_id = context.id().to_string();
-                let first_item = match futures::StreamExt::next(&mut stream).await {
-                    // TODO - item may still contain an Annotated error. How do we want to handle that?
-                    // TODO - should we be returning an HttpError here?
-                    Some(item) => item,
-                    None => {
-                        let error_msg = "python async generator stream ended before processing started";
-                        tracing::warn!(request_id, error_msg);
-                        return Err(Error::new(std::io::Error::new(
-                            std::io::ErrorKind::UnexpectedEof,
-                            error_msg,
-                        )));
-                    }
-                };
-
-                // Create a new stream that yields the first item followed by the rest of the original stream
-                let once_stream = futures::stream::once(async { first_item });
-                let stream = futures::StreamExt::chain(once_stream, stream);
-
-                Ok(ResponseStream::new(Box::pin(stream), context))
-            }
+        match self.0 .0.generate_in_parts(request).await {
+            Ok((mut stream, context)) => {
+                let request_id = context.id().to_string();
+                let first_item = match futures::StreamExt::next(&mut stream).await {
+                    // TODO - item may still contain an Annotated error. How do we want to handle that?
+                    // TODO - should we be returning an HttpError here?
+                    Some(item) => item,
+                    None => {
+                        let error_msg = "python async generator stream ended before processing started";
+                        tracing::warn!(%request_id, "{}", error_msg);
+                        return Err(Error::new(std::io::Error::new(
+                            std::io::ErrorKind::UnexpectedEof,
+                            error_msg,
+                        )));
+                    }
+                };
+
+                // Fail fast if Python produced an annotated error as the first item.
+                if first_item.is_error() {
+                    let msg = format!("first stream item was an error; request_id={}", request_id);
+                    // Prefer explicit HTTP error so callers get an HTTP status instead of a 500 IO error.
+                    return Err(http_error::HttpError { code: 500, message: msg })?;
+                }
+
+                // Create a new stream that yields the first item followed by the rest of the original stream
+                let once_stream = futures::stream::once(async { first_item });
+                let stream = futures::StreamExt::chain(once_stream, stream);
+
+                Ok(ResponseStream::new(Box::pin(stream), context))
+            }
-        match self.0 .0.generate_in_parts(request).await {
-            Ok((mut stream, context)) => {
-                let request_id = context.id().to_string();
-                let first_item = match futures::StreamExt::next(&mut stream).await {
-                    // TODO - item may still contain an Annotated error. How do we want to handle that?
-                    // TODO - should we be returning an HttpError here?
-                    Some(item) => item,
-                    None => {
-                        let error_msg = "python async generator stream ended before processing started";
-                        tracing::warn!(request_id, error_msg);
-                        return Err(Error::new(std::io::Error::new(
-                            std::io::ErrorKind::UnexpectedEof,
-                            error_msg,
-                        )));
-                    }
-                };
-
-                // Create a new stream that yields the first item followed by the rest of the original stream
-                let once_stream = futures::stream::once(async { first_item });
-                let stream = futures::StreamExt::chain(once_stream, stream);
-
-                Ok(ResponseStream::new(Box::pin(stream), context))
-            }
+        match self.0 .0.generate_in_parts(request).await {
+            Ok((mut stream, context)) => {
+                let request_id = context.id().to_string();
+                let first_item = match futures::StreamExt::next(&mut stream).await {
+                    // TODO - item may still contain an Annotated error. How do we want to handle that?
+                    // TODO - should we be returning an HttpError here?
+                    Some(item) => item,
+                    None => {
+                        let error_msg = "python async generator stream ended before processing started";
+                        tracing::warn!(%request_id, "{}", error_msg);
+                        return Err(Error::new(std::io::Error::new(
+                            std::io::ErrorKind::UnexpectedEof,
+                            error_msg,
+                        )));
+                    }
+                };
+
+                // Fail fast if Python produced an annotated error as the first item.
+                if first_item.is_error() {
+                    let msg = format!("first stream item was an error; request_id={}", request_id);
+                    // Prefer explicit HTTP error so callers get an HTTP status instead of a 500 IO error.
+                    return Err(http_error::HttpError { code: 500, message: msg })?;
+                }
+
+                // Create a new stream that yields the first item followed by the rest of the original stream
+                let once_stream = futures::stream::once(async { first_item });
+                let stream = futures::StreamExt::chain(once_stream, stream);
+
+                Ok(ResponseStream::new(Box::pin(stream), context))
+            }
 
             // Inspect the error - if it was an HttpError from Python, extract the code and message
             // and return the rust version of HttpError

@@ -377,6 +377,12 @@ impl DistributedRuntime {
         self.inner.runtime().shutdown();
     }
 
+    fn child_token(&self) -> CancellationToken { // Returns a CancellationToken wrapper around a child of the runtime's token.
+        CancellationToken {
+            inner: self.inner.runtime().child_token(), // child token requested from the underlying runtime
+        }
+    }
+
     fn event_loop(&self) -> PyObject { // Returns a clone of the event loop object stored on this runtime.
         self.event_loop.clone()
     }

diff --git a/lib/bindings/python/src/dynamo/_core.pyi b/lib/bindings/python/src/dynamo/_core.pyi
@@ -52,6 +52,30 @@ class DistributedRuntime:
         Shutdown the runtime by triggering the cancellation token
         """
         ...
+
+    def child_token(self) -> CancellationToken:
+        """
+        Return a new CancellationToken that is a child of the runtime's own cancellation token
+        """
+        ...
+
+class CancellationToken:
+    """
+    A cancellation token for coordinating shutdown across components; obtained via DistributedRuntime.child_token()
+    """
+
+    def cancel(self) -> None:
+        """
+        Cancel this token (presumably wakes any pending cancelled() waiters - TODO confirm)
+        """
+        ...
+
+    async def cancelled(self) -> None:
+        """
+        Coroutine: await it to wait until this token has been cancelled
+        """
+        ...
+
 class EtcdClient:
     """
     Etcd is used for discovery in the DistributedRuntime