Make from_dict more flexible, and add from_pytree
ColCarroll committed Mar 13, 2024
commit c68c913a4c09111bb81250794fb56e51b230545c
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -3,6 +3,7 @@
## v0.x.x Unreleased

### New features
- Add support for `pytree`s and robust handling of nested dictionaries. (2291)

### Maintenance and fixes
- Fix deprecations introduced in latest pandas and xarray versions, and prepare for numpy 2.0 ones ([2315](https://github.com/arviz-devs/arviz/pull/2315))
3 changes: 2 additions & 1 deletion arviz/data/__init__.py
@@ -7,7 +7,7 @@
from .io_cmdstan import from_cmdstan
from .io_cmdstanpy import from_cmdstanpy
from .io_datatree import from_datatree, to_datatree
from .io_dict import from_dict
from .io_dict import from_dict, from_pytree
from .io_emcee import from_emcee
from .io_json import from_json, to_json
from .io_netcdf import from_netcdf, to_netcdf
@@ -38,6 +38,7 @@
"from_cmdstanpy",
"from_datatree",
"from_dict",
"from_pytree",
"from_json",
"from_pyro",
"from_numpyro",
79 changes: 76 additions & 3 deletions arviz/data/base.py
@@ -8,6 +8,7 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar, Union

import numpy as np
import tree
import xarray as xr

try:
@@ -67,6 +68,48 @@ def wrapped(cls: RequiresArgTypeT) -> Optional[RequiresReturnTypeT]:
return wrapped


def _yield_flat_up_to(shallow_tree, input_tree, path=()):
"""Yields (path, value) pairs of input_tree flattened up to shallow_tree.

Adapted from dm-tree (https://github.com/google-deepmind/tree) to allow
lists as leaves.

Args:
shallow_tree: Nested structure. Traverse no further than its leaf nodes.
input_tree: Nested structure. Return the paths and values from this tree.
Must have the same upper structure as shallow_tree.
path: Tuple. Optional argument, only used when recursing. The path from the
root of the original shallow_tree, down to the root of the shallow_tree
arg of this recursive call.

Yields:
Pairs of (path, value), where path is the tuple path of a leaf node in
shallow_tree, and value is the value of the corresponding node in
input_tree.
"""
# pylint: disable=protected-access
if isinstance(shallow_tree, tree._TEXT_OR_BYTES) or not (
isinstance(shallow_tree, tree.collections_abc.Mapping)
or tree._is_namedtuple(shallow_tree)
or tree._is_attrs(shallow_tree)
):
yield (path, input_tree)
else:
input_tree = dict(tree._yield_sorted_items(input_tree))
for shallow_key, shallow_subtree in tree._yield_sorted_items(shallow_tree):
subpath = path + (shallow_key,)
input_subtree = input_tree[shallow_key]
for leaf_path, leaf_value in _yield_flat_up_to(
shallow_subtree, input_subtree, path=subpath
):
yield (leaf_path, leaf_value)
# pylint: enable=protected-access


def _flatten_with_path(structure):
return list(_yield_flat_up_to(structure, structure))


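As a quick illustration of the helpers above (not part of the diff, and assuming `_flatten_with_path` is importable from `arviz.data.base` with dm-tree installed): nested keys become tuple paths, and lists are kept as leaves rather than flattened further.

```python
from arviz.data.base import _flatten_with_path  # private helper, shown for illustration

nested = {"top": {"a": [1.0, 2.0], "b": 3.0}, "d": 4.0}

# Keys are traversed in sorted order; lists stay intact as leaf values.
print(_flatten_with_path(nested))
# expected: [(('d',), 4.0), (('top', 'a'), [1.0, 2.0]), (('top', 'b'), 3.0)]
```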
def generate_dims_coords(
shape,
var_name,
@@ -255,7 +298,7 @@ def numpy_to_data_array(
return xr.DataArray(ary, coords=coords, dims=dims)


def dict_to_dataset(
def pytree_to_dataset(
data,
*,
attrs=None,
@@ -266,11 +309,34 @@
index_origin=None,
skip_event_dims=None,
):
"""Convert a dictionary of numpy arrays to an xarray.Dataset.
"""Convert a pytree of numpy arrays to an xarray.Dataset.

See https://jax.readthedocs.io/en/latest/pytrees.html for what a pytree is;
at a minimum this includes dictionaries and tuple types.

In case of nested pytrees, the variable name will be a tuple of individual names.

For example,

pytree_to_dataset({'top': {'second': 1.}, 'top2': 1.})

will have `var_names` `('top', 'second')` and `'top2'`.

Dimensions and coordinates can be defined as usual:

datadict = {
"top": {"a": np.random.randn(100), "b": np.random.randn(1, 100, 10)},
"d": np.random.randn(100),
}
dataset = convert_to_dataset(datadict,
coords={"c": np.arange(10)},
dims={("top", "b"): ["c"]})

Then `dataset.data_vars` will be `('top', 'a'), ('top', 'b'), 'd'`.

Parameters
----------
data : dict[str] -> ndarray
data : pytree
Data to convert. Keys are variable names.
attrs : dict
Json serializable metadata to attach to the dataset, in addition to defaults.
@@ -302,6 +368,10 @@ def dict_to_dataset(
"""
if dims is None:
dims = {}
try:
data = {k[0] if len(k) == 1 else k: v for k, v in _flatten_with_path(data)}
except TypeError: # probably unsortable keys -- the function will still work if
pass # it is an honest dictionary.

data_vars = {
key: numpy_to_data_array(
@@ -318,6 +388,9 @@
return xr.Dataset(data_vars=data_vars, attrs=make_attrs(attrs=attrs, library=library))


dict_to_dataset = pytree_to_dataset


def make_attrs(attrs=None, library=None):
"""Make standard attributes to attach to xarray datasets.

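For reference, a minimal sketch of calling the renamed `pytree_to_dataset` directly, following the docstring example above and the new test added below (assumes a build of ArviZ that includes this change):

```python
import numpy as np
from arviz.data.base import pytree_to_dataset

datadict = {
    "top": {"a": np.random.randn(100), "b": np.random.randn(1, 100, 10)},
    "d": np.random.randn(100),
}
# Dims for nested variables are keyed by their flattened tuple name.
dataset = pytree_to_dataset(
    datadict,
    coords={"c": np.arange(10)},
    dims={("top", "b"): ["c"]},
)
print(set(dataset.data_vars))  # expected: {('top', 'a'), ('top', 'b'), 'd'}
```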
4 changes: 4 additions & 0 deletions arviz/data/converters.py
@@ -1,5 +1,6 @@
"""High level conversion functions."""
import numpy as np
import tree
import xarray as xr

from .base import dict_to_dataset
@@ -105,6 +106,8 @@ def convert_to_inference_data(obj, *, group="posterior", coords=None, dims=None,
dataset = obj.to_dataset()
elif isinstance(obj, dict):
dataset = dict_to_dataset(obj, coords=coords, dims=dims)
elif tree.is_nested(obj) and not isinstance(obj, (list, tuple)):
dataset = dict_to_dataset(obj, coords=coords, dims=dims)
elif isinstance(obj, np.ndarray):
dataset = dict_to_dataset({"x": obj}, coords=coords, dims=dims)
elif isinstance(obj, (list, tuple)) and isinstance(obj[0], str) and obj[0].endswith(".csv"):
@@ -118,6 +121,7 @@ def convert_to_inference_data(obj, *, group="posterior", coords=None, dims=None,
"xarray dataarray",
"xarray dataset",
"dict",
"pytree",
"netcdf filename",
"numpy array",
"pystan fit",
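To show the converter path end to end, here is a sketch with illustrative variable names: a plain nested dictionary still enters the existing `dict` branch, which now flattens nested keys, while the new `tree.is_nested` branch extends the same handling to other non-list pytrees.

```python
import numpy as np
import arviz as az

posterior = {
    "mu": np.random.randn(4, 500),                   # (chain, draw)
    "group": {"theta": np.random.randn(4, 500, 3)},  # nested -> ('group', 'theta')
}
idata = az.convert_to_inference_data(
    posterior,
    coords={"school": np.arange(3)},
    dims={("group", "theta"): ["school"]},
)
print(set(idata.posterior.data_vars))  # expected: {'mu', ('group', 'theta')}
```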
3 changes: 3 additions & 0 deletions arviz/data/io_dict.py
@@ -458,3 +458,6 @@ def from_dict(
attrs=attrs,
**kwargs,
).to_inference_data()


from_pytree = from_dict
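Because `from_pytree` is an alias of `from_dict`, the usual keyword groups apply unchanged; a short sketch with illustrative names, where nested entries become tuple-named variables:

```python
import numpy as np
import arviz as az

idata = az.from_pytree(
    posterior={
        "level1": {"a": np.random.randn(4, 100)},  # -> variable ('level1', 'a')
        "b": np.random.randn(4, 100),
    },
    posterior_predictive={"y": np.random.randn(4, 100)},
)
print(idata.posterior.data_vars)
```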
2 changes: 1 addition & 1 deletion arviz/plots/backends/matplotlib/pairplot.py
@@ -333,7 +333,7 @@ def plot_pair(
if reference_values:
x_name = flat_var_names[i]
y_name = flat_var_names[j + not_marginals]
if x_name and y_name not in difference:
if (x_name not in difference) and (y_name not in difference):
ax[j, i].plot(
reference_values_copy[x_name],
reference_values_copy[y_name],
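The pairplot change fixes an operator-precedence bug: `x_name and y_name not in difference` parses as `x_name and (y_name not in difference)`, so membership of `x_name` in `difference` was never checked. A small illustration with made-up values:

```python
difference = {"x"}           # variables that should not get reference lines
x_name, y_name = "x", "y"

buggy = x_name and y_name not in difference  # only truth-tests x_name -> True
fixed = (x_name not in difference) and (y_name not in difference)  # -> False
print(bool(buggy), fixed)  # True False
```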
14 changes: 14 additions & 0 deletions arviz/tests/base_tests/test_data.py
@@ -1077,6 +1077,20 @@ def test_dict_to_dataset():
assert set(dataset.b.coords) == {"chain", "draw", "c"}


def test_nested_dict_to_dataset():
datadict = {
"top": {"a": np.random.randn(100), "b": np.random.randn(1, 100, 10)},
"d": np.random.randn(100),
}
dataset = convert_to_dataset(datadict, coords={"c": np.arange(10)}, dims={("top", "b"): ["c"]})
assert set(dataset.data_vars) == {("top", "a"), ("top", "b"), "d"}
assert set(dataset.coords) == {"chain", "draw", "c"}

assert set(dataset[("top", "a")].coords) == {"chain", "draw"}
assert set(dataset[("top", "b")].coords) == {"chain", "draw", "c"}
assert set(dataset.d.coords) == {"chain", "draw"}


def test_dict_to_dataset_event_dims_error():
datadict = {"a": np.random.randn(1, 100, 10)}
coords = {"b": np.arange(10), "c": ["x", "y", "z"]}
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,6 +4,7 @@ numpy>=1.22.0,<2.0
scipy>=1.8.0
packaging
pandas>=1.4.0
dm-tree>=0.1.8
xarray>=0.21.0
h5netcdf>=1.0.2
typing_extensions>=4.1.0