doublevkaytester1
diff --git a/mars/dataframe/arithmetic/bitwise_or.py b/mars/dataframe/arithmetic/bitwise_or.py
Lines changed: 24 additions & 2 deletions
diff --git a/mars/dataframe/arithmetic/core.py b/mars/dataframe/arithmetic/core.py
Lines changed: 14 additions & 0 deletions
diff --git a/mars/dataframe/base/isin.py b/mars/dataframe/base/isin.py
Lines changed: 91 additions & 58 deletions
diff --git a/mars/dataframe/base/tests/test_base.py b/mars/dataframe/base/tests/test_base.py
Lines changed: 9 additions & 9 deletions
diff --git a/mars/dataframe/base/tests/test_base_execution.py b/mars/dataframe/base/tests/test_base_execution.py
Lines changed: 12 additions & 2 deletions
diff --git a/mars/dataframe/core.py b/mars/dataframe/core.py
Lines changed: 2 additions & 0 deletions
@@ -15,8 +15,8 @@
 import operator
 
 from ... import opcodes as OperandDef
-from ...utils import classproperty
-from .core import DataFrameBinopUfunc
+from ...utils import classproperty, TreeReductionBuilder
+from .core import DataFrameBinopUfunc, DataFrameArithmeticTreeMixin
 
 
 class DataFrameOr(DataFrameBinopUfunc):
@@ -36,6 +36,10 @@ def tensor_op_type(self):
         return TensorBitor
 
 
+class DataFrameTreeOr(DataFrameArithmeticTreeMixin, DataFrameOr):
+    # DataFrameOr variant registered under the TREE_OR opcode. The mixin is
+    # listed first, so its methods take precedence over DataFrameOr's in the
+    # MRO; the class body itself only sets the opcode.
+    _op_type_ = OperandDef.TREE_OR
+
+
 def bitor(df, other, axis="columns", level=None, fill_value=None):
     # Build a DataFrameOr operand with df as lhs and other as rhs, then
     # apply it to (df, other).
     op = DataFrameOr(axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other)
     return op(df, other)
@@ -44,3 +48,21 @@ def bitor(df, other, axis="columns", level=None, fill_value=None):
 def rbitor(df, other, axis="columns", level=None, fill_value=None):
     # Reflected form of bitor: other is the lhs and df the rhs, and the
     # operand is applied through rcall instead of a direct call.
     op = DataFrameOr(axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df)
     return op.rcall(df, other)
+
+
+def tree_dataframe_or(
+    *args, index=None, combine_size=None, axis="columns", level=None, fill_value=None
+):
+    """
+    Combine ``args`` with DataFrameTreeOr operands via a TreeReductionBuilder.
+
+    Parameters
+    ----------
+    *args
+        Inputs handed to the builder's ``build`` as one sequence.
+    index
+        Stored as ``params["index"]`` on every chunk the builder creates.
+    combine_size
+        Passed unchanged to the builder's constructor.
+    axis, level, fill_value
+        Forwarded to each DataFrameTreeOr operand.
+
+    Returns
+    -------
+    Whatever ``build`` returns (presumably the final reduced chunk --
+    confirm against TreeReductionBuilder).
+    """
+    # NOTE(review): the name "MultiplyBuilder" looks copied from a multiply
+    # tree builder; this class only ever creates DataFrameTreeOr operands.
+    class MultiplyBuilder(TreeReductionBuilder):
+        def _build_reduction(self, inputs, final=False):
+            # ``final`` is accepted but not used by this implementation.
+            op = DataFrameTreeOr(
+                axis=axis,
+                level=level,
+                fill_value=fill_value,
+                output_types=inputs[0].op.output_types,
+            )
+            # Start from a copy of the first input's params and override
+            # only the index with the one requested by the caller.
+            params = inputs[0].params.copy()
+            params["index"] = index
+            return op.new_chunk(inputs, **params)
+
+    return MultiplyBuilder(combine_size).build(args)
@@ -14,6 +14,7 @@
 
 import itertools
 import copy
+from functools import reduce
 
 import numpy as np
 import pandas as pd
@@ -618,6 +619,8 @@ def _new_chunks(self, inputs, kws=None, **kw):
                 inp, (DATAFRAME_CHUNK_TYPE, SERIES_CHUNK_TYPE, TENSOR_CHUNK_TYPE)
             )
         ]
+        # infer properties from the first two inputs only (a tree operand may have more)
+        property_inputs = property_inputs[:2]
         if len(property_inputs) == 1:
             properties = self._calc_properties(*property_inputs)
         elif any(inp.ndim == 2 for inp in property_inputs):
@@ -871,6 +874,17 @@ def __call__(self, df):
             )
 
 
+class DataFrameArithmeticTreeMixin:
+    # Mixin whose execute folds all of an operand's inputs together with the
+    # operand's ``_operator``.
+    @classmethod
+    def execute(cls, ctx, op):
+        # Fetch the data of every input from ctx by key, fold them left to
+        # right with op._operator, and store the result under the key of the
+        # first output.
+        inputs = [ctx[c.key] for c in op.inputs]
+        ctx[op.outputs[0].key] = reduce(op._operator, inputs)
+
+    def _set_inputs(self, inputs):
+        # Store the converted inputs directly on ``_inputs`` without calling
+        # super()._set_inputs.
+        # NOTE(review): presumably this bypasses lhs/rhs handling in the
+        # binary-operand base class -- confirm.
+        inputs = self._get_inputs_data(inputs)
+        setattr(self, "_inputs", inputs)
+
+
 class DataFrameUnaryUfunc(DataFrameUnaryOp, TensorUfuncMixin):
     # Assembled purely from its two bases; the class body adds no members.
     pass
 
 
@@ -12,58 +12,48 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import itertools
+
 import numpy as np
 import pandas as pd
 from pandas.api.types import is_list_like
 
 from ... import opcodes as OperandDef
-from ...core import ENTITY_TYPE, recursive_tile
+from ...core import ENTITY_TYPE
 from ...serialization.serializables import KeyField, AnyField
 from ...tensor.core import TENSOR_TYPE
-from ...utils import has_unknown_shape
 from ..core import DATAFRAME_TYPE, SERIES_TYPE, INDEX_TYPE
 from ..operands import DataFrameOperand, DataFrameOperandMixin
 
 
 class DataFrameIsin(DataFrameOperand, DataFrameOperandMixin):
     _op_type_ = OperandDef.ISIN
 
-    _input = KeyField("input")
-    _values = AnyField("values")
-
-    def __init__(self, values=None, output_types=None, **kw):
-        super().__init__(_values=values, _output_types=output_types, **kw)
-
-    @property
-    def input(self):
-        return self._input
-
-    @property
-    def values(self):
-        return self._values
+    input = KeyField("input")
+    values = AnyField("values")
 
     def _set_inputs(self, inputs):
         super()._set_inputs(inputs)
         inputs_iter = iter(self._inputs)
-        self._input = next(inputs_iter)
+        self.input = next(inputs_iter)
         if len(self._inputs) > 1:
-            if isinstance(self._values, dict):
+            if isinstance(self.values, dict):
                 new_values = dict()
-                for k, v in self._values.items():
+                for k, v in self.values.items():
                     if isinstance(v, ENTITY_TYPE):
                         new_values[k] = next(inputs_iter)
                     else:
                         new_values[k] = v
-                self._values = new_values
+                self.values = new_values
             else:
-                self._values = self._inputs[1]
+                self.values = self._inputs[1]
 
     def __call__(self, elements):
         inputs = [elements]
-        if isinstance(self._values, ENTITY_TYPE):
-            inputs.append(self._values)
-        elif isinstance(self._values, dict):
-            for v in self._values.values():
+        if isinstance(self.values, ENTITY_TYPE):
+            inputs.append(self.values)
+        elif isinstance(self.values, dict):
+            for v in self.values.values():
                 if isinstance(v, ENTITY_TYPE):
                     inputs.append(v)
 
@@ -87,47 +77,63 @@ def __call__(self, elements):
                 dtypes=dtypes,
             )
 
+    @classmethod
+    def _tile_entity_values(cls, op):
+        """
+        Tile an isin operand that has inputs besides the elements themselves.
+
+        This is a generator. When any input has more than 4 chunks it first
+        yields the chunks of all entity inputs and then merges chunks with
+        ``auto_merge_chunks``; otherwise the existing chunks are used as-is.
+        Each input chunk is paired with every combination of value chunks,
+        and the partial isin results are combined by ``tree_dataframe_or``.
+
+        Returns the tileables created by ``new_tileables``, with ``nsplits``
+        taken from the (possibly merged) input.
+        """
+        from ..utils import auto_merge_chunks
+        from ..arithmetic.bitwise_or import tree_dataframe_or
+        from ...core.context import get_context
+
+        in_elements = op.input
+        out_elements = op.outputs[0]
+        # values contain Mars objects; chunks_list collects one chunk list
+        # per entity value
+        chunks_list = []
+        in_chunks = in_elements.chunks
+        # 4 is a hard-coded threshold on the chunk count of any single input
+        if any(len(t.chunks) > 4 for t in op.inputs):
+            # yield the entity chunks, then merge chunks to reduce graph nodes
+            # NOTE(review): presumably the yielded chunks are executed so that
+            # auto_merge_chunks can use their metadata -- confirm.
+            yield list(
+                itertools.chain.from_iterable(
+                    t.chunks for t in op.inputs if isinstance(t, ENTITY_TYPE)
+                )
+            )
+            in_elements = auto_merge_chunks(get_context(), op.input)
+            in_chunks = in_elements.chunks
+            for value in op.inputs[1:]:
+                # only DataFrame/Series values are merged; any other entity
+                # keeps its existing chunks
+                if isinstance(value, DATAFRAME_TYPE + SERIES_TYPE):
+                    merged = auto_merge_chunks(get_context(), value)
+                    chunks_list.append(merged.chunks)
+                elif isinstance(value, ENTITY_TYPE):
+                    chunks_list.append(value.chunks)
+        else:
+            for value in op.inputs[1:]:
+                if isinstance(value, ENTITY_TYPE):
+                    chunks_list.append(value.chunks)
+
+        out_chunks = []
+        for in_chunk in in_chunks:
+            isin_chunks = []
+            # one isin chunk per combination of one chunk from each value
+            for value_chunks in itertools.product(*chunks_list):
+                input_chunks = [in_chunk] + list(value_chunks)
+                isin_chunks.append(cls._new_chunk(op, in_chunk, input_chunks))
+            # combine the partial results; the output keeps the input chunk's index
+            out_chunk = tree_dataframe_or(*isin_chunks, index=in_chunk.index)
+            out_chunks.append(out_chunk)
+
+        new_op = op.copy()
+        params = out_elements.params
+        # nsplits come from in_elements, which may have been replaced by the
+        # merged version above
+        params["nsplits"] = in_elements.nsplits
+        params["chunks"] = out_chunks
+        return new_op.new_tileables(op.inputs, kws=[params])
+
     @classmethod
     def tile(cls, op):
         in_elements = op.input
         out_elements = op.outputs[0]
 
-        values_inputs = []
         if len(op.inputs) > 1:
-            for value in op.inputs[1:]:
-                # make sure arg has known shape when it's a md.Series
-                if has_unknown_shape(value):
-                    yield
-                value = yield from recursive_tile(value.rechunk(value.shape))
-                values_inputs.append(value)
+            return (yield from cls._tile_entity_values(op))
 
         out_chunks = []
         for chunk in in_elements.chunks:
-            chunk_op = op.copy().reset_key()
-            chunk_inputs = [chunk]
-            if len(op.inputs) > 1:
-                chunk_inputs.extend(v.chunks[0] for v in values_inputs)
-            if out_elements.ndim == 1:
-                out_chunk = chunk_op.new_chunk(
-                    chunk_inputs,
-                    shape=chunk.shape,
-                    dtype=out_elements.dtype,
-                    index_value=chunk.index_value,
-                    name=out_elements.name,
-                    index=chunk.index,
-                )
-            else:
-                chunk_dtypes = pd.Series(
-                    [np.dtype(bool) for _ in chunk.dtypes], index=chunk.dtypes.index
-                )
-                out_chunk = chunk_op.new_chunk(
-                    chunk_inputs,
-                    shape=chunk.shape,
-                    index_value=chunk.index_value,
-                    columns_value=chunk.columns_value,
-                    dtypes=chunk_dtypes,
-                    index=chunk.index,
-                )
+            out_chunk = cls._new_chunk(op, chunk, [chunk])
             out_chunks.append(out_chunk)
 
         new_op = op.copy()
@@ -136,6 +142,33 @@ def tile(cls, op):
         params["chunks"] = out_chunks
         return new_op.new_tileables(op.inputs, kws=[params])
 
+    @classmethod
+    def _new_chunk(cls, op, chunk, input_chunks):
+        """
+        Create one output chunk of the isin operand.
+
+        ``chunk`` supplies the shape, index_value and index of the result
+        (plus columns_value and the dtypes index in the non-1-d case);
+        ``input_chunks`` become the inputs of the new chunk operand.
+        """
+        out_elements = op.outputs[0]
+        # each chunk gets its own copy of the operand with a reset key
+        chunk_op = op.copy().reset_key()
+        if out_elements.ndim == 1:
+            # 1-d output: dtype and name are taken from the operand's output
+            out_chunk = chunk_op.new_chunk(
+                input_chunks,
+                shape=chunk.shape,
+                dtype=out_elements.dtype,
+                index_value=chunk.index_value,
+                name=out_elements.name,
+                index=chunk.index,
+            )
+        else:
+            # otherwise every column of the chunk is given a bool dtype
+            chunk_dtypes = pd.Series(
+                [np.dtype(bool) for _ in chunk.dtypes], index=chunk.dtypes.index
+            )
+            out_chunk = chunk_op.new_chunk(
+                input_chunks,
+                shape=chunk.shape,
+                index_value=chunk.index_value,
+                columns_value=chunk.columns_value,
+                dtypes=chunk_dtypes,
+                index=chunk.index,
+            )
+        return out_chunk
+
     @classmethod
     def execute(cls, ctx, op):
         inputs_iter = iter(op.inputs)
@@ -222,7 +255,7 @@ def series_isin(elements, values):
             "only list-like objects are allowed to be passed to isin(), "
             f"you passed a [{type(values)}]"
         )
-    op = DataFrameIsin(values)
+    op = DataFrameIsin(values=values)
     return op(elements)
 
 
@@ -298,5 +331,5 @@ def df_isin(df, values):
             "only list-like objects or dict are allowed to be passed to isin(), "
             f"you passed a [{type(values)}]"
         )
-    op = DataFrameIsin(values)
+    op = DataFrameIsin(values=values)
     return op(df)
@@ -681,39 +681,39 @@ def test_series_isin():
         assert c.op.inputs[0].index == (i,)
         assert c.op.inputs[0].shape == (10,)
         assert c.op.inputs[1].index == (0,)
-        assert c.op.inputs[1].shape == (4,)  # has been rechunked
+        assert c.op.inputs[1].shape == (10,)
 
     # multiple chunks in one chunk
-    a = from_pandas_series(pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), chunk_size=2)
+    a = from_pandas_series(pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), chunk_size=5)
     b = from_pandas_series(pd.Series([2, 1, 9, 3]), chunk_size=4)
 
     r = tile(a.isin(b))
     for i, c in enumerate(r.chunks):
         assert c.index == (i,)
         assert c.dtype == np.dtype("bool")
-        assert c.shape == (2,)
+        assert c.shape == (5,)
         assert len(c.op.inputs) == 2
         assert c.op.output_types[0] == OutputType.series
         assert c.op.inputs[0].index == (i,)
-        assert c.op.inputs[0].shape == (2,)
+        assert c.op.inputs[0].shape == (5,)
         assert c.op.inputs[1].index == (0,)
         assert c.op.inputs[1].shape == (4,)
 
     # multiple chunks in multiple chunks
-    a = from_pandas_series(pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), chunk_size=2)
+    a = from_pandas_series(pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), chunk_size=5)
     b = from_pandas_series(pd.Series([2, 1, 9, 3]), chunk_size=2)
 
     r = tile(a.isin(b))
     for i, c in enumerate(r.chunks):
         assert c.index == (i,)
         assert c.dtype == np.dtype("bool")
-        assert c.shape == (2,)
+        assert c.shape == (5,)
         assert len(c.op.inputs) == 2
         assert c.op.output_types[0] == OutputType.series
         assert c.op.inputs[0].index == (i,)
-        assert c.op.inputs[0].shape == (2,)
-        assert c.op.inputs[1].index == (0,)
-        assert c.op.inputs[1].shape == (4,)  # has been rechunked
+        assert c.op.inputs[0].shape == (5,)
+        assert c.op.inputs[1].index == (i,)
+        assert c.op.inputs[1].shape == (5,)
 
     with pytest.raises(TypeError):
         _ = a.isin("sth")
 
@@ -702,7 +702,7 @@ def test_isin_execution(setup):
 
     # multiple chunks in multiple chunks
     a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
-    b = pd.Series([2, 1, 9, 3])
+    b = pd.Series([2, 1, 9, 3] * 2)
     sa = from_pandas_series(a, chunk_size=2)
     sb = from_pandas_series(b, chunk_size=2)
 
@@ -747,7 +747,17 @@ def test_isin_execution(setup):
     pd.testing.assert_frame_equal(result, expected)
 
     # mars object
-    b = tensor([2, 1, raw[1][0]], chunk_size=2)
+    b = tensor([2, 1, raw[1][0]] * 2, chunk_size=2)
+    r = df.isin(b)
+    result = r.execute().fetch()
+    expected = raw.isin([2, 1, raw[1][0]])
+    pd.testing.assert_frame_equal(result, expected)
+
+    # mars object and trigger iterative tiling
+    raw = pd.DataFrame(rs.randint(1000, size=(10, 3)))
+    df = from_pandas_df(raw, chunk_size=(5, 2))
+
+    b = from_pandas_series(pd.Series([raw[1][0]] + list(range(9))), chunk_size=2)
     r = df.isin(b)
     result = r.execute().fetch()
     expected = raw.isin([2, 1, raw[1][0]])
 
@@ -433,6 +433,8 @@ def refresh_index_value(tileable: ENTITY_TYPE):
     index_value._index_value.should_be_monotonic = getattr(
         tileable.index_value, "should_be_monotonic", None
     )
+    # keep the key of the original index_value
+    index_value._index_value._key = tileable.index_value.key
     tileable._index_value = index_value
Original file line number	Diff line number	Diff line change
`@@ -433,6 +433,8 @@ def refresh_index_value(tileable: ENTITY_TYPE):`
`433`	`433`	`index_value._index_value.should_be_monotonic = getattr(`
`434`	`434`	`tileable.index_value, "should_be_monotonic", None`
`435`	`435`	`)`
	`436`	`+ # keep key as original index_value's`
	`437`	`+ index_value._index_value._key = tileable.index_value.key`
`436`	`438`	`tileable._index_value = index_value`
`437`	`439`
`438`	`440`