add comp argument for RDD.max() and RDD.min()
davies committed Aug 22, 2014
commit dd91e08a92ebace863506cdfe52114ffeec894c9
36 changes: 29 additions & 7 deletions python/pyspark/rdd.py
@@ -810,23 +810,45 @@ def func(iterator):
 
         return self.mapPartitions(func).fold(zeroValue, combOp)
 
-    def max(self):
+    def max(self, comp=None):
         """
         Find the maximum item in this RDD.
 
-        >>> sc.parallelize([1.0, 5.0, 43.0, 10.0]).max()
+        @param comp: A function used to compare two elements, the builtin `cmp`

nit - the builtin 'max'

Contributor Author
I think cmp is the function used in max or min, so cmp is the default value for comp.

cmp may be used in max, but for this func the default is on line 829. Either way, a minor nitpick.

Contributor Author
Yes, using comp here is a bit confusing. The builtin min uses key, which would be better for Python programmers, but it would be different from the Scala API.

cc @mateiz @rxin @JoshRosen

Contributor Author
We already use key in Python instead of Ordering in Scala, so I have changed it to key.

Also, I would like to add key to top(); it would be helpful, for example:

rdd.map(lambda x: (x, 1)).reduce(add).top(20, key=itemgetter(1))

We already have ord in Scala. Should I add this in this PR?

+            will be used by default.
+
+        >>> rdd = sc.parallelize([1.0, 5.0, 43.0, 10.0])
+        >>> rdd.max()
         43.0
+        >>> rdd.max(lambda a, b: cmp(str(a), str(b)))
+        5.0
         """
-        return self.reduce(max)
+        if comp is not None:
+            func = lambda a, b: a if comp(a, b) >= 0 else b
+        else:
+            func = max
+
+        return self.reduce(func)
 
-    def min(self):
+    def min(self, comp=None):
         """
         Find the minimum item in this RDD.
 
-        >>> sc.parallelize([1.0, 5.0, 43.0, 10.0]).min()
-        1.0
+        @param comp: A function used to compare two elements, the builtin `cmp`
nit - the builtin 'min'

+            will be used by default.
+
+        >>> rdd = sc.parallelize([2.0, 5.0, 43.0, 10.0])
+        >>> rdd.min()
+        2.0
+        >>> rdd.min(lambda a, b: cmp(str(a), str(b)))
+        10.0
         """
-        return self.reduce(min)
+        if comp is not None:

consider a default of comp=min in the arg list and test for comp is not min

same for the max method

Contributor Author
min and comp have different meanings:

>>> min(1, 2)
1
>>> cmp(1, 2)
-1

+            func = lambda a, b: a if comp(a, b) <= 0 else b
+        else:
+            func = min
+
+        return self.reduce(func)

     def sum(self):
         """
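
Separately, callers who already have a cmp-style comparator, like the cmp(str(a), str(b)) lambdas in the doctests, can bridge to a key function with functools.cmp_to_key. The snippet below is a plain-Python illustration of the semantics (not Spark API); the cmp helper is spelled out so it also runs on Python 3:

from functools import cmp_to_key

def cmp(a, b):
    # Python 2's builtin three-way comparison: -1, 0, or 1
    return (a > b) - (a < b)

data = [2.0, 5.0, 43.0, 10.0]

# A comparator returns -1/0/1 for a pair; a key returns the value to order by.
# cmp_to_key wraps the former so builtins that accept key= can use it.
print(min(data, key=cmp_to_key(lambda a, b: cmp(str(a), str(b)))))  # 10.0

# The point made in the last thread: min picks an element, cmp compares two.
print(min(1, 2))  # 1
print(cmp(1, 2))  # -1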