SPARK-1165 Implemented RDD.intersection in Python.

apache · ScrapCodes · Mar 5, 2014 · Mar 5, 2014 · Mar 7, 2014 · Mar 8, 2014
commit d6effee4ee967f15210d0d57526beab4e3f9c8e2
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
@@ -319,6 +319,19 @@ def union(self, other):
             return RDD(self_copy._jrdd.union(other_copy._jrdd), self.ctx,
                        self.ctx.serializer)
 
+    def intersection(self, other):
+        """
+        Return an RDD of the elements present in both this RDD and `other`.
+
+        >>> rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5])
+        >>> rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8])
+        >>> rdd1.intersection(rdd2).collect()
+        [1, 2, 3]
+        """
+        keyed = [rdd.map(lambda v: (v, None)) for rdd in (self, other)]
+        return keyed[0].cogroup(keyed[1]).filter(
+            lambda kv: not (len(kv[1][0]) == 0 or len(kv[1][1]) == 0)).keys()
+
     def _reserialize(self):
         if self._jrdd_deserializer == self.ctx.serializer:
             return self