Skip to content
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
added correct doctest for histogram
  • Loading branch information
dwmclary committed Mar 11, 2014
commit eaf89d957e84d3b926f6c5f3f65acb8764c7ec2f
16 changes: 8 additions & 8 deletions python/pyspark/rdd.py
Original file line number Diff line number Diff line change
Expand Up @@ -613,14 +613,6 @@ def sampleVariance(self):
return self.stats().sampleVariance()

def getBuckets(self, bucketCount):
"""
Compute a histogram of the data using bucketCount number of buckets
evenly spaced between the min and max of the RDD.

>>> sc.parallelize([1,49, 23, 100, 75, 50]).histogram()
{(0,49):3, (50, 100):3}
"""

# use the StatCounter returned by stats() as a quick way of getting the max and min
mm_stats = self.stats()
min = mm_stats.min()
Expand All @@ -634,6 +626,14 @@ def getBuckets(self, bucketCount):
return buckets

def histogram(self, bucketCount, buckets=None):
"""
Compute a histogram of the data using bucketCount number of buckets
evenly spaced between the min and max of the RDD.

>>> sc.parallelize([1,49, 23, 100, 12, 13, 20, 22, 75, 50]).histogram(3)
defaultdict(<type 'int'>, {(67, inf): 2, (1, 33): 6, (34, 66): 2})
"""

evenBuckets = False
if not buckets:
buckets = self.getBuckets(bucketCount)
Expand Down