From cb9d617aff3135143affbc7f0f6431c403a7503b Mon Sep 17 00:00:00 2001 From: Andrew Ray Date: Mon, 24 Jul 2017 15:59:00 -0500 Subject: [PATCH] python summary --- python/pyspark/sql/dataframe.py | 61 ++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 944739bcd207..403f315f917b 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -927,7 +927,7 @@ def _sort_cols(self, cols, kwargs): @since("1.3.1") def describe(self, *cols): - """Computes statistics for numeric and string columns. + """Computes basic statistics for numeric and string columns. This include count, mean, stddev, min, and max. If no columns are given, this function computes statistics for all numerical or string columns. @@ -955,12 +955,71 @@ def describe(self, *cols): | min| 2|Alice| | max| 5| Bob| +-------+------------------+-----+ + + Use summary for expanded statistics and control over which statistics to compute. """ if len(cols) == 1 and isinstance(cols[0], list): cols = cols[0] jdf = self._jdf.describe(self._jseq(cols)) return DataFrame(jdf, self.sql_ctx) + @since("2.3.0") + def summary(self, *statistics): + """Computes specified statistics for numeric and string columns. Available statistics are: + - count + - mean + - stddev + - min + - max + - arbitrary approximate percentiles specified as a percentage (e.g., 75%) + + If no statistics are given, this function computes count, mean, stddev, min, + approximate quartiles (percentiles at 25%, 50%, and 75%), and max. + + .. note:: This function is meant for exploratory data analysis, as we make no + guarantee about the backward compatibility of the schema of the resulting DataFrame. 
+ + >>> df.summary().show() + +-------+------------------+-----+ + |summary| age| name| + +-------+------------------+-----+ + | count| 2| 2| + | mean| 3.5| null| + | stddev|2.1213203435596424| null| + | min| 2|Alice| + | 25%| 5.0| null| + | 50%| 5.0| null| + | 75%| 5.0| null| + | max| 5| Bob| + +-------+------------------+-----+ + + >>> df.summary("count", "min", "25%", "75%", "max").show() + +-------+---+-----+ + |summary|age| name| + +-------+---+-----+ + | count| 2| 2| + | min| 2|Alice| + | 25%|5.0| null| + | 75%|5.0| null| + | max| 5| Bob| + +-------+---+-----+ + + To do a summary for specific columns first select them: + + >>> df.select("age", "name").summary("count").show() + +-------+---+----+ + |summary|age|name| + +-------+---+----+ + | count| 2| 2| + +-------+---+----+ + + See also describe for basic statistics. + """ + if len(statistics) == 1 and isinstance(statistics[0], list): + statistics = statistics[0] + jdf = self._jdf.summary(self._jseq(statistics)) + return DataFrame(jdf, self.sql_ctx) + @ignore_unicode_prefix @since(1.3) def head(self, n=None):