apache · brkyvz · Apr 30, 2015 · Apr 30, 2015 · Apr 30, 2015 · Apr 30, 2015
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
@@ -44,6 +44,20 @@ def _(col):
     return _
 
 
+def _create_zero_arg_function(name, doc=""):
+    """ Create a function by name that forwards one optional argument (e.g. a seed)."""
+    def _(val=None):
+        sc = SparkContext._active_spark_context
+        if val is not None:  # 0 is a valid seed, so test for None rather than truthiness
+            jc = getattr(sc._jvm.functions, name)(val)
+        else:
+            jc = getattr(sc._jvm.functions, name)()
+        return Column(jc)
+    _.__name__ = name
+    _.__doc__ = doc
+    return _
+
+
 _functions = {
     'lit': 'Creates a :class:`Column` of literal value.',
     'col': 'Returns a :class:`Column` based on the given column name.',
@@ -67,11 +81,18 @@ def _(col):
     'sumDistinct': 'Aggregate function: returns the sum of distinct values in the expression.',
 }
 
+_randfunctions = {  # function name -> docstring, for functions taking an optional argument
+    'rand': 'Generate a random column with i.i.d. samples from U[0.0, 1.0).',
+    'randn': 'Generate a column with i.i.d. samples from the standard normal distribution.',
+}
 
 for _name, _doc in _functions.items():
     globals()[_name] = _create_function(_name, _doc)
+for _name, _doc in _randfunctions.items():  # these wrappers accept an optional argument
+    globals()[_name] = _create_zero_arg_function(_name, _doc)
 del _name, _doc
 __all__ += _functions.keys()
+__all__ += _randfunctions.keys()  # export the generated wrappers as well
 __all__.sort()
 
 

diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
@@ -416,6 +416,16 @@ def assert_close(a, b):
         assert_close([math.hypot(i, 2 * i) for i in range(10)],
                      df.select(functions.hypot(df.a, df.b)).collect())
 
+    def test_rand_functions(self):
+        from pyspark.sql import functions
+        rnd = self.df.select('key', functions.rand()).collect()
+        for row in rnd:
+            assert 0.0 <= row[1] <= 1.0, "got: %s" % row[1]
+        # Seed is a plain int: the `5L` long-literal suffix is a SyntaxError on Python 3.
+        rndn = self.df.select('key', functions.randn(5)).collect()
+        for row in rndn:
+            assert -4.0 <= row[1] <= 4.0, "got: %s" % row[1]
+
     def test_save_and_load(self):
         df = self.df
         tmpPath = tempfile.mkdtemp()

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randfuncs/random.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randfuncs/random.scala
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions.randfuncs
+
+import org.apache.spark.TaskContext
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.types.{DoubleType, DataType}
+import org.apache.spark.util.random.XORShiftRandom
+
+/**
+ * A Random distribution generating expression.
+ * TODO: This can be made generic to generate any type of random distribution, or any type of
+ * StructType.
+ *
+ * Since this expression is stateful, it cannot be a case object.
+ */
+private[sql] abstract class RDG(seed: Long) extends LeafExpression with Serializable {
+  self: Product =>
+
+  /**
+   * Random number generator, seeded with `seed` plus the partition ID of the current task.
+   * Being transient, it is reset every time we serialize and deserialize this expression.
+   */
+  @transient private[this] lazy val rng = new XORShiftRandom(seed + TaskContext.get().partitionId())
+
+  override type EvaluatedType = Double
+
+  override def nullable: Boolean = false
+
+  override def dataType: DataType = DoubleType
+
+  def generateNumber(random: XORShiftRandom): Double  // draws one sample from `random`
+
+  override def eval(input: Row): Double = {
+    generateNumber(rng)  // the input row is not read; each call draws the next sample
+  }
+}
+
+/** Generate a random column with i.i.d. values in [0, 1), drawn via `nextDouble()`. */
+private[sql] case class Rand(seed: Long) extends RDG(seed) {
+  override def generateNumber(random: XORShiftRandom): Double = random.nextDouble()
+}
+
+/** Generate a random column with i.i.d. gaussian values, drawn via `nextGaussian()`. */
+private[sql] case class Randn(seed: Long) extends RDG(seed) {
+  override def generateNumber(random: XORShiftRandom): Double = random.nextGaussian()
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -24,8 +24,9 @@ import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.catalyst.ScalaReflection
 import org.apache.spark.sql.catalyst.analysis.{UnresolvedFunction, Star}
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.randfuncs.{Randn, Rand}
 import org.apache.spark.sql.types._
-
+import org.apache.spark.util.Utils
 
 /**
  * :: Experimental ::
@@ -346,6 +347,34 @@ object functions {
    */
   def not(e: Column): Column = !e
 
+  /**
+   * Generate a random column with i.i.d. samples from U[0.0, 1.0), using the given seed.
+   *
+   * @group normal_funcs
+   */
+  def rand(seed: Long): Column = Rand(seed)
+
+  /**
+   * Generate a random column with i.i.d. samples from U[0.0, 1.0), using a random seed.
+   *
+   * @group normal_funcs
+   */
+  def rand(): Column = rand(Utils.random.nextLong)
+
+  /**
+   * Generate a column with i.i.d. standard normal samples, using the given seed.
+   *
+   * @group normal_funcs
+   */
+  def randn(seed: Long): Column = Randn(seed)
+
+  /**
+   * Generate a column with i.i.d. standard normal samples, using a random seed.
+   *
+   * @group normal_funcs
+   */
+  def randn(): Column = randn(Utils.random.nextLong)
+
   /**
    * Partition ID of the Spark task.
    *

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/mathfunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/mathfunctions.scala
@@ -27,8 +27,6 @@ import org.apache.spark.sql.functions.lit
 /**
  * :: Experimental ::
  * Mathematical Functions available for [[DataFrame]].
- *
- * @groupname double_funcs Functions that require DoubleType as an input
  */
 @Experimental
 // scalastyle:off

diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java
@@ -107,6 +107,9 @@ public void testVarargMethods() {
     df2.select(pow("a", "a"), pow("b", 2.0));
     df2.select(pow(col("a"), col("b")), exp("b"));
     df2.select(sin("a"), acos("b"));
+
+    df2.select(rand(), acos("b"));
+    df2.select(col("*"), randn(5L));
   }
 
   @Ignore

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql
 
+import org.scalatest.Matchers._
+
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.test.TestSQLContext
 import org.apache.spark.sql.test.TestSQLContext.implicits._
@@ -349,4 +351,24 @@ class ColumnExpressionSuite extends QueryTest {
     assert(schema("value").metadata === Metadata.empty)
     assert(schema("abc").metadata === metadata)
   }
+
+  test("rand") {
+    // Append a seeded uniform column to 'key; every sample must lie within [0.0, 1.0].
+    val sampled = testData.select('key, rand(5L).as("rand"))
+    sampled.columns.length should be (2)
+    sampled.collect().foreach { row =>
+      val value = row.getDouble(1)
+      assert(value <= 1.0 && value >= 0.0)
+    }
+  }
+
+  test("randn") {
+    // Append a seeded gaussian column to 'key; with seed 5 all samples stay in [-4.0, 4.0].
+    val sampled = testData.select('key, randn(5L).as("randn"))
+    sampled.columns.length should be (2)
+    sampled.collect().foreach { row =>
+      val value = row.getDouble(1)
+      assert(value <= 4.0 && value >= -4.0)
+    }
+  }
 }