tdas
diff --git a/‎.rat-excludes‎
Lines changed: 1 addition & 0 deletions b/‎.rat-excludes‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎assembly/pom.xml‎
Lines changed: 11 additions & 1 deletion b/‎assembly/pom.xml‎
Lines changed: 11 additions & 1 deletion
diff --git a/‎bin/compute-classpath.sh‎
Lines changed: 19 additions & 16 deletions b/‎bin/compute-classpath.sh‎
Lines changed: 19 additions & 16 deletions
diff --git a/‎bin/spark-class‎
Lines changed: 0 additions & 2 deletions b/‎bin/spark-class‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎core/pom.xml‎
Lines changed: 47 additions & 2 deletions b/‎core/pom.xml‎
Lines changed: 47 additions & 2 deletions
diff --git a/‎core/src/main/java/org/apache/spark/api/java/StorageLevels.java‎
Lines changed: 33 additions & 13 deletions b/‎core/src/main/java/org/apache/spark/api/java/StorageLevels.java‎
Lines changed: 33 additions & 13 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/SparkContext.scala‎
Lines changed: 8 additions & 8 deletions b/‎core/src/main/scala/org/apache/spark/SparkContext.scala‎
Lines changed: 8 additions & 8 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala‎
Lines changed: 13 additions & 1 deletion b/‎core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala‎
Lines changed: 13 additions & 1 deletion
diff --git a/‎core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala‎
Lines changed: 1 addition & 1 deletion b/‎core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala‎
Lines changed: 4 additions & 2 deletions b/‎core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala‎
Lines changed: 4 additions & 2 deletions
@@ -39,3 +39,4 @@ work
 .*\.q
 golden
 test.out/*
+.*iml
@@ -163,6 +163,16 @@
         </dependency>
       </dependencies>
     </profile>
+    <profile>
+      <id>hive</id>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.spark</groupId>
+          <artifactId>spark-hive_${scala.binary.version}</artifactId>
+          <version>${project.version}</version>
+        </dependency>
+      </dependencies>
+    </profile>
     <profile>
       <id>spark-ganglia-lgpl</id>
       <dependencies>
@@ -208,7 +218,7 @@
           <plugin>
             <groupId>org.codehaus.mojo</groupId>
             <artifactId>buildnumber-maven-plugin</artifactId>
-            <version>1.1</version>
+            <version>1.2</version>
             <executions>
               <execution>
                 <phase>validate</phase>
 
@@ -30,21 +30,7 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
 # Build up classpath
 CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf"
 
-# Support for interacting with Hive.  Since hive pulls in a lot of dependencies that might break
-# existing Spark applications, it is not included in the standard spark assembly.  Instead, we only
-# include it in the classpath if the user has explicitly requested it by running "sbt hive/assembly"
-# Hopefully we will find a way to avoid uber-jars entirely and deploy only the needed packages in
-# the future.
-if [ -f "$FWDIR"/sql/hive/target/scala-$SCALA_VERSION/spark-hive-assembly-*.jar ]; then
-
-  # Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
-  DATANUCLEUSJARS=$(JARS=("$FWDIR/lib_managed/jars"/datanucleus-*.jar); IFS=:; echo "${JARS[*]}")
-  CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
-
-  ASSEMBLY_DIR="$FWDIR/sql/hive/target/scala-$SCALA_VERSION/"
-else
-  ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
-fi
+ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION"
 
 # First check if we have a dependencies jar. If so, include binary classes with the deps jar
 if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
@@ -59,7 +45,7 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
   CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"
 
-  DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*-deps.jar`
+  DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar`
   CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
 else
   # Else use spark-assembly jar from either RELEASE or assembly directory
@@ -71,6 +57,23 @@ else
   CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
 fi
 
+# When Hive support is needed, Datanucleus jars must be included on the classpath.
+# Datanucleus jars do not work if only included in the  uber jar as plugin.xml metadata is lost.
+# Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
+# built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
+# assembly is built for Hive, before actually populating the CLASSPATH with the jars.
+# Note that this check order is faster (by up to half a second) in the case where Hive is not used.
+num_datanucleus_jars=$(ls "$FWDIR"/lib_managed/jars/ | grep "datanucleus-.*\\.jar" | wc -l)
+if [ $num_datanucleus_jars -gt 0 ]; then
+  AN_ASSEMBLY_JAR=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR}
+  num_hive_files=$(jar tvf "$AN_ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null | wc -l)
+  if [ $num_hive_files -gt 0 ]; then
+    echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2
+    DATANUCLEUSJARS=$(echo "$FWDIR/lib_managed/jars"/datanucleus-*.jar | tr " " :)
+    CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
+  fi
+fi
+
 # Add test classes if we're running from SBT or Maven with SPARK_TESTING set to 1
 if [[ $SPARK_TESTING == 1 ]]; then
   CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/test-classes"
 
@@ -154,5 +154,3 @@ if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then
 fi
 
 exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
-
-
@@ -117,12 +117,10 @@
     <dependency>
       <groupId>com.twitter</groupId>
       <artifactId>chill_${scala.binary.version}</artifactId>
-      <version>0.3.1</version>
     </dependency>
     <dependency>
       <groupId>com.twitter</groupId>
       <artifactId>chill-java</artifactId>
-      <version>0.3.1</version>
     </dependency>
     <dependency>
       <groupId>commons-net</groupId>
@@ -200,6 +198,53 @@
       <artifactId>derby</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.tachyonproject</groupId>
+      <artifactId>tachyon</artifactId>
+      <version>0.4.1-thrift</version>
+      <exclusions>
+        <exclusion>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-client</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.curator</groupId>
+          <artifactId>curator-recipes</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.eclipse.jetty</groupId>
+          <artifactId>jetty-jsp</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.eclipse.jetty</groupId>
+          <artifactId>jetty-webapp</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.eclipse.jetty</groupId>
+          <artifactId>jetty-server</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.eclipse.jetty</groupId>
+          <artifactId>jetty-servlet</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>junit</groupId>
+          <artifactId>junit</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.powermock</groupId>
+          <artifactId>powermock-module-junit4</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.powermock</groupId>
+          <artifactId>powermock-api-mockito</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.curator</groupId>
+          <artifactId>curator-test</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
     <dependency>
       <groupId>org.scalatest</groupId>
       <artifactId>scalatest_${scala.binary.version}</artifactId>
 
@@ -23,17 +23,18 @@
  * Expose some commonly useful storage level constants.
  */
 public class StorageLevels {
-  public static final StorageLevel NONE = create(false, false, false, 1);
-  public static final StorageLevel DISK_ONLY = create(true, false, false, 1);
-  public static final StorageLevel DISK_ONLY_2 = create(true, false, false, 2);
-  public static final StorageLevel MEMORY_ONLY = create(false, true, true, 1);
-  public static final StorageLevel MEMORY_ONLY_2 = create(false, true, true, 2);
-  public static final StorageLevel MEMORY_ONLY_SER = create(false, true, false, 1);
-  public static final StorageLevel MEMORY_ONLY_SER_2 = create(false, true, false, 2);
-  public static final StorageLevel MEMORY_AND_DISK = create(true, true, true, 1);
-  public static final StorageLevel MEMORY_AND_DISK_2 = create(true, true, true, 2);
-  public static final StorageLevel MEMORY_AND_DISK_SER = create(true, true, false, 1);
-  public static final StorageLevel MEMORY_AND_DISK_SER_2 = create(true, true, false, 2);
+  public static final StorageLevel NONE = create(false, false, false, false, 1);
+  public static final StorageLevel DISK_ONLY = create(true, false, false, false, 1);
+  public static final StorageLevel DISK_ONLY_2 = create(true, false, false, false, 2);
+  public static final StorageLevel MEMORY_ONLY = create(false, true, false, true, 1);
+  public static final StorageLevel MEMORY_ONLY_2 = create(false, true, false, true, 2);
+  public static final StorageLevel MEMORY_ONLY_SER = create(false, true, false, false, 1);
+  public static final StorageLevel MEMORY_ONLY_SER_2 = create(false, true, false, false, 2);
+  public static final StorageLevel MEMORY_AND_DISK = create(true, true, false, true, 1);
+  public static final StorageLevel MEMORY_AND_DISK_2 = create(true, true, false, true, 2);
+  public static final StorageLevel MEMORY_AND_DISK_SER = create(true, true, false, false, 1);
+  public static final StorageLevel MEMORY_AND_DISK_SER_2 = create(true, true, false, false, 2);
+  public static final StorageLevel OFF_HEAP = create(false, false, true, false, 1);
 
   /**
    * Create a new StorageLevel object.
@@ -42,7 +43,26 @@ public class StorageLevels {
    * @param deserialized saved as deserialized objects, if true
    * @param replication replication factor
    */
-  public static StorageLevel create(boolean useDisk, boolean useMemory, boolean deserialized, int replication) {
-    return StorageLevel.apply(useDisk, useMemory, deserialized, replication);
+  @Deprecated
+  public static StorageLevel create(boolean useDisk, boolean useMemory, boolean deserialized,
+      int replication) {
+    return StorageLevel.apply(useDisk, useMemory, false, deserialized, replication);
+  }
+
+  /**
+   * Create a new StorageLevel object.
+   * @param useDisk saved to disk, if true
+   * @param useMemory saved to memory, if true
+   * @param useOffHeap saved to Tachyon, if true
+   * @param deserialized saved as deserialized objects, if true
+   * @param replication replication factor
+   */
+  public static StorageLevel create(
+    boolean useDisk,
+    boolean useMemory,
+    boolean useOffHeap,
+    boolean deserialized,
+    int replication) {
+    return StorageLevel.apply(useDisk, useMemory, useOffHeap, deserialized, replication);
   }
 }
@@ -19,14 +19,13 @@ package org.apache.spark
 
 import java.io._
 import java.net.URI
-import java.util.{Properties, UUID}
 import java.util.concurrent.atomic.AtomicInteger
-
+import java.util.{Properties, UUID}
+import java.util.UUID.randomUUID
 import scala.collection.{Map, Set}
 import scala.collection.generic.Growable
 import scala.collection.mutable.{ArrayBuffer, HashMap}
 import scala.reflect.{ClassTag, classTag}
-
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, DoubleWritable, FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable}
@@ -130,6 +129,11 @@ class SparkContext(
   val master = conf.get("spark.master")
   val appName = conf.get("spark.app.name")
 
+  // Generate the random name for a temp folder in Tachyon
+  // Add a timestamp as the suffix here to make it more safe
+  val tachyonFolderName = "spark-" + randomUUID.toString()
+  conf.set("spark.tachyonStore.folderName", tachyonFolderName)
+
   val isLocal = (master == "local" || master.startsWith("local["))
 
   if (master == "yarn-client") System.setProperty("SPARK_YARN_MODE", "true")
@@ -393,7 +397,7 @@ class SparkContext(
    *   (a-hdfs-path/part-nnnnn, its content)
    * }}}
    *
-   * @note Small files are perferred, large file is also allowable, but may cause bad performance.
+   * @note Small files are preferred, as each file will be loaded fully in memory.
    */
   def wholeTextFiles(path: String): RDD[(String, String)] = {
     newAPIHadoopFile(
@@ -725,10 +729,6 @@ class SparkContext(
    */
   def getPersistentRDDs: Map[Int, RDD[_]] = persistentRdds.toMap
 
-  def getStageInfo: Map[Stage, StageInfo] = {
-    dagScheduler.stageToInfos
-  }
-
   /**
    * Return information about blocks stored in all of the slaves
    */
 
@@ -17,7 +17,8 @@
 
 package org.apache.spark.api.java
 
-import java.util.{Comparator, List => JList}
+import java.util.{Comparator, Iterator => JIterator, List => JList}
+import java.lang.{Iterable => JIterable}
 
 import scala.collection.JavaConversions._
 import scala.reflect.ClassTag
@@ -280,6 +281,17 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
     new java.util.ArrayList(arr)
   }
 
+  /**
+   * Return an iterator that contains all of the elements in this RDD.
+   *
+   * The iterator will consume as much memory as the largest partition in this RDD.
+   */
+  def toLocalIterator(): JIterator[T] = {
+     import scala.collection.JavaConversions._
+     rdd.toLocalIterator
+  }
+
+
   /**
    * Return an array that contains all of the elements in this RDD.
    * @deprecated As of Spark 1.0.0, toArray() is deprecated, use {@link #collect()} instead
 
@@ -177,7 +177,7 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
    *   (a-hdfs-path/part-nnnnn, its content)
    * }}}
    *
-   * @note Small files are perferred, large file is also allowable, but may cause bad performance.
+   * @note Small files are preferred, as each file will be loaded fully in memory.
    */
   def wholeTextFiles(path: String): JavaPairRDD[String, String] =
     new JavaPairRDD(sc.wholeTextFiles(path))
 
@@ -19,6 +19,7 @@ package org.apache.spark.api.python
 
 import java.io._
 import java.net._
+import java.nio.charset.Charset
 import java.util.{List => JList, ArrayList => JArrayList, Map => JMap, Collections}
 
 import scala.collection.JavaConversions._
@@ -206,6 +207,7 @@ private object SpecialLengths {
 }
 
 private[spark] object PythonRDD {
+  val UTF8 = Charset.forName("UTF-8")
 
   def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int):
   JavaRDD[Array[Byte]] = {
@@ -266,7 +268,7 @@ private[spark] object PythonRDD {
   }
 
   def writeUTF(str: String, dataOut: DataOutputStream) {
-    val bytes = str.getBytes("UTF-8")
+    val bytes = str.getBytes(UTF8)
     dataOut.writeInt(bytes.length)
     dataOut.write(bytes)
   }
@@ -286,7 +288,7 @@ private[spark] object PythonRDD {
 
 private
 class BytesToString extends org.apache.spark.api.java.function.Function[Array[Byte], String] {
-  override def call(arr: Array[Byte]) : String = new String(arr, "UTF-8")
+  override def call(arr: Array[Byte]) : String = new String(arr, PythonRDD.UTF8)
 }
 
 /**
-Original file line number
+Diff line change
 .*\.q
 golden
 test.out/*
 +.*iml
Original file line number	Diff line number	Diff line change
`@@ -177,7 +177,7 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork`
`177`	`177`	`* (a-hdfs-path/part-nnnnn, its content)`
`178`	`178`	`* }}}`
`179`	`179`	`*`
`180`		`- * @note Small files are perferred, large file is also allowable, but may cause bad performance.`
	`180`	`+ * @note Small files are preferred, as each file will be loaded fully in memory.`
`181`	`181`	`*/`
`182`	`182`	`def wholeTextFiles(path: String): JavaPairRDD[String, String] =`
`183`	`183`	`new JavaPairRDD(sc.wholeTextFiles(path))`
Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,7 @@ package org.apache.spark.api.python`
`19`	`19`
`20`	`20`	`import java.io._`
`21`	`21`	`import java.net._`
	`22`	`+import java.nio.charset.Charset`
`22`	`23`	`import java.util.{List => JList, ArrayList => JArrayList, Map => JMap, Collections}`
`23`	`24`
`24`	`25`	`import scala.collection.JavaConversions._`
`@@ -206,6 +207,7 @@ private object SpecialLengths {`
`206`	`207`	`}`
`207`	`208`
`208`	`209`	`private[spark] object PythonRDD {`
	`210`	`+ val UTF8 = Charset.forName("UTF-8")`
`209`	`211`
`210`	`212`	`def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int):`
`211`	`213`	`JavaRDD[Array[Byte]] = {`
`@@ -266,7 +268,7 @@ private[spark] object PythonRDD {`
`266`	`268`	`}`
`267`	`269`
`268`	`270`	`def writeUTF(str: String, dataOut: DataOutputStream) {`
`269`		`- val bytes = str.getBytes("UTF-8")`
	`271`	`+ val bytes = str.getBytes(UTF8)`
`270`	`272`	`dataOut.writeInt(bytes.length)`
`271`	`273`	`dataOut.write(bytes)`
`272`	`274`	`}`
`@@ -286,7 +288,7 @@ private[spark] object PythonRDD {`
`286`	`288`
`287`	`289`	`private`
`288`	`290`	`class BytesToString extends org.apache.spark.api.java.function.Function[Array[Byte], String] {`
`289`		`- override def call(arr: Array[Byte]) : String = new String(arr, "UTF-8")`
	`291`	`+ override def call(arr: Array[Byte]) : String = new String(arr, PythonRDD.UTF8)`
`290`	`292`	`}`
`291`	`293`
`292`	`294`	`/**`