Closed
Changes from 1 commit · 75 commits
d86325f
Initial WIP of PySpark support for SequenceFile and arbitrary Hadoop …
MLnick Dec 9, 2013
4b0a43f
Refactoring utils into own objects. Cleaning up old commented-out code
MLnick Dec 12, 2013
c304cc8
Adding supporting sequencefiles for tests. Cleaning up
MLnick Dec 15, 2013
4e7c9e3
Merge remote-tracking branch 'upstream/master' into pyspark-inputformats
MLnick Dec 15, 2013
818a1e6
Add sequencefile and Hadoop InputFormat support to PythonRDD
MLnick Dec 15, 2013
4294cbb
Add old Hadoop api methods. Clean up and expand comments. Clean up ar…
MLnick Dec 19, 2013
0f5cd84
Remove unused pair UTF8 class. Add comments to msgpack deserializer
MLnick Dec 19, 2013
f1d73e3
mergeConfs returns a copy rather than mutating one of the input argum…
MLnick Dec 19, 2013
4d7ef2e
Fix indentation
MLnick Dec 19, 2013
eb40036
Remove unused comment lines
MLnick Dec 19, 2013
1c8efbc
Merge remote-tracking branch 'upstream/master' into pyspark-inputformats
MLnick Jan 13, 2014
619c0fa
Merge remote-tracking branch 'upstream/master' into pyspark-inputformats
MLnick Jan 20, 2014
703ee65
Add back msgpack
MLnick Jan 20, 2014
174f520
Add back graphx settings
MLnick Jan 20, 2014
795a763
Change name to WriteInputFormatTestDataGenerator. Cleanup some var na…
MLnick Jan 20, 2014
2beeedb
Merge remote-tracking branch 'upstream/master' into pyspark-inputformats
MLnick Feb 8, 2014
97ef708
Remove old writeToStream
MLnick Feb 14, 2014
41856a5
Merge branch 'master' into pyspark-inputformats
MLnick Mar 19, 2014
f2d76a0
Merge branch 'master' into pyspark-inputformats
MLnick Mar 19, 2014
e67212a
Add back msgpack dependency
MLnick Mar 19, 2014
dd57922
Merge remote-tracking branch 'upstream/master' into pyspark-inputformats
MLnick Apr 10, 2014
d72bf18
msgpack
MLnick Apr 10, 2014
0c612e5
Merge branch 'master' into pyspark-inputformats
MLnick Apr 12, 2014
65360d5
Adding test SequenceFiles
MLnick Apr 18, 2014
25da1ca
Add generator for nulls, bools, bytes and maps
MLnick Apr 18, 2014
7237263
Add back msgpack serializer and hadoop file code lost during merging
MLnick Apr 18, 2014
a67dfad
Clean up Msgpack serialization and registering
MLnick Apr 18, 2014
1bbbfb0
Clean up SparkBuild from merge
MLnick Apr 18, 2014
9d2256e
Merge branch 'master' into pyspark-inputformats
MLnick Apr 18, 2014
f6aac55
Bring back msgpack
MLnick Apr 18, 2014
951c117
Merge branch 'master' into pyspark-inputformats
MLnick Apr 19, 2014
b20ec7e
Clean up merge duplicate dependencies
MLnick Apr 19, 2014
4e08983
Clean up docs for PySpark context methods
MLnick Apr 19, 2014
fc5099e
Add Apache license headers
MLnick Apr 19, 2014
31a2fff
Scalastyle fixes
MLnick Apr 21, 2014
450e0a2
Merge branch 'master' into pyspark-inputformats
MLnick Apr 21, 2014
f60959e
Remove msgpack dependency and serializer from PySpark
MLnick Apr 21, 2014
17a656b
remove binary sequencefile for tests
MLnick Apr 21, 2014
1d7c17c
Amend tests to auto-generate sequencefile data in temp dir
MLnick Apr 21, 2014
c0ebfb6
Change sequencefile test data generator to easily be called from PySp…
MLnick Apr 21, 2014
44f2857
Remove msgpack dependency and switch serialization to Pyrolite, plus …
MLnick Apr 21, 2014
e7552fa
Merge branch 'master' into pyspark-inputformats
MLnick Apr 22, 2014
64eb051
Scalastyle fix
MLnick Apr 22, 2014
78978d9
Add doc for SequenceFile and InputFormat support to Python programmin…
MLnick Apr 22, 2014
e001b94
Fix test failures due to ordering
MLnick Apr 23, 2014
bef3afb
Merge remote-tracking branch 'upstream/master' into pyspark-inputformats
MLnick Apr 23, 2014
35b8e3a
Another fix for test ordering
MLnick Apr 23, 2014
5af4770
Merge branch 'master' into pyspark-inputformats
MLnick Apr 23, 2014
077ecb2
Recover earlier changes lost in previous merge for context.py
MLnick Apr 23, 2014
9ef1896
Recover earlier changes lost in previous merge for serializers.py
MLnick Apr 23, 2014
93ef995
Add back context.py changes
MLnick Apr 23, 2014
7caa73a
Merge remote-tracking branch 'upstream/master' into pyspark-inputformats
MLnick May 23, 2014
d0f52b6
Python programming guide
MLnick May 23, 2014
84fe8e3
Python programming guide space formatting
MLnick May 23, 2014
9fe6bd5
Merge remote-tracking branch 'upstream/master' into pyspark-inputformats
MLnick May 31, 2014
15a7d07
Remove default args for key/value classes. Arg names to camelCase
MLnick Jun 3, 2014
01e0813
Merge remote-tracking branch 'upstream/master' into pyspark-inputformats
MLnick Jun 3, 2014
1a4a1d6
Address @mateiz style comments
MLnick Jun 3, 2014
94beedc
Clean up args in PythonRDD. Set key/value converter defaults to None …
MLnick Jun 3, 2014
43eb728
PySpark InputFormats docs into programming guide
MLnick Jun 3, 2014
085b55f
Move input format tests to tests.py and clean up docs
MLnick Jun 3, 2014
5757f6e
Default key/value classes for sequenceFile are None
MLnick Jun 3, 2014
b65606f
Add converter interface
MLnick Jun 4, 2014
2c18513
Add examples for reading HBase and Cassandra InputFormats from Python
MLnick Jun 4, 2014
3f90c3e
Merge remote-tracking branch 'upstream/master' into pyspark-inputformats
MLnick Jun 4, 2014
1eaa08b
HBase -> Cassandra app name oversight
MLnick Jun 4, 2014
eeb8205
Fix path relative to SPARK_HOME in tests
MLnick Jun 4, 2014
365d0be
Make classes private[python]. Add docs and @Experimental annotation t…
MLnick Jun 5, 2014
a985492
Move Converter examples to own package
MLnick Jun 5, 2014
5ebacfa
Update docs for PySpark input formats
MLnick Jun 5, 2014
cde6af9
Parameterize converter trait
MLnick Jun 6, 2014
d150431
Merge remote-tracking branch 'upstream/master' into pyspark-inputformats
MLnick Jun 6, 2014
4c972d8
Add license headers
MLnick Jun 6, 2014
761269b
Address @pwendell comments, simplify default writable conversions and…
MLnick Jun 7, 2014
268df7e
Documentation changes per @pwendell comments
MLnick Jun 8, 2014
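Taken together, the commit series above adds SequenceFile and arbitrary Hadoop InputFormat reading to PySpark. As a minimal usage sketch of the resulting API (the paths, Writable classes, and InputFormat name are illustrative, and the keyword names reflect the later "Arg names to camelCase" commit rather than this merge):

from pyspark import SparkContext

sc = SparkContext(appName="InputFormatExample")

# Read a SequenceFile; Writable keys/values are deserialized into Python objects.
pairs = sc.sequenceFile("/tmp/test/seqfile",
                        keyClass="org.apache.hadoop.io.IntWritable",
                        valueClass="org.apache.hadoop.io.Text")

# Read any new-API Hadoop InputFormat by fully qualified class name.
lines = sc.newAPIHadoopFile(
    "/tmp/test/textfile",
    inputFormatClass="org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    keyClass="org.apache.hadoop.io.LongWritable",
    valueClass="org.apache.hadoop.io.Text")

print(pairs.collect())  # assumes the illustrative input paths actually exist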
Merge remote-tracking branch 'upstream/master' into pyspark-inputformats
Conflicts:
	project/SparkBuild.scala
	python/pyspark/context.py
MLnick committed Jan 13, 2014
commit 1c8efbc4a87cd7d719d8fef4e41781c67b414a6f
10 changes: 6 additions & 4 deletions project/SparkBuild.scala
@@ -245,12 +245,12 @@ object SparkBuild extends Build {
         "org.slf4j" % "slf4j-api" % slf4jVersion,
         "org.slf4j" % "slf4j-log4j12" % slf4jVersion,
         "commons-daemon" % "commons-daemon" % "1.0.10", // workaround for bug HADOOP-9407
-        "com.ning" % "compress-lzf" % "1.0.0",
+        "com.ning" % "compress-lzf" % "0.8.4",
         "org.xerial.snappy" % "snappy-java" % "1.0.5",
         "org.ow2.asm" % "asm" % "4.0",
-        "org.spark-project.akka" %% "akka-remote" % "2.2.3-shaded-protobuf" excludeAll(excludeNetty),
-        "org.spark-project.akka" %% "akka-slf4j" % "2.2.3-shaded-protobuf" excludeAll(excludeNetty),
-        "org.spark-project.akka" %% "akka-testkit" % "2.2.3-shaded-protobuf" % "test",
+        "com.google.protobuf" % "protobuf-java" % "2.4.1",
+        "com.typesafe.akka" %% "akka-remote" % "2.2.3" excludeAll(excludeNetty),
+        "com.typesafe.akka" %% "akka-slf4j" % "2.2.3" excludeAll(excludeNetty),
         "net.liftweb" %% "lift-json" % "2.5.1" excludeAll(excludeNetty),
         "it.unimi.dsi" % "fastutil" % "6.4.4",
         "colt" % "colt" % "1.2.0",
@@ -268,6 +268,8 @@ object SparkBuild extends Build {
         "com.codahale.metrics" % "metrics-graphite" % "3.0.0",
         "com.twitter" %% "chill" % "0.3.1",
         "com.twitter" % "chill-java" % "0.3.1",
+        "com.typesafe" % "config" % "1.0.2",
+        "com.clearspring.analytics" % "stream" % "2.5.1",
         "org.msgpack" %% "msgpack-scala" % "0.6.8"
       )
     )
14 changes: 6 additions & 8 deletions python/pyspark/context.py
@@ -50,8 +50,8 @@ class SparkContext(object):
     _python_includes = None # zip and egg files that need to be added to PYTHONPATH


-    def __init__(self, master, jobName, sparkHome=None, pyFiles=None,
-            environment=None, batchSize=1024, serializer=PickleSerializer()):
+    def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
+        environment=None, batchSize=1024, serializer=PickleSerializer(), conf=None):
         """
         Create a new SparkContext. At least the master and app name should be set,
         either through the named parameters here or through C{conf}.
@@ -120,17 +120,15 @@ def __init__(self, master, jobName, sparkHome=None, pyFiles=None,
                 self.environment[varName] = v

         # Create the Java SparkContext through Py4J
-        empty_string_array = self._gateway.new_array(self._jvm.String, 0)
-        self._jsc = self._jvm.JavaSparkContext(master, jobName, sparkHome,
-                                               empty_string_array)
+        self._jsc = self._jvm.JavaSparkContext(self._conf._jconf)

         # Create a single Accumulator in Java that we'll send all our updates through;
         # they will be passed back to us through a TCP server
         self._accumulatorServer = accumulators._start_update_server()
         (host, port) = self._accumulatorServer.server_address
         self._javaAccumulator = self._jsc.accumulator(
-            self._jvm.java.util.ArrayList(),
-            self._jvm.PythonAccumulatorParam(host, port))
+                self._jvm.java.util.ArrayList(),
+                self._jvm.PythonAccumulatorParam(host, port))

         self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')

@@ -465,7 +463,7 @@ def _getJavaStorageLevel(self, storageLevel):

         newStorageLevel = self._jvm.org.apache.spark.storage.StorageLevel
         return newStorageLevel(storageLevel.useDisk, storageLevel.useMemory,
-            storageLevel.deserialized, storageLevel.replication)
+                               storageLevel.deserialized, storageLevel.replication)
Contributor comment on the line above:

This is also reverting an earlier change and will break; you need to keep it the way it was.
def _test():
You are viewing a condensed version of this merge commit.
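For context on the context.py hunks above: the merge adopts the SparkConf-based constructor from upstream master, building the JavaSparkContext from a wrapped SparkConf rather than from positional arguments. A minimal sketch of the two construction styles the new signature allows (master URL and app name are illustrative):

from pyspark import SparkConf, SparkContext

# Style 1: pass master and app name directly; both are now optional keywords.
sc = SparkContext(master="local[2]", appName="ConfExample")
sc.stop()  # only one SparkContext may be active at a time

# Style 2: gather settings in a SparkConf and hand it over via the new conf parameter.
conf = SparkConf().setMaster("local[2]").setAppName("ConfExample")
sc = SparkContext(conf=conf)
sc.stop()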