[SPARK-5063] More helpful error messages for several invalid operations #3884
Changes from all commits
15b2e6b
57cc8a1
34833e8
99cc09f
b39e041
8d404f3
13afd0f
9f6a0b8
6ef68d0
8cff41a
8e5da69
3f0ea0c
2d0d7f7
a943e00
a38774b
SparkContext.scala
@@ -85,6 +85,14 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
|
|
||
| val startTime = System.currentTimeMillis() | ||
|
|
||
| @volatile private var stopped: Boolean = false | ||
|
|
||
| private def assertNotStopped(): Unit = { | ||
| if (stopped) { | ||
| throw new IllegalStateException("Cannot call methods on a stopped SparkContext") | ||
| } | ||
| } | ||
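As a rough illustration of what this guard changes (not part of the patch; it assumes a locally created SparkContext named `sc`), calls made after `stop()` now fail fast with a descriptive message:

```scala
import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("SPARK-5063 demo"))
sc.stop()

// With assertNotStopped() in place, this fails immediately with
// java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext
sc.parallelize(1 to 10)
```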
|
|
||
| /** | ||
| * Create a SparkContext that loads settings from system properties (for instance, when | ||
| * launching with ./bin/spark-submit). | ||
|
|
@@ -526,6 +534,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| * the argument to avoid this. | ||
| */ | ||
| def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = { | ||
| assertNotStopped() | ||
| new ParallelCollectionRDD[T](this, seq, numSlices, Map[Int, Seq[String]]()) | ||
| } | ||
|
|
||
|
|
@@ -541,6 +550,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| * location preferences (hostnames of Spark nodes) for each object. | ||
| * Create a new partition for each collection item. */ | ||
| def makeRDD[T: ClassTag](seq: Seq[(T, Seq[String])]): RDD[T] = { | ||
| assertNotStopped() | ||
| val indexToPrefs = seq.zipWithIndex.map(t => (t._2, t._1._2)).toMap | ||
| new ParallelCollectionRDD[T](this, seq.map(_._1), seq.size, indexToPrefs) | ||
| } | ||
|
|
@@ -550,6 +560,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| * Hadoop-supported file system URI, and return it as an RDD of Strings. | ||
| */ | ||
| def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String] = { | ||
| assertNotStopped() | ||
|
Author (Contributor): Same for textFile:
||
| hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], | ||
| minPartitions).map(pair => pair._2.toString).setName(path) | ||
| } | ||
|
|
@@ -583,6 +594,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| */ | ||
| def wholeTextFiles(path: String, minPartitions: Int = defaultMinPartitions): | ||
| RDD[(String, String)] = { | ||
| assertNotStopped() | ||
| val job = new NewHadoopJob(hadoopConfiguration) | ||
| NewFileInputFormat.addInputPath(job, new Path(path)) | ||
| val updateConf = job.getConfiguration | ||
|
|
@@ -628,6 +640,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| @Experimental | ||
| def binaryFiles(path: String, minPartitions: Int = defaultMinPartitions): | ||
| RDD[(String, PortableDataStream)] = { | ||
| assertNotStopped() | ||
| val job = new NewHadoopJob(hadoopConfiguration) | ||
| NewFileInputFormat.addInputPath(job, new Path(path)) | ||
| val updateConf = job.getConfiguration | ||
|
|
@@ -652,6 +665,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| @Experimental | ||
| def binaryRecords(path: String, recordLength: Int, conf: Configuration = hadoopConfiguration) | ||
| : RDD[Array[Byte]] = { | ||
| assertNotStopped() | ||
| conf.setInt(FixedLengthBinaryInputFormat.RECORD_LENGTH_PROPERTY, recordLength) | ||
| val br = newAPIHadoopFile[LongWritable, BytesWritable, FixedLengthBinaryInputFormat](path, | ||
| classOf[FixedLengthBinaryInputFormat], | ||
|
|
@@ -685,6 +699,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| valueClass: Class[V], | ||
| minPartitions: Int = defaultMinPartitions | ||
| ): RDD[(K, V)] = { | ||
| assertNotStopped() | ||
| // Add necessary security credentials to the JobConf before broadcasting it. | ||
| SparkHadoopUtil.get.addCredentials(conf) | ||
| new HadoopRDD(this, conf, inputFormatClass, keyClass, valueClass, minPartitions) | ||
|
|
@@ -704,6 +719,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| valueClass: Class[V], | ||
| minPartitions: Int = defaultMinPartitions | ||
| ): RDD[(K, V)] = { | ||
| assertNotStopped() | ||
| // A Hadoop configuration can be about 10 KB, which is pretty big, so broadcast it. | ||
| val confBroadcast = broadcast(new SerializableWritable(hadoopConfiguration)) | ||
| val setInputPathsFunc = (jobConf: JobConf) => FileInputFormat.setInputPaths(jobConf, path) | ||
|
|
@@ -783,6 +799,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| kClass: Class[K], | ||
| vClass: Class[V], | ||
| conf: Configuration = hadoopConfiguration): RDD[(K, V)] = { | ||
| assertNotStopped() | ||
| val job = new NewHadoopJob(conf) | ||
| NewFileInputFormat.addInputPath(job, new Path(path)) | ||
| val updatedConf = job.getConfiguration | ||
|
|
@@ -803,6 +820,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| fClass: Class[F], | ||
| kClass: Class[K], | ||
| vClass: Class[V]): RDD[(K, V)] = { | ||
| assertNotStopped() | ||
| new NewHadoopRDD(this, fClass, kClass, vClass, conf) | ||
| } | ||
|
|
||
|
|
@@ -818,6 +836,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| valueClass: Class[V], | ||
| minPartitions: Int | ||
| ): RDD[(K, V)] = { | ||
| assertNotStopped() | ||
| val inputFormatClass = classOf[SequenceFileInputFormat[K, V]] | ||
| hadoopFile(path, inputFormatClass, keyClass, valueClass, minPartitions) | ||
| } | ||
|
|
@@ -829,9 +848,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| * If you plan to directly cache Hadoop writable objects, you should first copy them using | ||
| * a `map` function. | ||
| * */ | ||
| def sequenceFile[K, V](path: String, keyClass: Class[K], valueClass: Class[V] | ||
| ): RDD[(K, V)] = | ||
| def sequenceFile[K, V](path: String, keyClass: Class[K], valueClass: Class[V]): RDD[(K, V)] = { | ||
| assertNotStopped() | ||
| sequenceFile(path, keyClass, valueClass, defaultMinPartitions) | ||
| } | ||
|
|
||
| /** | ||
| * Version of sequenceFile() for types implicitly convertible to Writables through a | ||
|
|
@@ -859,6 +879,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| (implicit km: ClassTag[K], vm: ClassTag[V], | ||
| kcf: () => WritableConverter[K], vcf: () => WritableConverter[V]) | ||
| : RDD[(K, V)] = { | ||
| assertNotStopped() | ||
| val kc = kcf() | ||
| val vc = vcf() | ||
| val format = classOf[SequenceFileInputFormat[Writable, Writable]] | ||
|
|
@@ -880,6 +901,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| path: String, | ||
| minPartitions: Int = defaultMinPartitions | ||
| ): RDD[T] = { | ||
| assertNotStopped() | ||
| sequenceFile(path, classOf[NullWritable], classOf[BytesWritable], minPartitions) | ||
| .flatMap(x => Utils.deserialize[Array[T]](x._2.getBytes, Utils.getContextOrSparkClassLoader)) | ||
| } | ||
|
|
@@ -955,6 +977,13 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| * The variable will be sent to each cluster only once. | ||
| */ | ||
| def broadcast[T: ClassTag](value: T): Broadcast[T] = { | ||
|
Author (Contributor): Broadcast, on the other hand, throws an NPE:
||
| assertNotStopped() | ||
| if (classOf[RDD[_]].isAssignableFrom(classTag[T].runtimeClass)) { | ||
|
Author (Contributor): Actually, maybe this check should go somewhere else, since I think that it might technically have been safe to create a broadcast variable with an RDD, even though doing anything with it would trigger errors.
Author (Contributor): I've changed this in my latest patch; we log a warning here, and any errors are caught by the more general "display an error about RDD nesting if the …
||
| // This is a warning instead of an exception in order to avoid breaking user programs that | ||
| // might have created RDD broadcast variables but not used them: | ||
| logWarning("Can not directly broadcast RDDs; instead, call collect() and " | ||
| + "broadcast the result (see SPARK-5063)") | ||
| } | ||
| val bc = env.broadcastManager.newBroadcast[T](value, isLocal) | ||
| val callSite = getCallSite | ||
| logInfo("Created broadcast " + bc.id + " from " + callSite.shortForm) | ||
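A short sketch of the pattern the new warning steers users toward (illustrative only; `sc` is assumed to be a live SparkContext, and the variable names are made up):

```scala
val rdd = sc.parallelize(1 to 100)

// Broadcasting the RDD handle itself now logs the warning above instead of failing later
// in a confusing way when the broadcast value is used on executors.
val bad = sc.broadcast(rdd)

// Recommended per the message: materialize the data on the driver, then broadcast the result.
val lookup = sc.broadcast(rdd.collect())
sc.parallelize(1 to 10).map(x => x + lookup.value.length).collect()
```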
|
|
@@ -1047,6 +1076,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| * memory available for caching. | ||
| */ | ||
| def getExecutorMemoryStatus: Map[String, (Long, Long)] = { | ||
|
Author (Contributor): This throws an error, so I'll keep it:
||
| assertNotStopped() | ||
| env.blockManager.master.getMemoryStatus.map { case(blockManagerId, mem) => | ||
| (blockManagerId.host + ":" + blockManagerId.port, mem) | ||
| } | ||
|
|
@@ -1059,6 +1089,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| */ | ||
| @DeveloperApi | ||
| def getRDDStorageInfo: Array[RDDInfo] = { | ||
| assertNotStopped() | ||
|
Author (Contributor): Same here:
||
| val rddInfos = persistentRdds.values.map(RDDInfo.fromRdd).toArray | ||
| StorageUtils.updateRddInfo(rddInfos, getExecutorStorageStatus) | ||
| rddInfos.filter(_.isCached) | ||
|
|
@@ -1076,6 +1107,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| */ | ||
| @DeveloperApi | ||
| def getExecutorStorageStatus: Array[StorageStatus] = { | ||
| assertNotStopped() | ||
| env.blockManager.master.getStorageStatus | ||
| } | ||
|
|
||
|
|
@@ -1085,6 +1117,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| */ | ||
| @DeveloperApi | ||
| def getAllPools: Seq[Schedulable] = { | ||
| assertNotStopped() | ||
| // TODO(xiajunluan): We should take nested pools into account | ||
| taskScheduler.rootPool.schedulableQueue.toSeq | ||
| } | ||
|
|
@@ -1095,13 +1128,15 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| */ | ||
| @DeveloperApi | ||
| def getPoolForName(pool: String): Option[Schedulable] = { | ||
| assertNotStopped() | ||
| Option(taskScheduler.rootPool.schedulableNameToSchedulable.get(pool)) | ||
| } | ||
|
|
||
| /** | ||
| * Return current scheduling mode | ||
| */ | ||
| def getSchedulingMode: SchedulingMode.SchedulingMode = { | ||
| assertNotStopped() | ||
| taskScheduler.schedulingMode | ||
| } | ||
|
|
||
|
|
@@ -1207,16 +1242,14 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| SparkContext.SPARK_CONTEXT_CONSTRUCTOR_LOCK.synchronized { | ||
| postApplicationEnd() | ||
| ui.foreach(_.stop()) | ||
| // Do this only if not stopped already - best case effort. | ||
| // prevent NPE if stopped more than once. | ||
| val dagSchedulerCopy = dagScheduler | ||
| dagScheduler = null | ||
| if (dagSchedulerCopy != null) { | ||
| if (!stopped) { | ||
| stopped = true | ||
| env.metricsSystem.report() | ||
| metadataCleaner.cancel() | ||
| env.actorSystem.stop(heartbeatReceiver) | ||
| cleaner.foreach(_.stop()) | ||
| dagSchedulerCopy.stop() | ||
| dagScheduler.stop() | ||
| dagScheduler = null | ||
| taskScheduler = null | ||
| // TODO: Cache.stop()? | ||
| env.stop() | ||
|
|
@@ -1290,8 +1323,8 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| partitions: Seq[Int], | ||
| allowLocal: Boolean, | ||
| resultHandler: (Int, U) => Unit) { | ||
| if (dagScheduler == null) { | ||
| throw new SparkException("SparkContext has been shutdown") | ||
| if (stopped) { | ||
| throw new IllegalStateException("SparkContext has been shutdown") | ||
| } | ||
| val callSite = getCallSite | ||
| val cleanedFunc = clean(func) | ||
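To make the combined effect of the `stopped` flag concrete, a hedged sketch (again assuming an existing `sc`; not taken from the PR):

```scala
val rdd = sc.parallelize(1 to 10)

sc.stop()
sc.stop()   // second call is a no-op: the stopped flag short-circuits the shutdown path

// Submitting a job afterwards no longer depends on dagScheduler having been nulled out;
// it fails with: java.lang.IllegalStateException: SparkContext has been shutdown
rdd.count()
```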
|
|
@@ -1378,6 +1411,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| func: (TaskContext, Iterator[T]) => U, | ||
| evaluator: ApproximateEvaluator[U, R], | ||
| timeout: Long): PartialResult[R] = { | ||
| assertNotStopped() | ||
| val callSite = getCallSite | ||
| logInfo("Starting job: " + callSite.shortForm) | ||
| val start = System.nanoTime | ||
|
|
@@ -1400,6 +1434,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| resultHandler: (Int, U) => Unit, | ||
| resultFunc: => R): SimpleFutureAction[R] = | ||
| { | ||
| assertNotStopped() | ||
| val cleanF = clean(processPartition) | ||
| val callSite = getCallSite | ||
| val waiter = dagScheduler.submitJob( | ||
|
|
@@ -1418,11 +1453,13 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| * for more information. | ||
| */ | ||
| def cancelJobGroup(groupId: String) { | ||
| assertNotStopped() | ||
| dagScheduler.cancelJobGroup(groupId) | ||
| } | ||
|
|
||
| /** Cancel all jobs that have been scheduled or are running. */ | ||
| def cancelAllJobs() { | ||
| assertNotStopped() | ||
| dagScheduler.cancelAllJobs() | ||
| } | ||
|
|
||
|
|
@@ -1469,7 +1506,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| def getCheckpointDir = checkpointDir | ||
|
|
||
| /** Default level of parallelism to use when not given by user (e.g. parallelize and makeRDD). */ | ||
| def defaultParallelism: Int = taskScheduler.defaultParallelism | ||
| def defaultParallelism: Int = { | ||
|
Author (Contributor): This throws an exception because …
||
| assertNotStopped() | ||
| taskScheduler.defaultParallelism | ||
| } | ||
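The comment above is cut off, but the likely reason is visible in the `stop()` diff earlier: `stop()` sets `taskScheduler = null`, so in 1.2 the old one-liner dereferenced null. A sketch of the difference (assumes an existing `sc`):

```scala
sc.stop()

// Spark 1.2: taskScheduler is null here, so this threw a bare NullPointerException.
// With assertNotStopped(): IllegalStateException("Cannot call methods on a stopped SparkContext").
val parallelism = sc.defaultParallelism
```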
|
|
||
| /** Default min number of partitions for Hadoop RDDs when not given by user */ | ||
| @deprecated("use defaultMinPartitions", "1.0.0") | ||
|
|
||
RDD.scala
@@ -76,10 +76,27 @@ import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, Bernoulli | |
| * on RDD internals. | ||
| */ | ||
| abstract class RDD[T: ClassTag]( | ||
| @transient private var sc: SparkContext, | ||
| @transient private var _sc: SparkContext, | ||
| @transient private var deps: Seq[Dependency[_]] | ||
| ) extends Serializable with Logging { | ||
|
|
||
| if (classOf[RDD[_]].isAssignableFrom(elementClassTag.runtimeClass)) { | ||
|
Author (Contributor): Similarly, this should perhaps be a warning instead of an exception, in order to avoid any possibility of breaking odd corner-case 1.2.1 apps. I'll change this to a warning and leave the …
||
| // This is a warning instead of an exception in order to avoid breaking user programs that | ||
| // might have defined nested RDDs without running jobs with them. | ||
| logWarning("Spark does not support nested RDDs (see SPARK-5063)") | ||
| } | ||
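For illustration, a minimal case that trips this warning (hypothetical, assuming a live `sc`): the element type's runtime class is a subtype of `RDD`, even if no job is ever run on the nested structure.

```scala
val inner = sc.parallelize(1 to 10)

// Constructing an RDD whose elements are RDDs logs:
// "Spark does not support nested RDDs (see SPARK-5063)"
val nested = sc.parallelize(Seq(inner))
```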
|
|
||
| private def sc: SparkContext = { | ||
| if (_sc == null) { | ||
| throw new SparkException( | ||
| "RDD transformations and actions can only be invoked by the driver, not inside of other " + | ||
| "transformations; for example, rdd1.map(x => rdd2.values.count() * x) is invalid because " + | ||
| "the values transformation and count action cannot be performed inside of the rdd1.map " + | ||
| "transformation. For more information, see SPARK-5063.") | ||
| } | ||
| _sc | ||
| } | ||
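The error text above already names the canonical bad pattern; a slightly simplified sketch of it (illustrative, assuming two driver-side RDDs):

```scala
val rdd1 = sc.parallelize(1 to 10)
val rdd2 = sc.parallelize(Seq("a", "b", "c"))

// Invalid: rdd2 is captured inside a transformation of rdd1. When the closure runs on an
// executor, rdd2's @transient _sc field is null, so the accessor above raises the
// SparkException describing SPARK-5063 instead of a bare NullPointerException.
rdd1.map(x => rdd2.count() * x).collect()
```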
|
|
||
| /** Construct an RDD with just a one-to-one dependency on one parent */ | ||
| def this(@transient oneParent: RDD[_]) = | ||
| this(oneParent.context , List(new OneToOneDependency(oneParent))) | ||
|
|
||
In 1.2, calling this when SparkContext was stopped would throw a NullPointerException: