
Commit 94beedc

Clean up args in PythonRDD. Set key/value converter defaults to None for PySpark context.py methods
1 parent 1a4a1d6 commit 94beedc


2 files changed, +55 -53 lines changed

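For orientation, a minimal usage sketch of the PySpark methods this commit touches. The local SparkContext, paths, and Writable/InputFormat class names below are illustrative placeholders, not part of the commit; with the new defaults the converter and conf arguments can simply be omitted:

from pyspark import SparkContext

sc = SparkContext("local", "converter-defaults-example")  # hypothetical app

# keyConverter, valueConverter (and conf where applicable) now default to None,
# so they no longer need to be passed explicitly.
seq_rdd = sc.sequenceFile("hdfs:///tmp/example.seq",           # placeholder path
                          "org.apache.hadoop.io.Text",          # key Writable class
                          "org.apache.hadoop.io.IntWritable")   # value Writable class

new_rdd = sc.newAPIHadoopFile(
    "hdfs:///tmp/example",                                      # placeholder path
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text")

print(seq_rdd.first())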

core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala

Lines changed: 39 additions & 39 deletions
@@ -349,14 +349,14 @@ private[spark] object PythonRDD extends Logging {
     }
   }
 
-  /** Create and RDD from a path using [[org.apache.hadoop.mapred.SequenceFileInputFormat]] */
+  /** Create an RDD from a path using [[org.apache.hadoop.mapred.SequenceFileInputFormat]] */
   def sequenceFile[K, V](
       sc: JavaSparkContext,
       path: String,
       keyClass: String,
       valueClass: String,
-      keyWrapper: String,
-      valueWrapper: String,
+      keyConverter: String,
+      valueConverter: String,
       minSplits: Int) = {
     implicit val kcm = ClassTag(Class.forName(keyClass)).asInstanceOf[ClassTag[K]]
     implicit val vcm = ClassTag(Class.forName(valueClass)).asInstanceOf[ClassTag[V]]
@@ -374,18 +374,18 @@ private[spark] object PythonRDD extends Logging {
   def newAPIHadoopFile[K, V, F <: NewInputFormat[K, V]](
       sc: JavaSparkContext,
       path: String,
-      inputFormatClazz: String,
-      keyClazz: String,
-      valueClazz: String,
-      keyWrapper: String,
-      valueWrapper: String,
+      inputFormatClass: String,
+      keyClass: String,
+      valueClass: String,
+      keyConverter: String,
+      valueConverter: String,
       confAsMap: java.util.HashMap[String, String]) = {
     val conf = PythonHadoopUtil.mapToConf(confAsMap)
     val baseConf = sc.hadoopConfiguration()
     val mergedConf = PythonHadoopUtil.mergeConfs(baseConf, conf)
     val rdd =
       newAPIHadoopRDDFromClassNames[K, V, F](sc,
-        Some(path), inputFormatClazz, keyClazz, valueClazz, mergedConf)
+        Some(path), inputFormatClass, keyClass, valueClass, mergedConf)
     val converted = PythonHadoopUtil.convertRDD[K, V](rdd)
     JavaRDD.fromRDD(SerDeUtil.rddToPython(converted))
   }
@@ -397,30 +397,30 @@ private[spark] object PythonRDD extends Logging {
    */
   def newAPIHadoopRDD[K, V, F <: NewInputFormat[K, V]](
       sc: JavaSparkContext,
-      inputFormatClazz: String,
-      keyClazz: String,
-      valueClazz: String,
-      keyWrapper: String,
-      valueWrapper: String,
+      inputFormatClass: String,
+      keyClass: String,
+      valueClass: String,
+      keyConverter: String,
+      valueConverter: String,
       confAsMap: java.util.HashMap[String, String]) = {
     val conf = PythonHadoopUtil.mapToConf(confAsMap)
     val rdd =
       newAPIHadoopRDDFromClassNames[K, V, F](sc,
-        None, inputFormatClazz, keyClazz, valueClazz, conf)
+        None, inputFormatClass, keyClass, valueClass, conf)
     val converted = PythonHadoopUtil.convertRDD[K, V](rdd)
     JavaRDD.fromRDD(SerDeUtil.rddToPython(converted))
   }
 
   private def newAPIHadoopRDDFromClassNames[K, V, F <: NewInputFormat[K, V]](
       sc: JavaSparkContext,
       path: Option[String] = None,
-      inputFormatClazz: String,
-      keyClazz: String,
-      valueClazz: String,
+      inputFormatClass: String,
+      keyClass: String,
+      valueClass: String,
       conf: Configuration) = {
-    implicit val kcm = ClassTag(Class.forName(keyClazz)).asInstanceOf[ClassTag[K]]
-    implicit val vcm = ClassTag(Class.forName(valueClazz)).asInstanceOf[ClassTag[V]]
-    implicit val fcm = ClassTag(Class.forName(inputFormatClazz)).asInstanceOf[ClassTag[F]]
+    implicit val kcm = ClassTag(Class.forName(keyClass)).asInstanceOf[ClassTag[K]]
+    implicit val vcm = ClassTag(Class.forName(valueClass)).asInstanceOf[ClassTag[V]]
+    implicit val fcm = ClassTag(Class.forName(inputFormatClass)).asInstanceOf[ClassTag[F]]
     val kc = kcm.runtimeClass.asInstanceOf[Class[K]]
     val vc = vcm.runtimeClass.asInstanceOf[Class[V]]
     val fc = fcm.runtimeClass.asInstanceOf[Class[F]]
@@ -439,18 +439,18 @@ private[spark] object PythonRDD extends Logging {
   def hadoopFile[K, V, F <: InputFormat[K, V]](
       sc: JavaSparkContext,
       path: String,
-      inputFormatClazz: String,
-      keyClazz: String,
-      valueClazz: String,
-      keyWrapper: String,
-      valueWrapper: String,
+      inputFormatClass: String,
+      keyClass: String,
+      valueClass: String,
+      keyConverter: String,
+      valueConverter: String,
       confAsMap: java.util.HashMap[String, String]) = {
     val conf = PythonHadoopUtil.mapToConf(confAsMap)
     val baseConf = sc.hadoopConfiguration()
     val mergedConf = PythonHadoopUtil.mergeConfs(baseConf, conf)
     val rdd =
       hadoopRDDFromClassNames[K, V, F](sc,
-        Some(path), inputFormatClazz, keyClazz, valueClazz, mergedConf)
+        Some(path), inputFormatClass, keyClass, valueClass, mergedConf)
     val converted = PythonHadoopUtil.convertRDD[K, V](rdd)
     JavaRDD.fromRDD(SerDeUtil.rddToPython(converted))
   }
@@ -462,30 +462,30 @@ private[spark] object PythonRDD extends Logging {
    */
   def hadoopRDD[K, V, F <: InputFormat[K, V]](
       sc: JavaSparkContext,
-      inputFormatClazz: String,
-      keyClazz: String,
-      valueClazz: String,
-      keyWrapper: String,
-      valueWrapper: String,
+      inputFormatClass: String,
+      keyClass: String,
+      valueClass: String,
+      keyConverter: String,
+      valueConverter: String,
       confAsMap: java.util.HashMap[String, String]) = {
     val conf = PythonHadoopUtil.mapToConf(confAsMap)
     val rdd =
       hadoopRDDFromClassNames[K, V, F](sc,
-        None, inputFormatClazz, keyClazz, valueClazz, conf)
+        None, inputFormatClass, keyClass, valueClass, conf)
     val converted = PythonHadoopUtil.convertRDD[K, V](rdd)
     JavaRDD.fromRDD(SerDeUtil.rddToPython(converted))
   }
 
   private def hadoopRDDFromClassNames[K, V, F <: InputFormat[K, V]](
       sc: JavaSparkContext,
       path: Option[String] = None,
-      inputFormatClazz: String,
-      keyClazz: String,
-      valueClazz: String,
+      inputFormatClass: String,
+      keyClass: String,
+      valueClass: String,
       conf: Configuration) = {
-    implicit val kcm = ClassTag(Class.forName(keyClazz)).asInstanceOf[ClassTag[K]]
-    implicit val vcm = ClassTag(Class.forName(valueClazz)).asInstanceOf[ClassTag[V]]
-    implicit val fcm = ClassTag(Class.forName(inputFormatClazz)).asInstanceOf[ClassTag[F]]
+    implicit val kcm = ClassTag(Class.forName(keyClass)).asInstanceOf[ClassTag[K]]
+    implicit val vcm = ClassTag(Class.forName(valueClass)).asInstanceOf[ClassTag[V]]
+    implicit val fcm = ClassTag(Class.forName(inputFormatClass)).asInstanceOf[ClassTag[F]]
     val kc = kcm.runtimeClass.asInstanceOf[Class[K]]
     val vc = vcm.runtimeClass.asInstanceOf[Class[V]]
     val fc = fcm.runtimeClass.asInstanceOf[Class[F]]
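On the Scala side above, the confAsMap HashMap is turned into a Hadoop Configuration with PythonHadoopUtil.mapToConf and, in the *File variants, merged with sc.hadoopConfiguration(). A hedged sketch of what that looks like from Python, reusing the sc from the sketch above; the path and configuration key are placeholders:

# Any dict passed as `conf` is converted to a java.util.HashMap in Python,
# then to a Hadoop Configuration on the JVM side before the RDD is built.
hadoop_conf = {"mapred.max.split.size": "134217728"}     # placeholder Hadoop setting

old_rdd = sc.hadoopFile("hdfs:///tmp/old-api-input",      # placeholder path
                        "org.apache.hadoop.mapred.TextInputFormat",
                        "org.apache.hadoop.io.LongWritable",
                        "org.apache.hadoop.io.Text",
                        conf=hadoop_conf)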

python/pyspark/context.py

Lines changed: 16 additions & 14 deletions
@@ -328,13 +328,15 @@ def wholeTextFiles(self, path, minPartitions=None):
         return RDD(self._jsc.wholeTextFiles(path, minPartitions), self,
                    PairDeserializer(UTF8Deserializer(), UTF8Deserializer()))
 
-    def dictToJavaMap(self, d):
+    def _dictToJavaMap(self, d):
         jm = self._jvm.java.util.HashMap()
+        if not d:
+            d = {}
         for k, v in d.iteritems():
             jm[k] = v
         return jm
 
-    def sequenceFile(self, path, keyClass, valueClass, keyConverter="", valueConverter="",
+    def sequenceFile(self, path, keyClass, valueClass, keyConverter=None, valueConverter=None,
                      minSplits=None):
         """
         Read a Hadoop SequenceFile with arbitrary key and value Writable class from HDFS,
@@ -372,8 +374,8 @@ def sequenceFile(self, path, keyClass, valueClass, keyConverter="", valueConvert
                                                  keyConverter, valueConverter, minSplits)
         return RDD(jrdd, self, PickleSerializer())
 
-    def newAPIHadoopFile(self, path, inputFormatClass, keyClass, valueClass,
-                         keyConverter="", valueConverter="", conf={}):
+    def newAPIHadoopFile(self, path, inputFormatClass, keyClass, valueClass, keyConverter=None,
+                         valueConverter=None, conf=None):
         """
         Read a 'new API' Hadoop InputFormat with arbitrary key and value class from HDFS,
         a local file system (available on all nodes), or any Hadoop-supported file system URI.
@@ -382,26 +384,26 @@ def newAPIHadoopFile(self, path, inputFormatClass, keyClass, valueClass,
         A Hadoop configuration can be passed in as a Python dict. This will be converted into a
         Configuration in Java
         """
-        jconf = self.dictToJavaMap(conf)
+        jconf = self._dictToJavaMap(conf)
         jrdd = self._jvm.PythonRDD.newAPIHadoopFile(self._jsc, path, inputFormatClass, keyClass,
                                                     valueClass, keyConverter, valueConverter, jconf)
         return RDD(jrdd, self, PickleSerializer())
 
-    def newAPIHadoopRDD(self, inputFormatClass, keyClass, valueClass,
-                        keyConverter="", valueConverter="", conf={}):
+    def newAPIHadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter=None,
+                        valueConverter=None, conf=None):
         """
         Read a 'new API' Hadoop InputFormat with arbitrary key and value class, from an arbitrary
         Hadoop configuration,
         which is passed in as a Python dict. This will be converted into a Configuration in Java.
         The mechanism is the same as for sc.sequenceFile.
         """
-        jconf = self.dictToJavaMap(conf)
+        jconf = self._dictToJavaMap(conf)
         jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(self._jsc, inputFormatClass, keyClass,
                                                    valueClass, keyConverter, valueConverter, jconf)
         return RDD(jrdd, self, PickleSerializer())
 
-    def hadoopFile(self, path, inputFormatClass, keyClass, valueClass,
-                   keyConverter="", valueConverter="", conf={}):
+    def hadoopFile(self, path, inputFormatClass, keyClass, valueClass, keyConverter=None,
+                   valueConverter=None, conf=None):
         """
         Read an 'old' Hadoop InputFormat with arbitrary key and value class from HDFS,
         a local file system (available on all nodes), or any Hadoop-supported file system URI.
@@ -410,22 +412,22 @@ def hadoopFile(self, path, inputFormatClass, keyClass, valueClass,
         A Hadoop configuration can be passed in as a Python dict. This will be converted into a
         Configuration in Java
         """
-        jconf = self.dictToJavaMap(conf)
+        jconf = self._dictToJavaMap(conf)
         for k, v in conf.iteritems():
             jconf[k] = v
         jrdd = self._jvm.PythonRDD.hadoopFile(self._jsc, path, inputFormatClass, keyClass,
                                               valueClass, keyConverter, valueConverter, jconf)
         return RDD(jrdd, self, PickleSerializer())
 
-    def hadoopRDD(self, inputFormatClass, keyClass, valueClass,
-                  keyConverter="", valueConverter="", conf={}):
+    def hadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter=None,
+                  valueConverter=None, conf=None):
         """
         Read an 'old' Hadoop InputFormat with arbitrary key and value class, from an arbitrary
         Hadoop configuration,
         which is passed in as a Python dict. This will be converted into a Configuration in Java.
         The mechanism is the same as for sc.sequenceFile.
         """
-        jconf = self.dictToJavaMap(conf)
+        jconf = self._dictToJavaMap(conf)
         jrdd = self._jvm.PythonRDD.hadoopRDD(self._jsc, inputFormatClass, keyClass, valueClass,
                                              keyConverter, valueConverter, jconf)
         return RDD(jrdd, self, PickleSerializer())
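The conf={} defaults above are replaced with conf=None, and _dictToJavaMap now treats a missing dict as empty. As general Python background rather than a rationale stated in the commit, a None default also avoids every call sharing one mutable dict object; a minimal standalone sketch of the pattern, unrelated to the real SparkContext class:

# Hypothetical illustration of the default-argument pattern this commit adopts:
# a None default normalized inside the function, mirroring `if not d: d = {}`.

def shared_default(d={}):           # one dict object shared by every call
    d.setdefault("seen", 0)
    d["seen"] += 1
    return dict(d)

def none_default(d=None):           # explicit handling on every call
    if not d:                       # treat None (and {}) as "no configuration"
        d = {}
    return dict(d)

print(shared_default(), shared_default())  # {'seen': 1} {'seen': 2} -- state leaks between calls
print(none_default(), none_default())      # {} {} -- no shared state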
