# [SPARK-3491] [MLlib] [PySpark] use pickle to serialize data in MLlib #2378
**`mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala`**

```diff
@@ -17,16 +17,19 @@
 package org.apache.spark.mllib.api.python

+import java.io.OutputStream
 import java.nio.{ByteBuffer, ByteOrder}

 import scala.collection.JavaConverters._

+import net.razorvine.pickle.{Pickler, Unpickler, IObjectConstructor, IObjectPickler, PickleException, Opcodes}
+
```

> **Contributor** (on the `net.razorvine.pickle` import): use …
```diff
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
 import org.apache.spark.mllib.classification._
 import org.apache.spark.mllib.clustering._
 import org.apache.spark.mllib.optimization._
-import org.apache.spark.mllib.linalg.{Matrix, SparseVector, Vector, Vectors}
+import org.apache.spark.mllib.linalg._
 import org.apache.spark.mllib.random.{RandomRDDs => RG}
 import org.apache.spark.mllib.recommendation._
 import org.apache.spark.mllib.regression._
```
```diff
@@ -262,12 +265,12 @@ class PythonMLLibAPI extends Serializable {
    * the Py4J documentation.
    */
   def trainALSModel(
-      ratingsBytesJRDD: JavaRDD[Array[Byte]],
+      ratingsJRDD: JavaRDD[Object],
       rank: Int,
       iterations: Int,
       lambda: Double,
       blocks: Int): MatrixFactorizationModel = {
-    val ratings = ratingsBytesJRDD.rdd.map(SerDe.unpackRating)
+    val ratings = ratingsJRDD.rdd.map(_.asInstanceOf[Rating])
     ALS.train(ratings, rank, iterations, lambda, blocks)
   }
```
```diff
@@ -278,13 +281,13 @@ class PythonMLLibAPI extends Serializable {
    * exit; see the Py4J documentation.
    */
   def trainImplicitALSModel(
-      ratingsBytesJRDD: JavaRDD[Array[Byte]],
+      ratingsJRDD: JavaRDD[Object],
       rank: Int,
       iterations: Int,
       lambda: Double,
       blocks: Int,
       alpha: Double): MatrixFactorizationModel = {
-    val ratings = ratingsBytesJRDD.rdd.map(SerDe.unpackRating)
+    val ratings = ratingsJRDD.rdd.map(_.asInstanceOf[Rating])
     ALS.trainImplicit(ratings, rank, iterations, lambda, blocks, alpha)
   }
```
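With the byte-array plumbing gone, these entry points take a `JavaRDD[Object]` of already-materialized `Rating` instances, so they can be exercised straight from Scala. A hypothetical sketch (parameter values are illustrative and `sc` is an assumed `SparkContext`):

```scala
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.recommendation.Rating

// Hypothetical driver-side call, assuming an existing SparkContext `sc`.
val data: Seq[Object] = Seq(Rating(1, 1, 5.0), Rating(1, 2, 1.0), Rating(2, 1, 4.0))
val ratings: JavaRDD[Object] = JavaRDD.fromRDD(sc.parallelize(data))
// rank = 2, iterations = 5, lambda = 0.01, blocks = -1 (auto)
val model = new PythonMLLibAPI().trainALSModel(ratings, 2, 5, 0.01, -1)
```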
```diff
@@ -510,6 +513,129 @@ private[spark] object SerDe extends Serializable {
   private val DENSE_MATRIX_MAGIC: Byte = 3
   private val LABELED_POINT_MAGIC: Byte = 4

+  private[python] def reduce_object(out: OutputStream, pickler: Pickler,
+      module: String, name: String, objects: Object*) = {
+    out.write(Opcodes.GLOBAL)
+    out.write((module + "\n" + name + "\n").getBytes)
+    out.write(Opcodes.MARK)
+    objects.foreach(pickler.save(_))
+    out.write(Opcodes.TUPLE)
+    out.write(Opcodes.REDUCE)
+  }
```

> **Contributor** (on `reduce_object`): use camelCase for method names

> **Contributor** (on the `GLOBAL` record): Does it increase the storage cost by a lot for small objects?
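The storage question above can be bounded with simple arithmetic. A sketch of the fixed per-object cost (the constant counts the `GLOBAL`, `MARK`, `TUPLE`, and `REDUCE` opcode bytes; since `reduce_object` writes the `GLOBAL` record verbatim rather than memoizing it, the string is repeated for every object in a stream):

```scala
// Rough per-object cost of the class-name approach (hypothetical estimate).
val globalRecord = "pyspark.mllib.recommendation\nRating\n"
val opcodeBytes = 4  // one byte each for GLOBAL, MARK, TUPLE, REDUCE
val fixedOverhead = globalRecord.getBytes("UTF-8").length + opcodeBytes
println(s"fixed overhead per pickled Rating: $fixedOverhead bytes")  // 40
```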
```diff
+  private[python] class DenseVectorPickler extends IObjectPickler {
+    def pickle(obj: Object, out: OutputStream, pickler: Pickler) = {
+      val vector: DenseVector = obj.asInstanceOf[DenseVector]
+      reduce_object(out, pickler, "pyspark.mllib.linalg", "DenseVector", vector.toArray)
+    }
+  }
```

> **Contributor** (on `DenseVectorPickler`): ditto: what is the cost of using class names?

```diff
+  private[python] class DenseVectorConstructor extends IObjectConstructor {
+    def construct(args: Array[Object]): Object = {
+      require(args.length == 1)
+      new DenseVector(args(0).asInstanceOf[Array[Double]])
+    }
+  }
```
```diff
+  private[python] class DenseMatrixPickler extends IObjectPickler {
+    def pickle(obj: Object, out: OutputStream, pickler: Pickler) = {
+      val m: DenseMatrix = obj.asInstanceOf[DenseMatrix]
+      reduce_object(out, pickler, "pyspark.mllib.linalg", "DenseMatrix",
+        m.numRows.asInstanceOf[Object], m.numCols.asInstanceOf[Object], m.values)
+    }
+  }
+
+  private[python] class DenseMatrixConstructor extends IObjectConstructor {
+    def construct(args: Array[Object]): Object = {
+      require(args.length == 3)
+      new DenseMatrix(args(0).asInstanceOf[Int], args(1).asInstanceOf[Int],
+        args(2).asInstanceOf[Array[Double]])
+    }
+  }
+
+  private[python] class SparseVectorPickler extends IObjectPickler {
+    def pickle(obj: Object, out: OutputStream, pickler: Pickler) = {
+      val v: SparseVector = obj.asInstanceOf[SparseVector]
+      reduce_object(out, pickler, "pyspark.mllib.linalg", "SparseVector",
+        v.size.asInstanceOf[Object], v.indices, v.values)
+    }
+  }
+
+  private[python] class SparseVectorConstructor extends IObjectConstructor {
+    def construct(args: Array[Object]): Object = {
+      require(args.length == 3)
+      new SparseVector(args(0).asInstanceOf[Int], args(1).asInstanceOf[Array[Int]],
+        args(2).asInstanceOf[Array[Double]])
+    }
+  }
```
```diff
+  private[python] class LabeledPointPickler extends IObjectPickler {
+    def pickle(obj: Object, out: OutputStream, pickler: Pickler) = {
+      val point: LabeledPoint = obj.asInstanceOf[LabeledPoint]
+      reduce_object(out, pickler, "pyspark.mllib.regression", "LabeledPoint",
+        point.label.asInstanceOf[Object], point.features)
+    }
+  }
+
+  private[python] class LabeledPointConstructor extends IObjectConstructor {
+    def construct(args: Array[Object]): Object = {
+      if (args.length != 2) {
+        throw new PickleException("should be 2")
+      }
+      new LabeledPoint(args(0).asInstanceOf[Double], args(1).asInstanceOf[Vector])
+    }
+  }
```

> **Member** (on the `PickleException`): Use consistent Exception type. (In some other places, `require()` is used instead.)
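One way to address the consistency comment, sketched here rather than taken from the author's eventual fix, is to standardize on `require`, matching the vector constructors above (the error message is illustrative):

```scala
// Hypothetical variant using require() for argument checking, as
// DenseVectorConstructor and SparseVectorConstructor already do.
private[python] class LabeledPointConstructor extends IObjectConstructor {
  def construct(args: Array[Object]): Object = {
    require(args.length == 2, s"LabeledPoint expects 2 arguments, got ${args.length}")
    new LabeledPoint(args(0).asInstanceOf[Double], args(1).asInstanceOf[Vector])
  }
}
```

The opposite standardization, replacing `require` with `PickleException` everywhere, would also satisfy the comment; the important part is picking one failure mode for all constructors.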
```diff
+  /**
+   * Pickle Rating
+   */
+  private[python] class RatingPickler extends IObjectPickler {
+    def pickle(obj: Object, out: OutputStream, pickler: Pickler) = {
+      val rating: Rating = obj.asInstanceOf[Rating]
+      reduce_object(out, pickler, "pyspark.mllib.recommendation", "Rating",
+        rating.user.asInstanceOf[Object], rating.product.asInstanceOf[Object],
+        rating.rating.asInstanceOf[Object])
+    }
+  }
+
+  /**
+   * Unpickle Rating
+   */
+  private[python] class RatingConstructor extends IObjectConstructor {
+    def construct(args: Array[Object]): Object = {
+      if (args.length != 3) {
+        throw new PickleException("should be 3")
+      }
+      new Rating(args(0).asInstanceOf[Int], args(1).asInstanceOf[Int],
+        args(2).asInstanceOf[Double])
+    }
+  }
```
```diff
+  def initialize() = {
+    Pickler.registerCustomPickler(classOf[DenseVector], new DenseVectorPickler)
+    Pickler.registerCustomPickler(classOf[DenseMatrix], new DenseMatrixPickler)
+    Pickler.registerCustomPickler(classOf[SparseVector], new SparseVectorPickler)
+    Pickler.registerCustomPickler(classOf[LabeledPoint], new LabeledPointPickler)
+    Pickler.registerCustomPickler(classOf[Rating], new RatingPickler)
+    Unpickler.registerConstructor("pyspark.mllib.linalg", "DenseVector",
+      new DenseVectorConstructor)
+    Unpickler.registerConstructor("pyspark.mllib.linalg", "DenseMatrix",
+      new DenseMatrixConstructor)
+    Unpickler.registerConstructor("pyspark.mllib.linalg", "SparseVector",
+      new SparseVectorConstructor)
+    Unpickler.registerConstructor("pyspark.mllib.regression", "LabeledPoint",
+      new LabeledPointConstructor)
+    Unpickler.registerConstructor("pyspark.mllib.recommendation", "Rating", new RatingConstructor)
+  }
```

> **Contributor** (on `def initialize() = {`): Please add return type explicitly.

```diff
+  private[python] def dumps(obj: AnyRef): Array[Byte] = {
+    new Pickler().dumps(obj)
+  }
+
+  private[python] def loads(bytes: Array[Byte]): AnyRef = {
+    new Unpickler().loads(bytes)
+  }

   private[python] def deserializeDoubleVector(bytes: Array[Byte], offset: Int = 0): Vector = {
     require(bytes.length - offset >= 5, "Byte array too short")
     val magic = bytes(offset)
```

> **Contributor** (on `dumps`): In Python, …
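Taken together, the registration and the `dumps`/`loads` helpers give a simple round trip. A minimal sketch (both helpers are `private[python]`, so this would live inside the `org.apache.spark.mllib.api.python` package, e.g. in a unit test):

```scala
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

SerDe.initialize()  // register the custom picklers/constructors once per JVM
val point = LabeledPoint(1.0, Vectors.dense(0.1, 0.2))
val bytes = SerDe.dumps(point)  // JVM object -> pickle byte stream
val back = SerDe.loads(bytes).asInstanceOf[LabeledPoint]
assert(back.label == point.label)
assert(back.features.toArray.sameElements(point.features.toArray))
```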
```diff
@@ -688,43 +814,8 @@ private[spark] object SerDe extends Serializable {
     Array.tabulate(matrix.numRows, matrix.numCols)((i, j) => values(i + j * matrix.numRows))
   }

-  /** Unpack a Rating object from an array of bytes */
-  private[python] def unpackRating(ratingBytes: Array[Byte]): Rating = {
-    val bb = ByteBuffer.wrap(ratingBytes)
-    bb.order(ByteOrder.nativeOrder())
-    val user = bb.getInt()
-    val product = bb.getInt()
-    val rating = bb.getDouble()
-    new Rating(user, product, rating)
-  }
-
-  /** Unpack a tuple of Ints from an array of bytes */
-  def unpackTuple(tupleBytes: Array[Byte]): (Int, Int) = {
-    val bb = ByteBuffer.wrap(tupleBytes)
-    bb.order(ByteOrder.nativeOrder())
-    val v1 = bb.getInt()
-    val v2 = bb.getInt()
-    (v1, v2)
-  }
-
-  /**
-   * Serialize a Rating object into an array of bytes.
-   * It can be deserialized using RatingDeserializer().
-   *
-   * @param rate the Rating object to serialize
-   * @return
-   */
-  def serializeRating(rate: Rating): Array[Byte] = {
-    val len = 3
-    val bytes = new Array[Byte](4 + 8 * len)
-    val bb = ByteBuffer.wrap(bytes)
-    bb.order(ByteOrder.nativeOrder())
-    bb.putInt(len)
-    val db = bb.asDoubleBuffer()
-    db.put(rate.user.toDouble)
-    db.put(rate.product.toDouble)
-    db.put(rate.rating)
-    bytes
+  /** Convert an RDD of Object arrays into an RDD of (Int, Int) tuples. */
+  def asTupleRDD(rdd: RDD[Array[Object]]): RDD[(Int, Int)] = {
+    rdd.map(x => (x(0).asInstanceOf[Int], x(1).asInstanceOf[Int]))
   }
 }
```
**`mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala`**

```diff
@@ -64,6 +64,12 @@ class DenseMatrix(val numRows: Int, val numCols: Int, val values: Array[Double])
   override def toArray: Array[Double] = values

   private[mllib] override def toBreeze: BM[Double] = new BDM[Double](numRows, numCols, values)

+  override def equals(o: Any) = o match {
+    case that: DenseMatrix =>
+      that.numRows == numRows && that.numCols == numCols
+    case _ => false
+  }
 }

 /**
```

> **Member** (on the new `equals`): Should this not check values? Even though that would be expensive, it should be necessary to match the expected behavior of equals().

> **Contributor (author):** good catch!
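A sketch of the fix the reviewer is asking for (using `java.util.Arrays` for the element comparison; the author's actual follow-up commit may differ):

```scala
override def equals(o: Any): Boolean = o match {
  case that: DenseMatrix =>
    // Compare contents, not just dimensions, to honor the equals() contract.
    numRows == that.numRows && numCols == that.numCols &&
      java.util.Arrays.equals(values, that.values)
  case _ => false
}
// equals and hashCode should change together: hashCode would need to fold in
// the same fields, e.g. via java.util.Arrays.hashCode(values).
```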
> **Reviewer:** Shall we divide groups based on the serialized size?