-
Notifications
You must be signed in to change notification settings - Fork 29.1k
[SPARK-3974][MLlib] Distributed Block Matrix Abstractions #3200
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
b693209
f378e16
aa8f086
589fbb6
19c17e8
b05aabb
645afbe
49b9586
d033861
9ae85aa
ab6cde0
ba414d2
239ab4b
1e8bb2a
1a63b20
eebbdf7
f9d664b
1694c9e
140f20e
5eecd48
24ec7b8
e1d3ee8
feb32a7
a8eace2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
…itioner
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,47 +21,28 @@ import breeze.linalg.{DenseMatrix => BDM} | |
|
|
||
| import org.apache.spark._ | ||
| import org.apache.spark.mllib.linalg.DenseMatrix | ||
| import org.apache.spark.mllib.rdd.RDDFunctions._ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not necessary.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, I didn't see |
||
| import org.apache.spark.rdd.RDD | ||
| import org.apache.spark.storage.StorageLevel | ||
| import org.apache.spark.util.Utils | ||
|
|
||
| /** | ||
| * Represents a local matrix that makes up one block of a distributed BlockMatrix | ||
| * | ||
| * @param blockRowIndex The row index of this block | ||
| * @param blockColIndex The column index of this block | ||
| * @param blockRowIndex The row index of this block. Must be zero based. | ||
| * @param blockColIndex The column index of this block. Must be zero based. | ||
| * @param mat The underlying local matrix | ||
| */ | ||
| case class SubMatrix(blockRowIndex: Int, blockColIndex: Int, mat: DenseMatrix) extends Serializable | ||
|
|
||
| /** | ||
| * Information of the submatrices of the BlockMatrix maintained on the driver | ||
| * | ||
| * @param partitionId The id of the partition the block is found in | ||
| * @param blockRowIndex The row index of this block | ||
| * @param blockColIndex The column index of this block | ||
| * @param startRow The starting row index with respect to the distributed BlockMatrix | ||
| * @param numRows The number of rows in this block | ||
| * @param startCol The starting column index with respect to the distributed BlockMatrix | ||
| * @param numCols The number of columns in this block | ||
| */ | ||
| case class SubMatrixInfo( | ||
| partitionId: Int, | ||
| blockRowIndex: Int, | ||
| blockColIndex: Int, | ||
| startRow: Long, | ||
| numRows: Int, | ||
| startCol: Long, | ||
| numCols: Int) extends Serializable | ||
|
|
||
| /** | ||
| * A partitioner that decides how the matrix is distributed in the cluster | ||
| * | ||
| * @param numPartitions Number of partitions | ||
| * @param rowPerBlock Number of rows that make up each block. | ||
| * @param colPerBlock Number of columns that make up each block. | ||
| */ | ||
| abstract class BlockMatrixPartitioner( | ||
| private[mllib] abstract class BlockMatrixPartitioner( | ||
| override val numPartitions: Int, | ||
| val rowPerBlock: Int, | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rename "rowPerBlock" --> "rowsPerBlock"? Same for "colPerBlock" and in other places in this file. |
||
| val colPerBlock: Int) extends Partitioner { | ||
|
|
@@ -173,20 +154,30 @@ class ColumnBasedPartitioner( | |
| * | ||
| * @param numRowBlocks Number of blocks that form the rows of this matrix | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could it be derived from |
||
| * @param numColBlocks Number of blocks that form the columns of this matrix | ||
| * @param rdd The RDD of SubMatrixs (local matrices) that form this matrix | ||
| * @param partitioner A partitioner that specifies how SubMatrixs are stored in the cluster | ||
| * @param rdd The RDD of SubMatrices (local matrices) that form this matrix | ||
| * @param partitioner A partitioner that specifies how SubMatrices are stored in the cluster | ||
| */ | ||
| class BlockMatrix( | ||
| val numRowBlocks: Int, | ||
| val numColBlocks: Int, | ||
| val rdd: RDD[SubMatrix], | ||
| val partitioner: BlockMatrixPartitioner) extends DistributedMatrix with Logging { | ||
|
|
||
| /** | ||
| * Alternate constructor for BlockMatrix without the input of a partitioner. Will use a Grid | ||
| * Partitioner by default. | ||
| * | ||
| * @param numRowBlocks Number of blocks that form the rows of this matrix | ||
| * @param numColBlocks Number of blocks that form the columns of this matrix | ||
| * @param rdd The RDD of SubMatrices (local matrices) that form this matrix | ||
| */ | ||
| def this(numRowBlocks: Int, numColBlocks: Int, rdd: RDD[SubMatrix]) = { | ||
| this(numRowBlocks, numColBlocks, rdd, new GridPartitioner(numRowBlocks, numColBlocks, | ||
| rdd.first().mat.numRows, rdd.first().mat.numCols)) | ||
| } | ||
| // A key-value pair RDD is required to partition properly | ||
| private var matrixRDD: RDD[(Int, SubMatrix)] = keyBy() | ||
|
|
||
| @transient var blockInfo_ : Map[(Int, Int), SubMatrixInfo] = null | ||
|
|
||
| private lazy val dims: (Long, Long) = getDim | ||
|
|
||
| override def numRows(): Long = dims._1 | ||
|
|
@@ -211,55 +202,26 @@ class BlockMatrix( | |
|
|
||
| /** Returns the dimensions of the matrix. */ | ||
| def getDim: (Long, Long) = { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this be private? |
||
| val bi = getBlockInfo | ||
| val xDim = bi.map { x => | ||
| (x._1._1, x._2.numRows.toLong) | ||
| }.groupBy(x => x._1).values.map(_.head._2.toLong).reduceLeft(_ + _) | ||
|
|
||
| val yDim = bi.map { x => | ||
| (x._1._2, x._2.numCols.toLong) | ||
| }.groupBy(x => x._1).values.map(_.head._2.toLong).reduceLeft(_ + _) | ||
|
|
||
| (xDim, yDim) | ||
| } | ||
|
|
||
| /** Calculates the information for each block and collects it on the driver */ | ||
| private def calculateBlockInfo(): Unit = { | ||
| // collect may cause akka frameSize errors | ||
| val blockStartRowColsParts = matrixRDD.mapPartitionsWithIndex { case (partId, iter) => | ||
| iter.map { case (id, block) => | ||
| ((block.blockRowIndex, block.blockColIndex), (partId, block.mat.numRows, block.mat.numCols)) | ||
| val firstRowColumn = rdd.filter(block => block.blockRowIndex == 0 || block.blockColIndex == 0). | ||
| map { block => | ||
| ((block.blockRowIndex, block.blockColIndex), (block.mat.numRows, block.mat.numCols)) | ||
| } | ||
| }.collect() | ||
| val blockStartRowCols = blockStartRowColsParts.sortBy(_._1) | ||
|
|
||
| // Group blockInfo by rowId, pick the first row and sort on rowId | ||
| val rowReps = blockStartRowCols.groupBy(_._1._1).values.map(_.head).toSeq.sortBy(_._1._1) | ||
|
|
||
| // Group blockInfo by columnId, pick the first column and sort on columnId | ||
| val colReps = blockStartRowCols.groupBy(_._1._2).values.map(_.head).toSeq.sortBy(_._1._2) | ||
|
|
||
| // Calculate startRows | ||
| val cumulativeRowSum = rowReps.scanLeft((0, 0L)) { case (x1, x2) => | ||
| (x1._1 + 1, x1._2 + x2._2._2) | ||
| }.toMap | ||
|
|
||
| val cumulativeColSum = colReps.scanLeft((0, 0L)) { case (x1, x2) => | ||
| (x1._1 + 1, x1._2 + x2._2._3) | ||
| }.toMap | ||
|
|
||
| blockInfo_ = blockStartRowCols.map{ case ((rowId, colId), (partId, numRow, numCol)) => | ||
| ((rowId, colId), new SubMatrixInfo(partId, rowId, colId, cumulativeRowSum(rowId), | ||
| numRow, cumulativeColSum(colId), numCol)) | ||
| }.toMap | ||
| } | ||
|
|
||
| /** Returns a map of the information of the blocks that form the distributed matrix. */ | ||
| def getBlockInfo: Map[(Int, Int), SubMatrixInfo] = { | ||
| if (blockInfo_ == null) { | ||
| calculateBlockInfo() | ||
| } | ||
| blockInfo_ | ||
| firstRowColumn.treeAggregate((0L, 0L))( | ||
| seqOp = (c, v) => (c, v) match { case ((x_dim, y_dim), ((indX, indY), (nRow, nCol))) => | ||
| if (indX == 0 && indY == 0) { | ||
| (x_dim + nRow, y_dim + nCol) | ||
| } else if (indX == 0) { | ||
| (x_dim, y_dim + nCol) | ||
| } else { | ||
| (x_dim + nRow, y_dim) | ||
| } | ||
| }, | ||
| combOp = (c1, c2) => (c1, c2) match { | ||
| case ((x_dim1, y_dim1), (x_dim2, y_dim2)) => | ||
| (x_dim1 + x_dim2, y_dim1 + y_dim2) | ||
| }) | ||
| } | ||
|
|
||
| /** Returns the Frobenius Norm of the matrix */ | ||
|
|
@@ -309,23 +271,26 @@ class BlockMatrix( | |
| val nRows = numRows().toInt | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd check numRows and numCols here before converting to Int and throw an error if the matrix is too large. |
||
| val nCols = numCols().toInt | ||
| val values = new Array[Double](nRows * nCols) | ||
| val blockInfos = getBlockInfo | ||
| var rowStart = 0 | ||
| var colStart = 0 | ||
| parts.foreach { part => | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use a match statement so you don't have to use the ._1._1 style of indexing |
||
| val blockInfo = blockInfos((part._1._1, part._1._2)) | ||
| // Figure out where this part should be put | ||
| if (part._1._1 == 0) rowStart = 0 | ||
| val block = part._2 | ||
| var j = 0 | ||
| while (j < blockInfo.numCols) { | ||
| while (j < block.numCols) { | ||
| var i = 0 | ||
| val indStart = (j + blockInfo.startCol.toInt) * nRows + blockInfo.startRow.toInt | ||
| val indEnd = blockInfo.numRows | ||
| val matStart = j * blockInfo.numRows | ||
| val mat = part._2.values | ||
| val indStart = (j + colStart) * nRows + rowStart | ||
| val indEnd = block.numRows | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "indEnd" --> "iEnd" since it matches with i, not indStart |
||
| val matStart = j * block.numRows | ||
| val mat = block.values | ||
| while (i < indEnd) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This could be shortened with an Array.copy call, but either is OK with me. |
||
| values(indStart + i) = mat(matStart + i) | ||
| i += 1 | ||
| } | ||
| j += 1 | ||
| } | ||
| rowStart += block.numRows | ||
| if (part._1._1 == numRowBlocks - 1) colStart += block.numCols | ||
| } | ||
| new DenseMatrix(nRows, nCols, values) | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Try to be more explicit on the imports.