Skip to content

Commit bfc8c79

Browse files
dongjoon-hyun authored and cloud-fan committed
[SPARK-20566][SQL] ColumnVector should support appendFloats for array
## What changes were proposed in this pull request? This PR aims to add a missing `appendFloats` API for array into **ColumnVector** class. For double type, there is `appendDoubles` for array [here](https://github.com/apache/spark/blob/master/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java#L818-L824). ## How was this patch tested? Pass the Jenkins with a newly added test case. Author: Dongjoon Hyun <[email protected]> Closes #17836 from dongjoon-hyun/SPARK-20566.
1 parent c5dceb8 commit bfc8c79

File tree

2 files changed

+240
-24
lines changed

2 files changed

+240
-24
lines changed

sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -801,6 +801,14 @@ public final int appendFloats(int count, float v) {
801801
return result;
802802
}
803803

804+
public final int appendFloats(int length, float[] src, int offset) {
805+
reserve(elementsAppended + length);
806+
int result = elementsAppended;
807+
putFloats(elementsAppended, length, src, offset);
808+
elementsAppended += length;
809+
return result;
810+
}
811+
804812
public final int appendDouble(double v) {
805813
reserve(elementsAppended + 1);
806814
putDouble(elementsAppended, v);

sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala

Lines changed: 232 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -41,33 +41,58 @@ class ColumnarBatchSuite extends SparkFunSuite {
4141
val column = ColumnVector.allocate(1024, IntegerType, memMode)
4242
var idx = 0
4343
assert(column.anyNullsSet() == false)
44+
assert(column.numNulls() == 0)
45+
46+
column.appendNotNull()
47+
reference += false
48+
assert(column.anyNullsSet() == false)
49+
assert(column.numNulls() == 0)
50+
51+
column.appendNotNulls(3)
52+
(1 to 3).foreach(_ => reference += false)
53+
assert(column.anyNullsSet() == false)
54+
assert(column.numNulls() == 0)
55+
56+
column.appendNull()
57+
reference += true
58+
assert(column.anyNullsSet())
59+
assert(column.numNulls() == 1)
60+
61+
column.appendNulls(3)
62+
(1 to 3).foreach(_ => reference += true)
63+
assert(column.anyNullsSet())
64+
assert(column.numNulls() == 4)
65+
66+
idx = column.elementsAppended
4467

4568
column.putNotNull(idx)
4669
reference += false
4770
idx += 1
48-
assert(column.anyNullsSet() == false)
71+
assert(column.anyNullsSet())
72+
assert(column.numNulls() == 4)
4973

5074
column.putNull(idx)
5175
reference += true
5276
idx += 1
53-
assert(column.anyNullsSet() == true)
54-
assert(column.numNulls() == 1)
77+
assert(column.anyNullsSet())
78+
assert(column.numNulls() == 5)
5579

5680
column.putNulls(idx, 3)
5781
reference += true
5882
reference += true
5983
reference += true
6084
idx += 3
61-
assert(column.anyNullsSet() == true)
85+
assert(column.anyNullsSet())
86+
assert(column.numNulls() == 8)
6287

6388
column.putNotNulls(idx, 4)
6489
reference += false
6590
reference += false
6691
reference += false
6792
reference += false
6893
idx += 4
69-
assert(column.anyNullsSet() == true)
70-
assert(column.numNulls() == 4)
94+
assert(column.anyNullsSet())
95+
assert(column.numNulls() == 8)
7196

7297
reference.zipWithIndex.foreach { v =>
7398
assert(v._1 == column.isNullAt(v._2))
@@ -85,9 +110,26 @@ class ColumnarBatchSuite extends SparkFunSuite {
85110
val reference = mutable.ArrayBuffer.empty[Byte]
86111

87112
val column = ColumnVector.allocate(1024, ByteType, memMode)
88-
var idx = 0
89113

90-
val values = (1 :: 2 :: 3 :: 4 :: 5 :: Nil).map(_.toByte).toArray
114+
var values = (10 :: 20 :: 30 :: 40 :: 50 :: Nil).map(_.toByte).toArray
115+
column.appendBytes(2, values, 0)
116+
reference += 10.toByte
117+
reference += 20.toByte
118+
119+
column.appendBytes(3, values, 2)
120+
reference += 30.toByte
121+
reference += 40.toByte
122+
reference += 50.toByte
123+
124+
column.appendBytes(6, 60.toByte)
125+
(1 to 6).foreach(_ => reference += 60.toByte)
126+
127+
column.appendByte(70.toByte)
128+
reference += 70.toByte
129+
130+
var idx = column.elementsAppended
131+
132+
values = (1 :: 2 :: 3 :: 4 :: 5 :: Nil).map(_.toByte).toArray
91133
column.putBytes(idx, 2, values, 0)
92134
reference += 1
93135
reference += 2
@@ -126,9 +168,26 @@ class ColumnarBatchSuite extends SparkFunSuite {
126168
val reference = mutable.ArrayBuffer.empty[Short]
127169

128170
val column = ColumnVector.allocate(1024, ShortType, memMode)
129-
var idx = 0
130171

131-
val values = (1 :: 2 :: 3 :: 4 :: 5 :: Nil).map(_.toShort).toArray
172+
var values = (10 :: 20 :: 30 :: 40 :: 50 :: Nil).map(_.toShort).toArray
173+
column.appendShorts(2, values, 0)
174+
reference += 10.toShort
175+
reference += 20.toShort
176+
177+
column.appendShorts(3, values, 2)
178+
reference += 30.toShort
179+
reference += 40.toShort
180+
reference += 50.toShort
181+
182+
column.appendShorts(6, 60.toShort)
183+
(1 to 6).foreach(_ => reference += 60.toShort)
184+
185+
column.appendShort(70.toShort)
186+
reference += 70.toShort
187+
188+
var idx = column.elementsAppended
189+
190+
values = (1 :: 2 :: 3 :: 4 :: 5 :: Nil).map(_.toShort).toArray
132191
column.putShorts(idx, 2, values, 0)
133192
reference += 1
134193
reference += 2
@@ -189,9 +248,26 @@ class ColumnarBatchSuite extends SparkFunSuite {
189248
val reference = mutable.ArrayBuffer.empty[Int]
190249

191250
val column = ColumnVector.allocate(1024, IntegerType, memMode)
192-
var idx = 0
193251

194-
val values = (1 :: 2 :: 3 :: 4 :: 5 :: Nil).toArray
252+
var values = (10 :: 20 :: 30 :: 40 :: 50 :: Nil).toArray
253+
column.appendInts(2, values, 0)
254+
reference += 10
255+
reference += 20
256+
257+
column.appendInts(3, values, 2)
258+
reference += 30
259+
reference += 40
260+
reference += 50
261+
262+
column.appendInts(6, 60)
263+
(1 to 6).foreach(_ => reference += 60)
264+
265+
column.appendInt(70)
266+
reference += 70
267+
268+
var idx = column.elementsAppended
269+
270+
values = (1 :: 2 :: 3 :: 4 :: 5 :: Nil).toArray
195271
column.putInts(idx, 2, values, 0)
196272
reference += 1
197273
reference += 2
@@ -257,9 +333,26 @@ class ColumnarBatchSuite extends SparkFunSuite {
257333
val reference = mutable.ArrayBuffer.empty[Long]
258334

259335
val column = ColumnVector.allocate(1024, LongType, memMode)
260-
var idx = 0
261336

262-
val values = (1L :: 2L :: 3L :: 4L :: 5L :: Nil).toArray
337+
var values = (10L :: 20L :: 30L :: 40L :: 50L :: Nil).toArray
338+
column.appendLongs(2, values, 0)
339+
reference += 10L
340+
reference += 20L
341+
342+
column.appendLongs(3, values, 2)
343+
reference += 30L
344+
reference += 40L
345+
reference += 50L
346+
347+
column.appendLongs(6, 60L)
348+
(1 to 6).foreach(_ => reference += 60L)
349+
350+
column.appendLong(70L)
351+
reference += 70L
352+
353+
var idx = column.elementsAppended
354+
355+
values = (1L :: 2L :: 3L :: 4L :: 5L :: Nil).toArray
263356
column.putLongs(idx, 2, values, 0)
264357
reference += 1
265358
reference += 2
@@ -320,16 +413,124 @@ class ColumnarBatchSuite extends SparkFunSuite {
320413
}}
321414
}
322415

416+
test("Float APIs") {
417+
(MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => {
418+
val seed = System.currentTimeMillis()
419+
val random = new Random(seed)
420+
val reference = mutable.ArrayBuffer.empty[Float]
421+
422+
val column = ColumnVector.allocate(1024, FloatType, memMode)
423+
424+
var values = (.1f :: .2f :: .3f :: .4f :: .5f :: Nil).toArray
425+
column.appendFloats(2, values, 0)
426+
reference += .1f
427+
reference += .2f
428+
429+
column.appendFloats(3, values, 2)
430+
reference += .3f
431+
reference += .4f
432+
reference += .5f
433+
434+
column.appendFloats(6, .6f)
435+
(1 to 6).foreach(_ => reference += .6f)
436+
437+
column.appendFloat(.7f)
438+
reference += .7f
439+
440+
var idx = column.elementsAppended
441+
442+
values = (1.0f :: 2.0f :: 3.0f :: 4.0f :: 5.0f :: Nil).toArray
443+
column.putFloats(idx, 2, values, 0)
444+
reference += 1.0f
445+
reference += 2.0f
446+
idx += 2
447+
448+
column.putFloats(idx, 3, values, 2)
449+
reference += 3.0f
450+
reference += 4.0f
451+
reference += 5.0f
452+
idx += 3
453+
454+
val buffer = new Array[Byte](8)
455+
Platform.putFloat(buffer, Platform.BYTE_ARRAY_OFFSET, 2.234f)
456+
Platform.putFloat(buffer, Platform.BYTE_ARRAY_OFFSET + 4, 1.123f)
457+
458+
if (ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN)) {
459+
// Ensure array contains Little Endian floats
460+
val bb = ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN)
461+
Platform.putFloat(buffer, Platform.BYTE_ARRAY_OFFSET, bb.getFloat(0))
462+
Platform.putFloat(buffer, Platform.BYTE_ARRAY_OFFSET + 4, bb.getFloat(4))
463+
}
464+
465+
column.putFloats(idx, 1, buffer, 4)
466+
column.putFloats(idx + 1, 1, buffer, 0)
467+
reference += 1.123f
468+
reference += 2.234f
469+
idx += 2
470+
471+
column.putFloats(idx, 2, buffer, 0)
472+
reference += 2.234f
473+
reference += 1.123f
474+
idx += 2
475+
476+
while (idx < column.capacity) {
477+
val single = random.nextBoolean()
478+
if (single) {
479+
val v = random.nextFloat()
480+
column.putFloat(idx, v)
481+
reference += v
482+
idx += 1
483+
} else {
484+
val n = math.min(random.nextInt(column.capacity / 20), column.capacity - idx)
485+
val v = random.nextFloat()
486+
column.putFloats(idx, n, v)
487+
var i = 0
488+
while (i < n) {
489+
reference += v
490+
i += 1
491+
}
492+
idx += n
493+
}
494+
}
495+
496+
reference.zipWithIndex.foreach { v =>
497+
assert(v._1 == column.getFloat(v._2), "Seed = " + seed + " MemMode=" + memMode)
498+
if (memMode == MemoryMode.OFF_HEAP) {
499+
val addr = column.valuesNativeAddress()
500+
assert(v._1 == Platform.getFloat(null, addr + 4 * v._2))
501+
}
502+
}
503+
column.close
504+
}}
505+
}
506+
323507
test("Double APIs") {
324508
(MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => {
325509
val seed = System.currentTimeMillis()
326510
val random = new Random(seed)
327511
val reference = mutable.ArrayBuffer.empty[Double]
328512

329513
val column = ColumnVector.allocate(1024, DoubleType, memMode)
330-
var idx = 0
331514

332-
val values = (1.0 :: 2.0 :: 3.0 :: 4.0 :: 5.0 :: Nil).toArray
515+
var values = (.1 :: .2 :: .3 :: .4 :: .5 :: Nil).toArray
516+
column.appendDoubles(2, values, 0)
517+
reference += .1
518+
reference += .2
519+
520+
column.appendDoubles(3, values, 2)
521+
reference += .3
522+
reference += .4
523+
reference += .5
524+
525+
column.appendDoubles(6, .6)
526+
(1 to 6).foreach(_ => reference += .6)
527+
528+
column.appendDouble(.7)
529+
reference += .7
530+
531+
var idx = column.elementsAppended
532+
533+
values = (1.0 :: 2.0 :: 3.0 :: 4.0 :: 5.0 :: Nil).toArray
333534
column.putDoubles(idx, 2, values, 0)
334535
reference += 1.0
335536
reference += 2.0
@@ -346,8 +547,8 @@ class ColumnarBatchSuite extends SparkFunSuite {
346547
Platform.putDouble(buffer, Platform.BYTE_ARRAY_OFFSET + 8, 1.123)
347548

348549
if (ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN)) {
349-
// Ensure array contains Liitle Endian doubles
350-
var bb = ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN)
550+
// Ensure array contains Little Endian doubles
551+
val bb = ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN)
351552
Platform.putDouble(buffer, Platform.BYTE_ARRAY_OFFSET, bb.getDouble(0))
352553
Platform.putDouble(buffer, Platform.BYTE_ARRAY_OFFSET + 8, bb.getDouble(8))
353554
}
@@ -400,40 +601,47 @@ class ColumnarBatchSuite extends SparkFunSuite {
400601

401602
val column = ColumnVector.allocate(6, BinaryType, memMode)
402603
assert(column.arrayData().elementsAppended == 0)
403-
var idx = 0
604+
605+
val str = "string"
606+
column.appendByteArray(str.getBytes(StandardCharsets.UTF_8),
607+
0, str.getBytes(StandardCharsets.UTF_8).length)
608+
reference += str
609+
assert(column.arrayData().elementsAppended == 6)
610+
611+
var idx = column.elementsAppended
404612

405613
val values = ("Hello" :: "abc" :: Nil).toArray
406614
column.putByteArray(idx, values(0).getBytes(StandardCharsets.UTF_8),
407615
0, values(0).getBytes(StandardCharsets.UTF_8).length)
408616
reference += values(0)
409617
idx += 1
410-
assert(column.arrayData().elementsAppended == 5)
618+
assert(column.arrayData().elementsAppended == 11)
411619

412620
column.putByteArray(idx, values(1).getBytes(StandardCharsets.UTF_8),
413621
0, values(1).getBytes(StandardCharsets.UTF_8).length)
414622
reference += values(1)
415623
idx += 1
416-
assert(column.arrayData().elementsAppended == 8)
624+
assert(column.arrayData().elementsAppended == 14)
417625

418626
// Just put llo
419627
val offset = column.putByteArray(idx, values(0).getBytes(StandardCharsets.UTF_8),
420628
2, values(0).getBytes(StandardCharsets.UTF_8).length - 2)
421629
reference += "llo"
422630
idx += 1
423-
assert(column.arrayData().elementsAppended == 11)
631+
assert(column.arrayData().elementsAppended == 17)
424632

425633
// Put the same "ll" at offset. This should not allocate more memory in the column.
426634
column.putArray(idx, offset, 2)
427635
reference += "ll"
428636
idx += 1
429-
assert(column.arrayData().elementsAppended == 11)
637+
assert(column.arrayData().elementsAppended == 17)
430638

431639
// Put a long string
432640
val s = "abcdefghijklmnopqrstuvwxyz"
433641
column.putByteArray(idx, (s + s).getBytes(StandardCharsets.UTF_8))
434642
reference += (s + s)
435643
idx += 1
436-
assert(column.arrayData().elementsAppended == 11 + (s + s).length)
644+
assert(column.arrayData().elementsAppended == 17 + (s + s).length)
437645

438646
reference.zipWithIndex.foreach { v =>
439647
assert(v._1.length == column.getArrayLength(v._2), "MemoryMode=" + memMode)

0 commit comments

Comments
 (0)