Skip to content

Commit 48dd6a4

Browse files
committed
revert [SPARK-22785][SQL] remove ColumnVector.anyNullsSet
## What changes were proposed in this pull request? In #19980 , we thought `anyNullsSet` could simply be implemented as `numNulls() > 0`. This is logically true, but may have performance problems. `OrcColumnVector` is an example: it doesn't have a `numNulls` property, only a `noNulls` property. We would lose a lot of performance if we used `numNulls() > 0` to check for nulls. This PR simply reverts #19980, renaming the method to `hasNull`. Better name suggestions are welcome, e.g. `nullable`? ## How was this patch tested? existing tests Author: Wenchen Fan <[email protected]> Closes #20452 from cloud-fan/null.
1 parent 3d0911b commit 48dd6a4

File tree

8 files changed

+44
-3
lines changed

8 files changed

+44
-3
lines changed

sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,11 @@ public void close() {
7777

7878
}
7979

80+
@Override
81+
public boolean hasNull() {
82+
return !baseData.noNulls;
83+
}
84+
8085
@Override
8186
public int numNulls() {
8287
if (baseData.isRepeating) {

sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ public void putNulls(int rowId, int count) {
123123

124124
@Override
125125
public void putNotNulls(int rowId, int count) {
126-
if (numNulls == 0) return;
126+
if (!hasNull()) return;
127127
long offset = nulls + rowId;
128128
for (int i = 0; i < count; ++i, ++offset) {
129129
Platform.putByte(null, offset, (byte) 0);

sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ public void putNulls(int rowId, int count) {
119119

120120
@Override
121121
public void putNotNulls(int rowId, int count) {
122-
if (numNulls == 0) return;
122+
if (!hasNull()) return;
123123
for (int i = 0; i < count; ++i) {
124124
nulls[rowId + i] = (byte)0;
125125
}

sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@ public void reset() {
5959
elementsAppended = 0;
6060
if (numNulls > 0) {
6161
putNotNulls(0, capacity);
62+
numNulls = 0;
6263
}
63-
numNulls = 0;
6464
}
6565

6666
@Override
@@ -102,6 +102,11 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) {
102102
throw new RuntimeException(message, cause);
103103
}
104104

105+
@Override
106+
public boolean hasNull() {
107+
return numNulls > 0;
108+
}
109+
105110
@Override
106111
public int numNulls() { return numNulls; }
107112

sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ public final class ArrowColumnVector extends ColumnVector {
3737
private final ArrowVectorAccessor accessor;
3838
private ArrowColumnVector[] childColumns;
3939

40+
@Override
41+
public boolean hasNull() {
42+
return accessor.getNullCount() > 0;
43+
}
44+
4045
@Override
4146
public int numNulls() {
4247
return accessor.getNullCount();

sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,11 @@ public abstract class ColumnVector implements AutoCloseable {
6565
@Override
6666
public abstract void close();
6767

68+
/**
69+
* Returns true if this column vector contains any null values.
70+
*/
71+
public abstract boolean hasNull();
72+
6873
/**
6974
* Returns the number of nulls in this column vector.
7075
*/

sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
4242

4343
val columnVector = new ArrowColumnVector(vector)
4444
assert(columnVector.dataType === BooleanType)
45+
assert(columnVector.hasNull)
4546
assert(columnVector.numNulls === 1)
4647

4748
(0 until 10).foreach { i =>
@@ -69,6 +70,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
6970

7071
val columnVector = new ArrowColumnVector(vector)
7172
assert(columnVector.dataType === ByteType)
73+
assert(columnVector.hasNull)
7274
assert(columnVector.numNulls === 1)
7375

7476
(0 until 10).foreach { i =>
@@ -96,6 +98,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
9698

9799
val columnVector = new ArrowColumnVector(vector)
98100
assert(columnVector.dataType === ShortType)
101+
assert(columnVector.hasNull)
99102
assert(columnVector.numNulls === 1)
100103

101104
(0 until 10).foreach { i =>
@@ -123,6 +126,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
123126

124127
val columnVector = new ArrowColumnVector(vector)
125128
assert(columnVector.dataType === IntegerType)
129+
assert(columnVector.hasNull)
126130
assert(columnVector.numNulls === 1)
127131

128132
(0 until 10).foreach { i =>
@@ -150,6 +154,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
150154

151155
val columnVector = new ArrowColumnVector(vector)
152156
assert(columnVector.dataType === LongType)
157+
assert(columnVector.hasNull)
153158
assert(columnVector.numNulls === 1)
154159

155160
(0 until 10).foreach { i =>
@@ -177,6 +182,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
177182

178183
val columnVector = new ArrowColumnVector(vector)
179184
assert(columnVector.dataType === FloatType)
185+
assert(columnVector.hasNull)
180186
assert(columnVector.numNulls === 1)
181187

182188
(0 until 10).foreach { i =>
@@ -204,6 +210,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
204210

205211
val columnVector = new ArrowColumnVector(vector)
206212
assert(columnVector.dataType === DoubleType)
213+
assert(columnVector.hasNull)
207214
assert(columnVector.numNulls === 1)
208215

209216
(0 until 10).foreach { i =>
@@ -232,6 +239,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
232239

233240
val columnVector = new ArrowColumnVector(vector)
234241
assert(columnVector.dataType === StringType)
242+
assert(columnVector.hasNull)
235243
assert(columnVector.numNulls === 1)
236244

237245
(0 until 10).foreach { i =>
@@ -258,6 +266,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
258266

259267
val columnVector = new ArrowColumnVector(vector)
260268
assert(columnVector.dataType === BinaryType)
269+
assert(columnVector.hasNull)
261270
assert(columnVector.numNulls === 1)
262271

263272
(0 until 10).foreach { i =>
@@ -300,6 +309,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
300309

301310
val columnVector = new ArrowColumnVector(vector)
302311
assert(columnVector.dataType === ArrayType(IntegerType))
312+
assert(columnVector.hasNull)
303313
assert(columnVector.numNulls === 1)
304314

305315
val array0 = columnVector.getArray(0)
@@ -344,6 +354,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
344354

345355
val columnVector = new ArrowColumnVector(vector)
346356
assert(columnVector.dataType === schema)
357+
assert(!columnVector.hasNull)
347358
assert(columnVector.numNulls === 0)
348359

349360
val row0 = columnVector.getStruct(0)
@@ -396,6 +407,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
396407

397408
val columnVector = new ArrowColumnVector(vector)
398409
assert(columnVector.dataType === schema)
410+
assert(columnVector.hasNull)
399411
assert(columnVector.numNulls === 1)
400412

401413
val row0 = columnVector.getStruct(0)

sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,41 +66,49 @@ class ColumnarBatchSuite extends SparkFunSuite {
6666
column =>
6767
val reference = mutable.ArrayBuffer.empty[Boolean]
6868
var idx = 0
69+
assert(!column.hasNull)
6970
assert(column.numNulls() == 0)
7071

7172
column.appendNotNull()
7273
reference += false
74+
assert(!column.hasNull)
7375
assert(column.numNulls() == 0)
7476

7577
column.appendNotNulls(3)
7678
(1 to 3).foreach(_ => reference += false)
79+
assert(!column.hasNull)
7780
assert(column.numNulls() == 0)
7881

7982
column.appendNull()
8083
reference += true
84+
assert(column.hasNull)
8185
assert(column.numNulls() == 1)
8286

8387
column.appendNulls(3)
8488
(1 to 3).foreach(_ => reference += true)
89+
assert(column.hasNull)
8590
assert(column.numNulls() == 4)
8691

8792
idx = column.elementsAppended
8893

8994
column.putNotNull(idx)
9095
reference += false
9196
idx += 1
97+
assert(column.hasNull)
9298
assert(column.numNulls() == 4)
9399

94100
column.putNull(idx)
95101
reference += true
96102
idx += 1
103+
assert(column.hasNull)
97104
assert(column.numNulls() == 5)
98105

99106
column.putNulls(idx, 3)
100107
reference += true
101108
reference += true
102109
reference += true
103110
idx += 3
111+
assert(column.hasNull)
104112
assert(column.numNulls() == 8)
105113

106114
column.putNotNulls(idx, 4)
@@ -109,6 +117,7 @@ class ColumnarBatchSuite extends SparkFunSuite {
109117
reference += false
110118
reference += false
111119
idx += 4
120+
assert(column.hasNull)
112121
assert(column.numNulls() == 8)
113122

114123
reference.zipWithIndex.foreach { v =>

0 commit comments

Comments
 (0)