Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ object RebaseDateTime {
-719164, -682945, -646420, -609895, -536845, -500320, -463795,
-390745, -354220, -317695, -244645, -208120, -171595, -141427)

// The last switch day taken from `julianGregDiffSwitchDay`. Used by the vectorized
// Parquet readers as a fast-path threshold: days on or after this value are compared
// against it to decide whether any rebasing work is needed — TODO confirm exact
// semantics against the rebase functions.
final val lastSwitchJulianDay: Int = julianGregDiffSwitchDay.last

// The first days of Common Era (CE) which is mapped to the '0001-01-01' date in Julian calendar.
private final val julianCommonEraStartDay = julianGregDiffSwitchDay(0)

Expand Down Expand Up @@ -416,6 +418,8 @@ object RebaseDateTime {
// in the interval: [julianGregDiffSwitchMicros(i), julianGregDiffSwitchMicros(i+1))
private val julianGregRebaseMap = loadRebaseRecords("julian-gregorian-rebase-micros.json")

// The maximum of the last switch timestamps across all time zones in
// `julianGregRebaseMap`. Serves as a conservative threshold for the vectorized
// Parquet readers: timestamps at or above it need no per-zone rebasing check —
// NOTE(review): presumably microseconds since the epoch; verify against the map's units.
final val lastSwitchJulianTs: Long = julianGregRebaseMap.values.map(_.switches.last).max

/**
* An optimized version of [[rebaseJulianToGregorianMicros(ZoneId, Long)]]. This method leverages
* the pre-calculated rebasing maps to save calculation. If the rebasing map doesn't contain
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -423,15 +423,8 @@ private void readIntBatch(int rowId, int num, WritableColumnVector column) throw
num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
} else if (column.dataType() == DataTypes.DateType ) {
if (rebaseDateTime) {
for (int i = 0; i < num; i++) {
if (defColumn.readInteger() == maxDefLevel) {
column.putInt(
rowId + i,
RebaseDateTime.rebaseJulianToGregorianDays(dataColumn.readInteger()));
} else {
column.putNull(rowId + i);
}
}
defColumn.readIntegersWithRebase(
num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
} else {
defColumn.readIntegers(
num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
Expand All @@ -449,15 +442,8 @@ private void readLongBatch(int rowId, int num, WritableColumnVector column) thro
num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
} else if (originalType == OriginalType.TIMESTAMP_MICROS) {
if (rebaseDateTime) {
for (int i = 0; i < num; i++) {
if (defColumn.readInteger() == maxDefLevel) {
column.putLong(
rowId + i,
RebaseDateTime.rebaseJulianToGregorianMicros(dataColumn.readLong()));
} else {
column.putNull(rowId + i);
}
}
defColumn.readLongsWithRebase(
num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
} else {
defColumn.readLongs(
num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.io.ParquetDecodingException;
import org.apache.spark.sql.catalyst.util.RebaseDateTime;
import org.apache.spark.sql.execution.vectorized.WritableColumnVector;

import org.apache.parquet.column.values.ValuesReader;
Expand Down Expand Up @@ -81,6 +82,33 @@ public final void readIntegers(int total, WritableColumnVector c, int rowId) {
}
}

// A fork of `readIntegers` to rebase the date values. For performance reasons, this method
// iterates the values twice: check if we need to rebase first, then go to the optimized branch
// if rebase is not needed.
@Override
public final void readIntegersWithRebase(int total, WritableColumnVector c, int rowId) {
  int requiredBytes = total * 4;
  ByteBuffer buffer = getBuffer(requiredBytes);
  // Hoist the loop-invariant threshold out of the scan loop: the original body invoked
  // RebaseDateTime.lastSwitchJulianDay() once per element.
  final int lastSwitchDay = RebaseDateTime.lastSwitchJulianDay();
  // First pass: absolute reads (do not move the buffer position) to detect whether any
  // value falls before the calendar switch and therefore needs rebasing.
  boolean rebase = false;
  for (int i = 0; i < total; i += 1) {
    rebase |= buffer.getInt(buffer.position() + i * 4) < lastSwitchDay;
  }
  if (rebase) {
    // Slow path: rebase every value individually via relative reads.
    for (int i = 0; i < total; i += 1) {
      c.putInt(rowId + i, RebaseDateTime.rebaseJulianToGregorianDays(buffer.getInt()));
    }
  } else {
    // Fast path: no rebasing needed; bulk-copy when the buffer is array-backed.
    if (buffer.hasArray()) {
      int offset = buffer.arrayOffset() + buffer.position();
      c.putIntsLittleEndian(rowId, total, buffer.array(), offset);
    } else {
      for (int i = 0; i < total; i += 1) {
        c.putInt(rowId + i, buffer.getInt());
      }
    }
  }
}

@Override
public final void readLongs(int total, WritableColumnVector c, int rowId) {
int requiredBytes = total * 8;
Expand All @@ -96,6 +124,33 @@ public final void readLongs(int total, WritableColumnVector c, int rowId) {
}
}

// A fork of `readLongs` to rebase the timestamp values. For performance reasons, this method
// iterates the values twice: check if we need to rebase first, then go to the optimized branch
// if rebase is not needed.
@Override
public final void readLongsWithRebase(int total, WritableColumnVector c, int rowId) {
int requiredBytes = total * 8;
ByteBuffer buffer = getBuffer(requiredBytes);
boolean rebase = false;
for (int i = 0; i < total; i += 1) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The byte code of the loop is:

    LINENUMBER 136 L6
    ILOAD 6
    ALOAD 5
    ALOAD 5
    INVOKEVIRTUAL java/nio/ByteBuffer.position ()I
    ILOAD 7
    BIPUSH 8
    IMUL
    IADD
    INVOKEVIRTUAL java/nio/ByteBuffer.getLong (I)J
    INVOKESTATIC org/apache/spark/sql/catalyst/util/RebaseDateTime.lastSwitchJulianTs ()J
    LCMP
    IFGE L7
    ICONST_1
    GOTO L8

We could avoid mul like

    int pos = buffer.position();
    int endPos = pos + total * 8;
    long threshold = RebaseDateTime.lastSwitchJulianTs();
    while (pos < endPos) {
      rebase |= buffer.getLong(pos) < threshold;
      pos += 8;
    }

Would it be faster?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried it and the perf has no difference.

rebase |= buffer.getLong(buffer.position() + i * 8) < RebaseDateTime.lastSwitchJulianTs();
}
if (rebase) {
for (int i = 0; i < total; i += 1) {
c.putLong(rowId + i, RebaseDateTime.rebaseJulianToGregorianMicros(buffer.getLong()));
}
} else {
if (buffer.hasArray()) {
int offset = buffer.arrayOffset() + buffer.position();
c.putLongsLittleEndian(rowId, total, buffer.array(), offset);
} else {
for (int i = 0; i < total; i += 1) {
c.putLong(rowId + i, buffer.getLong());
}
}
}
}

@Override
public final void readFloats(int total, WritableColumnVector c, int rowId) {
int requiredBytes = total * 4;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.apache.parquet.io.ParquetDecodingException;
import org.apache.parquet.io.api.Binary;

import org.apache.spark.sql.catalyst.util.RebaseDateTime;
import org.apache.spark.sql.execution.vectorized.WritableColumnVector;

import java.io.IOException;
Expand Down Expand Up @@ -203,6 +204,43 @@ public void readIntegers(
}
}

// A fork of `readIntegers`, which rebases the date int value (days) before filling
// the Spark column vector.
// Decodes `total` definition levels from the RLE/bit-packed hybrid stream; positions
// whose level equals `level` are read from `data` (rebased Julian->Gregorian), the
// rest are set to null. Mutates the reader's run state (currentCount, currentValue,
// currentBuffer/currentBufferIdx), so statement order here is load-bearing.
public void readIntegersWithRebase(
int total,
WritableColumnVector c,
int rowId,
int level,
VectorizedValuesReader data) throws IOException {
int left = total;
while (left > 0) {
// Refill the current run of definition levels when exhausted.
if (this.currentCount == 0) this.readNextGroup();
// Process at most the remainder of the current run in this iteration.
int n = Math.min(left, this.currentCount);
switch (mode) {
case RLE:
// A whole run with one level: either all values are defined (bulk read with
// rebase, delegated to the data reader) or all are null.
if (currentValue == level) {
data.readIntegersWithRebase(n, c, rowId);
} else {
c.putNulls(rowId, n);
}
break;
case PACKED:
// Mixed levels: inspect each packed level and rebase defined values one by one.
for (int i = 0; i < n; ++i) {
if (currentBuffer[currentBufferIdx++] == level) {
c.putInt(rowId + i,
RebaseDateTime.rebaseJulianToGregorianDays(data.readInteger()));
} else {
c.putNull(rowId + i);
}
}
break;
}
rowId += n;
left -= n;
currentCount -= n;
}
}

// TODO: can this code duplication be removed without a perf penalty?
public void readBooleans(
int total,
Expand Down Expand Up @@ -342,6 +380,43 @@ public void readLongs(
}
}

// A fork of `readLongs`, which rebases the timestamp long value (microseconds) before filling
// the Spark column vector.
public void readLongsWithRebase(
int total,
WritableColumnVector c,
int rowId,
int level,
VectorizedValuesReader data) throws IOException {
int left = total;
while (left > 0) {
if (this.currentCount == 0) this.readNextGroup();
int n = Math.min(left, this.currentCount);
switch (mode) {
case RLE:
if (currentValue == level) {
data.readLongsWithRebase(n, c, rowId);
} else {
c.putNulls(rowId, n);
}
break;
case PACKED:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it impossible to optimize the case too?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't optimize this case because the no-rebase code path looks not very fast. It has a if-else in the loop.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The general idea is to add an extra loop to check if we need to rebase or not, and it's only worthwhile if the no-rebase code path is much faster than the rebase code path.

for (int i = 0; i < n; ++i) {
if (currentBuffer[currentBufferIdx++] == level) {
c.putLong(rowId + i,
RebaseDateTime.rebaseJulianToGregorianMicros(data.readLong()));
} else {
c.putNull(rowId + i);
}
}
break;
}
rowId += n;
left -= n;
currentCount -= n;
}
}

public void readFloats(
int total,
WritableColumnVector c,
Expand Down Expand Up @@ -508,6 +583,11 @@ public void readIntegers(int total, WritableColumnVector c, int rowId) {
}
}

@Override
// This reader decodes dictionary ids only, so bulk value reads with rebasing do not
// apply here — rebasing presumably happens after the dictionary lookup elsewhere
// (NOTE(review): confirm against the dictionary decoding path).
public void readIntegersWithRebase(int total, WritableColumnVector c, int rowId) {
throw new UnsupportedOperationException("only readInts is valid.");
}

@Override
public byte readByte() {
throw new UnsupportedOperationException("only readInts is valid.");
Expand All @@ -523,6 +603,11 @@ public void readLongs(int total, WritableColumnVector c, int rowId) {
throw new UnsupportedOperationException("only readInts is valid.");
}

@Override
// This reader decodes dictionary ids only, so bulk value reads with rebasing do not
// apply here — rebasing presumably happens after the dictionary lookup elsewhere
// (NOTE(review): confirm against the dictionary decoding path).
public void readLongsWithRebase(int total, WritableColumnVector c, int rowId) {
throw new UnsupportedOperationException("only readInts is valid.");
}

@Override
public void readBinary(int total, WritableColumnVector c, int rowId) {
throw new UnsupportedOperationException("only readInts is valid.");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ public interface VectorizedValuesReader {
void readBooleans(int total, WritableColumnVector c, int rowId);
void readBytes(int total, WritableColumnVector c, int rowId);
void readIntegers(int total, WritableColumnVector c, int rowId);
void readIntegersWithRebase(int total, WritableColumnVector c, int rowId);
void readLongs(int total, WritableColumnVector c, int rowId);
void readLongsWithRebase(int total, WritableColumnVector c, int rowId);
void readFloats(int total, WritableColumnVector c, int rowId);
void readDoubles(int total, WritableColumnVector c, int rowId);
void readBinary(int total, WritableColumnVector c, int rowId);
Expand Down
Loading