@@ -15,7 +15,7 @@
* limitations under the License.
*/

package org.apache.spark.sql.hive
package org.apache.spark.sql.execution.datasources

import java.io.{DataInput, DataOutput, IOException}
import java.sql.Date
@@ -35,11 +35,12 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils.{rebaseGregorianToJulian
* @param julianDays The number of days since the epoch 1970-01-01 in
* Julian calendar.
*/
private[hive] class DaysWritable(
class DaysWritable(
var gregorianDays: Int,
var julianDays: Int)
extends DateWritable {

def this() = this(0, 0)

Member Author: I assume that gregorianDays and julianDays will be set later via the set method.

def this(gregorianDays: Int) =
this(gregorianDays, rebaseGregorianToJulianDays(gregorianDays))
def this(dateWritable: DateWritable) = {
@@ -55,6 +56,11 @@ private[hive] class DaysWritable(
override def getDays: Int = julianDays
override def get(): Date = new Date(DateWritable.daysToMillis(julianDays))

override def set(d: Int): Unit = {
gregorianDays = d
julianDays = rebaseGregorianToJulianDays(d)
}

@throws[IOException]
override def write(out: DataOutput): Unit = {
WritableUtils.writeVInt(out, julianDays)
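
To make the author's comment above concrete, here is a minimal standalone sketch (not part of the PR; it assumes spark-sql and the Hive serde classes on the classpath) of the reuse pattern the no-arg constructor enables, mirroring the reuseObj branch of OrcShimUtils.getDateWritable later in this diff: one DaysWritable is allocated and then set repeatedly, with set rebasing each proleptic Gregorian day count to the hybrid Julian value exposed by getDays.

import java.time.LocalDate

import org.apache.spark.sql.execution.datasources.DaysWritable

object DaysWritableReuseSketch {
  def main(args: Array[String]): Unit = {
    // Allocate once; both day fields start at 0 and are overwritten by set().
    val reused = new DaysWritable()
    Seq("1001-01-01", "1200-01-01", "2020-03-25").foreach { s =>
      // Days since 1970-01-01 in the proleptic Gregorian calendar,
      // which is how Spark 3.0 represents DateType values internally.
      val gregorianDays = LocalDate.parse(s).toEpochDay.toInt
      reused.set(gregorianDays)
      // getDays returns the rebased Julian day count that Hive/ORC writers expect.
      println(s"$s: gregorian=$gregorianDays, julian=${reused.getDays}")
    }
  }
}
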
Binary file not shown.
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.orc

import java.io.File
import java.nio.charset.StandardCharsets.UTF_8
import java.sql.Timestamp
import java.sql.{Date, Timestamp}
import java.util.Locale

import org.apache.hadoop.conf.Configuration
@@ -482,6 +482,32 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll {
}
}
}

test("SPARK-31238: compatibility with Spark 2.4 in reading dates") {
Seq(false, true).foreach { vectorized =>
withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> vectorized.toString) {
checkAnswer(
readResourceOrcFile("test-data/before_1582_date_v2_4.snappy.orc"),
Row(java.sql.Date.valueOf("1200-01-01")))
}
}
}

test("SPARK-31238: rebasing dates in write") {
withTempPath { dir =>
val path = dir.getAbsolutePath
Seq("1001-01-01").toDF("dateS")
.select($"dateS".cast("date").as("date"))
.write
.orc(path)

Seq(false, true).foreach { vectorized =>
withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> vectorized.toString) {
checkAnswer(spark.read.orc(path), Row(Date.valueOf("1001-01-01")))
}
}
}
}
}

class OrcSourceSuite extends OrcSuite with SharedSparkSession {
@@ -133,4 +133,9 @@ abstract class OrcTest extends QueryTest with FileBasedDataSourceTest with Befor
throw new AnalysisException("Can not match OrcTable in the query.")
}
}

protected def readResourceOrcFile(name: String): DataFrame = {
val url = Thread.currentThread().getContextClassLoader.getResource(name)
spark.read.orc(url.toString)
}
}
@@ -23,6 +23,7 @@

import org.apache.spark.sql.catalyst.util.DateTimeUtils;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DateType;
import org.apache.spark.sql.types.Decimal;
import org.apache.spark.sql.types.TimestampType;
import org.apache.spark.sql.vectorized.ColumnarArray;
@@ -42,6 +43,7 @@ public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVecto
private DecimalColumnVector decimalData;
private TimestampColumnVector timestampData;
private final boolean isTimestamp;
private final boolean isDate;

private int batchSize;

@@ -54,6 +56,12 @@ public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVecto
isTimestamp = false;
}

if (type instanceof DateType) {
isDate = true;
} else {
isDate = false;
}

baseData = vector;
if (vector instanceof LongColumnVector) {
longData = (LongColumnVector) vector;
@@ -130,7 +138,13 @@ public short getShort(int rowId) {

@Override
public int getInt(int rowId) {
return (int) longData.vector[getRowIndex(rowId)];
int index = getRowIndex(rowId);
int value = (int) longData.vector[index];

Contributor: nit: int value = (int) longData.vector[getRowIndex(rowId)];

if (isDate) {
return DateTimeUtils.rebaseJulianToGregorianDays(value);
} else {
return value;
}
}

@Override
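
As context for the isDate branch added to getInt above, the following standalone sketch (not part of the PR; it only assumes the two DateTimeUtils rebase helpers already referenced in this diff) shows why the raw ORC day count has to be rebased for dates before the Gregorian cutover: the hybrid Julian and proleptic Gregorian calendars assign different day numbers to the same local date.

import java.time.LocalDate

import org.apache.spark.sql.catalyst.util.DateTimeUtils

object RebaseDaysSketch {
  def main(args: Array[String]): Unit = {
    // 1200-01-01 as days since 1970-01-01 in the proleptic Gregorian calendar,
    // the representation Spark 3.0 uses for DateType.
    val gregorianDays = LocalDate.of(1200, 1, 1).toEpochDay.toInt
    // The day count a Hive/ORC writer based on the hybrid Julian calendar stores.
    val julianDays = DateTimeUtils.rebaseGregorianToJulianDays(gregorianDays)
    // What the patched OrcColumnVector.getInt does for DateType columns on read.
    val readBack = DateTimeUtils.rebaseJulianToGregorianDays(julianDays)
    println(s"gregorian=$gregorianDays, julian=$julianDays, readBack=$readBack")
    assert(readBack == gregorianDays, "round trip should restore the Gregorian day count")
  }
}
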
@@ -26,6 +26,7 @@ import org.apache.orc.storage.ql.io.sarg.PredicateLeaf.{Operator => OrcOperator}
import org.apache.orc.storage.serde2.io.{DateWritable, HiveDecimalWritable}

import org.apache.spark.sql.catalyst.expressions.SpecializedGetters
import org.apache.spark.sql.execution.datasources.DaysWritable
import org.apache.spark.sql.types.Decimal

/**
@@ -47,13 +48,13 @@ private[sql] object OrcShimUtils {

def getDateWritable(reuseObj: Boolean): (SpecializedGetters, Int) => DateWritable = {
if (reuseObj) {
val result = new DateWritable()
val result = new DaysWritable()
(getter, ordinal) =>
result.set(getter.getInt(ordinal))
result
} else {
(getter: SpecializedGetters, ordinal: Int) =>
new DateWritable(getter.getInt(ordinal))
new DaysWritable(getter.getInt(ordinal))
}
}

@@ -23,6 +23,7 @@

import org.apache.spark.sql.catalyst.util.DateTimeUtils;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DateType;
import org.apache.spark.sql.types.Decimal;
import org.apache.spark.sql.types.TimestampType;
import org.apache.spark.sql.vectorized.ColumnarArray;
@@ -42,6 +43,7 @@ public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVecto
private DecimalColumnVector decimalData;
private TimestampColumnVector timestampData;
private final boolean isTimestamp;
private final boolean isDate;

private int batchSize;

@@ -54,6 +56,12 @@ public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVecto
isTimestamp = false;
}

if (type instanceof DateType) {
isDate = true;
} else {
isDate = false;
}

baseData = vector;
if (vector instanceof LongColumnVector) {
longData = (LongColumnVector) vector;
@@ -130,7 +138,13 @@ public short getShort(int rowId) {

@Override
public int getInt(int rowId) {
return (int) longData.vector[getRowIndex(rowId)];
int index = getRowIndex(rowId);
int value = (int) longData.vector[index];
if (isDate) {
return DateTimeUtils.rebaseJulianToGregorianDays(value);
} else {
return value;
}
}

@Override
@@ -26,6 +26,7 @@ import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf.{Operator => OrcOperator}
import org.apache.hadoop.hive.serde2.io.{DateWritable, HiveDecimalWritable}

import org.apache.spark.sql.catalyst.expressions.SpecializedGetters
import org.apache.spark.sql.execution.datasources.DaysWritable

Member: After moving DaysWritable.scala here, I guess we don't need this line. Please try to remove it.

import org.apache.spark.sql.types.Decimal

/**
Expand All @@ -47,13 +48,13 @@ private[sql] object OrcShimUtils {

def getDateWritable(reuseObj: Boolean): (SpecializedGetters, Int) => DateWritable = {
if (reuseObj) {
val result = new DateWritable()
val result = new DaysWritable()
(getter, ordinal) =>
result.set(getter.getInt(ordinal))
result
} else {
(getter: SpecializedGetters, ordinal: Int) =>
new DateWritable(getter.getInt(ordinal))
new DaysWritable(getter.getInt(ordinal))
}
}

@@ -32,6 +32,7 @@ import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.execution.datasources.DaysWritable

Contributor: unnecessary change

Member Author: If I remove it, the build fails with:

Error:(1012, 44) not found: type DaysWritable
  private def getDateWritable(value: Any): DaysWritable =

I moved DaysWritable from sql/hive to sql/core to reuse it in ORC.

import org.apache.spark.sql.types
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
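
For readers wondering about the helper named in the error message above, here is a hypothetical sketch only (the method body is not shown in this diff, and the real code in HiveInspectors.scala may differ): a converter of this shape needs DaysWritable so that the Int day count Spark stores for DateType is rebased to the Julian days Hive expects.

import org.apache.spark.sql.execution.datasources.DaysWritable

object HiveDateWritableSketch {
  // Hypothetical stand-in for the getDateWritable helper named in the error message;
  // the one-arg DaysWritable constructor rebases Gregorian days to Julian days.
  def getDateWritable(value: Any): DaysWritable =
    if (value == null) null else new DaysWritable(value.asInstanceOf[Int])
}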