Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
* limitations under the License.
*/

package org.apache.spark.sql.hive
package org.apache.spark.sql.execution.datasources

import java.io.{DataInput, DataOutput, IOException}
import java.sql.Date
Expand All @@ -35,11 +35,12 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils.{rebaseGregorianToJulian
* @param julianDays The number of days since the epoch 1970-01-01 in
* Julian calendar.
*/
private[hive] class DaysWritable(
class DaysWritable(
var gregorianDays: Int,
var julianDays: Int)
extends DateWritable {

def this() = this(0, 0)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume that gregorianDays and julianDays will be set later via the set method.

def this(gregorianDays: Int) =
this(gregorianDays, rebaseGregorianToJulianDays(gregorianDays))
def this(dateWritable: DateWritable) = {
Expand All @@ -55,6 +56,11 @@ private[hive] class DaysWritable(
override def getDays: Int = julianDays
override def get(): Date = new Date(DateWritable.daysToMillis(julianDays))

override def set(d: Int): Unit = {
gregorianDays = d
julianDays = rebaseGregorianToJulianDays(d)
}

@throws[IOException]
override def write(out: DataOutput): Unit = {
WritableUtils.writeVInt(out, julianDays)
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.orc

import java.io.File
import java.nio.charset.StandardCharsets.UTF_8
import java.sql.Timestamp
import java.sql.{Date, Timestamp}
import java.util.Locale

import org.apache.hadoop.conf.Configuration
Expand Down Expand Up @@ -482,6 +482,32 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll {
}
}
}

test("SPARK-31238: compatibility with Spark 2.4 in reading dates") {
  // A date before the Gregorian cutover (1582-10-15) written by Spark 2.4
  // must be read back unchanged by both the vectorized and row-based readers.
  val expected = Row(java.sql.Date.valueOf("1200-01-01"))
  for (useVectorized <- Seq(false, true)) {
    withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> useVectorized.toString) {
      checkAnswer(readResourceOrcFile("test-data/before_1582_date_v2_4.snappy.orc"), expected)
    }
  }
}

test("SPARK-31238: rebasing dates in write") {
  // Writing a pre-Gregorian-cutover date and reading it back must round-trip
  // through both the vectorized and row-based ORC readers.
  withTempPath { dir =>
    val outputPath = dir.getAbsolutePath
    val df = Seq("1001-01-01").toDF("dateS").select($"dateS".cast("date").as("date"))
    df.write.orc(outputPath)

    for (useVectorized <- Seq(false, true)) {
      withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> useVectorized.toString) {
        checkAnswer(spark.read.orc(outputPath), Row(Date.valueOf("1001-01-01")))
      }
    }
  }
}
}

class OrcSourceSuite extends OrcSuite with SharedSparkSession {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,4 +133,9 @@ abstract class OrcTest extends QueryTest with FileBasedDataSourceTest with Befor
throw new AnalysisException("Can not match OrcTable in the query.")
}
}

/**
 * Loads an ORC file bundled as a test resource on the classpath.
 *
 * @param name resource path relative to the classpath root.
 * @return a DataFrame backed by the resource ORC file.
 */
protected def readResourceOrcFile(name: String): DataFrame = {
  val resourceUrl = Thread.currentThread().getContextClassLoader.getResource(name)
  spark.read.orc(resourceUrl.toString)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

import org.apache.spark.sql.catalyst.util.DateTimeUtils;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DateType;
import org.apache.spark.sql.types.Decimal;
import org.apache.spark.sql.types.TimestampType;
import org.apache.spark.sql.vectorized.ColumnarArray;
Expand All @@ -42,6 +43,7 @@ public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVecto
private DecimalColumnVector decimalData;
private TimestampColumnVector timestampData;
private final boolean isTimestamp;
private final boolean isDate;

private int batchSize;

Expand All @@ -54,6 +56,12 @@ public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVecto
isTimestamp = false;
}

if (type instanceof DateType) {
isDate = true;
} else {
isDate = false;
}

baseData = vector;
if (vector instanceof LongColumnVector) {
longData = (LongColumnVector) vector;
Expand Down Expand Up @@ -130,7 +138,12 @@ public short getShort(int rowId) {

@Override
public int getInt(int rowId) {
  // ORC stores DATE values as day counts in the hybrid Julian calendar;
  // rebase them to Spark's Gregorian day counts (see SPARK-31238). Non-date
  // integer columns are returned as-is.
  // Fix: the stale unconditional `return` left from the old implementation is
  // removed — it made the rebasing branch unreachable/invalid.
  int value = (int) longData.vector[getRowIndex(rowId)];
  if (isDate) {
    return DateTimeUtils.rebaseJulianToGregorianDays(value);
  } else {
    return value;
  }
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.execution.datasources.orc

import java.io.{DataInput, DataOutput, IOException}
import java.sql.Date

import org.apache.hadoop.io.WritableUtils
import org.apache.orc.storage.serde2.io.DateWritable

import org.apache.spark.sql.catalyst.util.DateTimeUtils.{rebaseGregorianToJulianDays, rebaseJulianToGregorianDays}

/**
* The class accepts/returns days in Gregorian calendar and rebase them
* via conversion to local date in Julian calendar for dates before 1582-10-15
* in read/write for backward compatibility with Spark 2.4 and earlier versions.
*
* This is a clone of `org.apache.spark.sql.execution.datasources.DaysWritable`.
* The class is cloned because Hive ORC v1.2 uses different `DateWritable`:
* - v1.2: `org.apache.orc.storage.serde2.io.DateWritable`
* - v2.3 and `HiveInspectors`: `org.apache.hadoop.hive.serde2.io.DateWritable`
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need the above four lines of comments, because the v1.2 and v2.3 folder structure is already designed to handle that.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I move it to v2.3 and enable hive-1.2, HiveInspectors will use DaysWritable from v1.2, which is wrong — or am I missing something in your proposal?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it. Let me check again.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

? Sorry, why do you move this to 2.3?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I ask is keep this AS-IS, and move the other DaysWritable.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm checking again from my side.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is what I tried MaxGekk#26, and it fails when I enable hive-1.2:

build/sbt -Phive-1.2 clean package

Feel free to open a PR for this PR if you mean something different.

*
* @param gregorianDays The number of days since the epoch 1970-01-01 in
* Gregorian calendar.
* @param julianDays The number of days since the epoch 1970-01-01 in
* Julian calendar.
*/
class DaysWritable(
    var gregorianDays: Int,
    var julianDays: Int)
  extends DateWritable {

  // Both counters start at zero; callers are expected to populate them later
  // via `set` or `readFields`.
  def this() = this(0, 0)

  // Builds an instance from days in the Gregorian calendar, deriving the
  // Julian representation by rebasing.
  def this(gregorianDays: Int) =
    this(gregorianDays, rebaseGregorianToJulianDays(gregorianDays))

  // Builds an instance from another writable. A `DaysWritable` already carries
  // the Gregorian day count; a plain `DateWritable` holds Julian days, which
  // must be rebased. The more specific `DaysWritable` case must come first.
  def this(dateWritable: DateWritable) = {
    this(
      gregorianDays = dateWritable match {
        case days: DaysWritable => days.gregorianDays
        case other: DateWritable => rebaseJulianToGregorianDays(other.getDays)
      },
      julianDays = dateWritable.getDays)
  }

  // Hive-facing accessors expose the Julian representation.
  override def getDays: Int = julianDays
  override def get(): Date = new Date(DateWritable.daysToMillis(julianDays))

  // Accepts Gregorian days and keeps the Julian counter in sync.
  override def set(d: Int): Unit = {
    gregorianDays = d
    julianDays = rebaseGregorianToJulianDays(d)
  }

  @throws[IOException]
  override def write(out: DataOutput): Unit = {
    // On-disk format stores the Julian day count, matching DateWritable.
    WritableUtils.writeVInt(out, julianDays)
  }

  @throws[IOException]
  override def readFields(in: DataInput): Unit = {
    val daysOnDisk = WritableUtils.readVInt(in)
    julianDays = daysOnDisk
    gregorianDays = rebaseJulianToGregorianDays(daysOnDisk)
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,13 @@ private[sql] object OrcShimUtils {

def getDateWritable(reuseObj: Boolean): (SpecializedGetters, Int) => DateWritable = {
  // DaysWritable rebases Gregorian day counts to Julian for
  // backward-compatible ORC writes (SPARK-31238).
  // Fix: the stale pre-diff lines creating `DateWritable` directly are
  // removed — they duplicated the definitions and skipped rebasing.
  if (reuseObj) {
    // Reuse one mutable instance across rows to avoid per-row allocation.
    val result = new DaysWritable()
    (getter, ordinal) =>
      result.set(getter.getInt(ordinal))
      result
  } else {
    (getter: SpecializedGetters, ordinal: Int) =>
      new DaysWritable(getter.getInt(ordinal))
  }
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

import org.apache.spark.sql.catalyst.util.DateTimeUtils;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DateType;
import org.apache.spark.sql.types.Decimal;
import org.apache.spark.sql.types.TimestampType;
import org.apache.spark.sql.vectorized.ColumnarArray;
Expand All @@ -42,6 +43,7 @@ public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVecto
private DecimalColumnVector decimalData;
private TimestampColumnVector timestampData;
private final boolean isTimestamp;
private final boolean isDate;

private int batchSize;

Expand All @@ -54,6 +56,12 @@ public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVecto
isTimestamp = false;
}

if (type instanceof DateType) {
isDate = true;
} else {
isDate = false;
}

baseData = vector;
if (vector instanceof LongColumnVector) {
longData = (LongColumnVector) vector;
Expand Down Expand Up @@ -130,7 +138,12 @@ public short getShort(int rowId) {

@Override
public int getInt(int rowId) {
  // ORC stores DATE values as day counts in the hybrid Julian calendar;
  // rebase them to Spark's Gregorian day counts (see SPARK-31238). Non-date
  // integer columns are returned as-is.
  // Fix: the stale unconditional `return` left from the old implementation is
  // removed — it made the rebasing branch unreachable/invalid.
  int value = (int) longData.vector[getRowIndex(rowId)];
  if (isDate) {
    return DateTimeUtils.rebaseJulianToGregorianDays(value);
  } else {
    return value;
  }
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf.{Operator => OrcOperator}
import org.apache.hadoop.hive.serde2.io.{DateWritable, HiveDecimalWritable}

import org.apache.spark.sql.catalyst.expressions.SpecializedGetters
import org.apache.spark.sql.execution.datasources.DaysWritable
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After moving DaysWritable.scala to here. I guess we don't need this line. Please try to remove.

import org.apache.spark.sql.types.Decimal

/**
Expand All @@ -47,13 +48,13 @@ private[sql] object OrcShimUtils {

def getDateWritable(reuseObj: Boolean): (SpecializedGetters, Int) => DateWritable = {
  // DaysWritable rebases Gregorian day counts to Julian for
  // backward-compatible ORC writes (SPARK-31238).
  // Fix: the stale pre-diff lines creating `DateWritable` directly are
  // removed — they duplicated the definitions and skipped rebasing.
  if (reuseObj) {
    // Reuse one mutable instance across rows to avoid per-row allocation.
    val result = new DaysWritable()
    (getter, ordinal) =>
      result.set(getter.getInt(ordinal))
      result
  } else {
    (getter: SpecializedGetters, ordinal: Int) =>
      new DaysWritable(getter.getInt(ordinal))
  }
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.execution.datasources.DaysWritable
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unnecessary change

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I remove it, the build fails with:

Error:(1012, 44) not found: type DaysWritable
  private def getDateWritable(value: Any): DaysWritable =

I moved DaysWritable from sql/hive to sql/core to reuse it in ORC.

import org.apache.spark.sql.types
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
Expand Down