Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,44 @@
*/
public abstract class Statistics<T extends Comparable<T>> {

/**
 * Fluent builder for {@link Statistics} objects. Used to read the statistics from the Parquet file.
 * <p>
 * Every value is optional. The min/max pair is applied only when both bounds were supplied, and a
 * null count that is never supplied is stored as {@code -1} in the built object.
 */
public static class Builder {
  private final PrimitiveTypeName type;
  private long numNulls = -1;
  private byte[] min;
  private byte[] max;

  /**
   * Instances are created through the enclosing class, not directly.
   *
   * @param type
   *          type of the column the statistics belong to
   */
  private Builder(PrimitiveTypeName type) {
    this.type = type;
  }

  /**
   * @param min
   *          serialized minimum value; the array is kept by reference
   * @return this builder
   */
  public Builder withMin(byte[] min) {
    this.min = min;
    return this;
  }

  /**
   * @param max
   *          serialized maximum value; the array is kept by reference
   * @return this builder
   */
  public Builder withMax(byte[] max) {
    this.max = max;
    return this;
  }

  /**
   * @param numNulls
   *          null count to record in the built statistics
   * @return this builder
   */
  public Builder withNumNulls(long numNulls) {
    this.numNulls = numNulls;
    return this;
  }

  /**
   * Creates the statistics object matching the column type and copies the collected values into it.
   *
   * @return the populated statistics object
   */
  public Statistics<?> build() {
    final Statistics<?> built = getStatsBasedOnType(type);
    final boolean boundsMissing = (min == null) || (max == null);
    if (!boundsMissing) {
      // a lone min or a lone max is ignored; the bounds are only applied as a pair
      built.setMinMaxFromBytes(min, max);
    }
    // written unconditionally, so an untouched builder leaves the -1 marker in place
    built.num_nulls = numNulls;
    return built;
  }
}

private boolean hasNonNullValue;
private long num_nulls;

Expand Down Expand Up @@ -67,6 +105,17 @@ public static Statistics getStatsBasedOnType(PrimitiveTypeName type) {
}
}

/**
* Returns a builder to create a new statistics object. Used to read the statistics from the Parquet file.
* <p>
* The min/max values and the null count are all optional on the builder; a null count that is never
* supplied is recorded as {@code -1} in the built object.
*
* @param type
* type of the column
* @return builder to create new statistics object
*/
public static Builder getBuilder(PrimitiveTypeName type) {
return new Builder(type);
}

/**
* updates statistics min and max using the passed value
* @param value value to use to update min and max
Expand Down Expand Up @@ -172,7 +221,9 @@ public void mergeStatistics(Statistics stats) {
* Abstract method to set min and max values from byte arrays.
* @param minBytes byte array to set the min value to
* @param maxBytes byte array to set the max value to
* @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveTypeName)} instead.
*/
@Deprecated
abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes);

abstract public T genericGetMin();
Expand Down Expand Up @@ -221,16 +272,20 @@ public void incrementNumNulls(long increment) {

/**
* Returns the null count recorded in this statistics object.
*
* @return null count or {@code -1} if the null count is not set
*/
public long getNumNulls() {
return num_nulls;
}

/**
* Sets the number of nulls to the parameter value
*
* @param nulls
* null count to set the count to
* @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveTypeName)} instead.
*/
@Deprecated
public void setNumNulls(long nulls) {
num_nulls = nulls;
}
Expand All @@ -241,7 +296,7 @@ public void setNumNulls(long nulls) {
* @return true if object is empty, false otherwise
*/
public boolean isEmpty() {
// Empty means that neither kind of information is available: no non-null value was seen
// (so there is no min/max) and the null count is not usable. A null count of 0 is real
// information ("this chunk has no nulls"), so only an unset count may make the object
// look empty.
return !hasNonNullValue && !isNumNullsSet();
}

/**
Expand All @@ -251,6 +306,13 @@ public boolean hasNonNullValue() {
return hasNonNullValue;
}

/**
* Tells whether the null count of this statistics object can be relied on.
* <p>
* A negative count is the "not set" marker, so this is {@code true} only for counts of zero or more.
*
* @return whether numNulls is set and can be used
*/
public boolean isNumNullsSet() {
return num_nulls >= 0;
}

/**
* Sets the page/column as having a valid non-null value
* kind of misnomer here
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ public class TestStatistics {
@Test
public void testNumNulls() {
IntStatistics stats = new IntStatistics();
assertTrue(stats.isNumNullsSet());
assertEquals(stats.getNumNulls(), 0);

stats.incrementNumNulls();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
import org.apache.parquet.filter2.predicate.UserDefinedPredicate;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;

import static org.apache.parquet.Preconditions.checkArgument;
import static org.apache.parquet.Preconditions.checkNotNull;

/**
Expand Down Expand Up @@ -122,6 +121,10 @@ public <T extends Comparable<T>> Boolean visit(Eq<T> eq) {
}

if (value == null) {
// We don't know anything about the nulls in this chunk
if (!stats.isNumNullsSet()) {
return BLOCK_MIGHT_MATCH;
}
// we are looking for records where v eq(null)
// so drop if there are no nulls in this chunk
return !hasNulls(meta);
Expand All @@ -133,6 +136,11 @@ public <T extends Comparable<T>> Boolean visit(Eq<T> eq) {
return BLOCK_CANNOT_MATCH;
}

if (!stats.hasNonNullValue()) {
// stats does not contain min/max values, we cannot drop any chunks
return BLOCK_MIGHT_MATCH;
}

// drop if value < min || value > max
return value.compareTo(stats.genericGetMin()) < 0 || value.compareTo(stats.genericGetMax()) > 0;
}
Expand Down Expand Up @@ -166,12 +174,17 @@ public <T extends Comparable<T>> Boolean visit(NotEq<T> notEq) {
return isAllNulls(meta);
}

if (hasNulls(meta)) {
if (stats.isNumNullsSet() && hasNulls(meta)) {
// we are looking for records where v notEq(someNonNull)
// but this chunk contains nulls, we cannot drop it
return BLOCK_MIGHT_MATCH;
}

if (!stats.hasNonNullValue()) {
// stats does not contain min/max values, we cannot drop any chunks
return BLOCK_MIGHT_MATCH;
}

// drop if this is a column where min = max = value
return value.compareTo(stats.genericGetMin()) == 0 && value.compareTo(stats.genericGetMax()) == 0;
}
Expand Down Expand Up @@ -201,6 +214,11 @@ public <T extends Comparable<T>> Boolean visit(Lt<T> lt) {
return BLOCK_CANNOT_MATCH;
}

if (!stats.hasNonNullValue()) {
// stats does not contain min/max values, we cannot drop any chunks
return BLOCK_MIGHT_MATCH;
}

T value = lt.getValue();

// drop if value <= min
Expand Down Expand Up @@ -232,6 +250,11 @@ public <T extends Comparable<T>> Boolean visit(LtEq<T> ltEq) {
return BLOCK_CANNOT_MATCH;
}

if (!stats.hasNonNullValue()) {
// stats does not contain min/max values, we cannot drop any chunks
return BLOCK_MIGHT_MATCH;
}

T value = ltEq.getValue();

// drop if value < min
Expand Down Expand Up @@ -263,6 +286,11 @@ public <T extends Comparable<T>> Boolean visit(Gt<T> gt) {
return BLOCK_CANNOT_MATCH;
}

if (!stats.hasNonNullValue()) {
// stats does not contain min/max values, we cannot drop any chunks
return BLOCK_MIGHT_MATCH;
}

T value = gt.getValue();

// drop if value >= max
Expand Down Expand Up @@ -294,6 +322,11 @@ public <T extends Comparable<T>> Boolean visit(GtEq<T> gtEq) {
return BLOCK_CANNOT_MATCH;
}

if (!stats.hasNonNullValue()) {
// stats does not contain min/max values, we cannot drop any chunks
return BLOCK_MIGHT_MATCH;
}

T value = gtEq.getValue();

// drop if value >= max
Expand Down Expand Up @@ -355,6 +388,11 @@ private <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean vis
}
}

if (!stats.hasNonNullValue()) {
// stats does not contain min/max values, we cannot drop any chunks
return BLOCK_MIGHT_MATCH;
}

org.apache.parquet.filter2.predicate.Statistics<T> udpStats =
new org.apache.parquet.filter2.predicate.Statistics<T>(stats.genericGetMin(), stats.genericGetMax());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,8 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist
static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal
(String createdBy, Statistics statistics, PrimitiveTypeName type, SortOrder typeSortOrder) {
// create stats object based on the column type
org.apache.parquet.column.statistics.Statistics stats = org.apache.parquet.column.statistics.Statistics.getStatsBasedOnType(type);
org.apache.parquet.column.statistics.Statistics.Builder statsBuilder =
org.apache.parquet.column.statistics.Statistics.getBuilder(type);
// If no statistics were written to the footer, create an empty Statistics object and return

// NOTE: See docs in CorruptStatistics for explanation of why this check is needed
Expand All @@ -347,11 +348,14 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist
if (statistics != null && !CorruptStatistics.shouldIgnoreStatistics(createdBy, type) &&
SortOrder.SIGNED == typeSortOrder) {
if (statistics.isSetMax() && statistics.isSetMin()) {
stats.setMinMaxFromBytes(statistics.min.array(), statistics.max.array());
statsBuilder.withMin(statistics.min.array());
statsBuilder.withMax(statistics.max.array());
}
if (statistics.isSetNull_count()) {
statsBuilder.withNumNulls(statistics.null_count);
}
stats.setNumNulls(statistics.null_count);
}
return stats;
return statsBuilder.build();
}

public org.apache.parquet.column.statistics.Statistics fromParquetStatistics(
Expand Down
Loading