diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java index 0bd77a30a8..54fe6579b9 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java @@ -168,12 +168,12 @@ private void printColumnChunk(Logger console, int width, ColumnChunkMetaData col if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) { console.info(String.format("%-" + width + "s FIXED[%d] %s %-7s %-9d %-8s %-7s %s", name, type.getTypeLength(), shortCodec(codec), encodingSummary, count, - humanReadable(perValue), stats == null ? "" : String.valueOf(stats.getNumNulls()), + humanReadable(perValue), stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()), minMaxAsString(stats, type.getOriginalType()))); } else { console.info(String.format("%-" + width + "s %-9s %s %-7s %-9d %-10s %-7s %s", name, typeName, shortCodec(codec), encodingSummary, count, humanReadable(perValue), - stats == null ? "" : String.valueOf(stats.getNumNulls()), + stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()), minMaxAsString(stats, type.getOriginalType()))); } } diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java index beda4529ed..4d0e2c9ba5 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java @@ -191,7 +191,7 @@ public String visit(DataPageV1 page) { String enc = encodingAsString(page.getValueEncoding(), false); long totalSize = page.getCompressedSize(); int count = page.getValueCount(); - long numNulls = page.getStatistics().getNumNulls(); + String numNulls = page.getStatistics().isNumNullsSet() ? Long.toString(page.getStatistics().getNumNulls()) : ""; float perValue = ((float) totalSize) / count; String minMax = minMaxAsString(page.getStatistics(), type.getOriginalType()); return String.format("%3d-%-3d %-5s %s %-2s %-7d %-10s %-10s %-8s %-7s %s", diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index 00d0bbf19c..a087c5f70e 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -35,6 +35,44 @@ */ public abstract class Statistics> { + /** + * Builder class to build Statistics objects. Used to read the statistics from the Parquet file. + */ + public static class Builder { + private final PrimitiveType type; + private byte[] min; + private byte[] max; + private long numNulls = -1; + + private Builder(PrimitiveType type) { + this.type = type; + } + + public Builder withMin(byte[] min) { + this.min = min; + return this; + } + + public Builder withMax(byte[] max) { + this.max = max; + return this; + } + + public Builder withNumNulls(long numNulls) { + this.numNulls = numNulls; + return this; + } + + public Statistics build() { + Statistics stats = createStats(type); + if (min != null && max != null) { + stats.setMinMaxFromBytes(min, max); + } + stats.num_nulls = this.numNulls; + return stats; + } + } + private final PrimitiveType type; private final PrimitiveComparator comparator; private boolean hasNonNullValue; @@ -109,6 +147,17 @@ public static Statistics createStats(Type type) { } } + /** + * Returns a builder to create new statistics object. Used to read the statistics from the parquet file. + * + * @param type + * type of the column + * @return builder to create new statistics object + */ + public static Builder getBuilder(PrimitiveType type) { + return new Builder(type); + } + /** * updates statistics min and max using the passed value * @param value value to use to update min and max @@ -217,7 +266,9 @@ public void mergeStatistics(Statistics stats) { * Abstract method to set min and max values from byte arrays. * @param minBytes byte array to set the min value to * @param maxBytes byte array to set the max value to + * @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead. */ + @Deprecated abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes); /** @@ -310,9 +361,13 @@ public String maxAsString() { @Override public String toString() { - if (this.hasNonNullValue()) - return String.format("min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(), this.getNumNulls()); - else if (!this.isEmpty()) + if (this.hasNonNullValue()) { + if (isNumNullsSet()) { + return String.format("min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(), this.getNumNulls()); + } else { + return String.format("min: %s, max: %s, num_nulls not defined", minAsString(), maxAsString()); + } + } else if (!this.isEmpty()) return String.format("num_nulls: %d, min/max not defined", this.getNumNulls()); else return "no stats for this column"; @@ -335,7 +390,7 @@ public void incrementNumNulls(long increment) { /** * Returns the null count - * @return null count + * @return null count or {@code -1} if the null count is not set */ public long getNumNulls() { return num_nulls; @@ -343,8 +398,12 @@ public long getNumNulls() { /** * Sets the number of nulls to the parameter value - * @param nulls null count to set the count to + * + * @param nulls + * null count to set the count to + * @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead. */ + @Deprecated public void setNumNulls(long nulls) { num_nulls = nulls; } @@ -355,7 +414,7 @@ public void setNumNulls(long nulls) { * @return true if object is empty, false otherwise */ public boolean isEmpty() { - return !hasNonNullValue && num_nulls == 0; + return !hasNonNullValue && !isNumNullsSet(); } /** @@ -365,6 +424,13 @@ public boolean hasNonNullValue() { return hasNonNullValue; } + /** + * @return whether numNulls is set and can be used + */ + public boolean isNumNullsSet() { + return num_nulls >= 0; + } + /** * Sets the page/column as having a valid non-null value * kind of misnomer here diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java index 8ca1ca635f..5e5d5fd26e 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java @@ -42,6 +42,7 @@ public class TestStatistics { @Test public void testNumNulls() { IntStatistics stats = new IntStatistics(); + assertTrue(stats.isNumNullsSet()); assertEquals(stats.getNumNulls(), 0); stats.incrementNumNulls(); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java index f168a6004c..446c8a3d49 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java @@ -40,7 +40,6 @@ import org.apache.parquet.filter2.predicate.UserDefinedPredicate; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; -import static org.apache.parquet.Preconditions.checkArgument; import static org.apache.parquet.Preconditions.checkNotNull; /** @@ -122,6 +121,10 @@ public > Boolean visit(Eq eq) { } if (value == null) { + // We don't know anything about the nulls in this chunk + if (!stats.isNumNullsSet()) { + return BLOCK_MIGHT_MATCH; + } // we are looking for records where v eq(null) // so drop if there are no nulls in this chunk return !hasNulls(meta); @@ -133,6 +136,11 @@ public > Boolean visit(Eq eq) { return BLOCK_CANNOT_MATCH; } + if (!stats.hasNonNullValue()) { + // stats does not contain min/max values, we cannot drop any chunks + return BLOCK_MIGHT_MATCH; + } + // drop if value < min || value > max return stats.compareMinToValue(value) > 0 || stats.compareMaxToValue(value) < 0; } @@ -166,12 +174,17 @@ public > Boolean visit(NotEq notEq) { return isAllNulls(meta); } - if (hasNulls(meta)) { + if (stats.isNumNullsSet() && hasNulls(meta)) { // we are looking for records where v notEq(someNonNull) // but this chunk contains nulls, we cannot drop it return BLOCK_MIGHT_MATCH; } + if (!stats.hasNonNullValue()) { + // stats does not contain min/max values, we cannot drop any chunks + return BLOCK_MIGHT_MATCH; + } + // drop if this is a column where min = max = value return stats.compareMinToValue(value) == 0 && stats.compareMaxToValue(value) == 0; } @@ -201,6 +214,11 @@ public > Boolean visit(Lt lt) { return BLOCK_CANNOT_MATCH; } + if (!stats.hasNonNullValue()) { + // stats does not contain min/max values, we cannot drop any chunks + return BLOCK_MIGHT_MATCH; + } + T value = lt.getValue(); // drop if value <= min @@ -232,6 +250,11 @@ public > Boolean visit(LtEq ltEq) { return BLOCK_CANNOT_MATCH; } + if (!stats.hasNonNullValue()) { + // stats does not contain min/max values, we cannot drop any chunks + return BLOCK_MIGHT_MATCH; + } + T value = ltEq.getValue(); // drop if value < min @@ -263,6 +286,11 @@ public > Boolean visit(Gt gt) { return BLOCK_CANNOT_MATCH; } + if (!stats.hasNonNullValue()) { + // stats does not contain min/max values, we cannot drop any chunks + return BLOCK_MIGHT_MATCH; + } + T value = gt.getValue(); // drop if value >= max @@ -294,6 +322,11 @@ public > Boolean visit(GtEq gtEq) { return BLOCK_CANNOT_MATCH; } + if (!stats.hasNonNullValue()) { + // stats does not contain min/max values, we cannot drop any chunks + return BLOCK_MIGHT_MATCH; + } + T value = gtEq.getValue(); // drop if value > max @@ -355,6 +388,11 @@ private , U extends UserDefinedPredicate> Boolean vis } } + if (!stats.hasNonNullValue()) { + // stats does not contain min/max values, we cannot drop any chunks + return BLOCK_MIGHT_MATCH; + } + org.apache.parquet.filter2.predicate.Statistics udpStats = new org.apache.parquet.filter2.predicate.Statistics(stats.genericGetMin(), stats.genericGetMax(), stats.comparator()); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index c4e5da3da7..0daabb6fe0 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -401,7 +401,8 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal (String createdBy, Statistics formatStats, PrimitiveType type, SortOrder typeSortOrder) { // create stats object based on the column type - org.apache.parquet.column.statistics.Statistics stats = org.apache.parquet.column.statistics.Statistics.createStats(type); + org.apache.parquet.column.statistics.Statistics.Builder statsBuilder = + org.apache.parquet.column.statistics.Statistics.getBuilder(type); if (formatStats != null) { // Use the new V2 min-max statistics over the former one if it is filled @@ -409,9 +410,12 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist byte[] min = formatStats.min_value.array(); byte[] max = formatStats.max_value.array(); if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) { - stats.setMinMaxFromBytes(min, max); + statsBuilder.withMin(min); + statsBuilder.withMax(max); + } + if (formatStats.isSetNull_count()) { + statsBuilder.withNumNulls(formatStats.null_count); } - stats.setNumNulls(formatStats.null_count); } else { boolean isSet = formatStats.isSetMax() && formatStats.isSetMin(); boolean maxEqualsMin = isSet ? Arrays.equals(formatStats.getMin(), formatStats.getMax()) : false; @@ -424,13 +428,16 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist if (!CorruptStatistics.shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName()) && (sortOrdersMatch || maxEqualsMin)) { if (isSet) { - stats.setMinMaxFromBytes(formatStats.min.array(), formatStats.max.array()); + statsBuilder.withMin(formatStats.min.array()); + statsBuilder.withMax(formatStats.max.array()); + } + if (formatStats.isSetNull_count()) { + statsBuilder.withNumNulls(formatStats.null_count); } - stats.setNumNulls(formatStats.null_count); } } } - return stats; + return statsBuilder.build(); } public org.apache.parquet.column.statistics.Statistics fromParquetStatistics( diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java index d8b4407914..6fdec2a6cd 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java @@ -22,7 +22,6 @@ import java.util.HashSet; import java.util.List; -import org.apache.parquet.io.api.Binary; import org.junit.Test; import org.apache.parquet.column.Encoding; @@ -39,6 +38,7 @@ import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Types; import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn; import static org.apache.parquet.io.api.Binary.fromString; @@ -62,7 +62,8 @@ public class TestStatisticsFilter { - private static ColumnChunkMetaData getIntColumnMeta(IntStatistics stats, long valueCount) { + private static ColumnChunkMetaData getIntColumnMeta(org.apache.parquet.column.statistics.Statistics stats, + long valueCount) { return ColumnChunkMetaData.get(ColumnPath.get("int", "column"), PrimitiveTypeName.INT32, CompressionCodecName.GZIP, @@ -71,7 +72,8 @@ private static ColumnChunkMetaData getIntColumnMeta(IntStatistics stats, long va 0L, 0L, valueCount, 0L, 0L); } - private static ColumnChunkMetaData getDoubleColumnMeta(DoubleStatistics stats, long valueCount) { + private static ColumnChunkMetaData getDoubleColumnMeta(org.apache.parquet.column.statistics.Statistics stats, + long valueCount) { return ColumnChunkMetaData.get(ColumnPath.get("double", "column"), PrimitiveTypeName.DOUBLE, CompressionCodecName.GZIP, @@ -87,13 +89,16 @@ private static ColumnChunkMetaData getDoubleColumnMeta(DoubleStatistics stats, l private static final IntStatistics intStats = new IntStatistics(); private static final IntStatistics nullIntStats = new IntStatistics(); + private static final org.apache.parquet.column.statistics.Statistics emptyIntStats = org.apache.parquet.column.statistics.Statistics + .getBuilder(Types.required(PrimitiveTypeName.INT32).named("test_int32")).build(); private static final DoubleStatistics doubleStats = new DoubleStatistics(); + private static final org.apache.parquet.column.statistics.Statistics missingMinMaxDoubleStats = org.apache.parquet.column.statistics.Statistics + .getBuilder(Types.required(PrimitiveTypeName.DOUBLE).named("test_double")).withNumNulls(100).build(); static { intStats.setMinMax(10, 100); doubleStats.setMinMax(10, 100); - nullIntStats.setMinMax(0, 0); nullIntStats.setNumNulls(177); } @@ -105,6 +110,9 @@ private static ColumnChunkMetaData getDoubleColumnMeta(DoubleStatistics stats, l getIntColumnMeta(nullIntStats, 177L), // column of all nulls getDoubleColumnMeta(doubleStats, 177L)); + private static final List missingMinMaxColumnMetas = Arrays.asList( + getIntColumnMeta(emptyIntStats, 177L), // missing min/max values and numNulls => stats is empty + getDoubleColumnMeta(missingMinMaxDoubleStats, 177L)); // missing min/max, some null values @Test public void testEqNonNull() { @@ -116,6 +124,9 @@ public void testEqNonNull() { // drop columns of all nulls when looking for non-null value assertTrue(canDrop(eq(intColumn, 0), nullColumnMetas)); assertTrue(canDrop(eq(missingColumn, fromString("any")), columnMetas)); + + assertFalse(canDrop(eq(intColumn, 50), missingMinMaxColumnMetas)); + assertFalse(canDrop(eq(doubleColumn, 50.0), missingMinMaxColumnMetas)); } @Test @@ -137,6 +148,9 @@ public void testEqNull() { getDoubleColumnMeta(doubleStats, 177L)))); assertFalse(canDrop(eq(missingColumn, null), columnMetas)); + + assertFalse(canDrop(eq(intColumn, null), missingMinMaxColumnMetas)); + assertFalse(canDrop(eq(doubleColumn, null), missingMinMaxColumnMetas)); } @Test @@ -163,6 +177,9 @@ public void testNotEqNonNull() { getDoubleColumnMeta(doubleStats, 177L)))); assertFalse(canDrop(notEq(missingColumn, fromString("any")), columnMetas)); + + assertFalse(canDrop(notEq(intColumn, 50), missingMinMaxColumnMetas)); + assertFalse(canDrop(notEq(doubleColumn, 50.0), missingMinMaxColumnMetas)); } @Test @@ -192,6 +209,9 @@ public void testNotEqNull() { getDoubleColumnMeta(doubleStats, 177L)))); assertTrue(canDrop(notEq(missingColumn, null), columnMetas)); + + assertFalse(canDrop(notEq(intColumn, null), missingMinMaxColumnMetas)); + assertFalse(canDrop(notEq(doubleColumn, null), missingMinMaxColumnMetas)); } @Test @@ -205,6 +225,9 @@ public void testLt() { assertTrue(canDrop(lt(intColumn, 7), nullColumnMetas)); assertTrue(canDrop(lt(missingColumn, fromString("any")), columnMetas)); + + assertFalse(canDrop(lt(intColumn, 0), missingMinMaxColumnMetas)); + assertFalse(canDrop(lt(doubleColumn, 0.0), missingMinMaxColumnMetas)); } @Test @@ -218,6 +241,9 @@ public void testLtEq() { assertTrue(canDrop(ltEq(intColumn, 7), nullColumnMetas)); assertTrue(canDrop(ltEq(missingColumn, fromString("any")), columnMetas)); + + assertFalse(canDrop(ltEq(intColumn, -1), missingMinMaxColumnMetas)); + assertFalse(canDrop(ltEq(doubleColumn, -0.1), missingMinMaxColumnMetas)); } @Test @@ -231,6 +257,9 @@ public void testGt() { assertTrue(canDrop(gt(intColumn, 7), nullColumnMetas)); assertTrue(canDrop(gt(missingColumn, fromString("any")), columnMetas)); + + assertFalse(canDrop(gt(intColumn, 0), missingMinMaxColumnMetas)); + assertFalse(canDrop(gt(doubleColumn, 0.0), missingMinMaxColumnMetas)); } @Test @@ -244,6 +273,9 @@ public void testGtEq() { assertTrue(canDrop(gtEq(intColumn, 7), nullColumnMetas)); assertTrue(canDrop(gtEq(missingColumn, fromString("any")), columnMetas)); + + assertFalse(canDrop(gtEq(intColumn, 1), missingMinMaxColumnMetas)); + assertFalse(canDrop(gtEq(doubleColumn, 0.1), missingMinMaxColumnMetas)); } @Test @@ -297,6 +329,26 @@ public boolean keep(Integer value) { } } + public static class AllPositiveUdp extends UserDefinedPredicate { + @Override + public boolean keep(Double value) { + if (value == null) { + return true; + } + throw new RuntimeException("this method should not be called with value != null"); + } + + @Override + public boolean canDrop(Statistics statistics) { + return statistics.getMin() <= 0.0; + } + + @Override + public boolean inverseCanDrop(Statistics statistics) { + return statistics.getMin() > 0.0; + } + } + @Test public void testUdp() { FilterPredicate pred = userDefined(intColumn, SevensAndEightsUdp.class); @@ -308,6 +360,8 @@ public void testUdp() { FilterPredicate udpKeepMissingColumn = userDefined(missingColumn2, SevensAndEightsUdp.class); FilterPredicate invUdpKeepMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, SevensAndEightsUdp.class))); + FilterPredicate allPositivePred = userDefined(doubleColumn, AllPositiveUdp.class); + IntStatistics seven = new IntStatistics(); seven.setMinMax(7, 7); @@ -392,6 +446,8 @@ public void testUdp() { assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList( getIntColumnMeta(neither, 177L), getDoubleColumnMeta(doubleStats, 177L)))); + + assertFalse(canDrop(allPositivePred, missingMinMaxColumnMetas)); } @Test diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index ee92d4625b..0cfee1a64c 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -658,6 +658,7 @@ private void testUseStatsWithSignedSortOrder(StatsHelper helper) { binaryType); Assert.assertFalse("Stats should not be empty", convertedStats.isEmpty()); + Assert.assertTrue(convertedStats.isNumNullsSet()); Assert.assertEquals("Should have 3 nulls", 3, convertedStats.getNumNulls()); if (helper == StatsHelper.V1) { assertFalse("Min-max should be null for V1 stats", convertedStats.hasNonNullValue()); @@ -669,6 +670,38 @@ private void testUseStatsWithSignedSortOrder(StatsHelper helper) { } } + @Test + public void testMissingValuesFromStats() { + ParquetMetadataConverter converter = new ParquetMetadataConverter(); + PrimitiveType type = Types.required(PrimitiveTypeName.INT32).named("test_int32"); + + org.apache.parquet.format.Statistics formatStats = new org.apache.parquet.format.Statistics(); + Statistics stats = converter.fromParquetStatistics(Version.FULL_VERSION, formatStats, type); + assertFalse(stats.isNumNullsSet()); + assertFalse(stats.hasNonNullValue()); + assertTrue(stats.isEmpty()); + assertEquals(-1, stats.getNumNulls()); + + formatStats.clear(); + formatStats.setMin(BytesUtils.intToBytes(-100)); + formatStats.setMax(BytesUtils.intToBytes(100)); + stats = converter.fromParquetStatistics(Version.FULL_VERSION, formatStats, type); + assertFalse(stats.isNumNullsSet()); + assertTrue(stats.hasNonNullValue()); + assertFalse(stats.isEmpty()); + assertEquals(-1, stats.getNumNulls()); + assertEquals(-100, stats.genericGetMin()); + assertEquals(100, stats.genericGetMax()); + + formatStats.clear(); + formatStats.setNull_count(2000); + stats = converter.fromParquetStatistics(Version.FULL_VERSION, formatStats, type); + assertTrue(stats.isNumNullsSet()); + assertFalse(stats.hasNonNullValue()); + assertFalse(stats.isEmpty()); + assertEquals(2000, stats.getNumNulls()); + } + @Test public void testSkippedV2Stats() { testSkippedV2Stats( diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java index 87574cd925..0b7b9517a1 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java @@ -60,6 +60,7 @@ import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageTypeParser; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Types; import org.apache.parquet.bytes.HeapByteBufferAllocator; @@ -92,7 +93,8 @@ public void test() throws Exception { int v = 3; BytesInput definitionLevels = BytesInput.fromInt(d); BytesInput repetitionLevels = BytesInput.fromInt(r); - Statistics statistics = new BinaryStatistics(); + Statistics statistics = Statistics.getBuilder(Types.required(PrimitiveTypeName.BINARY).named("test_binary")) + .build(); BytesInput data = BytesInput.fromInt(v); int rowCount = 5; int nullCount = 1; diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java index 4243e9bd18..c73e569271 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java @@ -27,7 +27,6 @@ import org.apache.parquet.Version; import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel; -import org.apache.parquet.hadoop.util.HadoopOutputFile; import org.junit.Assume; import org.junit.Rule; import org.junit.Test; @@ -48,6 +47,7 @@ import org.apache.parquet.schema.MessageTypeParser; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Types; import java.io.File; import java.io.IOException; @@ -95,8 +95,8 @@ public class TestParquetFileWriter { private static final byte[] BYTES4 = { 3, 4, 5, 6 }; private static final CompressionCodecName CODEC = CompressionCodecName.UNCOMPRESSED; - private static final BinaryStatistics STATS1 = new BinaryStatistics(); - private static final BinaryStatistics STATS2 = new BinaryStatistics(); + private static final org.apache.parquet.column.statistics.Statistics EMPTY_STATS = org.apache.parquet.column.statistics.Statistics + .getBuilder(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build(); private String writeSchema; @@ -145,24 +145,24 @@ public void testWriteRead() throws Exception { w.startBlock(3); w.startColumn(C1, 5, CODEC); long c1Starts = w.getPos(); - w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); long c1Ends = w.getPos(); w.startColumn(C2, 6, CODEC); long c2Starts = w.getPos(); - w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); long c2Ends = w.getPos(); w.endBlock(); w.startBlock(4); w.startColumn(C1, 7, CODEC); - w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); w.startColumn(C2, 8, CODEC); - w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); w.endBlock(); w.end(new HashMap()); @@ -225,15 +225,15 @@ public void testAlignmentWithPadding() throws Exception { w.startBlock(3); w.startColumn(C1, 5, CODEC); long c1Starts = w.getPos(); - w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); long c1Ends = w.getPos(); w.startColumn(C2, 6, CODEC); long c2Starts = w.getPos(); - w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); long c2Ends = w.getPos(); w.endBlock(); @@ -242,10 +242,10 @@ public void testAlignmentWithPadding() throws Exception { w.startBlock(4); w.startColumn(C1, 7, CODEC); - w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); w.startColumn(C2, 8, CODEC); - w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); w.endBlock(); @@ -330,15 +330,15 @@ public void testAlignmentWithNoPaddingNeeded() throws Exception { w.startBlock(3); w.startColumn(C1, 5, CODEC); long c1Starts = w.getPos(); - w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); long c1Ends = w.getPos(); w.startColumn(C2, 6, CODEC); long c2Starts = w.getPos(); - w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); long c2Ends = w.getPos(); w.endBlock(); @@ -347,10 +347,10 @@ public void testAlignmentWithNoPaddingNeeded() throws Exception { w.startBlock(4); w.startColumn(C1, 7, CODEC); - w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); w.startColumn(C2, 8, CODEC); - w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); w.endBlock();