Skip to content

Commit 339d397

Browse files
committed
PARQUET-2261: Implement SizeStatistics
1 parent 452c94d commit 339d397

File tree

21 files changed

+1198
-83
lines changed

21 files changed

+1198
-83
lines changed

parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.apache.parquet.column.ParquetProperties;
2828
import org.apache.parquet.column.page.DictionaryPage;
2929
import org.apache.parquet.column.page.PageWriter;
30+
import org.apache.parquet.column.statistics.SizeStatistics;
3031
import org.apache.parquet.column.statistics.Statistics;
3132
import org.apache.parquet.column.values.ValuesWriter;
3233
import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter;
@@ -56,6 +57,7 @@ abstract class ColumnWriterBase implements ColumnWriter {
5657
private int valueCount;
5758

5859
private Statistics<?> statistics;
60+
private SizeStatistics.Builder sizeStatisticsBuilder;
5961
private long rowsWrittenSoFar = 0;
6062
private int pageRowCount;
6163

@@ -116,6 +118,8 @@ private void log(Object value, int r, int d) {
116118

117119
private void resetStatistics() {
118120
this.statistics = Statistics.createStats(path.getPrimitiveType());
121+
this.sizeStatisticsBuilder = new SizeStatistics.Builder(
122+
path.getPrimitiveType(), path.getMaxRepetitionLevel(), path.getMaxDefinitionLevel());
119123
}
120124

121125
private void definitionLevel(int definitionLevel) {
@@ -143,6 +147,7 @@ public void writeNull(int repetitionLevel, int definitionLevel) {
143147
repetitionLevel(repetitionLevel);
144148
definitionLevel(definitionLevel);
145149
statistics.incrementNumNulls();
150+
sizeStatisticsBuilder.add(repetitionLevel, definitionLevel);
146151
++valueCount;
147152
}
148153

@@ -207,6 +212,7 @@ public void write(double value, int repetitionLevel, int definitionLevel) {
207212
definitionLevel(definitionLevel);
208213
dataColumn.writeDouble(value);
209214
statistics.updateStats(value);
215+
sizeStatisticsBuilder.add(repetitionLevel, definitionLevel);
210216
updateBloomFilter(value);
211217
++valueCount;
212218
}
@@ -226,6 +232,7 @@ public void write(float value, int repetitionLevel, int definitionLevel) {
226232
definitionLevel(definitionLevel);
227233
dataColumn.writeFloat(value);
228234
statistics.updateStats(value);
235+
sizeStatisticsBuilder.add(repetitionLevel, definitionLevel);
229236
updateBloomFilter(value);
230237
++valueCount;
231238
}
@@ -245,6 +252,7 @@ public void write(Binary value, int repetitionLevel, int definitionLevel) {
245252
definitionLevel(definitionLevel);
246253
dataColumn.writeBytes(value);
247254
statistics.updateStats(value);
255+
sizeStatisticsBuilder.add(repetitionLevel, definitionLevel, value);
248256
updateBloomFilter(value);
249257
++valueCount;
250258
}
@@ -264,6 +272,7 @@ public void write(boolean value, int repetitionLevel, int definitionLevel) {
264272
definitionLevel(definitionLevel);
265273
dataColumn.writeBoolean(value);
266274
statistics.updateStats(value);
275+
sizeStatisticsBuilder.add(repetitionLevel, definitionLevel);
267276
++valueCount;
268277
}
269278

@@ -282,6 +291,7 @@ public void write(int value, int repetitionLevel, int definitionLevel) {
282291
definitionLevel(definitionLevel);
283292
dataColumn.writeInteger(value);
284293
statistics.updateStats(value);
294+
sizeStatisticsBuilder.add(repetitionLevel, definitionLevel);
285295
updateBloomFilter(value);
286296
++valueCount;
287297
}
@@ -301,6 +311,7 @@ public void write(long value, int repetitionLevel, int definitionLevel) {
301311
definitionLevel(definitionLevel);
302312
dataColumn.writeLong(value);
303313
statistics.updateStats(value);
314+
sizeStatisticsBuilder.add(repetitionLevel, definitionLevel);
304315
updateBloomFilter(value);
305316
++valueCount;
306317
}
@@ -395,7 +406,8 @@ void writePage() {
395406
if (DEBUG)
396407
LOG.debug("write page");
397408
try {
398-
writePage(pageRowCount, valueCount, statistics, repetitionLevelColumn, definitionLevelColumn, dataColumn);
409+
writePage(pageRowCount, valueCount, statistics, sizeStatisticsBuilder.build(),
410+
repetitionLevelColumn, definitionLevelColumn, dataColumn);
399411
} catch (IOException e) {
400412
throw new ParquetEncodingException("could not write page for " + path, e);
401413
}
@@ -407,6 +419,10 @@ void writePage() {
407419
pageRowCount = 0;
408420
}
409421

422+
@Deprecated
410423
abstract void writePage(int rowCount, int valueCount, Statistics<?> statistics, ValuesWriter repetitionLevels,
411424
ValuesWriter definitionLevels, ValuesWriter values) throws IOException;
425+
426+
abstract void writePage(int rowCount, int valueCount, Statistics<?> statistics, SizeStatistics sizeStatistics,
427+
ValuesWriter repetitionLevels, ValuesWriter definitionLevels, ValuesWriter values) throws IOException;
412428
}

parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import org.apache.parquet.column.ColumnDescriptor;
2626
import org.apache.parquet.column.ParquetProperties;
2727
import org.apache.parquet.column.page.PageWriter;
28+
import org.apache.parquet.column.statistics.SizeStatistics;
2829
import org.apache.parquet.column.statistics.Statistics;
2930
import org.apache.parquet.column.values.ValuesWriter;
3031
import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter;
@@ -54,13 +55,21 @@ ValuesWriter createDLWriter(ParquetProperties props, ColumnDescriptor path) {
5455
}
5556

5657
@Override
58+
@Deprecated
5759
void writePage(int rowCount, int valueCount, Statistics<?> statistics, ValuesWriter repetitionLevels,
58-
ValuesWriter definitionLevels, ValuesWriter values) throws IOException {
60+
ValuesWriter definitionLevels, ValuesWriter values) throws IOException {
61+
writePage(rowCount, valueCount, statistics, null, repetitionLevels, definitionLevels, values);
62+
}
63+
64+
@Override
65+
void writePage(int rowCount, int valueCount, Statistics<?> statistics, SizeStatistics sizeStatistics,
66+
ValuesWriter repetitionLevels, ValuesWriter definitionLevels, ValuesWriter values) throws IOException {
5967
pageWriter.writePage(
6068
concat(repetitionLevels.getBytes(), definitionLevels.getBytes(), values.getBytes()),
6169
valueCount,
6270
rowCount,
6371
statistics,
72+
sizeStatistics,
6473
repetitionLevels.getEncoding(),
6574
definitionLevels.getEncoding(),
6675
values.getEncoding());

parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import org.apache.parquet.column.Encoding;
2626
import org.apache.parquet.column.ParquetProperties;
2727
import org.apache.parquet.column.page.PageWriter;
28+
import org.apache.parquet.column.statistics.SizeStatistics;
2829
import org.apache.parquet.column.statistics.Statistics;
2930
import org.apache.parquet.column.values.ValuesWriter;
3031
import org.apache.parquet.column.values.bitpacking.DevNullValuesWriter;
@@ -76,8 +77,15 @@ ValuesWriter createDLWriter(ParquetProperties props, ColumnDescriptor path) {
7677
}
7778

7879
@Override
80+
@Deprecated
7981
void writePage(int rowCount, int valueCount, Statistics<?> statistics, ValuesWriter repetitionLevels,
80-
ValuesWriter definitionLevels, ValuesWriter values) throws IOException {
82+
ValuesWriter definitionLevels, ValuesWriter values) throws IOException {
83+
writePage(rowCount, valueCount, statistics, null, repetitionLevels, definitionLevels, values);
84+
}
85+
86+
@Override
87+
void writePage(int rowCount, int valueCount, Statistics<?> statistics, SizeStatistics sizeStatistics,
88+
ValuesWriter repetitionLevels, ValuesWriter definitionLevels, ValuesWriter values) throws IOException {
8189
// TODO: rework this API. The bytes shall be retrieved before the encoding (encoding might be different otherwise)
8290
BytesInput bytes = values.getBytes();
8391
Encoding encoding = values.getEncoding();
@@ -89,6 +97,7 @@ void writePage(int rowCount, int valueCount, Statistics<?> statistics, ValuesWri
8997
definitionLevels.getBytes(),
9098
encoding,
9199
bytes,
92-
statistics);
100+
statistics,
101+
sizeStatistics);
93102
}
94103
}

parquet-column/src/main/java/org/apache/parquet/column/page/PageWriter.java

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
import org.apache.parquet.bytes.BytesInput;
2424
import org.apache.parquet.column.Encoding;
25+
import org.apache.parquet.column.statistics.SizeStatistics;
2526
import org.apache.parquet.column.statistics.Statistics;
2627

2728
/**
@@ -55,7 +56,25 @@ public interface PageWriter {
5556
* @param valuesEncoding values encoding
5657
* @throws IOException if there is an exception while writing page data
5758
*/
58-
void writePage(BytesInput bytesInput, int valueCount, int rowCount, Statistics<?> statistics, Encoding rlEncoding, Encoding dlEncoding, Encoding valuesEncoding) throws IOException;
59+
@Deprecated
60+
void writePage(BytesInput bytesInput, int valueCount, int rowCount, Statistics<?> statistics,
61+
Encoding rlEncoding, Encoding dlEncoding, Encoding valuesEncoding) throws IOException;
62+
63+
/**
64+
* writes a single page
65+
* @param bytesInput the bytes for the page
66+
* @param valueCount the number of values in that page
67+
* @param rowCount the number of rows in that page
68+
* @param statistics the statistics for that page
69+
* @param sizeStatistics the size statistics for that page
70+
* @param rlEncoding repetition level encoding
71+
* @param dlEncoding definition level encoding
72+
* @param valuesEncoding values encoding
73+
* @throws IOException if there is an exception while writing page data
74+
*/
75+
void writePage(BytesInput bytesInput, int valueCount, int rowCount, Statistics<?> statistics,
76+
SizeStatistics sizeStatistics, Encoding rlEncoding, Encoding dlEncoding,
77+
Encoding valuesEncoding) throws IOException;
5978

6079
/**
6180
* writes a single page in the new format
@@ -69,13 +88,31 @@ public interface PageWriter {
6988
* @param statistics optional stats for this page
7089
* @throws IOException if there is an exception while writing page data
7190
*/
91+
@Deprecated
7292
void writePageV2(
7393
int rowCount, int nullCount, int valueCount,
7494
BytesInput repetitionLevels, BytesInput definitionLevels,
7595
Encoding dataEncoding,
7696
BytesInput data,
7797
Statistics<?> statistics) throws IOException;
7898

99+
/**
100+
* writes a single page in the new format
101+
* @param rowCount the number of rows in this page
102+
* @param nullCount the number of null values (out of valueCount)
103+
* @param valueCount the number of values in that page (there could be multiple values per row for repeated fields)
104+
* @param repetitionLevels the repetition levels encoded in RLE without any size header
105+
* @param definitionLevels the definition levels encoded in RLE without any size header
106+
* @param dataEncoding the encoding for the data
107+
* @param data the data encoded with dataEncoding
108+
* @param statistics optional stats for this page
109+
* @param sizeStatistics optional size stats for this page
110+
* @throws IOException if there is an exception while writing page data
111+
*/
112+
void writePageV2(int rowCount, int nullCount, int valueCount, BytesInput repetitionLevels, BytesInput definitionLevels,
113+
Encoding dataEncoding, BytesInput data, Statistics<?> statistics,
114+
SizeStatistics sizeStatistics) throws IOException;
115+
79116
/**
80117
* @return the current size used in the memory buffer for that column chunk
81118
*/

0 commit comments

Comments
 (0)