PARQUET-306: Add row group alignment #211
Changes from all commits: c6a3e97, f1dc659, 6ce3f08, 0137ddf

@@ -20,6 +20,8 @@
 import static org.apache.parquet.Log.DEBUG;
 import static org.apache.parquet.format.Util.writeFileMetaData;
+import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE;
+import static org.apache.parquet.hadoop.ParquetWriter.MAX_PADDING_SIZE_DEFAULT;

 import java.io.IOException;
 import java.nio.charset.Charset;

@@ -41,6 +43,7 @@
 import org.apache.parquet.bytes.BytesInput;
 import org.apache.parquet.bytes.BytesUtils;
 import org.apache.parquet.column.ColumnDescriptor;
+import org.apache.parquet.column.Encoding;
 import org.apache.parquet.column.page.DictionaryPage;
 import org.apache.parquet.column.statistics.Statistics;
 import org.apache.parquet.hadoop.metadata.ColumnPath;

@@ -69,6 +72,22 @@ public class ParquetFileWriter {
   public static final byte[] MAGIC = "PAR1".getBytes(Charset.forName("ASCII"));
   public static final int CURRENT_VERSION = 1;

+  // need to supply a buffer size when setting block size. this is the default
+  // for hadoop 1 to present. copying it avoids loading DFSConfigKeys.
+  private static final int DFS_BUFFER_SIZE_DEFAULT = 4096;
+
+  // visible for testing
+  static final Set<String> BLOCK_FS_SCHEMES = new HashSet<String>();
+  static {
+    BLOCK_FS_SCHEMES.add("hdfs");
Contributor: do we need hftp too?

Contributor: should we extract this info from the path, or just make it a required argument to the constructor, and pass the right one in all the hadoop-related usages?

Contributor (author): Since we can tell easily, I think it makes sense to check this way. No need to complicate the API when you wouldn't ever change it.

Contributor (author): What's HFTP?

Contributor: HFTP is a FileSystem implementation for HDFS over HTTP. It's used to read data across clusters when they have mismatched versions of HDFS. More of what I was getting at is that there's HDFS, HFTP, and VIEWFS, and possibly more, so locking in to just hdfs may be too strict. I wonder if there's a property of FileSystem that describes whether it supports blocks, or some flag (like block size) that indicates that the FS uses blocks.

Contributor (author): Looks like HFTP is a read-only FS, so we shouldn't need to add that. But I think we should add viewfs as you suggested, and webhdfs. Anything backed by HDFS should probably have it, but we can add more over time.
+    BLOCK_FS_SCHEMES.add("webhdfs");
+    BLOCK_FS_SCHEMES.add("viewfs");
+  }
+
+  private static boolean supportsBlockSize(FileSystem fs) {
+    return BLOCK_FS_SCHEMES.contains(fs.getUri().getScheme());
+  }

   // File creation modes
   public static enum Mode {
     CREATE,
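The scheme check above is what decides between the two alignment strategies introduced later in this patch. As a minimal illustration (not part of the patch; the URIs are made up), the same rule can be exercised with plain java.net.URI:

```java
import java.net.URI;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class SchemeCheckSketch {
  // mirrors BLOCK_FS_SCHEMES from the patch
  static final Set<String> BLOCK_FS_SCHEMES =
      new HashSet<String>(Arrays.asList("hdfs", "webhdfs", "viewfs"));

  public static void main(String[] args) {
    // hypothetical file locations; only the URI scheme matters for the check
    for (String location : new String[] {
        "hdfs://namenode:8020/warehouse/events/part-0.parquet",
        "viewfs://cluster/warehouse/events/part-0.parquet",
        "file:///tmp/part-0.parquet",
        "s3a://bucket/events/part-0.parquet"}) {
      String scheme = URI.create(location).getScheme();
      boolean padded = BLOCK_FS_SCHEMES.contains(scheme);
      // hdfs/viewfs get padding alignment; file and s3a fall back to NoAlignment
      System.out.println(scheme + " -> " + (padded ? "PaddingAlignment" : "NoAlignment"));
    }
  }
}
```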

@@ -79,13 +98,13 @@ public static enum Mode {

   private final MessageType schema;
   private final FSDataOutputStream out;
+  private final AlignmentStrategy alignment;
   private BlockMetaData currentBlock;
   private ColumnChunkMetaData currentColumn;
   private long currentRecordCount;
   private List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
   private long uncompressedLength;
   private long compressedLength;
-  private Set<org.apache.parquet.column.Encoding> currentEncodings;
+  private Set<Encoding> currentEncodings;

   private CompressionCodecName currentChunkCodec;
   private ColumnPath currentChunkPath;

@@ -157,7 +176,8 @@ private final STATE error() throws IOException {
    */
   public ParquetFileWriter(Configuration configuration, MessageType schema,
                            Path file) throws IOException {
-    this(configuration, schema, file, Mode.CREATE);
+    this(configuration, schema, file, Mode.CREATE, DEFAULT_BLOCK_SIZE,
+        MAX_PADDING_SIZE_DEFAULT);
   }

   /**
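For reference, a hedged sketch of how the new overload might be called once this patch is in place. The schema, output path, and sizes are invented for illustration; start() and end() are the existing ParquetFileWriter lifecycle methods, and on a local file: path the writer would fall back to NoAlignment.

```java
import java.io.IOException;
import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class WriterConstructionSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required int64 id; required binary name (UTF8); }");
    Path file = new Path("file:///tmp/example.parquet"); // hypothetical output path

    long rowGroupSize = 128 * 1024 * 1024; // 128 MB row groups
    int maxPaddingSize = 8 * 1024 * 1024;  // allow up to 8 MB of padding

    // the new overload added by this patch
    ParquetFileWriter writer = new ParquetFileWriter(
        conf, schema, file, ParquetFileWriter.Mode.CREATE, rowGroupSize, maxPaddingSize);
    writer.start();
    // ... row groups would be written here via startBlock()/startColumn()/endColumn()/endBlock() ...
    writer.end(new HashMap<String, String>());
  }
}
```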

@@ -168,12 +188,60 @@ public ParquetFileWriter(Configuration configuration, MessageType schema,
    * @throws IOException if the file can not be created
    */
   public ParquetFileWriter(Configuration configuration, MessageType schema,
-      Path file, Mode mode) throws IOException {
-    super();
+                           Path file, Mode mode) throws IOException {
+    this(configuration, schema, file, mode, DEFAULT_BLOCK_SIZE,
+        MAX_PADDING_SIZE_DEFAULT);
+  }
+
+  /**
+   * @param configuration Hadoop configuration
+   * @param schema the schema of the data
+   * @param file the file to write to
+   * @param mode file creation mode
+   * @param rowGroupSize the row group size
+   * @throws IOException if the file can not be created
+   */
+  public ParquetFileWriter(Configuration configuration, MessageType schema,
+                           Path file, Mode mode, long rowGroupSize,
+                           int maxPaddingSize)
+      throws IOException {
     this.schema = schema;
     FileSystem fs = file.getFileSystem(configuration);
     boolean overwriteFlag = (mode == Mode.OVERWRITE);
-    this.out = fs.create(file, overwriteFlag);
+
+    if (supportsBlockSize(fs)) {
+      // use the default block size, unless row group size is larger
+      long dfsBlockSize = Math.max(fs.getDefaultBlockSize(file), rowGroupSize);
Contributor: by using the default block size, are we ignoring users' requests for non-default block sizes?

Contributor (author): No, users control their HDFS block size by setting

Contributor: ok, that's what I wanted to double-check, thanks.

+      this.alignment = PaddingAlignment.get(
+          dfsBlockSize, rowGroupSize, maxPaddingSize);
+      this.out = fs.create(file, overwriteFlag, DFS_BUFFER_SIZE_DEFAULT,
+          fs.getDefaultReplication(file), dfsBlockSize);
+
+    } else {
+      this.alignment = NoAlignment.get(rowGroupSize);
+      this.out = fs.create(file, overwriteFlag);
+    }
   }

+  /**
+   * FOR TESTING ONLY.
+   *
+   * @param configuration Hadoop configuration
+   * @param schema the schema of the data
+   * @param file the file to write to
+   * @param rowAndBlockSize the row group size
+   * @throws IOException if the file can not be created
+   */
+  ParquetFileWriter(Configuration configuration, MessageType schema,
+                    Path file, long rowAndBlockSize, int maxPaddingSize)
+      throws IOException {
+    FileSystem fs = file.getFileSystem(configuration);
+    this.schema = schema;
+    this.alignment = PaddingAlignment.get(
+        rowAndBlockSize, rowAndBlockSize, maxPaddingSize);
+    this.out = fs.create(file, true, DFS_BUFFER_SIZE_DEFAULT,
+        fs.getDefaultReplication(file), rowAndBlockSize);
+  }

   /**
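A worked example of the block-size selection in the constructor above, using assumed sizes: on a block filesystem, the file's DFS block size becomes the larger of the filesystem default and the requested row group size, so a row group never has to span two blocks.

```java
// Illustrates Math.max(fs.getDefaultBlockSize(file), rowGroupSize); the numbers are hypothetical.
public class BlockSizeChoiceSketch {
  public static void main(String[] args) {
    long defaultDfsBlockSize = 128L * 1024 * 1024; // assume a 128 MB HDFS default

    long smallRowGroup = 64L * 1024 * 1024;   // 64 MB row groups
    long largeRowGroup = 256L * 1024 * 1024;  // 256 MB row groups

    // 64 MB row groups: keep the 128 MB block, two row groups fit per block
    System.out.println(Math.max(defaultDfsBlockSize, smallRowGroup)); // 134217728

    // 256 MB row groups: grow the file's block size so a row group never spans blocks
    System.out.println(Math.max(defaultDfsBlockSize, largeRowGroup)); // 268435456
  }
}
```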

@@ -195,6 +263,9 @@ public void startBlock(long recordCount) throws IOException {
     state = state.startBlock();
     if (DEBUG) LOG.debug(out.getPos() + ": start block");
     // out.write(MAGIC); // TODO: add a magic delimiter
+
+    alignment.alignForRowGroup(out);
+
     currentBlock = new BlockMetaData();
     currentRecordCount = recordCount;
   }

@@ -203,16 +274,14 @@ public void startBlock(long recordCount) throws IOException {
    * start a column inside a block
    * @param descriptor the column descriptor
    * @param valueCount the value count in this column
-   * @param statistics the statistics in this column
    * @param compressionCodecName
    * @throws IOException
    */
   public void startColumn(ColumnDescriptor descriptor,
                           long valueCount,
                           CompressionCodecName compressionCodecName) throws IOException {
     state = state.startColumn();
     if (DEBUG) LOG.debug(out.getPos() + ": start column: " + descriptor + " count=" + valueCount);
-    currentEncodings = new HashSet<org.apache.parquet.column.Encoding>();
+    currentEncodings = new HashSet<Encoding>();
     currentChunkPath = ColumnPath.get(descriptor.getPath());
     currentChunkType = descriptor.getType();
     currentChunkCodec = compressionCodecName;

@@ -263,9 +332,9 @@ public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOExceptio
   public void writeDataPage(
       int valueCount, int uncompressedPageSize,
       BytesInput bytes,
-      org.apache.parquet.column.Encoding rlEncoding,
-      org.apache.parquet.column.Encoding dlEncoding,
-      org.apache.parquet.column.Encoding valuesEncoding) throws IOException {
+      Encoding rlEncoding,
+      Encoding dlEncoding,
+      Encoding valuesEncoding) throws IOException {
     state = state.write();
     long beforeHeader = out.getPos();
     if (DEBUG) LOG.debug(beforeHeader + ": write data page: " + valueCount + " values");

@@ -300,9 +369,9 @@ public void writeDataPage(
       int valueCount, int uncompressedPageSize,
       BytesInput bytes,
       Statistics statistics,
-      org.apache.parquet.column.Encoding rlEncoding,
-      org.apache.parquet.column.Encoding dlEncoding,
-      org.apache.parquet.column.Encoding valuesEncoding) throws IOException {
+      Encoding rlEncoding,
+      Encoding dlEncoding,
+      Encoding valuesEncoding) throws IOException {
     state = state.write();
     long beforeHeader = out.getPos();
     if (DEBUG) LOG.debug(beforeHeader + ": write data page: " + valueCount + " values");

@@ -337,7 +406,7 @@ void writeDataPages(BytesInput bytes,
                       long uncompressedTotalPageSize,
                       long compressedTotalPageSize,
                       Statistics totalStats,
-                      List<org.apache.parquet.column.Encoding> encodings) throws IOException {
+                      List<Encoding> encodings) throws IOException {
     state = state.write();
     if (DEBUG) LOG.debug(out.getPos() + ": write data pages");
     long headersSize = bytes.size() - compressedTotalPageSize;

@@ -367,8 +436,6 @@ public void endColumn() throws IOException {
         currentChunkValueCount,
         compressedLength,
         uncompressedLength));
-    if (DEBUG) LOG.info("ended Column chumk: " + currentColumn);
-    currentColumn = null;
     this.currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + uncompressedLength);
     this.uncompressedLength = 0;
     this.compressedLength = 0;

@@ -464,6 +531,10 @@ public long getPos() throws IOException {
     return out.getPos();
   }

+  public long getNextRowGroupSize() throws IOException {
+    return alignment.nextRowGroupSize(out);
+  }
+
   /**
    * Will merge the metadata of all the footers together
    * @param footers the list files footers to merge

@@ -550,4 +621,83 @@ static MessageType mergeInto(MessageType toMerge, MessageType mergedSchema, bool
     return mergedSchema.union(toMerge, strict);
   }

+  private interface AlignmentStrategy {
+    void alignForRowGroup(FSDataOutputStream out) throws IOException;
+
+    long nextRowGroupSize(FSDataOutputStream out) throws IOException;
+  }
+
+  private static class NoAlignment implements AlignmentStrategy {
+    public static NoAlignment get(long rowGroupSize) {
+      return new NoAlignment(rowGroupSize);
+    }
+
+    private final long rowGroupSize;
+
+    private NoAlignment(long rowGroupSize) {
+      this.rowGroupSize = rowGroupSize;
+    }
+
+    @Override
+    public void alignForRowGroup(FSDataOutputStream out) {
+    }
+
+    @Override
+    public long nextRowGroupSize(FSDataOutputStream out) {
+      return rowGroupSize;
+    }
+  }
+
+  /**
+   * Alignment strategy that pads when less than half the row group size is
+   * left before the next DFS block.
+   */
+  private static class PaddingAlignment implements AlignmentStrategy {
+    private static final byte[] zeros = new byte[4096];
+
+    public static PaddingAlignment get(long dfsBlockSize, long rowGroupSize,
+                                       int maxPaddingSize) {
+      return new PaddingAlignment(dfsBlockSize, rowGroupSize, maxPaddingSize);
+    }
+
+    protected final long dfsBlockSize;
+    protected final long rowGroupSize;
+    protected final int maxPaddingSize;
+
+    private PaddingAlignment(long dfsBlockSize, long rowGroupSize,
+                             int maxPaddingSize) {
+      this.dfsBlockSize = dfsBlockSize;
+      this.rowGroupSize = rowGroupSize;
+      this.maxPaddingSize = maxPaddingSize;
+    }
+
+    @Override
+    public void alignForRowGroup(FSDataOutputStream out) throws IOException {
+      long remaining = dfsBlockSize - (out.getPos() % dfsBlockSize);
+
+      if (isPaddingNeeded(remaining)) {
+        if (DEBUG) LOG.debug("Adding " + remaining + " bytes of padding (" +
+            "row group size=" + rowGroupSize + "B, " +
+            "block size=" + dfsBlockSize + "B)");
+        for (; remaining > 0; remaining -= zeros.length) {
+          out.write(zeros, 0, (int) Math.min((long) zeros.length, remaining));
+        }
+      }
+    }
+
+    @Override
+    public long nextRowGroupSize(FSDataOutputStream out) throws IOException {
+      long remaining = dfsBlockSize - (out.getPos() % dfsBlockSize);
+
+      if (isPaddingNeeded(remaining)) {
+        return rowGroupSize;
+      }
+
+      return Math.min(remaining, rowGroupSize);
+    }
+
+    protected boolean isPaddingNeeded(long remaining) {
+      return (remaining <= maxPaddingSize);
+    }
+  }
 }
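To make the padding decisions concrete, here is a self-contained sketch (not part of the patch) that re-implements the arithmetic from alignForRowGroup and nextRowGroupSize with assumed sizes: a 128 MB block, a 64 MB row group target, and an 8 MB padding limit.

```java
// Standalone re-implementation of the padding math in PaddingAlignment,
// using made-up sizes; positions are byte offsets from the start of the file.
public class PaddingMathSketch {
  static final long DFS_BLOCK_SIZE = 128L * 1024 * 1024; // 128 MB HDFS block (assumed)
  static final long ROW_GROUP_SIZE = 64L * 1024 * 1024;  // 64 MB target row group
  static final long MAX_PADDING = 8L * 1024 * 1024;      // 8 MB padding budget

  public static void main(String[] args) {
    // e.g. after writing ~123 MB, only 5 MB remain in the current block
    long pos = 123L * 1024 * 1024;
    long remaining = DFS_BLOCK_SIZE - (pos % DFS_BLOCK_SIZE);
    boolean pad = remaining <= MAX_PADDING; // isPaddingNeeded()
    // pad == true: write 5 MB of zeros so the next row group starts on a block boundary,
    // and the next row group gets the full 64 MB target
    long nextRowGroupSize = pad ? ROW_GROUP_SIZE : Math.min(remaining, ROW_GROUP_SIZE);
    System.out.println("pad " + remaining + " bytes? " + pad
        + ", next row group target = " + nextRowGroupSize);

    // e.g. at ~70 MB written, 58 MB remain: too much to pad away, so no padding,
    // and the next row group is capped at min(58 MB, 64 MB) = 58 MB to avoid spanning blocks
    pos = 70L * 1024 * 1024;
    remaining = DFS_BLOCK_SIZE - (pos % DFS_BLOCK_SIZE);
    pad = remaining <= MAX_PADDING;
    nextRowGroupSize = pad ? ROW_GROUP_SIZE : Math.min(remaining, ROW_GROUP_SIZE);
    System.out.println("pad " + remaining + " bytes? " + pad
        + ", next row group target = " + nextRowGroupSize);
  }
}
```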
Contributor: this is normally overridable via the hadoop config, right? By hardcoding it, requests for non-default buffer sizes won't be respected, right?

Contributor (author): We weren't exposing a way to set this when we create the file anyway, so this isn't removing an option that users had. It is setting an option that the library had and wasn't using, hard-coding it to the value we would always use anyway. I need to have a value for this because there isn't a method call that allows us to set the block size and default the buffer size. This default is very stable, so I think this is perfectly safe.

Contributor: Yeah, that's what I'm wondering about -- we were using a method before that didn't ask for this info; did that method pull it from the hadoop configuration? If so, then users were previously able to change this by messing with their hadoop configuration, but now they can't. On the other hand, if the method we were using before just grabbed the hardcoded default, then this makes sense, as it's no different. (I'm not worried about duplicating the default, just about whether we've taken away the ability to change this via hadoop configuration.)

Contributor (author): No, this is something you set through the API; it wasn't exposed before, and the default hasn't changed since Hadoop-1.

Contributor: ok, thanks.
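For context on the hard-coded buffer size being discussed: the FileSystem.create overload that accepts a block size also requires an explicit buffer size and replication factor, which is why the patch supplies DFS_BUFFER_SIZE_DEFAULT alongside the filesystem's default replication. A sketch of that call, with a hypothetical path:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Sketch of the create() overload used by the patch; there is no overload that
// takes a block size without also taking a buffer size, hence the fixed default.
public class CreateWithBlockSizeSketch {
  private static final int DFS_BUFFER_SIZE_DEFAULT = 4096; // same constant as the patch

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path file = new Path("file:///tmp/example.parquet"); // hypothetical path
    FileSystem fs = file.getFileSystem(conf);

    long blockSize = 128L * 1024 * 1024; // desired block size / row group alignment target
    FSDataOutputStream out = fs.create(file, true /* overwrite */,
        DFS_BUFFER_SIZE_DEFAULT, fs.getDefaultReplication(file), blockSize);
    out.close();
  }
}
```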