@@ -45,6 +45,8 @@ pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Pag
4545pub const DEFAULT_MAX_STATISTICS_SIZE : usize = 4096 ;
4646/// Default value for [`WriterProperties::max_row_group_size`] (`1024 * 1024`)
4747pub const DEFAULT_MAX_ROW_GROUP_SIZE : usize = 1024 * 1024 ;
48+ /// Default value for [`WriterProperties::bloom_filter_position`] ([`BloomFilterPosition::AfterRowGroup`])
49+ pub const DEFAULT_BLOOM_FILTER_POSITION : BloomFilterPosition = BloomFilterPosition :: AfterRowGroup ;
4850/// Default value for [`WriterProperties::created_by`]: `parquet-rs version <VERSION>`, where `<VERSION>` is this crate's `CARGO_PKG_VERSION` at compile time
4951pub const DEFAULT_CREATED_BY : & str = concat ! ( "parquet-rs version " , env!( "CARGO_PKG_VERSION" ) ) ;
5052/// Default value for [`WriterProperties::column_index_truncate_length`]
@@ -88,6 +90,24 @@ impl FromStr for WriterVersion {
8890 }
8991}
9092
93+ /// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
94+ /// write Bloom filters.
95+ ///
96+ /// Basic constant, which is not part of the Thrift definition.
97+ #[ derive( Debug , Clone , Copy , PartialEq , Eq ) ]
98+ pub enum BloomFilterPosition {
99+ /// Write the Bloom filters of each row group right after that row group.
100+ ///
101+ /// This saves memory by writing each filter as soon as it is computed, at the cost
102+ /// of data locality for readers. This is the default ([`DEFAULT_BLOOM_FILTER_POSITION`]).
103+ AfterRowGroup ,
104+ /// Write all Bloom filters at the end of the file.
105+ ///
106+ /// This allows better data locality for readers, at the cost of memory usage
107+ /// for writers.
108+ End ,
109+ }
110+
91111/// Reference-counted ([`Arc`]) pointer to [`WriterProperties`].
92112pub type WriterPropertiesPtr = Arc < WriterProperties > ;
93113
@@ -132,6 +152,7 @@ pub struct WriterProperties {
132152 data_page_row_count_limit : usize ,
133153 write_batch_size : usize ,
134154 max_row_group_size : usize ,
155+ bloom_filter_position : BloomFilterPosition ,
135156 writer_version : WriterVersion ,
136157 created_by : String ,
137158 pub ( crate ) key_value_metadata : Option < Vec < KeyValue > > ,
@@ -219,6 +240,11 @@ impl WriterProperties {
219240 self . max_row_group_size
220241 }
221242
243+ /// Returns where in the file Bloom filters are written (see [`BloomFilterPosition`]).
244+ pub fn bloom_filter_position ( & self ) -> BloomFilterPosition {
245+ self . bloom_filter_position
246+ }
247+
222248 /// Returns configured writer version.
223249 pub fn writer_version ( & self ) -> WriterVersion {
224250 self . writer_version
@@ -340,6 +366,7 @@ pub struct WriterPropertiesBuilder {
340366 data_page_row_count_limit : usize ,
341367 write_batch_size : usize ,
342368 max_row_group_size : usize ,
369+ bloom_filter_position : BloomFilterPosition ,
343370 writer_version : WriterVersion ,
344371 created_by : String ,
345372 key_value_metadata : Option < Vec < KeyValue > > ,
@@ -359,6 +386,7 @@ impl WriterPropertiesBuilder {
359386 data_page_row_count_limit : DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT ,
360387 write_batch_size : DEFAULT_WRITE_BATCH_SIZE ,
361388 max_row_group_size : DEFAULT_MAX_ROW_GROUP_SIZE ,
389+ bloom_filter_position : DEFAULT_BLOOM_FILTER_POSITION ,
362390 writer_version : DEFAULT_WRITER_VERSION ,
363391 created_by : DEFAULT_CREATED_BY . to_string ( ) ,
364392 key_value_metadata : None ,
@@ -378,6 +406,7 @@ impl WriterPropertiesBuilder {
378406 data_page_row_count_limit : self . data_page_row_count_limit ,
379407 write_batch_size : self . write_batch_size ,
380408 max_row_group_size : self . max_row_group_size ,
409+ bloom_filter_position : self . bloom_filter_position ,
381410 writer_version : self . writer_version ,
382411 created_by : self . created_by ,
383412 key_value_metadata : self . key_value_metadata ,
@@ -489,6 +518,12 @@ impl WriterPropertiesBuilder {
489518 self
490519 }
491520
521+ /// Sets where in the final file Bloom filters are written (defaults to [`BloomFilterPosition::AfterRowGroup`], see [`DEFAULT_BLOOM_FILTER_POSITION`]).
522+ pub fn set_bloom_filter_position ( mut self , value : BloomFilterPosition ) -> Self {
523+ self . bloom_filter_position = value;
524+ self
525+ }
526+
492527 /// Sets "created by" property (defaults to `parquet-rs version <VERSION>`).
493528 pub fn set_created_by ( mut self , value : String ) -> Self {
494529 self . created_by = value;
@@ -1054,6 +1089,7 @@ mod tests {
10541089 ) ;
10551090 assert_eq ! ( props. write_batch_size( ) , DEFAULT_WRITE_BATCH_SIZE ) ;
10561091 assert_eq ! ( props. max_row_group_size( ) , DEFAULT_MAX_ROW_GROUP_SIZE ) ;
1092+ assert_eq ! ( props. bloom_filter_position( ) , DEFAULT_BLOOM_FILTER_POSITION ) ;
10571093 assert_eq ! ( props. writer_version( ) , DEFAULT_WRITER_VERSION ) ;
10581094 assert_eq ! ( props. created_by( ) , DEFAULT_CREATED_BY ) ;
10591095 assert_eq ! ( props. key_value_metadata( ) , None ) ;
0 commit comments