Skip to content

Commit 323899e

Browse files
committed
PARQUET-1647: Implement logical type FLOAT16
1 parent 4ca813a commit 323899e

File tree

20 files changed

+1196
-8
lines changed

20 files changed

+1196
-8
lines changed

parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,13 @@
2121
import java.util.Arrays;
2222
import org.apache.parquet.column.UnknownColumnTypeException;
2323
import org.apache.parquet.io.api.Binary;
24+
import org.apache.parquet.schema.LogicalTypeAnnotation;
2425
import org.apache.parquet.schema.PrimitiveComparator;
2526
import org.apache.parquet.schema.PrimitiveStringifier;
2627
import org.apache.parquet.schema.PrimitiveType;
2728
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
2829
import org.apache.parquet.schema.Type;
29-
30+
import org.apache.parquet.schema.Float16;
3031

3132
/**
3233
* Statistics class to keep track of statistics in parquet pages and column chunks
@@ -139,6 +140,43 @@ public Statistics<?> build() {
139140
}
140141
}
141142

143+
// Builder for FLOAT16 type to handle special cases of min/max values like NaN, -0.0, and 0.0
144+
private static class Float16Builder extends Builder {
145+
private final static Binary POSITIVE_ZERO_LITTLE_ENDIAN = Binary.fromConstantByteArray(new byte[] {0x00, 0x00});
146+
private final static Binary NEGATIVE_ZERO_LITTLE_ENDIAN = Binary.fromConstantByteArray(new byte[] {0x00, (byte) 0x80});
147+
148+
public Float16Builder(PrimitiveType type) {
149+
super(type);
150+
assert type.getPrimitiveTypeName() == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
151+
assert type.getTypeLength() == 2;
152+
}
153+
154+
@Override
155+
public Statistics<?> build() {
156+
BinaryStatistics stats = (BinaryStatistics) super.build();
157+
if (stats.hasNonNullValue()) {
158+
Binary bMin = stats.genericGetMin();
159+
Binary bMax = stats.genericGetMax();
160+
short min = bMin.get2BytesLittleEndian();
161+
short max = bMax.get2BytesLittleEndian();
162+
// Drop min/max values in case of NaN as the sorting order of values is undefined for this case
163+
if (Float16.isNaN(min) || Float16.isNaN(max)) {
164+
stats.setMinMax(POSITIVE_ZERO_LITTLE_ENDIAN, NEGATIVE_ZERO_LITTLE_ENDIAN);
165+
((Statistics<?>) stats).hasNonNullValue = false;
166+
} else {
167+
// Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
168+
if (min == (short) 0x0000) {
169+
stats.setMinMax(NEGATIVE_ZERO_LITTLE_ENDIAN, bMax);
170+
}
171+
if (max == (short) 0x8000) {
172+
stats.setMinMax(bMin, POSITIVE_ZERO_LITTLE_ENDIAN);
173+
}
174+
}
175+
}
176+
return stats;
177+
}
178+
}
179+
142180
private final PrimitiveType type;
143181
private final PrimitiveComparator<T> comparator;
144182
private boolean hasNonNullValue;
@@ -226,6 +264,11 @@ public static Builder getBuilderForReading(PrimitiveType type) {
226264
return new FloatBuilder(type);
227265
case DOUBLE:
228266
return new DoubleBuilder(type);
267+
case FIXED_LEN_BYTE_ARRAY:
268+
LogicalTypeAnnotation logicalTypeAnnotation = type.getLogicalTypeAnnotation();
269+
if (logicalTypeAnnotation instanceof LogicalTypeAnnotation.Float16LogicalTypeAnnotation) {
270+
return new Float16Builder(type);
271+
}
229272
default:
230273
return new Builder(type);
231274
}

parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import java.io.OutputStream;
2525
import java.io.Serializable;
2626
import java.nio.ByteBuffer;
27+
import java.nio.ByteOrder;
2728
import java.nio.CharBuffer;
2829
import java.nio.charset.CharacterCodingException;
2930
import java.nio.charset.CharsetEncoder;
@@ -85,6 +86,8 @@ private Binary() { }
8586

8687
abstract public ByteBuffer toByteBuffer();
8788

89+
/**
 * Reads this binary as a single 16-bit value in little-endian byte order: the first byte
 * supplies the low-order 8 bits and the second byte the high-order 8 bits.
 * The implementations in this file throw {@link IllegalArgumentException} when the
 * binary is not exactly 2 bytes long.
 *
 * @return the two bytes combined into a {@code short}
 */
abstract public short get2BytesLittleEndian();
90+
8891
@Override
8992
public boolean equals(Object obj) {
9093
if (obj == null) {
@@ -218,6 +221,15 @@ public ByteBuffer toByteBuffer() {
218221
return ByteBuffer.wrap(value, offset, length);
219222
}
220223

224+
@Override
225+
public short get2BytesLittleEndian() {
226+
if (length != 2) {
227+
throw new IllegalArgumentException("length must be 2");
228+
}
229+
230+
return (short) (((value[offset + 1] & 0xff) << 8) | (value[offset] & 0xff));
231+
}
232+
221233
@Override
222234
public void writeTo(DataOutput out) throws IOException {
223235
out.write(value, offset, length);
@@ -371,6 +383,15 @@ public ByteBuffer toByteBuffer() {
371383
return ByteBuffer.wrap(value);
372384
}
373385

386+
@Override
387+
public short get2BytesLittleEndian() {
388+
if (value.length != 2) {
389+
throw new IllegalArgumentException("length must be 2");
390+
}
391+
392+
return (short) (((value[1] & 0xff) << 8) | (value[0] & 0xff));
393+
}
394+
374395
@Override
375396
public void writeTo(DataOutput out) throws IOException {
376397
out.write(value);
@@ -547,6 +568,15 @@ public ByteBuffer toByteBuffer() {
547568
return ret;
548569
}
549570

571+
@Override
572+
public short get2BytesLittleEndian() {
573+
if (length != 2) {
574+
throw new IllegalArgumentException("length must be 2");
575+
}
576+
577+
return value.order(ByteOrder.LITTLE_ENDIAN).getShort(offset);
578+
}
579+
550580
@Override
551581
public void writeTo(DataOutput out) throws IOException {
552582
// TODO: should not have to materialize those bytes
Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,259 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.parquet.schema;
20+
21+
import org.apache.parquet.io.api.Binary;
22+
23+
/**
24+
* The class is a utility class to manipulate half-precision 16-bit
25+
* <a href="https://en.wikipedia.org/wiki/Half-precision_floating-point_format">IEEE 754</a>
26+
* floating point data types (also called fp16 or binary16). A half-precision float can be
27+
* created from or converted to single-precision floats, and is stored in a short data type.
28+
* The IEEE 754 standard specifies an float16 as having the following format:
29+
* <ul>
30+
* <li>Sign bit: 1 bit</li>
31+
* <li>Exponent width: 5 bits</li>
32+
* <li>Significand: 10 bits</li>
33+
* </ul>
34+
*
35+
* <p>The format is laid out as follows:</p>
36+
* <pre>
37+
* 1 11111 1111111111
38+
* ^ --^-- -----^----
39+
* sign | |_______ significand
40+
* |
41+
* -- exponent
42+
* </pre>
43+
* Half-precision floating points can be useful to save memory and/or
44+
* bandwidth at the expense of range and precision when compared to single-precision
45+
* floating points (float32).
46+
* Ref: https://android.googlesource.com/platform/libcore/+/master/luni/src/main/java/libcore/util/FP16.java
47+
*/
48+
public class Float16 {
49+
// Positive infinity of type half-precision float.
50+
private static final short POSITIVE_INFINITY = (short) 0x7c00;
51+
// A Not-a-Number representation of a half-precision float.
52+
private static final short NaN = (short) 0x7e00;
53+
// The bitmask to and a number with to obtain the sign bit.
54+
private static final int SIGN_MASK = 0x8000;
55+
// The offset to shift by to obtain the exponent bits.
56+
private static final int EXPONENT_SHIFT = 10;
57+
// The bitmask to and a number shifted by EXPONENT_SHIFT right, to obtain exponent bits.
58+
private static final int SHIFTED_EXPONENT_MASK = 0x1f;
59+
// The bitmask to and a number with to obtain significand bits.
60+
private static final int SIGNIFICAND_MASK = 0x3ff;
61+
// The offset of the exponent from the actual value.
62+
private static final int EXPONENT_BIAS = 15;
63+
// The offset to shift by to obtain the sign bit.
64+
private static final int SIGN_SHIFT = 15;
65+
// The bitmask to AND with to obtain exponent and significand bits.
66+
private static final int EXPONENT_SIGNIFICAND_MASK = 0x7fff;
67+
68+
private static final int FP32_SIGN_SHIFT = 31;
69+
private static final int FP32_EXPONENT_SHIFT = 23;
70+
private static final int FP32_SHIFTED_EXPONENT_MASK = 0xff;
71+
private static final int FP32_SIGNIFICAND_MASK = 0x7fffff;
72+
private static final int FP32_EXPONENT_BIAS = 127;
73+
private static final int FP32_QNAN_MASK = 0x400000;
74+
private static final int FP32_DENORMAL_MAGIC = 126 << 23;
75+
private static final float FP32_DENORMAL_FLOAT = Float.intBitsToFloat(FP32_DENORMAL_MAGIC);
76+
77+
/**
78+
* Returns true if the specified half-precision float value represents
79+
* a Not-a-Number, false otherwise.
80+
*
81+
* @param h A half-precision float value
82+
* @return True if the value is a NaN, false otherwise
83+
*
84+
*/
85+
public static boolean isNaN(short h) {
86+
return (h & EXPONENT_SIGNIFICAND_MASK) > POSITIVE_INFINITY;
87+
}
88+
89+
/**
90+
* <p>Compares the two specified half-precision float values. The following
91+
* conditions apply during the comparison:</p>
92+
*
93+
* <ul>
94+
* <li>NaN is considered by this method to be equal to itself and greater
95+
* than all other half-precision float values (including {@code #POSITIVE_INFINITY})</li>
96+
* <li>POSITIVE_ZERO is considered by this method to be greater than NEGATIVE_ZERO.</li>
97+
* </ul>
98+
*
99+
* @param x The first half-precision float value to compare.
100+
* @param y The second half-precision float value to compare
101+
*
102+
* @return The value {@code 0} if {@code x} is numerically equal to {@code y}, a
103+
* value less than {@code 0} if {@code x} is numerically less than {@code y},
104+
* and a value greater than {@code 0} if {@code x} is numerically greater
105+
* than {@code y}
106+
*
107+
*/
108+
public static int compare(short x, short y) {
109+
boolean xIsNaN = isNaN(x);
110+
boolean yIsNaN = isNaN(y);
111+
112+
if (!xIsNaN && !yIsNaN) {
113+
int first = ((x & SIGN_MASK) != 0 ? 0x8000 - (x & 0xffff) : x & 0xffff);
114+
int second = ((y & SIGN_MASK) != 0 ? 0x8000 - (y & 0xffff) : y & 0xffff);
115+
// Returns true if the first half-precision float value is less
116+
// (smaller toward negative infinity) than the second half-precision float value.
117+
if (first < second) {
118+
return -1;
119+
}
120+
121+
// Returns true if the first half-precision float value is greater
122+
// (larger toward positive infinity) than the second half-precision float value.
123+
if (first > second) {
124+
return 1;
125+
}
126+
}
127+
128+
// Collapse NaNs, akin to halfToIntBits(), but we want to keep
129+
// (signed) short value types to preserve the ordering of -0.0
130+
// and +0.0
131+
short xBits = xIsNaN ? NaN : x;
132+
short yBits = yIsNaN ? NaN : y;
133+
return (xBits == yBits ? 0 : (xBits < yBits ? -1 : 1));
134+
}
135+
136+
/**
137+
* Converts the specified half-precision float value in Binary little endian into a
138+
* single-precision float value. The following special cases are handled:
139+
* If the input is NaN, the returned value is Float NaN.
140+
* If the input is POSITIVE_INFINITY or NEGATIVE_INFINITY, the returned value is respectively
141+
* Float POSITIVE_INFINITY or Float NEGATIVE_INFINITY.
142+
* If the input is 0 (positive or negative), the returned value is +/-0.0f.
143+
* Otherwise, the returned value is a normalized single-precision float value.
144+
*
145+
* @param b The half-precision float value in Binary little endian to convert to single-precision
146+
* @return A normalized single-precision float value
147+
*/
148+
static float toFloat(Binary b) {
149+
short h = b.get2BytesLittleEndian();
150+
int bits = h & 0xffff;
151+
int s = bits & SIGN_MASK;
152+
int e = (bits >>> EXPONENT_SHIFT) & SHIFTED_EXPONENT_MASK;
153+
int m = (bits ) & SIGNIFICAND_MASK;
154+
int outE = 0;
155+
int outM = 0;
156+
if (e == 0) { // Denormal or 0
157+
if (m != 0) {
158+
// Convert denorm fp16 into normalized fp32
159+
float o = Float.intBitsToFloat(FP32_DENORMAL_MAGIC + m);
160+
o -= FP32_DENORMAL_FLOAT;
161+
return s == 0 ? o : -o;
162+
}
163+
} else {
164+
outM = m << 13;
165+
if (e == 0x1f) { // Infinite or NaN
166+
outE = 0xff;
167+
if (outM != 0) { // SNaNs are quieted
168+
outM |= FP32_QNAN_MASK;
169+
}
170+
} else {
171+
outE = e - EXPONENT_BIAS + FP32_EXPONENT_BIAS;
172+
}
173+
}
174+
int out = (s << 16) | (outE << FP32_EXPONENT_SHIFT) | outM;
175+
return Float.intBitsToFloat(out);
176+
}
177+
178+
/**
179+
* Converts the specified single-precision float value into a
180+
* half-precision float value. The following special cases are handled:
181+
*
182+
* If the input is NaN, the returned value is NaN.
183+
* If the input is Float POSITIVE_INFINITY or Float NEGATIVE_INFINITY,
184+
* the returned value is respectively POSITIVE_INFINITY or NEGATIVE_INFINITY.
185+
* If the input is 0 (positive or negative), the returned value is
186+
* POSITIVE_ZERO or NEGATIVE_ZERO.
187+
* If the input is a less than MIN_VALUE, the returned value
188+
* is flushed to POSITIVE_ZERO or NEGATIVE_ZERO.
189+
* If the input is a less than MIN_NORMAL, the returned value
190+
* is a denorm half-precision float.
191+
* Otherwise, the returned value is rounded to the nearest
192+
* representable half-precision float value.
193+
*
194+
* @param f The single-precision float value to convert to half-precision
195+
* @return A half-precision float value
196+
*/
197+
static short toFloat16(float f) {
198+
int bits = Float.floatToRawIntBits(f);
199+
int s = (bits >>> FP32_SIGN_SHIFT );
200+
int e = (bits >>> FP32_EXPONENT_SHIFT) & FP32_SHIFTED_EXPONENT_MASK;
201+
int m = (bits ) & FP32_SIGNIFICAND_MASK;
202+
int outE = 0;
203+
int outM = 0;
204+
if (e == 0xff) { // Infinite or NaN
205+
outE = 0x1f;
206+
outM = m != 0 ? 0x200 : 0;
207+
} else {
208+
e = e - FP32_EXPONENT_BIAS + EXPONENT_BIAS;
209+
if (e >= 0x1f) { // Overflow
210+
outE = 0x1f;
211+
} else if (e <= 0) { // Underflow
212+
if (e < -10) {
213+
// The absolute fp32 value is less than MIN_VALUE, flush to +/-0
214+
} else {
215+
// The fp32 value is a normalized float less than MIN_NORMAL,
216+
// we convert to a denorm fp16
217+
m = m | 0x800000;
218+
int shift = 14 - e;
219+
outM = m >> shift;
220+
int lowm = m & ((1 << shift) - 1);
221+
int hway = 1 << (shift - 1);
222+
// if above halfway or exactly halfway and outM is odd
223+
if (lowm + (outM & 1) > hway){
224+
// Round to nearest even
225+
// Can overflow into exponent bit, which surprisingly is OK.
226+
// This increment relies on the +outM in the return statement below
227+
outM++;
228+
}
229+
}
230+
} else {
231+
outE = e;
232+
outM = m >> 13;
233+
// if above halfway or exactly halfway and outM is odd
234+
if ((m & 0x1fff) + (outM & 0x1) > 0x1000) {
235+
// Round to nearest even
236+
// Can overflow into exponent bit, which surprisingly is OK.
237+
// This increment relies on the +outM in the return statement below
238+
outM++;
239+
}
240+
}
241+
}
242+
// The outM is added here as the +1 increments for outM above can
243+
// cause an overflow in the exponent bit which is OK.
244+
return (short) ((s << SIGN_SHIFT) | (outE << EXPONENT_SHIFT) + outM);
245+
}
246+
247+
/**
248+
* Returns a string representation of the specified half-precision
249+
* float value. Calling this method is equivalent to calling
250+
* <code>Float.toString(toFloat(h))</code>. See {@link Float#toString(float)}
251+
* for more information on the format of the string representation.
252+
*
253+
* @param h A half-precision float value in binary little-endian format
254+
* @return A string representation of the specified value
255+
*/
256+
static String toFloatString(Binary h) {
257+
return Float.toString(Float16.toFloat(h));
258+
}
259+
}

0 commit comments

Comments
 (0)