Skip to content
Closed
Changes from 1 commit
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
fb9a42d
add two implementations (sparse and dense) for UnsafeArrayData
kiszk Jun 14, 2016
d931428
fix failures of testsuite
kiszk Jun 15, 2016
9777a2d
fix errors of unit tests
kiszk Jun 15, 2016
000eda4
fix failures of unit tests
kiszk Jun 15, 2016
804f081
make DenseID public
kiszk Jun 23, 2016
e6fb261
Use one implementation approach
kiszk Jun 25, 2016
a313084
fix test failures
kiszk Jun 25, 2016
68d92f7
fix test failures
kiszk Jun 25, 2016
7f2da14
update test suite
kiszk Jun 25, 2016
2f26f6f
fix scala style error
kiszk Jun 25, 2016
ccef63c
revert changes
kiszk Jun 25, 2016
c4f1b5e
addressed comments
kiszk Jun 28, 2016
34a5c6a
add benchmark
kiszk Jun 28, 2016
7a77b20
fix scala style error
kiszk Jun 28, 2016
7b0d4da
addressed comments
kiszk Jul 1, 2016
b4eac29
addressed comments
kiszk Jul 2, 2016
eecf6bd
fix parameters of Platform.OFFSET
kiszk Jul 3, 2016
d88a25a
update benchmark results
kiszk Jul 3, 2016
db15432
add test cases
kiszk Jul 3, 2016
3fa7052
addressed comments
kiszk Jul 4, 2016
4c094c2
addressed comments
kiszk Jul 6, 2016
9887171
update test cases
kiszk Jul 6, 2016
9fe7ad0
address comments
kiszk Jul 7, 2016
e4b4b52
address comments for test cases and benchmark
kiszk Jul 7, 2016
585ca7b
addressed comments
kiszk Jul 8, 2016
9933a06
addressed review comments
kiszk Aug 6, 2016
919e832
fixed test failures
kiszk Aug 7, 2016
0886e3a
update test suites
kiszk Aug 9, 2016
c385bf4
align each of variable length elements to 8 bytes
kiszk Aug 18, 2016
c8813db
fixed test failures
kiszk Aug 20, 2016
aa7cfdb
fixed test failures
kiszk Sep 9, 2016
0b7867b
address review comments
kiszk Sep 20, 2016
ab9a16a
address review comments
kiszk Sep 20, 2016
515701b
address review comments
kiszk Sep 20, 2016
8169abd
change benchmark size
kiszk Sep 26, 2016
e356a79
addressed comments
kiszk Sep 26, 2016
2ef6e3b
update performance results
kiszk Sep 26, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
addressed comments
  • Loading branch information
kiszk committed Sep 26, 2016
commit c4f1b5e269dbb31b41b40f8472f8827fbb997bfe
Original file line number Diff line number Diff line change
Expand Up @@ -33,21 +33,20 @@
/**
* An Unsafe implementation of Array which is backed by raw memory instead of Java objects.
*
* Each tuple has three four: [numElements] [null bits] [values] [variable length portion]
* Each tuple has four parts: [numElements][null bits][values or offset][variable length portion]
*
* The `numElements` is 4 bytes storing the number of elements of this array.
*
* In the `null bits` region, we store 1 bit per element, representing whether an element is null.
* Its total size is ceil(numElements / 8) bytes, and it is aligned to 8-byte word boundaries.
*
* In the `offsets` region, we store 4 bytes per element, represents the relative offset (w.r.t. the
* base address of the array) of this element in `values` region. We can get the length of this
* element by subtracting next offset.
* Note that offset can by negative which means this element is null.
*
* In the `values` region, we store the content of elements. As we can get length info, so elements
* can be variable-length.
* In the `values or offset` region, we store the content of elements. For fields that hold
* fixed-length primitive types, such as long, double, or int, we store the value directly
* in the field. For fields with non-primitive or variable-length values, we store a relative
* offset (w.r.t. the base address of the array data) that points to the beginning of the variable-length
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the base address of the array data

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

* field, and length (they are combined into a long).
*
* Instances of `UnsafeArrayData` act as pointers to array data stored in this format.
*/

// todo: there is a lot of duplicated code between UnsafeRow and UnsafeArrayData.
Expand All @@ -67,11 +66,11 @@ public static int calculateHeaderPortionInBytes(int numFields) {
// The 4-bytes header of `numElements` is also included.
private int sizeInBytes;

/** The width of the null tracking bit set, in bytes */
/** The width of the null tracking bit set plus `numElements`, in bytes */
private int headerInBytes;

private long getFieldOffset(int ordinal, int scale) {
return baseOffset + headerInBytes + ordinal * scale;
private long getFieldOffset(int ordinal, int elementSize) {
return baseOffset + headerInBytes + ordinal * elementSize;
}

public Object getBaseObject() { return baseObject; }
Expand All @@ -92,7 +91,7 @@ public Object[] array() {
* `pointTo()` has been called, since the value returned by this constructor is equivalent
* to a null pointer.
*/
public UnsafeArrayData() { }
public UnsafeArrayData() { }

@Override
public int numElements() { return numElements; }
Expand Down Expand Up @@ -123,7 +122,7 @@ public boolean isNullAt(int ordinal) {
}

@Override
public final Object get(int ordinal, DataType dataType) {
public Object get(int ordinal, DataType dataType) {
if (isNullAt(ordinal) || dataType instanceof NullType) {
return null;
} else if (dataType instanceof BooleanType) {
Expand Down Expand Up @@ -210,6 +209,7 @@ public double getDouble(int ordinal) {

@Override
public Decimal getDecimal(int ordinal, int precision, int scale) {
assertIndexIsValid(ordinal);
if (isNullAt(ordinal)) {
return null;
}
Expand Down Expand Up @@ -286,11 +286,30 @@ public UnsafeMapData getMap(int ordinal) {
return map;
}

public final void writeToMemory(Object target, long targetOffset) {
// This `hashCode` computation could consume much processor time for large data.
// If the computation becomes a bottleneck, we can use a light-weight logic; the first fixed bytes
// are used to compute `hashCode` (See `Vector.hashCode`).
// The same issue exists in `UnsafeRow.hashCode`.
@Override
public int hashCode() {
return Murmur3_x86_32.hashUnsafeBytes(baseObject, baseOffset, sizeInBytes, 42);
}

@Override
public boolean equals(Object other) {
if (other instanceof UnsafeArrayData) {
UnsafeArrayData o = (UnsafeArrayData) other;
return (sizeInBytes == o.sizeInBytes) &&
ByteArrayMethods.arrayEquals(baseObject, baseOffset, o.baseObject, o.baseOffset,
sizeInBytes);
}
return false;
}
public void writeToMemory(Object target, long targetOffset) {
Platform.copyMemory(baseObject, baseOffset, target, targetOffset, sizeInBytes);
}

public final void writeTo(ByteBuffer buffer) {
public void writeTo(ByteBuffer buffer) {
assert(buffer.hasArray());
byte[] target = buffer.array();
int offset = buffer.arrayOffset();
Expand All @@ -299,10 +318,6 @@ public final void writeTo(ByteBuffer buffer) {
buffer.position(pos + sizeInBytes);
}

// This `hashCode` computation could consume much processor time for large data.
// If the computation becomes a bottleneck, we can use a light-weight logic; the first fixed bytes
// are used to compute `hashCode` (See `Vector.hashCode`).
// The same issue exists in `UnsafeRow.hashCode`.
@Override
public UnsafeArrayData copy() {
UnsafeArrayData arrayCopy = new UnsafeArrayData();
Expand Down Expand Up @@ -336,7 +351,7 @@ public short[] toShortArray() {
int size = numElements();
short[] values = new short[size];
Platform.copyMemory(
baseObject, baseOffset + headerInBytes, values, Platform.BYTE_ARRAY_OFFSET, size * 2);
baseObject, baseOffset + headerInBytes, values, Platform.SHORT_ARRAY_OFFSET, size * 2);
return values;
}

Expand All @@ -345,7 +360,7 @@ public int[] toIntArray() {
int size = numElements();
int[] values = new int[size];
Platform.copyMemory(
baseObject, baseOffset + headerInBytes, values, Platform.BYTE_ARRAY_OFFSET, size * 4);
baseObject, baseOffset + headerInBytes, values, Platform.INT_ARRAY_OFFSET, size * 4);
return values;
}

Expand All @@ -354,7 +369,7 @@ public long[] toLongArray() {
int size = numElements();
long[] values = new long[size];
Platform.copyMemory(
baseObject, baseOffset + headerInBytes, values, Platform.BYTE_ARRAY_OFFSET, size * 8);
baseObject, baseOffset + headerInBytes, values, Platform.LONG_ARRAY_OFFSET, size * 8);
return values;
}

Expand All @@ -363,7 +378,7 @@ public float[] toFloatArray() {
int size = numElements();
float[] values = new float[size];
Platform.copyMemory(
baseObject, baseOffset + headerInBytes, values, Platform.BYTE_ARRAY_OFFSET, size * 4);
baseObject, baseOffset + headerInBytes, values, Platform.FLOAT_ARRAY_OFFSET, size * 4);
return values;
}

Expand All @@ -372,22 +387,21 @@ public double[] toDoubleArray() {
int size = numElements();
double[] values = new double[size];
Platform.copyMemory(
baseObject, baseOffset + headerInBytes, values, Platform.BYTE_ARRAY_OFFSET, size * 8);
baseObject, baseOffset + headerInBytes, values, Platform.DOUBLE_ARRAY_OFFSET, size * 8);
return values;
}

public static UnsafeArrayData fromPrimitiveArray(int[] arr) {
final int elementSize = 4;
final int headerSize = calculateHeaderPortionInBytes(arr.length);
if (arr.length > (Integer.MAX_VALUE - headerSize) / elementSize) {
private static UnsafeArrayData fromPrimitiveArray(Object arr, int length, final int elementSize) {
final int headerSize = calculateHeaderPortionInBytes(length);
if (length > (Integer.MAX_VALUE - headerSize) / elementSize) {
throw new UnsupportedOperationException("Cannot convert this array to unsafe format as " +
"it's too big.");
}

final int valueRegionSize = elementSize * arr.length;
final int valueRegionSize = elementSize * length;
final byte[] data = new byte[valueRegionSize + headerSize];

Platform.putInt(data, Platform.BYTE_ARRAY_OFFSET, arr.length);
Platform.putInt(data, Platform.BYTE_ARRAY_OFFSET, length);
Platform.copyMemory(arr, Platform.INT_ARRAY_OFFSET, data,
Platform.BYTE_ARRAY_OFFSET + headerSize, valueRegionSize);

Expand All @@ -396,39 +410,31 @@ public static UnsafeArrayData fromPrimitiveArray(int[] arr) {
return result;
}

public static UnsafeArrayData fromPrimitiveArray(double[] arr) {
final int elementSize = 8;
final int headerSize = calculateHeaderPortionInBytes(arr.length);
if (arr.length > (Integer.MAX_VALUE - headerSize) / elementSize) {
throw new UnsupportedOperationException("Cannot convert this array to unsafe format as " +
"it's too big.");
}
public static UnsafeArrayData fromPrimitiveArray(boolean[] arr) {
return fromPrimitiveArray(arr, arr.length, 1);
}

final int valueRegionSize = elementSize * arr.length;
final byte[] data = new byte[valueRegionSize + headerSize];
public static UnsafeArrayData fromPrimitiveArray(byte[] arr) {
return fromPrimitiveArray(arr, arr.length, 1);
}

Platform.putInt(data, Platform.BYTE_ARRAY_OFFSET, arr.length);
Platform.copyMemory(arr, Platform.DOUBLE_ARRAY_OFFSET, data,
Platform.BYTE_ARRAY_OFFSET + headerSize, valueRegionSize);
public static UnsafeArrayData fromPrimitiveArray(short[] arr) {
return fromPrimitiveArray(arr, arr.length, 2);
}

UnsafeArrayData result = new UnsafeArrayData();
result.pointTo(data, Platform.BYTE_ARRAY_OFFSET, valueRegionSize + headerSize);
return result;
public static UnsafeArrayData fromPrimitiveArray(int[] arr) {
return fromPrimitiveArray(arr, arr.length, 4);
}

@Override
public int hashCode() {
return Murmur3_x86_32.hashUnsafeBytes(baseObject, baseOffset, sizeInBytes, 42);
public static UnsafeArrayData fromPrimitiveArray(long[] arr) {
return fromPrimitiveArray(arr, arr.length, 8);
}

@Override
public boolean equals(Object other) {
if (other instanceof UnsafeArrayData) {
UnsafeArrayData o = (UnsafeArrayData) other;
return (sizeInBytes == o.sizeInBytes) &&
ByteArrayMethods.arrayEquals(baseObject, baseOffset, o.baseObject, o.baseOffset,
sizeInBytes);
}
return false;
public static UnsafeArrayData fromPrimitiveArray(float[] arr) {
return fromPrimitiveArray(arr, arr.length, 4);
}

public static UnsafeArrayData fromPrimitiveArray(double[] arr) {
return fromPrimitiveArray(arr, arr.length, 8);
}
}