-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-6638] [SQL] Improve performance of StringType in SQL #5350
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
685fd07
21f67c6
4699c3a
d32abd1
a85fb27
6b499ac
5f9e120
38c303e
c7dd4d2
bb52e44
8b45864
23a766c
9dc32d1
73e4363
956b0a4
9f4c194
537631c
28d6f32
28f3d81
e5fa5b8
8d17f21
fd11364
ac18ae6
2089d24
13d9d42
867bf50
1314a37
5116b43
08d897b
b04a19c
744788f
341ec2c
59025c8
6d776a9
2772f0d
3b7bfa8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,7 +30,7 @@ import java.util.Arrays | |
|
|
||
| final class UTF8String extends Ordered[UTF8String] with Serializable { | ||
|
|
||
| private var bytes: Array[Byte] = _ | ||
| private[this] var bytes: Array[Byte] = _ | ||
|
|
||
| /** | ||
| * Update the UTF8String with String. | ||
|
|
@@ -48,6 +48,12 @@ final class UTF8String extends Ordered[UTF8String] with Serializable { | |
| this | ||
| } | ||
|
|
||
| @inline | ||
| private[this] def numOfBytes(b: Byte): Int = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe add a comment here to explain it? |
||
| val offset = (b & 0xFF) - 192 | ||
| if (offset >= 0) UTF8String.tailBytesOfUTF8(offset) else 1 | ||
| } | ||
|
|
||
| /** | ||
| * Return the number of code points in it. | ||
| * | ||
|
|
@@ -57,11 +63,7 @@ final class UTF8String extends Ordered[UTF8String] with Serializable { | |
| var len = 0 | ||
| var i: Int = 0 | ||
| while (i < bytes.length) { | ||
| val b = bytes(i) & 0xFF | ||
| i += 1 | ||
| if (b >= 192) { | ||
| i += UTF8String.tailBytesOfUTF8(b - 192) | ||
| } | ||
| i += numOfBytes(bytes(i)) | ||
| len += 1 | ||
| } | ||
| len | ||
|
|
@@ -84,35 +86,47 @@ final class UTF8String extends Ordered[UTF8String] with Serializable { | |
| var c = 0 | ||
| var i: Int = 0 | ||
| while (c < start && i < bytes.length) { | ||
| val b = bytes(i) & 0xFF | ||
| i += 1 | ||
| if (b >= 192) { | ||
| i += UTF8String.tailBytesOfUTF8(b - 192) | ||
| } | ||
| i += numOfBytes(bytes(i)) | ||
| c += 1 | ||
| } | ||
| var j = i | ||
| while (c < until && j < bytes.length) { | ||
| val b = bytes(j) & 0xFF | ||
| j += 1 | ||
| if (b >= 192) { | ||
| j += UTF8String.tailBytesOfUTF8(b - 192) | ||
| } | ||
| j += numOfBytes(bytes(j)) | ||
| c += 1 | ||
| } | ||
| UTF8String(Arrays.copyOfRange(bytes, i, j)) | ||
| } | ||
|
|
||
| def contains(sub: UTF8String): Boolean = { | ||
| bytes.containsSlice(sub.bytes) | ||
| val b = sub.getBytes | ||
| if (b.length == 0) { | ||
| return true | ||
| } | ||
| var i: Int = 0 | ||
| while (i <= bytes.length - b.length) { | ||
| // In worst case, it's O(N*K), but should works fine with SQL | ||
| if (bytes(i) == b(0) && Arrays.equals(Arrays.copyOfRange(bytes, i, i + b.length), b)) { | ||
| return true | ||
| } | ||
| i += 1 | ||
| } | ||
| false | ||
| } | ||
|
|
||
| def startsWith(prefix: UTF8String): Boolean = { | ||
| bytes.startsWith(prefix.bytes) | ||
| val b = prefix.getBytes | ||
| if (b.length > bytes.length) { | ||
| return false | ||
| } | ||
| Arrays.equals(Arrays.copyOfRange(bytes, 0, b.length), b) | ||
| } | ||
|
|
||
| def endsWith(suffix: UTF8String): Boolean = { | ||
| bytes.endsWith(suffix.bytes) | ||
| val b = suffix.getBytes | ||
| if (b.length > bytes.length) { | ||
| return false | ||
| } | ||
| Arrays.equals(Arrays.copyOfRange(bytes, bytes.length - b.length, bytes.length), b) | ||
| } | ||
|
|
||
| def toUpperCase(): UTF8String = { | ||
|
|
@@ -133,12 +147,13 @@ final class UTF8String extends Ordered[UTF8String] with Serializable { | |
|
|
||
| override def compare(other: UTF8String): Int = { | ||
| var i: Int = 0 | ||
| while (i < bytes.length && i < other.bytes.length) { | ||
| val res = bytes(i).compareTo(other.bytes(i)) | ||
| val b = other.getBytes | ||
| while (i < bytes.length && i < b.length) { | ||
| val res = bytes(i).compareTo(b(i)) | ||
| if (res != 0) return res | ||
| i += 1 | ||
| } | ||
| bytes.length - other.bytes.length | ||
| bytes.length - b.length | ||
| } | ||
|
|
||
| override def compareTo(other: UTF8String): Int = { | ||
|
|
@@ -147,7 +162,7 @@ final class UTF8String extends Ordered[UTF8String] with Serializable { | |
|
|
||
| override def equals(other: Any): Boolean = other match { | ||
| case s: UTF8String => | ||
| Arrays.equals(bytes, s.bytes) | ||
| Arrays.equals(bytes, s.getBytes) | ||
| case s: String => | ||
| // fail fast | ||
| bytes.length >= s.length && length() == s.length && toString() == s | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When do we need this?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For tests.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So, we do not expect that
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I tried to remove this, but then we need to convert String into UTF8String manually in many cases, especially for a String inside a Map/Struct/Array. This could simplify the tests a lot, so I'd like to keep this. |
||
|
|
@@ -163,10 +178,12 @@ final class UTF8String extends Ordered[UTF8String] with Serializable { | |
| object UTF8String { | ||
| // number of tailing bytes in a UTF8 sequence for a code point | ||
| // see http://en.wikipedia.org/wiki/UTF-8, 192-256 of Byte 1 | ||
| private[types] val tailBytesOfUTF8: Array[Int] = Array(1, 1, 1, 1, 1, | ||
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | ||
| 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, | ||
| 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5) | ||
| private[types] val tailBytesOfUTF8: Array[Int] = Array(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | ||
| 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | ||
| 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, | ||
| 4, 4, 4, 4, 4, 4, 4, 4, | ||
| 5, 5, 5, 5, | ||
| 6, 6, 6, 6) | ||
|
|
||
| /** | ||
| * Create a UTF-8 String from String | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I had assumed that we would want to use `bytes: Array[Byte]` + `length: Int` so that the same byte array could be reused multiple times for different values. It seems that allocating and zeroing out the byte arrays could actually be pretty expensive.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, okay talked to @rxin and we are going to try and do this later?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right now, `UTF8String` will take bytes from `Binary.getBytes` or `String.getBytes`, with no copy, until we call `copy()` explicitly.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is the `Binary` that you are referring to here? Also, can you explain what you mean by `no copy` here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `Binary` is `parquet.io.api.Binary`. When we create a `UTF8String` from `Binary.getBytes`, we do not need to make another copy of the bytes. Before this patch, we would create a copy as a `String`.