|
1 | 1 | --- |
2 | 2 | title: Search Trees |
3 | | -weight: 4 |
| 3 | +weight: 3 |
4 | 4 | draft: true |
5 | 5 | --- |
6 | 6 |
|
| 7 | +In the [previous article](../s-tree), we designed *static* B-trees (*S-trees*), and we [briefly discussed](../s-tree/#as-a-dynamic-tree) how to turn them *dynamic* while retaining performance gains from [SIMD](/hpc/simd). |
| 8 | + |
| 9 | +In this article |
| 10 | + |
| 11 | +The problem is multi-dimensional. |
| 12 | + |
| 13 | +Of course, this comparison is not fair, as implementing a dynamic search tree is a higher-dimensional problem. |
| 14 | + |
| 15 | +We’d also need to implement the update operation, which will not be that efficient, and for which we’d need to sacrifice the fanout factor. But it still seems possible to implement a 10-20x faster std::set and a 3-5x faster absl::btree_set, depending on how you define “faster” — and this is one of the things we’ll attempt to do next. |
| 16 | + |
| 17 | +Static as |
| 18 | + |
| 19 | + |
| 20 | + |
| 21 | + |
| 22 | + |
| 23 | + |
| 24 | + |
| 25 | +When the data set is small, the latency increases in discrete steps: 3.5ns for under 32 elements, then 6.5ns, and then 12ns, until it hits the L2 cache (not shown on graphs) and starts increasing more smoothly, yet still with noticeable spikes when the tree grows upwards. |
| 26 | + |
| 27 | +One interesting use case is rope, also known as cord, which is used for wrapping strings in a tree to support mass operations. For example, editing a very large text file. Which is the topic. |
| 28 | + |
| 29 | +It is common that >90% of operations are lookups. Optimizing searches is important because every other operation starts with locating a key. |
| 30 | + |
| 31 | +I don't know (yet) why insertions are *that* slow. My guess is that it has something to do with data dependencies between queries. |
| 32 | + |
| 33 | +I apologize to everyone else, but this is sort of your fault for not using a public benchmark. |
| 34 | + |
| 35 | +## B− Tree |
| 36 | + |
| 37 | +[B+ tree](../s-tree/#b-tree-layout-1). |
| 38 | + |
| 39 | +B− ("B minus") tree. The difference is: |
| 40 | + |
| 41 | +- We are specifically storing the *last* element. This is needed |
| 42 | +- We use a small node size $B=32$. This is needed for SIMD to be efficient (we will discuss other node sizes in the future). |
| 43 | +- We don't store any pointers except for the children (while B+ stores one pointer for the next leaf node). |
| 44 | + |
| 45 | +The difference is that |
| 46 | + |
| 47 | +### Layout |
| 48 | + |
| 49 | +To simplify memory management, we preallocate one large array for all nodes and fill it all with infinities. |
| 50 | + |
| 51 | +```c++ |
| 52 | +const int B = 32; /* node size: number of int slots in a node's key array (and in its pointer array) */ |
| 53 | +int H = 1; // tree height; with H = 1 the root is searched directly as a data node |
| 54 | + |
| 55 | +const int R = 1e8; // reserve: number of int cells preallocated for all nodes |
| 56 | + |
| 57 | +alignas(64) int tree[R]; /* one flat array holding every node; node "pointers" are indices into it */ |
| 58 | +int n_tree = B; // index of the first unused cell (cells [0, B) hold the initial root); author's sizing note: 31 (+ 1) + 32 for internal nodes and 31 for data nodes |
| 59 | +int root = 0; /* index of the root node inside tree[] */ |
| 60 | + |
| 61 | +for (int i = 0; i < R; i++) /* INT_MAX plays the role of infinity and marks an empty slot */ |
| 62 | + tree[i] = INT_MAX; |
| 63 | +``` |
| 64 | + |
| 65 | +To "allocate" a new node, we simply increase `n_tree` by $B$ if it is a data node or by $2 \cdot B$ if it is an internal node. |
| 66 | + |
| 67 | +### Searching |
| 68 | + |
| 69 | +```c++ |
| 70 | +typedef __m256i reg; /* a 256-bit vector, used below as 8 packed 32-bit ints */ |
| 71 | + |
| 72 | +reg cmp(reg x, int *node) { /* compares x with the 8 ints at node: a 32-bit lane of the result is all-ones where x > node[lane] and zero elsewhere */ |
| 73 | + reg y = _mm256_load_si256((reg*) node); /* aligned load, so node must be 32-byte aligned */ |
| 74 | + return _mm256_cmpgt_epi32(x, y); /* signed greater-than, lane by lane */ |
| 75 | +} |
| 76 | + |
| 77 | +unsigned rank32(reg x, int *node) { /* returns how many of the 32 ints at node are strictly less than the value broadcast in x */ |
| 78 | + reg m1 = cmp(x, node); /* four 8-lane comparisons cover all 32 slots */ |
| 79 | + reg m2 = cmp(x, node + 8); |
| 80 | + reg m3 = cmp(x, node + 16); |
| 81 | + reg m4 = cmp(x, node + 24); |
| 82 | + |
| 83 | + m1 = _mm256_blend_epi16(m1, m2, 0b01010101); /* each 32-bit lane is all-ones or all-zeros, so one 16-bit half per lane carries the result: interleave the halves of two masks into one register */ |
| 84 | + m3 = _mm256_blend_epi16(m3, m4, 0b01010101); |
| 85 | + m1 = _mm256_packs_epi16(m1, m3); /* saturating pack to bytes: one byte (0x00 or 0xFF) per compared slot */ |
| 86 | + |
| 87 | + unsigned mask = _mm256_movemask_epi8(m1); /* one bit per slot; the bit order is shuffled, which is irrelevant for counting */ |
| 88 | + return __builtin_popcount(mask); /* number of set bits = number of slots less than x */ |
| 89 | +} |
| 90 | +``` |
| 91 | +
|
| 92 | +```c++ |
| 93 | +int lower_bound(int _x) { /* descends from the root to a data node and returns the first element there that is not less than _x; this is INT_MAX when that slot is empty */ |
| 94 | + //std::cerr << std::endl << "lb " << _x << std::endl; |
| 95 | + unsigned k = root; /* index of the current node in tree[] */ |
| 96 | + reg x = _mm256_set1_epi32(_x); /* broadcast the query to all 8 lanes */ |
| 97 | + |
| 98 | + for (int h = 0; h < H - 1; h++) { /* one step per level above the data nodes */ |
| 99 | + unsigned i = rank32(x, &tree[k]); /* number of keys in this node that are less than _x */ |
| 100 | + k = tree[k + B + i]; /* child indices are stored B cells after the keys */ |
| 101 | + } |
| 102 | +
|
| 103 | + unsigned i = rank32(x, &tree[k]); |
| 104 | +
|
| 105 | + return tree[k + i]; // TODO: what if next block? maybe we store 31 elements? |
| 106 | +} |
| 107 | +``` |
| 108 | + |
| 109 | +### Insertions |
| 110 | + |
| 111 | +```c++ |
| 112 | +struct Precalc { /* lookup table of per-lane store masks, filled in by a constexpr constructor */ |
| 113 | + alignas(64) int mask[B][B]; /* mask[i][j] is -1 (all bits set) when i <= j < B - 1 and 0 otherwise */ |
| 114 | + |
| 115 | + constexpr Precalc() : mask{} { /* start from all zeros, then enable the selected entries */ |
| 116 | + for (int i = 0; i < B; i++) |
| 117 | + for (int j = i; j < B - 1; j++) /* column B - 1 is never enabled */ |
| 118 | + mask[i][j] = -1; |
| 119 | + } |
| 120 | +}; |
| 121 | + |
| 122 | +constexpr Precalc P; /* the table instance; insert(int*, int, int) reads P.mask */ |
| 123 | +``` |
| 124 | + |
| 125 | +```c++ |
| 126 | +void insert(int *node, int i, int x) { /* writes x at position i of the B-element array at node after shifting elements i .. B - 2 one slot to the right; the old last element is overwritten */ |
| 127 | + for (int j = B - 8; j >= 0; j -= 8) { /* right to left: each group of 8 is loaded before the store of the group to its left overwrites its first slot */ |
| 128 | + reg t = _mm256_load_si256((reg*) &node[j]); /* the 8 elements at positions j .. j + 7 */ |
| 129 | + reg mask = _mm256_load_si256((reg*) &P.mask[i][j]); /* enables the lanes whose position p satisfies i <= p < B - 1 */ |
| 130 | + _mm256_maskstore_epi32(&node[j + 1], mask, t); /* store only the enabled lanes, one position to the right; the disabled last column keeps the store inside the node */ |
| 131 | + } |
| 132 | + node[i] = x; /* finally place the new element */ |
| 133 | +} |
| 134 | + |
| 135 | +// move the second half (B / 2 elements) of `from` to the start of `to` and fill the vacated half of `from` with infinities (INT_MAX) |
| 136 | +void move(int *from, int *to) { |
| 137 | + const reg infs = _mm256_set1_epi32(INT_MAX); |
| 138 | + for (int i = 0; i < B / 2; i += 8) { /* 8 ints per 256-bit load/store */ |
| 139 | + reg t = _mm256_load_si256((reg*) &from[B / 2 + i]); |
| 140 | + _mm256_store_si256((reg*) &to[i], t); |
| 141 | + _mm256_store_si256((reg*) &from[B / 2 + i], infs); // probably not necessary for pointers |
| 142 | + } |
| 143 | +} |
| 144 | +``` |
| 145 | +
|
| 146 | +```c++ |
| 147 | +void insert(int _x) { /* inserts key _x into the tree: finds the data node, inserts there, then splits nodes along the saved path and grows a new root when needed */ |
| 148 | + unsigned sk[20], si[20]; /* saved path: node index (sk) and child position (si) for each level above the data node; the array size limits the supported height */ |
| 149 | + |
| 150 | + unsigned k = root; |
| 151 | + reg x = _mm256_set1_epi32(_x); |
| 152 | +
|
| 153 | + for (int h = 0; h < H - 1; h++) { /* same descent as in lower_bound, but remembering the path */ |
| 154 | + unsigned i = rank32(x, &tree[k]); |
| 155 | + sk[h] = k, si[h] = i; |
| 156 | + k = tree[k + B + i]; |
| 157 | + } |
| 158 | +
|
| 159 | + unsigned i = rank32(x, &tree[k]); /* position of _x inside the data node */ |
| 160 | +
|
| 161 | + bool filled = (tree[k + B - 2] != INT_MAX); /* checked before inserting: slot B - 2 is already occupied, so the node has to be split after this insertion */ |
| 162 | + bool updated = (tree[k + i] == INT_MAX); /* the target slot is empty, i.e. no existing key in this node is >= _x */ |
| 163 | +
|
| 164 | + insert(tree + k, i, _x); |
| 165 | +
|
| 166 | + if (updated) { /* raise the keys recorded on the path so that none of them is smaller than _x */ |
| 167 | + for (int h = H - 2; h >= 0; h--) { |
| 168 | + int idx = sk[h] + si[h]; /* cell of the key that led to the child taken at level h */ |
| 169 | + tree[idx] = (tree[idx] < _x ? _x : tree[idx]); |
| 170 | + } |
| 171 | + } |
| 172 | +
|
| 173 | + if (filled) { |
| 174 | + // split the data node: move its upper half into a new leaf node placed at n_tree |
| 175 | + move(tree + k, tree + n_tree); |
| 176 | + |
| 177 | + int v = tree[k + B / 2 - 1]; // last key left in the lower half: new key to be inserted into the parent |
| 178 | + int p = n_tree; // pointer to the newly created node |
| 179 | + |
| 180 | + n_tree += B; /* a data node occupies B cells */ |
| 181 | +
|
| 182 | + for (int h = H - 2; h >= 0; h--) { /* walk back up the saved path */ |
| 183 | + k = sk[h], i = si[h]; |
| 184 | +
|
| 185 | + filled = (tree[k + B - 3] != INT_MAX); /* checked before inserting: with slot B - 3 occupied this internal node is split below */ |
| 186 | +
|
| 187 | + // the node already has a correct key (right one) and a correct pointer (left one) |
| 188 | + insert(tree + k, i, v); /* new key at position i */ |
| 189 | + insert(tree + k + B, i + 1, p); /* new child pointer right after the existing one */ |
| 190 | + |
| 191 | + if (!filled) /* no split needed at this level: done */ |
| 192 | + return; |
| 193 | +
|
| 194 | + // create a new internal node from the upper halves of the key and pointer arrays |
| 195 | + move(tree + k, tree + n_tree); // move keys |
| 196 | + move(tree + k + B, tree + n_tree + B); // move pointers |
| 197 | +
|
| 198 | + v = tree[k + B / 2 - 1]; /* last key of the lower half is passed up to the next level ... */ |
| 199 | + tree[k + B / 2 - 1] = INT_MAX; /* ... and its slot in this node is cleared */ |
| 200 | +
|
| 201 | + p = n_tree; |
| 202 | + n_tree += 2 * B; /* an internal node occupies 2 * B cells: keys, then pointers */ |
| 203 | + } |
| 204 | +
|
| 205 | + if (filled) { /* the loop did not return, so the root itself was split */ |
| 206 | + // tree grows: a new root is created with the old root and the new node as its two children |
| 207 | +
|
| 208 | + tree[n_tree] = v; /* the only key of the new root */ |
| 209 | +
|
| 210 | + tree[n_tree + B] = root; /* first child: the old root */ |
| 211 | + tree[n_tree + B + 1] = p; /* second child: the node split off from it */ |
| 212 | +
|
| 213 | + root = n_tree; |
| 214 | + n_tree += 2 * B; |
| 215 | + H++; |
| 216 | + } |
| 217 | + } |
| 218 | +} |
| 219 | +``` |
| 220 | + |
| 221 | +## Optimizations |
| 222 | + |
7 | 223 | ... |
| 224 | + |
| 225 | +## Acknowledgements |
| 226 | + |
| 227 | +Thanks to Danila Kutenin for meaningful discussions of applicability. |
0 commit comments