16 #ifndef TLX_SORT_STRINGS_SAMPLE_SORT_TOOLS_HEADER 17 #define TLX_SORT_STRINGS_SAMPLE_SORT_TOOLS_HEADER 33 namespace sort_strings_detail {
38 template <
typename Type>
40 std::string
to_binary(Type v,
const size_t width = (8 *
sizeof(Type))) {
41 std::string str(width,
' ');
42 for (
size_t i = 0; i < width; i++) {
43 str[width - i - 1] = (v & 1) ?
'1' :
'0';
50 template <
size_t TreeBits>
52 static const bool debug =
false;
63 int hi = treebits - 32 + clz<uint32_t>(id);
66 unsigned int bkt = ((
id << (hi + 1)) & bitmask) | (1 << hi);
78 int lo = ctz<uint32_t>(id) + 1;
81 unsigned int bkt = ((
id >> lo) & bitmask) | (1 << (treebits - lo));
121 template <
typename key_type,
size_t num_splitters>
124 static const bool debug_splitter =
false;
130 key_type tree[num_splitters + 1],
131 unsigned char splitter_lcp[num_splitters + 1],
132 const key_type* samples,
size_t samplesize)
133 : splitter_(splitter), tree_(tree),
134 lcp_iter_(splitter_lcp), samples_(samples) {
135 key_type sentinel = 0;
136 recurse(samples, samples + samplesize, 1, sentinel);
138 assert(splitter_ == splitter + num_splitters);
139 assert(lcp_iter_ == splitter_lcp + num_splitters);
141 splitter_lcp[0] &= 0x80;
143 splitter_lcp[num_splitters] = 0;
146 ptrdiff_t
snum(
const key_type* s)
const {
147 return static_cast<ptrdiff_t
>(s - samples_);
150 key_type
recurse(
const key_type* lo,
const key_type* hi,
151 unsigned int treeidx, key_type& rec_prevkey) {
153 <<
"rec_buildtree(" << snum(lo) <<
"," << snum(hi)
154 <<
", treeidx=" << treeidx <<
")";
157 const key_type* mid = lo +
static_cast<ptrdiff_t
>(hi - lo) / 2;
160 <<
"tree[" << treeidx <<
"] = samples[" << snum(mid) <<
"] = " 163 key_type mykey = tree_[treeidx] = *mid;
165 const key_type* midlo = mid;
166 while (lo < midlo && *(midlo - 1) == mykey) midlo--;
168 const key_type* midhi = mid;
169 while (midhi + 1 < hi && *midhi == mykey) midhi++;
171 if (midhi - midlo > 1)
172 TLX_LOG0 <<
"key range = [" << snum(midlo) <<
"," << snum(midhi) <<
")";
174 const key_type* midlo = mid, * midhi = mid + 1;
176 if (2 * treeidx < num_splitters)
178 key_type prevkey = recurse(lo, midlo, 2 * treeidx + 0, rec_prevkey);
180 key_type xorSplit = prevkey ^ mykey;
186 <<
" - " <<
clz(xorSplit)
187 <<
" bits = " <<
clz(xorSplit) / 8
190 *splitter_++ = mykey;
193 (
clz(xorSplit) / 8) |
195 ((mykey & 0xFF) ? 0 : 0x80);
197 return recurse(midhi, hi, 2 * treeidx + 1, mykey);
201 key_type xorSplit = rec_prevkey ^ mykey;
207 <<
" - " <<
clz(xorSplit)
208 <<
" bits = " <<
clz(xorSplit) / 8
211 *splitter_++ = mykey;
214 (
clz(xorSplit) / 8) |
216 ((mykey & 0xFF) ? 0 : 0x80);
231 template <
typename key_type,
size_t num_splitters>
234 static const bool debug_splitter =
false;
239 unsigned char splitter_lcp[num_splitters + 1],
240 const key_type* samples,
size_t samplesize)
242 lcp_iter_(splitter_lcp),
244 key_type sentinel = 0;
245 recurse(samples, samples + samplesize, 1, sentinel);
247 assert(lcp_iter_ == splitter_lcp + num_splitters);
249 splitter_lcp[0] &= 0x80;
251 splitter_lcp[num_splitters] = 0;
254 ptrdiff_t
snum(
const key_type* s)
const {
255 return static_cast<ptrdiff_t
>(s - samples_);
258 key_type
recurse(
const key_type* lo,
const key_type* hi,
259 unsigned int treeidx, key_type& rec_prevkey) {
261 <<
"rec_buildtree(" << snum(lo) <<
"," << snum(hi)
262 <<
", treeidx=" << treeidx <<
")";
265 const key_type* mid = lo +
static_cast<ptrdiff_t
>(hi - lo) / 2;
268 <<
"tree[" << treeidx <<
"] = samples[" << snum(mid) <<
"] = " 271 key_type mykey = tree_[treeidx] = *mid;
273 const key_type* midlo = mid;
274 while (lo < midlo && *(midlo - 1) == mykey) midlo--;
276 const key_type* midhi = mid;
277 while (midhi + 1 < hi && *midhi == mykey) midhi++;
279 if (midhi - midlo > 1)
280 TLX_LOG0 <<
"key range = [" << snum(midlo) <<
"," << snum(midhi) <<
")";
282 const key_type* midlo = mid, * midhi = mid + 1;
284 if (2 * treeidx < num_splitters)
286 key_type prevkey = recurse(lo, midlo, 2 * treeidx + 0, rec_prevkey);
288 key_type xorSplit = prevkey ^ mykey;
294 <<
" - " <<
clz(xorSplit)
295 <<
" bits = " <<
clz(xorSplit) / 8
299 (
clz(xorSplit) / 8) |
301 ((mykey & 0xFF) ? 0 : 0x80);
303 return recurse(midhi, hi, 2 * treeidx + 1, mykey);
307 key_type xorSplit = rec_prevkey ^ mykey;
313 <<
" - " <<
clz(xorSplit)
314 <<
" bits = " <<
clz(xorSplit) / 8
318 (
clz(xorSplit) / 8) |
320 ((mykey & 0xFF) ? 0 : 0x80);
335 template <
typename key_type,
size_t TreeBits,
size_t Rollout = 4>
340 static const size_t num_splitters = (1 <<
treebits) - 1;
343 void build(key_type* samples,
size_t samplesize,
344 unsigned char* splitter_lcp) {
346 splitter_, splitter_tree_, splitter_lcp,
347 samples, samplesize);
355 while (i <= num_splitters) {
357 i = 2 * i + (key <= splitter_tree_[i] ? 0 : 1);
359 i -= num_splitters + 1;
362 if (i < num_splitters && splitter_[i] == key) b += 1;
368 #define TLX_CLASSIFY_TREE_STEP \ 369 for (size_t u = 0; u < Rollout; ++u) { \ 370 i[u] = 2 * i[u] + (key[u] <= splitter_tree_[i[u]] ? 0 : 1); \ 372 TLX_ATTRIBUTE_FALLTHROUGH; 377 const key_type key[Rollout], uint16_t obkt[Rollout])
const {
380 unsigned int i[Rollout];
381 std::fill(i, i + Rollout, 1u);
421 for (
size_t u = 0; u < Rollout; ++u)
422 i[u] -= num_splitters + 1;
424 for (
size_t u = 0; u < Rollout; ++u) {
429 for (
size_t u = 0; u < Rollout; ++u) {
431 if (i[u] < num_splitters && splitter_[i[u]] == key[u])
436 #undef TLX_CLASSIFY_TREE_STEP 439 template <
typename StringSet>
442 const StringSet& strset,
443 typename StringSet::Iterator begin,
typename StringSet::Iterator end,
444 uint16_t* bktout,
size_t depth)
const {
447 if (begin + Rollout <= end)
449 key_type key[Rollout];
450 for (
size_t u = 0; u < Rollout; ++u)
451 key[u] = get_key<key_type>(strset, begin[u], depth);
453 find_bkt_unroll(key, bktout);
455 begin += Rollout, bktout += Rollout;
459 key_type key = get_key<key_type>(strset, *begin++, depth);
460 *bktout++ = this->find_bkt(key);
467 {
return splitter_[i]; }
470 key_type splitter_[num_splitters];
471 key_type splitter_tree_[num_splitters + 1];
477 template <
typename key_type,
size_t TreeBits>
482 static const size_t num_splitters = (1 <<
treebits) - 1;
485 void build(key_type* samples,
size_t samplesize,
unsigned char* splitter_lcp) {
487 splitter_tree_, splitter_lcp, samples, samplesize);
490 #define TLX_CLASSIFY_TREE_STEP \ 491 if (TLX_UNLIKELY(key == splitter_tree_[i])) { \ 493 2 * PerfectTreeCalculations<treebits>::level_to_preorder(i) - 1; \ 495 i = 2 * i + (key < splitter_tree_[i] ? 0 : 1); \ 496 TLX_ATTRIBUTE_FALLTHROUGH; 542 i -= num_splitters + 1;
546 #undef TLX_CLASSIFY_TREE_STEP 549 template <
typename StringSet>
551 const StringSet& strset,
552 typename StringSet::Iterator begin,
typename StringSet::Iterator end,
553 uint16_t* bktout,
size_t depth)
const {
556 key_type key = get_key<key_type>(strset, *begin++, depth);
557 *bktout++ = find_bkt(key);
563 return splitter_tree_[
568 key_type splitter_tree_[num_splitters + 1];
575 template <
typename key_type,
size_t TreeBits,
unsigned Rollout = 4>
580 static const size_t num_splitters = (1 <<
treebits) - 1;
583 void build(key_type* samples,
size_t samplesize,
584 unsigned char* splitter_lcp) {
586 splitter_tree_, splitter_lcp, samples, samplesize);
595 while (i <= num_splitters) {
597 i = 2 * i + (key <= splitter_tree_[i] ? 0 : 1);
600 i -= num_splitters + 1;
604 if (i < num_splitters && get_splitter(i) == key) {
613 #define TLX_CLASSIFY_TREE_STEP \ 614 for (size_t u = 0; u < Rollout; ++u) { \ 615 i[u] = 2 * i[u] + (key[u] <= splitter_tree_[i[u]] ? 0 : 1); \ 617 TLX_ATTRIBUTE_FALLTHROUGH; 622 const key_type key[Rollout], uint16_t obkt[Rollout])
const {
625 unsigned int i[Rollout];
626 std::fill(i + 0, i + Rollout, 1);
667 for (
unsigned u = 0; u < Rollout; ++u)
668 i[u] -= num_splitters + 1;
670 for (
unsigned u = 0; u < Rollout; ++u) {
675 for (
unsigned u = 0; u < Rollout; ++u) {
677 if (i[u] < num_splitters && get_splitter(i[u]) == key[u])
682 #undef TLX_CLASSIFY_TREE_STEP 685 template <
typename StringSet>
688 const StringSet& strset,
689 typename StringSet::Iterator begin,
typename StringSet::Iterator end,
690 uint16_t* bktout,
size_t depth)
const {
693 if (begin + Rollout <= end)
695 key_type key[Rollout];
696 for (
size_t u = 0; u < Rollout; ++u)
697 key[u] = get_key<key_type>(strset, begin[u], depth);
699 find_bkt_unroll(key, bktout);
701 begin += Rollout, bktout += Rollout;
705 key_type key = get_key<key_type>(strset, *begin++, depth);
706 *bktout++ = this->find_bkt(key);
713 return splitter_tree_[
718 key_type splitter_tree_[num_splitters + 1];
726 #endif // !TLX_SORT_STRINGS_SAMPLE_SORT_TOOLS_HEADER
void build(key_type *samples, size_t samplesize, unsigned char *splitter_lcp)
build tree and splitter array from sample
Sample Sort Classification Tree Unrolled with Equal Comparisons.
static void perfect_tree_calculations_self_verify()
key_type get_splitter(unsigned int i) const
return a splitter
static void self_verify()
unsigned int find_bkt(const key_type &key) const
binary search on splitter array for bucket number
void build(key_type *samples, size_t samplesize, unsigned char *splitter_lcp)
build tree and splitter array from sample
unsigned int find_bkt(const key_type &key) const
binary search on splitter array for bucket number
Sample Sort Classification Tree Unrolled, Interleaved, and with Perfect Tree Index Calculations...
const key_type * samples_
SSTreeBuilderLevelOrder(key_type tree[num_splitters], unsigned char splitter_lcp[num_splitters+1], const key_type *samples, size_t samplesize)
build tree, sizes: splitter_tree[num_splitters + 1] and
SSTreeBuilderPreAndLevelOrder(key_type splitter[num_splitters], key_type tree[num_splitters+1], unsigned char splitter_lcp[num_splitters+1], const key_type *samples, size_t samplesize)
void find_bkt_unroll(const key_type key[Rollout], uint16_t obkt[Rollout]) const
search in splitter tree for bucket number, unrolled for Rollout keys at once.
unsigned int find_bkt(const key_type &key) const
binary search on splitter array for bucket number
void classify(const StringSet &strset, typename StringSet::Iterator begin, typename StringSet::Iterator end, uint16_t *bktout, size_t depth) const
classify all strings in area by walking tree and saving bucket id
static const size_t num_nodes
#define tlx_die_unequal(X, Y)
Check that X == Y or die miserably, but output the values of X and Y for better debugging.
Recursive TreeBuilder for full-descent and unrolled variants, constructs only a level-order binary tr...
key_type get_splitter(unsigned int i) const
return a splitter
void classify(const StringSet &strset, typename StringSet::Iterator begin, typename StringSet::Iterator end, uint16_t *bktout, size_t depth) const
classify all strings in area by walking tree and saving bucket id
Sample Sort Classification Tree Unrolled and Interleaved.
unsigned char * lcp_iter_
void find_bkt_unroll(const key_type key[Rollout], uint16_t obkt[Rollout]) const
search in splitter tree for bucket number, unrolled for Rollout keys at once.
static std::string to_binary(Type v, const size_t width=(8 *sizeof(Type)))
represent binary digits of large integer datatypes
#define TLX_LOG0
Override default output: never or always output log.
static unsigned int pre_to_levelorder(unsigned int id)
ptrdiff_t snum(const key_type *s) const
#define TLX_LOGC(cond)
Explicitly specify the condition for logging.
static unsigned int level_to_preorder(unsigned int id)
Class to transform in-order to level-order indexes in a perfect binary tree.
key_type recurse(const key_type *lo, const key_type *hi, unsigned int treeidx, key_type &rec_prevkey)
std::string hexdump_type(const Type &t)
Dump a (binary) item as a sequence of uppercase hexadecimal pairs.
Recursive TreeBuilder for full-descent and unrolled variants, constructs a both a pre-order and level...
static const size_t treebits
ptrdiff_t snum(const key_type *s) const
key_type get_splitter(unsigned int i) const
return a splitter
unsigned char * lcp_iter_
void build(key_type *samples, size_t samplesize, unsigned char *splitter_lcp)
build tree and splitter array from sample
const key_type * samples_
#define TLX_LOG
Default logging method: output if the local debug variable is true.
void classify(const StringSet &strset, typename StringSet::Iterator begin, typename StringSet::Iterator end, uint16_t *bktout, size_t depth) const
classify all strings in area by walking tree and saving bucket id
key_type recurse(const key_type *lo, const key_type *hi, unsigned int treeidx, key_type &rec_prevkey)