16 #ifndef TLX_SORT_STRINGS_PARALLEL_SAMPLE_SORT_HEADER 17 #define TLX_SORT_STRINGS_PARALLEL_SAMPLE_SORT_HEADER 41 namespace sort_strings_detail {
93 template <
typename Parameters>
117 : para_ss_steps(0), sequ_ss_steps(0), base_sort_steps(0),
118 num_threads(_thread_num),
119 threads_(_thread_num)
123 template <
typename StringPtr>
130 return std::max(threshold, rest_size / num_threads);
133 return std::max(threshold, total_size / num_threads);
147 template <
typename KeyType>
148 static inline unsigned char 151 return clz(a ^ b) / 8;
154 template <
typename KeyType>
155 static inline unsigned char 158 return sizeof(KeyType) - (
ctz(a) / 8);
162 template <
typename KeyType>
163 static inline unsigned char 165 return static_cast<unsigned char>(a >> (8 * (
sizeof(KeyType) - 1 - d)));
178 virtual void substep_all_done() = 0;
184 assert(substep_working_ == 0);
195 assert(substep_working_ > 0);
196 if (--substep_working_ == 0)
204 template <
size_t bktnum,
typename Context,
typename Classify,
205 typename StringPtr,
typename BktSizeType>
207 const StringPtr& strptr,
size_t depth,
208 const BktSizeType* bkt) {
209 assert(!strptr.flipped());
212 typedef typename Context::key_type
key_type;
215 key_type prevkey = 0;
226 if (bkt[b] != bkt[b + 1]) {
227 prevkey = classifier.get_splitter(b / 2);
228 assert(prevkey == get_key_at<key_type>(strset, bkt[b + 1] - 1, depth));
234 if (bkt[b] != bkt[b + 1]) {
235 prevkey = get_key_at<key_type>(strset, bkt[b + 1] - 1, depth);
244 if (b < bktnum && b % 2 == 0)
goto even_bucket;
250 if (bkt[b] != bkt[b + 1]) {
251 key_type thiskey = classifier.get_splitter(b / 2);
252 assert(thiskey == get_key_at<key_type>(strset, bkt[b], depth));
255 strptr.
set_lcp(bkt[b], depth + rlcp);
259 <<
"LCP at odd-bucket " << b
260 <<
" [" << bkt[b] <<
"," << bkt[b + 1] <<
")" 261 <<
" is " << depth + rlcp;
264 assert(prevkey == get_key_at<key_type>(strset, bkt[b + 1] - 1, depth));
269 if (bkt[b] != bkt[b + 1]) {
270 key_type thiskey = get_key_at<key_type>(strset, bkt[b], depth);
273 strptr.
set_lcp(bkt[b], depth + rlcp);
277 <<
"LCP at even-bucket " << b
278 <<
" [" << bkt[b] <<
"," << bkt[b + 1] <<
")" 279 <<
" is " << depth + rlcp;
281 prevkey = get_key_at<key_type>(strset, bkt[b + 1] - 1, depth);
290 template <
typename Context,
typename StringPtr,
typename BktSizeType>
308 const StringPtr& strptr,
size_t depth)
309 : ctx_(ctx), pstep_(pstep), strptr_(strptr), depth_(depth) {
311 <<
"enqueue depth=" << depth_
312 <<
" size=" << strptr_.
size() <<
" flip=" << strptr_.flipped();
317 ctx_.mtimer.add(mtimer_);
321 size_t bktcache_size_ = 0;
324 mtimer_.
start(
"sequ_ss");
326 size_t n = strptr_.
size();
329 <<
"Process PS5SmallsortJob " <<
this <<
" of size " << n;
334 if (ctx_.enable_sequential_sample_sort && n >= ctx_.smallsort_threshold)
336 bktcache_.
resize(n *
sizeof(uint16_t));
337 sort_sample_sort(strptr_, depth_);
341 mtimer_.
start(
"mkqs");
342 sort_mkqs_cache(strptr_, depth_);
346 this->substep_notify_done();
360 using bktsize_type = BktSizeType;
364 static const size_t num_splitters = Context::Classify::num_splitters;
365 static const size_t bktnum = 2 * num_splitters + 1;
367 unsigned char splitter_lcp[num_splitters + 1];
368 bktsize_type bkt[bktnum + 1];
372 : strptr_(strptr), idx_(0), depth_(depth) {
373 size_t n = strptr_.
size();
377 const size_t oversample_factor = 2;
378 const size_t sample_size = oversample_factor * num_splitters;
382 const StringSet& strset = strptr_.
active();
384 std::minstd_rand rng(reinterpret_cast<uintptr_t>(samples.
data()));
386 for (
size_t i = 0; i < sample_size; ++i)
387 samples[i] = get_key_at<key_type>(strset, rng() % n, depth_);
391 classifier.build(samples.
data(), sample_size, splitter_lcp);
396 strset, strset.begin(), strset.end(), bktcache, depth_);
400 bktsize_type bktsize[bktnum];
401 memset(bktsize, 0, bktnum *
sizeof(bktsize_type));
403 for (
size_t si = 0; si < n; ++si)
404 ++bktsize[bktcache[si]];
409 for (
unsigned int i = 1; i < bktnum; ++i) {
410 bkt[i] = bkt[i - 1] + bktsize[i];
412 assert(bkt[bktnum - 1] == n);
417 const StringSet& strB = strptr_.
active();
419 const StringSet& sorted = strptr_.shadow();
420 typename StringSet::Iterator sbegin = sorted.begin();
422 for (
typename StringSet::Iterator str = strB.begin();
423 str != strB.end(); ++str, ++bktcache)
424 *(sbegin + --bkt[*bktcache]) = std::move(*str);
434 TLX_LOGC(ctx.debug_lcp) <<
"Calculate LCP after sample sort step";
436 ps5_sample_sort_lcp<bktnum>(ctx, classifier, strptr_, depth_, bkt);
441 size_t ss_front_ = 0;
447 assert(ss_front_ == 0);
448 assert(ss_stack_.size() == 0);
450 uint16_t* bktcache =
reinterpret_cast<uint16_t*
>(bktcache_.
data());
453 ss_stack_.emplace_back(ctx_, strptr, depth, bktcache);
457 while (ss_stack_.size() > ss_front_)
459 Step& s = ss_stack_.back();
462 if (i < Step::bktnum)
464 size_t bktsize = s.bkt[i + 1] - s.bkt[i];
466 StringPtr sp = s.strptr_.flip(s.bkt[i], bktsize);
474 else if (bktsize < ctx_.smallsort_threshold)
476 assert(i / 2 <= Step::num_splitters);
477 if (i == Step::bktnum - 1)
479 <<
"Recurse[" << s.depth_ <<
"]: > bkt " 480 << i <<
" size " << bktsize <<
" no lcp";
483 <<
"Recurse[" << s.depth_ <<
"]: < bkt " 484 << i <<
" size " << bktsize <<
" lcp " 485 << int(s.splitter_lcp[i / 2] & 0x7F);
489 sp, s.depth_ + (s.splitter_lcp[i / 2] & 0x7F));
493 if (i == Step::bktnum - 1)
495 <<
"Recurse[" << s.depth_ <<
"]: > bkt " 496 << i <<
" size " << bktsize <<
" no lcp";
499 <<
"Recurse[" << s.depth_ <<
"]: < bkt " 500 << i <<
" size " << bktsize <<
" lcp " 501 << int(s.splitter_lcp[i / 2] & 0x7F);
503 ss_stack_.emplace_back(
504 ctx_, sp, s.depth_ + (s.splitter_lcp[i / 2] & 0x7F), bktcache);
513 else if (s.splitter_lcp[i / 2] & 0x80) {
516 <<
"Recurse[" << s.depth_ <<
"]: = bkt " 517 << i <<
" size " << bktsize <<
" is done!";
518 StringPtr spb = sp.copy_back();
522 s.depth_ +
lcpKeyDepth(s.classifier.get_splitter(i / 2)));
524 ctx_.donesize(bktsize);
526 else if (bktsize < ctx_.smallsort_threshold)
529 <<
"Recurse[" << s.depth_ <<
"]: = bkt " 530 << i <<
" size " << bktsize <<
" lcp keydepth!";
533 sort_mkqs_cache(sp, s.depth_ +
sizeof(key_type));
538 <<
"Recurse[" << s.depth_ <<
"]: = bkt " 539 << i <<
" size " << bktsize <<
" lcp keydepth!";
541 ss_stack_.emplace_back(
542 ctx_, sp, s.depth_ +
sizeof(key_type), bktcache);
549 assert(ss_stack_.size() > ss_front_);
552 ss_stack_.back().calculate_lcp(ctx_);
554 ss_stack_.pop_back();
557 if (ctx_.enable_work_sharing && ctx_.threads_.has_idle()) {
558 sample_sort_free_work();
564 assert(ss_stack_.size() >= ss_front_);
566 if (ss_stack_.size() == ss_front_) {
568 return mkqs_free_work();
573 <<
"Freeing top level of PS5SmallsortJob's sample_sort stack";
576 Step& s = ss_stack_[ss_front_];
578 while (s.idx_ < Step::bktnum)
582 size_t bktsize = s.bkt[i + 1] - s.bkt[i];
584 StringPtr sp = s.strptr_.flip(s.bkt[i], bktsize);
594 if (i == Step::bktnum - 1)
596 <<
"Recurse[" << s.depth_ <<
"]: > bkt " 597 << i <<
" size " << bktsize <<
" no lcp";
600 <<
"Recurse[" << s.depth_ <<
"]: < bkt " 601 << i <<
" size " << bktsize <<
" lcp " 602 << int(s.splitter_lcp[i / 2] & 0x7F);
605 ctx_.enqueue(
this, sp,
606 s.depth_ + (s.splitter_lcp[i / 2] & 0x7F));
615 else if (s.splitter_lcp[i / 2] & 0x80) {
618 <<
"Recurse[" << s.depth_ <<
"]: = bkt " 619 << i <<
" size " << bktsize <<
" is done!";
620 StringPtr spb = sp.copy_back();
624 s.classifier.get_splitter(i / 2)));
626 ctx_.donesize(bktsize);
631 <<
"Recurse[" << s.depth_ <<
"]: = bkt " 632 << i <<
" size " << bktsize <<
" lcp keydepth!";
635 ctx_.enqueue(
this, sp, s.depth_ +
sizeof(key_type));
647 static inline int cmp(
const key_type& a,
const key_type& b) {
648 return (a > b) ? 1 : (a < b) ? -1 : 0;
651 template <
typename Type>
653 med3(Type* A,
size_t i,
size_t j,
size_t k) {
654 if (A[i] == A[j])
return i;
655 if (A[k] == A[i] || A[k] == A[j])
return k;
657 if (A[j] < A[k])
return j;
658 if (A[i] < A[k])
return k;
662 if (A[j] > A[k])
return j;
663 if (A[i] < A[k])
return i;
671 const StringSet& strings = strptr.
active();
672 size_t n = strptr.
size();
674 for (pi = 1; --n > 0; ++pi) {
675 typename StringSet::String tmps = std::move(strings.at(pi));
676 key_type tmpc = cache[pi];
677 for (pj = pi; pj > 0; --pj) {
678 if (cache[pj - 1] <= tmpc)
680 strings.at(pj) = std::move(strings.at(pj - 1));
681 cache[pj] = cache[pj - 1];
683 strings.at(pj) = std::move(tmps);
689 template <
bool CacheDirty>
692 StringPtr strptr = _strptr.copy_back();
694 if (strptr.
size() <= 1)
return;
698 insertion_sort_cache_block(strptr, cache);
700 size_t start = 0, bktsize = 1;
701 for (
size_t i = 0; i < strptr.
size() - 1; ++i) {
703 if (cache[i] == cache[i + 1]) {
708 if (start != 0 && strptr.
with_lcp) {
709 int rlcp =
lcpKeyType(cache[start - 1], cache[start]);
710 strptr.
set_lcp(start, depth + rlcp);
715 if (cache[start] & 0xFF) {
718 strptr.
sub(start, bktsize), depth +
sizeof(
key_type),
730 if (start != 0 && strptr.
with_lcp) {
731 int rlcp =
lcpKeyType(cache[start - 1], cache[start]);
732 strptr.
set_lcp(start, depth + rlcp);
736 if (cache[start] & 0xFF) {
739 strptr.
sub(start, bktsize), depth +
sizeof(
key_type),
761 key_type* cache,
size_t depth,
bool CacheDirty)
762 : strptr_(strptr), cache_(cache), depth_(depth), idx_(0) {
763 size_t n = strptr_.
size();
765 const StringSet& strset = strptr_.
active();
768 typename StringSet::Iterator it = strset.begin();
769 for (
size_t i = 0; i < n; ++i, ++it) {
770 cache_[i] = get_key<key_type>(strset, *it, depth);
776 med3(cache_, 0, n / 8, n / 4),
777 med3(cache_, n / 2 - n / 8, n / 2, n / 2 + n / 8),
778 med3(cache_, n - 1 - n / 4, n - 1 - n / 8, n - 3));
783 key_type pivot = cache_[0];
785 key_type max_lt = 0, min_gt = std::numeric_limits<key_type>::max();
789 size_t leq = 1, llt = 1, rgt = n - 1, req = n - 1;
794 int r = cmp(cache[llt], pivot);
796 min_gt =
std::min(min_gt, cache[llt]);
800 std::swap(strset.at(leq), strset.at(llt));
805 max_lt = std::max(max_lt, cache[llt]);
811 int r = cmp(cache[rgt], pivot);
813 max_lt = std::max(max_lt, cache[rgt]);
817 std::swap(strset.at(req), strset.at(rgt));
822 min_gt =
std::min(min_gt, cache[rgt]);
828 std::swap(strset.at(llt), strset.at(rgt));
834 size_t num_leq = leq, num_req = n - 1 - req;
835 num_eq_ = num_leq + num_req;
839 assert(num_lt_ + num_eq_ + num_gt_ == n);
842 const size_t size1 =
std::min(num_leq, num_lt_);
843 std::swap_ranges(strset.begin(), strset.begin() + size1,
844 strset.begin() + llt - size1);
845 std::swap_ranges(cache, cache + size1, cache + llt - size1);
848 const size_t size2 =
std::min(num_req, num_gt_);
849 std::swap_ranges(strset.begin() + llt, strset.begin() + llt + size2,
850 strset.begin() + n - size2);
851 std::swap_ranges(cache + llt, cache + llt + size2,
855 eq_recurse_ = (pivot & 0xFF);
858 if (strptr_.
with_lcp && num_lt_ > 0) {
859 assert(max_lt == *std::max_element(
860 cache_ + 0, cache + num_lt_));
864 TLX_LOGC(ctx.debug_lcp) <<
"LCP lt with pivot: " << depth_ + lcp_lt_;
870 if (strptr_.
with_lcp && num_gt_ > 0) {
871 assert(min_gt == *std::min_element(
872 cache_ + num_lt_ + num_eq_, cache_ + n));
876 TLX_LOGC(ctx.debug_lcp) <<
"LCP pivot with gt: " << depth_ + lcp_gt_;
879 ++ctx.base_sort_steps;
883 if (strptr_.
with_lcp && num_lt_ > 0) {
884 strptr_.
set_lcp(num_lt_, depth_ + lcp_lt_);
888 if (strptr_.
with_lcp && num_gt_ > 0) {
889 strptr_.
set_lcp(num_lt_ + num_eq_, depth_ + lcp_gt_);
895 size_t ms_front_ = 0;
899 assert(strcmp(mtimer_.
running(),
"mkqs") == 0);
901 if (!ctx_.enable_sequential_mkqs ||
902 strptr.
size() < ctx_.inssort_threshold) {
904 <<
"insertion_sort() size " 905 << strptr.
size() <<
" depth " << depth;
909 ctx_.donesize(strptr.
size());
914 <<
"sort_mkqs_cache() size " << strptr.
size() <<
" depth " << depth;
922 key_type* cache =
reinterpret_cast<key_type*
>(bktcache_.
data());
924 assert(ms_front_ == 0);
925 assert(ms_stack_.size() == 0);
929 ms_stack_.emplace_back(ctx_, strptr, cache, depth,
true);
931 while (ms_stack_.size() > ms_front_)
941 else if (ms.
num_lt_ < ctx_.inssort_threshold) {
948 ms_stack_.emplace_back(
955 else if (ms.
idx_ == 2) {
961 StringPtr spb = sp.copy_back();
963 ctx_.donesize(spb.
size());
965 else if (ms.
num_eq_ < ctx_.inssort_threshold) {
972 ms_stack_.emplace_back(
975 ms.
depth_ +
sizeof(key_type),
true);
979 else if (ms.
idx_ == 3) {
986 else if (ms.
num_gt_ < ctx_.inssort_threshold) {
988 insertion_sort_cache<false>(
993 ms_stack_.emplace_back(
1002 assert(ms_stack_.size() > ms_front_);
1005 ms_stack_.back().calculate_lcp();
1007 ms_stack_.pop_back();
1010 if (ctx_.enable_work_sharing && ctx_.threads_.has_idle()) {
1011 sample_sort_free_work();
1017 assert(ms_stack_.size() >= ms_front_);
1019 for (
unsigned int fl = 0; fl < 8; ++fl)
1021 if (ms_stack_.size() == ms_front_) {
1026 <<
"Freeing top level of PS5SmallsortJob's mkqs stack - size " 1027 << ms_stack_.size();
1031 MKQSStep& ms = ms_stack_[ms_front_];
1035 this->substep_add();
1045 this->substep_add();
1046 ctx_.enqueue(
this, sp, ms.
depth_ +
sizeof(key_type));
1049 StringPtr spb = sp.copy_back();
1056 this->substep_add();
1072 <<
"SmallSort[" << depth_ <<
"] " 1073 <<
"all substeps done -> LCP calculation";
1075 while (ms_front_ > 0) {
1077 <<
"SmallSort[" << depth_ <<
"] ms_front_: " << ms_front_;
1078 ms_stack_[--ms_front_].calculate_lcp();
1081 while (ss_front_ > 0) {
1083 <<
"SmallSort[" << depth_ <<
"] ss_front_: " << ss_front_;
1084 ss_stack_[--ss_front_].calculate_lcp(ctx_);
1095 template <
typename Context,
typename StringPtr>
1122 static const size_t treebits_ = Context::Classify::treebits;
1123 static const size_t num_splitters_ = Context::Classify::num_splitters;
1124 static const size_t bktnum_ = 2 * num_splitters_ + 1;
1127 unsigned char splitter_lcp_[num_splitters_ + 1];
1138 const StringPtr& strptr,
size_t depth)
1139 : ctx_(ctx), pstep_(pstep), strptr_(strptr), depth_(depth) {
1141 parts_ = strptr_.
size() / ctx.sequential_threshold() * 2;
1142 if (parts_ == 0) parts_ = 1;
1145 bktcache_.
resize(parts_);
1147 psize_ = (strptr.
size() + parts_ - 1) / parts_;
1150 <<
"enqueue depth=" << depth_
1151 <<
" size=" << strptr_.
size()
1152 <<
" parts=" << parts_
1153 <<
" psize=" << psize_
1154 <<
" flip=" << strptr_.flipped();
1156 ctx.threads_.enqueue([
this]() { sample(); });
1157 ++ctx.para_ss_steps;
1167 TLX_LOGC(ctx_.debug_jobs) <<
"Process SampleJob @ " <<
this;
1169 const size_t oversample_factor = 2;
1170 size_t sample_size = oversample_factor * num_splitters_;
1172 const StringSet& strset = strptr_.
active();
1173 size_t n = strset.size();
1177 std::minstd_rand rng(reinterpret_cast<uintptr_t>(samples.
data()));
1179 for (
size_t i = 0; i < sample_size; ++i)
1180 samples[i] = get_key_at<key_type>(strset, rng() % n, depth_);
1184 classifier_.build(samples.
data(), sample_size, splitter_lcp_);
1188 for (
unsigned int p = 0; p < parts_; ++p) {
1189 ctx_.threads_.enqueue([
this, p]() { count(p); });
1198 TLX_LOGC(ctx_.debug_jobs) <<
"Process CountJob " << p <<
" @ " <<
this;
1200 const StringSet& strset = strptr_.
active();
1202 StrIterator strB = strset.begin() + p * psize_;
1203 StrIterator strE = strset.begin() +
std::min((p + 1) * psize_, strptr_.
size());
1204 if (strE < strB) strE = strB;
1206 bktcache_[p].
resize(strE - strB);
1207 uint16_t* bktcache = bktcache_[p].
data();
1208 classifier_.classify(strset, strB, strE, bktcache, depth_);
1210 bkt_[p].
resize(bktnum_ + (p == 0 ? 1 : 0));
1211 size_t* bkt = bkt_[p].
data();
1212 memset(bkt, 0, bktnum_ *
sizeof(
size_t));
1214 for (uint16_t* bc = bktcache; bc != bktcache + (strE - strB); ++bc)
1223 TLX_LOGC(ctx_.debug_jobs) <<
"Finishing CountJob " <<
this <<
" with prefixsum";
1226 if (ctx_.use_only_first_sortstep)
1231 for (
unsigned int i = 0; i < bktnum_; ++i) {
1232 for (
unsigned int p = 0; p < parts_; ++p) {
1233 bkt_[p][i] = (sum += bkt_[p][i]);
1236 assert(sum == strptr_.
size());
1240 for (
unsigned int p = 0; p < parts_; ++p) {
1241 ctx_.threads_.enqueue([
this, p]() { distribute(p); });
1250 TLX_LOGC(ctx_.debug_jobs) <<
"Process DistributeJob " << p <<
" @ " <<
this;
1252 const StringSet& strset = strptr_.
active();
1254 StrIterator strB = strset.begin() + p * psize_;
1255 StrIterator strE = strset.begin() +
std::min((p + 1) * psize_, strptr_.
size());
1256 if (strE < strB) strE = strB;
1259 const StringSet& sorted = strptr_.shadow();
1260 typename StringSet::Iterator sbegin = sorted.begin();
1262 uint16_t* bktcache = bktcache_[p].
data();
1263 size_t* bkt = bkt_[p].
data();
1265 for (StrIterator str = strB; str != strE; ++str, ++bktcache)
1266 *(sbegin + --bkt[*bktcache]) = std::move(*str);
1274 distribute_finished();
1279 <<
"Finishing DistributeJob " <<
this <<
" with enqueuing subjobs";
1281 size_t* bkt = bkt_[0].
data();
1285 assert(bkt[0] == 0);
1286 bkt[bktnum_] = strptr_.
size();
1289 this->substep_add();
1292 while (i < bktnum_ - 1)
1295 size_t bktsize = bkt[i + 1] - bkt[i];
1299 else if (bktsize == 1) {
1300 strptr_.flip(bkt[i], 1).copy_back();
1306 <<
"Recurse[" << depth_ <<
"]: < bkt " << bkt[i]
1307 <<
" size " << bktsize <<
" lcp " 1308 << int(splitter_lcp_[i / 2] & 0x7F);
1309 this->substep_add();
1310 ctx_.enqueue(
this, strptr_.flip(bkt[i], bktsize),
1311 depth_ + (splitter_lcp_[i / 2] & 0x7F));
1315 bktsize = bkt[i + 1] - bkt[i];
1319 else if (bktsize == 1) {
1320 strptr_.flip(bkt[i], 1).copy_back();
1325 if (splitter_lcp_[i / 2] & 0x80) {
1328 <<
"Recurse[" << depth_ <<
"]: = bkt " << bkt[i]
1329 <<
" size " << bktsize <<
" is done!";
1330 StringPtr sp = strptr_.flip(bkt[i], bktsize).copy_back();
1332 depth_ +
lcpKeyDepth(classifier_.get_splitter(i / 2)));
1333 ctx_.donesize(bktsize);
1337 <<
"Recurse[" << depth_ <<
"]: = bkt " << bkt[i]
1338 <<
" size " << bktsize <<
" lcp keydepth!";
1339 this->substep_add();
1340 ctx_.enqueue(
this, strptr_.flip(bkt[i], bktsize),
1347 size_t bktsize = bkt[i + 1] - bkt[i];
1352 else if (bktsize == 1) {
1353 strptr_.flip(bkt[i], 1).copy_back();
1358 <<
"Recurse[" << depth_ <<
"]: > bkt " << bkt[i]
1359 <<
" size " << bktsize <<
" no lcp";
1360 this->substep_add();
1361 ctx_.enqueue(
this, strptr_.flip(bkt[i], bktsize), depth_);
1364 this->substep_notify_done();
1377 <<
"pSampleSortStep[" << depth_ <<
"]: all substeps done.";
1379 ps5_sample_sort_lcp<bktnum_>(
1380 ctx_, classifier_, strptr_, depth_, bkt_[0].
data());
1392 template <
typename Parameters>
1393 template <
typename StringPtr>
1395 PS5SortStep* pstep,
const StringPtr& strptr,
size_t depth) {
1397 (strptr.
size() > sequential_threshold() ||
1402 if (strptr.
size() < (1LLU << 32)) {
1404 *
this, pstep, strptr, depth);
1405 threads_.enqueue([j]() { j->run(); });
1409 *
this, pstep, strptr, depth);
1410 threads_.enqueue([j]() { j->run(); });
1419 template <
typename PS5Parameters,
typename StringPtr>
1423 Context ctx(std::thread::hardware_concurrency());
1424 ctx.total_size = strptr.
size();
1425 ctx.rest_size = strptr.
size();
1426 ctx.num_threads = ctx.threads_.size();
1429 timer.
start(
"sort");
1431 ctx.enqueue(
nullptr, strptr, depth);
1432 ctx.threads_.loop_until_empty();
1436 assert(!ctx.enable_rest_size || ctx.rest_size == 0);
1442 <<
" sizeof(key_type)=" <<
sizeof(
typename PS5Parameters::key_type)
1443 <<
" splitter_treebits=" <<
size_t(BigSortStep::treebits_)
1444 <<
" num_splitters=" << size_t(BigSortStep::num_splitters_)
1445 <<
" num_threads=" << ctx.num_threads
1446 <<
" enable_work_sharing=" << size_t(ctx.enable_work_sharing)
1447 <<
" use_restsize=" << size_t(ctx.enable_rest_size)
1448 <<
" tm_para_ss=" << ctx.mtimer.get(
"para_ss")
1449 <<
" tm_seq_ss=" << ctx.mtimer.get(
"sequ_ss")
1450 <<
" tm_mkqs=" << ctx.mtimer.get(
"mkqs")
1451 <<
" tm_inssort=" << ctx.mtimer.get(
"inssort")
1452 <<
" tm_total=" << ctx.mtimer.total()
1454 << (ctx.num_threads * timer.
total()) - ctx.mtimer.total()
1455 <<
" steps_para_sample_sort=" << ctx.para_ss_steps
1456 <<
" steps_seq_sample_sort=" << ctx.sequ_ss_steps
1457 <<
" steps_base_sort=" << ctx.base_sort_steps;
1462 template <
typename PS5Parameters,
typename StringPtr>
1465 const StringPtr& strptr,
size_t depth,
size_t memory = 0) {
1469 const StringSet& strset = strptr.
active();
1472 typedef typename StringSet::Container Container;
1475 Container shadow = strset.allocate(strset.size());
1476 StringShadowPtr new_strptr(strset, StringSet(shadow));
1478 parallel_sample_sort_base<PS5Parameters>(new_strptr, depth);
1480 StringSet::deallocate(shadow);
1485 template <
typename PS5Parameters,
typename StringPtr>
1488 const StringPtr& strptr,
size_t depth,
size_t memory = 0) {
1492 typedef typename StringPtr::LcpType LcpType;
1493 const StringSet& strset = strptr.
active();
1496 typedef typename StringSet::Container Container;
1499 Container shadow = strset.allocate(strset.size());
1500 StringShadowLcpPtr new_strptr(strset, StringSet(shadow), strptr.lcp());
1502 parallel_sample_sort_base<PS5Parameters>(new_strptr, depth);
1504 StringSet::deallocate(shadow);
1509 template <
typename StringPtr>
1511 const StringPtr& strptr,
size_t depth,
size_t memory) {
1512 return parallel_sample_sort_params<PS5ParametersDefault>(
1513 strptr, depth, memory);
1519 #endif // !TLX_SORT_STRINGS_PARALLEL_SAMPLE_SORT_HEADER iterator end() noexcept
return mutable iterator beyond last element
static const bool debug_bucket_size
PS5BigSortStep(Context &ctx, PS5SortStep *pstep, const StringPtr &strptr, size_t depth)
std::atomic< size_t > substep_working_
Number of substeps still running.
Objectified string array pointer and shadow pointer array for out-of-place swapping of pointers...
size_t sequential_threshold()
return sequential sorting threshold
Independent RAII Scoped MultiTimer: contains a MultiTimer which is started with the given timer...
PS5SmallsortJob(Context &ctx, PS5SortStep *pstep, const StringPtr &strptr, size_t depth)
StringPtr sub(size_t offset, size_t sub_size) const
Advance (both) pointers by given offset, return sub-array.
void distribute_finished()
size_t psize_
size of all parts except the last
void resize(size_type new_size)
resize the array to contain exactly new_size items
size_t size() const
return valid length
void parallel_sample_sort(const StringPtr &strptr, size_t depth, size_t memory)
Parallel Sample Sort Function with default parameter size for a generic StringSet.
MultiTimer mtimer
timers for individual sorting steps
void enqueue(PS5SortStep *sstep, const StringPtr &strptr, size_t depth)
enqueue a new job in the thread pool
static const bool enable_parallel_sample_sort
enable/disable various sorting levels
const StringSet & active() const
return currently active array
void sample_sort_free_work()
static const bool enable_sequential_sample_sort
static const unsigned TreeBits
depth of classification tree used in sample sorts
Sample Sort Classification Tree Unrolled, Interleaved, and with Perfect Tree Index Calculations...
PS5SortStep * pstep_
parent sort step for notification
void sort_mkqs_cache(const StringPtr &strptr, size_t depth)
static unsigned char getCharAtDepth(const KeyType &a, unsigned char d)
return the d-th character in the (swapped) key
Parallel Super Scalar String Sample Sort Context.
Simpler non-growing vector without initialization.
Parallel Super Scalar String Sample Sort Parameter Struct.
static size_t med3(Type *A, size_t i, size_t j, size_t k)
void ps5_sample_sort_lcp(const Context &ctx, const Classify &classifier, const StringPtr &strptr, size_t depth, const BktSizeType *bkt)
LCP Calculation for Finished Sample Sort Steps.
Context::key_type key_type
simple_vector< uint8_t > bktcache_
void substep_all_done() final
Pure virtual function called by substep when all substeps are done.
void count(unsigned int p)
MultiTimer can be used to measure time usage of different phases in a program or algorithm.
void substep_add()
Register new substep.
void calculate_lcp(Context &ctx)
static const size_t smallsort_threshold
threshold to run sequential small sorts
static const bool enable_work_sharing
enable work freeing
static const bool debug_lcp
iterator data() noexcept
return iterator to beginning of vector
PS5BigSortStep Out-of-Place Parallel Sample Sort with Separate Jobs.
Context::Classify classifier_
classifier instance and variables (contains splitter tree
void fill_lcp(const LcpType &) const
fill entire LCP array with v, excluding the first lcp[0] position!
static const bool debug_result
const char * running() const
return name of currently running timer.
SSClassifyTreeCalcUnrollInterleave< key_type, TreeBits > Classify
classification tree variant for sample sorts
static void insertion_sort_cache_block(const StringPtr &strptr, key_type *cache)
Insertion sort the strings only based on the cached characters.
static const bool enable_sequential_mkqs
void set_lcp(size_t, const LcpType &) const
set the i-th lcp to v and check its value
PS5SortStep * pstep_
parent sort step
ThreadPool threads_
thread pool
static const bool use_only_first_sortstep
terminate sort after first parallel sample sort step
static uint32_t min(uint32_t x, uint32_t y)
void substep_notify_done()
Notify superstep that the currently substep is done.
std::atomic< size_t > rest_size
number of remaining strings to sort
static enable_if<!StringPtr::with_lcp, void >::type insertion_sort(const StringPtr &strptr, size_t depth, size_t)
Generic insertion sort for abstract string sets.
iterator begin() noexcept
return mutable iterator to first element
static const bool with_lcp
if we want to save the LCPs
PS5Context(size_t _thread_num)
context constructor
void swap(CountingPtr< A, D > &a1, CountingPtr< A, D > &a2) noexcept
swap enclosed object with another counting pointer (no reference counts need change) ...
void substep_all_done() final
Pure virtual function called by substep when all substeps are done.
static int cmp(const key_type &a, const key_type &b)
Stack of Recursive MKQS Steps.
Context::key_type key_type
static void sort(Iterator begin, Iterator end, Comparator cmp=Comparator())
Call best known sorting network for up to sixteen elements with given comparison method.
static const bool debug_steps
void stop()
stop the currently running timer.
StringPtr::StringSet StringSet
unsigned char eq_recurse_
size_t key_type
key type for sample sort: 32-bit or 64-bit
static unsigned char lcpKeyDepth(const KeyType &a)
static const bool enable_rest_size
whether the base sequential_threshold() on the remaining unsorted string set or on the whole string s...
size_type size() const noexcept
return number of items in vector
StringPtr strptr_
string pointers, size, and current sorting depth
SFINAE enable_if – copy of std::enable_if<> with less extra cruft.
RAII Scoped MultiTimer switcher: switches the timer of a MultiTimer on construction and back to old o...
std::vector< MKQSStep > ms_stack_
std::atomic< size_t > pwork_
number of threads still working
static const size_t inssort_threshold
threshold to switch to insertion sort
simple_vector< simple_vector< size_t > > bkt_
individual bucket array of threads, keep bkt[0] for DistributeJob
#define TLX_LOGC(cond)
Explicitly specify the condition for logging.
size_t num_threads
number of threads overall
simple_vector< simple_vector< uint16_t > > bktcache_
bucket ids cache, created by classifier and later counted
void start(const char *timer)
start new timer phase, stop the currently running one.
SampleSort: Non-Recursive In-Place Sequential Sample Sort for Small Sorts.
std::vector< SeqSampleSortStep > ss_stack_
MKQSStep(Context &ctx, const StringPtr &strptr, key_type *cache, size_t depth, bool CacheDirty)
void distribute(unsigned int p)
size_t parts_
number of parts into which the strings were split
void sort_sample_sort(const StringPtr &strptr, size_t depth)
void destroy()
deallocate contained array
Objectified string array pointer and shadow pointer array for out-of-place swapping of pointers...
static unsigned char lcpKeyType(const KeyType &a, const KeyType &b)
LCP calculation of Splitter Strings.
StringPtr::StringSet StringSet
enable_if<!StringPtr::with_lcp, void >::type parallel_sample_sort_params(const StringPtr &strptr, size_t depth, size_t memory=0)
Parallel Sample Sort Function for a generic StringSet, this allocates the shadow array for flipping...
ThreadPool starts a fixed number p of std::threads which process Jobs that are enqueued into a concur...
SeqSampleSortStep(Context &ctx, const StringPtr &strptr, size_t depth, uint16_t *bktcache)
StringSet::Iterator StrIterator
void donesize(size_t n)
decrement number of unordered strings
static const bool debug_jobs
std::atomic< size_t > sequ_ss_steps
PS5SortStep Top-Level Class to Keep Track of Substeps.
Stack of Recursive Sample Sort Steps.
Objectified string array pointer array.
virtual ~PS5BigSortStep()
static void insertion_sort_cache(const StringPtr &_strptr, key_type *cache, size_t depth)
Insertion sort, but use cached characters if possible.
Context::Classify classifier
static const bool debug_recursion
double total() const
return total duration of all timers.
void parallel_sample_sort_base(const StringPtr &strptr, size_t depth)
Main Parallel Sample Sort Function. See below for more convenient wrappers.
size_t total_size
total size of input