tlx
multikey_quicksort.hpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * tlx/sort/strings/multikey_quicksort.hpp
3  *
4  * Generic multikey quicksort for strings. This is an internal implementation
5  * header, see tlx/sort/strings.hpp for public front-end functions.
6  *
7  * Based on multikey quicksort, a quick sort algorithm for arrays of character
8  * strings by Bentley and Sedgewick.
9  *
10  * J. Bentley and R. Sedgewick. "Fast Algorithms for Sorting and Searching
11  * Strings." In Proceedings of 8th Annual ACM-SIAM Symposium on Discrete
12  * Algorithms, 1997.
13  *
14  * http://www.cs.princeton.edu/~rs/strings/index.html
15  *
16  * Part of tlx - http://panthema.net/tlx
17  *
18  * Copyright (C) 2015-2018 Timo Bingmann <tb@panthema.net>
19  *
20  * All rights reserved. Published under the Boost Software License, Version 1.0
21  ******************************************************************************/
22 
23 #ifndef TLX_SORT_STRINGS_MULTIKEY_QUICKSORT_HEADER
24 #define TLX_SORT_STRINGS_MULTIKEY_QUICKSORT_HEADER
25 
27 
28 #include <algorithm>
29 #include <cstddef>
30 #include <utility>
31 
32 namespace tlx {
33 
34 //! \addtogroup tlx_sort
35 //! \{
36 
37 namespace sort_strings_detail {
38 
39 template <typename StringSet>
40 static inline void vec_swap(
41  typename StringSet::Iterator a, typename StringSet::Iterator b, size_t n) {
42  while (n-- > 0)
43  std::swap(*a++, *b++);
44 }
45 
46 template <typename StringSet>
47 static inline typename StringSet::Iterator med3func(
48  const StringSet& ss,
49  typename StringSet::Iterator a, typename StringSet::Iterator b,
50  typename StringSet::Iterator c, size_t depth) {
51  typename StringSet::Char va = ss.get_char(*a, depth);
52  typename StringSet::Char vb = ss.get_char(*b, depth);
53  if (va == vb)
54  return a;
55  typename StringSet::Char vc = ss.get_char(*c, depth);
56  if (vc == va || vc == vb)
57  return c;
58  return va < vb
59  ? (vb < vc ? b : (va < vc ? c : a))
60  : (vb > vc ? b : (va < vc ? a : c));
61 }
62 
63 /*!
64  * Generic multikey quicksort for strings. Based on multikey quicksort, a quick
65  * sort algorithm for arrays of character strings by Bentley and Sedgewick. This
66  * method requires up to O(maxlcp) memory due to the recursion stack and it runs
67  * in expected time O(D + n log n) and worst-case time O(D + n^2).
68  *
69  * J. Bentley and R. Sedgewick. Fast algorithms for sorting and searching
70  * strings. In Proceedings of 8th Annual ACM-SIAM Symposium on Discrete
71  * Algorithms, 1997.
72  */
73 template <typename StringPtr>
74 static inline void multikey_quicksort(
75  const StringPtr& strptr, size_t depth, size_t memory) {
76 
77  typedef typename StringPtr::StringSet StringSet;
78  typedef typename StringSet::Iterator Iterator;
79 
80  const StringSet& ss = strptr.active();
81  const Iterator a = ss.begin();
82  size_t n = ss.size();
83 
84  // try to estimate the amount of memory in a stack frame
85  static const size_t memory_use =
86  2 * sizeof(size_t) + sizeof(StringSet) + 5 * sizeof(Iterator);
87 
88  if (n < 32 || (memory != 0 && memory < memory_use + 1)) {
89  return insertion_sort(strptr, depth, memory);
90  }
91 
92  ptrdiff_t r;
93  Iterator pa, pb, pc, pd, pn;
94 
95  {
96  Iterator pl = a;
97  Iterator pm = a + (n / 2);
98  pn = a + (n - 1);
99  if (n > 30) {
100  // on big arrays: pseudomedian of 9
101  size_t d = (n / 8);
102  pl = med3func(ss, pl, pl + d, pl + 2 * d, depth);
103  pm = med3func(ss, pm - d, pm, pm + d, depth);
104  pn = med3func(ss, pn - 2 * d, pn - d, pn, depth);
105  }
106  pm = med3func(ss, pl, pm, pn, depth);
107  std::swap(*a, *pm);
108  int pivot = ss.get_char(*a, depth);
109  pa = pb = a + 1;
110  pc = pd = a + n - 1;
111  for ( ; ; ) {
112  while (pb <= pc && (r = static_cast<int>(ss.get_char(*pb, depth)) - pivot) <= 0) {
113  if (r == 0) std::swap(*pa++, *pb);
114  pb++;
115  }
116  while (pb <= pc && (r = static_cast<int>(ss.get_char(*pc, depth)) - pivot) >= 0) {
117  if (r == 0) std::swap(*pc, *pd--);
118  pc--;
119  }
120  if (pb > pc) break;
121  std::swap(*pb++, *pc--);
122  }
123  pn = a + n;
124 
125  size_t pe_start_index, pe_end_index;
126  r = std::min(pa - a, pb - pa);
127  vec_swap<StringSet>(a, pb - r, r);
128  pe_start_index = r;
129  r = std::min(pd - pc, pn - pd - 1);
130  pe_end_index = (pn - a) - r;
131  vec_swap<StringSet>(pb, pn - r, r);
132  if (pivot == 0) {
133  for (size_t it = pe_start_index + 1; it < pe_end_index; ++it)
134  strptr.set_lcp(it, depth);
135  }
136  }
137 
138  r = pb - pa;
139  if (r > 0) {
140  strptr.set_lcp(a - ss.begin() + r, depth);
141  }
142  if (r > 1) {
143  multikey_quicksort(strptr.sub(a - ss.begin(), r),
144  depth, memory - memory_use);
145  }
146  if (ss.get_char(*(a + r), depth) != 0) {
148  strptr.sub(a - ss.begin() + r, (pa - a) + (pn - pd - 1)),
149  depth + 1, memory - memory_use);
150  }
151  r = pd - pc;
152  if (r > 0) {
153  strptr.set_lcp(a - ss.begin() + n - r, depth);
154  }
155  if ((r = pd - pc) > 1) {
156  multikey_quicksort(strptr.sub(a - ss.begin() + n - r, r),
157  depth, memory - memory_use);
158  }
159 }
160 
161 } // namespace sort_strings_detail
162 
163 //! \}
164 
165 } // namespace tlx
166 
167 #endif // !TLX_SORT_STRINGS_MULTIKEY_QUICKSORT_HEADER
168 
169 /******************************************************************************/
StringPtr sub(size_t offset, size_t sub_size) const
Advance (both) pointers by given offset, return sub-array.
Definition: string_ptr.hpp:69
static void multikey_quicksort(const StringPtr &strptr, size_t depth, size_t memory)
Generic multikey quicksort for strings.
const StringSet & active() const
return currently active array
Definition: string_ptr.hpp:63
static void vec_swap(typename StringSet::Iterator a, typename StringSet::Iterator b, size_t n)
void set_lcp(size_t, const LcpType &) const
set the i-th lcp to v and check its value
Definition: string_ptr.hpp:79
static uint32_t min(uint32_t x, uint32_t y)
Definition: md5.cpp:32
static enable_if<!StringPtr::with_lcp, void >::type insertion_sort(const StringPtr &strptr, size_t depth, size_t)
Generic insertion sort for abstract string sets.
void swap(CountingPtr< A, D > &a1, CountingPtr< A, D > &a2) noexcept
swap enclosed object with another counting pointer (no reference counts need change) ...
Objectified string array pointer array.
Definition: string_ptr.hpp:47
static StringSet::Iterator med3func(const StringSet &ss, typename StringSet::Iterator a, typename StringSet::Iterator b, typename StringSet::Iterator c, size_t depth)