-/* Overview of bmerge variables:
-**
-** list1 and list2 address the main and auxiliary arrays.
-** They swap identities after each merge pass.
-** Base points to the original list1, so we can tell if
-** the pointers ended up where they belonged (or must be copied).
-**
-** When we are merging two lists, f1 and f2 are the next elements
-** on the respective lists. l1 and l2 mark the end of the lists.
-** tp2 is the current location in the merged list.
-**
-** p1 records where f1 started.
-** After the merge, a new descriptor is built there.
-**
-** p2 is a ``parallel'' pointer in (what starts as) descriptor space.
-** It is used to identify and delimit the runs.
-**
-** In the heat of determining where q, the greater of the f1/f2 elements,
-** belongs in the other list, b, t and p, represent bottom, top and probe
-** locations, respectively, in the other list.
-** They make convenient temporary pointers in other places.
-*/
+/* The original merge sort, in use since 5.7, was as fast as, or faster than,
+ * qsort on many platforms, but slower than qsort, conspicuously so,
+ * on others. The most likely explanation was platform-specific
+ * differences in cache sizes and relative speeds.
+ *
+ * The quicksort divide-and-conquer algorithm guarantees that, as the
+ * problem is subdivided into smaller and smaller parts, the parts
+ * fit into smaller (and faster) caches. So it doesn't matter how
+ * many levels of cache exist, quicksort will "find" them, and,
+ * as long as smaller is faster, take advanatge of them.
+ *
+ * By contrast, consider how the original mergesort algorithm worked.
+ * Suppose we have five runs (each typically of length 2 after dynprep).
+ *
+ * pass base aux
+ * 0 1 2 3 4 5
+ * 1 12 34 5
+ * 2 1234 5
+ * 3 12345
+ * 4 12345
+ *
+ * Adjacent pairs are merged in "grand sweeps" through the input.
+ * This means, on pass 1, the records in runs 1 and 2 aren't revisited until
+ * runs 3 and 4 are merged and the runs from run 5 have been copied.
+ * The only cache that matters is one large enough to hold *all* the input.
+ * On some platforms, this may be many times slower than smaller caches.
+ *
+ * The following pseudo-code uses the same basic merge algorithm,
+ * but in a divide-and-conquer way.
+ *
+ * # merge $runs runs at offset $offset of list $list1 into $list2.
+ * # all unmerged runs ($runs == 1) originate in list $base.
+ * sub mgsort2 {
+ * my ($offset, $runs, $base, $list1, $list2) = @_;
+ *
+ * if ($runs == 1) {
+ * if ($list1 is $base) copy run to $list2
+ * return offset of end of list (or copy)
+ * } else {
+ * $off2 = mgsort2($offset, $runs-($runs/2), $base, $list2, $list1)
+ * mgsort2($off2, $runs/2, $base, $list2, $list1)
+ * merge the adjacent runs at $offset of $list1 into $list2
+ * return the offset of the end of the merged runs
+ * }
+ * }
+ * mgsort2(0, $runs, $base, $aux, $base);
+ *
+ * For our 5 runs, the tree of calls looks like
+ *
+ * 5
+ * 3 2
+ * 2 1 1 1
+ * 1 1
+ *
+ * 1 2 3 4 5
+ *
+ * and the corresponding activity looks like
+ *
+ * copy runs 1 and 2 from base to aux
+ * merge runs 1 and 2 from aux to base
+ * (run 3 is where it belongs, no copy needed)
+ * merge runs 12 and 3 from base to aux
+ * (runs 4 and 5 are where they belong, no copy needed)
+ * merge runs 4 and 5 from base to aux
+ * merge runs 123 and 45 from aux to base
+ *
+ * Note that we merge runs 1 and 2 immediately after copying them,
+ * while they are still likely to be in fast cache. Similarly,
+ * run 3 is merged with run 12 while it still may be lingering in cache.
+ * This implementation should therefore enjoy much of the cache-friendly
+ * behavior that quicksort does. In addition, it does less copying
+ * than the original mergesort implementation (only runs 1 and 2 are copied)
+ * and the "balancing" of merges is better (merged runs comprise more nearly
+ * equal numbers of original runs).
+ *
+ * The actual cache-friendly implementation will use a pseudo-stack
+ * to avoid recursion, and will unroll processing of runs of length 2,
+ * but it is otherwise similar to the recursive implementation.
+ */
+
+typedef struct {
+ IV offset; /* offset of 1st of 2 runs at this level */
+ IV runs; /* how many runs must be combined into 1 */
+} off_runs; /* pseudo-stack element */
+
+
+static I32
+cmp_desc(pTHX_ gptr a, gptr b)
+{
+ return -PL_sort_RealCmp(aTHX_ a, b);
+}