After playing with my own implementation of Quicksort, I compared it with qsort as implemented in the C++ standard library. Disappointingly for me, that implementation turned out to be a lot faster. But why? After taking a look at the source, I did some research.

SECURITYSAFECRITICAL_ATTRIBUTE #ifdef __USE_CONTEXT void __fileDECL qsort_s ( void *base, size_t num, size_t width, int (__fileDECL *comp)(void *, const void *, const void *), void *context ) #else /* __USE_CONTEXT */ void __fileDECL qsort ( void *base, size_t num, size_t width, int (__fileDECL *comp)(const void *, const void *) ) #endif /* __USE_CONTEXT */ { char *lo, *hi; /* ends of sub-array currently sorting */ char *mid; /* points to middle of subarray */ char *loguy, *higuy; /* traveling pointers for partition step */ size_t size; /* size of the sub-array */ char *lostk[STKSIZ], *histk[STKSIZ]; int stkptr; /* stack for saving sub-array to be processed */ /* validation section */ _VALIDATE_RETURN_VOID(base != NULL || num == 0, EINVAL); _VALIDATE_RETURN_VOID(width > 0, EINVAL); _VALIDATE_RETURN_VOID(comp != NULL, EINVAL); if (num < 2) return; /* nothing to do */ stkptr = 0; /* initialize stack */ lo = (char *)base; hi = (char *)base + width * (num-1); /* initialize limits */ /* this entry point is for pseudo-recursion calling: setting lo and hi and jumping to here is like recursion, but stkptr is preserved, locals aren't, so we preserve stuff on the stack */ recurse: size = (hi - lo) / width + 1; /* number of el's to sort */ /* below a certain size, it is faster to use a O(n^2) sorting method */ if (size <= CUTOFF) { __SHORTSORT(lo, hi, width, comp, context); } else { /* First we pick a partitioning element. The efficiency of the algorithm demands that we find one that is approximately the median of the values, but also that we select one fast. We choose the median of the first, middle, and last elements, to avoid bad performance in the face of already sorted data, or data that is made up of multiple sorted runs appended together. Testing shows that a median-of-three algorithm provides better performance than simply picking the middle element for the latter case. 
*/ mid = lo + (size / 2) * width; /* find middle element */ /* Sort the first, middle, last elements into order */ if (__COMPARE(context, lo, mid) > 0) { swap(lo, mid, width); } if (__COMPARE(context, lo, hi) > 0) { swap(lo, hi, width); } if (__COMPARE(context, mid, hi) > 0) { swap(mid, hi, width); } /* We now wish to partition the array into three pieces, one consisting of elements <= partition element, one of elements equal to the partition element, and one of elements > than it. This is done below; comments indicate conditions established at every step. */ loguy = lo; higuy = hi; /* Note that higuy decreases and loguy increases on every iteration, so loop must terminate. */ for (;;) { /* lo <= loguy < hi, lo < higuy <= hi, A[i] <= A[mid] for lo <= i <= loguy, A[i] > A[mid] for higuy <= i < hi, A[hi] >= A[mid] */ /* The doubled loop is to avoid calling comp(mid,mid), since some existing comparison funcs don't work when passed the same value for both pointers. */ if (mid > loguy) { do { loguy += width; } while (loguy < mid && __COMPARE(context, loguy, mid) <= 0); } if (mid <= loguy) { do { loguy += width; } while (loguy <= hi && __COMPARE(context, loguy, mid) <= 0); } /* lo < loguy <= hi+1, A[i] <= A[mid] for lo <= i < loguy, either loguy > hi or A[loguy] > A[mid] */ do { higuy -= width; } while (higuy > mid && __COMPARE(context, higuy, mid) > 0); /* lo <= higuy < hi, A[i] > A[mid] for higuy < i < hi, either higuy == lo or A[higuy] <= A[mid] */ if (higuy < loguy) break; /* if loguy > hi or higuy == lo, then we would have exited, so A[loguy] > A[mid], A[higuy] <= A[mid], loguy <= hi, higuy > lo */ swap(loguy, higuy, width); /* If the partition element was moved, follow it. Only need to check for mid == higuy, since before the swap, A[loguy] > A[mid] implies loguy != mid. 
*/ if (mid == higuy) mid = loguy; /* A[loguy] <= A[mid], A[higuy] > A[mid]; so condition at top of loop is re-established */ } /* A[i] <= A[mid] for lo <= i < loguy, A[i] > A[mid] for higuy < i < hi, A[hi] >= A[mid] higuy < loguy implying: higuy == loguy-1 or higuy == hi - 1, loguy == hi + 1, A[hi] == A[mid] */ /* Find adjacent elements equal to the partition element. The doubled loop is to avoid calling comp(mid,mid), since some existing comparison funcs don't work when passed the same value for both pointers. */ higuy += width; if (mid < higuy) { do { higuy -= width; } while (higuy > mid && __COMPARE(context, higuy, mid) == 0); } if (mid >= higuy) { do { higuy -= width; } while (higuy > lo && __COMPARE(context, higuy, mid) == 0); } /* OK, now we have the following: higuy < loguy lo <= higuy <= hi A[i] <= A[mid] for lo <= i <= higuy A[i] == A[mid] for higuy < i < loguy A[i] > A[mid] for loguy <= i < hi A[hi] >= A[mid] */ /* We've finished the partition, now we want to sort the subarrays [lo, higuy] and [loguy, hi]. We do the smaller one first to minimize stack usage. We only sort arrays of length 2 or more.*/ if ( higuy - lo >= hi - loguy ) { if (lo < higuy) { lostk[stkptr] = lo; histk[stkptr] = higuy; ++stkptr; } /* save big recursion for later */ if (loguy < hi) { lo = loguy; goto recurse; /* do small recursion */ } } else { if (loguy < hi) { lostk[stkptr] = loguy; histk[stkptr] = hi; ++stkptr; /* save big recursion for later */ } if (lo < higuy) { hi = higuy; goto recurse; /* do small recursion */ } } } /* We have sorted the array, except for any pending sorts on the stack. Check if there are any, and do them. */ --stkptr; if (stkptr >= 0) { lo = lostk[stkptr]; hi = histk[stkptr]; goto recurse; /* pop subarray from stack */ } else return; /* all subarrays done */ }

Apparently, the answer has nothing to do with the minor tricks and improvements the algorithm uses, as I had assumed in advance. The true reason why it’s significantly faster is that it makes fewer function calls than mine does. Calling a function seems to be very expensive. Knowing this, I’ve created a different algorithm using only the two mandatory calls (to itself). And guess what: it’s even faster than the std::qsort algorithm. A few tweaks later, here’s the fastest solution I came up with. It sorts 30 million integers in about 5.3 seconds (std::qsort: 13.6 s) and 100 million integers in 14 s (std::qsort: 41.7 s). So it’s about 2.5x as fast.

I leave it up to you to multi-thread this algorithm. It’s going to be super fast 🙂

// ------------------------------------------------------------------------
// Sort the n-element int array a[] in ascending order using an in-place,
// recursive quicksort (Hoare partition scheme with the pivot kept at a[0]).
//
// Fixes over the posted version:
//  * the median-of-three sampled a[n], one past the end of the array
//    (out-of-bounds read); the last element is a[n-1]
//  * the left-to-right scan `while (i < n && a[++i] < p)` checked the
//    bound BEFORE incrementing, so it could still read a[n] when the
//    pivot was the maximum element; the classic Hoare scans below cannot
//    run out of bounds because the pivot at a[0] stops the left scan and
//    bounds the right scan at index 0
//  * `while (TRUE)` relied on a non-standard TRUE macro; use for (;;)
void QuickestSort( int a[], int n )
{
    // nothing to do for 0 or 1 elements
    if ( n <= 1 )
        return;

    // pivot index position: default to the first element
    int pp = 0;

    // only for big arrays: pick the median of a[0], a[n/2], a[n-1] to
    // avoid quadratic behavior on (nearly) sorted input
    if ( n > 100 )
    {
        int m = n / 2;
        int r = n - 1;          // last valid index (NOT n)
        if ( a[0] > a[m] )
        {
            if ( a[0] > a[r] )
                pp = ( a[m] > a[r] ) ? m : r;
            else
                pp = 0;
        }
        else
        {
            if ( a[m] > a[r] )
                pp = ( a[0] > a[r] ) ? 0 : r;
            else
                pp = m;
        }
    }

    // move pivot to a[0]
    int t = a[0]; a[0] = a[pp]; a[pp] = t;

    // pivot value
    int p = a[0];

    // Hoare partition: i sweeps right, j sweeps left; on exit every
    // element of a[0..j] is <= p and every element of a[j+1..n-1] is >= p
    int i = -1;
    int j = n;
    for ( ;; )
    {
        // from left to right find an element >= pivot; a[0] == p acts as
        // a sentinel, so this scan cannot run past the array
        do { i++; } while ( a[i] < p );
        // from right to left find an element <= pivot; stops at index 0
        // at the latest (again because a[0] == p)
        do { j--; } while ( a[j] > p );
        // pointers crossed -> partition is complete
        if ( i >= j )
            break;
        // switch the out-of-place pair
        t = a[i]; a[i] = a[j]; a[j] = t;
    }

    // divide & conquer: Hoare guarantees 0 <= j <= n-2 here, so both
    // halves are strictly smaller and the recursion terminates
    QuickestSort( a, j + 1 );
    QuickestSort( a + j + 1, n - j - 1 );
}

## Leave A Comment