src/main/radixsort.c - R - Git at Google

 /*
  *  R : A Computer Language for Statistical Data Analysis
  *  Copyright (C) 2016   The R Core Team
  *
  *  Based on code donated from the data.table package
  *  (C) 2006-2015 Matt Dowle and Arun Srinivasan.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  the Free Software Foundation; either version 2 of the License, or
  *  (at your option) any later version.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, a copy is available at
  *  https://www.R-project.org/Licenses/
  */

 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif

 #include <Defn.h>
 #include <Internal.h>

 /* It would be better to find a way to avoid abusing TRUELENGTH, but
    in the meantime replace TRUELENGTH/SET_TRUELENGTH with
    TRLEN/SET_TRLEN that cast to int to avoid warnings. */
 #define TRLEN(x) ((int) TRUELENGTH(x))
 #define SET_TRLEN(x, v) SET_TRUELENGTH(x, ((int) (v)))

 // gs = groupsizes e.g.23, 12, 87, 2, 1, 34,...
 static int *gs[2] = { NULL };
 //two vectors flip flopped:flip and 1 - flip
 static int flip = 0;
 //allocated stack size
 static int gsalloc[2] = { 0 };
 static int gsngrp[2] = { 0 };
 //max grpn so far
 static int gsmax[2] = { 0 };
 //max size of stack, set by do_radixsort to nrows
 static int gsmaxalloc = 0;
 //switched off for last arg unless retGrp==TRUE
 static Rboolean stackgrps = TRUE;
 // TRUE for setkey, FALSE for by=
 static Rboolean sortStr = TRUE;
 // used by do_radixsort and [i|d|c]sort to reorder order.
 // not needed if narg==1
 static int *newo = NULL;
 // =1, 0, -1 for TRUE, NA, FALSE respectively.
 // Value rewritten inside do_radixsort().
 static int nalast = -1;
 // =1, -1 for ascending and descending order respectively
 static int order = 1;

 //replaced n < 200 with n < N_SMALL.Easier to change later
 #define N_SMALL 200
 // range limit for counting sort. Should be less than INT_MAX
 // (see setRange for details)
 #define N_RANGE 100000

 static SEXP *saveds = NULL;
 static R_len_t *savedtl = NULL, nalloc = 0, nsaved = 0;

 static void savetl_init()
 {
     if (nsaved || nalloc || saveds || savedtl)
 	error("Internal error: savetl_init checks failed (%d %d %p %p).",
 	      nsaved, nalloc, saveds, savedtl);
     nsaved = 0;
     nalloc = 100;
     saveds = (SEXP *) malloc(nalloc * sizeof(SEXP));
     if (saveds == NULL)
 	error("Could not allocate saveds in savetl_init");
     savedtl = (R_len_t *) malloc(nalloc * sizeof(R_len_t));
     if (savedtl == NULL) {
 	free(saveds);
 	error("Could not allocate saveds in savetl_init");
     }
 }

 static void savetl_end()
 {
     // Can get called if nothing has been saved yet (nsaved == 0), or
     // even if _init() has not been called yet (pointers NULL). Such as
     // to clear up before error. Also, it might be that nothing needed
     // to be saved anyway.
     for (int i = 0; i < nsaved; i++)
 	SET_TRLEN(saveds[i], savedtl[i]);
     free(saveds);  // does nothing on NULL input
     free(savedtl);
     nsaved = nalloc = 0;
     saveds = NULL;
     savedtl = NULL;
 }


 static void savetl(SEXP s)
 {
     if (nsaved >= nalloc) {
 	nalloc *= 2;
 	char *tmp;
 	tmp = (char *) realloc(saveds, nalloc * sizeof(SEXP));
 	if (tmp == NULL) {
 	    savetl_end();
 	    error("Could not realloc saveds in savetl");
 	}
 	saveds = (SEXP *) tmp;
 	tmp = (char *) realloc(savedtl, nalloc * sizeof(R_len_t));
 	if (tmp == NULL) {
 	    savetl_end();
 	    error("Could not realloc savedtl in savetl");
 	}
 	savedtl = (R_len_t *) tmp;
     }
     saveds[nsaved] = s;
     savedtl[nsaved] = TRLEN(s);
     nsaved++;
 }

 // http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html#Swallowing-the-Semicolon
 #define Error(...) do {savetl_end(); error(__VA_ARGS__);} while(0)
 #undef warning
 // since it can be turned to error via warn = 2
 #define warning(...) Do not use warning in this file
 /* use malloc/realloc (not Calloc/Realloc) so we can trap errors
    and call savetl_end() before the error(). */

 static void growstack(uint64_t newlen)
 {
     // no link to icount range restriction,
     // just 100,000 seems a good minimum at 0.4MB
     if (newlen == 0) newlen = 100000;
     if (newlen > gsmaxalloc) newlen = gsmaxalloc;
     gs[flip] = realloc(gs[flip], newlen * sizeof(int));
     if (gs[flip] == NULL)
 	Error("Failed to realloc working memory stack to %d*4bytes (flip=%d)",
 	      (int)newlen /* no bigger than gsmaxalloc */, flip);
     gsalloc[flip] = (int)newlen;
 }

 static void push(int x)
 {
     if (!stackgrps || x == 0)
 	return;
     if (gsalloc[flip] == gsngrp[flip])
 	growstack((uint64_t)(gsngrp[flip]) * 2);
     gs[flip][gsngrp[flip]++] = x;
     if (x > gsmax[flip])
 	gsmax[flip] = x;
 }

 static void mpush(int x, int n)
 {
     if (!stackgrps || x == 0)
 	return;
     if (gsalloc[flip] < gsngrp[flip] + n)
 	growstack(((uint64_t)(gsngrp[flip]) + n) * 2);
     for (int i = 0; i < n; i++)
 	gs[flip][gsngrp[flip]++] = x;
     if (x > gsmax[flip])
 	gsmax[flip] = x;
 }

 static void flipflop()
 {
     flip = 1 - flip;
     gsngrp[flip] = 0;
     gsmax[flip] = 0;
     if (gsalloc[flip] < gsalloc[1 - flip])
 	growstack((uint64_t)(gsalloc[1 - flip]) * 2);
 }

 static void gsfree()
 {
     free(gs[0]);
     free(gs[1]);
     gs[0] = NULL;
     gs[1] = NULL;
     flip = 0;
     gsalloc[0] = gsalloc[1] = 0;
     gsngrp[0] = gsngrp[1] = 0;
     gsmax[0] = gsmax[1] = 0;
     gsmaxalloc = 0;
 }

 #ifdef TIMING_ON
 // many calls to clock() can be expensive,
 // hence compiled out rather than switch(verbose)
 #include <time.h>
 #define NBLOCK 20
 static clock_t tblock[NBLOCK], tstart;
 static int nblock[NBLOCK];
 #define TBEG() tstart = clock();
 #define TEND(i) tblock[i] += clock()-tstart; nblock[i]++; tstart = clock();
 #else
 #define TBEG()
 #define TEND(i)
 #endif

 static int range, xmin; // used by both icount and do_radixsort
 static void setRange(int *x, int n)
 {
     xmin = NA_INTEGER;
     int xmax = NA_INTEGER;
     double overflow;

     int i = 0;
     while(i < n && x[i] == NA_INTEGER) i++;
     if (i < n) xmax = xmin = x[i];
     for (; i < n; i++) {
 	int tmp = x[i];
 	if (tmp == NA_INTEGER)
 	    continue;
 	if (tmp > xmax)
 	    xmax = tmp;
 	else if (tmp < xmin)
 	    xmin = tmp;
     }
     // all NAs, nothing to do
     if (xmin == NA_INTEGER) {
 	range = NA_INTEGER;
 	return;
     }
     // ex: x=c(-2147483647L, NA_integer_, 1L) results in overflowing int range.
     overflow = (double) xmax - (double) xmin + 1;
     // detect and force iradix here, since icount is out of the picture
     if (overflow > INT_MAX) {
 	range = INT_MAX;
 	return;
     }

     range = xmax - xmin + 1;

     return;
 }

 // x*order results in integer overflow when -1*NA,
 // so careful to avoid that here :
 static inline int icheck(int x)
 {
     // if nalast == 1, NAs must go last.
     return ((nalast != 1) ? ((x != NA_INTEGER) ? x*order : x) :
 	    ((x != NA_INTEGER) ? (x*order) - 1 : INT_MAX));
 }


 static void icount(int *x, int *o, int n)
 /* Counting sort:
    1. Places the ordering into o directly, overwriting whatever was there
    2. Doesn't change x
    3. Pushes group sizes onto stack
 */
 {
     int napos = range; // NA's always counted in last bin
     // static is IMPORTANT, counting sort is called repetitively.
     static unsigned int counts[N_RANGE + 1] = { 0 };
     /* counts are set back to 0 at the end efficiently. 1e5 = 0.4MB i.e
        tiny. We'll only use the front part of it, as large as range. So it's
        just reserving space, not using it. Have defined N_RANGE to be 100000.*/
     if (range > N_RANGE)
 	Error("Internal error: range = %d; isorted cannot handle range > %d",
 	      range, N_RANGE);
     for (int i = 0; i < n; i++) {
 	// For nalast=NA case, we won't remove/skip NAs, rather set 'o' indices
 	// to 0. subset will skip them. We can't know how many NAs to skip
 	// beforehand - i.e. while allocating "ans" vector
 	if (x[i] == NA_INTEGER)
 	    counts[napos]++;
 	else
 	    counts[x[i] - xmin]++;
     }

     int tmp = 0;
     if (nalast != 1 && counts[napos]) {
         push(counts[napos]);
         tmp += counts[napos];
     }
     int w = (order==1) ? 0 : range-1;
     for (int i = 0; i < range; i++)
         /* no point in adding tmp < n && i <= range, since range includes max,
            need to go to max, unlike 256 loops elsewhere in radixsort.c */
     {
 	if (counts[w]) {
 	    // cumulate but not through 0's.
 	    // Helps resetting zeros when n < range, below.
 	    push(counts[w]);
 	    counts[w] = (tmp += counts[w]);
 	}
         w += order; // order is +1 or -1
     }
     if (nalast == 1 && counts[napos]) {
         push(counts[napos]);
         counts[napos] = (tmp += counts[napos]);
     }
     for (int i = n - 1; i >= 0; i--) {
 	// This way na.last=TRUE/FALSE cases will have just a
 	// single if-check overhead.
 	o[--counts[(x[i] == NA_INTEGER) ? napos :
 		   x[i] - xmin]] = (int) (i + 1);
     }
     // nalast = 1, -1 are both taken care already.
     if (nalast == 0)
 	// nalast = 0 is dealt with separately as it just sets o to 0
 	for (int i = 0; i < n; i++)
 	    o[i] = (x[o[i] - 1] == NA_INTEGER) ? 0 : o[i];
     // at those indices where x is NA. x[o[i]-1] because x is not modifed here.

     /* counts were cumulated above so leaves non zero.
        Faster to clear up now ready for next time. */
     if (n < range) {
 	/* Many zeros in counts already. Loop through n instead,
 	   doesn't matter if we set to 0 several times on any repeats */
 	counts[napos] = 0;
 	for (int i = 0; i < n; i++) {
 	    if (x[i] != NA_INTEGER)
 		counts[x[i] - xmin] = 0;
 	}
     } else
 	memset(counts, 0, (range + 1) * sizeof(int));
     return;
 }

 static void iinsert(int *x, int *o, int n)
 /*  orders both x and o by reference in-place. Fast for small vectors,
     low overhead.  don't be tempted to binsearch backwards here, have
     to shift anyway; many memmove would have overhead and do the same
     thing. */
 /*  when nalast == 0, iinsert will be called only from within iradix,
     where o[.] = 0 for x[.]=NA is already taken care of */
 {
     for (int i = 1; i < n; i++) {
 	int xtmp = x[i];
 	if (xtmp < x[i - 1]) {
 	    int j = i - 1;
 	    int otmp = o[i];
 	    while (j >= 0 && xtmp < x[j]) {
 		x[j + 1] = x[j];
 		o[j + 1] = o[j];
 		j--;
 	    }
 	    x[j + 1] = xtmp;
 	    o[j + 1] = otmp;
 	}
     }
     int tt = 0;
     for (int i = 1; i < n; i++)
 	if (x[i] == x[i - 1])
 	    tt++;
 	else {
 	    push(tt + 1);
 	    tt = 0;
 	}
     push(tt + 1);
 }

 /*
   iradix is a counting sort performed forwards from MSB to LSB, with
   some tricks and short circuits building on Terdiman and Herf.
   http://codercorner.com/RadixSortRevisited.htm
   http://stereopsis.com/radix.html

   ~ Note they are LSD, but we do MSD here which is more complicated,
     for efficiency.
   ~ NAs need no special treatment as NA is the most negative integer
     in R (checked in init.c once,  for efficiency) so NA naturally sort
     to the front.
   ~ Using 4-pass 1-byte radix for the following reasons :

   * 11-bit (Herf) reduces to 3-passes (3*11=33) yes, and LSD need
     random access to o vector in each pass 1:n so reduction in passes is
     good, but Terdiman's idea to skip a radix if all values are equal
     occurs less the wider the radix. A narrower radix benefits more from that.
     * That's detected here using a single 'if', an improvement on
       Terdiman's exposition of a single loop to find if any count==n
     * The pass through counts bites when radix is wider,
       because we repetitively call this iradix from fastorder forwards.
   *  Herf's parallel histogramming is neat. In 4-pass 1-byte it needs
      4*256 storage, that's  tiny, and can be static. 4*256 << 3*2048.
      4-pass 1-byte is simpler and tighter code than 3-pass 11-bit,
      giving modern optimizers and modern CPUs a better chance.
      We may get lucky anyway, if one or two of the 4-passes are skipped.

   Recall: there are no comparisons at all in counting and radix,
   there is wide random access in each LSD radix pass, though.
 */

 // 4 are used for iradix, 8 for dradix and i64radix
 static unsigned int radixcounts[8][257] = { {0} };

 static int skip[8];
 /* global because iradix and iradix_r interact and are called repetitively.
    counts are set back to 0 after each use, to benefit from skipped radix. */
 static void *radix_xsub = NULL;
 static size_t radix_xsuballoc = 0;

 static int *otmp = NULL, otmp_alloc = 0;
 static void alloc_otmp(int n)
 {
     if (otmp_alloc >= n)
 	return;
     otmp = (int *) realloc(otmp, n * sizeof(int));
     if (otmp == NULL)
 	Error("Failed to allocate working memory for otmp. Requested %d * %d bytes",
 	      n, sizeof(int));
     otmp_alloc = n;
 }

 // TO DO: save xtmp if possible, see allocs in do_radixsort
 static void *xtmp = NULL;
 static int xtmp_alloc = 0;
 // TO DO: currently always the largest type (double) but
 //        could be int if that's all that's needed
 static void alloc_xtmp(int n)
 {
     if (xtmp_alloc >= n)
 	return;
     xtmp = (double *) realloc(xtmp, n * sizeof(double));
     if (xtmp == NULL)
 	Error("Failed to allocate working memory for xtmp. Requested %d * %d bytes",
 	      n, sizeof(double));
     xtmp_alloc = n;
 }

 static void iradix_r(int *xsub, int *osub, int n, int radix);

 static void iradix(int *x, int *o, int n)
 /* As icount :
    Places the ordering into o directly, overwriting whatever was there
    Doesn't change x
    Pushes group sizes onto stack */
 {
     int nextradix, itmp, thisgrpn, maxgrpn;
     unsigned int thisx = 0, shift, *thiscounts;

     for (int i = 0; i < n;i++) {
 	/* parallel histogramming pass; i.e. count occurrences of
 	   0:255 in each byte.  Sequential so almost negligible. */
 	// relies on overflow behaviour. And shouldn't -INT_MIN be up in iradix?
 	thisx = (unsigned int) (icheck(x[i])) - INT_MIN;
 	// unrolled since inside n-loop
 	radixcounts[0][thisx & 0xFF]++;
 	radixcounts[1][thisx >> 8 & 0xFF]++;
 	radixcounts[2][thisx >> 16 & 0xFF]++;
 	radixcounts[3][thisx >> 24 & 0xFF]++;
     }
     for (int radix = 0; radix < 4; radix++) {
 	/* any(count == n) => all radix must have been that value =>
 	   last x (still thisx) was that value */
 	int i = thisx >> (radix*8) & 0xFF;
 	skip[radix] = radixcounts[radix][i] == n;
 	// clear it now, the other counts must be 0 already
 	if (skip[radix])
 	    radixcounts[radix][i] = 0;
     }

     int radix = 3;  // MSD
     while (radix >= 0 && skip[radix]) radix--;
     if (radix == -1) { // All radix are skipped; one number repeated n times.
 	if (nalast == 0 && x[0] == NA_INTEGER)
 	    // all values are identical. return 0 if nalast=0 & all NA
 	    // because of 'return', have to take care of it here.
 	    for (int i = 0; i < n; i++)
 		o[i] = 0;
 	else
 	    for (int i = 0; i < n; i++)
 		o[i] = (i + 1);
 	push(n);
 	return;
     }
     for (int i = radix - 1; i >= 0; i--) {
 	if (!skip[i])
 	    memset(radixcounts[i], 0, 257 * sizeof(unsigned int));
 	/* clear the counts as we only needed the parallel pass for skip[]
 	   and we're going to use radixcounts again below. Can't use parallel
 	   lower counts in MSD radix, unlike LSD. */
     }
     thiscounts = radixcounts[radix];
     shift = radix * 8;

     itmp = thiscounts[0];
     maxgrpn = itmp;
     for (int i = 1; itmp < n && i < 256; i++) {
 	thisgrpn = thiscounts[i];
 	if (thisgrpn) {
 	    // don't cummulate through 0s, important below.
 	    if (thisgrpn > maxgrpn)
 		maxgrpn = thisgrpn;
 	    thiscounts[i] = (itmp += thisgrpn);
 	}
     }
     for (int i = n - 1; i >= 0; i--) {
 	thisx = ((unsigned int) (icheck(x[i])) - INT_MIN) >> shift & 0xFF;
 	o[--thiscounts[thisx]] = i + 1;
     }

     if (radix_xsuballoc < maxgrpn) {
         // The largest group according to the first non-skipped radix,
         // so could be big (if radix is needed on first arg)
         // TO DO: could include extra bits to divide the first radix
         // up more. Often the MSD has groups in just 0-4 out of 256.
         // free'd at the end of do_radixsort once we're done calling iradix
         // repetitively
         radix_xsub = (int *) realloc(radix_xsub, maxgrpn * sizeof(double));
         if (!radix_xsub)
             Error("Failed to realloc working memory %d*8bytes (xsub in iradix), radix=%d",
                   maxgrpn, radix);
         radix_xsuballoc = maxgrpn;
     }

     // TO DO: can we leave this to do_radixsort and remove these calls??
     alloc_otmp(maxgrpn);
     // TO DO: doesn't need to be sizeof(double) always, see inside
     alloc_xtmp(maxgrpn);

     nextradix = radix - 1;
     while (nextradix >= 0 && skip[nextradix]) nextradix--;
     if (thiscounts[0] != 0)
 	Error("Internal error. thiscounts[0]=%d but should have been decremented to 0. dradix=%d",
 	      thiscounts[0], radix);
     thiscounts[256] = n;
     itmp = 0;
     for (int i = 1; itmp < n && i <= 256; i++) {
         if (thiscounts[i] == 0) continue;
         // undo cumulate; i.e. diff
         thisgrpn = thiscounts[i] - itmp;
         if (thisgrpn == 1 || nextradix == -1) {
             push(thisgrpn);
         } else {
             for (int j = 0; j < thisgrpn; j++)
                 // this is why this xsub here can't be the same memory as
                 // xsub in do_radixsort.
                 ((int *)radix_xsub)[j] = icheck(x[o[itmp+j]-1]);
             // changes xsub and o by reference recursively.
             iradix_r(radix_xsub, o+itmp, thisgrpn, nextradix);
         }
         itmp = thiscounts[i];
         thiscounts[i] = 0;
     }
     if (nalast == 0) // nalast = 1, -1 are both taken care already.
 	// nalast = 0 is dealt with separately as it just sets o to 0
 	for (int i = 0; i < n; i++)
 	    o[i] = (x[o[i] - 1] == NA_INTEGER) ? 0 : o[i];
     // at those indices where x is NA. x[o[i]-1] because x is not
     // modified by reference unlike iinsert or iradix_r
 }

 static void iradix_r(int *xsub, int *osub, int n, int radix)
 // xsub is a recursive offset into xsub working memory above in
 // iradix, reordered by reference.  osub is a an offset into the main
 // answer o, reordered by reference.  radix iterates 3,2,1,0
 {
     int j, itmp, thisx, thisgrpn, nextradix, shift;
     unsigned int *thiscounts;

     // N_SMALL=200 is guess based on limited testing. Needs
     // calibrate().  Was 50 based on sum(1:50)=1275 worst -vs- 256
     // cummulate + 256 memset + allowance since reverse order is
     // unlikely.  when nalast==0, iinsert will be called only from
     // within iradix.
     if (n < N_SMALL) {
 	iinsert(xsub, osub, n);
 	return;
     }

     shift = radix * 8;
     thiscounts = radixcounts[radix];

     for (int i = 0; i < n; i++) {
 	thisx = (unsigned int) xsub[i] - INT_MIN; // sequential in xsub
 	thiscounts[thisx >> shift & 0xFF]++;
     }
     itmp = thiscounts[0];
     for (int i = 1; itmp < n && i < 256; i++)
 	// don't cummulate through 0s, important below
 	if (thiscounts[i])
 	    thiscounts[i] = (itmp += thiscounts[i]);
     for (int i = n - 1; i >= 0; i--) {
 	thisx = ((unsigned int) xsub[i] - INT_MIN) >> shift & 0xFF;
 	j = --thiscounts[thisx];
 	otmp[j] = osub[i];
 	((int *) xtmp)[j] = xsub[i];
     }
     memcpy(osub, otmp, n * sizeof(int));
     memcpy(xsub, xtmp, n * sizeof(int));

     nextradix = radix - 1;
     while (nextradix >= 0 && skip[nextradix]) nextradix--;
     /* TO DO: If nextradix == -1 AND no further args from do_radixsort AND
        !retGrp, we're done. We have o. Remember to memset thiscounts
        before returning. */

     if (thiscounts[0] != 0)
 	Error("Logical error. thiscounts[0]=%d but should have been decremented to 0. radix=%d",
 	      thiscounts[0], radix);
     thiscounts[256] = n;
     itmp = 0;
     for (int i = 1; itmp < n && i <= 256; i++) {
 	if (thiscounts[i] == 0)
 	    continue;
 	thisgrpn = thiscounts[i] - itmp;        // undo cummulate; i.e. diff
 	if (thisgrpn == 1 || nextradix == -1) {
 	    push(thisgrpn);
 	} else {
 	    iradix_r(xsub+itmp, osub+itmp, thisgrpn, nextradix);
 	}
 	itmp = thiscounts[i];
 	thiscounts[i] = 0;
     }
 }

 // dradix from Arun's fastradixdouble.c
 // + changed to MSD and hooked into do_radixsort framework here.
 // + replaced tolerance with rounding s.f.

 static unsigned long long dmask1;
 static unsigned long long dmask2;

 static void setNumericRounding(int dround)
 {
     dmask1 = dround ? 1 << (8 * dround - 1) : 0;
     dmask2 = 0xffffffffffffffff << dround * 8;
 }

 static union {
     double d;
     unsigned long long ull;
 } u;

 static
 unsigned long long dtwiddle(void *p, int i, int order)
 {
     u.d = order * ((double *)p)[i]; // take care of 'order' at the beginning
     if (R_FINITE(u.d)) {
 	u.ull = (u.d != 0.0) ? u.ull + ((u.ull & dmask1) << 1) : 0;
     } else if (ISNAN(u.d)) {
 	u.ull = 0;
 	return (nalast == 1 ? ~u.ull : u.ull);
     }
     unsigned long long mask = (u.ull & 0x8000000000000000) ?
 	// always flip sign bit and if negative (sign bit was set)
 	// flip other bits too
 	0xffffffffffffffff : 0x8000000000000000;
     return ((u.ull ^ mask) & dmask2);
 }

 static Rboolean dnan(void *p, int i)
 {
     u.d = ((double *) p)[i];
     return (ISNAN(u.d));
 }

 static unsigned long long (*twiddle) (void *, int, int);
 static Rboolean(*is_nan) (void *, int);
 // the size of the arg type (4 or 8). Just 8 currently until iradix is
 // merged in.
 static size_t colSize = 8;

 static void dradix_r(unsigned char *xsub, int *osub, int n, int radix);

 #ifdef WORDS_BIGENDIAN
 #define RADIX_BYTE colSize - radix - 1
 #else
 #define RADIX_BYTE radix
 #endif

 static void dradix(unsigned char *x, int *o, int n)
 {
     int radix, nextradix, itmp, thisgrpn, maxgrpn;
     unsigned int *thiscounts;
     unsigned long long thisx = 0;
     // see comments in iradix for structure.  This follows the same.
     // TO DO: merge iradix in here (almost ready)
     for (int i = 0; i < n; i++) {
 	thisx = twiddle(x, i, order);
 	for (radix = 0; radix < colSize; radix++)
 	    // if dround == 2 then radix 0 and 1 will be all 0 here and skipped.
 	    /* on little endian, 0 is the least significant bits (the right)
 	       and 7 is the most including sign (the left); i.e. reversed. */
 	    radixcounts[radix][((unsigned char *)&thisx)[RADIX_BYTE]]++;
     }
     for (radix = 0; radix < colSize; radix++) {
 	// thisx is the last x after loop above
 	int i = ((unsigned char *) &thisx)[RADIX_BYTE];
 	skip[radix] = radixcounts[radix][i] == n;
 	// clear it now, the other counts must be 0 already
 	if (skip[radix])
 	    radixcounts[radix][i] = 0;
     }
     radix = (int) colSize - 1;  // MSD
     while (radix >= 0 && skip[radix]) radix--;
     if (radix == -1) {
 	// All radix are skipped; i.e. one number repeated n times.
 	if (nalast == 0 && is_nan(x, 0))
 	    // all values are identical. return 0 if nalast=0 & all NA
 	    // because of 'return', have to take care of it here.
 	    for (int i = 0; i < n; i++)
 		o[i] = 0;
 	else
 	    for (int i = 0; i < n; i++)
 		o[i] = (i + 1);
 	push(n);
 	return;
     }
     for (int i = radix - 1; i >= 0; i--) {
 	// clear the lower radix counts, we only did them to know
 	// skip. will be reused within each group
 	if (!skip[i])
 	    memset(radixcounts[i], 0, 257 * sizeof(unsigned int));
     }
     thiscounts = radixcounts[radix];
     itmp = thiscounts[0];
     maxgrpn = itmp;
     for (int i = 1; itmp < n && i < 256; i++) {
 	thisgrpn = thiscounts[i];
 	if (thisgrpn) {  // don't cummulate through 0s, important below
 	    if (thisgrpn > maxgrpn)
 		maxgrpn = thisgrpn;
 	    thiscounts[i] = (itmp += thisgrpn);
 	}
     }
     for (int i = n - 1; i >= 0; i--) {
 	thisx = twiddle(x, i, order);
 	o[ --thiscounts[((unsigned char *)&thisx)[RADIX_BYTE]] ] = i + 1;
     }

     if (radix_xsuballoc < maxgrpn) {
         // TO DO: centralize this alloc
         // The largest group according to the first non-skipped radix,
         // so could be big (if radix is needed on first arg) TO DO:
         // could include extra bits to divide the first radix up
         // more. Often the MSD has groups in just 0-4 out of 256.
         // free'd at the end of do_radixsort once we're done calling iradix
         // repetitively
         radix_xsub = (double *) realloc(radix_xsub, maxgrpn * sizeof(double));
         if (!radix_xsub)
             Error("Failed to realloc working memory %d*8bytes (xsub in dradix), radix=%d",
                   maxgrpn, radix);
         radix_xsuballoc = maxgrpn;
     }

     alloc_otmp(maxgrpn);   // TO DO: leave to do_radixsort and remove these?
     alloc_xtmp(maxgrpn);

     nextradix = radix - 1;
     while (nextradix >= 0 && skip[nextradix])
 	nextradix--;
     if (thiscounts[0] != 0)
 	Error("Logical error. thiscounts[0]=%d but should have been decremented to 0. dradix=%d",
 	      thiscounts[0], radix);
     thiscounts[256] = n;
     itmp = 0;
     for (int i = 1; itmp < n && i <= 256; i++) {
         if (thiscounts[i] == 0)
             continue;
         thisgrpn = thiscounts[i] - itmp;  // undo cummulate; i.e. diff
         if (thisgrpn == 1 || nextradix == -1) {
             push(thisgrpn);
         } else {
             if (colSize == 4) { // ready for merging in iradix ...
                 error("Not yet used, still using iradix instead");
                 for (int j = 0; j < thisgrpn; j++)
                     ((int *)radix_xsub)[j] = (int)twiddle(x, o[itmp+j]-1, order);
                 // this is why this xsub here can't be the same memory
                 // as xsub in do_radixsort
             } else
 		for (int j = 0; j < thisgrpn; j++)
 		    ((unsigned long long *)radix_xsub)[j] =
 			twiddle(x, o[itmp+j]-1, order);
 	    // changes xsub and o by reference recursively.
 	    dradix_r(radix_xsub, o+itmp, thisgrpn, nextradix);
 	}
 	itmp = thiscounts[i];
 	thiscounts[i] = 0;
     }
     if (nalast == 0) // nalast = 1, -1 are both taken care already.
 	for (int i = 0; i < n; i++)
 	    o[i] = is_nan(x, o[i] - 1) ? 0 : o[i];
     // nalast = 0 is dealt with separately as it just sets o to 0
     // at those indices where x is NA. x[o[i]-1] because x is not
     // modified by reference unlike iinsert or iradix_r

 }

 static void dinsert(unsigned long long *x, int *o, int n)
 // orders both x and o by reference in-place. Fast for small vectors,
 // low overhead.  don't be tempted to binsearch backwards here, have
 // to shift anyway; many memmove would have overhead and do the same
 // thing 'dinsert' will not be called when nalast = 0 and o[0] = -1.
 {
     int otmp, tt;
     unsigned long long xtmp;
     for (int i = 1; i < n; i++) {
 	xtmp = x[i];
 	if (xtmp < x[i - 1]) {
 	    int j = i - 1;
 	    otmp = o[i];
 	    while (j >= 0 && xtmp < x[j]) {
 		x[j + 1] = x[j];
 		o[j + 1] = o[j];
 		j--;
 	    }
 	    x[j + 1] = xtmp;
 	    o[j + 1] = otmp;
 	}
     }
     tt = 0;
     for (int i = 1; i < n; i++)
 	if (x[i] == x[i - 1])
 	    tt++;
 	else {
 	    push(tt + 1);
 	    tt = 0;
 	}
     push(tt + 1);
 }

 static void dradix_r(unsigned char *xsub, int *osub, int n, int radix)
 /* xsub is a recursive offset into xsub working memory above in
    dradix, reordered by reference.  osub is a an offset into the main
    answer o, reordered by reference.  dradix iterates
    7,6,5,4,3,2,1,0 */
 {
     int itmp, thisgrpn, nextradix;
     unsigned int *thiscounts;
     unsigned char *p;
     if (n < 200) {
 	/* 200 is guess based on limited testing. Needs calibrate(). Was 50
 	   based on sum(1:50)=1275 worst -vs- 256 cummulate + 256 memset +
 	   allowance since reverse order is unlikely */
 	// order=1 here because it's already taken care of in iradix
 	dinsert((void *)xsub, osub, n);

 	return;
     }
     thiscounts = radixcounts[radix];
     p = xsub + RADIX_BYTE;
     for (int i = 0; i < n; i++) {
 	thiscounts[*p]++;
 	p += colSize;
     }
     itmp = thiscounts[0];
     for (int i = 1; itmp < n && i < 256; i++)
 	// don't cummulate through 0s, important below
 	if (thiscounts[i])
 	    thiscounts[i] = (itmp += thiscounts[i]);
     p = xsub + (n - 1) * colSize;
     if (colSize == 4) {
 	error("Not yet used, still using iradix instead");
 	for (int i = n - 1; i >= 0; i--) {
 	    int j = --thiscounts[*(p + RADIX_BYTE)];
 	    otmp[j] = osub[i];
 	    ((int *) xtmp)[j] = *(int *) p;
 	    p -= colSize;
 	}
     } else {
 	for (int i = n - 1; i >= 0; i--) {
 	    int j = --thiscounts[*(p + RADIX_BYTE)];
 	    otmp[j] = osub[i];
 	    ((unsigned long long *) xtmp)[j] = *(unsigned long long *) p;
 	    p -= colSize;
 	}
     }
     memcpy(osub, otmp, n * sizeof(int));
     memcpy(xsub, xtmp, n * colSize);

     nextradix = radix - 1;
     while (nextradix >= 0 && skip[nextradix])
 	nextradix--;
     // TO DO: If nextradix==-1 and no further args from do_radixsort,
     // we're done. We have o. Remember to memset thiscounts before
     // returning.

     if (thiscounts[0] != 0)
 	Error("Logical error. thiscounts[0]=%d but should have been decremented to 0. radix=%d",
 	      thiscounts[0], radix);
     thiscounts[256] = n;
     itmp = 0;
     for (int i = 1; itmp < n && i <= 256; i++) {
 	if (thiscounts[i] == 0)
 	    continue;
 	thisgrpn = thiscounts[i] - itmp;        // undo cummulate; i.e. diff
 	if (thisgrpn == 1 || nextradix == -1)
 	    push(thisgrpn);
 	else
 	    dradix_r(xsub + itmp * colSize, osub + itmp, thisgrpn,
 		     nextradix);
 	itmp = thiscounts[i];
 	thiscounts[i] = 0;
     }
 }

 // TO DO?: dcount. Find step size, then range = (max-min)/step and
 // proceed as icount. Many fixed precision floats (such as prices) may
 // be suitable. Fixed precision such as 1.10, 1.15, 1.20, 1.25, 1.30
 // ... do use all bits so dradix skipping may not help.

 static int *cradix_counts = NULL;
 static int cradix_counts_alloc = 0;
 static int maxlen = 1;
 static SEXP *cradix_xtmp = NULL;
 static int cradix_xtmp_alloc = 0;

 // same as StrCmp but also takes into account 'decreasing' and 'na.last' args.
 static int StrCmp2(SEXP x, SEXP y)
 {
     // same cached pointer (including NA_STRING == NA_STRING)
     if (x == y) return 0;
     // if x=NA, nalast=1 ? then x > y else x < y (Note: nalast == 0 is
     // already taken care of in 'csorted', won't be 0 here)
     if (x == NA_STRING) return nalast;
     if (y == NA_STRING) return -nalast;     // if y=NA, nalast=1 ? then y > x
     return order*strcmp(CHAR(x), CHAR(y));  // same as explanation in StrCmp
 }

 static int StrCmp(SEXP x, SEXP y)            // also used by bmerge and chmatch
 {
     // same cached pointer (including NA_STRING == NA_STRING)
     if (x == y) return 0;
     if (x == NA_STRING) return -1;    // x < y
     if (y == NA_STRING) return 1;     // x > y
     // assumes strings are in same encoding
     return strcmp(CHAR(x), CHAR(y));
 }

 #define CHAR_ENCODING(x) (IS_ASCII(x) ? CE_UTF8 : getCharCE(x))

 static void checkEncodings(SEXP x)
 {
     cetype_t ce;

     int i;
     for (i = 0; i < length(x) && STRING_ELT(x, i) == NA_STRING; i++);

     if (i < length(x)) {
         ce = CHAR_ENCODING(STRING_ELT(x, i));
         if (ce == CE_NATIVE) {
             error(_("Character encoding must be UTF-8, Latin-1 or bytes"));
         }
     }

     /* Disabled for now -- doubles the time (for already sorted vectors): why?
     for (int i = 1; i < length(x); i++) {
         if (ce != CHAR_ENCODING(STRING_ELT(x, i))) {
             error(_("Mixed character encodings are not supported"));
         }
     }
     */
 }

 static void cradix_r(SEXP * xsub, int n, int radix)
 // xsub is a unique set of CHARSXP, to be ordered by reference

 // First time, radix == 0, and xsub == x. Then recursively moves SEXP together
 // for L1 cache efficiency.

 // Quite different to iradix because
 //   1) x is known to be unique so fits in cache
 //      (wide random access not an issue)
 //   2) they're variable length character strings
 //   3) no need to maintain o.  Just simply reorder x. No grps or push.

 // Fortunately, UTF sorts in the same order if treated as ASCII, so we
 // can simplify by doing it by bytes.

 // TO DO: confirm a forwards (MSD) radix for efficiency, although more
 // complicated.

 // This part has nothing to do with truelength. The
 // truelength stuff is to do with finding the unique strings.  We may
 // be able to improve CHARSXP derefencing by submitting patch to R to
 // make R's string cache contiguous but would likely be difficult. If
 // we strxfrm, then it'll then be contiguous and compact then anyway.
 {
     int itmp, *thiscounts, thisgrpn=0, thisx=0;
     SEXP stmp;

     // TO DO?: chmatch to existing sorted vector, then grow it.
     // TO DO?: if (n<N_SMALL = 200) insert sort, then loop through groups via ==
     if (n <= 1) return;
     if (n == 2) {
 	if (StrCmp(xsub[1], xsub[0]) < 0) {
 	    stmp = xsub[0];
 	    xsub[0] = xsub[1];
 	    xsub[1] = stmp;
 	}
 	return;
     }
     // TO DO: if (n < 50) cinsert (continuing from radix offset into
     // CHAR) or using StrCmp. But 256 is narrow, so quick and not too
     // much an issue.

     thiscounts = cradix_counts + radix * 256;
     for (int i = 0; i < n; i++) {
 	thisx = xsub[i] == NA_STRING ?
 	    0 : (radix < LENGTH(xsub[i]) ?
 		 (unsigned char) (CHAR(xsub[i])[radix]) : 1);
 	thiscounts[ thisx ]++;   // 0 for NA,  1 for ""
     }
     // this also catches when subx has shorter strings than the rest,
     // thiscounts[0] == n and we'll recurse very quickly through to the
     // overall maxlen with no 256 overhead each time
     if (thiscounts[thisx] == n && radix < maxlen - 1) {
 	cradix_r(xsub, n, radix + 1);
 	thiscounts[thisx] = 0;  // the rest must be 0 already, save the memset
 	return;
     }
     itmp = thiscounts[0];
     for (int i = 1; i < 256; i++)
 	// don't cummulate through 0s, important below
 	if (thiscounts[i])
 	    thiscounts[i] = (itmp += thiscounts[i]);
     for (int i = n - 1; i >= 0; i--) {
 	thisx = xsub[i] == NA_STRING ?
 	    0 : (radix < LENGTH(xsub[i]) ?
 		 (unsigned char) (CHAR(xsub[i])[radix]) : 1);
 	int j = --thiscounts[thisx];
 	cradix_xtmp[j] = xsub[i];
     }
     memcpy(xsub, cradix_xtmp, n * sizeof(SEXP));
     if (radix == maxlen - 1) {
 	memset(thiscounts, 0, 256 * sizeof(int));
 	return;
     }
     if (thiscounts[0] != 0)
 	Error("Logical error. counts[0]=%d in cradix but should have been decremented to 0. radix=%d",
 	      thiscounts[0], radix);
     itmp = 0;
     for (int i = 1; i < 256; i++) {
 	if (thiscounts[i] == 0)
 	    continue;
 	thisgrpn = thiscounts[i] - itmp;        // undo cummulate; i.e. diff
 	cradix_r(xsub + itmp, thisgrpn, radix + 1);
 	itmp = thiscounts[i];
 	// set to 0 now since we're here, saves memset
 	// afterwards. Important to clear! Also more portable for
 	// machines where 0 isn't all bits 0 (?!)
 	thiscounts[i] = 0;
     }
     if (itmp < n - 1)
 	cradix_r(xsub + itmp, n - itmp, radix + 1);     // final group
 }

 static SEXP *ustr = NULL;
 static int ustr_alloc = 0, ustr_n = 0;

 static void cgroup(SEXP * x, int *o, int n)
 // As icount :
 //   Places the ordering into o directly, overwriting whatever was there
 //   Doesn't change x
 //   Pushes group sizes onto stack

 // Only run when sortStr == FALSE. Basically a counting sort, in first
 // appearance order, directly.  Since it doesn't sort the strings, the
 // name is cgroup.  there is no _pre for this.  ustr created and
 // cleared each time.
 {
     // savetl_init() is called once at the start of do_radixsort
     if (ustr_n != 0)
 	Error
 	    ("Internal error. ustr isn't empty when starting cgroup: ustr_n=%d, ustr_alloc=%d",
 	     ustr_n, ustr_alloc);
     for (int i = 0; i < n; i++) {
 	SEXP s = x[i];
 	if (TRLEN(s) < 0) {        // this case first as it's the most frequent
 	    SET_TRLEN(s, TRLEN(s) - 1);
 	    // use negative counts so as to detect R's own (positive)
 	    // usage of tl on CHARSXP
 	    continue;
 	}
 	if (TRLEN(s) > 0) {
 	    // Save any of R's own usage of tl (assumed positive, so
 	    // we can both count and save in one scan), to restore
 	    // afterwards. From R 2.14.0, tl is initialized to 0,
 	    // prior to that it was random so this step saved too much.
 	    savetl(s);
 	    SET_TRLEN(s, 0);
 	}
 	if (ustr_alloc <= ustr_n) {
 	    // 10000 = 78k of 8byte pointers. Small initial guess,
 	    // negligible time to alloc.
 	    ustr_alloc = (ustr_alloc == 0) ? 10000 : ustr_alloc*2;
 	    if (ustr_alloc > n)
 		ustr_alloc = n;
 	    ustr = realloc(ustr, ustr_alloc * sizeof(SEXP));
 	    if (ustr == NULL)
 		Error("Unable to realloc %d * %d bytes in cgroup", ustr_alloc,
 		      sizeof(SEXP));
 	}
 	SET_TRLEN(s, -1);
 	ustr[ustr_n++] = s;
     }
     // TO DO: the same string in different encodings will be
     // considered different here. Sweep through ustr and merge counts
     // where equal (sort needed therefore, unfortunately?, only if
     // there are any marked encodings present)
     int cumsum = 0;
     for (int i = 0; i < ustr_n; i++) {      // 0.000
 	push(-TRLEN(ustr[i]));
 	SET_TRLEN(ustr[i], cumsum += -TRLEN(ustr[i]));
     }
     int *target = (o[0] != -1) ? newo : o;
     for (int i = n - 1; i >= 0; i--) {
 	SEXP s = x[i];           // 0.400 (page fetches on string cache)
 	int k = TRLEN(s) - 1;
 	SET_TRLEN(s, k);
 	target[k] = i + 1;      // 0.800 (random access to o)
     }
     // The cummulate meant counts are left non zero, so reset for next
     // time (0.00s).
     for (int i = 0; i < ustr_n; i++)
 	SET_TRLEN(ustr[i], 0);
     ustr_n = 0;
 }

 static int *csort_otmp = NULL, csort_otmp_alloc = 0;
 static void alloc_csort_otmp(int n)
 {
     if (csort_otmp_alloc >= n)
 	return;
     csort_otmp = (int *) realloc(csort_otmp, n * sizeof(int));
     if (csort_otmp == NULL)
 	Error
 	    ("Failed to allocate working memory for csort_otmp. Requested %d * %d bytes",
 	     n, sizeof(int));
     csort_otmp_alloc = n;
 }

 static void csort(SEXP * x, int *o, int n)
 /*
    As icount :
    Places the ordering into o directly, overwriting whatever was there
    Doesn't change x
    Pushes group sizes onto stack
    Requires csort_pre() to have created and sorted ustr already
 */
 {
     /* can't use otmp, since iradix might be called here and that uses
        otmp (and xtmp).  alloc_csort_otmp(n) is called from do_radixsort for
        either n=nrow if 1st arg, or n=maxgrpn if onwards args */
     for (int i = 0; i < n; i++)
 	csort_otmp[i] = (x[i] == NA_STRING) ? NA_INTEGER : -TRLEN(x[i]);
     if (nalast == 0 && n == 2) {
         // special case for nalast == 0. n == 1 is handled inside
         // do_radixsort. at least 1 will be NA here else use o from caller
         // directly (not 1st arg)
         if (o[0] == -1)
             for (int i = 0; i < n; i++)
                 o[i] = i + 1;
         for (int i = 0;  i < n; i++)
             if (csort_otmp[i] == NA_INTEGER)
                 o[i] = 0;
         push(1); push(1);
         return;
     }
     if (n < N_SMALL && nalast != 0) { // TO DO: calibrate() N_SMALL=200
         if (o[0] == -1)
             for (int i = 0; i < n; i++)
                 o[i] = i + 1;
         // else use o from caller directly (not 1st arg)
         for (int i = 0; i < n; i++)
             csort_otmp[i] = icheck(csort_otmp[i]);
         iinsert(csort_otmp, o, n);
     } else {
 	setRange(csort_otmp, n);
 	if (range == NA_INTEGER)
 	    Error("Internal error. csort's otmp contains all-NA");
 	int *target = (o[0] != -1) ? newo : o;
 	if (range <= N_RANGE)
 	    // TO DO: calibrate(). radix was faster (9.2s
 	    // "range<=10000" instead of 11.6s "range<=N_RANGE &&
 	    // range<n") for run(7) where range=N_RANGE n=10000000
 	    icount(csort_otmp, target, n);
 	else
 	    iradix(csort_otmp, target, n);
     }
     // all i* push onto stack. Using their counts may be faster here
     // than thrashing SEXP fetches over several passes as cgroup does
     // (but cgroup needs that to keep orginal order, and cgroup saves
     // the sort in csort_pre).
 }

 static void csort_pre(SEXP * x, int n)
 // Finds ustr and sorts it.  Runs once for each arg (if
 // sortStr == TRUE), then ustr is used by csort within each group ustr
 // is grown on each character arg, to save sorting the same strings
 // again if several args contain the same strings
 {
     SEXP s;
     int old_un, new_un;
     // savetl_init() is called once at the start of do_radixsort
     old_un = ustr_n;
     for (int i = 0; i < n; i++) {
 	s = x[i];
 	// this case first as it's the most frequent. Already in ustr,
 	// this negative is its ordering.
 	if (TRLEN(s) < 0)
 	    continue;
 	// Save any of R's own usage of tl (assumed positive, so we
 	// can both count and save in one scan), to restore
 	// afterwards. From R 2.14.0, tl is initialized to 0, prior to
 	// that it was random so this step saved too much.
 	if (TRLEN(s) > 0) {
 	    savetl(s);
 	    SET_TRLEN(s, 0);
 	}
 	if (ustr_alloc <= ustr_n) {
 	    // 10000 = 78k of 8byte pointers. Small initial guess,
 	    // negligible time to alloc.
 	    ustr_alloc = (ustr_alloc == 0) ? 10000 : ustr_alloc*2;
 	    if (ustr_alloc > old_un+n)
 		ustr_alloc = old_un + n;
 	    ustr = realloc(ustr, ustr_alloc * sizeof(SEXP));
 	    if (ustr == NULL)
 		Error("Failed to realloc ustr. Requested %d * %d bytes",
 		      ustr_alloc, sizeof(SEXP));
 	}
 	SET_TRLEN(s, -1);  // this -1 will become its ordering later below
 	ustr[ustr_n++] = s;
 	// length on CHARSXP is the nchar of char * (excluding \0),
 	// and treats marked encodings as if ascii.
 	if (s != NA_STRING && LENGTH(s) > maxlen)
 	    maxlen = LENGTH(s);
     }
     new_un = ustr_n;
     if (new_un == old_un)
 	return;
     // No new strings observed, seen them all before in previous
     // arg. ustr already sufficient.  If we ever make ustr
     // permanently held by data.table, we'll just need to make the
     // final loop to set -i-1 before returning here.  sort ustr.

     // TODO: just sort new ones and merge them in.  These allocs are
     // here, to save them being in the recursive cradix_r()
     if (cradix_counts_alloc < maxlen) {
 	cradix_counts_alloc = maxlen + 10;   // +10 to save too many reallocs
 	cradix_counts = (int *)realloc(cradix_counts,
 				       cradix_counts_alloc * 256 * sizeof(int));
 	if (!cradix_counts)
 	    Error("Failed to alloc cradix_counts");
 	memset(cradix_counts, 0, cradix_counts_alloc * 256 * sizeof(int));
     }
     if (cradix_xtmp_alloc < ustr_n) {
         cradix_xtmp = (SEXP *) realloc(cradix_xtmp,  ustr_n * sizeof(SEXP));
         // TO DO: Reuse the one we have in do_radixsort.
         // Does it need to be n length?
         if (!cradix_xtmp)
             Error("Failed to alloc cradix_tmp");
         cradix_xtmp_alloc = ustr_n;
     }
     // sorts ustr in-place by reference save ordering in the
     // CHARSXP. negative so as to distinguish with R's own usage.
     cradix_r(ustr, ustr_n, 0);
     for (int i = 0; i < ustr_n; i++)
 	SET_TRLEN(ustr[i], -i - 1);
 }

 // functions to test vectors for sortedness: isorted, dsorted and csorted

 // base:is.unsorted returns NA in the presence of any NA, but we need
 // to consider na.last, and we also return -1 if x is sorted in
 // _strictly_ reverse order; a common case we optimize.  If a vector
 // is in decreasing order *with ties*, then an in-place reverse (no
 // sort) would result in instability of ties, so we are strict. We
 // also save grouping information during the check; that information
 // is required when sorting by multiple arguments.

 // TO DO: test in big steps first to return faster if unsortedness is
 // at the end (a common case of rbind'ing data to end) These are all
 // sequential access to x, so very quick and cache efficient.

 // order = 1 is ascending and order=-1 is descending; also takes care
 // of na.last argument with check through 'icheck' Relies on
 // NA_INTEGER == INT_MIN, checked in init.c
 static int isorted(int *x, int n)
 {
     int i = 1, j = 0;
     // when nalast = NA,
     // all NAs ? return special value to replace all o's values with '0'
     // any NAs ? return 0 = unsorted and leave it
     //   to sort routines to replace o's with 0's
     // no NAs ? continue to check rest of isorted - the same routine as usual
     if (nalast == 0) {
 	for (int k = 0; k < n; k++)
 	    if (x[k] != NA_INTEGER)
 		j++;
 	if (j == 0) {
 	    push(n);
 	    return (-2);
 	}
 	if (j != n)
 	    return (0);
     }
     if (n <= 1) {
 	push(n);
 	return (1);
     }
     if (icheck(x[1]) < icheck(x[0])) {
 	i = 2;
 	while (i < n && icheck(x[i]) < icheck(x[i - 1]))
 	    i++;
 	// strictly opposite to expected 'order', no ties;
 	if (i == n) {
 	    mpush(1, n);
 	    return (-1);
 	}
 	// e.g. no more than one NA at the beginning/end (for order=-1/1)
 	else return (0);
     }
     int old = gsngrp[flip];
     int tt = 1;
     for (int i = 1; i < n; i++) {
 	if (icheck(x[i]) < icheck(x[i - 1])) {
 	    gsngrp[flip] = old;
 	    return (0);
 	}
 	if (x[i] == x[i - 1])
 	    tt++;
 	else {
 	    push(tt); tt = 1;
 	}
     }
     push(tt);
     // same as 'order', NAs at the beginning for order=1, at end for
     // order=-1, possibly with ties
     return(1);
 }

 // order=1 is ascending and -1 is descending
 // also accounts for nalast=0 (=NA), =1 (TRUE), -1 (FALSE) (in twiddle)
 static int dsorted(double *x, int n)
 {
     int i = 1, j = 0;
     unsigned long long prev, this;
     if (nalast == 0) {
 	// when nalast = NA,
 	// all NAs ? return special value to replace all o's values with '0'
 	// any NAs ? return 0 = unsorted and leave it to sort routines to
 	//           replace o's with 0's
 	// no NAs  ? continue to check the rest of isorted -
 	//           the same routine as usual
 	for (int k = 0; k < n; k++)
 	    if (!is_nan(x, k))
 		j++;
 	if (j == 0) {
 	    push(n);
 	    return (-2);
 	}
 	if (j != n)
 	    return (0);
     }
     if (n <= 1) {
 	push(n);
 	return (1);
     }
     prev = twiddle(x, 0, order);
     this = twiddle(x, 1, order);
     if (this < prev) {
 	i = 2;
 	prev = this;
 	while (i < n && (this = twiddle(x, i, order)) < prev) {
 	    i++;
 	    prev = this;
 	}
 	if (i == n) {
 	    mpush(1, n);
 	    return (-1);
 	}
 	// strictly opposite of expected 'order', no ties; e.g. no
 	// more than one NA at the beginning/end (for order=-1/1)

 	// TO DO: improve to be stable for ties in reverse
 	else return(0);
     }
     int old = gsngrp[flip];
     int tt = 1;
     for (int i = 1; i < n; i++) {
 	// TO DO: once we get past -Inf, NA and NaN at the bottom, and
 	//        +Inf at the top, the middle only need be twiddled
 	//        for tolerance (worth it?)
 	this = twiddle(x, i, order);
 	if (this < prev) {
 	    gsngrp[flip] = old;
 	    return (0);
 	}
 	if (this == prev)
 	    tt++;
 	else {
 	    push(tt);
 	    tt = 1;
 	}
 	prev = this;
     }
     push(tt);
     // exactly as expected in 'order' (1=increasing, -1=decreasing),
     // possibly with ties
     return (1);
 }

 // order=1 is ascending and -1 is descending
 // also accounts for nalast=0 (=NA), =1 (TRUE), -1 (FALSE)
 static int csorted(SEXP *x, int n)
 {
     int i = 1, j = 0, tmp;
     if (nalast == 0) {
 	// when nalast = NA,
 	// all NAs ? return special value to replace all o's values with '0'
 	// any NAs ? return 0 = unsorted and leave it to sort routines
 	//           to replace o's with 0's
 	// no NAs  ? continue to check the rest of isorted -
 	//           the same routine as usual
 	for (int k = 0; k < n; k++)
 	    if (x[k] != NA_STRING)
 		j++;
 	if (j == 0) {
 	    push(n);
 	    return (-2);
 	}
 	if (j != n)
 	    return (0);
     }
     if (n <= 1) {
 	push(n);
 	return (1);
     }
     if (StrCmp2(x[1], x[0]) < 0) {
 	i = 2;
 	while (i < n && StrCmp2(x[i], x[i - 1]) < 0)
 	    i++;
 	if (i == n) {
 	    mpush(1, n);
 	    return (-1);
 	}
 	// strictly opposite of expected 'order', no ties;
 	// e.g. no more than one NA at the beginning/end (for order=-1/1)
 	else
 	    return (0);
     }
     int old = gsngrp[flip];
     int tt = 1;
     for (int i = 1; i < n; i++) {
 	tmp = StrCmp2(x[i], x[i - 1]);
 	if (tmp < 0) {
 	    gsngrp[flip] = old;
 	    return (0);
 	}
 	if (tmp == 0)
 	    tt++;
 	else {
 	    push(tt);
 	    tt = 1;
 	}
     }
     push(tt);
     // exactly as expected in 'order', possibly with ties
     return (1);
 }

 static void isort(int *x, int *o, int n)
 {
     if (n <= 2) {
 	// nalast = 0 and n == 2 (check bottom of this file for explanation)
 	if (nalast == 0 && n == 2) {
 	    if (o[0] == -1) {
 		o[0] = 1;
 		o[1] = 2;
 	    }
 	    for (int i = 0; i < n; i++)
 		if (x[i] == NA_INTEGER)
 		    o[i] = 0;
 	    push(1); push(1);
 	    return;
 	} else Error("Internal error: isort received n=%d. isorted should have dealt with this (e.g. as a reverse sorted vector) already",n);
     }
     if (n < N_SMALL && o[0] != -1 && nalast != 0) {
         // see comment above in iradix_r on N_SMALL=200.
         /* if not o[0] then can't just populate with 1:n here, since x
            is changed by ref too (so would need to be copied). */
         /* pushes inside too. Changes x and o by reference, so not
            suitable in first arg when o hasn't been populated yet
            and x is an actual argument (hence check on o[0]). */
         if (order != 1 || nalast != -1)
             // so that default case, i.e., order=1, nalast=FALSE will
             // not be affected (ex: `setkey`)
             for (int i = 0; i < n; i++)
                 x[i] = icheck(x[i]);
         iinsert(x, o, n);
     } else {
         /* Tighter range (e.g. copes better with a few abormally large
            values in some groups), but also, when setRange was once at
            arg level that caused an extra scan of (long) x
            first. 10,000 calls to setRange takes just 0.04s
            i.e. negligible. */
         setRange(x, n);
         if (range == NA_INTEGER)
             Error("Internal error: isort passed all-NA. isorted should have caught this before this point");
         int *target = (o[0] != -1) ? newo : o;
         // was range < 10000 for subgroups, but 1e5 for the first
         // arg, tried to generalise here.  1e4 rather than 1e5 here
         // because iterated was (thisgrpn < 200 || range > 20000) then
         // radix a short vector with large range can bite icount when
         // iterated (BLOCK 4 and 6)
         if (range <= N_RANGE && range <= n) {
             icount(x, target, n);
         } else {
             iradix(x, target, n);
         }
     }
 }

 static void dsort(double *x, int *o, int n)
 {
     if (n <= 2) {
 	if (nalast == 0 && n == 2) {
 	    // don't have to twiddle here.. at least one will be NA
 	    // and 'n' WILL BE 2.
 	    if (o[0] == -1) {
 		o[0] = 1;
 		o[1] = 2;
 	    }
 	    for (int i = 0; i < n; i++)
 		if (is_nan(x, i))
 		    o[i] = 0;
 	    push(1); push(1);
 	    return;
 	}
 	Error("Internal error: dsort received n=%d. dsorted should have dealt with this (e.g. as a reverse sorted vector) already",n);
     }
     if (n < N_SMALL && o[0] != -1 && nalast != 0) {
 	// see comment above in iradix_r re N_SMALL=200,  and isort for o[0]
 	for (int i = 0; i < n; i++)
 	    ((unsigned long long *)x)[i] = twiddle(x, i, order);
 	// have to twiddle here anyways, can't speed up default case
 	// like in isort
 	dinsert((unsigned long long *)x, o, n);
     } else {
 	dradix((unsigned char *) x, (o[0] != -1) ? newo : o, n);
     }
 }

 SEXP attribute_hidden do_radixsort(SEXP call, SEXP op, SEXP args, SEXP rho)
 {
     int n = -1, narg = 0, ngrp, tmp, *osub, thisgrpn;
     R_xlen_t nl = n;
     Rboolean isSorted = TRUE, retGrp;
     void *xd;
     int *o = NULL;

     /* ML: FIXME: Here are just two of the dangerous assumptions here */
     if (sizeof(int) != 4) {
         error("radix sort assumes sizeof(int) == 4");
     }
     if (sizeof(double) != 8) {
         error("radix sort assumes sizeof(double) == 8");
     }

     nalast = (asLogical(CAR(args)) == NA_LOGICAL) ? 0 :
 	(asLogical(CAR(args)) == TRUE) ? 1 : -1; // 1=TRUE, -1=FALSE, 0=NA
     args = CDR(args);
     SEXP decreasing = CAR(args);
     args = CDR(args);

     /* If TRUE, return starts of runs of identical values + max group size. */
     retGrp = asLogical(CAR(args));
     args = CDR(args);

     /* If FALSE, get order of strings in appearance order. Essentially
        abuses the CHARSXP table to group strings without hashing
        them. Only makes sense when retGrp=TRUE.
     */
     sortStr = asLogical(CAR(args));
     args = CDR(args);

     /* When grouping, we round off doubles to account for imprecision */
     setNumericRounding(retGrp ? 2 : 0);

     if (args == R_NilValue)
 	return R_NilValue;
     if (isVector(CAR(args)))
 	nl = XLENGTH(CAR(args));
     for (SEXP ap = args; ap != R_NilValue; ap = CDR(ap), narg++) {
 	if (!isVector(CAR(ap)))
 	    error(_("argument %d is not a vector"), narg + 1);
         //Rprintf("%d, %d\n", XLENGTH(CAR(ap)), nl);
 	if (XLENGTH(CAR(ap)) != nl)
 	    error(_("argument lengths differ"));
     }

     if (narg != length(decreasing))
 	error(_("length(decreasing) must match the number of order arguments"));
     for (int i = 0; i < narg; i++) {
 	if (LOGICAL(decreasing)[i] == NA_LOGICAL)
 	    error(_("'decreasing' elements must be TRUE or FALSE"));
     }
     order = asLogical(decreasing) ? -1 : 1;

     SEXP x = CAR(args);
     args = CDR(args);

     // (ML) FIXME: need to support long vectors
     if (nl > INT_MAX) {
 	error(_("long vectors not supported"));
     }
     n = (int) nl;

     // upper limit for stack size (all size 1 groups). We'll detect
     // and avoid that limit, but if just one non-1 group (say 2), that
     // can't be avoided.
     gsmaxalloc = n;

     // once for the result, needs to be length n.

     // TO DO: save allocation if NULL is returned (isSorted = =TRUE) so
     // [i|c|d]sort know they can populate o directly with no working
     // memory needed to reorder existing order had to repace this from
     // '0' to '-1' because 'nalast = 0' replace 'o[.]' with 0 values.

     SEXP ans = PROTECT(allocVector(INTSXP, n));
     o = INTEGER(ans);
     if (n > 0)
 	o[0] = -1;
     xd = DATAPTR(x);

     stackgrps = narg > 1 || retGrp;

     if (TYPEOF(x) == STRSXP) {
         checkEncodings(x);
     }

     savetl_init();   // from now on use Error not error.

     switch (TYPEOF(x)) {
     case INTSXP:
     case LGLSXP:
 	tmp = isorted(xd, n);
 	break;
     case REALSXP :
 	twiddle = &dtwiddle;
 	is_nan  = &dnan;
 	tmp = dsorted(xd, n);
 	break;
     case STRSXP :
 	tmp = csorted(xd, n);
 	break;
     default :
         Error("First arg is type '%s', not yet supported",
               type2char(TYPEOF(x)));
     }
     if (tmp) {
 	// -1 or 1. NEW: or -2 in case of nalast == 0 and all NAs
 	if (tmp == 1) {
 	    // same as expected in 'order' (1 = increasing, -1 = decreasing)
 	    isSorted = TRUE;
 	    for (int i = 0; i < n; i++)
 		o[i] = i + 1;
 	} else if (tmp == -1) {
 	    // -1 (or -n for result of strcmp), strictly opposite to
 	    // -expected 'order'
 	    isSorted = FALSE;
 	    for (int i = 0; i < n; i++)
 		o[i] = n - i;
 	} else if (nalast == 0 && tmp == -2) {
 	    // happens only when nalast=NA/0. Means all NAs, replace
 	    // with 0's therefore!
 	    isSorted = FALSE;
 	    for (int i = 0; i < n; i++)
 		o[i] = 0;
 	}
     } else {
 	isSorted = FALSE;
 	switch (TYPEOF(x)) {
 	case INTSXP:
 	case LGLSXP:
 	    isort(xd, o, n);
 	    break;
 	case REALSXP :
 	    dsort(xd, o, n);
 	    break;
 	case STRSXP :
 	    if (sortStr) {
 		csort_pre(xd, n);
 		alloc_csort_otmp(n);
 		csort(xd, o, n);
 	    } else
 		cgroup(xd, o, n);
 	    break;
 	default:
 	    Error
 		("Internal error: previous default should have caught unsupported type");
 	}
     }

     int maxgrpn = gsmax[flip];   // biggest group in the first arg
     void *xsub = NULL;           // local
     int (*f) ();
     void (*g) ();

     if (narg > 1 && gsngrp[flip] < n) {
         // double is the largest type, 8
         xsub = (void *) malloc(maxgrpn * sizeof(double));
         if (xsub == NULL)
             Error("Couldn't allocate xsub in do_radixsort, requested %d * %d bytes.",
                   maxgrpn, sizeof(double));
         // global variable, used by isort, dsort, sort and cgroup
         newo = (int *) malloc(maxgrpn * sizeof(int));
         if (newo == NULL)
             Error("Couldn't allocate newo in do_radixsort, requested %d * %d bytes.",
                   maxgrpn, sizeof(int));
     }

     for (int col = 2; col <= narg; col++) {
 	x = CAR(args);
 	args = CDR(args);
 	xd = DATAPTR(x);
 	ngrp = gsngrp[flip];
 	if (ngrp == n && nalast != 0)
 	    break;
 	flipflop();
 	stackgrps = col != narg || retGrp;
 	order = LOGICAL(decreasing)[col - 1] ? -1 : 1;
 	switch (TYPEOF(x)) {
 	case INTSXP:
 	case LGLSXP:
 	    f = &isorted;
 	    g = &isort;
 	    break;
 	case REALSXP:
 	    twiddle = &dtwiddle;
 	    is_nan = &dnan;
 	    f = &dsorted;
 	    g = &dsort;
 	    break;
 	case STRSXP:
 	    f = &csorted;
 	    if (sortStr) {
 		csort_pre(xd, n);
 		alloc_csort_otmp(gsmax[1 - flip]);
 		g = &csort;
 	    }
 	    // no increasing/decreasing order required if sortStr = FALSE,
 	    // just a dummy argument
 	    else
 		g = &cgroup;
 	    break;
 	default:
 	    Error("Arg %d is type '%s', not yet supported",
 		  col, type2char(TYPEOF(x)));
 	}
 	int i = 0;
 	for (int grp = 0; grp < ngrp; grp++) {
 	    thisgrpn = gs[1 - flip][grp];
 	    if (thisgrpn == 1) {
 		if (nalast == 0) {
 		    // this edge case had to be taken care of
 		    // here.. (see the bottom of this file for
 		    // more explanation)
 		    switch (TYPEOF(x)) {
 		    case INTSXP:
 			if (INTEGER(x)[o[i] - 1] == NA_INTEGER) {
 			    isSorted = FALSE;
 			    o[i] = 0;
 			}
 			break;
 		    case LGLSXP:
 			if (LOGICAL(x)[o[i] - 1] == NA_LOGICAL) {
 			    isSorted = FALSE;
 			    o[i] = 0;
 			}
 			break;
 		    case REALSXP:
 			if (ISNAN(REAL(x)[o[i] - 1])) {
 			    isSorted = FALSE;
 			    o[i] = 0;
 			}
 			break;
 		    case STRSXP:
 			if (STRING_ELT(x, o[i] - 1) == NA_STRING) {
 			    isSorted = FALSE;
 			    o[i] = 0;
                         } break;
                     default :
                         Error("Internal error: previous default should have caught unsupported type");
                     }
                 }
                 i++;
                 push(1);
                 continue;
             }
             osub = o+i;
             // ** TO DO **: if isSorted, we can just point xsub
             //        into x directly. If (*f)() returns 0,
             //        though, will have to copy x at that point
             //        When doing this, xsub could be allocated at
             //        that point for the first time.
             if (TYPEOF(x) == STRSXP)
                 for (int j = 0; j < thisgrpn; j++)
                     ((SEXP *) xsub)[j] = ((SEXP *) xd)[o[i++] - 1];
             else if (TYPEOF(x) == REALSXP)
                 for (int j = 0; j < thisgrpn; j++)
                     ((double *) xsub)[j] = ((double *) xd)[o[i++] - 1];
             else
                 for (int j = 0; j < thisgrpn; j++)
                     ((int *) xsub)[j] = ((int *) xd)[o[i++] - 1];

             // continue; // BASELINE short circuit timing
             // point. Up to here is the cost of creating xsub.
             // [i|d|c]sorted(); very low cost, sequential
             tmp = (*f)(xsub, thisgrpn);
             if (tmp) {
                 // *sorted will have already push()'d the groups
                 if (tmp == -1) {
 		    isSorted = FALSE;
 		    for (int k = 0; k < thisgrpn / 2; k++) {
 			// reverse the order in-place using no
 			// function call or working memory
 			// isorted only returns -1 for
 			// _strictly_ decreasing order,
 			// otherwise ties wouldn't be stable
 			tmp = osub[k];
 			osub[k] = osub[thisgrpn - 1 - k];
 			osub[thisgrpn - 1 - k] = tmp;
 		    }
 		} else if (nalast == 0 && tmp == -2) {
 		    // all NAs, replace osub[.] with 0s.
 		    isSorted = FALSE;
 		    for (int k = 0; k < thisgrpn; k++) osub[k] = 0;
 		}
 		continue;
 	    }
 	    isSorted = FALSE;
 	    // nalast=NA will result in newo[0] = 0. So had to change to -1.
 	    newo[0] = -1;
 	    // may update osub directly, or if not will put the
 	    // result in global newo
 	    (*g)(xsub, osub, thisgrpn);

 	    if (newo[0] != -1) {
 		if (nalast != 0)
 		    for (int j = 0; j < thisgrpn; j++)
 			// reuse xsub to reorder osub
 			((int *) xsub)[j] = osub[newo[j] - 1];
 		else
 		    for (int j = 0; j < thisgrpn; j++)
 			// final nalast case to handle!
 			((int *) xsub)[j] = (newo[j] == 0) ? 0 :
 			    osub[newo[j] - 1];
 		memcpy(osub, xsub, thisgrpn * sizeof(int));
 	    }
 	}
     }

     if (!sortStr && ustr_n != 0)
         Error("Internal error: at the end of do_radixsort sortStr == FALSE but ustr_n !=0 [%d]",
               ustr_n);
     for(int i = 0; i < ustr_n; i++)
         SET_TRLEN(ustr[i], 0);
     maxlen = 1;  // reset global. Minimum needed to count "" and NA
     ustr_n = 0;
     savetl_end();
     free(ustr);
     ustr = NULL;
     ustr_alloc = 0;

     if (retGrp) {
         int maxgrpn = NA_INTEGER;
         ngrp = gsngrp[flip];
         SEXP s_ends = install("ends");
         setAttrib(ans, s_ends, x = allocVector(INTSXP, ngrp));
         if (ngrp > 0) {
             INTEGER(x)[0] = gs[flip][0];
             for (int i = 1; i < ngrp; i++)
                 INTEGER(x)[i] = INTEGER(x)[i - 1] + gs[flip][i];
             maxgrpn = gsmax[flip];
         }
         SEXP s_maxgrpn = install("maxgrpn");
         setAttrib(ans, s_maxgrpn, ScalarInteger(maxgrpn));
         SEXP nms;
         PROTECT(nms = allocVector(STRSXP, 2));
         SET_STRING_ELT(nms, 0, mkChar("grouping"));
         SET_STRING_ELT(nms, 1, mkChar("integer"));
         setAttrib(ans, R_ClassSymbol, nms);
         UNPROTECT(1);
     }

     Rboolean dropZeros = !retGrp && !isSorted && nalast == 0;
     if (dropZeros) {
         int zeros = 0;
         for (int i = 0; i < n; i++) {
             if (o[i] == 0)
                 zeros++;
         }
         if (zeros > 0) {
             PROTECT(ans = allocVector(INTSXP, n - zeros));
             int *o2 = INTEGER(ans);
             for (int i = 0, i2 = 0; i < n; i++) {
                 if (o[i] > 0)
                     o2[i2++] = o[i];
             }
             UNPROTECT(1);
         }
     }

     gsfree();
     free(radix_xsub);          radix_xsub=NULL;    radix_xsuballoc=0;
     free(xsub); free(newo);    xsub=newo=NULL;
     free(xtmp);                xtmp=NULL;          xtmp_alloc=0;
     free(otmp);                otmp=NULL;          otmp_alloc=0;
     free(csort_otmp);          csort_otmp=NULL;    csort_otmp_alloc=0;

     free(cradix_counts);       cradix_counts=NULL; cradix_counts_alloc=0;
     free(cradix_xtmp);         cradix_xtmp=NULL;   cradix_xtmp_alloc=0;
     // TO DO: use xtmp already got

     UNPROTECT(1);
     return ans;
 }