blob: 8c281b8ea7ade7ac60632822086ce0a8d9b1e31e [file] [log] [blame]
/*
* R : A Computer Language for Statistical Data Analysis
* Copyright (C) 1997--2019 The R Core Team
* Copyright (C) 1995, 1996 Robert Gentleman and Ross Ihaka
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, a copy is available at
* https://www.R-project.org/Licenses/
*/
/* This is currently restricted to vectors of length < 2^30 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#define R_USE_SIGNALS 1
#include <Defn.h>
#include <Internal.h>
#define NIL -1
#define ARGUSED(x) LEVELS(x)
#define SET_ARGUSED(x,v) SETLEVELS(x,v)
/* interval at which to check interrupts */
#define NINTERRUPT 1000000
typedef size_t hlen;
/* Hash function and equality test for keys */
typedef struct _HashData HashData;
struct _HashData {
int K;
hlen M;
R_xlen_t nmax;
#ifdef LONG_VECTOR_SUPPORT
Rboolean isLong;
#endif
hlen (*hash)(SEXP, R_xlen_t, HashData *);
int (*equal)(SEXP, R_xlen_t, SEXP, R_xlen_t);
SEXP HashTable;
int nomatch;
Rboolean useUTF8;
Rboolean useCache;
};
#define HTDATA_INT(d) (INTEGER0((d)->HashTable))
#define HTDATA_DBL(d) (REAL0((d)->HashTable))
/*
Integer keys are hashed via a random number generator
based on Knuth's recommendations. The high order K bits
are used as the hash code.
NB: lots of this code relies on M being a power of two and
on silent integer overflow mod 2^32.
<FIXME> Integer keys are wasteful for logical and raw vectors, but
the tables are small in that case. It would be much easier to
implement long vectors, though.
*/
/* Currently the hash table is implemented as a (signed) integer
array. So there are two 31-bit restrictions, the length of the
array and the values. The values are initially NIL (-1). O-based
indices are inserted by isDuplicated, and invalidated by setting
to NA_INTEGER.
*/
static hlen scatter(unsigned int key, HashData *d)
{
return 3141592653U * key >> (32 - d->K);
}
static hlen lhash(SEXP x, R_xlen_t indx, HashData *d)
{
int xi = LOGICAL_ELT(x, indx);
if (xi == NA_LOGICAL) return 2U;
return (hlen) xi;
}
static R_INLINE hlen ihash(SEXP x, R_xlen_t indx, HashData *d)
{
int xi = INTEGER_ELT(x, indx);
if (xi == NA_INTEGER) return 0;
return scatter((unsigned int) xi, d);
}
/* We use unions here because Solaris gcc -O2 has trouble with
casting + incrementing pointers. We use tests here, but R currently
assumes int is 4 bytes and double is 8 bytes.
*/
union foo {
double d;
unsigned int u[2];
};
static R_INLINE hlen rhash(SEXP x, R_xlen_t indx, HashData *d)
{
/* There is a problem with signed 0s under IEC60559 */
double xi = REAL_ELT(x, indx);
double tmp = (xi == 0.0) ? 0.0 : xi;
/* need to use both 32-byte chunks or endianness is an issue */
/* we want all NaNs except NA equal, and all NAs equal */
if (R_IsNA(tmp)) tmp = NA_REAL;
else if (R_IsNaN(tmp)) tmp = R_NaN;
#if 2*SIZEOF_INT == SIZEOF_DOUBLE
{
union foo tmpu;
tmpu.d = tmp;
return scatter(tmpu.u[0] + tmpu.u[1], d);
}
#else
return scatter(*((unsigned int *) (&tmp)), d);
#endif
}
static Rcomplex unify_complex_na(Rcomplex z) {
Rcomplex ans;
ans.r = (z.r == 0.0) ? 0.0 : z.r;
ans.i = (z.i == 0.0) ? 0.0 : z.i;
if (R_IsNA(ans.r) || R_IsNA(ans.i))
ans.r = ans.i = NA_REAL;
else if (R_IsNaN(ans.r) || R_IsNaN(ans.i))
ans.r = ans.i = R_NaN;
return ans;
}
static hlen chash(SEXP x, R_xlen_t indx, HashData *d)
{
Rcomplex tmp = unify_complex_na(COMPLEX_ELT(x, indx));
#if 2*SIZEOF_INT == SIZEOF_DOUBLE
{
unsigned int u;
union foo tmpu;
tmpu.d = tmp.r;
u = tmpu.u[0] ^ tmpu.u[1];
tmpu.d = tmp.i;
u ^= tmpu.u[0] ^ tmpu.u[1];
return scatter(u, d);
}
#else
return scatter((*((unsigned int *)(&tmp.r)) ^
(*((unsigned int *)(&tmp.i)))), d);
#endif
}
/* Hash CHARSXP by address. Hash values are int, For 64bit pointers,
* we do (upper ^ lower) */
static R_INLINE hlen cshash(SEXP x, R_xlen_t indx, HashData *d)
{
intptr_t z = (intptr_t) STRING_ELT(x, indx);
unsigned int z1 = (unsigned int)(z & 0xffffffff), z2 = 0;
#if SIZEOF_LONG == 8
z2 = (unsigned int)(z/0x100000000L);
#endif
return scatter(z1 ^ z2, d);
}
static R_INLINE hlen shash(SEXP x, R_xlen_t indx, HashData *d)
{
unsigned int k;
const char *p;
if(!d->useUTF8 && d->useCache) return cshash(x, indx, d);
const void *vmax = vmaxget();
/* Not having d->useCache really should not happen anymore. */
p = translateCharUTF8(STRING_ELT(x, indx));
k = 0;
while (*p++)
k = 11 * k + (unsigned int) *p; /* was 8 but 11 isn't a power of 2 */
vmaxset(vmax); /* discard any memory used by translateChar */
return scatter(k, d);
}
static int lequal(SEXP x, R_xlen_t i, SEXP y, R_xlen_t j)
{
if (i < 0 || j < 0) return 0;
return (LOGICAL_ELT(x, i) == LOGICAL_ELT(y, j));
}
static R_INLINE int iequal(SEXP x, R_xlen_t i, SEXP y, R_xlen_t j)
{
if (i < 0 || j < 0) return 0;
return (INTEGER_ELT(x, i) == INTEGER_ELT(y, j));
}
/* BDR 2002-1-17 We don't want NA and other NaNs to be equal */
static R_INLINE int requal(SEXP x, R_xlen_t i, SEXP y, R_xlen_t j)
{
if (i < 0 || j < 0) return 0;
double xi = REAL_ELT(x, i);
double yj = REAL_ELT(y, j);
if (!ISNAN(xi) && !ISNAN(yj))
return (xi == yj);
else if (R_IsNA(xi) && R_IsNA(yj)) return 1;
else if (R_IsNaN(xi) && R_IsNaN(yj)) return 1;
else return 0;
}
/* This is differentiating {NA,1}, {NA,0}, {NA, NaN}, {NA, NA},
* but R's print() and format() render all as "NA" */
static int cplx_eq(Rcomplex x, Rcomplex y)
{
if (!ISNAN(x.r) && !ISNAN(x.i) && !ISNAN(y.r) && !ISNAN(y.i))
return x.r == y.r && x.i == y.i;
else if (R_IsNA(x.r) || R_IsNA(x.i)) // x is NA
return (R_IsNA(y.r) || R_IsNA(y.i)) ? 1 : 0;
else if (R_IsNA(y.r) || R_IsNA(y.i)) // y is NA but x is not
return 0;
// else : none is NA but there's at least one NaN; hence ISNAN(.) == R_IsNaN(.)
return
(((ISNAN(x.r) && ISNAN(y.r)) || (!ISNAN(x.r) && !ISNAN(y.r) && x.r == y.r)) && // Re
((ISNAN(x.i) && ISNAN(y.i)) || (!ISNAN(x.i) && !ISNAN(y.i) && x.i == y.i)) // Im
) ? 1 : 0;
}
static int cequal(SEXP x, R_xlen_t i, SEXP y, R_xlen_t j)
{
if (i < 0 || j < 0) return 0;
return cplx_eq(COMPLEX_ELT(x, i), COMPLEX_ELT(y, j));
}
static R_INLINE int sequal(SEXP x, R_xlen_t i, SEXP y, R_xlen_t j)
{
if (i < 0 || j < 0) return 0;
SEXP xi = STRING_ELT(x, i);
SEXP yj = STRING_ELT(y, j);
/* Two strings which have the same address must be the same,
so avoid looking at the contents */
if (xi == yj) return 1;
/* Then if either is NA the other cannot be */
/* Once all CHARSXPs are cached, Seql will handle this */
if (xi == NA_STRING || yj == NA_STRING)
return 0;
/* another pre-test to avoid the call to Seql */
if (IS_CACHED(xi) && IS_CACHED(yj) && ENC_KNOWN(xi) == ENC_KNOWN(yj))
return 0;
return Seql(xi, yj);
}
static hlen rawhash(SEXP x, R_xlen_t indx, HashData *d)
{
return (hlen) RAW_ELT(x, indx);
}
static int rawequal(SEXP x, R_xlen_t i, SEXP y, R_xlen_t j)
{
if (i < 0 || j < 0) return 0;
return (RAW_ELT(x, i) == RAW_ELT(y, j));
}
static hlen vhash(SEXP x, R_xlen_t indx, HashData *d)
{
int i;
unsigned int key;
SEXP _this = VECTOR_ELT(x, indx);
key = OBJECT(_this) + 2*TYPEOF(_this) + 100U*(unsigned int) length(_this);
/* maybe we should also look at attributes, but that slows us down */
switch (TYPEOF(_this)) {
case LGLSXP:
/* This is not too clever: pack into 32-bits and then scatter? */
for(i = 0; i < LENGTH(_this); i++) {
key ^= lhash(_this, i, d);
key *= 97;
}
break;
case INTSXP:
for(i = 0; i < LENGTH(_this); i++) {
key ^= ihash(_this, i, d);
key *= 97;
}
break;
case REALSXP:
for(i = 0; i < LENGTH(_this); i++) {
key ^= rhash(_this, i, d);
key *= 97;
}
break;
case CPLXSXP:
for(i = 0; i < LENGTH(_this); i++) {
key ^= chash(_this, i, d);
key *= 97;
}
break;
case STRSXP:
for(i = 0; i < LENGTH(_this); i++) {
key ^= shash(_this, i, d);
key *= 97;
}
break;
case RAWSXP:
for(i = 0; i < LENGTH(_this); i++) {
key ^= scatter((unsigned int)rawhash(_this, i, d), d);
key *= 97;
}
break;
case VECSXP:
for(i = 0; i < LENGTH(_this); i++) {
key ^= vhash(_this, i, d);
key *= 97;
}
break;
default:
break;
}
return scatter(key, d);
}
static int vequal(SEXP x, R_xlen_t i, SEXP y, R_xlen_t j)
{
if (i < 0 || j < 0) return 0;
return R_compute_identical(VECTOR_ELT(x, i), VECTOR_ELT(y, j), 0);
}
/*
Choose M to be the smallest power of 2
not less than 2*n and set K = log2(M).
Need K >= 1 and hence M >= 2, and 2^M < 2^31-1, hence n <= 2^29.
Dec 2004: modified from 4*n to 2*n, since in the worst case we have
a 50% full table, and that is still rather efficient -- see
R. Sedgewick (1998) Algorithms in C++ 3rd edition p.606.
*/
static void MKsetup(R_xlen_t n, HashData *d, R_xlen_t nmax)
{
#ifdef LONG_VECTOR_SUPPORT
/* M = 2^32 is safe, hence n <= 2^31 -1 */
if(n < 0) /* protect against overflow to -ve */
error(_("length %d is too large for hashing"), n);
#else
if(n < 0 || n >= 1073741824) /* protect against overflow to -ve */
error(_("length %d is too large for hashing"), n);
#endif
if (nmax != NA_INTEGER && nmax != 1) n = nmax;
size_t n2 = 2U * (size_t) n;
d->M = 2;
d->K = 1;
while (d->M < n2) {
d->M *= 2;
d->K++;
}
d->nmax = n;
}
#define IMAX 4294967296L
static void HashTableSetup(SEXP x, HashData *d, R_xlen_t nmax)
{
d->useUTF8 = FALSE;
d->useCache = TRUE;
switch (TYPEOF(x)) {
case LGLSXP:
d->hash = lhash;
d->equal = lequal;
d->nmax = d->M = 4;
d->K = 2; /* unused */
break;
case INTSXP:
{
d->hash = ihash;
d->equal = iequal;
#ifdef LONG_VECTOR_SUPPORT
R_xlen_t nn = XLENGTH(x);
if (nn > IMAX) nn = IMAX;
MKsetup(nn, d, nmax);
#else
MKsetup(LENGTH(x), d, nmax);
#endif
}
break;
case REALSXP:
d->hash = rhash;
d->equal = requal;
MKsetup(XLENGTH(x), d, nmax);
break;
case CPLXSXP:
d->hash = chash;
d->equal = cequal;
MKsetup(XLENGTH(x), d, nmax);
break;
case STRSXP:
d->hash = shash;
d->equal = sequal;
MKsetup(XLENGTH(x), d, nmax);
break;
case RAWSXP:
d->hash = rawhash;
d->equal = rawequal;
d->nmax = d->M = 256;
d->K = 8; /* unused */
break;
case VECSXP:
d->hash = vhash;
d->equal = vequal;
MKsetup(XLENGTH(x), d, nmax);
break;
default:
UNIMPLEMENTED_TYPE("HashTableSetup", x);
}
#ifdef LONG_VECTOR_SUPPORT
d->isLong = IS_LONG_VEC(x);
if (d->isLong) {
d->HashTable = allocVector(REALSXP, (R_xlen_t) d->M);
for (R_xlen_t i = 0; i < d->M; i++) HTDATA_DBL(d)[i] = NIL;
} else
#endif
{
d->HashTable = allocVector(INTSXP, (R_xlen_t) d->M);
for (R_xlen_t i = 0; i < d->M; i++) HTDATA_INT(d)[i] = NIL;
}
}
/* Open address hashing */
/* Collision resolution is by linear probing */
/* The table is guaranteed large so this is sufficient */
static int isDuplicated(SEXP x, R_xlen_t indx, HashData *d)
{
#ifdef LONG_VECTOR_SUPPORT
if (d->isLong) {
double *h = HTDATA_DBL(d);
hlen i = d->hash(x, indx, d);
while (h[i] != NIL) {
if (d->equal(x, (R_xlen_t) h[i], x, indx))
return h[i] >= 0 ? 1 : 0;
i = (i + 1) % d->M;
}
if (d->nmax-- < 0) error("hash table is full");
h[i] = (double) indx;
} else
#endif
{
int *h = HTDATA_INT(d);
hlen i = d->hash(x, indx, d);
while (h[i] != NIL) {
if (d->equal(x, h[i], x, indx))
return h[i] >= 0 ? 1 : 0;
i = (i + 1) % d->M;
}
if (d->nmax-- < 0) error("hash table is full");
h[i] = (int) indx;
}
return 0;
}
static void removeEntry(SEXP table, SEXP x, R_xlen_t indx, HashData *d)
{
#ifdef LONG_VECTOR_SUPPORT
if (d->isLong) {
double *h = HTDATA_DBL(d);
hlen i = d->hash(x, indx, d);
while (h[i] >= 0) {
if (d->equal(table, (R_xlen_t) h[i], x, indx)) {
h[i] = NA_INTEGER; /* < 0, only index values are inserted */
return;
}
i = (i + 1) % d->M;
}
} else
#endif
{
int *h = HTDATA_INT(d);
hlen i = d->hash(x, indx, d);
while (h[i] >= 0) {
if (d->equal(table, h[i], x, indx)) {
h[i] = NA_INTEGER; /* < 0, only index values are inserted */
return;
}
i = (i + 1) % d->M;
}
}
}
#define DUPLICATED_INIT \
HashData data; \
HashTableSetup(x, &data, nmax); \
if(TYPEOF(x) == STRSXP) { \
data.useUTF8 = FALSE; data.useCache = TRUE; \
for(i = 0; i < n; i++) { \
if(IS_BYTES(STRING_ELT(x, i))) { \
data.useUTF8 = FALSE; break; \
} \
if(ENC_KNOWN(STRING_ELT(x, i))) { \
data.useUTF8 = TRUE; \
} \
if(!IS_CACHED(STRING_ELT(x, i))) { \
data.useCache = FALSE; break; \
} \
} \
}
/* used in scan() */
SEXP duplicated(SEXP x, Rboolean from_last)
{
SEXP ans;
int *v, nmax = NA_INTEGER;
if (!isVector(x)) error(_("'duplicated' applies only to vectors"));
R_xlen_t i, n = XLENGTH(x);
DUPLICATED_INIT;
PROTECT(data.HashTable);
PROTECT(ans = allocVector(LGLSXP, n));
v = LOGICAL(ans);
if(from_last)
for (i = n-1; i >= 0; i--) {
// if ((i+1) % NINTERRUPT == 0) R_CheckUserInterrupt();
v[i] = isDuplicated(x, i, &data);
}
else
for (i = 0; i < n; i++) {
// if ((i+1) % NINTERRUPT == 0) R_CheckUserInterrupt();
v[i] = isDuplicated(x, i, &data);
}
UNPROTECT(2);
return ans;
}
static SEXP Duplicated(SEXP x, Rboolean from_last, int nmax)
{
SEXP ans;
int *v;
if (!isVector(x)) error(_("'duplicated' applies only to vectors"));
R_xlen_t i, n = XLENGTH(x);
DUPLICATED_INIT;
PROTECT(data.HashTable);
PROTECT(ans = allocVector(LGLSXP, n));
v = LOGICAL(ans);
if(from_last)
for (i = n-1; i >= 0; i--) {
// if ((i+1) % NINTERRUPT == 0) R_CheckUserInterrupt();
v[i] = isDuplicated(x, i, &data);
}
else
for (i = 0; i < n; i++) {
// if ((i+1) % NINTERRUPT == 0) R_CheckUserInterrupt();
v[i] = isDuplicated(x, i, &data);
}
UNPROTECT(2);
return ans;
}
/* simpler version of the above : return 1-based index of first, or 0 : */
R_xlen_t any_duplicated(SEXP x, Rboolean from_last)
{
R_xlen_t result = 0;
int nmax = NA_INTEGER;
if (!isVector(x)) error(_("'duplicated' applies only to vectors"));
R_xlen_t i, n = XLENGTH(x);
DUPLICATED_INIT;
PROTECT(data.HashTable);
if(from_last) {
for (i = n-1; i >= 0; i--) {
if(isDuplicated(x, i, &data)) { result = ++i; break; }
}
} else {
for (i = 0; i < n; i++) {
if(isDuplicated(x, i, &data)) { result = ++i; break; }
}
}
UNPROTECT(1);
return result;
}
static SEXP duplicated3(SEXP x, SEXP incomp, Rboolean from_last, int nmax)
{
SEXP ans;
int *v, j, m;
if (!isVector(x)) error(_("'duplicated' applies only to vectors"));
R_xlen_t i, n = XLENGTH(x);
DUPLICATED_INIT;
PROTECT(data.HashTable);
PROTECT(ans = allocVector(LGLSXP, n));
v = LOGICAL(ans);
if(from_last)
for (i = n-1; i >= 0; i--) {
// if ((i+1) % NINTERRUPT == 0) R_CheckUserInterrupt();
v[i] = isDuplicated(x, i, &data);
}
else
for (i = 0; i < n; i++) {
// if ((i+1) % NINTERRUPT == 0) R_CheckUserInterrupt();
v[i] = isDuplicated(x, i, &data);
}
if(length(incomp)) {
PROTECT(incomp = coerceVector(incomp, TYPEOF(x)));
m = length(incomp);
for (i = 0; i < n; i++)
if(v[i]) {
for(j = 0; j < m; j++)
if(data.equal(x, i, incomp, j)) {v[i] = 0; break;}
}
UNPROTECT(1);
}
UNPROTECT(2);
return ans;
}
/* return (1-based) index of first duplication, or 0 : */
R_xlen_t any_duplicated3(SEXP x, SEXP incomp, Rboolean from_last)
{
int j, m = length(incomp), nmax = NA_INTEGER;
if (!isVector(x)) error(_("'duplicated' applies only to vectors"));
R_xlen_t i, n = XLENGTH(x);
DUPLICATED_INIT;
PROTECT(data.HashTable);
if(!m) error(_("any_duplicated3(., <0-length incomp>)"));
PROTECT(incomp = coerceVector(incomp, TYPEOF(x)));
m = length(incomp);
if(from_last)
for (i = n-1; i >= 0; i--) {
#define IS_DUPLICATED_CHECK \
if(isDuplicated(x, i, &data)) { \
Rboolean isDup = TRUE; \
for(j = 0; j < m; j++) \
if(data.equal(x, i, incomp, j)) { \
isDup = FALSE; break; \
} \
if(isDup) { \
UNPROTECT(2); \
return ++i; \
} \
/* else continue */ \
}
IS_DUPLICATED_CHECK;
}
else {
for (i = 0; i < n; i++) {
IS_DUPLICATED_CHECK;
}
}
UNPROTECT(2);
return 0;
}
#undef IS_DUPLICATED_CHECK
#undef DUPLICATED_INIT
/* .Internal(duplicated(x)) [op=0]
.Internal(unique(x)) [op=1]
.Internal(anyDuplicated(x)) [op=2]
*/
SEXP attribute_hidden do_duplicated(SEXP call, SEXP op, SEXP args, SEXP env)
{
SEXP x, incomp, dup, ans;
int fromLast, nmax = NA_INTEGER;
R_xlen_t i, k, n;
checkArity(op, args);
x = CAR(args);
incomp = CADR(args);
if (length(CADDR(args)) < 1)
error(_("'fromLast' must be length 1"));
fromLast = asLogical(CADDR(args));
if (fromLast == NA_LOGICAL)
error(_("'fromLast' must be TRUE or FALSE"));
Rboolean fL = (Rboolean) fromLast;
/* handle zero length vectors, and NULL */
if ((n = xlength(x)) == 0)
return(PRIMVAL(op) <= 1
? allocVector(PRIMVAL(op) != 1 ? LGLSXP : TYPEOF(x), 0)
: ScalarInteger(0));
if (!isVector(x)) {
error(_("%s() applies only to vectors"),
(PRIMVAL(op) == 0 ? "duplicated" :
(PRIMVAL(op) == 1 ? "unique" : /* 2 */ "anyDuplicated")));
}
if (PRIMVAL(op) <= 1) {
nmax = asInteger(CADDDR(args));
if (nmax != NA_INTEGER && nmax <= 0)
error(_("'nmax' must be positive"));
}
if(length(incomp) && /* S has FALSE to mean empty */
!(isLogical(incomp) && length(incomp) == 1 &&
LOGICAL_ELT(incomp, 0) == 0)) {
if(PRIMVAL(op) == 2) {
/* return R's 1-based index :*/
R_xlen_t ind = any_duplicated3(x, incomp, fL);
if(ind > INT_MAX) return ScalarReal((double) ind);
else return ScalarInteger((int)ind);
} else
dup = duplicated3(x, incomp, fL, nmax);
}
else {
if(PRIMVAL(op) == 2) {
R_xlen_t ind = any_duplicated(x, fL);
if(ind > INT_MAX) return ScalarReal((double) ind);
else return ScalarInteger((int)ind);
} else
dup = Duplicated(x, fL, nmax);
}
if (PRIMVAL(op) == 0) /* "duplicated()" */
return dup;
/* ELSE
use the results of "duplicated" to get "unique" */
/* count unique entries */
k = 0;
for (i = 0; i < n; i++)
if (LOGICAL_ELT(dup, i) == 0)
k++;
PROTECT(dup);
PROTECT(ans = allocVector(TYPEOF(x), k));
k = 0;
switch (TYPEOF(x)) {
case LGLSXP:
for (i = 0; i < n; i++)
if (LOGICAL_ELT(dup, i) == 0)
LOGICAL0(ans)[k++] = LOGICAL_ELT(x, i);
break;
case INTSXP:
for (i = 0; i < n; i++)
if (LOGICAL_ELT(dup, i) == 0)
INTEGER0(ans)[k++] = INTEGER_ELT(x, i);
break;
case REALSXP:
for (i = 0; i < n; i++)
if (LOGICAL_ELT(dup, i) == 0)
REAL0(ans)[k++] = REAL_ELT(x, i);
break;
case CPLXSXP:
for (i = 0; i < n; i++)
if (LOGICAL_ELT(dup, i) == 0)
COMPLEX0(ans)[k++] = COMPLEX_ELT(x, i);
break;
case STRSXP:
for (i = 0; i < n; i++)
if (LOGICAL_ELT(dup, i) == 0)
SET_STRING_ELT(ans, k++, STRING_ELT(x, i));
break;
case VECSXP:
for (i = 0; i < n; i++)
if (LOGICAL_ELT(dup, i) == 0)
SET_VECTOR_ELT(ans, k++, VECTOR_ELT(x, i));
break;
case RAWSXP:
for (i = 0; i < n; i++)
if (LOGICAL_ELT(dup, i) == 0)
RAW0(ans)[k++] = RAW_ELT(x, i);
break;
default:
UNIMPLEMENTED_TYPE("duplicated", x);
}
UNPROTECT(2);
return ans;
}
/* Build a hash table, ignoring information on duplication */
static void DoHashing(SEXP table, HashData *d)
{
R_xlen_t i, n = XLENGTH(table);
for (i = 0; i < n; i++) {
// if ((i+1) % NINTERRUPT == 0) R_CheckUserInterrupt();
(void) isDuplicated(table, i, d);
}
}
/* invalidate entries: normally few */
static void UndoHashing(SEXP x, SEXP table, HashData *d)
{
for (R_xlen_t i = 0; i < XLENGTH(x); i++) removeEntry(table, x, i, d);
}
#define DEFLOOKUP(NAME, HASHFUN, EQLFUN) \
static R_INLINE int \
NAME(SEXP table, SEXP x, R_xlen_t indx, HashData *d) \
{ \
int *h = HTDATA_INT(d); \
hlen i = HASHFUN(x, indx, d); \
while (h[i] != NIL) { \
if (EQLFUN(table, h[i], x, indx)) \
return h[i] >= 0 ? h[i] + 1 : d->nomatch; \
i = (i + 1) % d->M; \
} \
return d->nomatch; \
}
/* definitions to help the C compiler to inline of most important cases */
DEFLOOKUP(iLookup, ihash, iequal)
DEFLOOKUP(rLookup, rhash, requal)
DEFLOOKUP(sLookup, shash, sequal)
/* definition for the general case */
DEFLOOKUP(Lookup, d->hash, d->equal)
/* Now do the table lookup */
static SEXP HashLookup(SEXP table, SEXP x, HashData *d)
{
SEXP ans;
R_xlen_t i, n;
n = XLENGTH(x);
PROTECT(ans = allocVector(INTSXP, n));
int *pa = INTEGER0(ans);
switch (TYPEOF(x)) {
case INTSXP:
for (i = 0; i < n; i++)
pa[i] = iLookup(table, x, i, d);
break;
case REALSXP:
for (i = 0; i < n; i++)
pa[i] = rLookup(table, x, i, d);
break;
case STRSXP:
for (i = 0; i < n; i++)
pa[i] = sLookup(table, x, i, d);
break;
default:
for (i = 0; i < n; i++)
pa[i] = Lookup(table, x, i, d);
}
UNPROTECT(1);
return ans;
}
static SEXP match_transform(SEXP s, SEXP env)
{
if(OBJECT(s)) {
if(inherits(s, "factor")) return asCharacterFactor(s);
else if(inherits(s, "POSIXlt")) { /* and maybe more classes in the future:
* Call R's (generic) as.character(s) : */
SEXP call, r;
PROTECT(call = lang2(R_AsCharacterSymbol, s));
r = eval(call, env);
UNPROTECT(1);
return r;
}
}
/* else */
return duplicate(s);
}
// workhorse of R's match() and hence also " ix %in% itable "
SEXP match5(SEXP itable, SEXP ix, int nmatch, SEXP incomp, SEXP env)
{
R_xlen_t n = xlength(ix);
/* handle zero length arguments */
if (n == 0) return allocVector(INTSXP, 0);
SEXP ans;
if (length(itable) == 0) {
ans = allocVector(INTSXP, n);
int *pa = INTEGER0(ans);
for (R_xlen_t i = 0; i < n; i++) pa[i] = nmatch;
return ans;
}
int nprot = 0;
SEXP x = PROTECT(match_transform(ix, env)); nprot++;
SEXP table = PROTECT(match_transform(itable, env)); nprot++;
/* or should we use PROTECT_WITH_INDEX and REPROTECT below ? */
SEXPTYPE type;
/* Coerce to a common type; type == NILSXP is ok here.
* Note that above we coerce factors and "POSIXlt", only to character.
* Hence, coerce to character or to `higher' type
* (given that we have "Vector" or NULL) */
if(TYPEOF(x) >= STRSXP || TYPEOF(table) >= STRSXP) type = STRSXP;
else type = TYPEOF(x) < TYPEOF(table) ? TYPEOF(table) : TYPEOF(x);
PROTECT(x = coerceVector(x, type)); nprot++;
PROTECT(table = coerceVector(table, type)); nprot++;
// special case scalar x -- for speed only :
if(XLENGTH(x) == 1 && !incomp) {
int val = nmatch;
int ntable = LENGTH(table);
switch (type) {
case STRSXP: {
SEXP x_val = STRING_ELT(x,0);
for (int i=0; i < ntable; i++) if (Seql(STRING_ELT(table,i), x_val)) {
val = i + 1; break;
}
break; }
case LGLSXP:
case INTSXP: {
int x_val = INTEGER_ELT(x, 0),
*table_p = INTEGER(table);
for (int i=0; i < ntable; i++) if (table_p[i] == x_val) {
val = i + 1; break;
}
break; }
case REALSXP: {
double xv = REAL_ELT(x, 0);
// pblm with signed 0s under IEC60559
double x_val = (xv == 0.) ? 0. : xv;
double *table_p = REAL(table);
/* we want all NaNs except NA equal, and all NAs equal */
if (R_IsNA(x_val)) {
for (int i=0; i < ntable; i++) if (R_IsNA(table_p[i])) {
val = i + 1; break;
}
}
else if (R_IsNaN(x_val)) {
for (int i=0; i < ntable; i++) if (R_IsNaN(table_p[i])) {
val = i + 1; break;
}
}
else {
for (int i=0; i < ntable; i++) if (table_p[i] == x_val) {
val = i + 1; break;
}
}
break; }
case CPLXSXP: {
Rcomplex x_val = COMPLEX_ELT(x, 0),
*table_p = COMPLEX(table);
for (int i=0; i < ntable; i++)
if (cplx_eq(table_p[i], x_val)) {
val = i + 1; break;
}
break; }
case RAWSXP: {
Rbyte x_val = RAW_ELT(x, 0),
*table_p = RAW(table);
for (int i=0; i < ntable; i++) if (table_p[i] == x_val) {
val = i + 1; break;
}
break; }
}
PROTECT(ans = ScalarInteger(val)); nprot++;
}
else { // regular case
HashData data;
if (incomp) { PROTECT(incomp = coerceVector(incomp, type)); nprot++; }
data.nomatch = nmatch;
HashTableSetup(table, &data, NA_INTEGER);
if(type == STRSXP) {
Rboolean useBytes = FALSE;
Rboolean useUTF8 = FALSE;
Rboolean useCache = TRUE;
for(R_xlen_t i = 0; i < xlength(x); i++) {
SEXP s = STRING_ELT(x, i);
if(IS_BYTES(s)) {
useBytes = TRUE;
useUTF8 = FALSE;
break;
}
if(ENC_KNOWN(s)) {
useUTF8 = TRUE;
}
if(!IS_CACHED(s)) {
useCache = FALSE;
break;
}
}
if(!useBytes || useCache) {
for(int i = 0; i < LENGTH(table); i++) {
SEXP s = STRING_ELT(table, i);
if(IS_BYTES(s)) {
useBytes = TRUE;
useUTF8 = FALSE;
break;
}
if(ENC_KNOWN(s)) {
useUTF8 = TRUE;
}
if(!IS_CACHED(s)) {
useCache = FALSE;
break;
}
}
}
data.useUTF8 = useUTF8;
data.useCache = useCache;
}
PROTECT(data.HashTable); nprot++;
DoHashing(table, &data);
if (incomp) UndoHashing(incomp, table, &data);
ans = HashLookup(table, x, &data);
}
UNPROTECT(nprot);
return ans;
} // end{ match5 }
SEXP matchE(SEXP itable, SEXP ix, int nmatch, SEXP env)
{
return match5(itable, ix, nmatch, NULL, env);
}
/* used from other code, not here: */
SEXP match(SEXP itable, SEXP ix, int nmatch)
{
return match5(itable, ix, nmatch, NULL, R_BaseEnv);
}
// .Internal(match(x, table, nomatch, incomparables)) :
SEXP attribute_hidden do_match(SEXP call, SEXP op, SEXP args, SEXP env)
{
checkArity(op, args);
if ((!isVector(CAR(args)) && !isNull(CAR(args)))
|| (!isVector(CADR(args)) && !isNull(CADR(args))))
error(_("'match' requires vector arguments"));
int nomatch = asInteger(CADDR(args));
SEXP incomp = CADDDR(args);
if (isNull(incomp) || /* S has FALSE to mean empty */
(length(incomp) == 1 && isLogical(incomp) &&
LOGICAL_ELT(incomp, 0) == 0))
return match5(CADR(args), CAR(args), nomatch, NULL, env);
else
return match5(CADR(args), CAR(args), nomatch, incomp, env);
}
/* pmatch and charmatch return integer positions, so cannot be used
for long vector tables */
/* Partial Matching of Strings */
/* Fully S Compatible version. */
/* Hmm, this was not all S compatible! The desired behaviour is:
* First do exact matches, and mark elements as used as they are matched
* unless dup_ok is true.
* Then do partial matching, from left to right, using up the table
* unless dup_ok is true. Multiple partial matches are ignored.
* Empty strings are unmatched BDR 2000/2/16
*/
SEXP attribute_hidden do_pmatch(SEXP call, SEXP op, SEXP args, SEXP env)
{
SEXP ans, input, target;
int mtch, n_target, mtch_count, dups_ok, no_match;
size_t temp;
int *used = NULL, *ians;
const char **in, **tar;
Rboolean no_dups;
Rboolean useBytes = FALSE, useUTF8 = FALSE;
checkArity(op, args);
input = CAR(args);
R_xlen_t n_input = XLENGTH(input);
target = CADR(args);
n_target = LENGTH(target); // not allowed to be long
no_match = asInteger(CADDR(args));
dups_ok = asLogical(CADDDR(args));
if (dups_ok == NA_LOGICAL)
error(_("invalid '%s' argument"), "duplicates.ok");
no_dups = !dups_ok;
if (!isString(input) || !isString(target))
error(_("argument is not of mode character"));
if(no_dups) {
used = (int *) R_alloc((size_t) n_target, sizeof(int));
for (int j = 0; j < n_target; j++) used[j] = 0;
}
for(R_xlen_t i = 0; i < n_input; i++) {
if(IS_BYTES(STRING_ELT(input, i))) {
useBytes = TRUE;
useUTF8 = FALSE;
break;
} else if(ENC_KNOWN(STRING_ELT(input, i))) {
useUTF8 = TRUE;
}
}
if(!useBytes) {
for(R_xlen_t i = 0; i < n_target; i++) {
if(IS_BYTES(STRING_ELT(target, i))) {
useBytes = TRUE;
useUTF8 = FALSE;
break;
} else if(ENC_KNOWN(STRING_ELT(target, i))) {
useUTF8 = TRUE;
}
}
}
in = (const char **) R_alloc((size_t) n_input, sizeof(char *));
tar = (const char **) R_alloc((size_t) n_target, sizeof(char *));
PROTECT(ans = allocVector(INTSXP, n_input));
ians = INTEGER0(ans);
if(useBytes) {
for(R_xlen_t i = 0; i < n_input; i++) {
in[i] = CHAR(STRING_ELT(input, i));
ians[i] = 0;
}
for(int j = 0; j < n_target; j++)
tar[j] = CHAR(STRING_ELT(target, j));
}
else if(useUTF8) {
for(R_xlen_t i = 0; i < n_input; i++) {
in[i] = translateCharUTF8(STRING_ELT(input, i));
ians[i] = 0;
}
for(int j = 0; j < n_target; j++)
tar[j] = translateCharUTF8(STRING_ELT(target, j));
} else {
for(R_xlen_t i = 0; i < n_input; i++) {
in[i] = translateChar(STRING_ELT(input, i));
ians[i] = 0;
}
for(int j = 0; j < n_target; j++)
tar[j] = translateChar(STRING_ELT(target, j));
}
/* First pass, exact matching */
R_xlen_t nexact = 0;
/* Compromise when hashing used changed in 3.2.0 (PR#15697) */
if (n_input <= 100 || n_target <= 100) {
for (R_xlen_t i = 0; i < n_input; i++) {
const char *ss = in[i];
if (strlen(ss) == 0) continue;
for (int j = 0; j < n_target; j++) {
if (no_dups && used[j]) continue;
if (strcmp(ss, tar[j]) == 0) {
ians[i] = j + 1;
if (no_dups) used[j] = 1;
nexact++;
break;
}
}
}
} else {
HashData data;
HashTableSetup(target, &data, NA_INTEGER);
data.useUTF8 = useUTF8;
data.nomatch = 0;
DoHashing(target, &data);
for (R_xlen_t i = 0; i < n_input; i++) {
if (strlen(in[i]) == 0) /* don't look up "" */
continue;
int j = Lookup(target, input, i, &data);
if ((j == 0) || (no_dups && used[j - 1])) continue;
if (no_dups) used[j - 1] = 1;
ians[i] = j;
nexact++;
}
}
if(nexact < n_input) {
/* Second pass, partial matching */
for (R_xlen_t i = 0; i < n_input; i++) {
const char *ss;
if (ians[i]) continue;
ss = in[i];
temp = strlen(ss);
if (temp == 0) continue;
mtch = 0;
mtch_count = 0;
for (int j = 0; j < n_target; j++) {
if (no_dups && used[j]) continue;
if (strncmp(ss, tar[j], temp) == 0) {
mtch = j + 1;
mtch_count++;
}
}
if (mtch > 0 && mtch_count == 1) {
if(no_dups) used[mtch - 1] = 1;
ians[i] = mtch;
}
}
/* Third pass, set no matches */
for (R_xlen_t i = 0; i < n_input; i++)
if(ians[i] == 0) ians[i] = no_match;
}
UNPROTECT(1);
return ans;
}
/* Partial Matching of Strings */
/* Based on Therneau's charmatch. */
SEXP attribute_hidden do_charmatch(SEXP call, SEXP op, SEXP args, SEXP env)
{
SEXP ans, input, target;
const char *ss, *st;
Rboolean useBytes = FALSE, useUTF8 = FALSE;
checkArity(op, args);
input = CAR(args);
R_xlen_t n_input = LENGTH(input);
target = CADR(args);
int n_target = LENGTH(target);
if (!isString(input) || !isString(target))
error(_("argument is not of mode character"));
int no_match = asInteger(CADDR(args));
for(R_xlen_t i = 0; i < n_input; i++) {
if(IS_BYTES(STRING_ELT(input, i))) {
useBytes = TRUE;
useUTF8 = FALSE;
break;
} else if(ENC_KNOWN(STRING_ELT(input, i))) {
useUTF8 = TRUE;
}
}
if(!useBytes) {
for(int i = 0; i < n_target; i++) {
if(IS_BYTES(STRING_ELT(target, i))) {
useBytes = TRUE;
useUTF8 = FALSE;
break;
} else if(ENC_KNOWN(STRING_ELT(target, i))) {
useUTF8 = TRUE;
}
}
}
PROTECT(ans = allocVector(INTSXP, n_input));
int *ians = INTEGER0(ans);
const void *vmax = vmaxget(); // prudence: .Internal does this too.
for(R_xlen_t i = 0; i < n_input; i++) {
if(useBytes)
ss = CHAR(STRING_ELT(input, i));
else if(useUTF8)
ss = translateCharUTF8(STRING_ELT(input, i));
else
ss = translateChar(STRING_ELT(input, i));
size_t temp = strlen(ss);
int imatch = NA_INTEGER;
Rboolean perfect = FALSE;
/* we could reset vmax here too: worth it? */
for(int j = 0; j < n_target; j++) {
if(useBytes)
st = CHAR(STRING_ELT(target, j));
else if(useUTF8)
st = translateCharUTF8(STRING_ELT(target, j));
else
st = translateChar(STRING_ELT(target, j));
int k = strncmp(ss, st, temp);
if (k == 0) {
if (strlen(st) == temp) {
if (perfect)
imatch = 0;
else {
perfect = TRUE;
imatch = j + 1;
}
}
else if (!perfect) {
if (imatch == NA_INTEGER)
imatch = j + 1;
else
imatch = 0;
}
}
}
ians[i] = (imatch == NA_INTEGER) ? no_match : imatch;
vmaxset(vmax);
}
UNPROTECT(1);
return ans;
}
/* Functions for matching the supplied arguments to the */
/* formal arguments of functions. The returned value */
/* is a list with all components named. */
static SEXP StripUnmatched(SEXP s)
{
if (s == R_NilValue) return s;
if (CAR(s) == R_MissingArg && !ARGUSED(s) ) {
return StripUnmatched(CDR(s));
}
else if (CAR(s) == R_DotsSymbol ) {
return StripUnmatched(CDR(s));
}
else {
SETCDR(s, StripUnmatched(CDR(s)));
return s;
}
}
static SEXP ExpandDots(SEXP s, int expdots)
{
SEXP r;
if (s == R_NilValue)
return s;
if (TYPEOF(CAR(s)) == DOTSXP ) {
SET_TYPEOF(CAR(s), LISTSXP); /* a safe mutation */
if (expdots) {
r = CAR(s);
while (CDR(r) != R_NilValue ) {
SET_ARGUSED(r, 1);
r = CDR(r);
}
SET_ARGUSED(r, 1);
SETCDR(r, ExpandDots(CDR(s), expdots));
return CAR(s);
}
}
else
SET_ARGUSED(s, 0);
SETCDR(s, ExpandDots(CDR(s), expdots));
return s;
}
static SEXP subDots(SEXP rho)
{
SEXP rval, dots, a, b, t;
int len,i;
dots = findVar(R_DotsSymbol, rho);
if (dots == R_UnboundValue)
error(_("... used in a situation where it does not exist"));
if (dots == R_MissingArg)
return dots;
if (!isPairList(dots))
error(_("... is not a pairlist"));
len = length(dots);
PROTECT(dots);
PROTECT(rval=allocList(len));
for(a = dots, b = rval, i = 1; i <= len; a = CDR(a), b = CDR(b), i++) {
SET_TAG(b, TAG(a));
t = CAR(a);
while (TYPEOF(t) == PROMSXP)
t = PREXPR(t);
if( isSymbol(t) || isLanguage(t) )
SETCAR(b, installDDVAL(i));
else
SETCAR(b, t);
}
UNPROTECT(2);
return rval;
}
SEXP attribute_hidden do_matchcall(SEXP call, SEXP op, SEXP args, SEXP env)
{
SEXP formals, actuals, rlist;
SEXP funcall, f, b, rval, sysp, t1, t2, tail;
// RCNTXT *cptr;
int expdots;
checkArity(op,args);
funcall = CADR(args);
if (TYPEOF(funcall) == EXPRSXP)
funcall = VECTOR_ELT(funcall, 0);
if (TYPEOF(funcall) != LANGSXP)
error(_("invalid '%s' argument"), "call");
b = CAR(args);
if (TYPEOF(b) != CLOSXP)
error(_("invalid '%s' argument"), "definition");
sysp = CAR(CDDDR(args));
if (!isEnvironment(sysp))
error(_("'envir' must be an environment"));
/* Do we expand ... ? */
expdots = asLogical(CAR(CDDR(args)));
if (expdots == NA_LOGICAL)
error(_("invalid '%s' argument"), "expand.dots");
/* Get the formals and match the actual args */
formals = FORMALS(b);
PROTECT(actuals = shallow_duplicate(CDR(funcall)));
/* If there is a ... symbol then expand it out in the sysp env
We need to take some care since the ... might be in the middle
of the actuals */
t2 = R_MissingArg;
for (t1=actuals ; t1!=R_NilValue ; t1 = CDR(t1) ) {
if (CAR(t1) == R_DotsSymbol) {
t2 = subDots(sysp);
break;
}
}
/* now to splice t2 into the correct spot in actuals */
if (t2 != R_MissingArg ) { /* so we did something above */
if( CAR(actuals) == R_DotsSymbol ) {
UNPROTECT(1);
actuals = listAppend(t2, CDR(actuals));
PROTECT(actuals);
}
else {
for(t1=actuals; t1!=R_NilValue; t1=CDR(t1)) {
if( CADR(t1) == R_DotsSymbol ) {
tail = CDDR(t1);
SETCDR(t1, t2);
listAppend(actuals,tail);
break;
}
}
}
} else { /* get rid of it */
if( CAR(actuals) == R_DotsSymbol ) {
UNPROTECT(1);
actuals = CDR(actuals);
PROTECT(actuals);
}
else {
for(t1=actuals; t1!=R_NilValue; t1=CDR(t1)) {
if( CADR(t1) == R_DotsSymbol ) {
tail = CDDR(t1);
SETCDR(t1, tail);
break;
}
}
}
}
rlist = matchArgs_RC(formals, actuals, call);
/* Attach the argument names as tags */
for (f = formals, b = rlist; b != R_NilValue; b = CDR(b), f = CDR(f)) {
SET_TAG(b, TAG(f));
}
/* Handle the dots */
PROTECT(rlist = ExpandDots(rlist, expdots));
/* Eliminate any unmatched formals and any that match R_DotSymbol */
/* This needs to be after ExpandDots as the DOTSXP might match ... */
rlist = StripUnmatched(rlist);
PROTECT(rval = allocSExp(LANGSXP));
SETCAR(rval, lazy_duplicate(CAR(funcall)));
SETCDR(rval, rlist);
UNPROTECT(3);
return rval;
}
#include <R_ext/RS.h> /* for Memzero */
#ifdef _AIX /*some people just have to be different: is this still needed? */
# include <memory.h>
#endif
static SEXP
rowsum(SEXP x, SEXP g, SEXP uniqueg, SEXP snarm, SEXP rn)
{
SEXP matches,ans;
int n, p, ng, narm;
R_xlen_t offset = 0, offsetg = 0;
HashData data;
data.nomatch = 0;
n = LENGTH(g);
ng = length(uniqueg);
narm = asLogical(snarm);
if(narm == NA_LOGICAL) error("'na.rm' must be TRUE or FALSE");
if(isMatrix(x)) p = ncols(x); else p = 1;
HashTableSetup(uniqueg, &data, NA_INTEGER);
PROTECT(data.HashTable);
DoHashing(uniqueg, &data);
PROTECT(matches = HashLookup(uniqueg, g, &data));
int *pmatches = INTEGER(matches);
PROTECT(ans = allocMatrix(TYPEOF(x), ng, p));
switch(TYPEOF(x)){
case REALSXP:
Memzero(REAL0(ans), ng*p);
for(int i = 0; i < p; i++) {
double *pa = REAL0(ans);
for(int j = 0; j < n; j++) {
double xjpo = REAL_ELT(x, j + offset);
if(!narm || !ISNAN(xjpo))
pa[pmatches[j] - 1 + offsetg] += xjpo;
}
offset += n;
offsetg += ng;
}
break;
case INTSXP:
Memzero(INTEGER0(ans), ng*p);
for(int i = 0; i < p; i++) {
int *pa = INTEGER0(ans);
for(int j = 0; j < n; j++) {
int xjpo = INTEGER_ELT(x, j + offset);
if (xjpo == NA_INTEGER) {
if(!narm)
pa[pmatches[j] - 1 + offsetg] = NA_INTEGER;
} else if (pa[pmatches[j] - 1 + offsetg] != NA_INTEGER) {
/* check for integer overflows */
int itmp = pa[pmatches[j] - 1 + offsetg];
double dtmp = itmp;
dtmp += xjpo;
if (dtmp < INT_MIN || dtmp > INT_MAX) itmp = NA_INTEGER;
else itmp += xjpo;
pa[pmatches[j] - 1 + offsetg] = itmp;
}
}
offset += n;
offsetg += ng;
}
break;
default:
error("non-numeric matrix in rowsum(): this should not happen");
}
if (TYPEOF(rn) != STRSXP) error("row names are not character");
SEXP dn = allocVector(VECSXP, 2), dn2, dn3;
setAttrib(ans, R_DimNamesSymbol, dn);
SET_VECTOR_ELT(dn, 0, rn);
dn2 = getAttrib(x, R_DimNamesSymbol);
if(length(dn2) >= 2 &&
!isNull(dn3 = VECTOR_ELT(dn2, 1))) SET_VECTOR_ELT(dn, 1, dn3);
UNPROTECT(3); /* HashTable, matches, ans */
return ans;
}
static SEXP
rowsum_df(SEXP x, SEXP g, SEXP uniqueg, SEXP snarm, SEXP rn)
{
SEXP matches,ans,col,xcol;
int p, narm;
HashData data;
data.nomatch = 0;
R_xlen_t n = XLENGTH(g);
p = LENGTH(x);
R_xlen_t ng = XLENGTH(uniqueg);
narm = asLogical(snarm);
if(narm == NA_LOGICAL) error("'na.rm' must be TRUE or FALSE");
HashTableSetup(uniqueg, &data, NA_INTEGER);
PROTECT(data.HashTable);
DoHashing(uniqueg, &data);
PROTECT(matches = HashLookup(uniqueg, g, &data));
int *pmatches = INTEGER(matches);
PROTECT(ans = allocVector(VECSXP, p));
for(int i = 0; i < p; i++) {
xcol = VECTOR_ELT(x,i);
if (!isNumeric(xcol))
error(_("non-numeric data frame in rowsum"));
switch(TYPEOF(xcol)){
case REALSXP:
PROTECT(col = allocVector(REALSXP,ng));
Memzero(REAL0(col), ng);
for(R_xlen_t j = 0; j < n; j++) {
double xj = REAL_ELT(xcol, j);
if(!narm || !ISNAN(xj))
REAL0(col)[pmatches[j] - 1] += xj;
}
SET_VECTOR_ELT(ans,i,col);
UNPROTECT(1);
break;
case INTSXP:
PROTECT(col = allocVector(INTSXP, ng));
Memzero(INTEGER0(col), ng);
for(R_xlen_t j = 0; j < n; j++) {
int xj = INTEGER_ELT(xcol, j);
if (xj == NA_INTEGER) {
if(!narm)
INTEGER0(col)[pmatches[j] - 1] = NA_INTEGER;
} else if (INTEGER0(col)[pmatches[j] - 1] != NA_INTEGER) {
int itmp = INTEGER0(col)[pmatches[j] - 1];
double dtmp = itmp;
dtmp += xj;
if (dtmp < INT_MIN || dtmp > INT_MAX) itmp = NA_INTEGER;
else itmp += xj;
INTEGER0(col)[pmatches[j] - 1] = itmp;
}
}
SET_VECTOR_ELT(ans, i, col);
UNPROTECT(1);
break;
default:
error(_("this cannot happen"));
}
}
namesgets(ans, getAttrib(x, R_NamesSymbol));
if (TYPEOF(rn) != STRSXP) error("row names are not character");
setAttrib(ans, R_RowNamesSymbol, rn);
classgets(ans, mkString("data.frame"));
UNPROTECT(3); /* HashTable, matches, ans */
return ans;
}
SEXP attribute_hidden do_rowsum(SEXP call, SEXP op, SEXP args, SEXP env)
{
checkArity(op, args);
if(PRIMVAL(op) == 1)
return rowsum_df(CAR(args), CADR(args), CADDR(args), CADDDR(args),
CAD4R(args));
else
return rowsum(CAR(args), CADR(args), CADDR(args), CADDDR(args),
CAD4R(args));
}
/* returns 1-based duplicate no */
static int isDuplicated2(SEXP x, int indx, HashData *d)
{
int *h = HTDATA_INT(d);
hlen i = d->hash(x, indx, d);
while (h[i] != NIL) {
if (d->equal(x, h[i], x, indx))
return h[i] + 1;
i = (i + 1) % d->M;
}
h[i] = indx;
return 0;
}
static SEXP duplicated2(SEXP x, HashData *d)
{
SEXP ans;
int i, n;
n = LENGTH(x);
HashTableSetup(x, d, NA_INTEGER);
PROTECT(d->HashTable);
PROTECT(ans = allocVector(INTSXP, n));
int *h = HTDATA_INT(d);
int *v = INTEGER0(ans);
for (i = 0; i < d->M; i++) h[i] = NIL;
for (i = 0; i < n; i++) {
// if ((i+1) % NINTERRUPT == 0) R_CheckUserInterrupt();
v[i] = isDuplicated2(x, i, d);
}
UNPROTECT(2);
return ans;
}
SEXP attribute_hidden do_makeunique(SEXP call, SEXP op, SEXP args, SEXP env)
{
SEXP names, sep, ans, dup, newx;
int i, cnt, *cnts, dp;
int n, len, maxlen = 0;
HashData data;
const char *csep, *ss;
const void *vmax;
checkArity(op, args);
names = CAR(args);
if(!isString(names))
error(_("'names' must be a character vector"));
n = LENGTH(names);
sep = CADR(args);
if(!isString(sep) || LENGTH(sep) != 1)
error(_("'%s' must be a character string"), "sep");
csep = translateChar(STRING_ELT(sep, 0));
PROTECT(ans = allocVector(STRSXP, n));
vmax = vmaxget();
for(i = 0; i < n; i++) {
SET_STRING_ELT(ans, i, STRING_ELT(names, i));
len = (int) strlen(translateChar(STRING_ELT(names, i)));
if(len > maxlen) maxlen = len;
vmaxset(vmax);
}
if(n > 1) {
/* +2 for terminator and rounding error */
char buf[maxlen + (int) strlen(csep)
+ (int) (log((double)n)/log(10.0)) + 2];
if(n < 10000) {
R_CheckStack2((size_t)n * sizeof(int));
cnts = (int *) alloca(((size_t) n) * sizeof(int));
} else {
/* This is going to be slow so use expensive allocation
that will be recovered if interrupted. */
cnts = (int *) R_alloc((size_t) n, sizeof(int));
}
for(i = 0; i < n; i++) cnts[i] = 1;
data.nomatch = 0;
PROTECT(newx = allocVector(STRSXP, 1));
PROTECT(dup = duplicated2(names, &data));
PROTECT(data.HashTable);
vmax = vmaxget();
for(i = 1; i < n; i++) { /* first cannot be a duplicate */
dp = INTEGER_ELT(dup, i); /* 1-based number of first occurrence */
if(dp == 0) continue;
ss = translateChar(STRING_ELT(names, i));
/* Try appending 1,2,3, ..., n-1 until it is not already in use */
for(cnt = cnts[dp - 1]; cnt < n; cnt++) {
sprintf(buf, "%s%s%d", ss, csep, cnt);
SET_STRING_ELT(newx, 0, mkChar(buf));
if(Lookup(ans, newx, 0, &data) == data.nomatch) break;
}
SET_STRING_ELT(ans, i, STRING_ELT(newx, 0));
/* insert it */ (void) isDuplicated(ans, i, &data);
cnts[dp - 1] = cnt+1; /* cache the first unused cnt value */
vmaxset(vmax);
}
UNPROTECT(3);
}
UNPROTECT(1);
return ans;
}
/* Use hashing to improve object.size. Here we want equal CHARSXPs,
not equal contents. */
static int csequal(SEXP x, R_xlen_t i, SEXP y, R_xlen_t j)
{
return STRING_ELT(x, i) == STRING_ELT(y, j);
}
static void HashTableSetup1(SEXP x, HashData *d)
{
d->hash = cshash;
d->equal = csequal;
#ifdef LONG_VECTOR_SUPPORT
d->isLong = FALSE;
#endif
MKsetup(XLENGTH(x), d, NA_INTEGER);
d->HashTable = allocVector(INTSXP, (R_xlen_t) d->M);
for (R_xlen_t i = 0; i < d->M; i++) HTDATA_INT(d)[i] = NIL;
}
/* used in utils */
SEXP Rf_csduplicated(SEXP x)
{
if(TYPEOF(x) != STRSXP)
error("C function 'csduplicated' not called on a STRSXP");
R_xlen_t n = XLENGTH(x);
HashData data;
HashTableSetup1(x, &data);
PROTECT(data.HashTable);
SEXP ans = PROTECT(allocVector(LGLSXP, n));
int *v = LOGICAL(ans);
for (R_xlen_t i = 0; i < n; i++) v[i] = isDuplicated(x, i, &data);
UNPROTECT(2);
return ans;
}
#include <R_ext/Random.h>
// sample.int(.) --> .Internal(sample2(n, size)) :
SEXP attribute_hidden do_sample2(SEXP call, SEXP op, SEXP args, SEXP env)
{
checkArity(op, args);
SEXP ans;
double dn = asReal(CAR(args));
int k = asInteger(CADR(args));
if (!R_FINITE(dn) || dn < 0 || dn > 4.5e15 || (k > 0 && dn == 0))
error(_("invalid first argument"));
if (k < 0) error(_("invalid '%s' argument"), "size");
if (k > dn/2) error("This algorithm is for size <= n/2");
HashData data;
GetRNGstate();
if (dn > INT_MAX) {
ans = PROTECT(allocVector(REALSXP, k));
double *ry = REAL0(ans);
HashTableSetup(ans, &data, NA_INTEGER);
PROTECT(data.HashTable);
for (int i = 0; i < k; i++)
for(int j = 0; j < 100; j++) { // average < 2
ry[i] = R_unif_index(dn) + 1;
if(!isDuplicated(ans, i, &data)) break;
}
} else {
ans = PROTECT(allocVector(INTSXP, k));
int *iy = INTEGER0(ans);
HashTableSetup(ans, &data, NA_INTEGER);
PROTECT(data.HashTable);
for (int i = 0; i < k; i++)
for(int j = 0; j < 100; j++) { // average < 2
iy[i] = (int)(R_unif_index(dn) + 1);
if(!isDuplicated(ans, i, &data)) break;
}
}
PutRNGstate();
UNPROTECT(2);
return ans;
}