blob: 06354bd45406de6ee0887fb2bb13f993f7b114ce [file] [log] [blame]
/*
* R : A Computer Language for Statistical Data Analysis
* Copyright (C) 2001--2017 The R Core Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Pulic License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, a copy is available at
* https://www.R-project.org/Licenses/
*/
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include <Defn.h>
#include <Internal.h>
#define isRaw(x) (TYPEOF(x) == RAWSXP)
/* charToRaw works at byte level, ignores encoding */
SEXP attribute_hidden do_charToRaw(SEXP call, SEXP op, SEXP args, SEXP env)
{
SEXP ans, x = CAR(args);
int nc;
checkArity(op, args);
if (!isString(x) || LENGTH(x) == 0)
error(_("argument must be a character vector of length 1"));
if (LENGTH(x) > 1)
warning(_("argument should be a character vector of length 1\nall but the first element will be ignored"));
nc = LENGTH(STRING_ELT(x, 0));
ans = allocVector(RAWSXP, nc);
if (nc) memcpy(RAW(ans), CHAR(STRING_ELT(x, 0)), nc);
return ans;
}
/* <UTF8> rawToChar should work at byte level */
SEXP attribute_hidden do_rawToChar(SEXP call, SEXP op, SEXP args, SEXP env)
{
SEXP ans, x = CAR(args);
checkArity(op, args);
if (!isRaw(x))
error(_("argument 'x' must be a raw vector"));
int multiple = asLogical(CADR(args));
if (multiple == NA_LOGICAL)
error(_("argument 'multiple' must be TRUE or FALSE"));
if (multiple) {
R_xlen_t i, nc = XLENGTH(x);
char buf[2];
buf[1] = '\0';
PROTECT(ans = allocVector(STRSXP, nc));
for (i = 0; i < nc; i++) {
buf[0] = (char) RAW(x)[i];
SET_STRING_ELT(ans, i, mkChar(buf));
}
/* do we want to copy e.g. names here? */
} else {
int i, j, nc = LENGTH(x);
/* String is not necessarily 0-terminated and may contain nuls.
Strip trailing nuls */
for (i = 0, j = -1; i < nc; i++) if(RAW(x)[i]) j = i;
nc = j + 1;
PROTECT(ans = allocVector(STRSXP, 1));
SET_STRING_ELT(ans, 0,
mkCharLenCE((const char *)RAW(x), j+1, CE_NATIVE));
}
UNPROTECT(1);
return ans;
}
SEXP attribute_hidden do_rawShift(SEXP call, SEXP op, SEXP args, SEXP env)
{
checkArity(op, args);
SEXP ans, x = CAR(args);
int shift = asInteger(CADR(args));
if (!isRaw(x))
error(_("argument 'x' must be a raw vector"));
if (shift == NA_INTEGER || shift < -8 || shift > 8)
error(_("argument 'shift' must be a small integer"));
PROTECT(ans = duplicate(x));
if (shift > 0)
for (R_xlen_t i = 0; i < XLENGTH(x); i++)
RAW(ans)[i] <<= shift;
else
for (R_xlen_t i = 0; i < XLENGTH(x); i++)
RAW(ans)[i] >>= (-shift);
UNPROTECT(1);
return ans;
}
SEXP attribute_hidden do_rawToBits(SEXP call, SEXP op, SEXP args, SEXP env)
{
checkArity(op, args);
SEXP ans, x = CAR(args);
R_xlen_t i, j = 0;
unsigned int tmp;
if (!isRaw(x))
error(_("argument 'x' must be a raw vector"));
PROTECT(ans = allocVector(RAWSXP, 8*XLENGTH(x)));
for (i = 0; i < XLENGTH(x); i++) {
tmp = (unsigned int) RAW(x)[i];
for (int k = 0; k < 8; k++, tmp >>= 1)
RAW(ans)[j++] = tmp & 0x1;
}
UNPROTECT(1);
return ans;
}
SEXP attribute_hidden do_intToBits(SEXP call, SEXP op, SEXP args, SEXP env)
{
SEXP ans, x;
R_xlen_t i, j = 0;
unsigned int tmp;
checkArity(op, args);
PROTECT(x = coerceVector(CAR(args), INTSXP));
if (!isInteger(x))
error(_("argument 'x' must be an integer vector"));
PROTECT(ans = allocVector(RAWSXP, 32*XLENGTH(x)));
for (i = 0; i < XLENGTH(x); i++) {
tmp = (unsigned int) INTEGER(x)[i];
for (int k = 0; k < 32; k++, tmp >>= 1)
RAW(ans)[j++] = tmp & 0x1;
}
UNPROTECT(2);
return ans;
}
SEXP attribute_hidden do_packBits(SEXP call, SEXP op, SEXP args, SEXP env)
{
checkArity(op, args);
SEXP ans, x = CAR(args), stype = CADR(args);
Rboolean useRaw;
R_xlen_t i, len = XLENGTH(x), slen;
int fac;
if (TYPEOF(x) != RAWSXP && TYPEOF(x) != LGLSXP && TYPEOF(x) != INTSXP)
error(_("argument 'x' must be raw, integer or logical"));
if (!isString(stype) || LENGTH(stype) != 1)
error(_("argument '%s' must be a character string"), "type");
useRaw = strcmp(CHAR(STRING_ELT(stype, 0)), "integer");
fac = useRaw ? 8 : 32;
if (len% fac)
error(_("argument 'x' must be a multiple of %d long"), fac);
slen = len/fac;
PROTECT(ans = allocVector(useRaw ? RAWSXP : INTSXP, slen));
for (i = 0; i < slen; i++)
if (useRaw) {
Rbyte btmp = 0;
for (int k = 7; k >= 0; k--) {
btmp <<= 1;
if (isRaw(x))
btmp |= RAW(x)[8*i + k] & 0x1;
else if (isLogical(x) || isInteger(x)) {
int j = INTEGER(x)[8*i+k];
if (j == NA_INTEGER)
error(_("argument 'x' must not contain NAs"));
btmp |= j & 0x1;
}
}
RAW(ans)[i] = btmp;
} else {
unsigned int itmp = 0;
for (int k = 31; k >= 0; k--) {
itmp <<= 1;
if (isRaw(x))
itmp |= RAW(x)[32*i + k] & 0x1;
else if (isLogical(x) || isInteger(x)) {
int j = INTEGER(x)[32*i+k];
if (j == NA_INTEGER)
error(_("argument 'x' must not contain NAs"));
itmp |= j & 0x1;
}
}
INTEGER(ans)[i] = (int) itmp;
}
UNPROTECT(1);
return ans;
}
/* Simplified version for RFC3629 definition of UTF-8 */
static int mbrtoint(int *w, const char *s)
{
unsigned int byte;
byte = *((unsigned char *)s);
if (byte == 0) {
*w = 0;
return 0;
} else if (byte < 0xC0) {
*w = (int) byte;
return 1;
} else if (byte < 0xE0) {
if (!s[1]) return -2;
if ((s[1] & 0xC0) == 0x80) {
*w = (int) (((byte & 0x1F) << 6) | (s[1] & 0x3F));
return 2;
} else return -1;
} else if (byte < 0xF0) {
if (!s[1] || !s[2]) return -2;
if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) {
*w = (int) (((byte & 0x0F) << 12)
| ((s[1] & 0x3F) << 6) | (s[2] & 0x3F));
byte = *w;
if (byte >= 0xD800 && byte <= 0xDFFF) return -1; /* surrogate */
// Following Corrigendum 9, these are valid in UTF-8
// if (byte == 0xFFFE || byte == 0xFFFF) return -1;
return 3;
} else return -1;
} else if (byte <= 0xF4) { // for RFC3629
if (!s[1] || !s[2] || !s[3]) return -2;
if (((s[1] & 0xC0) == 0x80)
&& ((s[2] & 0xC0) == 0x80)
&& ((s[3] & 0xC0) == 0x80)) {
*w = (int) (((byte & 0x07) << 18)
| ((s[1] & 0x3F) << 12)
| ((s[2] & 0x3F) << 6)
| (s[3] & 0x3F));
byte = *w;
return (byte <= 0x10FFFF) ? 4 : -1;
} else return -1;
} else return -1;
/* return -2; not reached */
}
SEXP attribute_hidden do_utf8ToInt(SEXP call, SEXP op, SEXP args, SEXP env)
{
SEXP ans, x = CAR(args);
int tmp, used = 0; /* -Wall */
R_xlen_t i, j, nc;
checkArity(op, args);
if (!isString(x) || LENGTH(x) == 0)
error(_("argument must be a character vector of length 1"));
if (LENGTH(x) > 1)
warning(_("argument should be a character vector of length 1\nall but the first element will be ignored"));
if (STRING_ELT(x, 0) == NA_STRING) return ScalarInteger(NA_INTEGER);
const char *s = CHAR(STRING_ELT(x, 0));
if (!utf8Valid(s)) return ScalarInteger(NA_INTEGER);
nc = XLENGTH(STRING_ELT(x, 0)); /* ints will be shorter */
int *ians = (int *) R_alloc(nc, sizeof(int));
for (i = 0, j = 0; i < nc; i++) {
used = mbrtoint(&tmp, s);
if (used <= 0) break;
ians[j++] = tmp;
s += used;
}
if (used < 0) error(_("invalid UTF-8 string"));
ans = allocVector(INTSXP, j);
if (j) memcpy(INTEGER(ans), ians, sizeof(int) * j);
return ans;
}
/* Based on PCRE, but current Unicode only needs 4 bytes with maximum 0x10ffff */
static const int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff };
static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0 };
static size_t inttomb(char *s, const int wc)
{
register int i, j;
unsigned int cvalue = wc;
char buf[10], *b;
b = s ? s : buf;
if (cvalue == 0) {*b = 0; return 0;}
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
if (cvalue <= utf8_table1[i]) break;
b += i;
for (j = i; j > 0; j--) {
*b-- = (char)(0x80 | (cvalue & 0x3f));
cvalue >>= 6;
}
*b = (char)(utf8_table2[i] | cvalue);
return i + 1;
}
#include <R_ext/RS.h> /* for Calloc/Free */
SEXP attribute_hidden do_intToUtf8(SEXP call, SEXP op, SEXP args, SEXP env)
{
SEXP ans, x;
int multiple, s_pair;
size_t used, len;
char buf[10], *tmp;
checkArity(op, args);
PROTECT(x = coerceVector(CAR(args), INTSXP));
if (!isInteger(x))
error(_("argument 'x' must be an integer vector"));
multiple = asLogical(CADR(args));
if (multiple == NA_LOGICAL)
error(_("argument 'multiple' must be TRUE or FALSE"));
s_pair = asLogical(CADDR(args));
if (s_pair == NA_LOGICAL)
error(_("argument 'allow_surrogate_pairs' must be TRUE or FALSE"));
if (multiple) {
if (s_pair)
warning("allow_surrogate_pairs = TRUE is incompatible with multiple = TRUE and will be ignored");
R_xlen_t i, nc = XLENGTH(x);
PROTECT(ans = allocVector(STRSXP, nc));
for (i = 0; i < nc; i++) {
int this = INTEGER(x)[i];
if (this == NA_INTEGER
|| (this >= 0xD800 && this <= 0xDFFF)
|| this > 0x10FFFF)
SET_STRING_ELT(ans, i, NA_STRING);
else {
used = inttomb(buf, this);
buf[used] = '\0';
SET_STRING_ELT(ans, i, mkCharCE(buf, CE_UTF8));
}
}
/* do we want to copy e.g. names here? */
} else {
int i, nc = LENGTH(x);
Rboolean haveNA = FALSE;
/* Note that this gives zero length for input '0', so it is omitted */
for (i = 0, len = 0; i < nc; i++) {
int this = INTEGER(x)[i];
if (this == NA_INTEGER
|| (this >= 0xDC00 && this <= 0xDFFF)
|| this > 0x10FFFF) {
haveNA = TRUE;
break;
}
else if (this >= 0xD800 && this <= 0xDBFF) {
if(!s_pair || i >= nc-1) {haveNA = TRUE; break;}
int next = INTEGER(x)[i+1];
if(next >= 0xDC00 && next <= 0xDFFF) i++;
else {haveNA = TRUE; break;}
len += 4; // all points not in the basic plane have length 4
}
else
len += inttomb(NULL, this);
}
if (haveNA) {
PROTECT(ans = allocVector(STRSXP, 1));
SET_STRING_ELT(ans, 0, NA_STRING);
UNPROTECT(2);
return ans;
}
if (len >= 10000) {
tmp = Calloc(len+1, char);
} else {
R_CheckStack2(len+1);
tmp = alloca(len+1); tmp[len] = '\0';
}
for (i = 0, len = 0; i < nc; i++) {
int this = INTEGER(x)[i];
if(s_pair && (this >= 0xD800 && this <= 0xDBFF)) {
// all the validity checking has already been done.
int next = INTEGER(x)[++i];
unsigned int hi = this - 0xD800, lo = next - 0xDC00;
this = 0x10000 + (hi << 10) + lo;
}
used = inttomb(buf, this);
strncpy(tmp + len, buf, used);
len += used;
}
PROTECT(ans = allocVector(STRSXP, 1));
SET_STRING_ELT(ans, 0, mkCharLenCE(tmp, (int) len, CE_UTF8));
if(len >= 10000) Free(tmp);
}
UNPROTECT(2);
return ans;
}