blob: 28f343a6807409980b09116c25ca92897181a9b3 [file] [log] [blame]
/*
* R : A Computer Language for Statistical Data Analysis
* Copyright (C) 2003-2016 The R Core Team.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, a copy is available at
* https://www.R-project.org/Licenses/
*/
/* <UTF8> OK, provided the delimiters are ASCII
match length is in now in chars.
*/
#include <string.h>
#include <R.h>
#include "tools.h"
#include <stdlib.h> /* for MB_CUR_MAX */
#include <wchar.h>
LibExtern Rboolean mbcslocale;
size_t Rf_mbrtowc(wchar_t *wc, const char *s, size_t n, mbstate_t *ps);
/* .Call, so manages R_alloc stack */
SEXP
delim_match(SEXP x, SEXP delims)
{
/*
Match delimited substrings in a character vector x.
Returns an integer vector with the same length of x giving the
starting position of the match (including the start delimiter), or
-1 if there is none, with attribute "match.length" giving the
length of the matched text (including the end delimiter), or -1
for no match.
This is still very experimental.
Currently, the start and end delimiters must be single characters;
it would be nice to allow for arbitrary regexps.
Currently, the only syntax supported is Rd ('\' is the escape
character, '%' starts a comment extending to the next newline, no
quote characters. It would be nice to generalize this, too.
Nevertheless, this is already useful for parsing Rd.
*/
char c;
const char *s, *delim_start, *delim_end;
Sint n, i, pos, start, end, delim_depth;
int lstart, lend;
Rboolean is_escaped, equal_start_and_end_delims;
SEXP ans, matchlen;
mbstate_t mb_st; int used;
if(!isString(x) || !isString(delims) || (length(delims) != 2))
error(_("invalid argument type"));
delim_start = translateChar(STRING_ELT(delims, 0));
delim_end = translateChar(STRING_ELT(delims, 1));
lstart = (int) strlen(delim_start); lend = (int) strlen(delim_end);
equal_start_and_end_delims = strcmp(delim_start, delim_end) == 0;
n = length(x);
PROTECT(ans = allocVector(INTSXP, n));
PROTECT(matchlen = allocVector(INTSXP, n));
for(i = 0; i < n; i++) {
memset(&mb_st, 0, sizeof(mbstate_t));
start = end = -1;
s = translateChar(STRING_ELT(x, i));
pos = is_escaped = delim_depth = 0;
while((c = *s) != '\0') {
if(c == '\n') {
is_escaped = FALSE;
}
else if(c == '\\') {
is_escaped = is_escaped ? FALSE : TRUE;
}
else if(is_escaped) {
is_escaped = FALSE;
}
else if(c == '%') {
while((c != '\0') && (c != '\n')) {
if(mbcslocale) {
used = (int) Rf_mbrtowc(NULL, s, MB_CUR_MAX, &mb_st);
if(used == 0) break;
s += used; c = *s;
} else
c = *++s;
pos++;
}
}
else if(strncmp(s, delim_end, lend) == 0) {
if(delim_depth > 1) delim_depth--;
else if(delim_depth == 1) {
end = pos;
break;
}
else if(equal_start_and_end_delims) {
start = pos;
delim_depth++;
}
}
else if(strncmp(s, delim_start, lstart) == 0) {
if(delim_depth == 0) start = pos;
delim_depth++;
}
if(mbcslocale) {
used = (int) Rf_mbrtowc(NULL, s, MB_CUR_MAX, &mb_st);
if(used == 0) break;
s += used;
} else
s++;
pos++;
}
if(end > -1) {
INTEGER(ans)[i] = start + 1; /* index from one */
INTEGER(matchlen)[i] = end - start + 1;
}
else {
INTEGER(ans)[i] = INTEGER(matchlen)[i] = -1;
}
}
setAttrib(ans, install("match.length"), matchlen);
UNPROTECT(2);
return(ans);
}
SEXP
check_nonASCII(SEXP text, SEXP ignore_quotes)
{
/* Check if all the lines in 'text' are ASCII, after removing
comments and ignoring the contents of quotes (unless ignore_quotes)
(which might span more than one line and might be escaped).
This cannot be entirely correct, as quotes and \ might occur as
part of another character in a MBCS: but this does not happen
in UTF-8.
*/
int i, nbslash = 0; /* number of preceding backslashes */
const char *p;
char quote= '\0';
Rboolean ign, inquote = FALSE;
if(TYPEOF(text) != STRSXP) error("invalid input");
ign = asLogical(ignore_quotes);
if(ign == NA_LOGICAL) error("'ignore_quotes' must be TRUE or FALSE");
for (i = 0; i < LENGTH(text); i++) {
p = CHAR(STRING_ELT(text, i)); // ASCII or not not affected by charset
inquote = FALSE; /* avoid runaway quotes */
for(; *p; p++) {
if(!inquote && *p == '#') break;
if(!inquote || ign) {
if((unsigned int) *p > 127) {
/* Rprintf("%s\n", CHAR(STRING_ELT(text, i)));
Rprintf("found %x\n", (unsigned int) *p); */
return ScalarLogical(TRUE);
}
}
if((nbslash % 2 == 0) && (*p == '"' || *p == '\'')) {
if(inquote && *p == quote) {
inquote = FALSE;
} else if(!inquote) {
quote = *p;
inquote = TRUE;
}
}
if(*p == '\\') nbslash++; else nbslash = 0;
}
}
return ScalarLogical(FALSE);
}
SEXP check_nonASCII2(SEXP text)
{
SEXP ans = R_NilValue;
int i, m = 0, m_all = 100, *ind, *ians, yes;
const char *p;
if(TYPEOF(text) != STRSXP) error("invalid input");
ind = Calloc(m_all, int);
for (i = 0; i < LENGTH(text); i++) {
p = CHAR(STRING_ELT(text, i));
yes = 0;
for(; *p; p++)
if((unsigned int) *p > 127) {
yes = 1;
break;
}
if(yes) {
if(m >= m_all) {
m_all *= 2;
ind = Realloc(ind, m_all, int);
}
ind[m++] = i + 1; /* R is 1-based */
}
}
if(m) {
ans = allocVector(INTSXP, m);
ians = INTEGER(ans);
for(i = 0; i < m; i++) ians[i] = ind[i];
}
Free(ind);
return ans;
}
SEXP doTabExpand(SEXP strings, SEXP starts) /* does tab expansion for UTF-8 strings only */
{
int i,start, bufsize = 1024;
char *buffer = malloc(bufsize*sizeof(char)), *b;
const char *input;
SEXP result;
if (!buffer) error(_("out of memory"));
PROTECT(result = allocVector(STRSXP, length(strings)));
for (i = 0; i < length(strings); i++) {
input = CHAR(STRING_ELT(strings, i));
start = INTEGER(starts)[i];
for (b = buffer; *input; ) {
/* only the first byte of multi-byte chars counts */
if (0x80 <= (unsigned char)*input && (unsigned char)*input <= 0xBF)
start--;
else if (*input == '\n')
start = (int)(buffer-b-1);
if (*input == '\t') do {
*b++ = ' ';
} while (((b-buffer+start) & 7) != 0);
else *b++ = *input;
if (b - buffer >= bufsize - 8) {
int pos = (int)(b - buffer);
char *tmp;
bufsize *= 2;
tmp = realloc(buffer, bufsize*sizeof(char));
if (!tmp) {
free(buffer); /* free original allocation */
error(_("out of memory"));
} else buffer = tmp;
b = buffer + pos;
}
input++;
}
*b = '\0';
SET_STRING_ELT(result, i, mkCharCE(buffer, Rf_getCharCE(STRING_ELT(strings, i))));
}
UNPROTECT(1);
free(buffer);
return result;
}
/* This could be done in wchar_t, but it is only used for
ASCII delimiters which are not lead bytes in UTF-8 or
DBCS encodings. */
SEXP splitString(SEXP string, SEXP delims)
{
if(!isString(string) || length(string) != 1)
error("first arg must be a single character string");
if(!isString(delims) || length(delims) != 1)
error("first arg must be a single character string");
if(STRING_ELT(string, 0) == NA_STRING)
return ScalarString(NA_STRING);
if(STRING_ELT(delims, 0) == NA_STRING)
return ScalarString(NA_STRING);
const char *in = CHAR(STRING_ELT(string, 0)),
*del = CHAR(STRING_ELT(delims, 0));
cetype_t ienc = getCharCE(STRING_ELT(string, 0));
int nc = (int) strlen(in), used = 0;
// Used for short strings, so OK to over-allocate wildly
SEXP out = PROTECT(allocVector(STRSXP, nc));
const char *p;
char tmp[nc], *this = tmp;
int nthis = 0;
for(p = in; *p ; p++) {
if(strchr(del, *p)) {
// put out current string (if any)
if(nthis)
SET_STRING_ELT(out, used++, mkCharLenCE(tmp, nthis, ienc));
// put out delimiter
SET_STRING_ELT(out, used++, mkCharLen(p, 1));
// restart
this = tmp; nthis = 0;
} else {
*this++ = *p;
nthis++;
}
}
if(nthis) SET_STRING_ELT(out, used++, mkCharLenCE(tmp, nthis, ienc));
SEXP ans = lengthgets(out, used);
UNPROTECT(1);
return ans;
}