src/library/tools/src/text.c - R - Git at Google

 /*
  *  R : A Computer Language for Statistical Data Analysis
  *  Copyright (C) 2003-2016   The R Core Team.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  the Free Software Foundation; either version 2 of the License, or
  *  (at your option) any later version.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, a copy is available at
  *  https://www.R-project.org/Licenses/
  */

 /* <UTF8> OK, provided the delimiters are ASCII
    match length is in now in chars.
 */

 #include <string.h>
 #include <R.h>
 #include "tools.h"

 #include <stdlib.h> /* for MB_CUR_MAX */
 #include <wchar.h>
 LibExtern Rboolean mbcslocale;
 size_t Rf_mbrtowc(wchar_t *wc, const char *s, size_t n, mbstate_t *ps);

 /* .Call, so manages R_alloc stack */
 SEXP
 delim_match(SEXP x, SEXP delims)
 {
     /*
       Match delimited substrings in a character vector x.

       Returns an integer vector with the same length of x giving the
       starting position of the match (including the start delimiter), or
       -1 if there is none, with attribute "match.length" giving the
       length of the matched text (including the end delimiter), or -1
       for no match.

       This is still very experimental.

       Currently, the start and end delimiters must be single characters;
       it would be nice to allow for arbitrary regexps.

       Currently, the only syntax supported is Rd ('\' is the escape
       character, '%' starts a comment extending to the next newline, no
       quote characters.  It would be nice to generalize this, too.

       Nevertheless, this is already useful for parsing Rd.
     */

     char c;
     const char *s, *delim_start, *delim_end;
     Sint n, i, pos, start, end, delim_depth;
     int lstart, lend;
     Rboolean is_escaped, equal_start_and_end_delims;
     SEXP ans, matchlen;
     mbstate_t mb_st; int used;

     if(!isString(x) || !isString(delims) || (length(delims) != 2))
 	error(_("invalid argument type"));

     delim_start = translateChar(STRING_ELT(delims, 0));
     delim_end = translateChar(STRING_ELT(delims, 1));
     lstart = (int) strlen(delim_start); lend = (int) strlen(delim_end);
     equal_start_and_end_delims = strcmp(delim_start, delim_end) == 0;

     n = length(x);
     PROTECT(ans = allocVector(INTSXP, n));
     PROTECT(matchlen = allocVector(INTSXP, n));

     for(i = 0; i < n; i++) {
 	memset(&mb_st, 0, sizeof(mbstate_t));
 	start = end = -1;
 	s = translateChar(STRING_ELT(x, i));
 	pos = is_escaped = delim_depth = 0;
 	while((c = *s) != '\0') {
 	    if(c == '\n') {
 		is_escaped = FALSE;
 	    }
 	    else if(c == '\\') {
 		is_escaped = is_escaped ? FALSE : TRUE;
 	    }
 	    else if(is_escaped) {
 		is_escaped = FALSE;
 	    }
 	    else if(c == '%') {
 		while((c != '\0') && (c != '\n')) {
 		    if(mbcslocale) {
 			used = (int) Rf_mbrtowc(NULL, s, MB_CUR_MAX, &mb_st);
 			if(used == 0) break;
 			s += used; c = *s;
 		    } else
 			c = *++s;
 		    pos++;
 		}
 	    }
 	    else if(strncmp(s, delim_end, lend) == 0) {
 		if(delim_depth > 1) delim_depth--;
 		else if(delim_depth == 1) {
 		    end = pos;
 		    break;
 		}
 		else if(equal_start_and_end_delims) {
 		    start = pos;
 		    delim_depth++;
 		}
 	    }
 	    else if(strncmp(s, delim_start, lstart) == 0) {
 		if(delim_depth == 0) start = pos;
 		delim_depth++;
 	    }
 	    if(mbcslocale) {
 		used = (int) Rf_mbrtowc(NULL, s, MB_CUR_MAX, &mb_st);
 		if(used == 0) break;
 		s += used;
 	    } else
 		s++;
 	    pos++;
 	}
 	if(end > -1) {
 	    INTEGER(ans)[i] = start + 1; /* index from one */
 	    INTEGER(matchlen)[i] = end - start + 1;
 	}
 	else {
 	    INTEGER(ans)[i] = INTEGER(matchlen)[i] = -1;
 	}
     }
     setAttrib(ans, install("match.length"), matchlen);
     UNPROTECT(2);
     return(ans);
 }

 SEXP
 check_nonASCII(SEXP text, SEXP ignore_quotes)
 {
     /* Check if all the lines in 'text' are ASCII, after removing
        comments and ignoring the contents of quotes (unless ignore_quotes)
        (which might span more than one line and might be escaped).

        This cannot be entirely correct, as quotes and \ might occur as
        part of another character in a MBCS: but this does not happen
        in UTF-8.
     */
     int i, nbslash = 0; /* number of preceding backslashes */
     const char *p;
     char quote= '\0';
     Rboolean ign, inquote = FALSE;

     if(TYPEOF(text) != STRSXP) error("invalid input");
     ign = asLogical(ignore_quotes);
     if(ign == NA_LOGICAL) error("'ignore_quotes' must be TRUE or FALSE");

     for (i = 0; i < LENGTH(text); i++) {
 	p = CHAR(STRING_ELT(text, i)); // ASCII or not not affected by charset
 	inquote = FALSE; /* avoid runaway quotes */
 	for(; *p; p++) {
 	    if(!inquote && *p == '#') break;
 	    if(!inquote || ign) {
 		if((unsigned int) *p > 127) {
 		    /* Rprintf("%s\n", CHAR(STRING_ELT(text, i)));
 		       Rprintf("found %x\n", (unsigned int) *p); */
 		    return ScalarLogical(TRUE);
 		}
 	    }
 	    if((nbslash % 2 == 0) && (*p == '"' || *p == '\'')) {
 		if(inquote && *p == quote) {
 		    inquote = FALSE;
 		} else if(!inquote) {
 		    quote = *p;
 		    inquote = TRUE;
 		}
 	    }
 	    if(*p == '\\') nbslash++; else nbslash = 0;
 	}
     }
     return ScalarLogical(FALSE);
 }

 SEXP check_nonASCII2(SEXP text)
 {
     SEXP ans = R_NilValue;
     int i, m = 0, m_all = 100, *ind, *ians, yes;
     const char *p;

     if(TYPEOF(text) != STRSXP) error("invalid input");
     ind = Calloc(m_all, int);
     for (i = 0; i < LENGTH(text); i++) {
 	p = CHAR(STRING_ELT(text, i));
 	yes = 0;
 	for(; *p; p++)
 	    if((unsigned int) *p > 127) {
 		yes = 1;
 		break;
 	    }
 	if(yes) {
 	    if(m >= m_all) {
 		m_all *= 2;
 		ind = Realloc(ind, m_all, int);
 	    }
 	    ind[m++] = i + 1; /* R is 1-based */
 	}
     }
     if(m) {
 	ans = allocVector(INTSXP, m);
 	ians = INTEGER(ans);
 	for(i = 0; i < m; i++) ians[i] = ind[i];
     }
     Free(ind);
     return ans;
 }

 SEXP doTabExpand(SEXP strings, SEXP starts)  /* does tab expansion for UTF-8 strings only */
 {
     int i,start, bufsize = 1024;
     char *buffer = malloc(bufsize*sizeof(char)), *b;
     const char *input;
     SEXP result;
     if (!buffer) error(_("out of memory"));
     PROTECT(result = allocVector(STRSXP, length(strings)));
     for (i = 0; i < length(strings); i++) {
     	input = CHAR(STRING_ELT(strings, i));
     	start = INTEGER(starts)[i];
     	for (b = buffer; *input; ) {
     	    /* only the first byte of multi-byte chars counts */
     	    if (0x80 <= (unsigned char)*input && (unsigned char)*input <= 0xBF)
     		start--;
     	    else if (*input == '\n')
     	    	start = (int)(buffer-b-1);
     	    if (*input == '\t') do {
     	    	*b++ = ' ';
     	    } while (((b-buffer+start) & 7) != 0);
     	    else *b++ = *input;
     	    if (b - buffer >= bufsize - 8) {
     	    	int pos = (int)(b - buffer);
 		char *tmp;
     	        bufsize *= 2;
     	    	tmp = realloc(buffer, bufsize*sizeof(char));
     	    	if (!tmp) {
 		    free(buffer); /* free original allocation */
 		    error(_("out of memory"));
 		} else buffer = tmp;
     	    	b = buffer + pos;
     	    }
     	    input++;
     	}
     	*b = '\0';
     	SET_STRING_ELT(result, i, mkCharCE(buffer, Rf_getCharCE(STRING_ELT(strings, i))));
     }
     UNPROTECT(1);
     free(buffer);
     return result;
 }

 /* This could be done in wchar_t, but it is only used for
    ASCII delimiters which are not lead bytes in UTF-8 or
    DBCS encodings. */
 SEXP splitString(SEXP string, SEXP delims)
 {
     if(!isString(string) || length(string) != 1)
 	error("first arg must be a single character string");
     if(!isString(delims) || length(delims) != 1)
 	error("first arg must be a single character string");

     if(STRING_ELT(string, 0) == NA_STRING)
 	return ScalarString(NA_STRING);
     if(STRING_ELT(delims, 0) == NA_STRING)
 	return ScalarString(NA_STRING);

     const char *in = CHAR(STRING_ELT(string, 0)),
 	*del = CHAR(STRING_ELT(delims, 0));
     cetype_t ienc = getCharCE(STRING_ELT(string, 0));
     int nc = (int) strlen(in), used = 0;

     // Used for short strings, so OK to over-allocate wildly
     SEXP out = PROTECT(allocVector(STRSXP, nc));
     const char *p;
     char tmp[nc], *this = tmp;
     int nthis = 0;
     for(p = in; *p ; p++) {
 	if(strchr(del, *p)) {
 	    // put out current string (if any)
 	    if(nthis)
 		SET_STRING_ELT(out, used++, mkCharLenCE(tmp, nthis, ienc));
 	    // put out delimiter
 	    SET_STRING_ELT(out, used++, mkCharLen(p, 1));
 	    // restart
 	    this = tmp; nthis = 0;
 	} else {
 	    *this++ = *p;
 	    nthis++;
 	}
     }
     if(nthis) SET_STRING_ELT(out, used++, mkCharLenCE(tmp, nthis, ienc));

     SEXP ans = lengthgets(out, used);
     UNPROTECT(1);
     return ans;
 }
	/*
	* R : A Computer Language for Statistical Data Analysis
	* Copyright (C) 2003-2016 The R Core Team.
	*
	* This program is free software; you can redistribute it and/or modify
	* it under the terms of the GNU General Public License as published by
	* the Free Software Foundation; either version 2 of the License, or
	* (at your option) any later version.
	*
	* This program is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* GNU General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with this program; if not, a copy is available at
	* https://www.R-project.org/Licenses/
	*/

	/* <UTF8> OK, provided the delimiters are ASCII
	match length is in now in chars.
	*/

	#include <string.h>
	#include <R.h>
	#include "tools.h"

	#include <stdlib.h> /* for MB_CUR_MAX */
	#include <wchar.h>
	LibExtern Rboolean mbcslocale;
	size_t Rf_mbrtowc(wchar_t wc, const char s, size_t n, mbstate_t *ps);

	/* .Call, so manages R_alloc stack */
	SEXP
	delim_match(SEXP x, SEXP delims)
	{
	/*
	Match delimited substrings in a character vector x.

	Returns an integer vector with the same length of x giving the
	starting position of the match (including the start delimiter), or
	-1 if there is none, with attribute "match.length" giving the
	length of the matched text (including the end delimiter), or -1
	for no match.

	This is still very experimental.

	Currently, the start and end delimiters must be single characters;
	it would be nice to allow for arbitrary regexps.

	Currently, the only syntax supported is Rd ('\' is the escape
	character, '%' starts a comment extending to the next newline, no
	quote characters. It would be nice to generalize this, too.

	Nevertheless, this is already useful for parsing Rd.
	*/

	char c;
	const char s, delim_start, *delim_end;
	Sint n, i, pos, start, end, delim_depth;
	int lstart, lend;
	Rboolean is_escaped, equal_start_and_end_delims;
	SEXP ans, matchlen;
	mbstate_t mb_st; int used;

	if(!isString(x) \|\| !isString(delims) \|\| (length(delims) != 2))
	error(_("invalid argument type"));

	delim_start = translateChar(STRING_ELT(delims, 0));
	delim_end = translateChar(STRING_ELT(delims, 1));
	lstart = (int) strlen(delim_start); lend = (int) strlen(delim_end);
	equal_start_and_end_delims = strcmp(delim_start, delim_end) == 0;

	n = length(x);
	PROTECT(ans = allocVector(INTSXP, n));
	PROTECT(matchlen = allocVector(INTSXP, n));

	for(i = 0; i < n; i++) {
	memset(&mb_st, 0, sizeof(mbstate_t));
	start = end = -1;
	s = translateChar(STRING_ELT(x, i));
	pos = is_escaped = delim_depth = 0;
	while((c = *s) != '\0') {
	if(c == '\n') {
	is_escaped = FALSE;
	}
	else if(c == '\\') {
	is_escaped = is_escaped ? FALSE : TRUE;
	}
	else if(is_escaped) {
	is_escaped = FALSE;
	}
	else if(c == '%') {
	while((c != '\0') && (c != '\n')) {
	if(mbcslocale) {
	used = (int) Rf_mbrtowc(NULL, s, MB_CUR_MAX, &mb_st);
	if(used == 0) break;
	s += used; c = *s;
	} else
	c = *++s;
	pos++;
	}
	}
	else if(strncmp(s, delim_end, lend) == 0) {
	if(delim_depth > 1) delim_depth--;
	else if(delim_depth == 1) {
	end = pos;
	break;
	}
	else if(equal_start_and_end_delims) {
	start = pos;
	delim_depth++;
	}
	}
	else if(strncmp(s, delim_start, lstart) == 0) {
	if(delim_depth == 0) start = pos;
	delim_depth++;
	}
	if(mbcslocale) {
	used = (int) Rf_mbrtowc(NULL, s, MB_CUR_MAX, &mb_st);
	if(used == 0) break;
	s += used;
	} else
	s++;
	pos++;
	}
	if(end > -1) {
	INTEGER(ans)[i] = start + 1; /* index from one */
	INTEGER(matchlen)[i] = end - start + 1;
	}
	else {
	INTEGER(ans)[i] = INTEGER(matchlen)[i] = -1;
	}
	}
	setAttrib(ans, install("match.length"), matchlen);
	UNPROTECT(2);
	return(ans);
	}

	SEXP
	check_nonASCII(SEXP text, SEXP ignore_quotes)
	{
	/* Check if all the lines in 'text' are ASCII, after removing
	comments and ignoring the contents of quotes (unless ignore_quotes)
	(which might span more than one line and might be escaped).

	This cannot be entirely correct, as quotes and \ might occur as
	part of another character in a MBCS: but this does not happen
	in UTF-8.
	*/
	int i, nbslash = 0; /* number of preceding backslashes */
	const char *p;
	char quote= '\0';
	Rboolean ign, inquote = FALSE;

	if(TYPEOF(text) != STRSXP) error("invalid input");
	ign = asLogical(ignore_quotes);
	if(ign == NA_LOGICAL) error("'ignore_quotes' must be TRUE or FALSE");

	for (i = 0; i < LENGTH(text); i++) {
	p = CHAR(STRING_ELT(text, i)); // ASCII or not not affected by charset
	inquote = FALSE; /* avoid runaway quotes */
	for(; *p; p++) {
	if(!inquote && *p == '#') break;
	if(!inquote \|\| ign) {
	if((unsigned int) *p > 127) {
	/* Rprintf("%s\n", CHAR(STRING_ELT(text, i)));
	Rprintf("found %x\n", (unsigned int) p); /
	return ScalarLogical(TRUE);
	}
	}
	if((nbslash % 2 == 0) && (p == '"' \|\| p == '\'')) {
	if(inquote && *p == quote) {
	inquote = FALSE;
	} else if(!inquote) {
	quote = *p;
	inquote = TRUE;
	}
	}
	if(*p == '\\') nbslash++; else nbslash = 0;
	}
	}
	return ScalarLogical(FALSE);
	}

	SEXP check_nonASCII2(SEXP text)
	{
	SEXP ans = R_NilValue;
	int i, m = 0, m_all = 100, ind, ians, yes;
	const char *p;

	if(TYPEOF(text) != STRSXP) error("invalid input");
	ind = Calloc(m_all, int);
	for (i = 0; i < LENGTH(text); i++) {
	p = CHAR(STRING_ELT(text, i));
	yes = 0;
	for(; *p; p++)
	if((unsigned int) *p > 127) {
	yes = 1;
	break;
	}
	if(yes) {
	if(m >= m_all) {
	m_all *= 2;
	ind = Realloc(ind, m_all, int);
	}
	ind[m++] = i + 1; /* R is 1-based */
	}
	}
	if(m) {
	ans = allocVector(INTSXP, m);
	ians = INTEGER(ans);
	for(i = 0; i < m; i++) ians[i] = ind[i];
	}
	Free(ind);
	return ans;
	}

	SEXP doTabExpand(SEXP strings, SEXP starts) /* does tab expansion for UTF-8 strings only */
	{
	int i,start, bufsize = 1024;
	char buffer = malloc(bufsizesizeof(char)), *b;
	const char *input;
	SEXP result;
	if (!buffer) error(_("out of memory"));
	PROTECT(result = allocVector(STRSXP, length(strings)));
	for (i = 0; i < length(strings); i++) {
	input = CHAR(STRING_ELT(strings, i));
	start = INTEGER(starts)[i];
	for (b = buffer; *input; ) {
	/* only the first byte of multi-byte chars counts */
	if (0x80 <= (unsigned char)input && (unsigned char)input <= 0xBF)
	start--;
	else if (*input == '\n')
	start = (int)(buffer-b-1);
	if (*input == '\t') do {
	*b++ = ' ';
	} while (((b-buffer+start) & 7) != 0);
	else b++ = input;
	if (b - buffer >= bufsize - 8) {
	int pos = (int)(b - buffer);
	char *tmp;
	bufsize *= 2;
	tmp = realloc(buffer, bufsize*sizeof(char));
	if (!tmp) {
	free(buffer); /* free original allocation */
	error(_("out of memory"));
	} else buffer = tmp;
	b = buffer + pos;
	}
	input++;
	}
	*b = '\0';
	SET_STRING_ELT(result, i, mkCharCE(buffer, Rf_getCharCE(STRING_ELT(strings, i))));
	}
	UNPROTECT(1);
	free(buffer);
	return result;
	}

	/* This could be done in wchar_t, but it is only used for
	ASCII delimiters which are not lead bytes in UTF-8 or
	DBCS encodings. */
	SEXP splitString(SEXP string, SEXP delims)
	{
	if(!isString(string) \|\| length(string) != 1)
	error("first arg must be a single character string");
	if(!isString(delims) \|\| length(delims) != 1)
	error("first arg must be a single character string");

	if(STRING_ELT(string, 0) == NA_STRING)
	return ScalarString(NA_STRING);
	if(STRING_ELT(delims, 0) == NA_STRING)
	return ScalarString(NA_STRING);

	const char *in = CHAR(STRING_ELT(string, 0)),
	*del = CHAR(STRING_ELT(delims, 0));
	cetype_t ienc = getCharCE(STRING_ELT(string, 0));
	int nc = (int) strlen(in), used = 0;

	// Used for short strings, so OK to over-allocate wildly
	SEXP out = PROTECT(allocVector(STRSXP, nc));
	const char *p;
	char tmp[nc], *this = tmp;
	int nthis = 0;
	for(p = in; *p ; p++) {
	if(strchr(del, *p)) {
	// put out current string (if any)
	if(nthis)
	SET_STRING_ELT(out, used++, mkCharLenCE(tmp, nthis, ienc));
	// put out delimiter
	SET_STRING_ELT(out, used++, mkCharLen(p, 1));
	// restart
	this = tmp; nthis = 0;
	} else {
	this++ = p;
	nthis++;
	}
	}
	if(nthis) SET_STRING_ELT(out, used++, mkCharLenCE(tmp, nthis, ienc));

	SEXP ans = lengthgets(out, used);
	UNPROTECT(1);
	return ans;
	}