blob: 09ef855c505670e3536002d7214c3dd9906dc561 [file] [log] [blame]
/*
* R : A Computer Language for Statistical Data Analysis
* Copyright (C) 2002--2015 The R Core Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Pulic License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, a copy is available at
* https://www.R-project.org/Licenses/
*/
/* This at times needed to be separate from grep.c, as TRE has a
conflicting regcomp and the two headers cannot both be included in
one file
*/
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include <Defn.h>
#include <Internal.h>
/* This is remapped */
#undef pmatch
/* interval at which to check interrupts */
#define NINTERRUPT 1000000
#include <R_ext/RS.h> /* for Calloc/Free */
#include <wchar.h>
#include <tre/tre.h>
static void
amatch_regaparams(regaparams_t *params, int patlen,
double *bounds, int *costs)
{
int cost, max_cost, warn = 0;
double bound;
cost = params->cost_ins = max_cost = costs[0];
cost = params->cost_del = costs[1];
if(cost > max_cost) max_cost = cost;
cost = params->cost_subst = costs[2];
if(cost > max_cost) max_cost = cost;
bound = bounds[0];
if(ISNA(bound)) {
params->max_cost = INT_MAX;
} else {
if(bound < 1) bound *= (patlen * max_cost);
params->max_cost = IntegerFromReal(ceil(bound), &warn);
CoercionWarning(warn);
}
bound = bounds[1];
if(ISNA(bound)) {
params->max_del = INT_MAX;
} else {
if(bound < 1) bound *= patlen;
params->max_del = IntegerFromReal(ceil(bound), &warn);
CoercionWarning(warn);
}
bound = bounds[2];
if(ISNA(bound)) {
params->max_ins = INT_MAX;
} else {
if(bound < 1) bound *= patlen;
params->max_ins = IntegerFromReal(ceil(bound), &warn);
CoercionWarning(warn);
}
bound = bounds[3];
if(ISNA(bound)) {
params->max_subst = INT_MAX;
} else {
if(bound < 1) bound *= patlen;
params->max_subst = IntegerFromReal(ceil(bound), &warn);
CoercionWarning(warn);
}
bound = bounds[4];
if(ISNA(bound)) {
params->max_err = INT_MAX;
} else {
if(bound < 1) bound *= patlen;
params->max_err = IntegerFromReal(ceil(bound), &warn);
CoercionWarning(warn);
}
}
SEXP attribute_hidden do_agrep(SEXP call, SEXP op, SEXP args, SEXP env)
{
SEXP pat, vec, ind, ans;
SEXP opt_costs, opt_bounds;
int opt_icase, opt_value, opt_fixed, useBytes;
R_xlen_t i, j, n;
int nmatches, patlen;
Rboolean useWC = FALSE;
const void *vmax = NULL;
regex_t reg;
regaparams_t params;
regamatch_t match;
int rc, cflags = REG_EXTENDED | REG_NOSUB;
checkArity(op, args);
pat = CAR(args); args = CDR(args);
vec = CAR(args); args = CDR(args);
opt_icase = asLogical(CAR(args)); args = CDR(args);
opt_value = asLogical(CAR(args)); args = CDR(args);
opt_costs = CAR(args); args = CDR(args);
opt_bounds = CAR(args); args = CDR(args);
useBytes = asLogical(CAR(args));
args = CDR(args);
opt_fixed = asLogical(CAR(args));
if(opt_icase == NA_INTEGER) opt_icase = 0;
if(opt_value == NA_INTEGER) opt_value = 0;
if(useBytes == NA_INTEGER) useBytes = 0;
if(opt_fixed == NA_INTEGER) opt_fixed = 1;
if(opt_fixed) cflags |= REG_LITERAL;
if(!isString(pat) || LENGTH(pat) < 1)
error(_("invalid '%s' argument"), "pattern");
if(LENGTH(pat) > 1)
warning(_("argument '%s' has length > 1 and only the first element will be used"), "pattern");
if(!isString(vec)) error(_("invalid '%s' argument"), "x");
if(opt_icase) cflags |= REG_ICASE;
n = XLENGTH(vec);
if(!useBytes) {
Rboolean haveBytes = IS_BYTES(STRING_ELT(pat, 0));
if(!haveBytes)
for (i = 0; i < n; i++)
if(IS_BYTES(STRING_ELT(vec, i))) {
haveBytes = TRUE;
break;
}
if(haveBytes) useBytes = TRUE;
}
if(!useBytes) {
useWC = !IS_ASCII(STRING_ELT(pat, 0));
if(!useWC) {
for (i = 0 ; i < n ; i++) {
if(STRING_ELT(vec, i) == NA_STRING) continue;
if(!IS_ASCII(STRING_ELT(vec, i))) {
useWC = TRUE;
break;
}
}
}
}
if(STRING_ELT(pat, 0) == NA_STRING) {
if(opt_value) {
PROTECT(ans = allocVector(STRSXP, n));
for(i = 0; i < n; i++)
SET_STRING_ELT(ans, i, NA_STRING);
SEXP nms = getAttrib(vec, R_NamesSymbol);
if(!isNull(nms))
setAttrib(ans, R_NamesSymbol, nms);
} else {
PROTECT(ans = allocVector(INTSXP, n));
for(i = 0; i < n; i++)
INTEGER(ans)[i] = NA_INTEGER;
}
UNPROTECT(1);
return ans;
}
SEXP s_nchar = install("nchar");
if(useBytes)
PROTECT(call = lang3(s_nchar, pat,
ScalarString(mkChar("bytes"))));
else
PROTECT(call = lang3(s_nchar, pat,
ScalarString(mkChar("chars"))));
patlen = asInteger(eval(call, env));
UNPROTECT(1);
if(!patlen)
error(_("'pattern' must be a non-empty character string"));
/* wtransChar and translateChar can R_alloc */
vmax = vmaxget();
if(useBytes)
rc = tre_regcompb(&reg, CHAR(STRING_ELT(pat, 0)), cflags);
else if(useWC)
rc = tre_regwcomp(&reg, wtransChar(STRING_ELT(pat, 0)), cflags);
else {
const char *spat = translateChar(STRING_ELT(pat, 0));
if(mbcslocale && !mbcsValid(spat))
error(_("regular expression is invalid in this locale"));
rc = tre_regcomp(&reg, spat, cflags);
}
if(rc) {
char errbuf[1001];
tre_regerror(rc, &reg, errbuf, 1001);
error(_("regcomp error: '%s'"), errbuf);
}
tre_regaparams_default(&params);
amatch_regaparams(&params, patlen,
REAL(opt_bounds), INTEGER(opt_costs));
/* Matching. */
n = LENGTH(vec);
PROTECT(ind = allocVector(LGLSXP, n));
nmatches = 0;
for (i = 0 ; i < n ; i++) {
// if ((i+1) % NINTERRUPT == 0) R_CheckUserInterrupt();
if(STRING_ELT(vec, i) == NA_STRING) {
LOGICAL(ind)[i] = 0;
continue;
}
/* Perform match. */
/* undocumented, must be zeroed */
memset(&match, 0, sizeof(match));
if(useBytes)
rc = tre_regaexecb(&reg,
CHAR(STRING_ELT(vec, i)),
&match, params, 0);
else if(useWC) {
rc = tre_regawexec(&reg,
wtransChar(STRING_ELT(vec, i)),
&match, params, 0);
vmaxset(vmax);
} else {
const char *s = translateChar(STRING_ELT(vec, i));
if(mbcslocale && !mbcsValid(s))
error(_("input string %d is invalid in this locale"), i+1);
rc = tre_regaexec(&reg, s, &match, params, 0);
vmaxset(vmax);
}
if(rc == REG_OK) {
LOGICAL(ind)[i] = 1;
nmatches++;
} else LOGICAL(ind)[i] = 0;
}
tre_regfree(&reg);
if (PRIMVAL(op)) {/* agrepl case */
UNPROTECT(1);
return ind;
}
if(opt_value) {
PROTECT(ans = allocVector(STRSXP, nmatches));
SEXP nmold = getAttrib(vec, R_NamesSymbol), nm;
for (j = i = 0 ; i < n ; i++) {
if(LOGICAL(ind)[i])
SET_STRING_ELT(ans, j++, STRING_ELT(vec, i));
}
/* copy across names and subset */
if(!isNull(nmold)) {
nm = allocVector(STRSXP, nmatches);
for (i = 0, j = 0; i < n ; i++)
if(LOGICAL(ind)[i])
SET_STRING_ELT(nm, j++, STRING_ELT(nmold, i));
setAttrib(ans, R_NamesSymbol, nm);
}
}
#ifdef LONG_VECTOR_SUPPORT
else if (n > INT_MAX) {
PROTECT(ans = allocVector(REALSXP, nmatches));
for (j = i = 0 ; i < n ; i++)
if(LOGICAL(ind)[i] == 1)
REAL(ans)[j++] = (double)(i + 1);
}
#endif
else {
PROTECT(ans = allocVector(INTSXP, nmatches));
for (j = i = 0 ; i < n ; i++)
if(LOGICAL(ind)[i] == 1)
INTEGER(ans)[j++] = (int)(i + 1);
}
UNPROTECT(2);
return ans;
}
#define ANS(I, J) REAL(ans)[I + J * nx]
#define COUNTS(I, J, K) INTEGER(counts)[I + J * nx + K * nxy]
#define MAT(X, I, J) X[I + (J) * nr]
static SEXP
adist_full(SEXP x, SEXP y, double *costs, Rboolean opt_counts)
{
SEXP ans, counts, trafos = R_NilValue /* -Wall */, dimnames, names;
double cost_ins, cost_del, cost_sub;
double *dists, d, d_ins, d_del, d_sub;
char *paths = NULL, p, *buf = NULL;
int i, j, k, l, m, nx, ny, nxy, *xi, *yj, nxi, nyj, nr, nc, nz;
int nins, ndel, nsub, buflen = 100, need;
counts = R_NilValue; /* -Wall */
nx = LENGTH(x);
ny = LENGTH(y);
nxy = nx * ny;
cost_ins = costs[0];
cost_del = costs[1];
cost_sub = costs[2];
PROTECT(ans = allocMatrix(REALSXP, nx, ny));
if(opt_counts) {
PROTECT(counts = alloc3DArray(INTSXP, nx, ny, 3));
PROTECT(trafos = allocMatrix(STRSXP, nx, ny));
buf = Calloc(buflen, char);
}
for(i = 0; i < nx; i++) {
nxi = LENGTH(VECTOR_ELT(x, i));
xi = INTEGER(VECTOR_ELT(x, i));
if(nxi && (xi[0] == NA_INTEGER)) {
for(j = 0; j < ny; j++) {
ANS(i, j) = NA_REAL;
}
if(opt_counts) {
for(m = 0; m < 3; m++) {
COUNTS(i, j, m) = NA_INTEGER;
}
}
} else {
for(j = 0; j < ny; j++) {
nyj = LENGTH(VECTOR_ELT(y, j));
yj = INTEGER(VECTOR_ELT(y, j));
if(nyj && (yj[0] == NA_INTEGER)) {
ANS(i, j) = NA_REAL;
if(opt_counts) {
for(m = 0; m < 3; m++) {
COUNTS(i, j, m) = NA_INTEGER;
}
}
}
else {
/* Determine operation-weighted edit distance via
* straightforward dynamic programming.
*/
nr = nxi + 1;
nc = nyj + 1;
dists = Calloc(nr * nc, double);
MAT(dists, 0, 0) = 0;
for(k = 1; k < nr; k++)
MAT(dists, k, 0) = k * cost_del;
for(l = 1; l < nc; l++)
MAT(dists, 0, l) = l * cost_ins;
if(opt_counts) {
paths = Calloc(nr * nc, char);
for(k = 1; k < nr; k++)
MAT(paths, k, 0) = 'D';
for(l = 1; l < nc; l++)
MAT(paths, 0, l) = 'I';
}
for(k = 1; k < nr; k++) {
for(l = 1; l < nc; l++) {
if(xi[k - 1] == yj[l - 1]) {
MAT(dists, k, l) =
MAT(dists, k - 1, l - 1);
if(opt_counts)
MAT(paths, k, l) = 'M';
} else {
d_ins = MAT(dists, k, l - 1) + cost_ins;
d_del = MAT(dists, k - 1, l) + cost_del;
d_sub = MAT(dists, k - 1, l - 1) + cost_sub;
if(opt_counts) {
if(d_ins <= d_del) {
d = d_ins;
p = 'I';
} else {
d = d_del;
p = 'D';
}
if(d_sub < d) {
d = d_sub;
p = 'S';
}
MAT(paths, k, l) = p;
} else {
d = fmin(fmin(d_ins, d_del), d_sub);
}
MAT(dists, k, l) = d;
}
}
}
ANS(i, j) = MAT(dists, nxi, nyj);
if(opt_counts) {
if(!R_finite(ANS(i, j))) {
for(m = 0; m < 3; m++) {
COUNTS(i, j, m) = NA_INTEGER;
}
SET_STRING_ELT(trafos, i + nx * j, NA_STRING);
} else {
nins = ndel = nsub = 0;
k = nxi; l = nyj; m = k + l; nz = m;
need = 2 * m + 1;
if(buflen < need) {
buf = Realloc(buf, need , char);
buflen = need;
}
/* Need to read backwards and fill forwards. */
while((k > 0) || (l > 0)) {
p = MAT(paths, k, l);
if(p == 'I') {
nins++;
l--;
} else if(p == 'D') {
ndel++;
k--;
} else {
if(p == 'S')
nsub++;
k--;
l--;
}
buf[m] = p;
m++;
}
/* Now reverse the transcript. */
for(k = 0, l = --m; l >= nz; k++, l--)
buf[k] = buf[l];
buf[++k] = '\0';
COUNTS(i, j, 0) = nins;
COUNTS(i, j, 1) = ndel;
COUNTS(i, j, 2) = nsub;
SET_STRING_ELT(trafos, i + nx * j, mkChar(buf));
}
Free(paths);
}
Free(dists);
}
}
}
}
PROTECT(x = getAttrib(x, R_NamesSymbol));
PROTECT(y = getAttrib(y, R_NamesSymbol));
if(!isNull(x) || !isNull(y)) {
PROTECT(dimnames = allocVector(VECSXP, 2));
SET_VECTOR_ELT(dimnames, 0, x);
SET_VECTOR_ELT(dimnames, 1, y);
setAttrib(ans, R_DimNamesSymbol, dimnames);
UNPROTECT(1); /* dimnames */
}
if(opt_counts) {
Free(buf);
PROTECT(dimnames = allocVector(VECSXP, 3));
PROTECT(names = allocVector(STRSXP, 3));
SET_STRING_ELT(names, 0, mkChar("ins"));
SET_STRING_ELT(names, 1, mkChar("del"));
SET_STRING_ELT(names, 2, mkChar("sub"));
SET_VECTOR_ELT(dimnames, 0, x);
SET_VECTOR_ELT(dimnames, 1, y);
SET_VECTOR_ELT(dimnames, 2, names);
setAttrib(counts, R_DimNamesSymbol, dimnames);
setAttrib(ans, install("counts"), counts);
UNPROTECT(2); /* names, dimnames */
if(!isNull(x) || !isNull(y)) {
PROTECT(dimnames = allocVector(VECSXP, 2));
SET_VECTOR_ELT(dimnames, 0, x);
SET_VECTOR_ELT(dimnames, 1, y);
setAttrib(trafos, R_DimNamesSymbol, dimnames);
UNPROTECT(1); /* dimnames */
}
setAttrib(ans, install("trafos"), trafos);
UNPROTECT(2); /* trafos, counts */
}
UNPROTECT(3); /* y, x, ans */
return ans;
}
#define OFFSETS(I, J, K) INTEGER(offsets)[I + J * nx + K * nxy]
SEXP attribute_hidden do_adist(SEXP call, SEXP op, SEXP args, SEXP env)
{
SEXP x, y;
SEXP ans, counts, offsets, dimnames, names, elt;
SEXP opt_costs;
int opt_fixed, opt_partial, opt_counts, opt_icase, useBytes;
int i = 0, j = 0, m, nx, ny, nxy;
const char *s, *t;
const void *vmax = NULL;
Rboolean haveBytes, useWC = FALSE;
regex_t reg;
regaparams_t params;
regamatch_t match;
size_t nmatch = 0 /* -Wall */;
regmatch_t *pmatch = NULL; /* -Wall */
int rc, cflags = REG_EXTENDED;
checkArity(op, args);
x = CAR(args); args = CDR(args);
y = CAR(args); args = CDR(args);
opt_costs = CAR(args); args = CDR(args);
opt_counts = asLogical(CAR(args)); args = CDR(args);
opt_fixed = asInteger(CAR(args)); args = CDR(args);
opt_partial = asInteger(CAR(args)); args = CDR(args);
opt_icase = asLogical(CAR(args)); args = CDR(args);
useBytes = asLogical(CAR(args));
if(opt_counts == NA_INTEGER) opt_counts = 0;
if(opt_fixed == NA_INTEGER) opt_fixed = 1;
if(opt_partial == NA_INTEGER) opt_partial = 0;
if(opt_icase == NA_INTEGER) opt_icase = 0;
if(useBytes == NA_INTEGER) useBytes = 0;
if(opt_fixed) cflags |= REG_LITERAL;
if(opt_icase) cflags |= REG_ICASE;
if(!opt_fixed && !opt_partial) {
warning(_("argument '%s' will be ignored"), "partial = FALSE");
}
if(!opt_partial)
return(adist_full(x, y, REAL(opt_costs), opt_counts));
counts = R_NilValue; /* -Wall */
offsets = R_NilValue; /* -Wall */
if(!opt_counts) cflags |= REG_NOSUB;
nx = length(x);
ny = length(y);
nxy = nx * ny;
if(!useBytes) {
haveBytes = FALSE;
for(i = 0; i < nx; i++) {
if(IS_BYTES(STRING_ELT(x, i))) {
haveBytes = TRUE;
break;
}
}
if(!haveBytes) {
for(j = 0; j < ny; j++) {
if(IS_BYTES(STRING_ELT(y, j))) {
haveBytes = TRUE;
break;
}
}
}
if(haveBytes) useBytes = TRUE;
}
if(!useBytes) {
for(i = 0; i < nx; i++) {
if(STRING_ELT(x, i) == NA_STRING) continue;
if(!IS_ASCII(STRING_ELT(x, i))) {
useWC = TRUE;
break;
}
}
if(!useWC) {
for(j = 0; j < ny; j++) {
if(STRING_ELT(y, j) == NA_STRING) continue;
if(!IS_ASCII(STRING_ELT(y, j))) {
useWC = TRUE;
break;
}
}
}
}
tre_regaparams_default(&params);
params.max_cost = INT_MAX;
params.cost_ins = INTEGER(opt_costs)[0];;
params.cost_del = INTEGER(opt_costs)[1];
params.cost_subst = INTEGER(opt_costs)[2];
PROTECT(ans = allocMatrix(REALSXP, nx, ny));
if(opt_counts) {
PROTECT(counts = alloc3DArray(INTSXP, nx, ny, 3));
PROTECT(offsets = alloc3DArray(INTSXP, nx, ny, 2));
}
/* wtransChar and translateChar can R_alloc */
vmax = vmaxget();
for(i = 0; i < nx; i++) {
elt = STRING_ELT(x, i);
if(elt == NA_STRING) {
for(j = 0; j < ny; j++) {
ANS(i, j) = NA_REAL;
if(opt_counts) {
for(m = 0; m < 3; m++) {
COUNTS(i, j, m) = NA_INTEGER;
}
OFFSETS(i, j, 0) = -1;
OFFSETS(i, j, 1) = -1;
}
}
} else {
if(useBytes)
rc = tre_regcompb(&reg, CHAR(elt), cflags);
else if(useWC) {
rc = tre_regwcomp(&reg, wtransChar(elt), cflags);
vmaxset(vmax);
} else {
s = translateChar(elt);
if(mbcslocale && !mbcsValid(s)) {
error(_("input string x[%d] is invalid in this locale"),
i + 1);
}
rc = tre_regcomp(&reg, s, cflags);
vmaxset(vmax);
}
if(rc) {
char errbuf[1001];
tre_regerror(rc, &reg, errbuf, 1001);
error(_("regcomp error: '%s'"), errbuf);
}
if(opt_counts) {
nmatch = reg.re_nsub + 1;
pmatch = (regmatch_t *) malloc(nmatch * sizeof(regmatch_t));
}
for(j = 0; j < ny; j++) {
elt = STRING_ELT(y, j);
if(elt == NA_STRING) {
ANS(i, j) = NA_REAL;
if(opt_counts) {
for(m = 0; m < 3; m++) {
COUNTS(i, j, m) = NA_INTEGER;
}
OFFSETS(i, j, 0) = -1;
OFFSETS(i, j, 1) = -1;
}
}
else {
/* Perform match. */
/* undocumented, must be zeroed */
memset(&match, 0, sizeof(match));
if(opt_counts) {
match.nmatch = nmatch;
match.pmatch = pmatch;
}
if(useBytes)
rc = tre_regaexecb(&reg, CHAR(elt),
&match, params, 0);
else if(useWC) {
rc = tre_regawexec(&reg, wtransChar(elt),
&match, params, 0);
vmaxset(vmax);
} else {
t = translateChar(elt);
if(mbcslocale && !mbcsValid(t)) {
error(_("input string y[%d] is invalid in this locale"),
j + 1);
}
rc = tre_regaexec(&reg, t,
&match, params, 0);
vmaxset(vmax);
}
if(rc == REG_OK) {
ANS(i, j) = (double) match.cost;
if(opt_counts) {
COUNTS(i, j, 0) = match.num_ins;
COUNTS(i, j, 1) = match.num_del;
COUNTS(i, j, 2) = match.num_subst;
OFFSETS(i, j, 0) = match.pmatch[0].rm_so + 1;
OFFSETS(i, j, 1) = match.pmatch[0].rm_eo;
}
} else {
/* Should maybe check for REG_NOMATCH? */
ANS(i, j) = R_PosInf;
if(opt_counts) {
for(m = 0; m < 3; m++) {
COUNTS(i, j, m) = NA_INTEGER;
}
OFFSETS(i, j, 0) = -1;
OFFSETS(i, j, 1) = -1;
}
}
}
}
if(opt_counts)
free(pmatch);
tre_regfree(&reg);
}
}
PROTECT(x = getAttrib(x, R_NamesSymbol));
PROTECT(y = getAttrib(y, R_NamesSymbol));
if(!isNull(x) || !isNull(y)) {
PROTECT(dimnames = allocVector(VECSXP, 2));
SET_VECTOR_ELT(dimnames, 0, x);
SET_VECTOR_ELT(dimnames, 1, y);
setAttrib(ans, R_DimNamesSymbol, dimnames);
UNPROTECT(1); /* dimnames */
}
if(opt_counts) {
PROTECT(dimnames = allocVector(VECSXP, 3));
PROTECT(names = allocVector(STRSXP, 3));
SET_STRING_ELT(names, 0, mkChar("ins"));
SET_STRING_ELT(names, 1, mkChar("del"));
SET_STRING_ELT(names, 2, mkChar("sub"));
SET_VECTOR_ELT(dimnames, 0, x);
SET_VECTOR_ELT(dimnames, 1, y);
SET_VECTOR_ELT(dimnames, 2, names);
setAttrib(counts, R_DimNamesSymbol, dimnames);
setAttrib(ans, install("counts"), counts);
UNPROTECT(2); /* names, dimnames */
PROTECT(dimnames = allocVector(VECSXP, 3));
PROTECT(names = allocVector(STRSXP, 2));
SET_STRING_ELT(names, 0, mkChar("first"));
SET_STRING_ELT(names, 1, mkChar("last"));
SET_VECTOR_ELT(dimnames, 0, x);
SET_VECTOR_ELT(dimnames, 1, y);
SET_VECTOR_ELT(dimnames, 2, names);
setAttrib(offsets, R_DimNamesSymbol, dimnames);
setAttrib(ans, install("offsets"), offsets);
UNPROTECT(4); /* names, dimnames, counts, offsets */
}
UNPROTECT(3); /* y, x, counts */
return ans;
}
SEXP attribute_hidden do_aregexec(SEXP call, SEXP op, SEXP args, SEXP env)
{
SEXP pat, vec, ans, matchpos, matchlen;
SEXP opt_bounds, opt_costs;
int opt_icase, opt_fixed, useBytes;
Rboolean haveBytes, useWC = FALSE;
const char *s, *t;
const void *vmax = NULL;
regex_t reg;
size_t nmatch;
regmatch_t *pmatch;
regaparams_t params;
regamatch_t match;
int so, patlen;
int rc, cflags = REG_EXTENDED;
checkArity(op, args);
pat = CAR(args); args = CDR(args);
vec = CAR(args); args = CDR(args);
opt_bounds = CAR(args); args = CDR(args);
opt_costs = CAR(args); args = CDR(args);
opt_icase = asLogical(CAR(args)); args = CDR(args);
opt_fixed = asLogical(CAR(args)); args = CDR(args);
useBytes = asLogical(CAR(args));
if(opt_icase == NA_INTEGER) opt_icase = 0;
if(opt_fixed == NA_INTEGER) opt_fixed = 0;
if(useBytes == NA_INTEGER) useBytes = 0;
if(opt_fixed && opt_icase) {
warning(_("argument '%s' will be ignored"),
"ignore.case = TRUE");
opt_icase = 0;
}
if(opt_fixed) cflags |= REG_LITERAL;
if(opt_icase) cflags |= REG_ICASE;
if(!isString(pat) ||
(length(pat) < 1) ||
(STRING_ELT(pat, 0) == NA_STRING))
error(_("invalid '%s' argument"), "pattern");
if(length(pat) > 1)
warning(_("argument '%s' has length > 1 and only the first element will be used"), "pattern");
if(!isString(vec))
error(_("invalid '%s' argument"), "text");
R_xlen_t n = XLENGTH(vec);
if(!useBytes) {
haveBytes = IS_BYTES(STRING_ELT(pat, 0));
if(!haveBytes)
for(R_xlen_t i = 0; i < n; i++) {
if(IS_BYTES(STRING_ELT(vec, i))) {
haveBytes = TRUE;
break;
}
}
if(haveBytes) useBytes = TRUE;
}
if(!useBytes) {
useWC = !IS_ASCII(STRING_ELT(pat, 0));
if(!useWC) {
for(R_xlen_t i = 0 ; i < n ; i++) {
if(STRING_ELT(vec, i) == NA_STRING) continue;
if(!IS_ASCII(STRING_ELT(vec, i))) {
useWC = TRUE;
break;
}
}
}
}
SEXP s_nchar = install("nchar");
if(useBytes)
PROTECT(call = lang3(s_nchar, pat,
ScalarString(mkChar("bytes"))));
else
PROTECT(call = lang3(s_nchar, pat,
ScalarString(mkChar("chars"))));
patlen = asInteger(eval(call, env));
UNPROTECT(1);
if(!patlen)
error(_("'pattern' must be a non-empty character string"));
if(useBytes)
rc = tre_regcompb(&reg, CHAR(STRING_ELT(pat, 0)), cflags);
else if(useWC)
rc = tre_regwcomp(&reg, wtransChar(STRING_ELT(pat, 0)), cflags);
else {
s = translateChar(STRING_ELT(pat, 0));
if(mbcslocale && !mbcsValid(s))
error(_("regular expression is invalid in this locale"));
rc = tre_regcomp(&reg, s, cflags);
}
if(rc) {
char errbuf[1001];
tre_regerror(rc, &reg, errbuf, 1001);
error(_("regcomp error: '%s'"), errbuf);
}
nmatch = reg.re_nsub + 1;
pmatch = (regmatch_t *) malloc(nmatch * sizeof(regmatch_t));
tre_regaparams_default(&params);
amatch_regaparams(&params, patlen,
REAL(opt_bounds), INTEGER(opt_costs));
PROTECT(ans = allocVector(VECSXP, n));
for(R_xlen_t i = 0; i < n; i++) {
// if ((i+1) % NINTERRUPT == 0) R_CheckUserInterrupt();
if(STRING_ELT(vec, i) == NA_STRING) {
PROTECT(matchpos = ScalarInteger(NA_INTEGER));
SEXP s_match_length = install("match.length");
setAttrib(matchpos, s_match_length,
ScalarInteger(NA_INTEGER));
SET_VECTOR_ELT(ans, i, matchpos);
UNPROTECT(1);
} else {
vmax = vmaxget();
/* Perform match. */
memset(&match, 0, sizeof(match));
match.nmatch = nmatch;
match.pmatch = pmatch;
if(useBytes)
rc = tre_regaexecb(&reg, CHAR(STRING_ELT(vec, i)),
&match, params, 0);
else if(useWC) {
rc = tre_regawexec(&reg, wtransChar(STRING_ELT(vec, i)),
&match, params, 0);
vmaxset(vmax);
}
else {
t = translateChar(STRING_ELT(vec, i));
if(mbcslocale && !mbcsValid(t))
error(_("input string %d is invalid in this locale"),
i + 1);
rc = tre_regaexec(&reg, t,
&match, params, 0);
vmaxset(vmax);
}
if(rc == REG_OK) {
PROTECT(matchpos = allocVector(INTSXP, nmatch));
PROTECT(matchlen = allocVector(INTSXP, nmatch));
for(R_xlen_t j = 0; j < match.nmatch; j++) {
so = match.pmatch[j].rm_so;
INTEGER(matchpos)[j] = so + 1;
INTEGER(matchlen)[j] = match.pmatch[j].rm_eo - so;
}
setAttrib(matchpos, install("match.length"), matchlen);
if(useBytes)
setAttrib(matchpos, install("useBytes"),
ScalarLogical(TRUE));
SET_VECTOR_ELT(ans, i, matchpos);
UNPROTECT(2);
} else {
/* No match (or could there be an error?). */
/* Alternatively, could return nmatch -1 values.
*/
PROTECT(matchpos = ScalarInteger(-1));
PROTECT(matchlen = ScalarInteger(-1));
setAttrib(matchpos, install("match.length"), matchlen);
SET_VECTOR_ELT(ans, i, matchpos);
UNPROTECT(2);
}
}
}
free(pmatch);
tre_regfree(&reg);
UNPROTECT(1);
return ans;
}