blob: 6fd415e3a8cbcd16a9fe20f86ab0c0bdb0b0cea5 [file] [log] [blame]
# File src/library/base/R/grep.R
# Part of the R package, https://www.R-project.org
#
# Copyright (C) 1995-2015 The R Core Team
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# https://www.R-project.org/Licenses/
## Q: Why are we using as.character(.) all over the place instead of doing that in C ?
## A: These must work for objects which have their own as.character(.) methods *and*
## as.character() is fast [Primitive]
strsplit <-
function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE)
.Internal(strsplit(x, as.character(split), fixed, perl, useBytes))
grep <-
function(pattern, x, ignore.case = FALSE, perl = FALSE,
value = FALSE, fixed = FALSE, useBytes = FALSE, invert = FALSE)
{
## when value = TRUE we return names
if(!is.character(x)) x <- structure(as.character(x), names=names(x))
.Internal(grep(as.character(pattern), x, ignore.case, value,
perl, fixed, useBytes, invert))
}
grepl <-
function(pattern, x, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
{
if(!is.character(x)) x <- as.character(x)
.Internal(grepl(as.character(pattern), x, ignore.case, FALSE,
perl, fixed, useBytes, FALSE))
}
sub <-
function(pattern, replacement, x, ignore.case = FALSE,
perl = FALSE, fixed = FALSE, useBytes = FALSE)
{
if (!is.character(x)) x <- as.character(x)
.Internal(sub(as.character(pattern), as.character(replacement), x,
ignore.case, perl, fixed, useBytes))
}
gsub <-
function(pattern, replacement, x, ignore.case = FALSE,
perl = FALSE, fixed = FALSE, useBytes = FALSE)
{
if (!is.character(x)) x <- as.character(x)
.Internal(gsub(as.character(pattern), as.character(replacement), x,
ignore.case, perl, fixed, useBytes))
}
regexpr <-
function(pattern, text, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
{
if (!is.character(text)) text <- as.character(text)
.Internal(regexpr(as.character(pattern), text,
ignore.case, perl, fixed, useBytes))
}
gregexpr <-
function(pattern, text, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
{
if (!is.character(text)) text <- as.character(text)
.Internal(gregexpr(as.character(pattern), text,
ignore.case, perl, fixed, useBytes))
}
grepRaw <-
function(pattern, x, offset = 1L, ignore.case = FALSE, value = FALSE,
fixed = FALSE, all = FALSE, invert = FALSE)
{
if (!is.raw(pattern)) pattern <- charToRaw(as.character(pattern))
if (!is.raw(x)) x <- charToRaw(as.character(x))
.Internal(grepRaw(pattern, x, offset, ignore.case, fixed, value, all, invert))
}
regexec <-
function(pattern, text, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
{
if (!is.character(text)) text <- as.character(text)
if(!perl || fixed)
return(.Internal(regexec(as.character(pattern), text, ignore.case, fixed,
useBytes)))
## For perl = TRUE, re-use regexpr(perl = TRUE) which always
## captures subexpressions.
match_data_from_pos_and_len <- function(pos, len) {
attr(pos, "match.length") <- len
pos
}
m <- regexpr(pattern, text,
ignore.case = ignore.case, useBytes = useBytes,
perl = TRUE)
y <- vector("list", length(text))
y[is.na(m)] <- list(match_data_from_pos_and_len(NA_integer_, NA_integer_))
ind <- !is.na(m) & (m == -1L)
if(any(ind)) {
y[ind] <- list(match_data_from_pos_and_len(-1L, -1L))
}
ind <- !is.na(m) & !ind
if(any(ind)) {
pos <- cbind(m[ind],
attr(m, "capture.start")[ind, , drop = FALSE])
len <- cbind(attr(m, "match.length")[ind],
attr(m, "capture.length")[ind, , drop = FALSE])
y[ind] <- Map(match_data_from_pos_and_len,
split(pos, row(pos)),
split(len, row(len)))
}
if(identical(attr(m, "useBytes"), TRUE))
y <- lapply(y, `attr<-`, "useBytes", TRUE)
y
}
agrep <-
function(pattern, x, max.distance = 0.1, costs = NULL,
ignore.case = FALSE, value = FALSE, fixed = TRUE,
useBytes = FALSE)
{
pattern <- as.character(pattern)
if(!is.character(x)) x <- as.character(x)
## TRE needs integer costs: coerce here for simplicity.
costs <- as.integer(.amatch_costs(costs))
bounds <- .amatch_bounds(max.distance)
.Internal(agrep(pattern, x, ignore.case, value, costs, bounds,
useBytes, fixed))
}
agrepl <-
function(pattern, x, max.distance = 0.1, costs = NULL,
ignore.case = FALSE, fixed = TRUE, useBytes = FALSE)
{
pattern <- as.character(pattern)
if(!is.character(x)) x <- as.character(x)
## TRE needs integer costs: coerce here for simplicity.
costs <- as.integer(.amatch_costs(costs))
bounds <- .amatch_bounds(max.distance)
.Internal(agrepl(pattern, x, ignore.case, FALSE, costs, bounds,
useBytes, fixed))
}
.amatch_bounds <-
function(x = 0.1)
{
## Expand max match distance argument for agrep() et al into bounds
## for the TRE regaparams struct.
## Note that TRE allows for possibly different (integer) costs for
## insertions, deletions and substitions, and allows for specifying
## separate bounds for these numbers as well as the total number of
## "errors" (transformations) and the total cost.
##
## When using unit costs (and older versions of agrep() did not
## allow otherwise), the total number of errors is the same as the
## total cost, and bounds on the total number of errors imply the
## same bounds for the individual transformation counts. This no
## longer holds when using possibly different costs.
##
## See ? agrep for details on handling the match distance argument.
##
## Older versions of agrep() expanded fractions (of the pattern
## length) in R code: but as the C code determines whether matching
## used bytes or characters, only the C code can determine the
## pattern length and hence expand fractions.
##
## Unspecified bounds are taken as NA_real_, and set to INT_MAX by
## the C code.
if(!is.list(x)) {
## Sanity checks.
if(!is.numeric(x) || (x < 0))
stop("match distance components must be non-negative")
bounds <- c(as.double(x), rep.int(NA_real_, 4L))
} else {
table <-
c("cost", "insertions", "deletions", "substitutions", "all")
## Partial matching.
pos <- pmatch(names(x), table)
if(anyNA(pos)) {
warning("unknown match distance components ignored")
x <- x[!is.na(pos)]
}
names(x) <- table[pos]
## Sanity checks.
x <- unlist(x)
if(!all(is.numeric(x)) || any(x < 0))
stop("match distance components must be non-negative")
## Defaults.
if(!is.na(x["cost"])) {
bounds <- rep.int(NA_real_, 5L)
} else {
## If 'cost' is missing: if 'all' is missing it is set to
## 0.1, and the other transformation number bounds default
## to 'all'.
if(is.na(x["all"]))
x["all"] <- 0.1
bounds <- c(NA_real_, rep.int(x["all"], 4L))
}
names(bounds) <- table
bounds[names(x)] <- x
}
bounds
}
.amatch_costs <-
function(x = NULL)
{
costs <- c(insertions = 1, deletions = 1, substitutions = 1)
if(!is.null(x)) {
x <- as.list(x)
## Partial matching.
pos <- pmatch(names(x), names(costs))
if(anyNA(pos)) {
warning("unknown cost components ignored")
x <- x[!is.na(pos)]
}
## Sanity checks.
x <- unlist(x)
if(!all(is.numeric(x)) || any(x < 0))
stop("cost components must be non-negative")
costs[pos] <- x
}
costs
}
regmatches <-
function(x, m, invert = FALSE)
{
if(length(x) != length(m))
stop(gettextf("%s and %s must have the same length",
sQuote("x"), sQuote("m")),
domain = NA)
ili <- is.list(m)
## Handle useBytes/encoding issues.
## Match positions from regexpr(), gregexpr() and regexec() are in
## characters unless 'useBytes = TRUE' was given, now recorded via
## the 'index.type' attribute (in addition to the 'useBytes' one
## being TRUE when 'useBytes = TRUE' was given *or* all character
## string involved were ASCII).
## To convince substring() and nchar() used below accordingly that
## match data positions are in bytes, we set the input encoding to
## "bytes" for the former and call the latter with 'type = "bytes"'.
itype <- "chars"
useBytes <- if(ili)
any(unlist(lapply(m, attr, "index.type")) == "bytes")
else
any(attr(m, "index.type") == "bytes")
if(useBytes) {
itype <- Encoding(x) <- "bytes"
}
## For NA matches (from matching a non-NA pattern on an NA string),
## direct matches give nothing and inverse matches give NA (as
## nothing was matched).
if(!ili && isFALSE(invert)) {
so <- m[ind <- (!is.na(m) & (m > -1L))]
eo <- so + attr(m, "match.length")[ind] - 1L
return(substring(x[ind], so, eo))
}
y <- if(is.na(invert)) {
Map(function(u, so, ml) {
if((n <- length(so)) == 1L) {
if(is.na(so) )
return(NA_character_) # Or u ...
else if(so == -1L)
return(u)
}
eo <- so + ml - 1L
if(n > 1L) {
## regexec() could give overlapping matches.
## Matches are non-overlapping iff
## eo[i] < so[i + 1], i = 1, ..., n - 1.
if(any(eo[-n] >= so[-1L]))
stop(gettextf("need non-overlapping matches for %s",
sQuote("invert = NA")),
domain = NA)
}
beg <- c(1L, c(rbind(so, eo + 1L)))
end <- c(c(rbind(so - 1L, eo)), nchar(u, itype))
substring(u, beg, end)
},
x, m,
if(ili)
lapply(m, attr, "match.length")
else
attr(m, "match.length"),
USE.NAMES = FALSE)
} else if(invert) {
Map(function(u, so, ml) {
if((n <- length(so)) == 1L) {
if(is.na(so) )
return(NA_character_) # Or u ...
else if(so == -1L)
return(u)
}
beg <- if(n > 1L) {
## See above.
eo <- so + ml - 1L
if(any(eo[-n] >= so[-1L]))
stop(gettextf("need non-overlapping matches for %s",
sQuote("invert = TRUE")),
domain = NA)
c(1L, eo + 1L)
} else {
c(1L, so + ml)
}
end <- c(so - 1L, nchar(u, itype))
substring(u, beg, end)
},
x, m,
if(ili)
lapply(m, attr, "match.length")
else
attr(m, "match.length"),
USE.NAMES = FALSE)
} else {
Map(function(u, so, ml) {
if(length(so) == 1L) {
if(is.na(so) || (so == -1L))
return(character())
}
substring(u, so, so + ml - 1L)
},
x, m,
lapply(m, attr, "match.length"),
USE.NAMES = FALSE)
}
names(y) <- names(x)
y
}
## Suppose matching partitions a string as
## n0 m1 n1 ... mk nk
## where the m and n substrings are the matched and non-matched parts,
## respectively, and n0 and/or nk can be empty.
## (regexec() can give overlapping matches, in which case extracting
## inverted matches or replacing cannot work.)
## For list match data, k can be any non-negative integer.
## Extraction and replacement straightforwardly work on the m or n
## sequences, depending on whether invert is FALSE or TRUE.
## For vector match data from regexpr(), k can be 0 or 1.
## If k = 0 (no match):
## invert
## FALSE TRUE
## extract drop n0
## replace n0 r0
## If k = 1:
## invert
## FALSE TRUE
## extract m1 c(n0, n1)
## replace n0 r1 n1 r0 m1 r1
`regmatches<-` <-
function(x, m, invert = FALSE, value)
{
if(!length(x)) return(x)
ili <- is.list(m)
if(!ili && invert && any(m == -1L)) {
## regmatches() drops empty matches for vector match data if
## invert is FALSE (see above): we need to work around this when
## replacing non-matches (PR #15723).
y <- rep_len(list(character()), length(x))
y[m > -1L] <- as.list(regmatches(x, m, FALSE))
} else {
y <- regmatches(x, m, !invert)
}
## <FIXME>
## It might be simpler to try reducing the vector case to the list
## case, transforming m and value as needed,
## </FIXME>
if(!ili && !invert) {
## For non-list m and invert = FALSE, we need a character vector
## of replacement values with length the number of matched
## elements.
value <- as.character(value)
if(anyNA(value))
stop("missing replacement values are not allowed")
## Entries for matched elements have length 2.
pos <- which(lengths(y) == 2L)
np <- length(pos)
nv <- length(value)
if(np != nv) {
if(!nv)
stop("must have replacement values for matches")
value <- rep_len(value, np)
}
y <- y[pos]
x[pos] <- paste0(sapply(y, `[`, 1L), value, sapply(y, `[`, 2L))
return(x)
}
## We need a list of character vectors without missings, which has
## the same length as x.
value <- lapply(value, as.character)
if(anyNA(value)) # {recursively!}
stop("missing replacement values are not allowed")
if(!length(value))
stop("value does not provide any replacement values")
value <- rep_len(value, length(x))
y <- if(invert) {
## Replace non-matches.
## An element of x with k matches has a corresponding y element
## of length k, and needs k + 1 replacement values.
Map(function(u, v) {
nu <- length(u)
nv <- length(v)
if(nv != (nu + 1L)) {
if(!nv)
stop("must have replacements for non-matches")
v <- rep_len(v, nu + 1L)
}
paste0(v, c(u, ""), collapse = "")
},
y, value, USE.NAMES = FALSE)
} else {
## Replace matches.
## An element of x with k matches has a corresponding y element
## of length k + 1, and needs k replacement values.
Map(function(u, v) {
nu <- length(u)
nv <- length(v)
if(nv != (nu - 1L)) {
if(!nv)
stop("must have replacements for matches")
v <- rep_len(v, nu - 1L)
}
paste0(u, c(v, ""), collapse = "")
},
y, value, USE.NAMES = FALSE)
}
y <- unlist(y)
names(y) <- names(x)
y
}
pcre_config <- function() .Internal(pcre_config())