% blob: cfde96a89f56eed8ec4774b70a235f29705255b5 [file] [log] [blame]
% File src/library/base/man/grep.Rd
% Part of the R package, https://www.R-project.org
% Copyright 1995-2019 R Core Team
% Distributed under GPL 2 or later
\name{grep}
\title{Pattern Matching and Replacement}
\alias{grep}
\alias{grepl}
\alias{sub}
\alias{gsub}
\alias{regexpr}
\alias{gregexpr}
\alias{regexec}
\description{
\code{grep}, \code{grepl}, \code{regexpr}, \code{gregexpr} and
\code{regexec} search for matches to argument \code{pattern} within
each element of a character vector: they differ in the format of and
amount of detail in the results.
\code{sub} and \code{gsub} perform replacement of the first and all
matches respectively.
}
\usage{
grep(pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE,
fixed = FALSE, useBytes = FALSE, invert = FALSE)
grepl(pattern, x, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
sub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
regexpr(pattern, text, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
gregexpr(pattern, text, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
regexec(pattern, text, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
}
\arguments{
\item{pattern}{character string containing a \link{regular expression}
(or character string for \code{fixed = TRUE}) to be matched
in the given character vector. Coerced by
\code{\link{as.character}} to a character string if possible. If a
character vector of length 2 or more is supplied, the first element
is used with a warning. Missing values are allowed except for
\code{regexpr}, \code{gregexpr} and \code{regexec}.}
\item{x, text}{a character vector where matches are sought, or an
object which can be coerced by \code{as.character} to a character
vector. \link{Long vectors} are supported.}
\item{ignore.case}{if \code{FALSE}, the pattern matching is \emph{case
sensitive} and if \code{TRUE}, case is ignored during matching.}
\item{perl}{logical. Should Perl-compatible regexps be used?}
\item{value}{if \code{FALSE}, a vector containing the (\code{integer})
indices of the matches determined by \code{grep} is returned, and if
\code{TRUE}, a vector containing the matching elements themselves is
returned.}
\item{fixed}{logical. If \code{TRUE}, \code{pattern} is a string to be
matched as is. Overrides all conflicting arguments.}
\item{useBytes}{logical. If \code{TRUE} the matching is done
byte-by-byte rather than character-by-character. See
\sQuote{Details}.}
\item{invert}{logical. If \code{TRUE} return indices or values for
elements that do \emph{not} match.}
\item{replacement}{a replacement for matched pattern in \code{sub} and
\code{gsub}. Coerced to character if possible. For
\code{fixed = FALSE} this can include backreferences \code{"\\1"} to
\code{"\\9"} to parenthesized subexpressions of \code{pattern}. For
\code{perl = TRUE} only, it can also contain \code{"\\U"} or
\code{"\\L"} to convert the rest of the replacement to upper or
lower case and \code{"\\E"} to end case conversion. If a
character vector of length 2 or more is supplied, the first element
is used with a warning. If \code{NA}, all elements in the result
corresponding to matches will be set to \code{NA}.
}
}
\details{
Arguments which should be character strings or character vectors are
coerced to character if possible.
Each of these functions operates in one of three modes:
\enumerate{
\item \code{fixed = TRUE}: use exact matching.
\item \code{perl = TRUE}: use Perl-style regular expressions.
\item \code{fixed = FALSE, perl = FALSE}: use POSIX 1003.2
extended regular expressions (the default).
}
See the help pages on \link{regular expression} for details of the
different types of regular expressions.
The two \code{*sub} functions differ only in that \code{sub} replaces
only the first occurrence of a \code{pattern} whereas \code{gsub}
replaces all occurrences. If \code{replacement} contains
backreferences which are not defined in \code{pattern} the result is
undefined (but most often the backreference is taken to be \code{""}).
For \code{regexpr}, \code{gregexpr} and \code{regexec} it is an error
for \code{pattern} to be \code{NA}, otherwise \code{NA} is permitted
and gives an \code{NA} match.
The main effect of \code{useBytes = TRUE} is to avoid errors/warnings
about invalid inputs and spurious matches in multibyte locales, but for
\code{regexpr} it changes the interpretation of the output.
It inhibits the conversion of inputs with marked encodings, and is
forced if any input is found which is marked as \code{"bytes"}
(see \code{\link{Encoding}}).
Caseless matching does not make much sense for bytes in a multibyte
locale, and you should expect it only to work for ASCII characters if
\code{useBytes = TRUE}.
\code{regexpr} and \code{gregexpr} with \code{perl = TRUE} allow
Python-style named captures, but not for \emph{long vector} inputs.
Invalid inputs in the current locale are warned about up to 5 times.
Caseless matching with \code{perl = TRUE} for non-ASCII characters
depends on the PCRE library being compiled with \sQuote{Unicode
property support}.
}
\value{
\code{grep(value = FALSE)} returns a vector of the indices
of the elements of \code{x} that yielded a match (or not, for
\code{invert = TRUE}). This will be an integer vector unless the input
is a \emph{long vector}, when it will be a double vector.
\code{grep(value = TRUE)} returns a character vector containing the
selected elements of \code{x} (after coercion, preserving names but no
other attributes).
\code{grepl} returns a logical vector (match or not for each element of
\code{x}).
\code{sub} and \code{gsub} return a character vector of the same
length and with the same attributes as \code{x} (after possible
coercion to character). Elements of character vectors \code{x} which
are not substituted will be returned unchanged (including any declared
encoding). If \code{useBytes = FALSE} a non-ASCII substituted result
will often be in UTF-8 with a marked encoding (e.g., if there is a
UTF-8 input, and in a multibyte locale unless \code{fixed = TRUE}).
Such strings can be re-encoded by \code{\link{enc2native}}.
\code{regexpr} returns an integer vector of the same length as
\code{text} giving the starting position of the first match or
\eqn{-1} if there is none, with attribute \code{"match.length"}, an
integer vector giving the length of the matched text (or \eqn{-1} for
no match). The match positions and lengths are in characters unless
\code{useBytes = TRUE} is used, when they are in bytes (as they are
for ASCII-only matching: in either case an attribute
\code{useBytes} with value \code{TRUE} is set on the result). If
named capture is used there are further attributes
\code{"capture.start"}, \code{"capture.length"} and
\code{"capture.names"}.
\code{gregexpr} returns a list of the same length as \code{text} each
element of which is of the same form as the return value for
\code{regexpr}, except that the starting positions of every (disjoint)
match are given.
\code{regexec} returns a list of the same length as \code{text} each
element of which is either \eqn{-1} if there is no match, or a
sequence of integers with the starting positions of the match and all
substrings corresponding to parenthesized subexpressions of
\code{pattern}, with attribute \code{"match.length"} a vector
giving the lengths of the matches (or \eqn{-1} for no match). The
interpretation of positions and length and the attributes follows
\code{regexpr}.
Where matching failed because of resource limits (especially for PCRE)
this is regarded as a non-match, usually with a warning.
}
\section{Warning}{
The POSIX 1003.2 mode of \code{gsub} and \code{gregexpr} does not
work correctly with repeated word-boundaries (e.g.,
\code{pattern = "\\b"}).
Use \code{perl = TRUE} for such matches (but that may not
work as expected with non-ASCII inputs, as the meaning of
\sQuote{word} is system-dependent).
}
\section{Performance considerations}{
If you are doing a lot of regular expression matching, including on
very long strings, you will want to consider the options used.
Generally PCRE will be faster than the default regular expression
engine, and \code{fixed = TRUE} faster still (especially when each
pattern is matched only a few times).
If you are working in a single-byte locale and have marked UTF-8
strings that are representable in that locale, convert them first as
just one UTF-8 string will force all the matching to be done in
Unicode, which attracts a penalty of around \eqn{3\times{}}{3x} for
the default POSIX 1003.2 mode.
If you can make use of \code{useBytes = TRUE}, the strings will not be
checked before matching, and the actual matching will be faster.
Often byte-based matching suffices in a UTF-8 locale since byte
patterns of one character never match part of another.
PCRE-based matching by default puts additional effort into
\sQuote{studying} the compiled pattern when \code{x}/\code{text} has
length at least 10. As from \R 3.4.0 that study may use the PCRE JIT
compiler on platforms where it is available (see
\code{\link{pcre_config}}). The details are controlled by
\code{\link{options}} \code{PCRE_study} and \code{PCRE_use_JIT}.
(Some timing comparisons can be seen by running file
\file{tests/PCRE.R} in the \R sources (and perhaps installed).)
People working with PCRE and very long strings can adjust the maximum
size of the JIT stack by setting environment variable
\env{R_PCRE_JIT_STACK_MAXSIZE} before JIT is used to a value between
\code{1} and \code{1000} in MB: the default is \code{64}. (Then it
would usually be wise to set the \link{option}
\code{PCRE_limit_recursion}.)
}
\source{
The C code for POSIX-style regular expression matching has changed
over the years. As from \R 2.10.0 (Oct 2009) the TRE library of Ville
Laurikari (\url{http://laurikari.net/tre/}) is used. The POSIX
standard does give some room for interpretation, especially in the
handling of invalid regular expressions and the collation of character
ranges, so the results will have changed slightly over the years.
For Perl-style matching PCRE (\url{http://www.pcre.org}) is used:
again the results may depend (slightly) on the version of PCRE in use.
}
\references{
Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988)
\emph{The New S Language}.
Wadsworth & Brooks/Cole (\code{grep})
}
% the `aka' below is for ESS
\seealso{
\link{regular expression} (aka \code{\link{regexp}}) for the details
of the pattern specification.
\code{\link{regmatches}} for extracting matched substrings based on
the results of \code{regexpr}, \code{gregexpr} and \code{regexec}.
\code{\link{glob2rx}} to turn wildcard matches into regular expressions.
\code{\link{agrep}} for approximate matching.
\code{\link{charmatch}}, \code{\link{pmatch}} for partial matching,
\code{\link{match}} for matching to whole strings,
\code{\link{startsWith}} for matching of initial parts of strings.
\code{\link{tolower}}, \code{\link{toupper}} and \code{\link{chartr}}
for character translations.
\code{\link{apropos}} uses regexps and has more examples.
\code{\link{grepRaw}} for matching raw vectors.
Options \code{PCRE_limit_recursion}, \code{PCRE_study} and
\code{PCRE_use_JIT}.
\code{\link{extSoftVersion}} for the versions of regex and PCRE
libraries in use, \code{\link{pcre_config}} for more details for
PCRE.
}
\examples{
## Indices of the elements of 'letters' that match the class [a-z]
grep("[a-z]", letters)
txt <- c("arm","foot","lefroo", "bafoobar")
## grep() returns a vector of indices; a non-zero length means that
## at least one element matched
if(length(i <- grep("foo", txt)))
cat("'foo' appears at least once in\n\t", txt, "\n")
i # 2 and 4
txt[i] # the matching elements themselves
## Double all 'a' or 'b's; "\\" must be escaped, i.e., 'doubled'
gsub("([ab])", "\\\\1_\\\\1_", "abc and ABC")
## A longer character vector, one word per element, used below
txt <- c("The", "licenses", "for", "most", "software", "are",
"designed", "to", "take", "away", "your", "freedom",
"to", "share", "and", "change", "it.",
"", "By", "contrast,", "the", "GNU", "General", "Public", "License",
"is", "intended", "to", "guarantee", "your", "freedom", "to",
"share", "and", "change", "free", "software", "--",
"to", "make", "sure", "the", "software", "is",
"free", "for", "all", "its", "users")
( i <- grep("[gu]", txt) ) # indices
## value = TRUE returns the matching elements instead of their indices
stopifnot( txt[i] == grep("[gu]", txt, value = TRUE) )
## sub() replaces only the first match in each element.
## Note that in locales such as en_US this includes B as the
## collation order is aAbBcCdDe ...
(ot <- sub("[b-e]",".", txt))
## Elements where replacing all matches differs from replacing the first
txt[ot != gsub("[b-e]",".", txt)]#- gsub does "global" substitution
## Elements for which ignoring case changes the substitution result
txt[gsub("g","#", txt) !=
gsub("g","#", txt, ignore.case = TRUE)] # the "G" words
## Start of the first match in each element, or -1 if there is none
regexpr("en", txt)
## A list giving the starts of every match in each element
gregexpr("e", txt)
## Using grepl() for filtering
## Find functions with argument names matching "warn":
##
## findArgs(env, pattern) returns a named list with one element for each
## function in 'env' that has at least one formal argument name matching
## 'pattern' (case is ignored); each element holds all the argument
## names of that function.  'env' is anything as.environment() accepts.
findArgs <- function(env, pattern) {
  e <- as.environment(env)
  ## T and F are skipped: a work around for the "checking hack"
  nms <- setdiff(ls(envir = e), c("F", "T"))
  ## Look each name up in 'e' itself, so that an object of the same name
  ## elsewhere on the search path cannot be picked up by mistake.
  ## lapply() always gives a list, whatever the lengths of the results.
  aa <- lapply(nms, function(nm) {
    o <- get(nm, envir = e, inherits = FALSE)
    if(is.function(o)) names(formals(o))
  })
  names(aa) <- nms
  ## vapply() gives logical(0) for an empty 'aa', which keeps the
  ## subsetting below valid
  iw <- vapply(aa, function(a) any(grepl(pattern, a, ignore.case = TRUE)),
               logical(1))
  aa[iw]
}
## Functions in package base with an argument name matching "warn"
findArgs("package:base", "warn")
## trim trailing white space
str <- "Now is the time "
sub(" +$", "", str) ## spaces only
## what is considered 'white space' depends on the locale.
sub("[[:space:]]+$", "", str) ## white space, POSIX-style
## what PCRE considered white space changed in version 8.34: see ?regex
sub("\\\\s+$", "", str, perl = TRUE) ## PCRE-style white space
## capitalizing
txt <- "a test of capitalizing"
## Upper-case the first letter of each word and lower-case the rest
gsub("(\\\\w)(\\\\w*)", "\\\\U\\\\1\\\\L\\\\2", txt, perl=TRUE)
## Upper-case only the letter that follows a word boundary
gsub("\\\\b(\\\\w)", "\\\\U\\\\1", txt, perl=TRUE)
txt2 <- "useRs may fly into JFK or laGuardia"
## Upper-case the first and the last letter of each word; case
## conversion is ended in between so the middle is left unchanged
gsub("(\\\\w)(\\\\w*)(\\\\w)", "\\\\U\\\\1\\\\E\\\\2\\\\U\\\\3", txt2, perl=TRUE)
## The same replacement, applied to the first match only
sub("(\\\\w)(\\\\w*)(\\\\w)", "\\\\U\\\\1\\\\E\\\\2\\\\U\\\\3", txt2, perl=TRUE)
## named capture
notables <- c(" Ben Franklin and Jefferson Davis",
"\tMillard Fillmore")
# name groups 'first' and 'last'
name.rex <- "(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)"
## First match in each element; the capture.* attributes describe the
## named groups
(parsed <- regexpr(name.rex, notables, perl = TRUE))
## Every match in the second element
gregexpr(name.rex, notables, perl = TRUE)[[2]]
## parse.one(res, result): extract the captured substrings of a match.
## 'res' is the character vector that was searched and 'result' is the
## value regexpr() returned for it with perl = TRUE, carrying the
## "capture.start", "capture.length" and "capture.names" attributes.
## Returns a character matrix with one row per element of 'res' and one
## column per capture group: "" where there was no match, NA where the
## match itself is NA.
parse.one <- function(res, result) {
  cap_names  <- attr(result, "capture.names")
  cap_start  <- attr(result, "capture.start")
  cap_length <- attr(result, "capture.length")
  n_cap <- length(cap_names)
  one_row <- function(i) {
    ## An NA match cannot be compared with -1 inside if()
    if(is.na(result[i])) return(rep(NA_character_, n_cap))
    ## Fill the whole row, so that the matrix keeps one column per group
    ## even when no element matched at all
    if(result[i] == -1) return(rep("", n_cap))
    st <- cap_start[i, ]
    substring(res[i], st, st + cap_length[i, ] - 1)
  }
  rows <- lapply(seq_along(res), one_row)
  ## rbind() of an empty list is NULL, so build the 0-row matrix directly
  m <- if(length(rows)) do.call(rbind, rows)
       else matrix(character(), 0L, n_cap)
  colnames(m) <- cap_names
  m
}
## Matrix of the named captures, one row per element of 'notables'
parse.one(notables, parsed)
## Decompose a URL into its components.
## Example by LT (http://www.cs.uiowa.edu/~luke/R/regexp.html).
x <- "http://stat.umn.edu:80/xyz"
m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)
## Starting positions of the match and of all substrings corresponding
## to parenthesized subexpressions, with their lengths as an attribute
m
## The matched substrings themselves
regmatches(x, m)
## In the regmatches() result, element 3 holds the protocol, element 4
## the host, element 6 the port and element 7 the path.  That is enough
## to write a function which splits URLs into these parts, returning a
## character matrix with one row per element of 'x':
URL_parts <- function(x) {
  url_rex <- "^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)"
  ## positions of the wanted pieces within each regmatches() element
  wanted <- c(3L, 4L, 6L, 7L)
  pieces <- regmatches(x, regexec(url_rex, x))
  parts <- do.call(rbind, lapply(pieces, function(p) p[wanted]))
  colnames(parts) <- c("protocol","host","port","path")
  parts
}
## Character matrix with columns protocol, host, port and path
URL_parts(x)
## There is no gregexec() yet, but one can emulate it by running
## regexec() on the regmatches obtained via gregexpr(). E.g.:
pattern <- "([[:alpha:]]+)([[:digit:]]+)"
s <- "Test: A1 BC23 DEF456"
## gregexpr() finds every match in 's'; regexec() is then run on each
## matched substring to recover its parenthesized sub-matches
lapply(regmatches(s, gregexpr(pattern, s)),
function(e) regmatches(e, regexec(pattern, e)))
}
\keyword{character}
\keyword{utilities}