| % File src/library/base/man/grep.Rd |
| % Part of the R package, https://www.R-project.org |
| % Copyright 1995-2020 R Core Team |
| % Distributed under GPL 2 or later |
| |
| \name{grep} |
| \title{Pattern Matching and Replacement} |
| \alias{grep} |
| \alias{grepl} |
| \alias{sub} |
| \alias{gsub} |
| \alias{regexpr} |
| \alias{gregexpr} |
| \alias{regexec} |
| \alias{gregexec} |
| \description{ |
| \code{grep}, \code{grepl}, \code{regexpr}, \code{gregexpr}, |
| \code{regexec} and \code{gregexec} search for matches to argument |
| \code{pattern} within each element of a character vector: they differ in |
| the format of and amount of detail in the results. |
| |
| \code{sub} and \code{gsub} perform replacement of the first and all |
| matches respectively. |
| } |
| \usage{ |
| grep(pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE, |
| fixed = FALSE, useBytes = FALSE, invert = FALSE) |
| |
| grepl(pattern, x, ignore.case = FALSE, perl = FALSE, |
| fixed = FALSE, useBytes = FALSE) |
| |
| sub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE, |
| fixed = FALSE, useBytes = FALSE) |
| |
| gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE, |
| fixed = FALSE, useBytes = FALSE) |
| |
| regexpr(pattern, text, ignore.case = FALSE, perl = FALSE, |
| fixed = FALSE, useBytes = FALSE) |
| |
| gregexpr(pattern, text, ignore.case = FALSE, perl = FALSE, |
| fixed = FALSE, useBytes = FALSE) |
| |
| regexec(pattern, text, ignore.case = FALSE, perl = FALSE, |
| fixed = FALSE, useBytes = FALSE) |
| |
| gregexec(pattern, text, ignore.case = FALSE, perl = FALSE, |
| fixed = FALSE, useBytes = FALSE) |
| } |
| \arguments{ |
| \item{pattern}{character string containing a \link{regular expression} |
| (or character string for \code{fixed = TRUE}) to be matched |
| in the given character vector. Coerced by |
| \code{\link{as.character}} to a character string if possible. If a |
| character vector of length 2 or more is supplied, the first element |
| is used with a warning. Missing values are allowed except for |
| \code{regexpr}, \code{gregexpr}, \code{regexec} and \code{gregexec}.} |
| \item{x, text}{a character vector where matches are sought, or an |
| object which can be coerced by \code{as.character} to a character |
| vector. \link{Long vectors} are supported.} |
| \item{ignore.case}{if \code{FALSE}, the pattern matching is \emph{case |
| sensitive} and if \code{TRUE}, case is ignored during matching.} |
| \item{perl}{logical. Should Perl-compatible regexps be used?} |
| \item{value}{if \code{FALSE}, a vector containing the (\code{integer}) |
| indices of the matches determined by \code{grep} is returned, and if |
| \code{TRUE}, a vector containing the matching elements themselves is |
| returned.} |
| \item{fixed}{logical. If \code{TRUE}, \code{pattern} is a string to be |
| matched as is. Overrides all conflicting arguments.} |
| \item{useBytes}{logical. If \code{TRUE} the matching is done |
| byte-by-byte rather than character-by-character. See |
| \sQuote{Details}.} |
| \item{invert}{logical. If \code{TRUE} return indices or values for |
| elements that do \emph{not} match.} |
| \item{replacement}{a replacement for matched pattern in \code{sub} and |
| \code{gsub}. Coerced to character if possible. For |
| \code{fixed = FALSE} this can include backreferences \code{"\\1"} to |
| \code{"\\9"} to parenthesized subexpressions of \code{pattern}. For |
| \code{perl = TRUE} only, it can also contain \code{"\\U"} or |
| \code{"\\L"} to convert the rest of the replacement to upper or |
| lower case and \code{"\\E"} to end case conversion. If a |
| character vector of length 2 or more is supplied, the first element |
| is used with a warning. If \code{NA}, all elements in the result |
| corresponding to matches will be set to \code{NA}. |
| } |
| } |
| \details{ |
| Arguments which should be character strings or character vectors are |
| coerced to character if possible. |
| |
| Each of these functions operates in one of three modes: |
| \enumerate{ |
| \item \code{fixed = TRUE}: use exact matching. |
| \item \code{perl = TRUE}: use Perl-style regular expressions. |
| \item \code{fixed = FALSE, perl = FALSE}: use POSIX 1003.2 |
| extended regular expressions (the default). |
| } |
| See the help pages on \link{regular expression} for details of the |
| different types of regular expressions. |
| |
| The two \code{*sub} functions differ only in that \code{sub} replaces |
| only the first occurrence of a \code{pattern} whereas \code{gsub} |
| replaces all occurrences. If \code{replacement} contains |
| backreferences which are not defined in \code{pattern} the result is |
| undefined (but most often the backreference is taken to be \code{""}). |
| |
| For \code{regexpr}, \code{gregexpr}, \code{regexec} and \code{gregexec} |
| it is an error for \code{pattern} to be \code{NA}, otherwise \code{NA} |
| is permitted and gives an \code{NA} match. |
| |
| Both \code{grep} and \code{grepl} take missing values in \code{x} as |
| not matching a non-missing \code{pattern}. |
| |
| The main effect of \code{useBytes = TRUE} is to avoid errors/warnings |
| about invalid inputs and spurious matches in multibyte locales, but |
| for \code{regexpr} it changes the interpretation of the output. It |
| inhibits the conversion of inputs with marked encodings, and is forced |
| if any input is found which is marked as \code{"bytes"} (see |
| \code{\link{Encoding}}). |
| |
| Caseless matching does not make much sense for bytes in a multibyte |
| locale, and you should expect it only to work for ASCII characters if |
| \code{useBytes = TRUE}. |
| |
| \code{regexpr} and \code{gregexpr} with \code{perl = TRUE} allow |
| Python-style named captures, but not for \emph{long vector} inputs. |
| |
| Invalid inputs in the current locale are warned about up to 5 times. |
| |
| Caseless matching with \code{perl = TRUE} for non-ASCII characters |
| depends on the PCRE library being compiled with \sQuote{Unicode |
| property support}, which PCRE2 is by default. |
| } |
| |
| \value{ |
| \code{grep(value = FALSE)} returns a vector of the indices |
| of the elements of \code{x} that yielded a match (or not, for |
| \code{invert = TRUE}). This will be an integer vector unless the input |
| is a \emph{\link{long vector}}, when it will be a double vector. |
| |
| \code{grep(value = TRUE)} returns a character vector containing the |
| selected elements of \code{x} (after coercion, preserving names but no |
| other attributes). |
| |
| \code{grepl} returns a logical vector (match or not for each element of |
| \code{x}). |
| |
| \code{sub} and \code{gsub} return a character vector of the same |
| length and with the same attributes as \code{x} (after possible |
| coercion to character). Elements of character vectors \code{x} which |
| are not substituted will be returned unchanged (including any declared |
| encoding). If \code{useBytes = FALSE} a non-ASCII substituted result |
| will often be in UTF-8 with a marked encoding (e.g., if there is a |
| UTF-8 input, and in a multibyte locale unless \code{fixed = TRUE}). |
| Such strings can be re-encoded by \code{\link{enc2native}}. |
| |
| \code{regexpr} returns an integer vector of the same length as |
| \code{text} giving the starting position of the first match or |
| \eqn{-1} if there is none, with attribute \code{"match.length"}, an |
| integer vector giving the length of the matched text (or \eqn{-1} for |
| no match). The match positions and lengths are in characters unless |
| \code{useBytes = TRUE} is used, when they are in bytes (as they are |
| for ASCII-only matching: in either case an attribute |
| \code{useBytes} with value \code{TRUE} is set on the result). If |
| named capture is used there are further attributes |
| \code{"capture.start"}, \code{"capture.length"} and |
| \code{"capture.names"}. |
| |
| \code{gregexpr} returns a list of the same length as \code{text} each |
| element of which is of the same form as the return value for |
| \code{regexpr}, except that the starting positions of every (disjoint) |
| match are given. |
| |
| \code{regexec} returns a list of the same length as \code{text} each |
| element of which is either \eqn{-1} if there is no match, or a |
| sequence of integers with the starting positions of the match and all |
| substrings corresponding to parenthesized subexpressions of |
| \code{pattern}, with attribute \code{"match.length"} a vector |
| giving the lengths of the matches (or \eqn{-1} for no match). The |
| interpretation of positions and length and the attributes follows |
| \code{regexpr}. |
| |
| \code{gregexec} returns the same as \code{regexec}, except that to |
| accommodate multiple matches per element of \code{text}, the integer |
| sequences for each match are made into columns of a matrix, with one |
| matrix per element of \code{text} with matches. |
| |
| Where matching failed because of resource limits (especially for |
| \code{perl = TRUE}) this is regarded as a non-match, usually with a |
| warning. |
| } |
| |
| \section{Warning}{ |
| The POSIX 1003.2 mode of \code{gsub} and \code{gregexpr} does not |
| work correctly with repeated word-boundaries (e.g., |
| \code{pattern = "\\b"}). |
| Use \code{perl = TRUE} for such matches (but that may not |
| work as expected with non-ASCII inputs, as the meaning of |
| \sQuote{word} is system-dependent). |
| } |
| |
| \section{Performance considerations}{ |
| If you are doing a lot of regular expression matching, including on |
| very long strings, you will want to consider the options used. |
| Generally \code{perl = TRUE} will be faster than the default regular |
| expression engine, and \code{fixed = TRUE} faster still (especially |
| when each pattern is matched only a few times). |
| |
| If you are working in a single-byte locale and have marked UTF-8 |
| strings that are representable in that locale, convert them first as |
| just one UTF-8 string will force all the matching to be done in |
| Unicode, which attracts a penalty of around \eqn{3\times{}}{3x} for |
| the default POSIX 1003.2 mode. |
| |
| If you can make use of \code{useBytes = TRUE}, the strings will not be |
| checked before matching, and the actual matching will be faster. |
| Often byte-based matching suffices in a UTF-8 locale since byte |
| patterns of one character never match part of another. Character ranges |
| may produce unexpected results. |
| |
| PCRE-based matching by default used to put additional effort into |
| \sQuote{studying} the compiled pattern when \code{x}/\code{text} has |
| length 10 or more. That study may use the PCRE JIT compiler on |
| platforms where it is available (see \code{\link{pcre_config}}). As |
| from PCRE2 (PCRE version >= 10.00 as reported by |
| \code{\link{extSoftVersion}}), there is no study phase, but the |
| patterns are optimized automatically when possible, and PCRE JIT is |
| used when enabled. The details are controlled by |
| \code{\link{options}} \code{PCRE_study} and \code{PCRE_use_JIT}. |
| (Some timing comparisons can be seen by running file |
| \file{tests/PCRE.R} in the \R sources (and perhaps installed).) |
| People working with PCRE and very long strings can adjust the maximum |
| size of the JIT stack by setting environment variable |
| \env{R_PCRE_JIT_STACK_MAXSIZE} before JIT is used to a value between |
| \code{1} and \code{1000} in MB: the default is \code{64}. When JIT is |
| not used with PCRE version < 10.30 (that is with PCRE1 and old |
| versions of PCRE2), it might also be wise to set the option |
| \code{PCRE_limit_recursion}. |
| } |
| |
| \source{ |
| The C code for POSIX-style regular expression matching has changed |
| over the years. As from \R 2.10.0 (Oct 2009) the TRE library of Ville |
| Laurikari (\url{https://github.com/laurikari/tre}) is used. The POSIX |
| standard does give some room for interpretation, especially in the |
| handling of invalid regular expressions and the collation of character |
| ranges, so the results will have changed slightly over the years. |
| |
| For Perl-style matching PCRE2 or PCRE (\url{https://www.pcre.org}) is |
| used: again the results may depend (slightly) on the version of PCRE |
| in use. |
| } |
| |
| \note{ |
| Aspects will be platform-dependent as well as locale-dependent: for |
| example the implementation of character classes (except |
| \code{[:digit:]} and \code{[:xdigit:]}). One can expect results to be |
| consistent for ASCII inputs and when working in UTF-8 mode (when most |
| platforms will use Unicode character tables, although those are |
| updated frequently and subject to some degree of interpretation -- is |
| a circled capital letter alphabetic or a symbol?). However, results |
| in 8-bit encodings can differ considerably between platforms, modes |
| and from the UTF-8 versions. |
| } |
| |
| \references{ |
| Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) |
| \emph{The New S Language}. |
| Wadsworth & Brooks/Cole (\code{grep}) |
| } |
| |
| % the `aka' below is for ESS |
| \seealso{ |
| \link{regular expression} (aka \code{\link{regexp}}) for the details |
| of the pattern specification. |
| |
| \code{\link{regmatches}} for extracting matched substrings based on |
| the results of \code{regexpr}, \code{gregexpr}, \code{regexec} and |
| \code{gregexec}. |
| |
| \code{\link{glob2rx}} to turn wildcard matches into regular expressions. |
| |
| \code{\link{agrep}} for approximate matching. |
| |
| \code{\link{charmatch}}, \code{\link{pmatch}} for partial matching, |
| \code{\link{match}} for matching to whole strings, |
| \code{\link{startsWith}} for matching of initial parts of strings. |
| |
| \code{\link{tolower}}, \code{\link{toupper}} and \code{\link{chartr}} |
| for character translations. |
| |
| \code{\link{apropos}} uses regexps and has more examples. |
| |
| \code{\link{grepRaw}} for matching raw vectors. |
| |
| Options \code{PCRE_limit_recursion}, \code{PCRE_study} and |
| \code{PCRE_use_JIT}. |
| |
| \code{\link{extSoftVersion}} for the versions of regex and PCRE |
| libraries in use, \code{\link{pcre_config}} for more details for |
| PCRE. |
| } |
| \examples{ |
| grep("[a-z]", letters) |
| |
| txt <- c("arm","foot","lefroo", "bafoobar") |
| if(length(i <- grep("foo", txt))) |
| cat("'foo' appears at least once in\n\t", txt, "\n") |
| i # 2 and 4 |
| txt[i] |
| |
| ## Double all 'a' or 'b's; "\\" must be escaped, i.e., 'doubled' |
| gsub("([ab])", "\\\\1_\\\\1_", "abc and ABC") |
| |
| txt <- c("The", "licenses", "for", "most", "software", "are", |
| "designed", "to", "take", "away", "your", "freedom", |
| "to", "share", "and", "change", "it.", |
| "", "By", "contrast,", "the", "GNU", "General", "Public", "License", |
| "is", "intended", "to", "guarantee", "your", "freedom", "to", |
| "share", "and", "change", "free", "software", "--", |
| "to", "make", "sure", "the", "software", "is", |
| "free", "for", "all", "its", "users") |
| ( i <- grep("[gu]", txt) ) # indices |
| stopifnot( txt[i] == grep("[gu]", txt, value = TRUE) ) |
| |
| ## Note that for some implementations character ranges are |
| ## locale-dependent (but not currently). Then [b-e] in locales such as |
| ## en_US may include B as the collation order is aAbBcCdDe ... |
| (ot <- sub("[b-e]",".", txt)) |
| txt[ot != gsub("[b-e]",".", txt)]#- gsub does "global" substitution |
| ## In caseless matching, ranges include both cases: |
| a <- grep("[b-e]", txt, value = TRUE) |
| b <- grep("[b-e]", txt, ignore.case = TRUE, value = TRUE) |
| setdiff(b, a) |
| |
| txt[gsub("g","#", txt) != |
| gsub("g","#", txt, ignore.case = TRUE)] # the "G" words |
| |
| regexpr("en", txt) |
| |
| gregexpr("e", txt) |
| |
| ## Using grepl() for filtering |
| ## Find functions with argument names matching "warn": |
| findArgs <- function(env, pattern) { |
| nms <- ls(envir = as.environment(env)) |
| nms <- nms[is.na(match(nms, c("F","T")))] # <-- work around "checking hack" |
| aa <- sapply(nms, function(.) { o <- get(.) |
| if(is.function(o)) names(formals(o)) }) |
| iw <- sapply(aa, function(a) any(grepl(pattern, a, ignore.case=TRUE))) |
| aa[iw] |
| } |
| findArgs("package:base", "warn") |
| |
| ## trim trailing white space |
| str <- "Now is the time " |
| sub(" +$", "", str) ## spaces only |
| ## what is considered 'white space' depends on the locale. |
| sub("[[:space:]]+$", "", str) ## white space, POSIX-style |
| ## what PCRE considered white space changed in version 8.34: see ?regex |
| sub("\\\\s+$", "", str, perl = TRUE) ## PCRE-style white space |
| |
| ## capitalizing |
| txt <- "a test of capitalizing" |
| gsub("(\\\\w)(\\\\w*)", "\\\\U\\\\1\\\\L\\\\2", txt, perl=TRUE) |
| gsub("\\\\b(\\\\w)", "\\\\U\\\\1", txt, perl=TRUE) |
| |
| txt2 <- "useRs may fly into JFK or laGuardia" |
| gsub("(\\\\w)(\\\\w*)(\\\\w)", "\\\\U\\\\1\\\\E\\\\2\\\\U\\\\3", txt2, perl=TRUE) |
| sub("(\\\\w)(\\\\w*)(\\\\w)", "\\\\U\\\\1\\\\E\\\\2\\\\U\\\\3", txt2, perl=TRUE) |
| |
| ## named capture |
| notables <- c(" Ben Franklin and Jefferson Davis", |
| "\tMillard Fillmore") |
| # name groups 'first' and 'last' |
| name.rex <- "(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)" |
| (parsed <- regexpr(name.rex, notables, perl = TRUE)) |
| gregexpr(name.rex, notables, perl = TRUE)[[2]] |
| parse.one <- function(res, result) { |
| m <- do.call(rbind, lapply(seq_along(res), function(i) { |
| if(result[i] == -1) return("") |
| st <- attr(result, "capture.start")[i, ] |
| substring(res[i], st, st + attr(result, "capture.length")[i, ] - 1) |
| })) |
| colnames(m) <- attr(result, "capture.names") |
| m |
| } |
| parse.one(notables, parsed) |
| |
| ## Decompose a URL into its components. |
| ## Example by LT (http://www.cs.uiowa.edu/~luke/R/regexp.html). |
| x <- "http://stat.umn.edu:80/xyz" |
| m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x) |
| m |
| regmatches(x, m) |
| ## Element 3 is the protocol, 4 is the host, 6 is the port, and 7 |
| ## is the path. We can use this to make a function for extracting the |
| ## parts of a URL: |
| URL_parts <- function(x) { |
| m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x) |
| parts <- do.call(rbind, |
| lapply(regmatches(x, m), `[`, c(3L, 4L, 6L, 7L))) |
| colnames(parts) <- c("protocol","host","port","path") |
| parts |
| } |
| URL_parts(x) |
| |
| ## gregexec() may match multiple times within a single string. |
| pattern <- "([[:alpha:]]+)([[:digit:]]+)" |
| s <- "Test: A1 BC23 DEF456" |
| m <- gregexec(pattern, s) |
| m |
| regmatches(s, m) |
| |
| ## Before gregexec() was implemented, one could emulate it by running |
| ## regexec() on the regmatches obtained via gregexpr(). E.g.: |
| lapply(regmatches(s, gregexpr(pattern, s)), |
| function(e) regmatches(e, regexec(pattern, e))) |
| } |
| \keyword{character} |
| \keyword{utilities} |