blob: 805c762867778b71f70a648532242fd1d4aec33c [file] [log] [blame]
% File src/library/base/man/regmatches.Rd
% Part of the R package, https://www.R-project.org
% Copyright 1995-2014 R Core Team
% Distributed under GPL 2 or later
\name{regmatches}
\alias{regmatches}
\alias{regmatches<-}
\title{Extract or Replace Matched Substrings}
\description{
Extract or replace matched substrings from match data obtained by
\code{\link{regexpr}}, \code{\link{gregexpr}},
\code{\link{regexec}} or \code{\link{gregexec}}.
}
\usage{
regmatches(x, m, invert = FALSE)
regmatches(x, m, invert = FALSE) <- value
}
\arguments{
\item{x}{a character vector}
\item{m}{an object with match data}
\item{invert}{a logical: if \code{TRUE}, extract or replace the
non-matched substrings; if \code{NA}, extract both non-matched and
matched substrings.}
\item{value}{an object with suitable replacement values for the
matched or non-matched substrings (see \code{Details}).}
}
\details{
If \code{invert} is \code{FALSE} (default), \code{regmatches} extracts
the matched substrings as specified by the match data. For vector
match data (as obtained from \code{\link{regexpr}}), empty matches are
dropped; for list match data, empty matches give empty components
(zero-length character vectors).
If \code{invert} is \code{TRUE}, \code{regmatches} extracts the
non-matched substrings, i.e., the strings are split according to the
matches similar to \code{\link{strsplit}} (for vector match data, at
most a single split is performed).
If \code{invert} is \code{NA}, \code{regmatches} extracts both
non-matched and matched substrings, always starting and ending with a
non-match (empty if the match occurred at the beginning or the end,
respectively).
Note that the match data can be obtained from regular expression
matching on a modified version of \code{x} with the same numbers of
characters.
The replacement function can be used for replacing the matched or
non-matched substrings. For vector match data, if \code{invert} is
\code{FALSE}, \code{value} should be a character vector with length the
number of matched elements in \code{m}. Otherwise, it should be a
list of character vectors with the same length as \code{m}, each as
long as the number of replacements needed. Replacement coerces values
to character or list and generously recycles values as needed.
Missing replacement values are not allowed.
}
\value{
For \code{regmatches}, a character vector with the matched substrings
if \code{m} is a vector and \code{invert} is \code{FALSE}. Otherwise,
a list with the matched and/or non-matched substrings.
For \code{regmatches<-}, the updated character vector.
}
\examples{
## Strings listing items separated by commas and/or "and"; "foobar"
## contains no separator at all.
x <- c("A and B", "A, B and C", "A, B, C and D", "foobar")
## A separator: optional leading whitespace, then "," or "and", then
## one whitespace character.
pattern <- "[[:space:]]*(,|and)[[:space:]]"
## Match data from regexpr()
m <- regexpr(pattern, x)
## Vector match data: only the matched substrings are returned, so
## strings without a match are dropped from the result.
regmatches(x, m)
## Non-matched parts: with vector match data each string is split at
## most once.
regmatches(x, m, invert = TRUE)
## Match data from gregexpr()
m <- gregexpr(pattern, x)
## List match data: one character vector of matches per string, which
## is empty for strings without a match.
regmatches(x, m)
## Non-matched parts: each string is split according to its matches.
regmatches(x, m, invert = TRUE)
## Consider
x <- "John (fishing, hunting), Paul (hiking, biking)"
## Suppose we want to split at the comma (plus spaces) between the
## persons, but not at the commas in the parenthesized hobby lists.
## One idea is to "blank out" the parenthesized parts to match the
## parts to be used for splitting, and extract the persons as the
## non-matched parts.
## First, match the parenthesized hobby lists.
m <- gregexpr("\\\\([^)]*\\\\)", x)
## Create blank strings with given numbers of characters.
blanks <- function(n) strrep(" ", n)
## Create a copy of x with the parenthesized parts blanked out.
s <- x
## The replacement value is a list with one character vector per
## element of s.  Each blank string is as long as the match it
## replaces, so s keeps the same number of characters as x.
regmatches(s, m) <- Map(blanks, lapply(regmatches(s, m), nchar))
s
## Compute the positions of the split matches (note that we cannot call
## strsplit() on x with match data from s).
m <- gregexpr(", *", s)
## And finally extract the non-matched parts.
## The match data computed on s can be applied to x because both
## strings have the same number of characters.
regmatches(x, m, invert = TRUE)
## regexec() and gregexec() return overlapping ranges because the
## first match is the full match.  This conflicts with regmatches()<-
## and regmatches(..., invert=TRUE).  We can work around this by
## dropping the first match.
##
## drop_first() takes the match data for a single string (a vector, or
## a matrix with one column per match) together with its match.length
## attribute, and removes the first entry (the full match) so that only
## the capture groups remain.  Match data containing NA or non-positive
## positions (no match) is returned unchanged.
drop_first <- function(x) {
    if(!anyNA(x) && all(x > 0)) {
        ml <- attr(x, 'match.length')
        ## drop = FALSE keeps the matrix shape even when only a single
        ## capture group or a single match is left after subsetting.
        if(is.matrix(x)) x <- x[-1, , drop = FALSE] else x <- x[-1]
        ## Subsetting discards attributes, so restore match.length,
        ## trimmed in the same way as the positions.
        attr(x, 'match.length') <-
            if(is.matrix(ml)) ml[-1, , drop = FALSE] else ml[-1]
    }
    x
}
## Two capture groups per match: a word followed by a parenthesized,
## comma-separated list of words.
m <- gregexec("(\\\\w+) \\\\(((?:\\\\w+(?:, )?)+)\\\\)", x)
regmatches(x, m)
## Wrapped in try() since the overlapping ranges conflict with
## invert=TRUE, as noted above.
try(regmatches(x, m, invert=TRUE))
## With the full match dropped, only the capture groups are extracted.
regmatches(x, lapply(m, drop_first))
## invert=TRUE loses matrix structure because we are retrieving what
## is in between every sub-match
regmatches(x, lapply(m, drop_first), invert=TRUE)
y <- z <- x
## Notice **list**(...) on the RHS
regmatches(y, lapply(m, drop_first)) <- list(c("<NAME>", "<HOBBY-LIST>"))
y
## Replace the pieces in between the sub-matches with numbered
## markers, again supplied as a list on the RHS.
regmatches(z, lapply(m, drop_first), invert=TRUE) <-
list(sprintf("<\%d>", 1:5))
z
## With `perl = TRUE` and `invert = FALSE` capture group names
## are preserved. Collect functions and arguments in calls:
## Locate the file via R.home("doc") rather than assuming that the
## doc directory is a subdirectory of R.home(); some installations
## keep it elsewhere.
NEWS <- head(readLines(file.path(R.home("doc"), "NEWS.2")), 100)
m <- gregexec("(?<fun>\\\\w+)\\\\((?<args>[^)]*)\\\\)", NEWS, perl = TRUE)
y <- regmatches(NEWS, m)
y[[16]]
## Make tabular, adding original line numbers
mdat <- as.data.frame(t(do.call(cbind, y)))
## Each match contributes one row to mdat, so repeat each line index
## once per match found on that line.
mdat <- cbind(mdat, line = rep(seq_along(y), lengths(y) / ncol(mdat)))
head(mdat)
NEWS[head(mdat[["line"]])]
}
\keyword{character}
\keyword{utilities}