src/library/base/man/regmatches.Rd - R - Git at Google

 % File src/library/base/man/regmatches.Rd
 % Part of the R package, https://www.R-project.org
 % Copyright 1995-2014 R Core Team
 % Distributed under GPL 2 or later

 \name{regmatches}
 \alias{regmatches}
 \alias{regmatches<-}
 \title{Extract or Replace Matched Substrings}
 \description{
   Extract or replace matched substrings from match data obtained by
   \code{\link{regexpr}}, \code{\link{gregexpr}},
   \code{\link{regexec}} or \code{\link{gregexec}}.
 }
 \usage{
 regmatches(x, m, invert = FALSE)
 regmatches(x, m, invert = FALSE) <- value
 }
 \arguments{
   \item{x}{a character vector}
   \item{m}{an object with match data}
   \item{invert}{a logical: if \code{TRUE}, extract or replace the
     non-matched substrings.}
   \item{value}{an object with suitable replacement values for the
     matched or non-matched substrings (see \code{Details}).}
 }
 \details{
   If \code{invert} is \code{FALSE} (default), \code{regmatches} extracts
   the matched substrings as specified by the match data.  For vector
   match data (as obtained from \code{\link{regexpr}}), empty matches are
   dropped; for list match data, empty matches give empty components
   (zero-length character vectors).

   If \code{invert} is \code{TRUE}, \code{regmatches} extracts the
   non-matched substrings, i.e., the strings are split according to the
   matches similar to \code{\link{strsplit}} (for vector match data, at
   most a single split is performed).

   If \code{invert} is \code{NA}, \code{regmatches} extracts both
   non-matched and matched substrings, always starting and ending with a
   non-match (empty if the match occurred at the beginning or the end,
   respectively).

   Note that the match data can be obtained from regular expression
   matching on a modified version of \code{x} with the same numbers of
   characters.

   The replacement function can be used for replacing the matched or
   non-matched substrings.  For vector match data, if \code{invert} is
   \code{FALSE}, \code{value} should be a character vector with length the
   number of matched elements in \code{m}.  Otherwise, it should be a
   list of character vectors with the same length as \code{m}, each as
   long as the number of replacements needed.  Replacement coerces values
   to character or list and generously recycles values as needed.
   Missing replacement values are not allowed.
 }
 \value{
   For \code{regmatches}, a character vector with the matched substrings
   if \code{m} is a vector and \code{invert} is \code{FALSE}.  Otherwise,
   a list with the matched and/or non-matched substrings.

   For \code{regmatches<-}, the updated character vector.
 }
 \examples{
 ## Four input strings; the last one ("foobar") contains no separator.
 x <- c("A and B", "A, B and C", "A, B, C and D", "foobar")
 ## Separator: optional leading whitespace, then "," or "and", then one
 ## whitespace character.
 pattern <- "[[:space:]]*(,|and)[[:space:]]"
 ## Match data from regexpr()
 ## (vector match data: a single match position per string)
 m <- regexpr(pattern, x)
 ## Matched separators, returned as a character vector.
 regmatches(x, m)
 ## Non-matched parts: each string is split at most once.
 regmatches(x, m, invert = TRUE)
 ## Match data from gregexpr()
 ## (list match data: one component per string)
 m <- gregexpr(pattern, x)
 ## All matched separators, as a list with one component per string.
 regmatches(x, m)
 ## Non-matched parts: each string is split at every separator.
 regmatches(x, m, invert = TRUE)

 ## Consider
 x <- "John (fishing, hunting), Paul (hiking, biking)"
 ## Suppose we want to split at the comma (plus spaces) between the
 ## persons, but not at the commas in the parenthesized hobby lists.
 ## One idea is to "blank out" the parenthesized parts to match the
 ## parts to be used for splitting, and extract the persons as the
 ## non-matched parts.
 ## First, match the parenthesized hobby lists.
 m <- gregexpr("\\\\([^)]*\\\\)", x)
 ## Create blank strings with given numbers of characters.
 blanks <- function(n) strrep(" ", n)
 ## Create a copy of x with the parenthesized parts blanked out.
 s <- x
 ## Each matched part is replaced by blanks of the same width (taken
 ## from nchar), so s keeps the same number of characters as x.
 regmatches(s, m) <- Map(blanks, lapply(regmatches(s, m), nchar))
 ## Show the blanked-out copy.
 s
 ## Compute the positions of the split matches (note that we cannot call
 ## strsplit() on x with match data from s).
 m <- gregexpr(", *", s)
 ## And finally extract the non-matched parts.
 ## (The match data were computed on s but are applied to x here.)
 regmatches(x, m, invert = TRUE)

 ## regexec() and gregexec() return overlapping ranges because the
 ## first match is the full match.  This conflicts with regmatches()<-
 ## and regmatches(..., invert=TRUE).  We can work-around by dropping
 ## the first match.
 ## Remove the full-match entry (first element, or first row for matrix
 ## match data) so that only the capture-group sub-matches remain.
 ## x: one component of regexec() or gregexec() match data, carrying a
 ##    match.length attribute.
 ## Returns x unchanged when it contains NA or any non-positive start
 ## (this includes -1, i.e. no match); otherwise the reduced match data
 ## with a correspondingly reduced match.length attribute.
 drop_first <- function(x) {
     if(anyNA(x) || any(x <= 0)) return(x)
     ## Save the lengths first: subsetting x discards the attribute.
     ml <- attr(x, 'match.length')
     ## drop = FALSE keeps the matrix shape even when a single row or
     ## column remains; a plain x[-1,] would collapse it to a vector.
     x <- if(is.matrix(x)) x[-1, , drop = FALSE] else x[-1]
     attr(x, 'match.length') <-
         if(is.matrix(ml)) ml[-1, , drop = FALSE] else ml[-1]
     x
 }
 ## Two capture groups: a name, then the text inside the parentheses
 ## that follow it.
 m <- gregexec("(\\\\w+) \\\\(((?:\\\\w+(?:, )?)+)\\\\)", x)
 ## Full matches together with both capture groups.
 regmatches(x, m)
 ## Wrapped in try() so that the error the overlapping ranges are
 ## expected to trigger does not abort the examples.
 try(regmatches(x, m, invert=TRUE))
 ## With the full match dropped, only the capture groups remain.
 regmatches(x, lapply(m, drop_first))
 ## invert=TRUE loses matrix structure because we are retrieving what
 ## is in between every sub-match
 regmatches(x, lapply(m, drop_first), invert=TRUE)
 ## Two working copies: y for replacing the sub-matches, z for
 ## replacing what lies between them.
 y <- z <- x
 ## Notice **list**(...) on the RHS
 ## (the two labels are recycled as needed across the sub-matches)
 regmatches(y, lapply(m, drop_first)) <- list(c("<NAME>", "<HOBBY-LIST>"))
 y
 ## Four sub-matches leave five non-matched pieces, hence five labels.
 regmatches(z, lapply(m, drop_first), invert=TRUE) <-
     list(sprintf("<\%d>", 1:5))
 z

 ## With `perl = TRUE` and `invert = FALSE` capture group names
 ## are preserved.  Collect functions and arguments in calls:
 ## Read the first 100 lines of the NEWS.2 file in the doc directory
 ## under R.home().
 NEWS <- head(readLines(file.path(R.home(), 'doc', 'NEWS.2')), 100)
 ## Named groups: fun is the word right before an opening parenthesis,
 ## args is everything up to the next closing parenthesis.
 m <- gregexec("(?<fun>\\\\w+)\\\\((?<args>[^)]*)\\\\)", NEWS, perl = TRUE)
 y <- regmatches(NEWS, m)
 ## Matches found on line 16 of the excerpt.
 y[[16]]
 ## Make tabular, adding original line numbers
 ## (binding the per-line results column-wise and transposing gives one
 ## row per match)
 mdat <- as.data.frame(t(do.call(cbind, y)))
 ## lengths(y) / ncol(mdat) is the number of matches on each line, so
 ## every line number is repeated once per match found on that line.
 mdat <- cbind(mdat, line=rep(seq_along(y), lengths(y) / ncol(mdat)))
 head(mdat)
 ## Show the source lines belonging to the first few matches.
 NEWS[head(mdat[['line']])]
 }
 \keyword{character}
 \keyword{utilities}
	% File src/library/base/man/regmatches.Rd
	% Part of the R package, https://www.R-project.org
	% Copyright 1995-2014 R Core Team
	% Distributed under GPL 2 or later

	\name{regmatches}
	\alias{regmatches}
	\alias{regmatches<-}
	\title{Extract or Replace Matched Substrings}
	\description{
	Extract or replace matched substrings from match data obtained by
	\code{\link{regexpr}}, \code{\link{gregexpr}},
	\code{\link{regexec}} or \code{\link{gregexec}}.
	}
	\usage{
	regmatches(x, m, invert = FALSE)
	regmatches(x, m, invert = FALSE) <- value
	}
	\arguments{
	\item{x}{a character vector}
	\item{m}{an object with match data}
	\item{invert}{a logical: if \code{TRUE}, extract or replace the
	non-matched substrings.}
	\item{value}{an object with suitable replacement values for the
	matched or non-matched substrings (see \code{Details}).}
	}
	\details{
	If \code{invert} is \code{FALSE} (default), \code{regmatches} extracts
	the matched substrings as specified by the match data. For vector
	match data (as obtained from \code{\link{regexpr}}), empty matches are
	dropped; for list match data, empty matches give empty components
	(zero-length character vectors).

	If \code{invert} is \code{TRUE}, \code{regmatches} extracts the
	non-matched substrings, i.e., the strings are split according to the
	matches similar to \code{\link{strsplit}} (for vector match data, at
	most a single split is performed).

	If \code{invert} is \code{NA}, \code{regmatches} extracts both
	non-matched and matched substrings, always starting and ending with a
	non-match (empty if the match occurred at the beginning or the end,
	respectively).

	Note that the match data can be obtained from regular expression
	matching on a modified version of \code{x} with the same numbers of
	characters.

	The replacement function can be used for replacing the matched or
	non-matched substrings. For vector match data, if \code{invert} is
	\code{FALSE}, \code{value} should be a character vector with length the
	number of matched elements in \code{m}. Otherwise, it should be a
	list of character vectors with the same length as \code{m}, each as
	long as the number of replacements needed. Replacement coerces values
	to character or list and generously recycles values as needed.
	Missing replacement values are not allowed.
	}
	\value{
	For \code{regmatches}, a character vector with the matched substrings
	if \code{m} is a vector and \code{invert} is \code{FALSE}. Otherwise,
	a list with the matched and/or non-matched substrings.

	For \code{regmatches<-}, the updated character vector.
	}
	\examples{
	## Four input strings; the last one ("foobar") contains no separator.
	x <- c("A and B", "A, B and C", "A, B, C and D", "foobar")
	## Separator: optional leading whitespace, then "," or "and", then one
	## whitespace character.  The alternation bar must be written plainly:
	## a backslash in front of it is not a valid escape in an R string.
	pattern <- "[[:space:]]*(,|and)[[:space:]]"
	## Match data from regexpr()
	## (vector match data: a single match position per string)
	m <- regexpr(pattern, x)
	## Matched separators, returned as a character vector.
	regmatches(x, m)
	## Non-matched parts: each string is split at most once.
	regmatches(x, m, invert = TRUE)
	## Match data from gregexpr()
	## (list match data: one component per string)
	m <- gregexpr(pattern, x)
	## All matched separators, as a list with one component per string.
	regmatches(x, m)
	## Non-matched parts: each string is split at every separator.
	regmatches(x, m, invert = TRUE)

	## Consider
	x <- "John (fishing, hunting), Paul (hiking, biking)"
	## Suppose we want to split at the comma (plus spaces) between the
	## persons, but not at the commas in the parenthesized hobby lists.
	## One idea is to "blank out" the parenthesized parts to match the
	## parts to be used for splitting, and extract the persons as the
	## non-matched parts.
	## First, match the parenthesized hobby lists.
	m <- gregexpr("\\\\([^)]*\\\\)", x)
	## Create blank strings with given numbers of characters.
	blanks <- function(n) strrep(" ", n)
	## Create a copy of x with the parenthesized parts blanked out.
	s <- x
	## Each matched part is replaced by blanks of the same width (taken
	## from nchar), so s keeps the same number of characters as x.
	regmatches(s, m) <- Map(blanks, lapply(regmatches(s, m), nchar))
	## Show the blanked-out copy.
	s
	## Compute the positions of the split matches (note that we cannot call
	## strsplit() on x with match data from s).
	m <- gregexpr(", *", s)
	## And finally extract the non-matched parts.
	## (The match data were computed on s but are applied to x here.)
	regmatches(x, m, invert = TRUE)

	## regexec() and gregexec() return overlapping ranges because the
	## first match is the full match. This conflicts with regmatches()<-
	## and regmatches(..., invert=TRUE). We can work-around by dropping
	## the first match.
	## Remove the full-match entry (first element, or first row for matrix
	## match data) so that only the capture-group sub-matches remain.
	## x: one component of regexec() or gregexec() match data, carrying a
	##    match.length attribute.
	## Returns x unchanged when it contains NA or any non-positive start
	## (this includes -1, i.e. no match); otherwise the reduced match data
	## with a correspondingly reduced match.length attribute.
	drop_first <- function(x) {
	    if(anyNA(x) || any(x <= 0)) return(x)
	    ## Save the lengths first: subsetting x discards the attribute.
	    ml <- attr(x, 'match.length')
	    ## drop = FALSE keeps the matrix shape even when a single row or
	    ## column remains; a plain x[-1,] would collapse it to a vector.
	    x <- if(is.matrix(x)) x[-1, , drop = FALSE] else x[-1]
	    attr(x, 'match.length') <-
	        if(is.matrix(ml)) ml[-1, , drop = FALSE] else ml[-1]
	    x
	}
	## Two capture groups: a name, then the text inside the parentheses
	## that follow it.
	m <- gregexec("(\\\\w+) \\\\(((?:\\\\w+(?:, )?)+)\\\\)", x)
	## Full matches together with both capture groups.
	regmatches(x, m)
	## Wrapped in try() so that the error the overlapping ranges are
	## expected to trigger does not abort the examples.
	try(regmatches(x, m, invert=TRUE))
	## With the full match dropped, only the capture groups remain.
	regmatches(x, lapply(m, drop_first))
	## invert=TRUE loses matrix structure because we are retrieving what
	## is in between every sub-match
	regmatches(x, lapply(m, drop_first), invert=TRUE)
	## Two working copies: y for replacing the sub-matches, z for
	## replacing what lies between them.
	y <- z <- x
	## Notice list(...) on the RHS
	## (the two labels are recycled as needed across the sub-matches)
	regmatches(y, lapply(m, drop_first)) <- list(c("<NAME>", "<HOBBY-LIST>"))
	y
	## Four sub-matches leave five non-matched pieces, hence five labels.
	regmatches(z, lapply(m, drop_first), invert=TRUE) <-
	list(sprintf("<\%d>", 1:5))
	z

	## With `perl = TRUE` and `invert = FALSE` capture group names
	## are preserved. Collect functions and arguments in calls:
	## Read the first 100 lines of the NEWS.2 file in the doc directory
	## under R.home().
	NEWS <- head(readLines(file.path(R.home(), 'doc', 'NEWS.2')), 100)
	## Named groups: fun is the word right before an opening parenthesis,
	## args is everything up to the next closing parenthesis.
	m <- gregexec("(?<fun>\\\\w+)\\\\((?<args>[^)]*)\\\\)", NEWS, perl = TRUE)
	y <- regmatches(NEWS, m)
	## Matches found on line 16 of the excerpt.
	y[[16]]
	## Make tabular, adding original line numbers
	## (binding the per-line results column-wise and transposing gives one
	## row per match)
	mdat <- as.data.frame(t(do.call(cbind, y)))
	## lengths(y) / ncol(mdat) is the number of matches on each line, so
	## every line number is repeated once per match found on that line.
	mdat <- cbind(mdat, line=rep(seq_along(y), lengths(y) / ncol(mdat)))
	head(mdat)
	## Show the source lines belonging to the first few matches.
	NEWS[head(mdat[['line']])]
	}
	\keyword{character}
	\keyword{utilities}