| # File src/library/base/R/grep.R |
| # Part of the R package, https://www.R-project.org |
| # |
| # Copyright (C) 1995-2015 The R Core Team |
| # |
| # This program is free software; you can redistribute it and/or modify |
| # it under the terms of the GNU General Public License as published by |
| # the Free Software Foundation; either version 2 of the License, or |
| # (at your option) any later version. |
| # |
| # This program is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| # GNU General Public License for more details. |
| # |
| # A copy of the GNU General Public License is available at |
| # https://www.R-project.org/Licenses/ |
| |
| |
| ## Q: Why are we using as.character(.) all over the place instead of doing that in C ? |
| ## A: These must work for objects which have their own as.character(.) methods *and* |
| ## as.character() is fast [Primitive] |
| |
| strsplit <- |
| function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE) |
| .Internal(strsplit(x, as.character(split), fixed, perl, useBytes)) |
| |
| grep <- |
| function(pattern, x, ignore.case = FALSE, perl = FALSE, |
| value = FALSE, fixed = FALSE, useBytes = FALSE, invert = FALSE) |
| { |
| ## when value = TRUE we return names |
| if(!is.character(x)) x <- structure(as.character(x), names=names(x)) |
| .Internal(grep(as.character(pattern), x, ignore.case, value, |
| perl, fixed, useBytes, invert)) |
| } |
| |
| grepl <- |
| function(pattern, x, ignore.case = FALSE, perl = FALSE, |
| fixed = FALSE, useBytes = FALSE) |
| { |
| if(!is.character(x)) x <- as.character(x) |
| .Internal(grepl(as.character(pattern), x, ignore.case, FALSE, |
| perl, fixed, useBytes, FALSE)) |
| } |
| |
| sub <- |
| function(pattern, replacement, x, ignore.case = FALSE, |
| perl = FALSE, fixed = FALSE, useBytes = FALSE) |
| { |
| if (!is.character(x)) x <- as.character(x) |
| .Internal(sub(as.character(pattern), as.character(replacement), x, |
| ignore.case, perl, fixed, useBytes)) |
| } |
| |
| gsub <- |
| function(pattern, replacement, x, ignore.case = FALSE, |
| perl = FALSE, fixed = FALSE, useBytes = FALSE) |
| { |
| if (!is.character(x)) x <- as.character(x) |
| .Internal(gsub(as.character(pattern), as.character(replacement), x, |
| ignore.case, perl, fixed, useBytes)) |
| } |
| |
| regexpr <- |
| function(pattern, text, ignore.case = FALSE, perl = FALSE, |
| fixed = FALSE, useBytes = FALSE) |
| { |
| if (!is.character(text)) text <- as.character(text) |
| .Internal(regexpr(as.character(pattern), text, |
| ignore.case, perl, fixed, useBytes)) |
| } |
| |
| gregexpr <- |
| function(pattern, text, ignore.case = FALSE, perl = FALSE, |
| fixed = FALSE, useBytes = FALSE) |
| { |
| if (!is.character(text)) text <- as.character(text) |
| .Internal(gregexpr(as.character(pattern), text, |
| ignore.case, perl, fixed, useBytes)) |
| } |
| |
| grepRaw <- |
| function(pattern, x, offset = 1L, ignore.case = FALSE, value = FALSE, |
| fixed = FALSE, all = FALSE, invert = FALSE) |
| { |
| if (!is.raw(pattern)) pattern <- charToRaw(as.character(pattern)) |
| if (!is.raw(x)) x <- charToRaw(as.character(x)) |
| .Internal(grepRaw(pattern, x, offset, ignore.case, fixed, value, all, invert)) |
| } |
| |
| regexec <- |
| function(pattern, text, ignore.case = FALSE, perl = FALSE, |
| fixed = FALSE, useBytes = FALSE) |
| { |
| if (!is.character(text)) text <- as.character(text) |
| if(!perl || fixed) |
| return(.Internal(regexec(as.character(pattern), text, ignore.case, fixed, |
| useBytes))) |
| |
| ## For perl = TRUE, re-use regexpr(perl = TRUE) which always |
| ## captures subexpressions. |
| |
| match_data_from_pos_and_len <- function(pos, len) { |
| attr(pos, "match.length") <- len |
| pos |
| } |
| |
| m <- regexpr(pattern, text, |
| ignore.case = ignore.case, useBytes = useBytes, |
| perl = TRUE) |
| y <- vector("list", length(text)) |
| y[is.na(m)] <- list(match_data_from_pos_and_len(NA_integer_, NA_integer_)) |
| ind <- !is.na(m) & (m == -1L) |
| if(any(ind)) { |
| y[ind] <- list(match_data_from_pos_and_len(-1L, -1L)) |
| } |
| ind <- !is.na(m) & !ind |
| if(any(ind)) { |
| pos <- cbind(m[ind], |
| attr(m, "capture.start")[ind, , drop = FALSE]) |
| len <- cbind(attr(m, "match.length")[ind], |
| attr(m, "capture.length")[ind, , drop = FALSE]) |
| y[ind] <- Map(match_data_from_pos_and_len, |
| split(pos, row(pos)), |
| split(len, row(len))) |
| } |
| if(identical(attr(m, "useBytes"), TRUE)) |
| y <- lapply(y, `attr<-`, "useBytes", TRUE) |
| y |
| } |
| |
| agrep <- |
| function(pattern, x, max.distance = 0.1, costs = NULL, |
| ignore.case = FALSE, value = FALSE, fixed = TRUE, |
| useBytes = FALSE) |
| { |
| pattern <- as.character(pattern) |
| if(!is.character(x)) x <- as.character(x) |
| |
| ## TRE needs integer costs: coerce here for simplicity. |
| costs <- as.integer(.amatch_costs(costs)) |
| bounds <- .amatch_bounds(max.distance) |
| |
| .Internal(agrep(pattern, x, ignore.case, value, costs, bounds, |
| useBytes, fixed)) |
| } |
| |
| agrepl <- |
| function(pattern, x, max.distance = 0.1, costs = NULL, |
| ignore.case = FALSE, fixed = TRUE, useBytes = FALSE) |
| { |
| pattern <- as.character(pattern) |
| if(!is.character(x)) x <- as.character(x) |
| |
| ## TRE needs integer costs: coerce here for simplicity. |
| costs <- as.integer(.amatch_costs(costs)) |
| bounds <- .amatch_bounds(max.distance) |
| |
| .Internal(agrepl(pattern, x, ignore.case, FALSE, costs, bounds, |
| useBytes, fixed)) |
| } |
| |
| .amatch_bounds <- |
| function(x = 0.1) |
| { |
| ## Expand max match distance argument for agrep() et al into bounds |
| ## for the TRE regaparams struct. |
| |
| ## Note that TRE allows for possibly different (integer) costs for |
| ## insertions, deletions and substitions, and allows for specifying |
| ## separate bounds for these numbers as well as the total number of |
| ## "errors" (transformations) and the total cost. |
| ## |
| ## When using unit costs (and older versions of agrep() did not |
| ## allow otherwise), the total number of errors is the same as the |
| ## total cost, and bounds on the total number of errors imply the |
| ## same bounds for the individual transformation counts. This no |
| ## longer holds when using possibly different costs. |
| ## |
| ## See ? agrep for details on handling the match distance argument. |
| ## |
| ## Older versions of agrep() expanded fractions (of the pattern |
| ## length) in R code: but as the C code determines whether matching |
| ## used bytes or characters, only the C code can determine the |
| ## pattern length and hence expand fractions. |
| ## |
| ## Unspecified bounds are taken as NA_real_, and set to INT_MAX by |
| ## the C code. |
| |
| if(!is.list(x)) { |
| ## Sanity checks. |
| if(!is.numeric(x) || (x < 0)) |
| stop("match distance components must be non-negative") |
| bounds <- c(as.double(x), rep.int(NA_real_, 4L)) |
| } else { |
| table <- |
| c("cost", "insertions", "deletions", "substitutions", "all") |
| ## Partial matching. |
| pos <- pmatch(names(x), table) |
| if(anyNA(pos)) { |
| warning("unknown match distance components ignored") |
| x <- x[!is.na(pos)] |
| } |
| names(x) <- table[pos] |
| ## Sanity checks. |
| x <- unlist(x) |
| if(!all(is.numeric(x)) || any(x < 0)) |
| stop("match distance components must be non-negative") |
| ## Defaults. |
| if(!is.na(x["cost"])) { |
| bounds <- rep.int(NA_real_, 5L) |
| } else { |
| ## If 'cost' is missing: if 'all' is missing it is set to |
| ## 0.1, and the other transformation number bounds default |
| ## to 'all'. |
| if(is.na(x["all"])) |
| x["all"] <- 0.1 |
| bounds <- c(NA_real_, rep.int(x["all"], 4L)) |
| } |
| names(bounds) <- table |
| bounds[names(x)] <- x |
| } |
| |
| bounds |
| } |
| |
| .amatch_costs <- |
| function(x = NULL) |
| { |
| costs <- c(insertions = 1, deletions = 1, substitutions = 1) |
| if(!is.null(x)) { |
| x <- as.list(x) |
| ## Partial matching. |
| pos <- pmatch(names(x), names(costs)) |
| if(anyNA(pos)) { |
| warning("unknown cost components ignored") |
| x <- x[!is.na(pos)] |
| } |
| ## Sanity checks. |
| x <- unlist(x) |
| if(!all(is.numeric(x)) || any(x < 0)) |
| stop("cost components must be non-negative") |
| costs[pos] <- x |
| } |
| |
| costs |
| } |
| |
| regmatches <- |
| function(x, m, invert = FALSE) |
| { |
| if(length(x) != length(m)) |
| stop(gettextf("%s and %s must have the same length", |
| sQuote("x"), sQuote("m")), |
| domain = NA) |
| |
| ili <- is.list(m) |
| |
| ## Handle useBytes/encoding issues. |
| ## Match positions from regexpr(), gregexpr() and regexec() are in |
| ## characters unless 'useBytes = TRUE' was given, now recorded via |
| ## the 'index.type' attribute (in addition to the 'useBytes' one |
| ## being TRUE when 'useBytes = TRUE' was given *or* all character |
| ## string involved were ASCII). |
| ## To convince substring() and nchar() used below accordingly that |
| ## match data positions are in bytes, we set the input encoding to |
| ## "bytes" for the former and call the latter with 'type = "bytes"'. |
| itype <- "chars" |
| useBytes <- if(ili) |
| any(unlist(lapply(m, attr, "index.type")) == "bytes") |
| else |
| any(attr(m, "index.type") == "bytes") |
| if(useBytes) { |
| itype <- Encoding(x) <- "bytes" |
| } |
| |
| ## For NA matches (from matching a non-NA pattern on an NA string), |
| ## direct matches give nothing and inverse matches give NA (as |
| ## nothing was matched). |
| |
| if(!ili && isFALSE(invert)) { |
| so <- m[ind <- (!is.na(m) & (m > -1L))] |
| eo <- so + attr(m, "match.length")[ind] - 1L |
| return(substring(x[ind], so, eo)) |
| } |
| |
| y <- if(is.na(invert)) { |
| Map(function(u, so, ml) { |
| if((n <- length(so)) == 1L) { |
| if(is.na(so) ) |
| return(NA_character_) # Or u ... |
| else if(so == -1L) |
| return(u) |
| } |
| eo <- so + ml - 1L |
| if(n > 1L) { |
| ## regexec() could give overlapping matches. |
| ## Matches are non-overlapping iff |
| ## eo[i] < so[i + 1], i = 1, ..., n - 1. |
| if(any(eo[-n] >= so[-1L])) |
| stop(gettextf("need non-overlapping matches for %s", |
| sQuote("invert = NA")), |
| domain = NA) |
| } |
| beg <- c(1L, c(rbind(so, eo + 1L))) |
| end <- c(c(rbind(so - 1L, eo)), nchar(u, itype)) |
| substring(u, beg, end) |
| }, |
| x, m, |
| if(ili) |
| lapply(m, attr, "match.length") |
| else |
| attr(m, "match.length"), |
| USE.NAMES = FALSE) |
| } else if(invert) { |
| Map(function(u, so, ml) { |
| if((n <- length(so)) == 1L) { |
| if(is.na(so) ) |
| return(NA_character_) # Or u ... |
| else if(so == -1L) |
| return(u) |
| } |
| beg <- if(n > 1L) { |
| ## See above. |
| eo <- so + ml - 1L |
| if(any(eo[-n] >= so[-1L])) |
| stop(gettextf("need non-overlapping matches for %s", |
| sQuote("invert = TRUE")), |
| domain = NA) |
| c(1L, eo + 1L) |
| } else { |
| c(1L, so + ml) |
| } |
| end <- c(so - 1L, nchar(u, itype)) |
| substring(u, beg, end) |
| }, |
| x, m, |
| if(ili) |
| lapply(m, attr, "match.length") |
| else |
| attr(m, "match.length"), |
| USE.NAMES = FALSE) |
| } else { |
| Map(function(u, so, ml) { |
| if(length(so) == 1L) { |
| if(is.na(so) || (so == -1L)) |
| return(character()) |
| } |
| substring(u, so, so + ml - 1L) |
| }, |
| x, m, |
| lapply(m, attr, "match.length"), |
| USE.NAMES = FALSE) |
| } |
| |
| names(y) <- names(x) |
| y |
| } |
| |
| ## Suppose matching partitions a string as |
| ## n0 m1 n1 ... mk nk |
| ## where the m and n substrings are the matched and non-matched parts, |
| ## respectively, and n0 and/or nk can be empty. |
| ## (regexec() can give overlapping matches, in which case extracting |
| ## inverted matches or replacing cannot work.) |
| ## For list match data, k can be any non-negative integer. |
| ## Extraction and replacement straightforwardly work on the m or n |
| ## sequences, depending on whether invert is FALSE or TRUE. |
| ## For vector match data from regexpr(), k can be 0 or 1. |
| ## If k = 0 (no match): |
| ## invert |
| ## FALSE TRUE |
| ## extract drop n0 |
| ## replace n0 r0 |
| ## If k = 1: |
| ## invert |
| ## FALSE TRUE |
| ## extract m1 c(n0, n1) |
| ## replace n0 r1 n1 r0 m1 r1 |
| |
| `regmatches<-` <- |
| function(x, m, invert = FALSE, value) |
| { |
| if(!length(x)) return(x) |
| |
| ili <- is.list(m) |
| |
| if(!ili && invert && any(m == -1L)) { |
| ## regmatches() drops empty matches for vector match data if |
| ## invert is FALSE (see above): we need to work around this when |
| ## replacing non-matches (PR #15723). |
| y <- rep_len(list(character()), length(x)) |
| y[m > -1L] <- as.list(regmatches(x, m, FALSE)) |
| } else { |
| y <- regmatches(x, m, !invert) |
| } |
| |
| ## <FIXME> |
| ## It might be simpler to try reducing the vector case to the list |
| ## case, transforming m and value as needed, |
| ## </FIXME> |
| |
| if(!ili && !invert) { |
| ## For non-list m and invert = FALSE, we need a character vector |
| ## of replacement values with length the number of matched |
| ## elements. |
| value <- as.character(value) |
| if(anyNA(value)) |
| stop("missing replacement values are not allowed") |
| ## Entries for matched elements have length 2. |
| pos <- which(lengths(y) == 2L) |
| np <- length(pos) |
| nv <- length(value) |
| if(np != nv) { |
| if(!nv) |
| stop("must have replacement values for matches") |
| value <- rep_len(value, np) |
| } |
| y <- y[pos] |
| x[pos] <- paste0(sapply(y, `[`, 1L), value, sapply(y, `[`, 2L)) |
| return(x) |
| } |
| |
| ## We need a list of character vectors without missings, which has |
| ## the same length as x. |
| value <- lapply(value, as.character) |
| if(anyNA(value)) # {recursively!} |
| stop("missing replacement values are not allowed") |
| if(!length(value)) |
| stop("value does not provide any replacement values") |
| value <- rep_len(value, length(x)) |
| |
| y <- if(invert) { |
| ## Replace non-matches. |
| ## An element of x with k matches has a corresponding y element |
| ## of length k, and needs k + 1 replacement values. |
| Map(function(u, v) { |
| nu <- length(u) |
| nv <- length(v) |
| if(nv != (nu + 1L)) { |
| if(!nv) |
| stop("must have replacements for non-matches") |
| v <- rep_len(v, nu + 1L) |
| } |
| paste0(v, c(u, ""), collapse = "") |
| }, |
| y, value, USE.NAMES = FALSE) |
| } else { |
| ## Replace matches. |
| ## An element of x with k matches has a corresponding y element |
| ## of length k + 1, and needs k replacement values. |
| Map(function(u, v) { |
| nu <- length(u) |
| nv <- length(v) |
| if(nv != (nu - 1L)) { |
| if(!nv) |
| stop("must have replacements for matches") |
| v <- rep_len(v, nu - 1L) |
| } |
| paste0(u, c(v, ""), collapse = "") |
| }, |
| y, value, USE.NAMES = FALSE) |
| } |
| |
| y <- unlist(y) |
| names(y) <- names(x) |
| |
| y |
| } |
| |
| pcre_config <- function() .Internal(pcre_config()) |