## tests/utf8-regex.R - R - Git at Google

 ## This can only be done in a locale that extends Latin-1
 ## Print the locale information, then abandon the run (with a warning)
 ## unless the session reports either a UTF-8 or a Latin-1 locale.
 (inf <- l10n_info())
 if(!(inf$`UTF-8` || inf$`Latin-1`)) {
     warning("this test must be done in a Latin-1 or UTF-8 locale")
     q()
 }

 ## readLines() is given no connection, so it reads the next two lines from
 ## standard input.  NOTE(review): this presumably relies on the script being
 ## fed to R on stdin, so that the two non-R lines directly below the call are
 ## consumed as data -- keep them immediately after it.
 inp <- readLines(n = 2)
 «Latin-1 accented chars»: éè øØ å<Å æ<Æ é éè
 éè

 inp
 ## Re-encode the subject (line 1) and the pattern (line 2) from Latin-1 to
 ## the encoding of the current locale.
 (txt <- iconv(inp[1], "latin1", ""))
 (pat <- iconv(inp[2], "latin1", ""))
 if(any(is.na(c(txt, pat)))) {
     ## backup test
     ## iconv() yields NA for input it cannot convert, so NA here means the
     ## locale cannot represent the test strings: warn and quit.
     warning("this test must be done in a Latin-1 or UTF-8 locale")
     q()
 }

 ## Print a result, then require it to be exactly the integer 1L; the
 ## callers pass results that should report a single match at index 1.
 testit <- function(x) {
     print(x)
     stopifnot(identical(x, 1L))
 }
 ## `pat` must be found in `txt` (match index 1) by every engine --
 ## default, fixed and PCRE -- under each ignore.case/useBytes setting.
 invisible(lapply(
     list(list(),
          list(ignore.case = TRUE),
          list(useBytes = TRUE),
          list(ignore.case = TRUE, useBytes = TRUE),
          list(fixed = TRUE),
          list(fixed = TRUE, useBytes = TRUE),
          list(perl = TRUE),
          list(ignore.case = TRUE, perl = TRUE),
          list(perl = TRUE, useBytes = TRUE),
          list(ignore.case = TRUE, perl = TRUE, useBytes = TRUE)),
     function(opts) testit(do.call(grep, c(list(pat, txt), opts)))))
 ## An upper-cased pattern must still match when case is ignored.
 testit(grep(toupper(pat), txt, ignore.case = TRUE))
 testit(grep(toupper(pat), txt, ignore.case = TRUE, perl = TRUE))
 ## matches in Latin-1 but not in UTF-8
 grep(toupper(pat), txt, ignore.case = TRUE, perl = TRUE, useBytes = TRUE)

 ## Plain ASCII needle.  Fixed matching provides the reference results --
 ## r1 without and r2 with useBytes -- which the default and PCRE engines,
 ## case-sensitive or not, must reproduce exactly.
 (r1 <- regexpr("en", txt, fixed = TRUE))
 (r2 <- regexpr("en", txt, fixed = TRUE, useBytes = TRUE))
 stopifnot(
     identical(r1, regexpr("en", txt)),
     identical(r2, regexpr("en", txt, useBytes = TRUE)),
     identical(r1, regexpr("en", txt, perl = TRUE)),
     identical(r2, regexpr("en", txt, perl = TRUE, useBytes = TRUE)),
     identical(r1, regexpr("EN", txt, ignore.case = TRUE)),
     identical(r2, regexpr("EN", txt, ignore.case = TRUE, useBytes = TRUE)),
     identical(r1, regexpr("EN", txt, ignore.case = TRUE, perl = TRUE)),
     identical(r2, regexpr("EN", txt, ignore.case = TRUE, perl = TRUE,
                           useBytes = TRUE)))

 ## Same comparison with the pattern read from the input, plus its
 ## upper-cased form under ignore.case.
 (r1 <- regexpr(pat, txt, fixed = TRUE))
 (r2 <- regexpr(pat, txt, fixed = TRUE, useBytes = TRUE))
 pat2 <- toupper(pat)
 stopifnot(
     identical(r1, regexpr(pat, txt)),
     identical(r2, regexpr(pat, txt, useBytes = TRUE)),
     identical(r1, regexpr(pat, txt, perl = TRUE)),
     identical(r2, regexpr(pat, txt, perl = TRUE, useBytes = TRUE)),
     identical(r1, regexpr(pat, txt, ignore.case = TRUE)),
     identical(r2, regexpr(pat, txt, ignore.case = TRUE, useBytes = TRUE)),
     identical(r1, regexpr(pat, txt, ignore.case = TRUE, perl = TRUE)),
     identical(r2, regexpr(pat, txt, ignore.case = TRUE, perl = TRUE,
                           useBytes = TRUE)),
     identical(r1, regexpr(pat2, txt, ignore.case = TRUE)),
     identical(r1, regexpr(pat2, txt, ignore.case = TRUE, perl = TRUE)))
 ## will not match in a UTF-8 locale
 regexpr(pat2, txt, ignore.case = TRUE, perl = TRUE, useBytes = TRUE)


 ## gregexpr(): fixed matching gives the reference results (r1 without,
 ## r2 with useBytes) that the default and PCRE engines must reproduce,
 ## with and without ignore.case.
 (r1 <- gregexpr(pat, txt, fixed=TRUE))
 (r2 <- gregexpr(pat, txt, fixed=TRUE, useBytes=TRUE))
 stopifnot(identical(r1, gregexpr(pat, txt)))
 stopifnot(identical(r2, gregexpr(pat, txt, useBytes=TRUE)))
 stopifnot(identical(r1, gregexpr(pat, txt, perl=TRUE)))
 stopifnot(identical(r2, gregexpr(pat, txt, perl=TRUE, useBytes=TRUE)))
 stopifnot(identical(r1, gregexpr(pat, txt, ignore.case=TRUE)))
 ## `useBytes` is spelled out in full rather than relying on partial
 ## argument matching.
 stopifnot(identical(r2, gregexpr(pat, txt, ignore.case=TRUE, useBytes=TRUE)))
 stopifnot(identical(r1, gregexpr(pat, txt, ignore.case=TRUE, perl=TRUE)))
 stopifnot(identical(r2, gregexpr(pat, txt, ignore.case=TRUE, perl=TRUE,
                                  useBytes=TRUE)))

 ## Word list used to compare the default and PCRE engines on ASCII input.
 txt2 <- c(
     "The", "licenses", "for", "most", "software", "are", "designed", "to",
     "take", "away", "your", "freedom", "to", "share", "and", "change", "it.",
     "", "By", "contrast,", "the", "GNU", "General", "Public", "License", "is",
     "intended", "to", "guarantee", "your", "freedom", "to", "share", "and",
     "change", "free", "software", "--", "to", "make", "sure", "the",
     "software", "is", "free", "for", "all", "its", "users")
 i <- grep("[gu]", txt2, perl = TRUE)
 print(i)
 stopifnot(identical(i, grep("[gu]", txt2)))
 ## results depend on the locale
 ot <- sub("[b-e]", ".", txt2, perl = TRUE)
 print(ot)
 print(txt2[ot != sub("[b-e]", ".", txt2)])
 ot <- sub("[b-e]", ".", txt2, ignore.case = TRUE, perl = TRUE)
 print(ot)
 print(txt2[ot != sub("[b-e]", ".", txt2, ignore.case = TRUE)])


 ## The results compared below may end up with different encodings:
 ## `==` copes with that, identical() does not.
 eq <- function(a, b) {
     a == b
 }
 ## Replacing the full pattern: every engine/useBytes combination must
 ## agree with the default gsub() result.
 (r1 <- gsub(pat, "ef", txt))
 stopifnot(
     eq(r1, gsub(pat, "ef", txt, useBytes = TRUE)),
     eq(r1, gsub(pat, "ef", txt, fixed = TRUE)),
     eq(r1, gsub(pat, "ef", txt, fixed = TRUE, useBytes = TRUE)),
     eq(r1, gsub(pat, "ef", txt, perl = TRUE)),
     eq(r1, gsub(pat, "ef", txt, perl = TRUE, useBytes = TRUE)))

 ## Repeat with the pattern cut down to its first character.
 pat <- substr(pat, 1, 1)
 (r1 <- gsub(pat, "gh", txt))
 stopifnot(
     eq(r1, gsub(pat, "gh", txt, useBytes = TRUE)),
     eq(r1, gsub(pat, "gh", txt, fixed = TRUE)),
     eq(r1, gsub(pat, "gh", txt, fixed = TRUE, useBytes = TRUE)),
     eq(r1, gsub(pat, "gh", txt, perl = TRUE)),
     eq(r1, gsub(pat, "gh", txt, perl = TRUE, useBytes = TRUE)))


 ## A pattern that can match the empty string must give the same result
 ## in every engine.  The engine flags are arguments of gsub(): handed to
 ## stopifnot() instead they are just extra always-TRUE conditions and
 ## the perl/useBytes variants would go untested.
 stopifnot(identical(gsub("a*", "x", "baaac"), "xbxcx"))
 stopifnot(identical(gsub("a*", "x", "baaac", perl = TRUE), "xbxcx"))
 stopifnot(identical(gsub("a*", "x", "baaac", useBytes = TRUE), "xbxcx"))
 stopifnot(identical(gsub("a*", "x", "baaac", perl = TRUE, useBytes = TRUE), "xbxcx"))

 ## this one seems system-dependent
 x <- gsub("\\b", "|", "The quick brown \ue8\ue9", perl = TRUE)
 print(x)
 # stopifnot(identical(x, "|The| |quick| |brown| |\ue8\ue9|"))
 ## Pure ASCII input: a bar must appear at every word boundary.
 x <- gsub("\\b", "|", "The quick brown fox", perl = TRUE)
 print(x)
 stopifnot(identical(x, "|The| |quick| |brown| |fox|"))
 ## The following is warned against in the help page, but worked in some versions
 print(gsub("\\b", "|", "The quick brown fox"))

 ## Splitting on the (single-character) pattern: all engines must agree
 ## with the default strsplit() result.
 (z <- strsplit(txt, pat)[[1]])
 stopifnot(
     eq(z, strsplit(txt, pat, useBytes = TRUE)[[1]]),
     eq(z, strsplit(txt, pat, fixed = TRUE)[[1]]),
     eq(z, strsplit(txt, pat, fixed = TRUE, useBytes = TRUE)[[1]]),
     eq(z, strsplit(txt, pat, perl = TRUE)[[1]]),
     eq(z, strsplit(txt, pat, perl = TRUE, useBytes = TRUE)[[1]]))

 ## Splitting on an ASCII character class.
 (z <- strsplit(txt, "[a-c]")[[1]])
 stopifnot(
     eq(z, strsplit(txt, "[a-c]", useBytes = TRUE)[[1]]),
     eq(z, strsplit(txt, "[a-c]", perl = TRUE)[[1]]),
     eq(z, strsplit(txt, "[a-c]", perl = TRUE, useBytes = TRUE)[[1]]))

 ## from strsplit.Rd
 ## A NULL split and an empty-string split must give the same pieces.
 z <- strsplit("A text I want to display with spaces", NULL)[[1]]
 stopifnot(
     identical(z, strsplit("A text I want to display with spaces", "")[[1]]))

 ## Partly named input: every engine must return the identical list.
 x <- c(as = "asfef", qu = "qwerty", "yuiop[", "b", "stuff.blah.yech")
 (z <- strsplit(x, "e"))
 stopifnot(
     identical(z, strsplit(x, "e", useBytes = TRUE)),
     identical(z, strsplit(x, "e", fixed = TRUE)),
     identical(z, strsplit(x, "e", fixed = TRUE, useBytes = TRUE)),
     identical(z, strsplit(x, "e", perl = TRUE)),
     identical(z, strsplit(x, "e", perl = TRUE, useBytes = TRUE)))

 ## moved from reg-tests-1b.R.
 ## fails to match on Cygwin, Mar 2011
 ## regexpr(fixed = TRUE) with a single-byte pattern matching to a MBCS string
 x <- iconv("fa\xE7ile a ", from = "latin1", to = "UTF-8")
 stopifnot(
     identical(regexpr(" ", x), regexpr(" ", x, fixed = TRUE)),
     # fixed=TRUE reported match position in bytes in R <= 2.10.0
     identical(regexpr(" a", x), regexpr(" a", x, fixed = TRUE)))
 ## always worked.

 ## this broke and segfaulted in 2.13.1 and earlier (PR#14627)
 ## A string made of 600 copies of "a " must approximately match itself.
 x <- paste(rep.int("a ", 600L), collapse = "")
 testit(agrep(pattern = x, x = x))
 testit(agrep(pattern = x, x = x, max.distance = 0.5))

 ## this is used in QC to check dependencies and was broken intermittently by TRE changes
 stopifnot(isTRUE(grepl(
     pattern = '^[[:space:]]*(R|[[:alpha:]][[:alnum:].]*[[:alnum:]])([[:space:]]*\\(([^) ]+)[[:space:]]+([^) ]+)\\))?[[:space:]]*$',
     x = ' R (>= 2.13.0) ')))

 ## Bad sub() and gsub() with some regexprs PR#16009
 ## Runs of two or more whitespace characters collapse to one space;
 ## the first element (NA here) is left out of the comparison.
 x <- c(NA, "  abc", "a b c    ", "a  b c")
 y <- gsub("\\s{2,}", " ", x)
 print(y)
 stopifnot(y[-1] == c(" abc", "a b c ", "a b c"))
 ## Same inputs with a non-ASCII first element, which must pass through.
 x <- c("\ue4", "  abc", "a b c    ", "a  b c")
 y <- gsub("\\s{2,}", " ", x)
 print(y)
 stopifnot(y == c(x[1], " abc", "a b c ", "a b c"))
 ## results were c(x[1], " ", " ", " ") in both cases in R 3.1.1
	## This can only be done in a locale that extends Latin-1
	## Print the locale information, then abandon the run (with a warning)
	## unless the session reports either a UTF-8 or a Latin-1 locale.
	(inf <- l10n_info())
	if(!(inf$`UTF-8` || inf$`Latin-1`)) {
		warning("this test must be done in a Latin-1 or UTF-8 locale")
		q()
	}

	## readLines() is given no connection, so it reads the next two lines from
	## standard input.  NOTE(review): this presumably relies on the script being
	## fed to R on stdin, so that the two non-R lines directly below the call are
	## consumed as data -- keep them immediately after it.
	inp <- readLines(n = 2)
	«Latin-1 accented chars»: éè øØ å<Å æ<Æ é éè
	éè

	inp
	## Re-encode the subject (line 1) and the pattern (line 2) from Latin-1 to
	## the encoding of the current locale.
	(txt <- iconv(inp[1], "latin1", ""))
	(pat <- iconv(inp[2], "latin1", ""))
	if(any(is.na(c(txt, pat)))) {
		## backup test
		## iconv() yields NA for input it cannot convert, so NA here means the
		## locale cannot represent the test strings: warn and quit.
		warning("this test must be done in a Latin-1 or UTF-8 locale")
		q()
	}

	## Print a result, then require it to be exactly the integer 1L; the
	## callers pass results that should report a single match at index 1.
	testit <- function(x) {
		print(x)
		stopifnot(identical(x, 1L))
	}
	## `pat` must be found in `txt` (match index 1) by every engine --
	## default, fixed and PCRE -- under each ignore.case/useBytes setting.
	invisible(lapply(
		list(list(),
			list(ignore.case = TRUE),
			list(useBytes = TRUE),
			list(ignore.case = TRUE, useBytes = TRUE),
			list(fixed = TRUE),
			list(fixed = TRUE, useBytes = TRUE),
			list(perl = TRUE),
			list(ignore.case = TRUE, perl = TRUE),
			list(perl = TRUE, useBytes = TRUE),
			list(ignore.case = TRUE, perl = TRUE, useBytes = TRUE)),
		function(opts) testit(do.call(grep, c(list(pat, txt), opts)))))
	## An upper-cased pattern must still match when case is ignored.
	testit(grep(toupper(pat), txt, ignore.case = TRUE))
	testit(grep(toupper(pat), txt, ignore.case = TRUE, perl = TRUE))
	## matches in Latin-1 but not in UTF-8
	grep(toupper(pat), txt, ignore.case = TRUE, perl = TRUE, useBytes = TRUE)

	## Plain ASCII needle.  Fixed matching provides the reference results --
	## r1 without and r2 with useBytes -- which the default and PCRE engines,
	## case-sensitive or not, must reproduce exactly.
	(r1 <- regexpr("en", txt, fixed = TRUE))
	(r2 <- regexpr("en", txt, fixed = TRUE, useBytes = TRUE))
	stopifnot(
		identical(r1, regexpr("en", txt)),
		identical(r2, regexpr("en", txt, useBytes = TRUE)),
		identical(r1, regexpr("en", txt, perl = TRUE)),
		identical(r2, regexpr("en", txt, perl = TRUE, useBytes = TRUE)),
		identical(r1, regexpr("EN", txt, ignore.case = TRUE)),
		identical(r2, regexpr("EN", txt, ignore.case = TRUE, useBytes = TRUE)),
		identical(r1, regexpr("EN", txt, ignore.case = TRUE, perl = TRUE)),
		identical(r2, regexpr("EN", txt, ignore.case = TRUE, perl = TRUE,
			useBytes = TRUE)))

	## Same comparison with the pattern read from the input, plus its
	## upper-cased form under ignore.case.
	(r1 <- regexpr(pat, txt, fixed = TRUE))
	(r2 <- regexpr(pat, txt, fixed = TRUE, useBytes = TRUE))
	pat2 <- toupper(pat)
	stopifnot(
		identical(r1, regexpr(pat, txt)),
		identical(r2, regexpr(pat, txt, useBytes = TRUE)),
		identical(r1, regexpr(pat, txt, perl = TRUE)),
		identical(r2, regexpr(pat, txt, perl = TRUE, useBytes = TRUE)),
		identical(r1, regexpr(pat, txt, ignore.case = TRUE)),
		identical(r2, regexpr(pat, txt, ignore.case = TRUE, useBytes = TRUE)),
		identical(r1, regexpr(pat, txt, ignore.case = TRUE, perl = TRUE)),
		identical(r2, regexpr(pat, txt, ignore.case = TRUE, perl = TRUE,
			useBytes = TRUE)),
		identical(r1, regexpr(pat2, txt, ignore.case = TRUE)),
		identical(r1, regexpr(pat2, txt, ignore.case = TRUE, perl = TRUE)))
	## will not match in a UTF-8 locale
	regexpr(pat2, txt, ignore.case = TRUE, perl = TRUE, useBytes = TRUE)


	## gregexpr(): fixed matching gives the reference results (r1 without,
	## r2 with useBytes) that the default and PCRE engines must reproduce,
	## with and without ignore.case.
	(r1 <- gregexpr(pat, txt, fixed=TRUE))
	(r2 <- gregexpr(pat, txt, fixed=TRUE, useBytes=TRUE))
	stopifnot(identical(r1, gregexpr(pat, txt)))
	stopifnot(identical(r2, gregexpr(pat, txt, useBytes=TRUE)))
	stopifnot(identical(r1, gregexpr(pat, txt, perl=TRUE)))
	stopifnot(identical(r2, gregexpr(pat, txt, perl=TRUE, useBytes=TRUE)))
	stopifnot(identical(r1, gregexpr(pat, txt, ignore.case=TRUE)))
	## `useBytes` is spelled out in full rather than relying on partial
	## argument matching.
	stopifnot(identical(r2, gregexpr(pat, txt, ignore.case=TRUE, useBytes=TRUE)))
	stopifnot(identical(r1, gregexpr(pat, txt, ignore.case=TRUE, perl=TRUE)))
	stopifnot(identical(r2, gregexpr(pat, txt, ignore.case=TRUE, perl=TRUE,
		useBytes=TRUE)))

	## Word list used to compare the default and PCRE engines on ASCII input.
	txt2 <- c(
		"The", "licenses", "for", "most", "software", "are", "designed", "to",
		"take", "away", "your", "freedom", "to", "share", "and", "change", "it.",
		"", "By", "contrast,", "the", "GNU", "General", "Public", "License", "is",
		"intended", "to", "guarantee", "your", "freedom", "to", "share", "and",
		"change", "free", "software", "--", "to", "make", "sure", "the",
		"software", "is", "free", "for", "all", "its", "users")
	i <- grep("[gu]", txt2, perl = TRUE)
	print(i)
	stopifnot(identical(i, grep("[gu]", txt2)))
	## results depend on the locale
	ot <- sub("[b-e]", ".", txt2, perl = TRUE)
	print(ot)
	print(txt2[ot != sub("[b-e]", ".", txt2)])
	ot <- sub("[b-e]", ".", txt2, ignore.case = TRUE, perl = TRUE)
	print(ot)
	print(txt2[ot != sub("[b-e]", ".", txt2, ignore.case = TRUE)])


	## The results compared below may end up with different encodings:
	## `==` copes with that, identical() does not.
	eq <- function(a, b) {
		a == b
	}
	## Replacing the full pattern: every engine/useBytes combination must
	## agree with the default gsub() result.
	(r1 <- gsub(pat, "ef", txt))
	stopifnot(
		eq(r1, gsub(pat, "ef", txt, useBytes = TRUE)),
		eq(r1, gsub(pat, "ef", txt, fixed = TRUE)),
		eq(r1, gsub(pat, "ef", txt, fixed = TRUE, useBytes = TRUE)),
		eq(r1, gsub(pat, "ef", txt, perl = TRUE)),
		eq(r1, gsub(pat, "ef", txt, perl = TRUE, useBytes = TRUE)))

	## Repeat with the pattern cut down to its first character.
	pat <- substr(pat, 1, 1)
	(r1 <- gsub(pat, "gh", txt))
	stopifnot(
		eq(r1, gsub(pat, "gh", txt, useBytes = TRUE)),
		eq(r1, gsub(pat, "gh", txt, fixed = TRUE)),
		eq(r1, gsub(pat, "gh", txt, fixed = TRUE, useBytes = TRUE)),
		eq(r1, gsub(pat, "gh", txt, perl = TRUE)),
		eq(r1, gsub(pat, "gh", txt, perl = TRUE, useBytes = TRUE)))


	## A pattern that can match the empty string must give the same result
	## in every engine.  The engine flags are arguments of gsub(): handed to
	## stopifnot() instead they are just extra always-TRUE conditions and
	## the perl/useBytes variants would go untested.
	stopifnot(identical(gsub("a*", "x", "baaac"), "xbxcx"))
	stopifnot(identical(gsub("a*", "x", "baaac", perl = TRUE), "xbxcx"))
	stopifnot(identical(gsub("a*", "x", "baaac", useBytes = TRUE), "xbxcx"))
	stopifnot(identical(gsub("a*", "x", "baaac", perl = TRUE, useBytes = TRUE), "xbxcx"))

	## this one seems system-dependent
	## The replacement is a plain "|": "\|" is not a valid escape in an R
	## string literal and stops the file from parsing.
	(x <- gsub("\\b", "|", "The quick brown \ue8\ue9", perl = TRUE))
	# stopifnot(identical(x, "|The| |quick| |brown| |\ue8\ue9|"))
	(x <- gsub("\\b", "|", "The quick brown fox", perl = TRUE))
	stopifnot(identical(x, "|The| |quick| |brown| |fox|"))
	## The following is warned against in the help page, but worked in some versions
	gsub("\\b", "|", "The quick brown fox")

	## Splitting on the (single-character) pattern: all engines must agree
	## with the default strsplit() result.
	(z <- strsplit(txt, pat)[[1]])
	stopifnot(
		eq(z, strsplit(txt, pat, useBytes = TRUE)[[1]]),
		eq(z, strsplit(txt, pat, fixed = TRUE)[[1]]),
		eq(z, strsplit(txt, pat, fixed = TRUE, useBytes = TRUE)[[1]]),
		eq(z, strsplit(txt, pat, perl = TRUE)[[1]]),
		eq(z, strsplit(txt, pat, perl = TRUE, useBytes = TRUE)[[1]]))

	## Splitting on an ASCII character class.
	(z <- strsplit(txt, "[a-c]")[[1]])
	stopifnot(
		eq(z, strsplit(txt, "[a-c]", useBytes = TRUE)[[1]]),
		eq(z, strsplit(txt, "[a-c]", perl = TRUE)[[1]]),
		eq(z, strsplit(txt, "[a-c]", perl = TRUE, useBytes = TRUE)[[1]]))

	## from strsplit.Rd
	## A NULL split and an empty-string split must give the same pieces.
	z <- strsplit("A text I want to display with spaces", NULL)[[1]]
	stopifnot(
		identical(z, strsplit("A text I want to display with spaces", "")[[1]]))

	## Partly named input: every engine must return the identical list.
	x <- c(as = "asfef", qu = "qwerty", "yuiop[", "b", "stuff.blah.yech")
	(z <- strsplit(x, "e"))
	stopifnot(
		identical(z, strsplit(x, "e", useBytes = TRUE)),
		identical(z, strsplit(x, "e", fixed = TRUE)),
		identical(z, strsplit(x, "e", fixed = TRUE, useBytes = TRUE)),
		identical(z, strsplit(x, "e", perl = TRUE)),
		identical(z, strsplit(x, "e", perl = TRUE, useBytes = TRUE)))

	## moved from reg-tests-1b.R.
	## fails to match on Cygwin, Mar 2011
	## regexpr(fixed = TRUE) with a single-byte pattern matching to a MBCS string
	x <- iconv("fa\xE7ile a ", from = "latin1", to = "UTF-8")
	stopifnot(
		identical(regexpr(" ", x), regexpr(" ", x, fixed = TRUE)),
		# fixed=TRUE reported match position in bytes in R <= 2.10.0
		identical(regexpr(" a", x), regexpr(" a", x, fixed = TRUE)))
	## always worked.

	## this broke and segfaulted in 2.13.1 and earlier (PR#14627)
	## A string made of 600 copies of "a " must approximately match itself.
	x <- paste(rep.int("a ", 600L), collapse = "")
	testit(agrep(pattern = x, x = x))
	testit(agrep(pattern = x, x = x, max.distance = 0.5))

	## this is used in QC to check dependencies and was broken intermittently by TRE changes
	## The pattern needs its `*` quantifiers and an unescaped `|`: without
	## them the literal does not parse ("\|" is not a valid escape) and the
	## expression could not match the padded dependency string.
	stopifnot(isTRUE(grepl('^[[:space:]]*(R|[[:alpha:]][[:alnum:].]*[[:alnum:]])([[:space:]]*\\(([^) ]+)[[:space:]]+([^) ]+)\\))?[[:space:]]*$', ' R (>= 2.13.0) ')))

	## Bad sub() and gsub() with some regexprs PR#16009
	## The inputs must contain runs of two or more spaces ("  abc" has two
	## leading, "a b c    " four trailing, "a  b c" two inner); otherwise
	## "\\s{2,}" has nothing to replace and the checks pass vacuously.
	x <- c(NA, "  abc", "a b c    ", "a  b c")
	(y <- gsub("\\s{2,}", " ", x))
	stopifnot(y[-1] == c(" abc", "a b c ", "a b c"))
	## Same inputs with a non-ASCII first element, which must pass through.
	x <- c("\ue4", "  abc", "a b c    ", "a  b c")
	(y <- gsub("\\s{2,}", " ", x))
	stopifnot(y == c(x[1], " abc", "a b c ", "a b c"))
	## results were c(x[1], " ", " ", " ") in both cases in R 3.1.1