src/library/base/man/validUTF8.Rd - R - Git at Google

 % File src/library/base/man/validUTF8.Rd
 % Part of the R package, https://www.R-project.org
 % Copyright 2015-2018 R Core Team
 % Distributed under GPL 2 or later

 \name{validUTF8}
 \title{Check if a Character Vector is Validly Encoded}
 \alias{validUTF8}
 \alias{validEnc}
 \description{
   Check if each element of a character vector is valid in its implied
   encoding.
 }
 \usage{
 validUTF8(x)
 validEnc(x)
 }

 \arguments{
   \item{x}{a character vector.}
 }

 \details{
   These use similar checks to those used by functions such as
   \code{\link{grep}}.

   \code{validUTF8} ignores any marked encoding (see
   \code{\link{Encoding}}) and so checks directly whether the bytes in
   each string are valid UTF-8.

   \code{validEnc} regards character strings as validly encoded, without
   checking them, unless their encodings are marked as UTF-8 or they are
   unmarked and the \R session is in a UTF-8 or other multi-byte locale;
   only in those cases are the bytes actually checked.  (The checks in
   other multi-byte locales depend on the OS and, as with
   \code{\link{iconv}}, not all invalid inputs may be detected.)
 }

 \note{
   It would be possible to check for the validity of character strings in
   a Latin-1 encoding, but extensions such as CP1252 are widely accepted
   as \sQuote{Latin-1} and 8-bit encodings rarely need to be checked for
   validity.
 }

 \value{
   A logical vector of the same length as \code{x}.   \code{NA} elements
   are regarded as validly encoded.
 }

 \examples{
 x <-
   ## from example(text)
 c("Jetz", "no", "chli", "z\xc3\xbcrit\xc3\xbc\xc3\xbctsch:",
   "(noch", "ein", "bi\xc3\x9fchen", "Z\xc3\xbc", "deutsch)",
    ## from a CRAN check log
    "\xfa\xb4\xbf\xbf\x9f")
 validUTF8(x)
 validEnc(x) # depends on the locale
 Encoding(x) <-"UTF-8"
 validEnc(x) # typically the last, x[10], is invalid

 ## Maybe advantageous to declare it "unknown":
 G <- x ; Encoding(G[!validEnc(G)]) <- "unknown"
 try( substr(x, 1,1) ) # gives 'invalid multibyte string' error
 try( substr(G, 1,1) ) # works
 nchar(G) # fine, too
 ## but it is not "more valid" typically:
 all.equal(validEnc(x),
           validEnc(G)) # typically TRUE
 }
	% File src/library/base/man/validUTF8.Rd
	% Part of the R package, https://www.R-project.org
	% Copyright 2015-2018 R Core Team
	% Distributed under GPL 2 or later

	\name{validUTF8}
	\title{Check if a Character Vector is Validly Encoded}
	\alias{validUTF8}
	\alias{validEnc}
	\description{
	Check if each element of a character vector is valid in its implied
	encoding.
	}
	\usage{
	validUTF8(x)
	validEnc(x)
	}

	\arguments{
	\item{x}{a character vector.}
	}

	\details{
	These use similar checks to those used by functions such as
	\code{\link{grep}}.

	\code{validUTF8} ignores any marked encoding (see
	\code{\link{Encoding}}) and so checks directly whether the bytes in
	each string are valid UTF-8.

	\code{validEnc} regards character strings as validly encoded, without
	checking them, unless their encodings are marked as UTF-8 or they are
	unmarked and the \R session is in a UTF-8 or other multi-byte locale;
	only in those cases are the bytes actually checked. (The checks in
	other multi-byte locales depend on the OS and, as with
	\code{\link{iconv}}, not all invalid inputs may be detected.)
	}

	\note{
	It would be possible to check for the validity of character strings in
	a Latin-1 encoding, but extensions such as CP1252 are widely accepted
	as \sQuote{Latin-1} and 8-bit encodings rarely need to be checked for
	validity.
	}

	\value{
	A logical vector of the same length as \code{x}. \code{NA} elements
	are regarded as validly encoded.
	}

	\examples{
	x <-
	## from example(text)
	c("Jetz", "no", "chli", "z\xc3\xbcrit\xc3\xbc\xc3\xbctsch:",
	"(noch", "ein", "bi\xc3\x9fchen", "Z\xc3\xbc", "deutsch)",
	## from a CRAN check log
	"\xfa\xb4\xbf\xbf\x9f")
	validUTF8(x)
	validEnc(x) # depends on the locale
	Encoding(x) <-"UTF-8"
	validEnc(x) # typically the last, x[10], is invalid

	## Maybe advantageous to declare it "unknown":
	G <- x ; Encoding(G[!validEnc(G)]) <- "unknown"
	try( substr(x, 1,1) ) # gives 'invalid multibyte string' error
	try( substr(G, 1,1) ) # works
	nchar(G) # fine, too
	## but it is not "more valid" typically:
	all.equal(validEnc(x),
	validEnc(G)) # typically TRUE
	}