% blob: 0fb0e377b5f5af27c530a9472456a728c1c1f2c1 [file] [log] [blame]
% File src/library/base/man/validUTF8.Rd
% Part of the R package, https://www.R-project.org
% Copyright 2015-2018 R Core Team
% Distributed under GPL 2 or later
\name{validUTF8}
\title{Check if a Character Vector is Validly Encoded}
\alias{validUTF8}
\alias{validEnc}
\description{
Check if each element of a character vector is valid in its implied
encoding.
}
\usage{
validUTF8(x)
validEnc(x)
}
\arguments{
\item{x}{a character vector.}
}
\details{
Both functions use checks similar to those used by functions such as
\code{\link{grep}}.
\code{validUTF8} ignores any marked encoding (see
\code{\link{Encoding}}) and so checks directly whether the bytes in each
string are valid UTF-8.
\code{validEnc} regards character strings as validly encoded unless
their encodings are marked as UTF-8 or they are unmarked and the \R
session is in a UTF-8 or other multi-byte locale; only in those cases
are the strings actually checked. (The checks in other multi-byte
locales depend on the OS and, as with \code{\link{iconv}}, not all
invalid inputs may be detected.)
}
\note{
It would be possible to check for the validity of character strings in
a Latin-1 encoding, but extensions such as CP1252 are widely accepted
as \sQuote{Latin-1} and 8-bit encodings rarely need to be checked for
validity.
}
\value{
A logical vector of the same length as \code{x}. \code{NA} elements
are regarded as validly encoded.
}
\examples{
## Sample input: strings taken from example(text), followed by one
## byte sequence copied from a CRAN check log.
x <- c(
    "Jetz", "no", "chli", "z\xc3\xbcrit\xc3\xbc\xc3\xbctsch:",
    "(noch", "ein", "bi\xc3\x9fchen", "Z\xc3\xbc", "deutsch)",
    "\xfa\xb4\xbf\xbf\x9f"
)

validUTF8(x)
validEnc(x)   # the result depends on the locale

Encoding(x) <- "UTF-8"
validEnc(x)   # typically only the last element, x[10], is invalid

## It may be advantageous to declare the invalid elements "unknown":
G <- x
Encoding(G[!validEnc(G)]) <- "unknown"
try(substr(x, start = 1, stop = 1))   # gives 'invalid multibyte string' error
try(substr(G, start = 1, stop = 1))   # works
nchar(G)                              # fine, too

## ... but that typically does not make the strings any "more valid":
all.equal(validEnc(x), validEnc(G))   # typically TRUE
}