blob: b09422e61259da86fd946412f155b6b7ebdde4dd [file] [log] [blame]
# File src/library/utils/R/iconv.R
# Part of the R package, https://www.R-project.org
#
# Copyright (C) 1995-2014 The R Core Team
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# https://www.R-project.org/Licenses/
## If you were wondering what these language codes stand for, see
## ftp://ftp.ilog.fr/pub/Users/haible/utf8/ISO_639
localeToCharset <- function(locale = Sys.getlocale("LC_CTYPE"))
{
guess <- function(en)
{
if(en %in% c("aa", "af", "an", "br", "ca", "da", "de", "en",
"es", "et", "eu", "fi", "fo", "fr", "ga", "gl",
"gv", "id", "is", "it", "kl", "kw", "ml", "ms",
"nb", "nn", "no", "oc", "om", "pt", "so", "sq",
"st", "sv", "tl", "uz", "wa", "xh", "zu"))
return("ISO8859-1")
if(en %in% c("bs", "cs", "hr", "hu", "pl", "ro", "sk", "sl"))
return("ISO8859-2")
if(en %in% "mt") return("ISO8859-3")
if(en %in% c("mk", "ru")) return("ISO8859-5")
if(en %in% "ar") return("ISO8859-6")
if(en %in% "el") return("ISO8859-7")
if(en %in% c("he", "iw", "yi")) return("ISO8859-8")
if(en %in% "tr") return("ISO8859-9")
if(en %in% "lg") return("ISO8859-10")
if(en %in% c("lt", "lv", "mi")) return("ISO8859-13")
if(en %in% "cy") return("ISO8859-14")
if(en %in% "uk") return("KOI8-U")
if(en %in% "ja") return("EUC-JP")
if(en %in% "ko") return("EUC-KR")
if(en %in% "th") return("TIS-620")
if(en %in% "tg") return("KOI8-T")
if(en %in% "ka") return("GEORGIAN-PS")
if(en %in% "kk") return("PT154")
## not safe to guess for zh
return(NA_character_)
}
if(locale %in% c("C", "POSIX")) return("ASCII")
if(.Platform$OS.type == "windows") {
x <- strsplit(locale, ".", fixed=TRUE)[[1L]]
if(length(x) != 2) return(NA_character_)
## PUTTY suggests mapping Windows code pages as
## 1250 -> ISO 8859-2
## 1251 -> KOI8-U
## 1252 -> ISO 8859-1
## 1253 -> ISO 8859-7
## 1254 -> ISO 8859-9
## 1255 -> ISO 8859-8
## 1256 -> ISO 8859-6
## 1257 -> ISO 8859-13
switch(x[2L],
# this is quite wrong "1250" = return("ISO8859-2"),
# this is quite wrong "1251" = return("KOI8-U"),
"1252" = return("ISO8859-1"),
# "1253" = return("ISO8859-7"),
# "1254" = return("ISO8859-9"),
# "1255" = return("ISO8859-8"),
# "1256" = return("ISO8859-6"),
"1257" = return("ISO8859-13")
)
return(paste0("CP", x[2L]))
} else {
## Assume locales are like en_US[.utf8[@euro]]
x <- strsplit(locale, ".", fixed=TRUE)[[1L]]
enc <- if(length(x) == 2) gsub("@.*$o", "", x[2L]) else ""
# AIX uses UTF-8, macOS utf-8
if(toupper(enc) == "UTF-8") enc <- "utf8"
if(nzchar(enc) && enc != "utf8") {
enc <- tolower(enc)
known <-
c("ISO8859-1", "ISO8859-2", "ISO8859-3", "ISO8859-6",
"ISO8859-7", "ISO8859-8", "ISO8859-9", "ISO8859-10",
"ISO8859-13", "ISO8859-14", "ISO8859-15",
"CP1251", "CP1255", "EUC-JP", "EUC-KR", "EUC-TW",
"GEORGIAN-PS", "KOI8-R", "KOI8-U", "TCVN",
"BIG5" , "GB2312", "GB18030", "GBK",
"TIS-620", "SHIFT_JIS", "GB2312", "BIG5-HKSCS")
names(known) <-
c("iso88591", "iso88592", "iso88593", "iso88596",
"iso88597", "iso88598", "iso88599", "iso885910",
"iso885913", "iso885914", "iso885915",
"cp1251", "cp1255", "eucjp", "euckr", "euctw",
"georgianps", "koi8r", "koi8u", "tcvn",
"big5" , "gb2312", "gb18030", "gbk",
"tis-620", "sjis", "eucn", "big5-hkscs")
if (grepl("darwin",R.version$os)) {
k <- c(known, "ISO8859-1", "ISO8859-2", "ISO8859-4",
"ISO8859-7", "ISO8859-9", "ISO8859-13", "ISO8859-15",
"KOI8-U", "KOI8-R", "PT154", "ASCII", "ARMSCII-8",
"ISCII-DEV", "BIG5-HKCSC")
names(k) <- c(names(known), "iso8859-1", "iso8859-2", "iso8859-4",
"iso8859-7", "iso8859-9", "iso8859-13", "iso8859-15",
"koi8-u", "koi8-r", "pt154", "us-ascii", "armscii-8",
"iscii-dev", "big5hkscs")
known <- k
}
if(enc %in% names(known)) return(unname(known[enc]))
if(length(grep("^cp-", enc))) # old Linux
return(sub("cp-([0-9]+)", "CP\\1", enc))
if(enc == "EUC") {
## let's hope it is a ll_* name.
if(length(grep("^[[:alpha:]]{2}_", x[1L], perl = TRUE))) {
ll <- substr(x[1L], 1L, 2L)
return(switch(ll, "jp"="EUC-JP", "kr"="EUC-KR",
"zh"="GB2312"))
}
}
}
## on Darwin all real locales w/o encoding are UTF-8
## HOWEVER! unlike the C code, we cannot filter out
## invalid locales, so it will be wrong for non-supported
## locales (why is this duplicated in R code anyway?)
if (grepl("darwin", R.version$os)) return("UTF-8")
## let's hope it is a ll_* name.
if(length(grep("^[[:alpha:]]{2}_", x[1L], perl = TRUE))) {
ll <- substr(x[1L], 1L, 2L)
if(enc == "utf8") return(c("UTF-8", guess(ll)))
else return(guess(ll))
}
return(NA_character_)
}
}