blob: 5dce18f2c4946cd95ddb088aa0759c5b2ae93cdf [file] [log] [blame]
/*
* R : A Computer Language for Statistical Data Analysis
* Copyright (C) 2005-2014 The R Core Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, a copy is available at
* https://www.R-project.org/Licenses/
*/
/* This file was contributed by Ei-ji Nakama.
* It exports locale2charset for use in gram.y, and rlocale.c on macOS.
* And sysutils.c, grDevices/src/devPS.c
*/
/* setlocale(LC_CTYPE,NULL) to encodingname cf nl_langinfo(LC_CTYPE) */
/*********************************************************************
* usage : const char *locale2charset(const char *locale) *
* return : ASCII - default, used when the encoding is undefined *
* other - encoding name *
* *
* cc -o localecharset -DDEBUG_TEST=1 localecharset.c *
* or *
* cc -o localecharset -DDEBUG_TEST=2 localecharset.c *
*********************************************************************/
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#ifdef DEBUG_TEST
#define SPRINT(x) printf("%6d:" #x "=%s\n", __LINE__, x)
#define DPRINT(x) printf("%6d:" #x "=%d\n", __LINE__, x)
//#define HAVE_STRING_H
#endif
#include <string.h>
#include <memory.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
//#include <rlocale.h> /* To get the correct linkage for locale2charset */
/*
 * name_value: one row of a string-to-string lookup table.
 * Both members point at NUL-terminated strings.
 */
typedef struct {
    char *name;		/* key the caller's string is compared against */
    char *value;	/* string handed back when the key matches */
} name_value;
#ifndef __APPLE__
/*
* codeset name defined.
*
cat /usr/X11R6/lib/X11/locale/locale.alias | \
sed -e '/#.*$/d' -e 's/://' | \
awk '{gsub(/^[^.]+\./, "", $2);
$2=toupper($2);
gsub(/^EUC/, "EUC-",$2);
gsub(/^BIG5HKSCS$/, "BIG5-HKSCS",$2);
if (($2!="")&&(!system("iconv --list|grep " $2 ))) print $2
}' | \
sed -e '/\/$/d' | \
sort | uniq | \
awk '{NAME=$1;gsub(/-/,"_",NAME);
printf("static const char ENC_%-20s\"%s\";\n",
NAME "[]=" ,
$1)}'
*/
/*
 * Encoding-name strings referenced by the guess[] table below.
 * Grouped by family; the order of these definitions is not significant.
 */

/* ISO 8859 family */
static char ENC_ISO8859_1[] = "ISO8859-1";
static char ENC_ISO8859_2[] = "ISO8859-2";
static char ENC_ISO8859_3[] = "ISO8859-3";
static char ENC_ISO8859_5[] = "ISO8859-5";
static char ENC_ISO8859_6[] = "ISO8859-6";
static char ENC_ISO8859_7[] = "ISO8859-7";
static char ENC_ISO8859_8[] = "ISO8859-8";
static char ENC_ISO8859_9[] = "ISO8859-9";
static char ENC_ISO8859_10[] = "ISO8859-10";
static char ENC_ISO8859_11[] = "ISO8859-11";
static char ENC_ISO8859_13[] = "ISO8859-13";
static char ENC_ISO8859_15[] = "ISO8859-15";

/* East Asian multibyte encodings */
static char ENC_BIG5[] = "BIG5";
static char ENC_BIG5_HKSCS[] = "BIG5-HKSCS";
static char ENC_EUC_CN[] = "EUC-CN";
static char ENC_EUC_JP[] = "EUC-JP";
static char ENC_EUC_KR[] = "EUC-KR";
static char ENC_EUC_TW[] = "EUC-TW";
static char ENC_GB2312[] = "GB2312";
static char ENC_GBK[] = "GBK";

/* Code pages and KOI8 */
static char ENC_CP1251[] = "CP1251";
static char ENC_CP1255[] = "CP1255";
static char ENC_CP1256[] = "CP1256";
static char ENC_KOI8_R[] = "KOI8-R";
static char ENC_KOI8_U[] = "KOI8-U";

/* Everything else */
static char ENC_ARMSCII_8[] = "ARMSCII-8";
static char ENC_C[] = "C";
static char ENC_GEORGIAN_ACADEMY[] = "GEORGIAN-ACADEMY";
static char ENC_TCVN[] = "TCVN";
static char ENC_UTF_8[] = "UTF-8";

/* Declarations that are commented out (no guess[] entry refers to them): */
/* static char ENC_GEORGIAN_PS[] = "GEORGIAN-PS"; */
/* static char ENC_ISIRI_3342[] = "ISIRI-3342"; */
/* static char ENC_ISO8859_14[] = "ISO8859-14"; */
/* static char ENC_ISO8859_4[] = "ISO8859-4"; */
/* static char ENC_SJIS[] = "SJIS"; */
/* static char ENC_TIS620[] = "TIS620"; */
/* static char ENC_VISCII[] = "VISCII"; */
/*
# charset getscript. iconv list output line is backslant.
cat /usr/X11R6/lib/X11/locale/locale.alias | \
sed -e '/#.*$/d ; /^[A-z]*\./d' -e 's/://' | \
awk '{gsub(/^[^.]+\./, "", $2);
$2=toupper($2);
gsub(/^EUC/, "EUC-",$2);
gsub(/^BIG5HKSCS$/, "BIG5-HKSCS",$2);
NAME=$2;
gsub(/\xe7/,"\"\"\\xe7\"\"",$1);
gsub(/\xe5/,"\"\"\\xe5\"\"",$1);
gsub(/-/, "_",NAME);
NAME="ENC_" NAME;
if (($2!="")&&(!system("iconv --list|grep " $2 ))) print $1 " " NAME
}' | \
sed -e '/\/$/d' | \
sort -k 1 | uniq | \
awk '{printf (" {%-34s%s},\n", "\"" $1 "\",", $2)}'
*/
/*
 * Fallback table: locale name (language[_TERRITORY][@modifier] or a legacy
 * alias) -> encoding name to assume for it.
 *
 * locale2charset() looks names up here with name_value_search(), which is a
 * binary search using strcmp().  The rows therefore MUST stay sorted by
 * `name` in strcmp() order, i.e. byte-wise with bytes compared as
 * unsigned char: upper case before '_' before lower case, and bytes
 * >= 0x80 after all ASCII characters.
 */
static const name_value guess[] = {
    {"Cextend", ENC_ISO8859_1},
    {"English_United-States.437", ENC_C},
    {"ISO-8859-1", ENC_ISO8859_1},
    {"ISO8859-1", ENC_ISO8859_1},
    {"Japanese-EUC", ENC_EUC_JP},
    {"Jp_JP", ENC_EUC_JP},
    {"POSIX", ENC_C},
    {"POSIX-UTF2", ENC_C},
    {"aa_DJ", ENC_ISO8859_1},
    {"aa_ER", ENC_UTF_8},
    {"aa_ER@saaho", ENC_UTF_8},
    {"aa_ET", ENC_UTF_8},
    {"af", ENC_ISO8859_1},
    {"af_ZA", ENC_ISO8859_1},
    {"am", ENC_UTF_8},
    {"am_ET", ENC_UTF_8},
    {"an_ES", ENC_ISO8859_15},
    {"ar", ENC_ISO8859_6},
    {"ar_AA", ENC_ISO8859_6},
    {"ar_AE", ENC_ISO8859_6},
    {"ar_BH", ENC_ISO8859_6},
    {"ar_DZ", ENC_ISO8859_6},
    {"ar_EG", ENC_ISO8859_6},
    {"ar_IN", ENC_UTF_8},
    {"ar_IQ", ENC_ISO8859_6},
    {"ar_JO", ENC_ISO8859_6},
    {"ar_KW", ENC_ISO8859_6},
    {"ar_LB", ENC_ISO8859_6},
    {"ar_LY", ENC_ISO8859_6},
    {"ar_MA", ENC_ISO8859_6},
    {"ar_OM", ENC_ISO8859_6},
    {"ar_QA", ENC_ISO8859_6},
    {"ar_SA", ENC_ISO8859_6},
    {"ar_SD", ENC_ISO8859_6},
    {"ar_SY", ENC_ISO8859_6},
    {"ar_TN", ENC_ISO8859_6},
    {"ar_YE", ENC_ISO8859_6},
    {"be", ENC_CP1251},
    {"be_BY", ENC_CP1251},
    {"bg", ENC_CP1251},
    {"bg_BG", ENC_CP1251},
    {"bn_BD", ENC_UTF_8},
    {"bn_IN", ENC_UTF_8},
    /* 0xE5 compares greater than 'a' (0x61) in strcmp(), so the Latin-1
       spelling has to follow the plain ASCII one to keep the table sorted */
    {"bokmal", ENC_ISO8859_1},
    {"bokm""\xe5""l", ENC_ISO8859_1},
    {"br", ENC_ISO8859_1},
    {"br_FR", ENC_ISO8859_1},
    {"br_FR@euro", ENC_ISO8859_15},
    {"bs_BA", ENC_ISO8859_2},
    {"bulgarian", ENC_CP1251},
    {"byn_ER", ENC_UTF_8},
    {"c-french.iso88591", ENC_ISO8859_1},
    {"ca", ENC_ISO8859_1},
    {"ca_ES", ENC_ISO8859_1},
    {"ca_ES@euro", ENC_ISO8859_15},
    {"catalan", ENC_ISO8859_1},
    {"chinese-s", ENC_EUC_CN},
    {"chinese-t", ENC_EUC_TW},
    {"croatian", ENC_ISO8859_2},
    {"cs", ENC_ISO8859_2},
    {"cs_CS", ENC_ISO8859_2},
    {"cs_CZ", ENC_ISO8859_2},
    {"cy", ENC_ISO8859_1},
    {"cy_GB", ENC_ISO8859_1},
    {"cz", ENC_ISO8859_2},
    {"cz_CZ", ENC_ISO8859_2},
    {"czech", ENC_ISO8859_2},
    {"da", ENC_ISO8859_1},
    {"da_DK", ENC_ISO8859_1},
    {"danish", ENC_ISO8859_1},
    {"dansk", ENC_ISO8859_1},
    {"de", ENC_ISO8859_1},
    {"de_AT", ENC_ISO8859_1},
    {"de_AT@euro", ENC_ISO8859_15},
    {"de_BE", ENC_ISO8859_1},
    {"de_BE@euro", ENC_ISO8859_15},
    {"de_CH", ENC_ISO8859_1},
    {"de_DE", ENC_ISO8859_1},
    {"de_DE@euro", ENC_ISO8859_15},
    {"de_LI", ENC_ISO8859_1},
    {"de_LI@euro", ENC_ISO8859_15},
    {"de_LU", ENC_ISO8859_1},
    {"de_LU@euro", ENC_ISO8859_15},
    {"deutsch", ENC_ISO8859_1},
    {"dutch", ENC_ISO8859_1},
    {"eesti", ENC_ISO8859_1},
    {"el", ENC_ISO8859_7},
    {"el_GR", ENC_ISO8859_7},
    {"en", ENC_ISO8859_1},
    {"en_AU", ENC_ISO8859_1},
    {"en_BW", ENC_ISO8859_1},
    {"en_CA", ENC_ISO8859_1},
    {"en_DK", ENC_ISO8859_1},
    {"en_GB", ENC_ISO8859_1},
    {"en_HK", ENC_ISO8859_1},
    {"en_IE", ENC_ISO8859_1},
    {"en_IE@euro", ENC_ISO8859_15},
    {"en_IN", ENC_UTF_8},
    {"en_NZ", ENC_ISO8859_1},
    {"en_PH", ENC_ISO8859_1},
    {"en_SG", ENC_ISO8859_1},
    {"en_UK", ENC_ISO8859_1},
    {"en_US", ENC_ISO8859_1},
    {"en_ZA", ENC_ISO8859_1},
    {"en_ZW", ENC_ISO8859_1},
    {"es", ENC_ISO8859_1},
    {"es_AR", ENC_ISO8859_1},
    {"es_BO", ENC_ISO8859_1},
    {"es_CL", ENC_ISO8859_1},
    {"es_CO", ENC_ISO8859_1},
    {"es_CR", ENC_ISO8859_1},
    {"es_DO", ENC_ISO8859_1},
    {"es_EC", ENC_ISO8859_1},
    {"es_ES", ENC_ISO8859_1},
    {"es_ES@euro", ENC_ISO8859_15},
    {"es_GT", ENC_ISO8859_1},
    {"es_HN", ENC_ISO8859_1},
    {"es_MX", ENC_ISO8859_1},
    {"es_NI", ENC_ISO8859_1},
    {"es_PA", ENC_ISO8859_1},
    {"es_PE", ENC_ISO8859_1},
    {"es_PR", ENC_ISO8859_1},
    {"es_PY", ENC_ISO8859_1},
    {"es_SV", ENC_ISO8859_1},
    {"es_US", ENC_ISO8859_1},
    {"es_UY", ENC_ISO8859_1},
    {"es_VE", ENC_ISO8859_1},
    {"estonian", ENC_ISO8859_1},
    {"et", ENC_ISO8859_15},
    {"et_EE", ENC_ISO8859_15},
    {"eu", ENC_ISO8859_1},
    {"eu_ES", ENC_ISO8859_1},
    {"eu_ES@euro", ENC_ISO8859_15},
    {"eu_FR", ENC_ISO8859_1},
    {"eu_FR@euro", ENC_ISO8859_15},
    {"fa", ENC_UTF_8},
    {"fa_IR", ENC_UTF_8},
    {"fi", ENC_ISO8859_1},
    {"fi_FI", ENC_ISO8859_1},
    {"fi_FI@euro", ENC_ISO8859_15},
    {"finnish", ENC_ISO8859_1},
    {"fo", ENC_ISO8859_1},
    {"fo_FO", ENC_ISO8859_1},
    {"fr", ENC_ISO8859_1},
    {"fr_BE", ENC_ISO8859_1},
    {"fr_BE@euro", ENC_ISO8859_15},
    {"fr_CA", ENC_ISO8859_1},
    {"fr_CH", ENC_ISO8859_1},
    {"fr_FR", ENC_ISO8859_1},
    {"fr_FR@euro", ENC_ISO8859_15},
    {"fr_LU", ENC_ISO8859_1},
    {"fr_LU@euro", ENC_ISO8859_15},
    {"fran""\xe7""ais", ENC_ISO8859_1},
    {"french", ENC_ISO8859_1},
    {"ga", ENC_ISO8859_1},
    {"ga_IE", ENC_ISO8859_1},
    {"ga_IE@euro", ENC_ISO8859_15},
    {"galego", ENC_ISO8859_1},
    {"galician", ENC_ISO8859_1},
    {"gd", ENC_ISO8859_1},
    {"gd_GB", ENC_ISO8859_1},
    {"german", ENC_ISO8859_1},
    {"gez_ER", ENC_UTF_8},
    {"gez_ER@abegede", ENC_UTF_8},
    {"gez_ET", ENC_UTF_8},
    {"gez_ET@abegede", ENC_UTF_8},
    {"gl", ENC_ISO8859_1},
    {"gl_ES", ENC_ISO8859_1},
    {"gl_ES@euro", ENC_ISO8859_15},
    {"greek", ENC_ISO8859_7},
    {"gu_IN", ENC_UTF_8},
    {"gv", ENC_ISO8859_1},
    {"gv_GB", ENC_ISO8859_1},
    {"he", ENC_ISO8859_8},
    {"he_IL", ENC_ISO8859_8},
    {"hebrew", ENC_ISO8859_8},
    {"hr", ENC_ISO8859_2},
    {"hr_HR", ENC_ISO8859_2},
    {"hrvatski", ENC_ISO8859_2},
    {"hu", ENC_ISO8859_2},
    {"hu_HU", ENC_ISO8859_2},
    {"hungarian", ENC_ISO8859_2},
    {"hy", ENC_ARMSCII_8},
    {"hy_AM", ENC_ARMSCII_8},
    {"icelandic", ENC_ISO8859_1},
    {"id", ENC_ISO8859_1},
    {"id_ID", ENC_ISO8859_1},
    {"in", ENC_ISO8859_1},
    {"in_ID", ENC_ISO8859_1},
    {"is", ENC_ISO8859_1},
    {"is_IS", ENC_ISO8859_1},
    {"iso_8859_1", ENC_ISO8859_1},
    {"it", ENC_ISO8859_1},
    {"it_CH", ENC_ISO8859_1},
    {"it_IT", ENC_ISO8859_1},
    {"it_IT@euro", ENC_ISO8859_15},
    {"italian", ENC_ISO8859_1},
    {"iw", ENC_ISO8859_8},
    {"iw_IL", ENC_ISO8859_8},
    {"ja", ENC_EUC_JP},
    {"ja_JP", ENC_EUC_JP},
    {"japan", ENC_EUC_JP},
    {"japanese", ENC_EUC_JP},
    {"ka", ENC_GEORGIAN_ACADEMY},
    {"ka_GE", ENC_GEORGIAN_ACADEMY},
    {"kl", ENC_ISO8859_1},
    {"kl_GL", ENC_ISO8859_1},
    {"kn_IN", ENC_UTF_8},
    {"ko", ENC_EUC_KR},
    {"ko_KR", ENC_EUC_KR},
    {"korean", ENC_EUC_KR},
    {"kw", ENC_ISO8859_1},
    {"kw_GB", ENC_ISO8859_1},
    {"lg_UG", ENC_ISO8859_10},
    {"lithuanian", ENC_ISO8859_13},
    {"lt", ENC_ISO8859_13},
    {"lt_LT", ENC_ISO8859_13},
    {"lv", ENC_ISO8859_13},
    {"lv_LV", ENC_ISO8859_13},
    {"mi", ENC_ISO8859_13},
    {"mi_NZ", ENC_ISO8859_13},
    {"mk", ENC_ISO8859_5},
    {"mk_MK", ENC_ISO8859_5},
    {"ml_IN", ENC_UTF_8},
    {"mn_MN", ENC_UTF_8},
    {"mr_IN", ENC_UTF_8},
    {"ms", ENC_ISO8859_1},
    {"ms_MY", ENC_ISO8859_1},
    {"mt", ENC_ISO8859_3},
    {"mt_MT", ENC_ISO8859_3},
    {"nb", ENC_ISO8859_1},
    {"nb_NO", ENC_ISO8859_1},
    {"ne_NP", ENC_UTF_8},
    {"nl", ENC_ISO8859_1},
    {"nl_BE", ENC_ISO8859_1},
    {"nl_BE@euro", ENC_ISO8859_15},
    {"nl_NL", ENC_ISO8859_1},
    {"nl_NL@euro", ENC_ISO8859_15},
    {"nn", ENC_ISO8859_1},
    {"nn_NO", ENC_ISO8859_1},
    {"no", ENC_ISO8859_1},
    {"no@nynorsk", ENC_ISO8859_1},
    {"no_NO", ENC_ISO8859_1},
    {"norwegian", ENC_ISO8859_1},
    {"nynorsk", ENC_ISO8859_1},
    {"oc", ENC_ISO8859_1},
    {"oc_FR", ENC_ISO8859_1},
    {"oc_FR@euro", ENC_ISO8859_15},
    {"om_ET", ENC_UTF_8},
    {"om_KE", ENC_ISO8859_1},
    {"pa_IN", ENC_UTF_8},
    {"ph", ENC_ISO8859_1},
    {"ph_PH", ENC_ISO8859_1},
    {"pl", ENC_ISO8859_2},
    {"pl_PL", ENC_ISO8859_2},
    {"polish", ENC_ISO8859_2},
    {"portuguese", ENC_ISO8859_1},
    {"pp", ENC_ISO8859_1},
    {"pp_AN", ENC_ISO8859_1},
    {"pt", ENC_ISO8859_1},
    {"pt_BR", ENC_ISO8859_1},
    {"pt_PT", ENC_ISO8859_1},
    {"pt_PT@euro", ENC_ISO8859_15},
    {"ro", ENC_ISO8859_2},
    {"ro_RO", ENC_ISO8859_2},
    {"romanian", ENC_ISO8859_2},
    {"ru", ENC_KOI8_R},
    {"ru_RU", ENC_KOI8_R},
    {"ru_UA", ENC_KOI8_U},
    {"rumanian", ENC_ISO8859_2},
    {"russian", ENC_ISO8859_5},
    {"se_NO", ENC_UTF_8},
    {"serbocroatian", ENC_ISO8859_2},
    {"sh", ENC_ISO8859_2},
    {"sh_SP", ENC_ISO8859_2},
    {"sh_YU", ENC_ISO8859_2},
    {"sid_ET", ENC_UTF_8},
    {"sk", ENC_ISO8859_2},
    {"sk_SK", ENC_ISO8859_2},
    {"sl", ENC_ISO8859_2},
    {"sl_SI", ENC_ISO8859_2},
    {"slovak", ENC_ISO8859_2},
    {"slovene", ENC_ISO8859_2},
    {"slovenian", ENC_ISO8859_2},
    {"so_DJ", ENC_ISO8859_1},
    {"so_ET", ENC_UTF_8},
    {"so_KE", ENC_ISO8859_1},
    {"so_SO", ENC_ISO8859_1},
    {"sp", ENC_ISO8859_5},
    {"sp_YU", ENC_ISO8859_5},
    {"spanish", ENC_ISO8859_1},
    {"sq", ENC_ISO8859_2},
    {"sq_AL", ENC_ISO8859_2},
    {"sr", ENC_ISO8859_5},
    {"sr@cyrillic", ENC_ISO8859_5},
    {"sr_SP", ENC_ISO8859_2},
    {"sr_YU", ENC_ISO8859_5},
    {"sr_YU@cyrillic", ENC_ISO8859_5},
    {"st_ZA", ENC_ISO8859_1},
    {"sv", ENC_ISO8859_1},
    {"sv_FI", ENC_ISO8859_1},
    {"sv_FI@euro", ENC_ISO8859_15},
    {"sv_SE", ENC_ISO8859_1},
    {"sv_SE@euro", ENC_ISO8859_15},
    {"swedish", ENC_ISO8859_1},
    {"te_IN", ENC_UTF_8},
    {"th", ENC_ISO8859_11},
    {"th_TH", ENC_ISO8859_11},
    {"thai", ENC_ISO8859_11},
    {"ti_ER", ENC_UTF_8},
    {"ti_ET", ENC_UTF_8},
    {"tig_ER", ENC_UTF_8},
    {"tl", ENC_ISO8859_1},
    {"tl_PH", ENC_ISO8859_1},
    {"tr", ENC_ISO8859_9},
    {"tr_TR", ENC_ISO8859_9},
    {"turkish", ENC_ISO8859_9},
    {"uk", ENC_KOI8_U},
    {"uk_UA", ENC_KOI8_U},
    {"ur", ENC_CP1256},
    {"ur_PK", ENC_CP1256},
    {"uz_UZ", ENC_ISO8859_1},
    {"uz_UZ@cyrillic", ENC_UTF_8},
    {"vi", ENC_TCVN},
    {"vi_VN", ENC_TCVN},
    {"wa", ENC_ISO8859_1},
    {"wa_BE", ENC_ISO8859_1},
    {"wa_BE@euro", ENC_ISO8859_15},
    {"xh_ZA", ENC_ISO8859_1},
    {"yi", ENC_CP1255},
    {"yi_US", ENC_CP1255},
    {"zh_CN", ENC_GBK},
    {"zh_HK", ENC_BIG5_HKSCS},
    {"zh_SG", ENC_GB2312},
    {"zh_TW", ENC_BIG5},
    {"zu_ZA", ENC_ISO8859_1},
};
/* Number of rows in guess[]. */
static const int guess_count = (sizeof(guess)/sizeof(name_value));
#endif
/*
 * Encoding suffixes that are recognised directly.
 *
 * `name` is the part of the locale after the last '.', already folded to
 * lower case; `value` is the encoding name reported for it.
 * locale2charset() scans this table linearly for an exact match, so the
 * row order carries no meaning.
 */
static const name_value known[] = {
    /* ISO 8859 family */
    {"iso88591",   "ISO8859-1"},
    {"iso88592",   "ISO8859-2"},
    {"iso88593",   "ISO8859-3"},
    {"iso88596",   "ISO8859-6"},
    {"iso88597",   "ISO8859-7"},
    {"iso88598",   "ISO8859-8"},
    {"iso88599",   "ISO8859-9"},
    {"iso885910",  "ISO8859-10"},
    {"iso885913",  "ISO8859-13"},
    {"iso885914",  "ISO8859-14"},
    {"iso885915",  "ISO8859-15"},
    /* code pages, EUC variants and other single entries */
    {"cp1251",     "CP1251"},
    {"cp1255",     "CP1255"},
    {"eucjp",      "EUC-JP"},
    {"euckr",      "EUC-KR"},
    {"euctw",      "EUC-TW"},
    {"georgianps", "GEORGIAN-PS"},
    {"koi8u",      "KOI8-U"},
    {"tcvn",       "TCVN"},
    {"big5",       "BIG5"},
    {"gb2312",     "GB2312"},
    {"gb18030",    "GB18030"},
    {"gbk",        "GBK"},
    {"tis-620",    "TIS-620"},
    {"sjis",       "SHIFT_JIS"},
    {"euccn",      "GB2312"},
    {"big5-hkscs", "BIG5-HKSCS"},
#ifdef __APPLE__
    /* known additional Apple encodings (see locale -a) up to macOS 10.5,
       unlike other systems they correspond directly */
    {"iso8859-1",  "ISO8859-1"},
    {"iso8859-2",  "ISO8859-2"},
    {"iso8859-4",  "ISO8859-4"},
    {"iso8859-7",  "ISO8859-7"},
    {"iso8859-9",  "ISO8859-9"},
    {"iso8859-13", "ISO8859-13"},
    {"iso8859-15", "ISO8859-15"},
    {"koi8-u",     "KOI8-U"},
    {"koi8-r",     "KOI8-R"},
    {"pt154",      "PT154"},
    {"us-ascii",   "ASCII"},
    {"armscii-8",  "ARMSCII-8"},
    {"iscii-dev",  "ISCII-DEV"},
    {"big5hkscs",  "BIG5-HKSCS"},
#endif
};

/* Number of rows in known[]. */
static const int known_count = (sizeof(known) / sizeof(known[0]));
#ifndef __APPLE__
/*
 * Binary search of a name_value table for an exact key match.
 *
 * name        : key to look for; compared with strcmp(), so the match is
 *               case-sensitive and byte-wise.
 * table       : rows sorted by `name` in ascending strcmp() order.
 * table_count : number of rows in `table`.
 *
 * Returns the `value` of the matching row, or NULL when there is no match
 * (which includes a NULL argument or an empty table).
 *
 * With DEBUG_TEST defined the number of probes made by the previous call is
 * printed on entry; with DEBUG_TEST > 1 every probe is traced as well.
 */
static char* name_value_search(const char *name, const name_value table[],
			       const int table_count)
{
    int min, mid, max, cmp;

#if defined(DEBUG_TEST)
    static int last;	/* probes made by the previous call */
    DPRINT(last);
    last = 0;
#endif

    /* Without this guard an empty table would be indexed at [-1] below. */
    if (name == NULL || table == NULL || table_count <= 0)
	return (NULL);

    min = 0;
    max = table_count - 1;

    /* Quick rejection: the key sorts before the first or after the last row. */
    if ( 0 > strcmp(name,table[min].name) ||
	 0 < strcmp(name,table[max].name) ) {
#if defined(DEBUG_TEST) && DEBUG_TEST > 1
	DPRINT(strcmp(name, table[min].name));
	DPRINT(strcmp(name, table[max].name));
#endif
	return (NULL);
    }

    while (max >= min) {
#if defined(DEBUG_TEST)
	last++;
#endif
	mid = min + (max - min) / 2;
#if defined(DEBUG_TEST) && DEBUG_TEST > 1
	SPRINT(table[mid].name);
	DPRINT(strcmp(name, table[mid].name));
#endif
	/* compare once per probe and branch on the sign */
	cmp = strcmp(name, table[mid].name);
	if (cmp > 0)
	    min = mid + 1;
	else if (cmp < 0)
	    max = mid - 1;
	else
	    return(table[mid].value);
    }
    return (NULL);
}
#endif
/*
 * Map a locale name to the name of its character encoding.
 *
 * locale : a locale name such as "ja_JP.eucJP".  NULL, or the literal
 *          string "NULL", means "ask setlocale() for the current LC_CTYPE
 *          locale".
 *
 * Returns a pointer to an encoding name; never NULL.  The pointer refers
 * either to a string constant / table entry or to a static buffer that the
 * next call overwrites, so the caller must not free it and should copy it
 * if it has to survive another call.
 *
 * Resolution order:
 *   1. "C", "POSIX" or an unobtainable locale       -> "ASCII"
 *   2. (Win32 only) the numeric code page after the last '.'
 *   3. the encoding suffix after the last '.': UTF-8 spellings, the known[]
 *      table, "cp-NNN", "ibmNNN" / "ibm-eucXX", and "euc" combined with a
 *      ja/ko/zh language code
 *   4. macOS: "UTF-8"; elsewhere: the guess[] table keyed by the part before
 *      the encoding suffix, falling back to "ASCII"
 */
const char *locale2charset(const char *locale)
{
    static char charset[128];

    char la_loc[128];
    char enc[128], *p;
    const char *tail;
    int i;
    int cp;
#ifndef __APPLE__
    char *value;
#endif

    if ((locale == NULL) || (0 == strcmp(locale, "NULL")))
	locale = setlocale(LC_CTYPE,NULL);

    /* in some rare circumstances Darwin may return NULL */
    if (!locale || !strcmp(locale, "C") || !strcmp(locale, "POSIX"))
	return ("ASCII");

    memset(charset,0,sizeof(charset));

    /* separate language_locale.encoding
       NB, under Windows 'locale' may contains dots, hence the split is
       made at the LAST dot
     */
    memset(la_loc, 0, sizeof(la_loc));
    memset(enc, 0, sizeof(enc));
    /* Keep the language part even when there is no ".encoding" suffix:
       otherwise plain names such as "ja_JP" or "en_IE@euro" could never be
       found in the guess[] table.  Both buffers were zeroed above and at
       most size-1 bytes are copied, so they stay NUL-terminated. */
    strncpy(la_loc, locale, sizeof(la_loc)-1);
    p = strrchr(locale, '.');
    if(p) {
	strncpy(enc, p+1, sizeof(enc)-1);
	p = strrchr(la_loc, '.');
	if(p) *p = '\0';
    }

#ifdef Win32
    /*
      ## PUTTY suggests mapping Windows code pages as
      ## 1250 -> ISO 8859-2: this is WRONG
      ## 1251 -> KOI8-U
      ## 1252 -> ISO 8859-1
      ## 1253 -> ISO 8859-7
      ## 1254 -> ISO 8859-9
      ## 1255 -> ISO 8859-8
      ## 1256 -> ISO 8859-6
      ## 1257 -> ISO 8859-13
    */
    switch(cp = atoi(enc)) {
	/* case 1250: return "ISO8859-2"; */
	/* case 1251: return "KOI8-U"; This is not anywhere near the same */
    case 1252: return "ISO8859-1";
	/*
	  case 1253: return "ISO8859-7";
	  case 1254: return "ISO8859-9";
	  case 1255: return "ISO8859-8";
	  case 1256: return "ISO8859-6";
	*/
    case 1257: return "ISO8859-13";
    default:
	/* %u needs an unsigned argument */
	snprintf(charset, sizeof(charset), "CP%u", (unsigned int) cp);
	return charset;
    }
#endif

    /*
     * Assume locales are like en_US[.utf8[@euro]]
     */
    /* cut encoding @hoge no use.
       for(i=0;enc[i] && enc[i]!='@' && i<sizeof(enc)-1;i++);
       enc[i]='\0';
    */

    /* for AIX */
    if (0 == strcmp(enc, "UTF-8")) strcpy(enc, "utf8");

    if(strcmp(enc, "") && strcmp(enc, "utf8")) {
	/* <ctype.h> functions require a value representable as
	   unsigned char (or EOF); plain char may be negative */
	for(i = 0; enc[i]; i++)
	    enc[i] = (char) tolower((unsigned char) enc[i]);

	/* any other spelling of UTF-8 ("UTF8", "utf-8", "Utf-8", ...) */
	if (0 == strcmp(enc, "utf8") || 0 == strcmp(enc, "utf-8"))
	    return "UTF-8";

	for(i = 0; i < known_count; i++)
	    if (0 == strcmp(known[i].name,enc)) return known[i].value;

	/* cut encoding old linux cp- */
	if (0 == strncmp(enc, "cp-", 3)){
	    snprintf(charset, sizeof(charset), "CP%s", enc+3);
	    return charset;
	}
	/* cut encoding IBM ibm- */
	if (0 == strncmp(enc, "ibm", 3)){
	    cp = atoi(enc + 3);
	    /* IBM-[0-9]+ case; abs() because "ibm-932" parses as -932 */
	    if(cp != 0) {
		snprintf(charset, sizeof(charset), "IBM-%d", abs(cp));
		return charset;
	    }
	    /* IBM-eucXX case: report it as "EUC-XX" */
	    tail = (enc[3] == '-') ? enc+4 : enc+3;
	    if(0 == strncmp(tail, "euc", 3)) {
		tail += 3;
		if (*tail == '-') tail++;
		/* a bare "euc" carries no territory: leave it to the
		   fallbacks below */
		if (*tail) {
		    snprintf(charset, sizeof(charset), "EUC-%s", tail);
		    for(i = 0; charset[i]; i++)
			charset[i] = (char) toupper((unsigned char) charset[i]);
		    return charset;
		}
	    }
	}

	/* let's hope it is a ll_* name */
	if (0 == strcmp(enc, "euc")) {
	    /* This is OK as encoding names are ASCII */
	    if(isalpha((unsigned char) la_loc[0])
	       && isalpha((unsigned char) la_loc[1])
	       && (la_loc[2] == '_')) {
		if (0 == strncmp("ja", la_loc, 2)) return "EUC-JP";
		if (0 == strncmp("ko", la_loc, 2)) return "EUC-KR";
		if (0 == strncmp("zh", la_loc, 2)) return "GB2312";
	    }
	}

    }

#ifdef __APPLE__
    /* on macOS *all* real locales w/o encoding part are UTF-8 locales
       (C and POSIX are virtual and taken care of previously) */
    return "UTF-8";
#else
    if(0 == strcmp(enc, "utf8")) return "UTF-8";

    value = name_value_search(la_loc, guess, guess_count);
    return value == NULL ? "ASCII" : value;
#endif
}
/*****************************************************
* Test !!
*****************************************************/
#ifdef DEBUG_TEST
/*
 * Stand-alone self test, built only when DEBUG_TEST is defined (see the
 * build commands at the top of the file).  It switches LC_CTYPE to the
 * environment's locale and prints what locale2charset() returns for a
 * range of sample locale names.
 */
int main(void)
{
    setlocale(LC_CTYPE,"");

#ifndef __APPLE__
    /* the guess table is not compiled on macOS */
    DPRINT(guess_count);
#endif

#ifndef Win32
    SPRINT(locale2charset(NULL));
    SPRINT(locale2charset("ja"));
    SPRINT(locale2charset("ja_JP"));
    SPRINT(locale2charset("ja_JP.eucJP"));
    SPRINT(locale2charset("ja_JP.ujis"));
    SPRINT(locale2charset("ja_JP.IBM-eucJP"));
    SPRINT(locale2charset("ja_JP.sjis"));
    SPRINT(locale2charset("ja_JP.IBM-932"));
    /* cannot encoding only zh */
    SPRINT(locale2charset("zh""\0""BIG5"));
    SPRINT(locale2charset("zh_CN"));
    SPRINT(locale2charset("zh_CN.BIG5"));
    SPRINT(locale2charset("zh_TW"));
    SPRINT(locale2charset("zh_TW.eucTW"));
    SPRINT(locale2charset("zh_TW.big5"));
    SPRINT(locale2charset("zh_SG"));
    SPRINT(locale2charset("zh_HK"));
    SPRINT(locale2charset("ko"));
    SPRINT(locale2charset("en"));
    SPRINT(locale2charset("en_IE@euro"));
    SPRINT(locale2charset("en_IN"));
    SPRINT(locale2charset("C"));
    SPRINT(locale2charset("fran""\xe7""ais"));

#ifndef __APPLE__
    /* run every key of the guess table through the lookup */
    for(int i = 0; i < guess_count; i++){
	locale2charset(guess[i].name);
    }
#endif
#else
    SPRINT(locale2charset("japanese_JAPAN.932"));
    SPRINT(locale2charset("japanese_JAPAN.932"));
#endif
    return 0;
}
#endif /* DEBUG_TEST */