blob: 1c7a79c8749fc69b75b2e4aba17fb2fd7b8a7fb7 [file] [log] [blame]
/*
* R : A Computer Language for Statistical Data Analysis
* Copyright (C) 2005-2017 The R Core Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, a copy is available at
* https://www.R-project.org/Licenses/
*/
/* The original version of this file was contributed by Ei-ji Nakama.
* See also the comments in ../include/rlocale.h.
*
* It provides replacements for the wctype functions on
* Windows (where they are not correct in e.g. Japanese)
* AIX (missing)
* macOS in CJK (where these just call the ctype functions)
*
* It also provides wc[s]width, where widths of CJK fonts are often
* wrong in vendor-supplied versions and in Markus Kuhn's version
* used for Windows in R 2.[12].x.
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#ifdef HAVE_VISIBILITY_ATTRIBUTE
# define attribute_hidden __attribute__ ((visibility ("hidden")))
#else
# define attribute_hidden
#endif
#include <string.h>
#include <stdlib.h>
#define IN_RLOCALE_C 1 /* used in rlocale.h */
#include <rlocale.h>
#include "rlocale_data.h"
#include <wctype.h>
#include <wchar.h>
#include <ctype.h>
#include <locale.h>
#include <limits.h>
#include <R_ext/Riconv.h>
// This seems based on Markus Kuhn's function but with 1-based 'max'
/*
 * Binary search for 'wint' in a table of closed intervals.
 *
 * wint   code point to look up.
 * table  array of 'max' intervals; the search assumes they are sorted in
 *        increasing order and do not overlap.
 * max    number of entries in 'table' (a count, not the last index).
 *
 * Returns 1 if 'wint' lies inside one of the intervals, 0 otherwise.
 * An empty table (max <= 0) contains nothing, so 0 is returned without
 * touching 'table'.
 */
static int wcsearch(int wint, const struct interval *table, int max)
{
    int lo = 0;
    int hi = max - 1;	/* convert the count into the last valid index */

    if (hi < 0)
	return 0;
    /* quick rejection of anything outside the range spanned by the table */
    if (wint < table[0].first || wint > table[hi].last)
	return 0;
    while (lo <= hi) {
	/* written this way so that lo + hi cannot overflow */
	int mid = lo + (hi - lo) / 2;
	if (wint > table[mid].last)
	    lo = mid + 1;
	else if (wint < table[mid].first)
	    hi = mid - 1;
	else
	    return 1;
    }
    return 0;
}
/*
 * Binary search for the display width of 'wint' in a table of intervals.
 *
 * wint    code point to look up.
 * table   array of 'max' intervals; the search assumes they are sorted in
 *         increasing order and do not overlap.
 * max     number of entries in 'table' (a count, not the last index).
 * locale  index into each entry's 'mb' array selecting the width column.
 *
 * Returns
 *    1                      if 'wint' is below the first or above the last
 *                           interval of the table,
 *    table[i].mb[locale]    if 'wint' lies inside interval i,
 *   -1                      if 'wint' falls in a gap between intervals, or
 *                           the table is empty (max <= 0).
 */
static int wcwidthsearch(int wint, const struct interval_wcwidth *table,
			 int max, int locale)
{
    int lo = 0;
    int hi = max - 1;	/* convert the count into the last valid index */

    /* nothing to search: report "not in table" without reading 'table' */
    if (hi < 0)
	return -1;
    /* This quickly gives one for ASCII characters since the table
       starts at 0xa0 */
    if (wint < table[0].first || wint > table[hi].last)
	return 1;
    while (lo <= hi) {
	/* written this way so that lo + hi cannot overflow */
	int mid = lo + (hi - lo) / 2;
	if (wint > table[mid].last)
	    lo = mid + 1;
	else if (wint < table[mid].first)
	    hi = mid - 1;
	else
	    return table[mid].mb[locale];
    }
    return -1;
}
/* The idea here has never been explained.
See also the comments in ../include/rlocale.h.
That does not explain the separate entries for Singapore
(simplified) and Hong Kong/Macau (traditional) where it seems the
Windows system font is not different from PRC/Taiwan respectively,
nor what font was used for non-Windows, nor where the values came
from.
Except perhaps on macOS, the non-Windows locale names are for the
default MBCS encodings (e.g. GBK, GB2312, BIG5, EUCJP, EUCKR).
There are other non-UTF-8 encodings for those locales,
e.g. ja_JP.SJIS, ko_KR.CP949, zh_CN.eucCN, zh_HK.Big5HKSCS.
*/
/* One entry of the locale-name table: an upper-case locale name (or name
   prefix) paired with the MB_* value stored for it. */
typedef struct {
    const char *name;	/* upper-case locale name; points at a string literal */
    int locale;		/* one of the MB_* values */
} cjk_locale_name_t;
/*
 * Locale names (upper case) and the MB_* value chosen for each.
 *
 * ORDER MATTERS: Ri18n_wcwidth() scans this table from the top and takes
 * the first entry whose name is a *prefix* of the upper-cased LC_CTYPE
 * locale string.  An entry must therefore come before any other entry
 * that is a prefix of it (e.g. "ZH_CN.BIG5" before "ZH_CN", "JAPANESE_JAPAN"
 * before "JAPANESE"), and the empty name, which matches everything, must
 * stay last.
 */
static const cjk_locale_name_t cjk_locale_name[] = {
    // Windows locale names
    {"CHINESE(SINGAPORE)_SINGAPORE",		MB_zh_SG},
    {"CHINESE_SINGAPORE",			MB_zh_SG},
    {"CHINESE(PRC)_PEOPLE'S REPUBLIC OF CHINA",	MB_zh_CN},
    {"CHINESE_PEOPLE'S REPUBLIC OF CHINA",	MB_zh_CN},
    {"CHINESE_MACAU S.A.R.",			MB_zh_HK},
    {"CHINESE(PRC)_HONG KONG",			MB_zh_HK},
    {"CHINESE_HONG KONG S.A.R.",		MB_zh_HK},
    {"CHINESE(TAIWAN)_TAIWAN",			MB_zh_TW},
    {"CHINESE_TAIWAN",				MB_zh_TW},
    {"CHINESE-S",				MB_zh_CN},
    {"CHINESE-T",				MB_zh_TW},
    {"JAPANESE_JAPAN",				MB_ja_JP},
    {"JAPANESE",				MB_ja_JP},
    {"KOREAN_KOREA",				MB_ko_KR},
    {"KOREAN",					MB_ko_KR},
    // Other OSes, but only in default encodings.
    {"ZH_TW",					MB_zh_TW},
    // must precede "ZH_CN", which is a prefix of it
    {"ZH_CN.BIG5",				MB_zh_TW},
    {"ZH_CN",					MB_zh_CN},
    {"ZH_HK",					MB_zh_HK},
    {"ZH_SG",					MB_zh_SG},
    {"JA_JP",					MB_ja_JP},
    {"KO_KR",					MB_ko_KR},
    {"ZH",					MB_zh_CN},
    {"JA",					MB_ja_JP},
    {"KO",					MB_ko_KR},
    // Default, where all EA Ambiguous characters have width one.
    {"",					MB_Default},
};
// used in character.c, ../gnuwin32/console.c , ../library/grDevices/src/devP*.c :
/*
 * Display width, in columns, of the character 'c'.
 *
 * The current LC_CTYPE locale name is upper-cased and matched (first prefix
 * match wins) against cjk_locale_name[] to choose which width column of
 * table_wcwidth to use.  The result of that match is cached and only
 * recomputed when the locale name changes.
 *
 * Returns the width found by wcwidthsearch() when that is non-negative;
 * otherwise 0 if 'c' is listed in zero_width and 1 if it is not.
 */
int Ri18n_wcwidth(Rwchar_t c)
{
    /* Locale name (truncated to fit) for which 'lc' was last computed. */
    static char lc_cache[128] = "";
    static int lc_valid = 0;	/* has 'lc' been computed at least once? */
    static int lc = 0;

    const char *cur = setlocale(LC_CTYPE, NULL);
    if (cur == NULL)
	cur = "";	/* matches only the default (empty-name) entry */

    /* Only the first sizeof(lc_cache) - 1 bytes of the locale name ever
       take part in the lookup, so comparing that many bytes is exact. */
    if (!lc_valid || 0 != strncmp(cur, lc_cache, sizeof(lc_cache) - 1)) {
	char lc_str[sizeof(lc_cache)];
	size_t i, len;

	strncpy(lc_cache, cur, sizeof(lc_cache) - 1);
	lc_cache[sizeof(lc_cache) - 1] = '\0';

	len = strlen(lc_cache);
	for (i = 0; i < len; i++)
	    /* the <ctype.h> functions need an unsigned char value */
	    lc_str[i] = (char) toupper((unsigned char) lc_cache[i]);
	lc_str[len] = '\0';

	for (i = 0; i < sizeof(cjk_locale_name)/sizeof(cjk_locale_name[0]);
	     i++) {
	    const char *name = cjk_locale_name[i].name;
	    if (0 == strncmp(name, lc_str, strlen(name))) {
		lc = cjk_locale_name[i].locale;
		break;
	    }
	}
	lc_valid = 1;
    }

    int wd = wcwidthsearch(c, table_wcwidth,
			   (sizeof(table_wcwidth)/sizeof(struct interval_wcwidth)),
			   lc);
    if (wd >= 0) return wd; // currently all are 1 or 2.
    int zw = wcsearch(c, zero_width, zero_width_count);
    return zw ? 0 : 1; // assume unknown chars are width one.
}
/* Used in character.c, errors.c, ../gnuwin32/console.c */
/*
 * Total display width of the wide string 's', looking at no more than 'n'
 * characters and stopping early at a terminating L'\0'.
 *
 * Returns the sum of Ri18n_wcwidth() over the characters examined, or -1
 * as soon as Ri18n_wcwidth() reports -1 for one of them.
 */
attribute_hidden
int Ri18n_wcswidth (const wchar_t *s, size_t n)
{
    int total = 0;

    for (size_t k = 0; k < n && s[k] != L'\0'; k++) {
	int w = Ri18n_wcwidth (s[k]);
	if (w == -1)
	    return -1;
	total += w;
    }
    return total;
}
/*********************************************************************
* macOS's wide character type functions are based on FreeBSD
* and only work correctly for Latin-1 characters.
* So we replace them. May also be needed on FreeBSD.
********************************************************************/
#if defined(__APPLE__)
/* allow for both PowerPC and Intel platforms */
#ifdef WORDS_BIGENDIAN
static const char UNICODE[] = "UCS-4BE";
#else
static const char UNICODE[] = "UCS-4LE";
#endif
/* in Defn.h which is not included here */
extern const char *locale2charset(const char *);

/*
 * ISWFUNC(NAME) defines  static int Ri18n_iswNAME(wint_t wc),  a
 * table-driven replacement for iswNAME().
 *
 * If locale2charset(NULL) reports "UTF-8", 'wc' is looked up directly in
 * table_wNAME.  Otherwise 'wc' is converted to a multibyte sequence with
 * wcrtomb(), that sequence is converted from the locale's charset to UCS-4
 * with Riconv(), and the resulting code point is looked up instead.
 *
 * The result is 1 if the character is in the class and 0 if it is not.
 * 0 is also returned when the character cannot be converted (wcrtomb(),
 * Riconv_open() or Riconv() fails): a classification predicate has no way
 * to report an error, and any non-zero value would be read as "true".
 */
#define ISWFUNC(ISWNAME) static int Ri18n_isw ## ISWNAME (wint_t wc)	       \
{									       \
    char mb_buf[MB_LEN_MAX+1];						       \
    size_t mb_len;							       \
    int ucs4_buf[2];							       \
    size_t wc_len;							       \
    void *cd;								       \
    char fromcode[128];							       \
    char *_mb_buf;							       \
    char *_wc_buf;							       \
    size_t rc;								       \
									       \
    strncpy(fromcode, locale2charset(NULL), sizeof(fromcode) - 1);	       \
    fromcode[sizeof(fromcode) - 1] = '\0';				       \
    if (0 == strcmp(fromcode, "UTF-8"))					       \
	return wcsearch(wc, table_w ## ISWNAME, table_w ## ISWNAME ## _count); \
    memset(mb_buf, 0, sizeof(mb_buf));					       \
    memset(ucs4_buf, 0, sizeof(ucs4_buf));				       \
    /* not representable in this locale: member of no class */		       \
    if ((size_t)(-1) == wcrtomb(mb_buf, wc, NULL))			       \
	return 0;							       \
    cd = Riconv_open(UNICODE, fromcode);				       \
    if ((void *)(-1) == cd)						       \
	return 0;							       \
    wc_len = sizeof(ucs4_buf);						       \
    _wc_buf = (char *)ucs4_buf;						       \
    mb_len = strlen(mb_buf);						       \
    _mb_buf = (char *)mb_buf;						       \
    rc = Riconv(cd, (const char **)&_mb_buf, (size_t *)&mb_len,	       \
		(char **)&_wc_buf, (size_t *)&wc_len);			       \
    Riconv_close(cd);							       \
    if ((size_t)(-1) == rc)						       \
	return 0;							       \
    return wcsearch(ucs4_buf[0], table_w ## ISWNAME,			       \
		    table_w ## ISWNAME ## _count);			       \
}
#endif // __APPLE__
/*********************************************************************
* iswalpha etc. do not function correctly on Windows
* iswalpha etc. do not function at all on AIX.
* On these platforms wchar_t is Unicode in all locales.
********************************************************************/
#if defined(Win32) || defined(_AIX)
/* ISWFUNC(CLASS) defines  static int Ri18n_iswCLASS(wint_t),  which looks
   its argument up in table_wCLASS using wcsearch(). */
# define ISWFUNC(CLASS)							\
    static int Ri18n_isw ## CLASS (wint_t ch)				\
    {									\
	return wcsearch(ch, table_w ## CLASS, table_w ## CLASS ## _count); \
    }
#endif
/*********************************************************************
* iswalpha etc. do function correctly for Linux
********************************************************************/
#ifndef ISWFUNC
/* No platform-specific version was selected above: ISWFUNC(CLASS) defines
   static int Ri18n_iswCLASS(wint_t)  as a plain call to the C library's
   iswCLASS(). */
# define ISWFUNC(CLASS)							\
    static int Ri18n_isw ## CLASS (wint_t ch)				\
    {									\
	return isw ## CLASS (ch);					\
    }
/* Solaris 8 was missing iswblank. Its man page was missing iswcntrl,
   but the function is there. MinGW used not to have iswblank until
   mingw-runtime-3.11. */
# ifndef HAVE_ISWBLANK
#  define iswblank(c) iswctype((c), wctype("blank"))
# endif
#endif
/* Instantiate one Ri18n_isw<class> function per character class that C99
   and POSIX define.  Not all of them are used elsewhere in R, but every
   one is referenced by the table behind Ri18n_iswctype. */
ISWFUNC(upper)
ISWFUNC(lower)
ISWFUNC(alpha)
ISWFUNC(digit)
ISWFUNC(xdigit)
ISWFUNC(space)
ISWFUNC(print)
ISWFUNC(graph)
ISWFUNC(blank)
ISWFUNC(cntrl)
ISWFUNC(punct)
/* defined below in terms of digit and alpha
ISWFUNC(alnum)
*/
/* Defined further down; declared here because Ri18n_iswalnum needs them. */
wctype_t Ri18n_wctype(const char *name);
int Ri18n_iswctype(wint_t wc, wctype_t desc);

/* Returns 1 if 'wc' is classified as "digit" or as "alpha" by
   Ri18n_iswctype(), and 0 otherwise. */
static int Ri18n_iswalnum (wint_t wc)
{
    if (Ri18n_iswctype(wc, Ri18n_wctype("digit")))
	return 1;
    return Ri18n_iswctype(wc, Ri18n_wctype("alpha")) != 0;
}
/*
* iswctype
*/
/* One character class: its name, the descriptor value handed out for it,
   and the predicate that tests membership. */
typedef struct {
    const char *name;		/* class name; points at a string literal */
    wctype_t wctype;		/* descriptor value for this class */
    int(*func)(wint_t);		/* predicate: non-zero if the char is in the class */
} Ri18n_wctype_func_l ;
static const Ri18n_wctype_func_l Ri18n_wctype_func[] = {
{"upper", 1<<0, Ri18n_iswupper},
{"lower", 1<<1, Ri18n_iswlower},
{"alpha", 1<<2, Ri18n_iswalpha},
{"digit", 1<<3, Ri18n_iswdigit},
{"xdigit", 1<<4, Ri18n_iswxdigit},
{"space", 1<<5, Ri18n_iswspace},
{"print", 1<<6, Ri18n_iswprint},
{"graph", 1<<7, Ri18n_iswgraph},
{"blank", 1<<8, Ri18n_iswblank},
{"cntrl", 1<<9, Ri18n_iswcntrl},
{"punct", 1<<10, Ri18n_iswpunct},
{"alnum", 1<<11, Ri18n_iswalnum},
{NULL, 0, NULL}
};
/* These two used (via macros) in X11 dataentry */
wctype_t Ri18n_wctype(const char *name)
{
int i;
for (i = 0 ; Ri18n_wctype_func[i].name != NULL &&
0 != strcmp(Ri18n_wctype_func[i].name, name) ; i++ );
return Ri18n_wctype_func[i].wctype;
}
int Ri18n_iswctype(wint_t wc, wctype_t desc)
{
int i;
for (i = 0 ; Ri18n_wctype_func[i].wctype != 0 &&
Ri18n_wctype_func[i].wctype != desc ; i++ );
return (*Ri18n_wctype_func[i].func)(wc);
}