blob: 1e9b736686bc4824697abf958789bc63ac9138e7 [file] [log] [blame]
/*
* R : A Computer Language for Statistical Data Analysis
* Copyright (C) 2005-2020 The R Core Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, a copy is available at
* https://www.R-project.org/Licenses/
*/
/* Internal header, not installed */
/* This file was contributed by Ei-ji Nakama.
* See also the comments in ../main/rlocale.c.
* It does 2 things:
* (a) supplies wrapper/substitute wc[s]width functions for use in
* character.c, errors.c, printutils.c, devPS.c, RGui console.
* (b) Defines a replacement for iswctype to be used on Windows, maxOS and AIX.
* in gram.c, the TRE engine and elsewhere.
*
* It is not an installed header.
*/
#ifndef R_LOCALE_H
#define R_LOCALE_H
#include <wchar.h>
#include <ctype.h>
#include <wctype.h>
/*
The R_wchar_t typedef represents a single Unicode code point. On
most systems it is the same as wchar_t, but on Windows (and 32-bit
AIX and perhaps others) where wchar_t is too small and UTF-16 is
used, it needs to be an unsigned int .
AIX ref: https://www.gnu.org/software/gnulib/manual/html_node/wcwidth.html
*/
#ifdef Win32
typedef unsigned int R_wchar_t;
#else
typedef wchar_t R_wchar_t;
#endif
#if !defined(USE_RI18N_WIDTH) && (!defined(HAVE_WCWIDTH) || !defined(HAVE_WCSWIDTH))
# define USE_RI18N_WIDTH 1
#endif
#ifdef USE_RI18N_WIDTH
/*
* Windows CJK
* In Unicode, there is not a rule about character width.
* A letter of breadth is used in a CJK (China, Japan, Korea,
* Taiwan, Hong Kong, and Singapore) area, and there are a
* letter and a standard (character width is not still prescribed)
* of a cord in a country.
* Letter width is a problem of a font, but it is a rule route
* besides a alphanumeric character that use a breadth letter.
* It is generally defined as a breadth letter for a font such
* as Japanese.
* - Win32
* Attempted explanation by BDR
* The display widths of characters are not prescribed in Unicode.
* Double-width characters are used in the CJK area: their width can
* be font-specific, with different fonts in use in different parts
* of the CJK area. The tables supplied in many OSes and by Markus
* Kuhn are not do not take the exact locale into account. The
* tables supplied in rlocale_data.h allow different widths for
* different parts of the CJK area, and also where needed different
* widths on Windows. (The Windows differences are in zh_CN, and
* apply to European characters.)
*
* The differences are mainly (but not exclusively) in the
* Unicode 'East Asian Ambiguous' class.
*
*/
extern int Ri18n_wcwidth(R_wchar_t);
extern int Ri18n_wcswidth (const wchar_t *, size_t);
#endif
/* macOS CJK and WindowXP(Japanese)
* iswctypes of macOS calls isctypes. no i18n.
* For example, iswprint of Windows does not accept a macron of
* Japanese "a-ru" of R as a letter.
* Therefore Japanese "Buraian.Ripuri-" of "Brian Ripley" is
* shown of hex-string.:-)
*/
/*
iswspace is used in Rstrptime.h, character.c and util.c
iswalpha, iswalnum used in gram.y and in X11/dataentry.c
iswdigit is used in plotmath.c X11/dataentry.c (and indirectly in gram.y)
iswprint is used in printutils.c
*/
#if defined(Win32) && !defined(USE_RI18N_FNS)
# define USE_RI18N_FNS
#endif
#ifdef USE_RI18N_FNS
extern wctype_t Ri18n_wctype(const char *);
// Apparently wint_t is unsigned short on Windows, unsigned int on Linux
extern int Ri18n_iswctype(wint_t, wctype_t);
#ifndef IN_RLOCALE_C
/* We want to avoid these redefinitions in rlocale.c itself */
#undef iswupper
#undef iswlower
#undef iswalpha
#undef iswdigit
#undef iswxdigit
#undef iswspace
#undef iswprint
#undef iswgraph
#undef iswblank
#undef iswcntrl
#undef iswpunct
#undef iswalnum
#undef wctype
#undef iswctype
#define iswupper(__x) Ri18n_iswctype(__x, Ri18n_wctype("upper"))
#define iswlower(__x) Ri18n_iswctype(__x, Ri18n_wctype("lower"))
#define iswalpha(__x) Ri18n_iswctype(__x, Ri18n_wctype("alpha"))
#define iswdigit(__x) Ri18n_iswctype(__x, Ri18n_wctype("digit"))
#define iswxdigit(__x) Ri18n_iswctype(__x, Ri18n_wctype("xdigit"))
#define iswspace(__x) Ri18n_iswctype(__x, Ri18n_wctype("space"))
#define iswprint(__x) Ri18n_iswctype(__x, Ri18n_wctype("print"))
#define iswgraph(__x) Ri18n_iswctype(__x, Ri18n_wctype("graph"))
#define iswblank(__x) Ri18n_iswctype(__x, Ri18n_wctype("blank"))
#define iswcntrl(__x) Ri18n_iswctype(__x, Ri18n_wctype("cntrl"))
#define iswpunct(__x) Ri18n_iswctype(__x, Ri18n_wctype("punct"))
#define iswalnum(__x) Ri18n_iswctype(__x, Ri18n_wctype("alnum"))
#define wctype(__x) Ri18n_wctype(__x)
#define iswctype(__x,__y) Ri18n_iswctype(__x,__y)
#endif
#endif
#ifdef USE_RI18N_CASE
R_wchar_t Ri18n_towupper(R_wchar_t wc);
R_wchar_t Ri18n_towlower(R_wchar_t wc);
#endif
/* These definitions are from winnls.h in MinGW-W64. We don't need
* the rest of that file. */
#define HIGH_SURROGATE_START 0xd800
#define HIGH_SURROGATE_END 0xdbff
#define LOW_SURROGATE_START 0xdc00
#define LOW_SURROGATE_END 0xdfff
/* The first two of these definitions use the argument twice which is
* bad, but we include them here in the original form for consistency
* with Mingw_w64. Users should be careful that evaluating the
* argument doesn't result in side effects.
*/
#define IS_HIGH_SURROGATE(wch) (((wch) >= HIGH_SURROGATE_START) && ((wch) <= HIGH_SURROGATE_END))
#define IS_LOW_SURROGATE(wch) (((wch) >= LOW_SURROGATE_START) && ((wch) <= LOW_SURROGATE_END))
#define IS_SURROGATE_PAIR(hs, ls) (IS_HIGH_SURROGATE (hs) && IS_LOW_SURROGATE (ls))
# define utf8toucs32 Rf_utf8toucs32
R_wchar_t utf8toucs32(wchar_t high, const char *s);
// convert strings UTF-8 <-> UCS-4 (stored in R_wchar_t aka int)
# define utf8towcs4 Rf_utf8towcs4
size_t utf8towcs4(R_wchar_t *wc, const char *s, size_t n);
#define wcs4toutf8 Rf_wcs4toutf8
size_t wcs4toutf8(char *s, const R_wchar_t *wc, size_t n);
#endif /* R_LOCALE_H */