/* src/include/rlocale.h - R - Git at Google */

 /*
  *  R : A Computer Language for Statistical Data Analysis
  *  Copyright (C) 2005-2020   The R Core Team
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  the Free Software Foundation; either version 2 of the License, or
  *  (at your option) any later version.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, a copy is available at
  *  https://www.R-project.org/Licenses/
  */

 /* Internal header, not installed */

 /*  This file was contributed by Ei-ji Nakama.
  *  See also the comments in  ../main/rlocale.c.

  *  It does 2 things:
  * (a) supplies wrapper/substitute wc[s]width functions for use in
  *    character.c, errors.c, printutils.c, devPS.c, RGui console.
  * (b) Defines a replacement for iswctype to be used on Windows, macOS and AIX
  *    in gram.c, the TRE engine and elsewhere.
  *
  * It is not an installed header.
  */

 #ifndef R_LOCALE_H
 #define R_LOCALE_H

 #include <wchar.h>
 #include <ctype.h>
 #include <wctype.h>

 /*
   The R_wchar_t typedef represents a single Unicode code point.  On
   most systems it is the same as wchar_t, but on Windows (and 32-bit
   AIX and perhaps others) where wchar_t is too small and UTF-16 is
   used, it needs to be an unsigned int .

   AIX ref: https://www.gnu.org/software/gnulib/manual/html_node/wcwidth.html
  */

 /* R_wchar_t holds one Unicode code point: plain wchar_t everywhere
  * except Windows builds, which use unsigned int instead. */
 #ifndef Win32
 typedef wchar_t R_wchar_t;
 #else
 typedef unsigned int R_wchar_t;
 #endif

 /* Unless the choice has already been made, select R's own width
  * functions whenever wcwidth and wcswidth are not both available. */
 #ifndef USE_RI18N_WIDTH
 # if !(defined(HAVE_WCWIDTH) && defined(HAVE_WCSWIDTH))
 #  define USE_RI18N_WIDTH 1
 # endif
 #endif

 #ifdef USE_RI18N_WIDTH
 /*
  * Windows CJK
  * In Unicode, there is not a rule about character width.
  * A letter of breadth is used in a CJK (China, Japan, Korea,
  * Taiwan, Hong Kong, and Singapore) area, and there are a
  * letter and a standard (character width is not still prescribed)
  * of a cord in a country.
  * Letter width is a problem of a font, but it is a rule route
  * besides a alphanumeric character that use a breadth letter.
  * It is generally defined as a breadth letter for a font such
  * as Japanese.
  * - Win32

  *  Attempted explanation by BDR
  *  The display widths of characters are not prescribed in Unicode.
  *  Double-width characters are used in the CJK area: their width can
  *  be font-specific, with different fonts in use in different parts
  *  of the CJK area.  The tables supplied in many OSes and by Markus
  *  Kuhn do not take the exact locale into account.  The
  *  tables supplied in rlocale_data.h allow different widths for
  *  different parts of the CJK area, and also where needed different
  *  widths on Windows.  (The Windows differences are in zh_CN, and
  *  apply to European characters.)
  *
  * The differences are mainly (but not exclusively) in the
  * Unicode 'East Asian Ambiguous' class.
  *
  */

 extern int Ri18n_wcwidth(R_wchar_t);
 extern int Ri18n_wcswidth (const wchar_t *, size_t);
 #endif

 /* macOS CJK and Windows XP (Japanese)
  * The isw* classification functions of macOS call the is* ones: no i18n.
  * For example, iswprint on Windows does not accept the macron of
  * Japanese "a-ru" of R as a letter.
  * Therefore the Japanese "Buraian.Ripuri-" for "Brian Ripley" is
  * shown as a hex string. :-)
  */

 /*
    iswspace is used in Rstrptime.h, character.c and util.c
    iswalpha, iswalnum used in gram.y and in X11/dataentry.c
    iswdigit is used in plotmath.c X11/dataentry.c (and indirectly in gram.y)
    iswprint is used in printutils.c
 */
 /* Windows always uses R's own wide-character classification; elsewhere
  * it is used only when USE_RI18N_FNS has already been defined. */
 #ifndef USE_RI18N_FNS
 # ifdef Win32
 #  define USE_RI18N_FNS
 # endif
 #endif

 #if defined(USE_RI18N_FNS)

 /* Replacements for wctype() and iswctype(). */
 wctype_t Ri18n_wctype(const char *);
 /* Apparently wint_t is unsigned short on Windows, unsigned int on Linux */
 int Ri18n_iswctype(wint_t, wctype_t);

 # ifndef IN_RLOCALE_C
 /* Redirect the standard classification names to the replacements above.
  * These redefinitions are not wanted in rlocale.c itself, hence the
  * IN_RLOCALE_C guard.  Each name is undefined first in case <wctype.h>
  * supplied it as a macro. */
 #  undef iswupper
 #  define iswupper(wc)  Ri18n_iswctype(wc, Ri18n_wctype("upper"))
 #  undef iswlower
 #  define iswlower(wc)  Ri18n_iswctype(wc, Ri18n_wctype("lower"))
 #  undef iswalpha
 #  define iswalpha(wc)  Ri18n_iswctype(wc, Ri18n_wctype("alpha"))
 #  undef iswdigit
 #  define iswdigit(wc)  Ri18n_iswctype(wc, Ri18n_wctype("digit"))
 #  undef iswxdigit
 #  define iswxdigit(wc) Ri18n_iswctype(wc, Ri18n_wctype("xdigit"))
 #  undef iswspace
 #  define iswspace(wc)  Ri18n_iswctype(wc, Ri18n_wctype("space"))
 #  undef iswprint
 #  define iswprint(wc)  Ri18n_iswctype(wc, Ri18n_wctype("print"))
 #  undef iswgraph
 #  define iswgraph(wc)  Ri18n_iswctype(wc, Ri18n_wctype("graph"))
 #  undef iswblank
 #  define iswblank(wc)  Ri18n_iswctype(wc, Ri18n_wctype("blank"))
 #  undef iswcntrl
 #  define iswcntrl(wc)  Ri18n_iswctype(wc, Ri18n_wctype("cntrl"))
 #  undef iswpunct
 #  define iswpunct(wc)  Ri18n_iswctype(wc, Ri18n_wctype("punct"))
 #  undef iswalnum
 #  define iswalnum(wc)  Ri18n_iswctype(wc, Ri18n_wctype("alnum"))
 #  undef wctype
 #  define wctype(name)  Ri18n_wctype(name)
 #  undef iswctype
 #  define iswctype(wc,desc) Ri18n_iswctype(wc,desc)
 # endif /* !IN_RLOCALE_C */

 #endif /* USE_RI18N_FNS */

 /* Declared only when USE_RI18N_CASE is defined.
  * NOTE(review): by their names these are R's substitutes for
  * towlower()/towupper() working on R_wchar_t -- confirm in rlocale.c. */
 #if defined(USE_RI18N_CASE)
 R_wchar_t Ri18n_towlower(R_wchar_t wc);
 R_wchar_t Ri18n_towupper(R_wchar_t wc);
 #endif


 /* UTF-16 surrogate range limits and predicates.  These definitions are
  * from winnls.h in MinGW-W64.  We don't need the rest of that file. */

 #define HIGH_SURROGATE_START 0xd800
 #define HIGH_SURROGATE_END 0xdbff
 #define LOW_SURROGATE_START 0xdc00
 #define LOW_SURROGATE_END 0xdfff

 /* IS_HIGH_SURROGATE(wch) and IS_LOW_SURROGATE(wch) test whether wch lies
  * in the inclusive range START..END defined above; IS_SURROGATE_PAIR(hs, ls)
  * is true when hs is a high surrogate and ls is a low one.
  *
  * The first two of these definitions use the argument twice, which is
  * bad, but we include them here in the original form for consistency
  * with Mingw_w64.  Users should be careful that evaluating the
  * argument doesn't result in side effects.
  * NOTE(review): keeping the text identical to winnls.h presumably also
  * avoids macro-redefinition diagnostics if that header is included too.
  */

 #define IS_HIGH_SURROGATE(wch) (((wch) >= HIGH_SURROGATE_START) && ((wch) <= HIGH_SURROGATE_END))
 #define IS_LOW_SURROGATE(wch) (((wch) >= LOW_SURROGATE_START) && ((wch) <= LOW_SURROGATE_END))
 #define IS_SURROGATE_PAIR(hs, ls) (IS_HIGH_SURROGATE (hs) && IS_LOW_SURROGATE (ls))

 /* The short names below are remapped to their Rf_-prefixed symbols. */
 #define utf8toucs32 Rf_utf8toucs32
 #define utf8towcs4  Rf_utf8towcs4
 #define wcs4toutf8  Rf_wcs4toutf8

 /* NOTE(review): presumably combines the surrogate `high` with the UTF-8
  * sequence at `s` into one code point -- confirm against the definition. */
 R_wchar_t utf8toucs32(wchar_t high, const char *s);

 /* Convert strings UTF-8 <-> UCS-4, the latter stored as R_wchar_t
  * (unsigned int on Windows, wchar_t elsewhere). */
 size_t utf8towcs4(R_wchar_t *wc, const char *s, size_t n);
 size_t wcs4toutf8(char *s, const R_wchar_t *wc, size_t n);

 #endif /* R_LOCALE_H */
	/*
	* R : A Computer Language for Statistical Data Analysis
	* Copyright (C) 2005-2020 The R Core Team
	*
	* This program is free software; you can redistribute it and/or modify
	* it under the terms of the GNU General Public License as published by
	* the Free Software Foundation; either version 2 of the License, or
	* (at your option) any later version.
	*
	* This program is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* GNU General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with this program; if not, a copy is available at
	* https://www.R-project.org/Licenses/
	*/

	/* Internal header, not installed */

	/* This file was contributed by Ei-ji Nakama.
	* See also the comments in ../main/rlocale.c.

	* It does 2 things:
	* (a) supplies wrapper/substitute wc[s]width functions for use in
	* character.c, errors.c, printutils.c, devPS.c, RGui console.
	* (b) Defines a replacement for iswctype to be used on Windows, macOS and AIX
	* in gram.c, the TRE engine and elsewhere.
	*
	* It is not an installed header.
	*/

	#ifndef R_LOCALE_H
	#define R_LOCALE_H

	#include <wchar.h>
	#include <ctype.h>
	#include <wctype.h>

	/*
	The R_wchar_t typedef represents a single Unicode code point. On
	most systems it is the same as wchar_t, but on Windows (and 32-bit
	AIX and perhaps others) where wchar_t is too small and UTF-16 is
	used, it needs to be an unsigned int .

	AIX ref: https://www.gnu.org/software/gnulib/manual/html_node/wcwidth.html
	*/

	/* R_wchar_t holds one Unicode code point: plain wchar_t everywhere
	 * except Windows builds, which use unsigned int instead. */
	#ifndef Win32
	typedef wchar_t R_wchar_t;
	#else
	typedef unsigned int R_wchar_t;
	#endif

	/* Unless the choice has already been made, select R's own width
	 * functions whenever wcwidth and wcswidth are not both available. */
	#if !defined(USE_RI18N_WIDTH) && (!defined(HAVE_WCWIDTH) || !defined(HAVE_WCSWIDTH))
	# define USE_RI18N_WIDTH 1
	#endif

	#ifdef USE_RI18N_WIDTH
	/*
	* Windows CJK
	* In Unicode, there is not a rule about character width.
	* A letter of breadth is used in a CJK (China, Japan, Korea,
	* Taiwan, Hong Kong, and Singapore) area, and there are a
	* letter and a standard (character width is not still prescribed)
	* of a cord in a country.
	* Letter width is a problem of a font, but it is a rule route
	* besides a alphanumeric character that use a breadth letter.
	* It is generally defined as a breadth letter for a font such
	* as Japanese.
	* - Win32

	* Attempted explanation by BDR
	* The display widths of characters are not prescribed in Unicode.
	* Double-width characters are used in the CJK area: their width can
	* be font-specific, with different fonts in use in different parts
	* of the CJK area. The tables supplied in many OSes and by Markus
	* Kuhn do not take the exact locale into account. The
	* tables supplied in rlocale_data.h allow different widths for
	* different parts of the CJK area, and also where needed different
	* widths on Windows. (The Windows differences are in zh_CN, and
	* apply to European characters.)
	*
	* The differences are mainly (but not exclusively) in the
	* Unicode 'East Asian Ambiguous' class.
	*
	*/

	extern int Ri18n_wcwidth(R_wchar_t);
	extern int Ri18n_wcswidth (const wchar_t *, size_t);
	#endif

	/* macOS CJK and Windows XP (Japanese)
	* The isw* classification functions of macOS call the is* ones: no i18n.
	* For example, iswprint on Windows does not accept the macron of
	* Japanese "a-ru" of R as a letter.
	* Therefore the Japanese "Buraian.Ripuri-" for "Brian Ripley" is
	* shown as a hex string. :-)
	*/

	/*
	iswspace is used in Rstrptime.h, character.c and util.c
	iswalpha, iswalnum used in gram.y and in X11/dataentry.c
	iswdigit is used in plotmath.c X11/dataentry.c (and indirectly in gram.y)
	iswprint is used in printutils.c
	*/
	/* Windows always uses R's own wide-character classification; elsewhere
	 * it is used only when USE_RI18N_FNS has already been defined. */
	#ifndef USE_RI18N_FNS
	# ifdef Win32
	#  define USE_RI18N_FNS
	# endif
	#endif

	#if defined(USE_RI18N_FNS)

	/* Replacements for wctype() and iswctype(). */
	wctype_t Ri18n_wctype(const char *);
	/* Apparently wint_t is unsigned short on Windows, unsigned int on Linux */
	int Ri18n_iswctype(wint_t, wctype_t);

	# ifndef IN_RLOCALE_C
	/* Redirect the standard classification names to the replacements above.
	 * These redefinitions are not wanted in rlocale.c itself, hence the
	 * IN_RLOCALE_C guard.  Each name is undefined first in case <wctype.h>
	 * supplied it as a macro. */
	#  undef iswupper
	#  define iswupper(wc)  Ri18n_iswctype(wc, Ri18n_wctype("upper"))
	#  undef iswlower
	#  define iswlower(wc)  Ri18n_iswctype(wc, Ri18n_wctype("lower"))
	#  undef iswalpha
	#  define iswalpha(wc)  Ri18n_iswctype(wc, Ri18n_wctype("alpha"))
	#  undef iswdigit
	#  define iswdigit(wc)  Ri18n_iswctype(wc, Ri18n_wctype("digit"))
	#  undef iswxdigit
	#  define iswxdigit(wc) Ri18n_iswctype(wc, Ri18n_wctype("xdigit"))
	#  undef iswspace
	#  define iswspace(wc)  Ri18n_iswctype(wc, Ri18n_wctype("space"))
	#  undef iswprint
	#  define iswprint(wc)  Ri18n_iswctype(wc, Ri18n_wctype("print"))
	#  undef iswgraph
	#  define iswgraph(wc)  Ri18n_iswctype(wc, Ri18n_wctype("graph"))
	#  undef iswblank
	#  define iswblank(wc)  Ri18n_iswctype(wc, Ri18n_wctype("blank"))
	#  undef iswcntrl
	#  define iswcntrl(wc)  Ri18n_iswctype(wc, Ri18n_wctype("cntrl"))
	#  undef iswpunct
	#  define iswpunct(wc)  Ri18n_iswctype(wc, Ri18n_wctype("punct"))
	#  undef iswalnum
	#  define iswalnum(wc)  Ri18n_iswctype(wc, Ri18n_wctype("alnum"))
	#  undef wctype
	#  define wctype(name)  Ri18n_wctype(name)
	#  undef iswctype
	#  define iswctype(wc,desc) Ri18n_iswctype(wc,desc)
	# endif /* !IN_RLOCALE_C */

	#endif /* USE_RI18N_FNS */

	/* Declared only when USE_RI18N_CASE is defined.
	 * NOTE(review): by their names these are R's substitutes for
	 * towlower()/towupper() working on R_wchar_t -- confirm in rlocale.c. */
	#if defined(USE_RI18N_CASE)
	R_wchar_t Ri18n_towlower(R_wchar_t wc);
	R_wchar_t Ri18n_towupper(R_wchar_t wc);
	#endif


	/* UTF-16 surrogate range limits and predicates. These definitions are
	* from winnls.h in MinGW-W64. We don't need the rest of that file. */

	#define HIGH_SURROGATE_START 0xd800
	#define HIGH_SURROGATE_END 0xdbff
	#define LOW_SURROGATE_START 0xdc00
	#define LOW_SURROGATE_END 0xdfff

	/* IS_HIGH_SURROGATE(wch) and IS_LOW_SURROGATE(wch) test whether wch lies
	* in the inclusive range START..END defined above; IS_SURROGATE_PAIR(hs, ls)
	* is true when hs is a high surrogate and ls is a low one.
	*
	* The first two of these definitions use the argument twice, which is
	* bad, but we include them here in the original form for consistency
	* with Mingw_w64. Users should be careful that evaluating the
	* argument doesn't result in side effects.
	* NOTE(review): keeping the text identical to winnls.h presumably also
	* avoids macro-redefinition diagnostics if that header is included too.
	*/

	#define IS_HIGH_SURROGATE(wch) (((wch) >= HIGH_SURROGATE_START) && ((wch) <= HIGH_SURROGATE_END))
	#define IS_LOW_SURROGATE(wch) (((wch) >= LOW_SURROGATE_START) && ((wch) <= LOW_SURROGATE_END))
	#define IS_SURROGATE_PAIR(hs, ls) (IS_HIGH_SURROGATE (hs) && IS_LOW_SURROGATE (ls))

	/* The short names below are remapped to their Rf_-prefixed symbols. */
	# define utf8toucs32 Rf_utf8toucs32
	/* NOTE(review): presumably combines the surrogate `high` with the UTF-8
	 * sequence at `s` into one code point -- confirm against the definition. */
	R_wchar_t utf8toucs32(wchar_t high, const char *s);

	/* Convert strings UTF-8 <-> UCS-4, the latter stored as R_wchar_t
	 * (unsigned int on Windows, wchar_t elsewhere).  Both string arguments
	 * are pointers to buffers, not scalars. */
	# define utf8towcs4 Rf_utf8towcs4
	size_t utf8towcs4(R_wchar_t *wc, const char *s, size_t n);
	#define wcs4toutf8 Rf_wcs4toutf8
	size_t wcs4toutf8(char *s, const R_wchar_t *wc, size_t n);

	#endif /* R_LOCALE_H */