/* mingw/gettext/gettext-tools/src/filter-sr-latin.c - kiwivm - Git at Google */

 /* Recode Serbian text from Cyrillic to Latin script.
    Copyright (C) 2006-2007, 2009 Free Software Foundation, Inc.
    Written by Danilo Šegan <danilo@gnome.org>, 2006,
    and Bruno Haible <bruno@clisp.org>, 2006.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

 #ifdef HAVE_CONFIG_H
 # include <config.h>
 #endif

 /* Specification.  */
 #include "filters.h"

 #include <stdlib.h>

 #include "xalloc.h"


 /* Serbian Cyrillic to Latin transliteration table.
    Entry I holds the UTF-8 encoded Latin replacement for the code point
    U+0400 + I, so the table covers U+0400..U+04EF.  An empty string marks a
    code point that has no replacement.  No entry is longer than three bytes.
    Each row below lists eight consecutive code points; the trailing comment
    spells out the non-ASCII replacements of that row.  */
 static const char table[240][3 + 1] =
 {
   /* U+0400..U+0407 */ "\xC3\x88", "", "\xC4\x90", "", "", "", "", "", /* È Đ */
   /* U+0408..U+040F */ "J", "Lj", "Nj", "\xC4\x86", "", "\xC3\x8C", "", "D\xC5\xBE", /* Ć Ì Dž */
   /* U+0410..U+0417 */ "A", "B", "V", "G", "D", "E", "\xC5\xBD", "Z", /* Ž */
   /* U+0418..U+041F */ "I", "", "K", "L", "M", "N", "O", "P",
   /* U+0420..U+0427 */ "R", "S", "T", "U", "F", "H", "C", "\xC4\x8C", /* Č */
   /* U+0428..U+042F */ "\xC5\xA0", "", "", "", "", "", "", "", /* Š */
   /* U+0430..U+0437 */ "a", "b", "v", "g", "d", "e", "\xC5\xBE", "z", /* ž */
   /* U+0438..U+043F */ "i", "", "k", "l", "m", "n", "o", "p",
   /* U+0440..U+0447 */ "r", "s", "t", "u", "f", "h", "c", "\xC4\x8D", /* č */
   /* U+0448..U+044F */ "\xC5\xA1", "", "", "", "", "", "", "", /* š */
   /* U+0450..U+0457 */ "\xC3\xA8", "", "\xC4\x91", "", "", "", "", "", /* è đ */
   /* U+0458..U+045F */ "j", "lj", "nj", "\xC4\x87", "", "\xC3\xAC", "", "d\xC5\xBE", /* ć ì dž */
   /* U+0460..U+0467 */ "", "", "", "", "", "", "", "",
   /* U+0468..U+046F */ "", "", "", "", "", "", "", "",
   /* U+0470..U+0477 */ "", "", "", "", "", "", "", "",
   /* U+0478..U+047F */ "", "", "", "", "", "", "", "",
   /* U+0480..U+0487 */ "", "", "", "", "", "", "", "",
   /* U+0488..U+048F */ "", "", "", "", "", "", "", "",
   /* U+0490..U+0497 */ "", "", "", "", "", "", "", "",
   /* U+0498..U+049F */ "", "", "", "", "", "", "", "",
   /* U+04A0..U+04A7 */ "", "", "", "", "", "", "", "",
   /* U+04A8..U+04AF */ "", "", "", "", "", "", "", "",
   /* U+04B0..U+04B7 */ "", "", "", "", "", "", "", "",
   /* U+04B8..U+04BF */ "", "", "", "", "", "", "", "",
   /* U+04C0..U+04C7 */ "", "", "", "", "", "", "", "",
   /* U+04C8..U+04CF */ "", "", "", "", "", "", "", "",
   /* U+04D0..U+04D7 */ "", "", "", "", "", "", "", "",
   /* U+04D8..U+04DF */ "", "", "", "", "", "", "", "",
   /* U+04E0..U+04E7 */ "", "", "\xC4\xAA", "\xC4\xAB", "", "", "", "", /* Ī ī */
   /* U+04E8..U+04EF */ "", "", "", "", "", "", "\xC5\xAA", "\xC5\xAB" /* Ū ū */
 };

 /* Nonzero iff BYTE is an ASCII uppercase letter (U+0041..U+005A).
    BYTE must lie in the range 0..UCHAR_MAX; it is evaluated exactly once.
    The subtraction is done in unsigned arithmetic, so bytes below 'A' wrap
    around to large values and a single comparison covers both bounds.  */
 #define IS_UPPERCASE_LATIN(byte) \
   ((unsigned int) (byte) - 'A' < (unsigned int) ('Z' - 'A' + 1))

 /* Nonzero iff the two bytes BYTE1 BYTE2 are the UTF-8 encoding of an
    uppercase Cyrillic character in U+0400..U+042F (\xD0\x80..\xD0\xAF),
    or of exactly U+04E2 (\xD3\xA2) or U+04EE (\xD3\xAE).
    Both arguments must lie in the range 0..UCHAR_MAX.  */
 #define IS_UPPERCASE_CYRILLIC(byte1,byte2) \
   ((byte1) == 0xd0 \
    ? (unsigned int) (byte2) - 0x80 < 0x30 \
    : (byte1) == 0xd3 && ((byte2) == 0xa2 || (byte2) == 0xae))

 /* Transliterate the Serbian Cyrillic letters in INPUT to Latin script.

    INPUT points to INPUT_LEN bytes that are assumed to be UTF-8 encoded; no
    NUL terminator is required.  Each two-byte sequence that decodes to a code
    point in U+0400..U+04EF with a non-empty entry in 'table' is replaced by
    that entry; every other byte is copied through unchanged.

    U+0409, U+040A and U+040F have a mixed-case table entry ("Lj", "Nj",
    "Dž").  They are written fully uppercase ("LJ", "NJ", "DŽ") instead when
    the character immediately after or immediately before them in INPUT is
    uppercase according to IS_UPPERCASE_LATIN or IS_UPPERCASE_CYRILLIC.

    The result is stored in *OUTPUT_P, a buffer obtained from XNMALLOC and
    possibly shrunk with xrealloc, and its length in bytes is stored in
    *OUTPUT_LEN_P.  No NUL terminator is appended.  */
 void
 serbian_to_latin (const char *input, size_t input_len,
                   char **output_p, size_t *output_len_p)
 {
   /* Every 2-byte input sequence produces at most 3 output bytes and every
      other input byte produces exactly one, so the output never needs more
      than 1.5 * input_len bytes.  */
   size_t allocated = input_len + (input_len >> 1);
   char *output = XNMALLOC (allocated, char);

   const char *input_end = input + input_len;
   const char *ip = input;
   char *op = output;
   size_t output_len;

   while (ip < input_end)
     {
       unsigned char byte = (unsigned char) *ip;
       /* Number of input bytes before ip, and from ip to the end.  All bounds
          checks below compare these counts instead of forming pointers such
          as ip + 3, which may lie beyond one-past-the-end of the input and
          would then be undefined.  */
       size_t before = ip - input;
       size_t remaining = input_end - ip;
       /* Replacement for the character at ip, or NULL to copy one byte.  */
       const char *repl = NULL;

       /* Test for the first byte of a Cyrillic character.  Since we assume
          UTF-8 encoding, the bytes \xD0..\xD3 can only occur at the beginning
          of a character.  */
       if (byte >= 0xd0 && byte <= 0xd3 && remaining >= 2)
         {
           unsigned char second_byte = (unsigned char) ip[1];

           /* A continuation byte is in the range \x80..\xBF.  */
           if (second_byte >= 0x80 && second_byte < 0xc0)
             {
               unsigned int uc = ((byte & 0x1f) << 6) | (second_byte & 0x3f);

               if (uc >= 0x0400 && uc <= 0x04ef
                   && table[uc - 0x0400][0] != '\0')
                 {
                   repl = table[uc - 0x0400];

                   /* The digraph letters take the all-uppercase form when a
                      neighbouring character is uppercase.  */
                   if ((uc == 0x0409 || uc == 0x040a || uc == 0x040f)
                       && ((remaining > 2
                            && IS_UPPERCASE_LATIN ((unsigned char) ip[2]))
                           || (remaining > 3
                               && IS_UPPERCASE_CYRILLIC ((unsigned char) ip[2],
                                                         (unsigned char) ip[3]))
                           || (before >= 1
                               && IS_UPPERCASE_LATIN ((unsigned char) ip[-1]))
                           || (before >= 2
                               && IS_UPPERCASE_CYRILLIC ((unsigned char) ip[-2],
                                                         (unsigned char) ip[-1]))))
                     {
                       /* Use the upper-case replacement instead of
                          the mixed-case replacement.  */
                       switch (uc)
                         {
                         case 0x0409:
                           repl = "LJ";
                           break;
                         case 0x040a:
                           repl = "NJ";
                           break;
                         case 0x040f:
                           repl = "D\xC5\xBD"; /* "DŽ" */
                           break;
                         default:
                           abort ();
                         }
                     }
                 }
             }
         }

       if (repl == NULL)
         {
           /* Not a character with a replacement: copy a single byte.  */
           *op++ = *ip++;
           continue;
         }

       /* Emit the replacement.  All replacements have at most 3 bytes; a
          longer one would invalidate the size computation above.  */
       {
         size_t k;

         for (k = 0; repl[k] != '\0'; k++)
           {
             if (k == 3)
               abort ();
             *op++ = repl[k];
           }
       }
       ip += 2;
     }

   output_len = op - output;

   /* Verify that the allocated size was not exceeded.  */
   if (output_len > allocated)
     abort ();
   /* Shrink the result.  */
   if (output_len < allocated)
     output = (char *) xrealloc (output, output_len);

   /* Done.  */
   *output_p = output;
   *output_len_p = output_len;
 }
	/* Recode Serbian text from Cyrillic to Latin script.
	Copyright (C) 2006-2007, 2009 Free Software Foundation, Inc.
	Written by Danilo Šegan <danilo@gnome.org>, 2006,
	and Bruno Haible <bruno@clisp.org>, 2006.

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation; either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see <https://www.gnu.org/licenses/>. */

	#ifdef HAVE_CONFIG_H
	# include <config.h>
	#endif

	/* Specification. */
	#include "filters.h"

	#include <stdlib.h>

	#include "xalloc.h"


	/* Serbian Cyrillic to Latin transliteration table.
	   Entry I holds the UTF-8 encoded Latin replacement for the code point
	   U+0400 + I, so the table covers U+0400..U+04EF.  An empty string marks a
	   code point that has no replacement.  No entry is longer than three bytes.
	   Each row below lists eight consecutive code points; the trailing comment
	   spells out the non-ASCII replacements of that row.  */
	static const char table[240][3 + 1] =
	{
	  /* U+0400..U+0407 */ "\xC3\x88", "", "\xC4\x90", "", "", "", "", "", /* È Đ */
	  /* U+0408..U+040F */ "J", "Lj", "Nj", "\xC4\x86", "", "\xC3\x8C", "", "D\xC5\xBE", /* Ć Ì Dž */
	  /* U+0410..U+0417 */ "A", "B", "V", "G", "D", "E", "\xC5\xBD", "Z", /* Ž */
	  /* U+0418..U+041F */ "I", "", "K", "L", "M", "N", "O", "P",
	  /* U+0420..U+0427 */ "R", "S", "T", "U", "F", "H", "C", "\xC4\x8C", /* Č */
	  /* U+0428..U+042F */ "\xC5\xA0", "", "", "", "", "", "", "", /* Š */
	  /* U+0430..U+0437 */ "a", "b", "v", "g", "d", "e", "\xC5\xBE", "z", /* ž */
	  /* U+0438..U+043F */ "i", "", "k", "l", "m", "n", "o", "p",
	  /* U+0440..U+0447 */ "r", "s", "t", "u", "f", "h", "c", "\xC4\x8D", /* č */
	  /* U+0448..U+044F */ "\xC5\xA1", "", "", "", "", "", "", "", /* š */
	  /* U+0450..U+0457 */ "\xC3\xA8", "", "\xC4\x91", "", "", "", "", "", /* è đ */
	  /* U+0458..U+045F */ "j", "lj", "nj", "\xC4\x87", "", "\xC3\xAC", "", "d\xC5\xBE", /* ć ì dž */
	  /* U+0460..U+0467 */ "", "", "", "", "", "", "", "",
	  /* U+0468..U+046F */ "", "", "", "", "", "", "", "",
	  /* U+0470..U+0477 */ "", "", "", "", "", "", "", "",
	  /* U+0478..U+047F */ "", "", "", "", "", "", "", "",
	  /* U+0480..U+0487 */ "", "", "", "", "", "", "", "",
	  /* U+0488..U+048F */ "", "", "", "", "", "", "", "",
	  /* U+0490..U+0497 */ "", "", "", "", "", "", "", "",
	  /* U+0498..U+049F */ "", "", "", "", "", "", "", "",
	  /* U+04A0..U+04A7 */ "", "", "", "", "", "", "", "",
	  /* U+04A8..U+04AF */ "", "", "", "", "", "", "", "",
	  /* U+04B0..U+04B7 */ "", "", "", "", "", "", "", "",
	  /* U+04B8..U+04BF */ "", "", "", "", "", "", "", "",
	  /* U+04C0..U+04C7 */ "", "", "", "", "", "", "", "",
	  /* U+04C8..U+04CF */ "", "", "", "", "", "", "", "",
	  /* U+04D0..U+04D7 */ "", "", "", "", "", "", "", "",
	  /* U+04D8..U+04DF */ "", "", "", "", "", "", "", "",
	  /* U+04E0..U+04E7 */ "", "", "\xC4\xAA", "\xC4\xAB", "", "", "", "", /* Ī ī */
	  /* U+04E8..U+04EF */ "", "", "", "", "", "", "\xC5\xAA", "\xC5\xAB" /* Ū ū */
	};

	/* Nonzero iff BYTE is an ASCII uppercase letter (U+0041..U+005A).
	   BYTE must lie in the range 0..UCHAR_MAX; it is evaluated exactly once.
	   The subtraction is done in unsigned arithmetic, so bytes below 'A' wrap
	   around to large values and a single comparison covers both bounds.  */
	#define IS_UPPERCASE_LATIN(byte) \
	  ((unsigned int) (byte) - 'A' < (unsigned int) ('Z' - 'A' + 1))

	/* Nonzero iff the two bytes BYTE1 BYTE2 are the UTF-8 encoding of an
	   uppercase Cyrillic character in U+0400..U+042F (\xD0\x80..\xD0\xAF),
	   or of exactly U+04E2 (\xD3\xA2) or U+04EE (\xD3\xAE).
	   Both arguments must lie in the range 0..UCHAR_MAX.  */
	#define IS_UPPERCASE_CYRILLIC(byte1,byte2) \
	  ((byte1) == 0xd0 \
	   ? (unsigned int) (byte2) - 0x80 < 0x30 \
	   : (byte1) == 0xd3 && ((byte2) == 0xa2 || (byte2) == 0xae))

	/* Transliterate the Serbian Cyrillic letters in INPUT to Latin script.

	   INPUT points to INPUT_LEN bytes that are assumed to be UTF-8 encoded; no
	   NUL terminator is required.  Each two-byte sequence that decodes to a code
	   point in U+0400..U+04EF with a non-empty entry in 'table' is replaced by
	   that entry; every other byte is copied through unchanged.

	   U+0409, U+040A and U+040F have a mixed-case table entry ("Lj", "Nj",
	   "Dž").  They are written fully uppercase ("LJ", "NJ", "DŽ") instead when
	   the character immediately after or immediately before them in INPUT is
	   uppercase according to IS_UPPERCASE_LATIN or IS_UPPERCASE_CYRILLIC.

	   The result is stored in *OUTPUT_P, a buffer obtained from XNMALLOC and
	   possibly shrunk with xrealloc, and its length in bytes is stored in
	   *OUTPUT_LEN_P.  No NUL terminator is appended.  */
	void
	serbian_to_latin (const char *input, size_t input_len,
	                  char **output_p, size_t *output_len_p)
	{
	  /* Every 2-byte input sequence produces at most 3 output bytes and every
	     other input byte produces exactly one, so the output never needs more
	     than 1.5 * input_len bytes.  */
	  size_t allocated = input_len + (input_len >> 1);
	  char *output = XNMALLOC (allocated, char);

	  const char *input_end = input + input_len;
	  const char *ip = input;
	  char *op = output;
	  size_t output_len;

	  while (ip < input_end)
	    {
	      unsigned char byte = (unsigned char) *ip;
	      /* Number of input bytes before ip, and from ip to the end.  All bounds
	         checks below compare these counts instead of forming pointers such
	         as ip + 3, which may lie beyond one-past-the-end of the input and
	         would then be undefined.  */
	      size_t before = ip - input;
	      size_t remaining = input_end - ip;
	      /* Replacement for the character at ip, or NULL to copy one byte.  */
	      const char *repl = NULL;

	      /* Test for the first byte of a Cyrillic character.  Since we assume
	         UTF-8 encoding, the bytes \xD0..\xD3 can only occur at the beginning
	         of a character.  */
	      if (byte >= 0xd0 && byte <= 0xd3 && remaining >= 2)
	        {
	          unsigned char second_byte = (unsigned char) ip[1];

	          /* A continuation byte is in the range \x80..\xBF.  */
	          if (second_byte >= 0x80 && second_byte < 0xc0)
	            {
	              unsigned int uc = ((byte & 0x1f) << 6) | (second_byte & 0x3f);

	              if (uc >= 0x0400 && uc <= 0x04ef
	                  && table[uc - 0x0400][0] != '\0')
	                {
	                  repl = table[uc - 0x0400];

	                  /* The digraph letters take the all-uppercase form when a
	                     neighbouring character is uppercase.  */
	                  if ((uc == 0x0409 || uc == 0x040a || uc == 0x040f)
	                      && ((remaining > 2
	                           && IS_UPPERCASE_LATIN ((unsigned char) ip[2]))
	                          || (remaining > 3
	                              && IS_UPPERCASE_CYRILLIC ((unsigned char) ip[2],
	                                                        (unsigned char) ip[3]))
	                          || (before >= 1
	                              && IS_UPPERCASE_LATIN ((unsigned char) ip[-1]))
	                          || (before >= 2
	                              && IS_UPPERCASE_CYRILLIC ((unsigned char) ip[-2],
	                                                        (unsigned char) ip[-1]))))
	                    {
	                      /* Use the upper-case replacement instead of
	                         the mixed-case replacement.  */
	                      switch (uc)
	                        {
	                        case 0x0409:
	                          repl = "LJ";
	                          break;
	                        case 0x040a:
	                          repl = "NJ";
	                          break;
	                        case 0x040f:
	                          repl = "D\xC5\xBD"; /* "DŽ" */
	                          break;
	                        default:
	                          abort ();
	                        }
	                    }
	                }
	            }
	        }

	      if (repl == NULL)
	        {
	          /* Not a character with a replacement: copy a single byte.  */
	          *op++ = *ip++;
	          continue;
	        }

	      /* Emit the replacement.  All replacements have at most 3 bytes; a
	         longer one would invalidate the size computation above.  */
	      {
	        size_t k;

	        for (k = 0; repl[k] != '\0'; k++)
	          {
	            if (k == 3)
	              abort ();
	            *op++ = repl[k];
	          }
	      }
	      ip += 2;
	    }

	  output_len = op - output;

	  /* Verify that the allocated size was not exceeded.  */
	  if (output_len > allocated)
	    abort ();
	  /* Shrink the result.  */
	  if (output_len < allocated)
	    output = (char *) xrealloc (output, output_len);

	  /* Done.  */
	  *output_p = output;
	  *output_len_p = output_len;
	}