mingw/gettext/gettext-tools/src/read-properties.c - kiwivm - Git at Google

 /* Reading Java .properties files.
    Copyright (C) 2003, 2005-2007, 2009, 2018, 2020 Free Software Foundation, Inc.
    Written by Bruno Haible <bruno@clisp.org>, 2003.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

 #ifdef HAVE_CONFIG_H
 # include <config.h>
 #endif

 /* Specification.  */
 #include "read-properties.h"

 #include <assert.h>
 #include <errno.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 #include "error.h"
 #include "error-progname.h"
 #include "message.h"
 #include "read-catalog-abstract.h"
 #include "xalloc.h"
 #include "xvasprintf.h"
 #include "po-xerror.h"
 #include "msgl-ascii.h"
 #include "read-file.h"
 #include "unistr.h"
 #include "gettext.h"

 #define _(str) gettext (str)

 /* For compiling this file in C++ mode.  */
 #ifdef __cplusplus
 # define this thiss
 #endif


 /* The format of the Java .properties files is documented in the JDK
    documentation for class java.util.Properties.  In the case of .properties
    files for PropertyResourceBundle, each non-comment line contains a
    key/value pair in the form "key = value" or "key : value" or "key value",
    where the key is the msgid and the value is the msgstr.  Messages with
    plurals are not supported in this format.

    The encoding of Java .properties files is:
      - ASCII with Java \uxxxx escape sequences,
      - ISO-8859-1 if non-ASCII bytes are encountered,
      - UTF-8 if non-ASCII bytes are encountered and the entire file is
        valid UTF-8 (in Java 9 or newer), see
        https://docs.oracle.com/javase/9/intl/internationalization-enhancements-jdk-9.htm */

 /* Handling of comments: We copy all comments from the .properties file to
    the PO file. This is not really needed; it's a service for translators
    who don't like PO files and prefer to maintain the .properties file.  */

 /* Real filename, used in error messages about the input file.  */
 static const char *real_file_name;

 /* File name and line number.  */
 extern lex_pos_ty gram_pos;

 /* The contents of the input file.  */
 static char *contents;
 static size_t contents_length;

 /* True if the input file is assumed to be in UTF-8 encoding.
    False if it is assumed to be in ISO-8859-1 encoding.  */
 static bool assume_utf8;

 /* Current position in contents.  */
 static size_t position;

 /* Phase 1: Read an input byte.
    Max. 1 pushback byte.  */

 /* Returns the next byte of the in-memory file contents as a value in
    0..255 and advances the read position, or returns EOF once every byte
    has been consumed.  */
 static int
 phase1_getc ()
 {
   unsigned char byte;

   if (position == contents_length)
     return EOF;

   byte = (unsigned char) contents[position];
   position++;
   return byte;
 }

 /* Undoes the most recent phase1_getc call by stepping the read position
    back by one byte.  Pushing back EOF is a no-op, because reading EOF did
    not advance the position.  */
 static inline void
 phase1_ungetc (int c)
 {
   if (c == EOF)
     return;

   position -= 1;
 }


 /* Phase 2: Read an input byte, treating CR/LF like a single LF.
    Max. 2 pushback bytes.  */

 static unsigned char phase2_pushback[2];
 static int phase2_pushback_length;

 /* Returns the next input byte, or EOF.  Bytes pushed back through
    phase2_ungetc are served first, most recently pushed first.  Otherwise a
    byte is taken from phase 1, and a CR immediately followed by LF is
    delivered as a single LF.  Every LF that is returned increments
    gram_pos.line_number.  */
 static int
 phase2_getc ()
 {
   int ch;

   if (phase2_pushback_length != 0)
     {
       phase2_pushback_length--;
       ch = phase2_pushback[phase2_pushback_length];
     }
   else
     {
       ch = phase1_getc ();
       if (ch == '\r')
         {
           /* Look one byte ahead: CR/LF collapses to LF, while a CR that is
              not followed by LF is returned unchanged.  */
           int next = phase1_getc ();

           if (next != '\n')
             phase1_ungetc (next);
           else
             ch = next;
         }
     }

   if (ch == '\n')
     gram_pos.line_number += 1;

   return ch;
 }

 /* Pushes C back so that the next phase2_getc call returns it again.
    EOF is never stored.  Pushing back an LF also takes back the line number
    increment that reading it caused.  */
 static void
 phase2_ungetc (int c)
 {
   if (c == EOF)
     return;

   if (c == '\n')
     gram_pos.line_number -= 1;

   phase2_pushback[phase2_pushback_length] = c;
   phase2_pushback_length++;
 }


 /* Phase 3: Read an input byte, treating CR/LF like a single LF,
    with handling of continuation lines.
    Max. 1 pushback character.  */

 /* Returns the next input byte after joining continuation lines: a
    backslash directly followed by a newline is dropped together with the
    whitespace (space, tab, CR, form feed) that starts the next line.
    A backslash followed by anything else is returned as '\\', and the byte
    after it is left for the next read.  */
 static int
 phase3_getc ()
 {
   int ch = phase2_getc ();

   while (ch == '\\')
     {
       int next = phase2_getc ();

       if (next != '\n')
         {
           /* Not a line continuation.  */
           phase2_ungetc (next);
           return '\\';
         }

       /* Backslash-newline: swallow it and the leading whitespace of the
          continuation line, then examine the first byte that remains.  */
       do
         ch = phase2_getc ();
       while (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\f');
     }

   return ch;
 }

 /* Pushes back one byte previously returned by phase3_getc.  Phase 3 keeps
    no pushback storage of its own; the byte is handed to phase2_ungetc.  */
 static inline void
 phase3_ungetc (int c)
 {
   phase2_ungetc (c);
 }


 /* Converts STRING from ISO-8859-1 encoding to UTF-8 encoding.
    If STRING consists of ASCII characters only, STRING itself is returned.
    Otherwise the result is a freshly allocated string and STRING is left
    untouched.  */
 static char *
 conv_from_iso_8859_1 (char *string)
 {
   size_t length;
   unsigned char *utf8_string;
   unsigned char *out;
   size_t i;

   if (is_ascii_string (string))
     return string;

   length = strlen (string);
   /* An ISO-8859-1 character occupies at most 2 bytes in UTF-8.  */
   utf8_string = XNMALLOC (2 * length + 1, unsigned char);
   out = utf8_string;

   for (i = 0; i < length; i++)
     {
       unsigned int uc = (unsigned char) string[i];
       int n = u8_uctomb (out, uc, 6);
       assert (n > 0);
       out += n;
     }
   *out = '\0';
   assert (out - utf8_string <= 2 * length);

   return (char *) utf8_string;
 }


 /* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8
    encoding.  The conversion is done in place: STRING is overwritten and
    returned.

    A \uxxxx escape with four hexadecimal digits is replaced by the UTF-8
    encoding of that character.  A high surrogate escape directly followed
    by a low surrogate escape is combined into one character.  Everything
    that cannot be converted is copied through unchanged: malformed escapes,
    unpaired high or low surrogates, and any code that u8_uctomb rejects.  */
 static char *
 conv_from_java (char *string)
 {
   /* This conversion can only shrink the string, never increase its size:
      6 input bytes become at most 3 output bytes, 12 become 4.
      So there is no need to xmalloc the result freshly.  */
   const char *p = string;
   unsigned char *q = (unsigned char *) string;

   while (*p != '\0')
     {
       if (p[0] == '\\' && p[1] == 'u')
         {
           unsigned int n = 0;
           unsigned int uc;
           int consumed;
           int len;
           int i;

           /* The digit loops stop at the first non-hex byte, which includes
              the terminating NUL, so they never read past the string.  */
           for (i = 0; i < 4; i++)
             {
               int c1 = (unsigned char) p[2 + i];

               if (c1 >= '0' && c1 <= '9')
                 n = (n << 4) + (c1 - '0');
               else if (c1 >= 'A' && c1 <= 'F')
                 n = (n << 4) + (c1 - 'A' + 10);
               else if (c1 >= 'a' && c1 <= 'f')
                 n = (n << 4) + (c1 - 'a' + 10);
               else
                 goto just_one_byte;
             }

           if (n >= 0xd800 && n < 0xdc00)
             {
               /* High surrogate: usable only together with a directly
                  following low surrogate escape.  */
               unsigned int m = 0;

               if (!(p[6] == '\\' && p[7] == 'u'))
                 goto just_one_byte;

               for (i = 0; i < 4; i++)
                 {
                   int c1 = (unsigned char) p[8 + i];

                   if (c1 >= '0' && c1 <= '9')
                     m = (m << 4) + (c1 - '0');
                   else if (c1 >= 'A' && c1 <= 'F')
                     m = (m << 4) + (c1 - 'A' + 10);
                   else if (c1 >= 'a' && c1 <= 'f')
                     m = (m << 4) + (c1 - 'a' + 10);
                   else
                     goto just_one_byte;
                 }

               if (!(m >= 0xdc00 && m < 0xe000))
                 goto just_one_byte;

               /* Combine two UTF-16 words to a character.  */
               uc = 0x10000 + ((n - 0xd800) << 10) + (m - 0xdc00);
               consumed = 12;
             }
           else if (n >= 0xdc00 && n < 0xe000)
             /* Unpaired low surrogate.  It has no UTF-8 encoding, so treat
                it like an unpaired high surrogate and keep the text.  */
             goto just_one_byte;
           else
             {
               uc = n;
               consumed = 6;
             }

           /* Never advance q by a non-positive result: u8_uctomb signals
              failure with a negative value, and adding that would move the
              output pointer backwards.  */
           len = u8_uctomb (q, uc, 6);
           if (len <= 0)
             goto just_one_byte;
           q += len;
           p += consumed;
           continue;
         }
     just_one_byte:
       *q++ = (unsigned char) *p++;
     }
   *q = '\0';
   return string;
 }


 /* Phase 4: Read the next single byte or UTF-16 code point,
    treating CR/LF like a single LF, with handling of continuation lines
    and of \uxxxx sequences.  */

 /* Return value of phase 4 when EOF is reached.  */
 #define P4_EOF 0xffff

 /* Convert an UTF-16 code point to a return value that can be distinguished
    from a single-byte return value.  */
 #define UNICODE(code) (0x10000 + (code))

 /* Test a return value of phase 4 whether it designates an UTF-16 code
    point.  */
 #define IS_UNICODE(p4_result) ((p4_result) >= 0x10000)

 /* Extract the UTF-16 code of a return value that satisfies IS_UNICODE.  */
 #define UTF16_VALUE(p4_result) ((p4_result) - 0x10000)

 /* Reads the next element of a key or value and returns
      - P4_EOF at the end of the input, including the case of a backslash
        that is the very last byte of the input,
      - UNICODE (n) for a \uxxxx escape with four hexadecimal digits,
      - otherwise a single byte: the control character denoted by \t, \n,
        \r or \f, the byte that follows any other backslash, or the
        unescaped byte itself.
    For a malformed \u escape an error is reported and 'u' is returned.
    The hexadecimal digits consumed up to that point are dropped; only the
    offending byte is pushed back.  */
 static int
 phase4_getuc ()
 {
   int c = phase3_getc ();
   int c2;

   if (c == EOF)
     return P4_EOF;
   if (c != '\\')
     return c;

   c2 = phase3_getc ();

   /* A dangling backslash at the end of the input must not leak EOF (-1)
      to the caller, which tests only for P4_EOF and would otherwise treat
      the value as a data byte.  */
   if (c2 == EOF)
     return P4_EOF;

   if (c2 == 't')
     return '\t';
   if (c2 == 'n')
     return '\n';
   if (c2 == 'r')
     return '\r';
   if (c2 == 'f')
     return '\f';
   if (c2 == 'u')
     {
       unsigned int n = 0;
       int i;

       for (i = 0; i < 4; i++)
         {
           int c1 = phase3_getc ();

           if (c1 >= '0' && c1 <= '9')
             n = (n << 4) + (c1 - '0');
           else if (c1 >= 'A' && c1 <= 'F')
             n = (n << 4) + (c1 - 'A' + 10);
           else if (c1 >= 'a' && c1 <= 'f')
             n = (n << 4) + (c1 - 'a' + 10);
           else
             {
               phase3_ungetc (c1);
               po_xerror (PO_SEVERITY_ERROR, NULL,
                          real_file_name, gram_pos.line_number, (size_t)(-1),
                          false, _("warning: invalid \\uxxxx syntax for Unicode character"));
               return 'u';
             }
         }
       return UNICODE (n);
     }

   /* Any other escaped byte stands for itself.  */
   return c2;
 }


 /* Reads a key or value string.
    Returns the string in UTF-8 encoding, or NULL if the end of the logical
    line is reached.
    Parsing ends:
      - when returning NULL, after the end of the logical line,
      - otherwise, if in_key is true, after the whitespace and possibly the
        separator that follows after the string,
      - otherwise, if in_key is false, after the end of the logical line. */

 static char *
 read_escaped_string (bool in_key)
 {
   /* The part of the string that has already been converted to UTF-8.
      The buffer is static, so its allocation is reused from one call to the
      next; the string handed to the caller is a separate copy.  */
   static unsigned char *utf8_buffer;
   static size_t utf8_buflen;
   static size_t utf8_allocated;
   /* The first half of an UTF-16 surrogate character, or 0 if no high
      surrogate is pending.  */
   unsigned short utf16_surr;
   /* Line in which this surrogate character occurred.  */
   size_t utf16_surr_line;

   /* Ensures utf8_buffer has room for N bytes.  N must be <= 10, because
      one growth step is only guaranteed to add 10 bytes.  */
   #define utf8_buffer_ensure_available(n)  \
     do                                                                        \
       {                                                                       \
         if (utf8_buflen + (n) > utf8_allocated)                               \
           {                                                                   \
             utf8_allocated = 2 * utf8_allocated + 10;                         \
             utf8_buffer =                                                     \
               (unsigned char *) xrealloc (utf8_buffer, utf8_allocated);       \
           }                                                                   \
       }                                                                       \
     while (0)

   /* Reports a lone surrogate UC seen in line LINE and appends the UTF-8
      encoding of U+FFFD (bytes EF BF BD) to utf8_buffer in its place.  */
   /* Note: A half surrogate is invalid in UTF-8:
      - RFC 3629 says
          "The definition of UTF-8 prohibits encoding character
           numbers between U+D800 and U+DFFF".
      - Unicode 4.0 chapter 3
        <https://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
        section 3.9, p.77, says
          "Because surrogate code points are not Unicode scalar
           values, any UTF-8 byte sequence that would otherwise
           map to code points D800..DFFF is ill-formed."
        and in table 3-6, p. 78, does not mention D800..DFFF.
      - The unicode.org FAQ question "How do I convert an unpaired
        UTF-16 surrogate to UTF-8?" has the answer
          "By representing such an unpaired surrogate on its own
           as a 3-byte sequence, the resulting UTF-8 data stream
           would become ill-formed."
      So use U+FFFD instead.  */
   #define utf8_buffer_append_lone_surrogate(uc, line) \
     do                                                                        \
       {                                                                       \
         error_with_progname = false;                                          \
         po_xerror (PO_SEVERITY_ERROR, NULL,                                   \
                    real_file_name, (line), (size_t)(-1), false,               \
                    xasprintf (_("warning: lone surrogate U+%04X"), (uc)));    \
         error_with_progname = true;                                           \
         utf8_buffer_ensure_available (3);                                     \
         utf8_buffer[utf8_buflen++] = 0xef;                                    \
         utf8_buffer[utf8_buflen++] = 0xbf;                                    \
         utf8_buffer[utf8_buflen++] = 0xbd;                                    \
       }                                                                       \
     while (0)

   int c;

   /* Skip whitespace before the string.  */
   do
     c = phase3_getc ();
   while (c == ' ' || c == '\t' || c == '\r' || c == '\f');

   if (c == EOF || c == '\n')
     /* Empty string.  */
     return NULL;

   /* Start accumulating the string.  */
   utf8_buflen = 0;
   utf16_surr = 0;
   utf16_surr_line = 0;
   for (;;)
     {
       /* At this point c is a raw phase 3 byte that has not been decoded
          yet.  In a key, an unescaped separator or whitespace ends it.  */
       if (in_key && (c == '=' || c == ':'
                      || c == ' ' || c == '\t' || c == '\r' || c == '\f'))
         {
           /* Skip whitespace after the string.  */
           while (c == ' ' || c == '\t' || c == '\r' || c == '\f')
             c = phase3_getc ();
           /* Skip '=' or ':' separator.  */
           if (!(c == '=' || c == ':'))
             phase3_ungetc (c);
           break;
         }

       /* c was read only to look for a terminator.  Give it back so that
          phase 4 can decode a possible escape sequence starting with it.  */
       phase3_ungetc (c);

       /* Read the next byte or UTF-16 code point.  */
       c = phase4_getuc ();
       if (c == P4_EOF)
         break;

       /* Append it to the buffer.  */
       if (IS_UNICODE (c))
         {
           /* Append an UTF-16 code point.  */
           /* Test whether this character and the previous one form a Unicode
              surrogate pair.  */
           if (utf16_surr != 0
               && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
             {
               unsigned short utf16buf[2];
               ucs4_t uc;
               int len;

               utf16buf[0] = utf16_surr;
               utf16buf[1] = UTF16_VALUE (c);
               if (u16_mbtouc (&uc, utf16buf, 2) != 2)
                 abort ();

               utf8_buffer_ensure_available (6);
               len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 6);
               if (len < 0)
                 {
                   /* Report the problem; nothing is appended for this
                      character.  */
                   error_with_progname = false;
                   po_xerror (PO_SEVERITY_ERROR, NULL,
                              real_file_name, gram_pos.line_number, (size_t)(-1),
                              false, _("warning: invalid Unicode character"));
                   error_with_progname = true;
                 }
               else
                 utf8_buflen += len;

               utf16_surr = 0;
             }
           else
             {
               /* A pending high surrogate that is not followed by a low
                  surrogate is a lone surrogate.  */
               if (utf16_surr != 0)
                 {
                   utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);
                   utf16_surr = 0;
                 }

               if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
                 {
                   /* High surrogate: remember it and wait for the next
                      element before emitting anything.  */
                   utf16_surr = UTF16_VALUE (c);
                   utf16_surr_line = gram_pos.line_number;
                 }
               else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))
                 utf8_buffer_append_lone_surrogate (UTF16_VALUE (c), gram_pos.line_number);
               else
                 {
                   ucs4_t uc = UTF16_VALUE (c);
                   int len;

                   utf8_buffer_ensure_available (3);
                   len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 3);
                   if (len < 0)
                     {
                       error_with_progname = false;
                       po_xerror (PO_SEVERITY_ERROR, NULL,
                                  real_file_name, gram_pos.line_number, (size_t)(-1),
                                  false, _("warning: invalid Unicode character"));
                       error_with_progname = true;
                     }
                   else
                     utf8_buflen += len;
                 }
             }
         }
       else
         {
           /* Append a single byte.  */
           if (utf16_surr != 0)
             {
               utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);
               utf16_surr = 0;
             }

           if (assume_utf8)
             {
               /* No conversion needed.  */
               utf8_buffer_ensure_available (1);
               utf8_buffer[utf8_buflen++] = c;
             }
           else
             {
               /* Convert the byte from ISO-8859-1 to UTF-8 on the fly.  */
               ucs4_t uc = c;
               int len;

               utf8_buffer_ensure_available (2);
               len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 2);
               if (len < 0)
                 abort ();
               utf8_buflen += len;
             }
         }

       /* Fetch the next raw byte.  An unescaped newline or the end of the
          input ends the string.  For a key the terminator is pushed back,
          so that it is still there for whatever is read next.  */
       c = phase3_getc ();
       if (c == EOF || c == '\n')
         {
           if (in_key)
             phase3_ungetc (c);
           break;
         }
     }
   /* A high surrogate that was still pending at the end is a lone one.  */
   if (utf16_surr != 0)
     utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);

   /* Return the result as a freshly allocated, NUL-terminated copy of the
      accumulated bytes.  */
   {
     unsigned char *utf8_string = XNMALLOC (utf8_buflen + 1, unsigned char);
     if (utf8_buflen > 0)
       memcpy (utf8_string, utf8_buffer, utf8_buflen);
     utf8_string[utf8_buflen] = '\0';

     return (char *) utf8_string;
   }
   #undef utf8_buffer_append_lone_surrogate
   #undef utf8_buffer_ensure_available
 }


 /* Read a .properties file from a stream, and dispatch to the various
    abstract_catalog_reader_class_ty methods.

    The whole of FILE is read into memory first.  If it is entirely valid
    UTF-8 it is treated as UTF-8, otherwise as ISO-8859-1.  Comment lines go
    to po_callback_comment_dispatcher; key/value lines go to
    po_callback_message with the key as msgid and the value as msgstr.
    REAL_FILENAME is used in error messages and as the file name of the
    reported positions.  THIS and LOGICAL_FILENAME are not used here.  */
 static void
 properties_parse (abstract_catalog_reader_ty *this, FILE *file,
                   const char *real_filename, const char *logical_filename)
 {
   /* Read the file into memory.  */
   contents = fread_file (file, 0, &contents_length);
   if (contents == NULL)
     {
       const char *errno_description = strerror (errno);
       po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
                  xasprintf ("%s: %s",
                             xasprintf (_("error while reading \"%s\""),
                                        real_filename),
                             errno_description));
       return;
     }

   /* Test whether it's valid UTF-8.  */
   assume_utf8 = (u8_check ((uint8_t *) contents, contents_length) == NULL);

   position = 0;
   real_file_name = real_filename;
   gram_pos.file_name = xstrdup (real_file_name);
   gram_pos.line_number = 1;

   for (;;)
     {
       int c;
       bool comment;
       bool hidden;

       /* Classify the line by its first byte.  */
       c = phase2_getc ();

       if (c == EOF)
         break;

       comment = false;
       hidden = false;
       if (c == '#')
         comment = true;
       else if (c == '!')
         {
           /* For compatibility with write-properties.c, we treat '!' not
              followed by space as a fuzzy or untranslated message.  */
           int c2 = phase2_getc ();
           if (c2 == ' ' || c2 == '\n' || c2 == EOF)
             comment = true;
           else
             hidden = true;
           phase2_ungetc (c2);
         }
       else
         phase2_ungetc (c);

       if (comment)
         {
           /* A comment line.  Its text is collected in a buffer that is
              reused for every comment line.  */
           static char *buffer;
           static size_t bufmax;
           static size_t buflen;

           buflen = 0;
           for (;;)
             {
               c = phase2_getc ();

               /* Grow before looking at c, so that there is always room
                  for the terminating NUL as well.  */
               if (buflen >= bufmax)
                 {
                   bufmax += 100;
                   buffer = xrealloc (buffer, bufmax);
                 }

               if (c == EOF || c == '\n')
                 break;

               buffer[buflen++] = c;
             }
           buffer[buflen] = '\0';

           {
             char *converted =
               assume_utf8 ? buffer : conv_from_iso_8859_1 (buffer);

             /* conv_from_java converts in place and returns its argument.  */
             po_callback_comment_dispatcher (conv_from_java (converted));

             /* conv_from_iso_8859_1 allocates a new string for non-ASCII
                lines.  The dispatcher cannot be keeping the pointer, since
                it is otherwise handed the reused buffer, so release the
                temporary instead of leaking it.  */
             if (converted != buffer)
               free (converted);
           }
         }
       else
         {
           /* A key/value pair.  */
           char *msgid;
           lex_pos_ty msgid_pos;

           msgid_pos = gram_pos;
           msgid = read_escaped_string (true);
           if (msgid == NULL)
             /* Skip blank line.  */
             ;
           else
             {
               char *msgstr;
               lex_pos_ty msgstr_pos;
               bool force_fuzzy;

               msgstr_pos = gram_pos;
               msgstr = read_escaped_string (false);
               if (msgstr == NULL)
                 msgstr = xstrdup ("");

               /* Be sure to make the message fuzzy if it was commented out
                  and if it is not already header/fuzzy/untranslated.  */
               force_fuzzy = (hidden && msgid[0] != '\0' && msgstr[0] != '\0');

               po_callback_message (NULL, msgid, &msgid_pos, NULL,
                                    msgstr, strlen (msgstr) + 1, &msgstr_pos,
                                    NULL, NULL, NULL,
                                    force_fuzzy, false);
             }
         }
     }

   free (contents);
   contents = NULL;
   real_file_name = NULL;
   gram_pos.line_number = 0;
 }

 /* Descriptor of the Java .properties input format: it pairs the
    properties_parse entry point with a flag stating that the parser
    produces its strings in UTF-8.  */
 const struct catalog_input_format input_format_properties =
 {
   properties_parse,                     /* parse */
   true                                  /* produces_utf8 */
 };
	/* Reading Java .properties files.
	Copyright (C) 2003, 2005-2007, 2009, 2018, 2020 Free Software Foundation, Inc.
	Written by Bruno Haible <bruno@clisp.org>, 2003.

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation; either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see <https://www.gnu.org/licenses/>. */

	#ifdef HAVE_CONFIG_H
	# include <config.h>
	#endif

	/* Specification. */
	#include "read-properties.h"

	#include <assert.h>
	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#include "error.h"
	#include "error-progname.h"
	#include "message.h"
	#include "read-catalog-abstract.h"
	#include "xalloc.h"
	#include "xvasprintf.h"
	#include "po-xerror.h"
	#include "msgl-ascii.h"
	#include "read-file.h"
	#include "unistr.h"
	#include "gettext.h"

	#define _(str) gettext (str)

	/* For compiling this file in C++ mode. */
	#ifdef __cplusplus
	# define this thiss
	#endif


	/* The format of the Java .properties files is documented in the JDK
	documentation for class java.util.Properties. In the case of .properties
	files for PropertyResourceBundle, each non-comment line contains a
	key/value pair in the form "key = value" or "key : value" or "key value",
	where the key is the msgid and the value is the msgstr. Messages with
	plurals are not supported in this format.

	The encoding of Java .properties files is:
	- ASCII with Java \uxxxx escape sequences,
	- ISO-8859-1 if non-ASCII bytes are encountered,
	- UTF-8 if non-ASCII bytes are encountered and the entire file is
	valid UTF-8 (in Java 9 or newer), see
	https://docs.oracle.com/javase/9/intl/internationalization-enhancements-jdk-9.htm */

	/* Handling of comments: We copy all comments from the .properties file to
	the PO file. This is not really needed; it's a service for translators
	who don't like PO files and prefer to maintain the .properties file. */

	/* Real filename, used in error messages about the input file. */
	static const char *real_file_name;

	/* File name and line number. */
	extern lex_pos_ty gram_pos;

	/* The contents of the input file. */
	static char *contents;
	static size_t contents_length;

	/* True if the input file is assumed to be in UTF-8 encoding.
	False if it is assumed to be in ISO-8859-1 encoding. */
	static bool assume_utf8;

	/* Current position in contents. */
	static size_t position;

	/* Phase 1: Read an input byte.
	Max. 1 pushback byte. */

	/* Returns the next byte of the in-memory file contents as a value in
	   0..255 and advances the read position, or returns EOF once every
	   byte has been consumed.  */
	static int
	phase1_getc ()
	{
	  unsigned char byte;

	  if (position == contents_length)
	    return EOF;

	  byte = (unsigned char) contents[position];
	  position++;
	  return byte;
	}

	/* Undoes the most recent phase1_getc call by stepping the read position
	   back by one byte.  Pushing back EOF is a no-op, because reading EOF
	   did not advance the position.  */
	static inline void
	phase1_ungetc (int c)
	{
	  if (c == EOF)
	    return;

	  position -= 1;
	}


	/* Phase 2: Read an input byte, treating CR/LF like a single LF.
	Max. 2 pushback bytes. */

	static unsigned char phase2_pushback[2];
	static int phase2_pushback_length;

	/* Returns the next input byte, or EOF.  Bytes pushed back through
	   phase2_ungetc are served first, most recently pushed first.
	   Otherwise a byte is taken from phase 1, and a CR immediately followed
	   by LF is delivered as a single LF.  Every LF that is returned
	   increments gram_pos.line_number.  */
	static int
	phase2_getc ()
	{
	  int ch;

	  if (phase2_pushback_length != 0)
	    {
	      phase2_pushback_length--;
	      ch = phase2_pushback[phase2_pushback_length];
	    }
	  else
	    {
	      ch = phase1_getc ();
	      if (ch == '\r')
	        {
	          /* Look one byte ahead: CR/LF collapses to LF, while a CR that
	             is not followed by LF is returned unchanged.  */
	          int next = phase1_getc ();

	          if (next != '\n')
	            phase1_ungetc (next);
	          else
	            ch = next;
	        }
	    }

	  if (ch == '\n')
	    gram_pos.line_number += 1;

	  return ch;
	}

	/* Pushes C back so that the next phase2_getc call returns it again.
	   EOF is never stored.  Pushing back an LF also takes back the line
	   number increment that reading it caused.  */
	static void
	phase2_ungetc (int c)
	{
	  if (c == EOF)
	    return;

	  if (c == '\n')
	    gram_pos.line_number -= 1;

	  phase2_pushback[phase2_pushback_length] = c;
	  phase2_pushback_length++;
	}


	/* Phase 3: Read an input byte, treating CR/LF like a single LF,
	with handling of continuation lines.
	Max. 1 pushback character. */

	/* Returns the next input byte after joining continuation lines: a
	   backslash directly followed by a newline is dropped together with the
	   whitespace (space, tab, CR, form feed) that starts the next line.
	   A backslash followed by anything else is returned as '\\', and the
	   byte after it is left for the next read.  */
	static int
	phase3_getc ()
	{
	  int c = phase2_getc ();

	  for (;;)
	    {
	      if (c != '\\')
	        return c;

	      c = phase2_getc ();
	      if (c != '\n')
	        {
	          /* Not a line continuation.  */
	          phase2_ungetc (c);
	          return '\\';
	        }

	      /* Skip the backslash-newline and all whitespace that follows it.  */
	      do
	        c = phase2_getc ();
	      while (c == ' ' || c == '\t' || c == '\r' || c == '\f');
	    }
	}

	/* Pushes back one byte previously returned by phase3_getc.  Phase 3 keeps
	   no pushback storage of its own; the byte is handed to phase2_ungetc.  */
	static inline void
	phase3_ungetc (int c)
	{
	phase2_ungetc (c);
	}


	/* Converts a string from ISO-8859-1 encoding to UTF-8 encoding. */
	static char *
	conv_from_iso_8859_1 (char *string)
	{
	if (is_ascii_string (string))
	return string;
	else
	{
	size_t length = strlen (string);
	/* Each ISO-8859-1 character needs 2 bytes at worst. */
	unsigned char utf8_string = XNMALLOC (2 length + 1, unsigned char);
	unsigned char *q = utf8_string;
	const char *str = string;
	const char *str_limit = str + length;

	while (str < str_limit)
	{
	unsigned int uc = (unsigned char) *str++;
	int n = u8_uctomb (q, uc, 6);
	assert (n > 0);
	q += n;
	}
	*q = '\0';
	assert (q - utf8_string <= 2 * length);

	return (char *) utf8_string;
	}
	}


	/* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8
	   encoding.  The conversion is done in place: STRING is overwritten and
	   returned.

	   A \uxxxx escape with four hexadecimal digits is replaced by the UTF-8
	   encoding of that character.  A high surrogate escape directly followed
	   by a low surrogate escape is combined into one character.  Everything
	   that cannot be converted is copied through unchanged: malformed
	   escapes, unpaired high or low surrogates, and any code that u8_uctomb
	   rejects.  */
	static char *
	conv_from_java (char *string)
	{
	  /* This conversion can only shrink the string, never increase its size:
	     6 input bytes become at most 3 output bytes, 12 become 4.
	     So there is no need to xmalloc the result freshly.  */
	  const char *p = string;
	  unsigned char *q = (unsigned char *) string;

	  while (*p != '\0')
	    {
	      if (p[0] == '\\' && p[1] == 'u')
	        {
	          unsigned int n = 0;
	          unsigned int uc;
	          int consumed;
	          int len;
	          int i;

	          /* The digit loops stop at the first non-hex byte, which
	             includes the terminating NUL, so they never read past the
	             string.  */
	          for (i = 0; i < 4; i++)
	            {
	              int c1 = (unsigned char) p[2 + i];

	              if (c1 >= '0' && c1 <= '9')
	                n = (n << 4) + (c1 - '0');
	              else if (c1 >= 'A' && c1 <= 'F')
	                n = (n << 4) + (c1 - 'A' + 10);
	              else if (c1 >= 'a' && c1 <= 'f')
	                n = (n << 4) + (c1 - 'a' + 10);
	              else
	                goto just_one_byte;
	            }

	          if (n >= 0xd800 && n < 0xdc00)
	            {
	              /* High surrogate: usable only together with a directly
	                 following low surrogate escape.  */
	              unsigned int m = 0;

	              if (!(p[6] == '\\' && p[7] == 'u'))
	                goto just_one_byte;

	              for (i = 0; i < 4; i++)
	                {
	                  int c1 = (unsigned char) p[8 + i];

	                  if (c1 >= '0' && c1 <= '9')
	                    m = (m << 4) + (c1 - '0');
	                  else if (c1 >= 'A' && c1 <= 'F')
	                    m = (m << 4) + (c1 - 'A' + 10);
	                  else if (c1 >= 'a' && c1 <= 'f')
	                    m = (m << 4) + (c1 - 'a' + 10);
	                  else
	                    goto just_one_byte;
	                }

	              if (!(m >= 0xdc00 && m < 0xe000))
	                goto just_one_byte;

	              /* Combine two UTF-16 words to a character.  */
	              uc = 0x10000 + ((n - 0xd800) << 10) + (m - 0xdc00);
	              consumed = 12;
	            }
	          else if (n >= 0xdc00 && n < 0xe000)
	            /* Unpaired low surrogate.  It has no UTF-8 encoding, so
	               treat it like an unpaired high surrogate and keep the
	               text.  */
	            goto just_one_byte;
	          else
	            {
	              uc = n;
	              consumed = 6;
	            }

	          /* Never advance q by a non-positive result: u8_uctomb signals
	             failure with a negative value, and adding that would move
	             the output pointer backwards.  */
	          len = u8_uctomb (q, uc, 6);
	          if (len <= 0)
	            goto just_one_byte;
	          q += len;
	          p += consumed;
	          continue;
	        }
	    just_one_byte:
	      *q++ = (unsigned char) *p++;
	    }
	  *q = '\0';
	  return string;
	}


	/* Phase 4: Read the next single byte or UTF-16 code point,
	treating CR/LF like a single LF, with handling of continuation lines
	and of \uxxxx sequences. */

	/* Return value of phase 4 when EOF is reached. */
	#define P4_EOF 0xffff

	/* Convert an UTF-16 code point to a return value that can be distinguished
	from a single-byte return value. */
	#define UNICODE(code) (0x10000 + (code))

	/* Test a return value of phase 4 whether it designates an UTF-16 code
	point. */
	#define IS_UNICODE(p4_result) ((p4_result) >= 0x10000)

	/* Extract the UTF-16 code of a return value that satisfies IS_UNICODE. */
	#define UTF16_VALUE(p4_result) ((p4_result) - 0x10000)

	/* Reads the next element of a key or value and returns
	     - P4_EOF at the end of the input, including the case of a backslash
	       that is the very last byte of the input,
	     - UNICODE (n) for a \uxxxx escape with four hexadecimal digits,
	     - otherwise a single byte: the control character denoted by \t, \n,
	       \r or \f, the byte that follows any other backslash, or the
	       unescaped byte itself.
	   For a malformed \u escape an error is reported and 'u' is returned.
	   The hexadecimal digits consumed up to that point are dropped; only the
	   offending byte is pushed back.  */
	static int
	phase4_getuc ()
	{
	  int c = phase3_getc ();
	  int c2;

	  if (c == EOF)
	    return P4_EOF;
	  if (c != '\\')
	    return c;

	  c2 = phase3_getc ();

	  /* A dangling backslash at the end of the input must not leak EOF (-1)
	     to the caller, which tests only for P4_EOF and would otherwise treat
	     the value as a data byte.  */
	  if (c2 == EOF)
	    return P4_EOF;

	  if (c2 == 't')
	    return '\t';
	  if (c2 == 'n')
	    return '\n';
	  if (c2 == 'r')
	    return '\r';
	  if (c2 == 'f')
	    return '\f';
	  if (c2 == 'u')
	    {
	      unsigned int n = 0;
	      int i;

	      for (i = 0; i < 4; i++)
	        {
	          int c1 = phase3_getc ();

	          if (c1 >= '0' && c1 <= '9')
	            n = (n << 4) + (c1 - '0');
	          else if (c1 >= 'A' && c1 <= 'F')
	            n = (n << 4) + (c1 - 'A' + 10);
	          else if (c1 >= 'a' && c1 <= 'f')
	            n = (n << 4) + (c1 - 'a' + 10);
	          else
	            {
	              phase3_ungetc (c1);
	              po_xerror (PO_SEVERITY_ERROR, NULL,
	                         real_file_name, gram_pos.line_number, (size_t)(-1),
	                         false, _("warning: invalid \\uxxxx syntax for Unicode character"));
	              return 'u';
	            }
	        }
	      return UNICODE (n);
	    }

	  /* Any other escaped byte stands for itself.  */
	  return c2;
	}


	/* Reads a key or value string.
	Returns the string in UTF-8 encoding, or NULL if the end of the logical
	line is reached.
	Parsing ends:
	- when returning NULL, after the end of the logical line,
	- otherwise, if in_key is true, after the whitespace and possibly the
	separator that follows after the string,
	- otherwise, if in_key is false, after the end of the logical line. */

	static char *
	read_escaped_string (bool in_key)
	{
	/* The part of the string that has already been converted to UTF-8. */
	static unsigned char *utf8_buffer;
	static size_t utf8_buflen;
	static size_t utf8_allocated;
	/* The first half of an UTF-16 surrogate character. */
	unsigned short utf16_surr;
	/* Line in which this surrogate character occurred. */
	size_t utf16_surr_line;

	/* Ensures utf8_buffer has room for N bytes. N must be <= 10. */
	#define utf8_buffer_ensure_available(n) \
	do \
	{ \
	if (utf8_buflen + (n) > utf8_allocated) \
	{ \
	utf8_allocated = 2 * utf8_allocated + 10; \
	utf8_buffer = \
	(unsigned char *) xrealloc (utf8_buffer, utf8_allocated); \
	} \
	} \
	while (0)

	/* Appends a lone surrogate to utf8_buffer. */
	/* Note: A half surrogate is invalid in UTF-8:
	- RFC 3629 says
	"The definition of UTF-8 prohibits encoding character
	numbers between U+D800 and U+DFFF".
	- Unicode 4.0 chapter 3
	<https://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
	section 3.9, p.77, says
	"Because surrogate code points are not Unicode scalar
	values, any UTF-8 byte sequence that would otherwise
	map to code points D800..DFFF is ill-formed."
	and in table 3-6, p. 78, does not mention D800..DFFF.
	- The unicode.org FAQ question "How do I convert an unpaired
	UTF-16 surrogate to UTF-8?" has the answer
	"By representing such an unpaired surrogate on its own
	as a 3-byte sequence, the resulting UTF-8 data stream
	would become ill-formed."
	So use U+FFFD instead. */
	#define utf8_buffer_append_lone_surrogate(uc, line) \
	do \
	{ \
	error_with_progname = false; \
	po_xerror (PO_SEVERITY_ERROR, NULL, \
	real_file_name, (line), (size_t)(-1), false, \
	xasprintf (_("warning: lone surrogate U+%04X"), (uc))); \
	error_with_progname = true; \
	utf8_buffer_ensure_available (3); \
	utf8_buffer[utf8_buflen++] = 0xef; \
	utf8_buffer[utf8_buflen++] = 0xbf; \
	utf8_buffer[utf8_buflen++] = 0xbd; \
	} \
	while (0)

	int c;

	/* Skip whitespace before the string. */
	do
	c = phase3_getc ();
	while (c == ' ' \|\| c == '\t' \|\| c == '\r' \|\| c == '\f');

	if (c == EOF \|\| c == '\n')
	/* Empty string. */
	return NULL;

	/* Start accumulating the string. */
	utf8_buflen = 0;
	utf16_surr = 0;
	utf16_surr_line = 0;
	for (;;)
	{
	if (in_key && (c == '=' \|\| c == ':'
	\|\| c == ' ' \|\| c == '\t' \|\| c == '\r' \|\| c == '\f'))
	{
	/* Skip whitespace after the string. */
	while (c == ' ' \|\| c == '\t' \|\| c == '\r' \|\| c == '\f')
	c = phase3_getc ();
	/* Skip '=' or ':' separator. */
	if (!(c == '=' \|\| c == ':'))
	phase3_ungetc (c);
	break;
	}

	phase3_ungetc (c);

	/* Read the next byte or UTF-16 code point. */
	c = phase4_getuc ();
	if (c == P4_EOF)
	break;

	/* Append it to the buffer. */
	if (IS_UNICODE (c))
	{
	/* Append an UTF-16 code point. */
	/* Test whether this character and the previous one form a Unicode
	surrogate pair. */
	if (utf16_surr != 0
	&& (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
	{
	unsigned short utf16buf[2];
	ucs4_t uc;
	int len;

	utf16buf[0] = utf16_surr;
	utf16buf[1] = UTF16_VALUE (c);
	if (u16_mbtouc (&uc, utf16buf, 2) != 2)
	abort ();

	utf8_buffer_ensure_available (6);
	len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 6);
	if (len < 0)
	{
	error_with_progname = false;
	po_xerror (PO_SEVERITY_ERROR, NULL,
	real_file_name, gram_pos.line_number, (size_t)(-1),
	false, _("warning: invalid Unicode character"));
	error_with_progname = true;
	}
	else
	utf8_buflen += len;

	utf16_surr = 0;
	}
	else
	{
	if (utf16_surr != 0)
	{
	utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);
	utf16_surr = 0;
	}

	if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
	{
	utf16_surr = UTF16_VALUE (c);
	utf16_surr_line = gram_pos.line_number;
	}
	else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))
	utf8_buffer_append_lone_surrogate (UTF16_VALUE (c), gram_pos.line_number);
	else
	{
	ucs4_t uc = UTF16_VALUE (c);
	int len;

	utf8_buffer_ensure_available (3);
	len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 3);
	if (len < 0)
	{
	error_with_progname = false;
	po_xerror (PO_SEVERITY_ERROR, NULL,
	real_file_name, gram_pos.line_number, (size_t)(-1),
	false, _("warning: invalid Unicode character"));
	error_with_progname = true;
	}
	else
	utf8_buflen += len;
	}
	}
	}
	else
	{
	/* Append a single byte. */
	if (utf16_surr != 0)
	{
	utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);
	utf16_surr = 0;
	}

	if (assume_utf8)
	{
	/* No conversion needed. */
	utf8_buffer_ensure_available (1);
	utf8_buffer[utf8_buflen++] = c;
	}
	else
	{
	/* Convert the byte from ISO-8859-1 to UTF-8 on the fly. */
	ucs4_t uc = c;
	int len;

	utf8_buffer_ensure_available (2);
	len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 2);
	if (len < 0)
	abort ();
	utf8_buflen += len;
	}
	}

	c = phase3_getc ();
	if (c == EOF \|\| c == '\n')
	{
	if (in_key)
	phase3_ungetc (c);
	break;
	}
	}
	if (utf16_surr != 0)
	utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);

	/* Return the result. */
	{
	unsigned char *utf8_string = XNMALLOC (utf8_buflen + 1, unsigned char);
	if (utf8_buflen > 0)
	memcpy (utf8_string, utf8_buffer, utf8_buflen);
	utf8_string[utf8_buflen] = '\0';

	return (char *) utf8_string;
	}
	#undef utf8_buffer_append_lone_surrogate
	#undef utf8_buffer_ensure_available
	}


	/* Read a .properties file from a stream, and dispatch to the various
	   abstract_catalog_reader_class_ty methods.

	   THIS is the catalog reader; it is not referenced directly in this
	   function.
	   FILE is the stream to read; its entire contents are loaded into
	   memory before parsing starts.
	   REAL_FILENAME is the file name used in diagnostics and in the
	   positions recorded for messages.
	   LOGICAL_FILENAME is not used by this function.

	   Every comment line is passed to po_callback_comment_dispatcher and
	   every key/value pair to po_callback_message, with the key as msgid
	   and the value as msgstr.  If the file cannot be read, a fatal error
	   is reported through po_xerror.  */
	static void
	properties_parse (abstract_catalog_reader_ty *this, FILE *file,
	                  const char *real_filename, const char *logical_filename)
	{
	  /* Read the file into memory.  */
	  contents = fread_file (file, 0, &contents_length);
	  if (contents == NULL)
	    {
	      /* Fetch the description of errno right away, before further
	         library calls can modify errno.  */
	      const char *errno_description = strerror (errno);
	      po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
	                 xasprintf ("%s: %s",
	                            xasprintf (_("error while reading \"%s\""),
	                                       real_filename),
	                            errno_description));
	      return;
	    }

	  /* Test whether it's valid UTF-8.  If not, the bytes are converted
	     from ISO-8859-1 while parsing.  */
	  assume_utf8 = (u8_check ((uint8_t *) contents, contents_length) == NULL);

	  position = 0;
	  real_file_name = real_filename;
	  gram_pos.file_name = xstrdup (real_file_name);
	  gram_pos.line_number = 1;

	  for (;;)
	    {
	      int c;
	      bool comment;
	      bool hidden;

	      /* Look at the first character of the line to classify it.  */
	      c = phase2_getc ();

	      if (c == EOF)
	        break;

	      comment = false;
	      hidden = false;
	      if (c == '#')
	        comment = true;
	      else if (c == '!')
	        {
	          /* For compatibility with write-properties.c, we treat '!' not
	             followed by space as a fuzzy or untranslated message.  */
	          int c2 = phase2_getc ();
	          if (c2 == ' ' || c2 == '\n' || c2 == EOF)
	            comment = true;
	          else
	            hidden = true;
	          phase2_ungetc (c2);
	        }
	      else
	        phase2_ungetc (c);

	      if (comment)
	        {
	          /* A comment line.  */
	          /* The line buffer is static, so that its allocation is reused
	             for every comment line.  */
	          static char *buffer;
	          static size_t bufmax;
	          static size_t buflen;

	          buflen = 0;
	          for (;;)
	            {
	              c = phase2_getc ();

	              /* Grow the buffer before testing for the end of the line,
	                 so that there is always room for the terminating NUL.  */
	              if (buflen >= bufmax)
	                {
	                  bufmax += 100;
	                  buffer = xrealloc (buffer, bufmax);
	                }

	              if (c == EOF || c == '\n')
	                break;

	              buffer[buflen++] = c;
	            }
	          buffer[buflen] = '\0';

	          po_callback_comment_dispatcher (
	            conv_from_java (
	              assume_utf8 ? buffer : conv_from_iso_8859_1 (buffer)));
	        }
	      else
	        {
	          /* A key/value pair.  */
	          char *msgid;
	          lex_pos_ty msgid_pos;

	          /* Record the position before reading, i.e. where the key
	             starts.  */
	          msgid_pos = gram_pos;
	          msgid = read_escaped_string (true);
	          if (msgid == NULL)
	            /* Skip blank line.  */
	            ;
	          else
	            {
	              char *msgstr;
	              lex_pos_ty msgstr_pos;
	              bool force_fuzzy;

	              msgstr_pos = gram_pos;
	              msgstr = read_escaped_string (false);
	              /* A key without a value yields an empty msgstr.  */
	              if (msgstr == NULL)
	                msgstr = xstrdup ("");

	              /* Be sure to make the message fuzzy if it was commented out
	                 and if it is not already header/fuzzy/untranslated.  */
	              force_fuzzy = (hidden && msgid[0] != '\0' && msgstr[0] != '\0');

	              po_callback_message (NULL, msgid, &msgid_pos, NULL,
	                                   msgstr, strlen (msgstr) + 1, &msgstr_pos,
	                                   NULL, NULL, NULL,
	                                   force_fuzzy, false);
	            }
	        }
	    }

	  /* Release the file contents and reset the parser state.  */
	  free (contents);
	  contents = NULL;
	  real_file_name = NULL;
	  gram_pos.line_number = 0;
	}

	/* Descriptor of the Java .properties input format: the function that
	   parses such a file, and a flag stating that the strings it produces
	   are in UTF-8 encoding.  The initializer is positional, so the values
	   must stay in the order of the members named in the comments.  */
	const struct catalog_input_format input_format_properties =
	{
	properties_parse, /* parse */
	true /* produces_utf8 */
	};