/* mingw/gettext/gettext-tools/src/read-stringtable.c - kiwivm - Git at Google */

 /* Reading NeXTstep/GNUstep .strings files.
    Copyright (C) 2003, 2005-2007, 2009, 2019-2020 Free Software Foundation, Inc.
    Written by Bruno Haible <bruno@clisp.org>, 2003.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

 #ifdef HAVE_CONFIG_H
 # include <config.h>
 #endif

 /* Specification.  */
 #include "read-stringtable.h"

 #include <assert.h>
 #include <errno.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 #include "attribute.h"
 #include "error.h"
 #include "error-progname.h"
 #include "read-catalog-abstract.h"
 #include "xalloc.h"
 #include "xvasprintf.h"
 #include "po-xerror.h"
 #include "unistr.h"
 #include "gettext.h"

 #define _(str) gettext (str)

 /* The format of NeXTstep/GNUstep .strings files is documented in
      gnustep-base-1.8.0/Tools/make_strings/Using.txt
    and in the comments of method propertyListFromStringsFileFormat in
      gnustep-base-1.8.0/Source/NSString.m
    In summary, it's an Objective-C-like file with pseudo-assignments of the form
           "key" = "value";
    where the key is the msgid and the value is the msgstr.

    The implementation of the parser of .strings files is in
      gnustep-base-1.8.0/Source/NSString.m
      function GSPropertyListFromStringsFormat
      (indirectly called from NSBundle's method localizedStringForKey).

    A test case is in
      gnustep-base-1.8.0/Testing/English.lproj/NXStringTable.example
  */

 /* Handling of comments: We copy all comments from the .strings file to
    the PO file. This is not really needed; it's a service for translators
    who don't like PO files and prefer to maintain the .strings file.  */


 /* Real filename, used in error messages about the input file.
    Set by stringtable_parse when a parse starts and reset to NULL when it
    ends.  */
 static const char *real_file_name;

 /* File name and line number of the current read position.  Defined in
    another translation unit; phase3_getc and phase3_ungetc keep its
    line_number in step with the newlines they deliver or take back.  */
 extern lex_pos_ty gram_pos;

 /* The input file stream.  Set by stringtable_parse when a parse starts and
    reset to NULL when it ends.  */
 static FILE *fp;


 /* Phase 1: Read a byte.
    Max. 4 pushback characters.  */

 static unsigned char phase1_pushback[4];
 static int phase1_pushback_length;

 static int
 phase1_getc ()
 {
   int c;

   if (phase1_pushback_length)
     return phase1_pushback[--phase1_pushback_length];

   c = getc (fp);

   if (c == EOF)
     {
       if (ferror (fp))
         {
           const char *errno_description = strerror (errno);
           po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
                      xasprintf ("%s: %s",
                                 xasprintf (_("error while reading \"%s\""),
                                            real_file_name),
                                 errno_description));
         }
       return EOF;
     }

   return c;
 }

 /* Push the byte C back, so that the next phase1_getc call returns it again.
    Pushing back EOF is a no-op.
    The pushback buffer holds as many bytes as phase1_pushback has elements;
    exceeding that is a programming error and aborts the program instead of
    writing past the end of the array.  */
 static void
 phase1_ungetc (int c)
 {
   if (c == EOF)
     return;

   if (phase1_pushback_length
       >= (int) (sizeof phase1_pushback / sizeof phase1_pushback[0]))
     abort ();
   phase1_pushback[phase1_pushback_length++] = c;
 }


 /* Phase 2: Read an UCS-4 character.
    Max. 2 pushback characters.  */

 /* End-of-file indicator for functions returning an UCS-4 character.
    Parenthesized so that the expansion is a single operand in whatever
    expression it appears.  */
 #define UEOF (-1)

 /* Characters given back through phase2_ungetc, most recently pushed last.  */
 static int phase2_pushback[4];
 static int phase2_pushback_length;

 /* The input file can be in Unicode encoding (UCS-2BE, UCS-2LE, UTF-8, each
    with a BOM!), or otherwise the locale-dependent default encoding is used.
    Since we don't want to depend on the locale here, we use ISO-8859-1
    instead.  */
 enum enc
 {
   enc_undetermined,
   enc_ucs2be,
   enc_ucs2le,
   enc_utf8,
   enc_iso8859_1
 };

 /* Encoding of the current input file.  It is enc_undetermined until
    phase2_getc has looked at the first bytes of the file.  */
 static enum enc encoding;

 /* Return the next character of the input as an UCS-4 value, or UEOF at end
    of input.  A character given back through phase2_ungetc is returned first.
    On the first read of a file, the encoding is chosen from the byte order
    mark at the start of the input: FE FF selects UCS-2BE, FF FE selects
    UCS-2LE, EF BB BF selects UTF-8; without a BOM the bytes are given back to
    phase 1 and the file is read as ISO-8859-1.
    If the input ends in the middle of a multi-byte character, UEOF is
    returned.  */
 static int
 phase2_getc ()
 {
   if (phase2_pushback_length != 0)
     {
       phase2_pushback_length--;
       return phase2_pushback[phase2_pushback_length];
     }

   if (encoding == enc_undetermined)
     {
       /* Determine the input file's encoding from its first bytes.  */
       int b0, b1;

       b0 = phase1_getc ();
       if (b0 == EOF)
         return UEOF;
       b1 = phase1_getc ();
       if (b1 == EOF)
         {
           /* A one-byte file cannot carry a BOM.  */
           phase1_ungetc (b0);
           encoding = enc_iso8859_1;
         }
       else if (b0 == 0xfe && b1 == 0xff)
         encoding = enc_ucs2be;
       else if (b0 == 0xff && b1 == 0xfe)
         encoding = enc_ucs2le;
       else
         {
           int b2 = phase1_getc ();

           if (b0 == 0xef && b1 == 0xbb && b2 == 0xbf)
             encoding = enc_utf8;
           else
             {
               /* No BOM: give everything back, in reverse order.  When b2 is
                  EOF, phase1_ungetc ignores it.  */
               phase1_ungetc (b2);
               phase1_ungetc (b1);
               phase1_ungetc (b0);
               encoding = enc_iso8859_1;
             }
         }
     }

   switch (encoding)
     {
     case enc_ucs2be:
     case enc_ucs2le:
       /* Read an UCS-2 encoded character: two bytes, in the byte order
          selected by the BOM.  */
       {
         int first, second;

         first = phase1_getc ();
         if (first == EOF)
           return UEOF;
         second = phase1_getc ();
         if (second == EOF)
           return UEOF;
         if (encoding == enc_ucs2be)
           return (first << 8) + second;
         return first + (second << 8);
       }

     case enc_utf8:
       /* Read an UTF-8 encoded character.  */
       {
         /* min_lead[k - 1] is the smallest lead byte for which byte number k
            of the sequence is read.  */
         static const unsigned char min_lead[] =
           { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
         unsigned char buf[6];
         unsigned int count;
         ucs4_t uc;
         int c;

         c = phase1_getc ();
         if (c == EOF)
           return UEOF;
         buf[0] = c;

         /* count is the number of bytes stored in buf so far.  */
         for (count = 1; count < 6; count++)
           {
             if (buf[0] < min_lead[count - 1])
               break;
             /* The second byte is read whatever it is; every further byte is
                read only if the byte before it is a continuation byte
                (0x80..0xBF).  */
             if (count >= 2 && (buf[count - 1] ^ 0x80) >= 0x40)
               break;
             c = phase1_getc ();
             if (c == EOF)
               return UEOF;
             buf[count] = c;
           }

         u8_mbtouc (&uc, buf, count);
         return uc;
       }

     case enc_iso8859_1:
       /* Read an ISO-8859-1 encoded character: the byte value is the
          character code.  */
       {
         int c = phase1_getc ();

         return (c == EOF ? UEOF : c);
       }

     default:
       abort ();
     }
 }

 /* Push the character C back, so that the next phase2_getc call returns it
    again.  Pushing back UEOF is a no-op.
    The pushback buffer holds as many characters as phase2_pushback has
    elements; exceeding that is a programming error and aborts the program
    instead of writing past the end of the array.  */
 static void
 phase2_ungetc (int c)
 {
   if (c == UEOF)
     return;

   if (phase2_pushback_length
       >= (int) (sizeof phase2_pushback / sizeof phase2_pushback[0]))
     abort ();
   phase2_pushback[phase2_pushback_length++] = c;
 }


 /* Phase 3: Read an UCS-4 character, with line number handling.  */

 /* Return the next character from phase 2.  Each newline delivered advances
    gram_pos.line_number by one.  */
 static int
 phase3_getc ()
 {
   const int ch = phase2_getc ();

   if (ch == '\n')
     ++gram_pos.line_number;
   return ch;
 }

 /* Give the character C back to phase 2.  Taking back a newline undoes the
    line number increment that phase3_getc made when delivering it.  */
 static void
 phase3_ungetc (int c)
 {
   if (c == '\n')
     gram_pos.line_number--;

   phase2_ungetc (c);
 }


 /* Convert from UCS-4 to UTF-8.
    BUFFER holds BUFLEN UCS-4 characters.  Returns a freshly allocated,
    NUL-terminated UTF-8 string.
    The characters come from the input file (UCS-2 code units, \u escapes),
    so they can be values that have no UTF-8 encoding, such as surrogates.
    Such a value is replaced with U+FFFD REPLACEMENT CHARACTER rather than
    being treated as an internal error.  */
 static char *
 conv_from_ucs4 (const int *buffer, size_t buflen)
 {
   unsigned char *utf8_string;
   size_t pos;
   unsigned char *q;

   /* Each UCS-4 word needs 6 bytes at worst.  */
   utf8_string = XNMALLOC (6 * buflen + 1, unsigned char);

   q = utf8_string;
   for (pos = 0; pos < buflen; pos++)
     {
       unsigned int uc = buffer[pos];
       int n = u8_uctomb (q, uc, 6);

       if (n <= 0)
         /* Not an encodable character: substitute U+FFFD.  */
         n = u8_uctomb (q, 0xfffd, 6);
       assert (n > 0);
       q += n;
     }
   *q = '\0';
   assert ((size_t) (q - utf8_string) <= 6 * buflen);

   return (char *) utf8_string;
 }


 /* Parse a string enclosed in double-quotes.  Input is UCS-4 encoded: LENGTH
    characters starting at STRING.
    Backslash escapes are decoded: up to three octal digits, 'u' or 'U'
    followed by up to four hexadecimal digits, and the single-letter escapes
    a b t r n v f; any other escaped character stands for itself.
    Whatever follows the closing double-quote is ignored.
    Return the string in UTF-8 encoding, or NULL if the input doesn't start
    with a double-quote or ends before the closing double-quote.  */
 static char *
 parse_escaped_string (const int *string, size_t length)
 {
   static int *buffer;
   static size_t bufmax;
   static size_t buflen;
   size_t i;

   /* The opening double-quote is mandatory.  */
   if (length == 0 || string[0] != '"')
     return NULL;
   i = 1;

   buflen = 0;
   for (;;)
     {
       int c;

       if (i == length)
         return NULL;
       c = string[i++];
       if (c == '"')
         break;

       if (c == '\\')
         {
           if (i == length)
             return NULL;
           c = string[i++];

           if (c >= '0' && c <= '7')
             {
               /* Octal escape: this digit plus at most two more.  */
               unsigned int value = c - '0';
               int digits = 1;

               while (digits < 3 && i < length
                      && string[i] >= '0' && string[i] <= '7')
                 {
                   value = value * 8 + (string[i] - '0');
                   i++;
                   digits++;
                 }
               c = value;
             }
           else if (c == 'u' || c == 'U')
             {
               /* Hexadecimal escape: at most four digits.  */
               unsigned int value = 0;
               int digits;

               for (digits = 0; digits < 4 && i < length; digits++)
                 {
                   int h = string[i];
                   int nibble;

                   if (h >= '0' && h <= '9')
                     nibble = h - '0';
                   else if (h >= 'A' && h <= 'F')
                     nibble = h - 'A' + 10;
                   else if (h >= 'a' && h <= 'f')
                     nibble = h - 'a' + 10;
                   else
                     break;
                   value = value * 16 + nibble;
                   i++;
                 }
               c = value;
             }
           else
             {
               /* Single-letter escapes; other characters are kept as is.  */
               switch (c)
                 {
                 case 'a': c = '\a'; break;
                 case 'b': c = '\b'; break;
                 case 't': c = '\t'; break;
                 case 'r': c = '\r'; break;
                 case 'n': c = '\n'; break;
                 case 'v': c = '\v'; break;
                 case 'f': c = '\f'; break;
                 }
             }
         }

       /* Append c, growing the buffer when it is full.  */
       if (buflen >= bufmax)
         {
           bufmax = 2 * bufmax + 10;
           buffer = xrealloc (buffer, bufmax * sizeof (int));
         }
       buffer[buflen] = c;
       buflen++;
     }

   return conv_from_ucs4 (buffer, buflen);
 }


 /* Accumulating flag comments.  */

 /* The flags collected so far for the next message, separated by ", ", or
    NULL if none has been collected.  Heap-allocated.  */
 static char *special_comment;

 /* Discard the flags collected so far.  */
 static inline void
 special_comment_reset ()
 {
   /* free (NULL) is a no-op, so no guard is needed.  */
   free (special_comment);
   special_comment = NULL;
 }

 /* Add FLAG to the collected flags.  The first flag is stored as is; each
    further flag is appended after a ", " separator.  FLAG is copied.  */
 static void
 special_comment_add (const char *flag)
 {
   size_t old_len;
   size_t flag_len;

   if (special_comment == NULL)
     {
       special_comment = xstrdup (flag);
       return;
     }

   old_len = strlen (special_comment);
   flag_len = strlen (flag);
   /* Room for the old text, the separator, the flag and the NUL.  */
   special_comment = xrealloc (special_comment, old_len + 2 + flag_len + 1);
   memcpy (special_comment + old_len, ", ", 2);
   memcpy (special_comment + old_len + 2, flag, flag_len + 1);
 }

 /* If any flags have been collected, pass them to
    po_callback_comment_special and then discard them.  */
 static inline void
 special_comment_finish ()
 {
   char *pending = special_comment;

   if (pending == NULL)
     return;

   po_callback_comment_special (pending);
   free (pending);
   special_comment = NULL;
 }


 /* Accumulating comments.  */

 /* The comment line being accumulated, as UCS-4 characters: buffer points to
    bufmax allocated elements, of which the first buflen are in use.  */
 static int *buffer;
 static size_t bufmax;
 static size_t buflen;
 /* Set by a "Flag: unmatched" comment; passed as the last argument of
    po_callback_message for the next key/value pair.  */
 static bool next_is_obsolete;
 /* Set by a "Flag: untranslated" comment; makes stringtable_parse look for a
    tentative msgstr in a comment after the value.  */
 static bool next_is_fuzzy;
 /* The tentative msgstr found in a comment of the form = "escaped string",
    or NULL if none was found.  */
 static char *fuzzy_msgstr;
 /* While true, a single-line C style comment is tested for being the fuzzy
    msgstr.  */
 static bool expect_fuzzy_msgstr_as_c_comment;
 /* While true, a C++ style comment is tested for being the fuzzy msgstr.  */
 static bool expect_fuzzy_msgstr_as_cxx_comment;

 /* Start accumulating a new comment line: empty the accumulation buffer.
    The allocated storage is kept for reuse.  */
 static inline void
 comment_start ()
 {
   buflen = 0;
 }

 /* Append the UCS-4 character C to the comment line being accumulated,
    enlarging the buffer when it is full.  */
 static inline void
 comment_add (int c)
 {
   if (bufmax <= buflen)
     {
       size_t grown = 2 * bufmax + 10;

       buffer = xrealloc (buffer, grown * sizeof (int));
       bufmax = grown;
     }
   buffer[buflen] = c;
   buflen++;
 }

 /* Finish the comment line accumulated so far and dispatch it.
    CHARS_TO_REMOVE characters are dropped from the end of the line first,
    then trailing spaces and tabs.
    If TEST_FOR_FUZZY_MSGSTR is true and the line has the form
      = "escaped string"
    with an optional trailing semicolon, the decoded string is stored in
    fuzzy_msgstr and nothing else happens.
    Otherwise the line is converted to UTF-8 and handled by its prefix:
      "Flag: untranslated"    adds the flag "fuzzy", sets next_is_fuzzy
      "Flag: unmatched"       sets next_is_obsolete
      "Flag: <flag>"          adds <flag> to the special comment
      "Comment: <text>"       po_callback_comment_dot
      "File: <name>:<number>" po_callback_comment_filepos
      anything else           po_callback_comment  */
 static inline void
 comment_line_end (size_t chars_to_remove, bool test_for_fuzzy_msgstr)
 {
   char *line;

   buflen -= chars_to_remove;
   /* Drop trailing white space, but not EOLs.  */
   for (; buflen >= 1; buflen--)
     {
       int last = buffer[buflen - 1];

       if (last != ' ' && last != '\t')
         break;
     }

   /* At special positions a comment can carry the fuzzy msgstr instead of
      being a regular comment.  */
   if (test_for_fuzzy_msgstr
       && buflen > 2 && buffer[0] == '=' && buffer[1] == ' ')
     {
       /* Length after the leading "= ", without a trailing semicolon.  */
       size_t payload_len = buflen - 2;

       if (buffer[buflen - 1] == ';')
         payload_len--;
       fuzzy_msgstr = parse_escaped_string (buffer + 2, payload_len);
       if (fuzzy_msgstr != NULL)
         return;
     }

   line = conv_from_ucs4 (buffer, buflen);

   if (strcmp (line, "Flag: untranslated") == 0)
     {
       special_comment_add ("fuzzy");
       next_is_fuzzy = true;
       return;
     }
   if (strcmp (line, "Flag: unmatched") == 0)
     {
       next_is_obsolete = true;
       return;
     }
   if (strncmp (line, "Flag: ", 6) == 0)
     {
       special_comment_add (line + 6);
       return;
     }
   if (strncmp (line, "Comment: ", 9) == 0)
     {
       /* A comment extracted from the source.  */
       po_callback_comment_dot (line + 9);
       return;
     }
   if (strncmp (line, "File: ", 6) == 0)
     {
       char *last_colon = strrchr (line + 6, ':');

       if (last_colon != NULL && last_colon[1] != '\0')
         {
           char *endp;
           unsigned long number = strtoul (last_colon + 1, &endp, 10);

           if (*endp == '\0')
             {
               /* A "File: <filename>:<number>" type comment.  Cut the line
                  at the colon so that line + 6 is the file name alone.  */
               *last_colon = '\0';
               po_callback_comment_filepos (line + 6, number);
               return;
             }
         }
     }

   /* An ordinary comment.  */
   po_callback_comment (line);
 }


 /* Phase 4: Replace each comment that is not inside a string with a space
    character.  */

 /* Return the next character from phase 3, with comments replaced:
    a C style comment is returned as a single space, a C++ style comment
    (including the newline that ends it) as a single newline.  A '/' that
    does not start a comment is returned unchanged.
    The text of each comment is accumulated line by line and handed to
    comment_line_end, which dispatches it.  */
 static int
 phase4_getc ()
 {
   int c;

   c = phase3_getc ();
   if (c != '/')
     return c;
   c = phase3_getc ();
   switch (c)
     {
     default:
       /* Not a comment: give the second character back.  */
       phase3_ungetc (c);
       return '/';

     case '*':
       /* C style comment.  */
       {
         /* True if the previous character was a '*', so that a '/' now
            ends the comment.  */
         bool last_was_star;
         /* Number of '*' characters stored at the end of the current
            line.  */
         size_t trailing_stars;
         /* True once the comment has spanned more than one line.  */
         bool seen_newline;

         comment_start ();
         last_was_star = false;
         trailing_stars = 0;
         seen_newline = false;
         /* Drop additional stars at the beginning of the comment.  */
         for (;;)
           {
             c = phase3_getc ();
             if (c != '*')
               break;
             last_was_star = true;
           }
         phase3_ungetc (c);
         for (;;)
           {
             c = phase3_getc ();
             /* End of input inside the comment: stop without dispatching
                the unfinished line.  */
             if (c == UEOF)
               break;
             /* We skip all leading white space, but not EOLs.  */
             if (!(buflen == 0 && (c == ' ' || c == '\t')))
               comment_add (c);
             switch (c)
               {
               case '\n':
                 seen_newline = true;
                 /* Dispatch the line without the newline just stored.  */
                 comment_line_end (1, false);
                 comment_start ();
                 last_was_star = false;
                 trailing_stars = 0;
                 continue;

               case '*':
                 last_was_star = true;
                 trailing_stars++;
                 continue;

               case '/':
                 if (last_was_star)
                   {
                     /* Drop additional stars at the end of the comment.  */
                     /* The fuzzy msgstr test applies to single-line
                        comments only.  */
                     comment_line_end (trailing_stars + 1,
                                       expect_fuzzy_msgstr_as_c_comment
                                       && !seen_newline);
                     break;
                   }
                 FALLTHROUGH;

               default:
                 last_was_star = false;
                 trailing_stars = 0;
                 continue;
               }
             /* Reached only through the 'break' that ends the comment.  */
             break;
           }
         return ' ';
       }

     case '/':
       /* C++ style comment.  */
       comment_start ();
       for (;;)
         {
           c = phase3_getc ();
           if (c == '\n' || c == UEOF)
             break;
           /* We skip all leading white space, but not EOLs.  */
           if (!(buflen == 0 && (c == ' ' || c == '\t')))
             comment_add (c);
         }
       comment_line_end (0, expect_fuzzy_msgstr_as_cxx_comment);
       return '\n';
     }
 }

 /* Give the character C back, so that the next phase4_getc call sees it
    again.  The character is handed to phase 3.  */
 static inline void
 phase4_ungetc (int c)
 {
   phase3_ungetc (c);
 }


 /* Return true if a character is considered as whitespace: space, tab,
    carriage return, newline, form feed or backspace.  */
 static bool
 is_whitespace (int c)
 {
   switch (c)
     {
     case ' ': case '\t': case '\r': case '\n': case '\f': case '\b':
       return true;
     default:
       return false;
     }
 }

 /* Return true if a character needs quoting, i.e. cannot be used in unquoted
    tokens.  ASCII letters, ASCII digits and the punctuation characters
    ! # $ % & * + - . / : ? @ | ~ _ ^ can be used unquoted; everything else,
    including all characters outside ASCII, needs quoting.  */
 static bool
 is_quotable (int c)
 {
   static const char unquoted_punctuation[] = "!#$%&*+-./:?@|~_^";
   bool is_digit = (c >= '0' && c <= '9');
   bool is_letter = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');

   if (is_digit || is_letter)
     return false;
   /* strchr would match its terminating NUL and would truncate large
      values to a char, so only look up nonzero ASCII values.  */
   if (c <= 0 || c > 0x7f)
     return true;
   return strchr (unquoted_punctuation, c) == NULL;
 }


 /* Read a key or value string.
    Whitespace before the string is skipped; comments are handled by
    phase4_getc on the way.
    The string is either enclosed in double-quotes, with backslash escapes
    decoded (up to three octal digits, 'u' or 'U' followed by up to four
    hexadecimal digits, the single-letter escapes a b t r n v f), or it is an
    unquoted token made of characters for which is_quotable is false.
    After an unquoted token, the character that ended it is given back to the
    input, so that the caller sees the '=' or ';' that follows directly.
    Return the string in UTF-8 encoding, or NULL if no string is seen.
    Return the start position of the string in *pos.  */
 static char *
 read_string (lex_pos_ty *pos)
 {
   static int *buffer;
   static size_t bufmax;
   static size_t buflen;
   int c;

   /* Skip whitespace before the string.  */
   do
     c = phase4_getc ();
   while (is_whitespace (c));

   if (c == UEOF)
     /* No more string.  */
     return NULL;

   *pos = gram_pos;
   buflen = 0;
   if (c == '"')
     {
       /* Read a string enclosed in double-quotes.  Inside the string,
          comments are not recognized, hence phase 3 is read directly.  */
       for (;;)
         {
           c = phase3_getc ();
           if (c == UEOF || c == '"')
             break;
           if (c == '\\')
             {
               c = phase3_getc ();
               if (c == UEOF)
                 break;
               if (c >= '0' && c <= '7')
                 {
                   /* Octal escape: this digit plus at most two more.  */
                   unsigned int n = 0;
                   int j = 0;
                   for (;;)
                     {
                       n = n * 8 + (c - '0');
                       if (++j == 3)
                         break;
                       c = phase3_getc ();
                       if (!(c >= '0' && c <= '7'))
                         {
                           phase3_ungetc (c);
                           break;
                         }
                     }
                   c = n;
                 }
               else if (c == 'u' || c == 'U')
                 {
                   /* Hexadecimal escape: at most four digits.  */
                   unsigned int n = 0;
                   int j;
                   for (j = 0; j < 4; j++)
                     {
                       c = phase3_getc ();
                       if (c >= '0' && c <= '9')
                         n = n * 16 + (c - '0');
                       else if (c >= 'A' && c <= 'F')
                         n = n * 16 + (c - 'A' + 10);
                       else if (c >= 'a' && c <= 'f')
                         n = n * 16 + (c - 'a' + 10);
                       else
                         {
                           phase3_ungetc (c);
                           break;
                         }
                     }
                   c = n;
                 }
               else
                 /* Single-letter escapes; other characters are kept as
                    is.  */
                 switch (c)
                   {
                   case 'a': c = '\a'; break;
                   case 'b': c = '\b'; break;
                   case 't': c = '\t'; break;
                   case 'r': c = '\r'; break;
                   case 'n': c = '\n'; break;
                   case 'v': c = '\v'; break;
                   case 'f': c = '\f'; break;
                   }
             }
           if (buflen >= bufmax)
             {
               bufmax = 2 * bufmax + 10;
               buffer = xrealloc (buffer, bufmax * sizeof (int));
             }
           buffer[buflen++] = c;
         }
       if (c == UEOF)
         po_xerror (PO_SEVERITY_ERROR, NULL,
                    real_file_name, gram_pos.line_number, (size_t)(-1), false,
                    _("warning: unterminated string"));
     }
   else
     {
       /* Read a token outside quotes.  */
       if (is_quotable (c))
         po_xerror (PO_SEVERITY_ERROR, NULL,
                    real_file_name, gram_pos.line_number, (size_t)(-1), false,
                    _("warning: syntax error"));
       for (; c != UEOF && !is_quotable (c); c = phase4_getc ())
         {
           if (buflen >= bufmax)
             {
               bufmax = 2 * bufmax + 10;
               buffer = xrealloc (buffer, bufmax * sizeof (int));
             }
           buffer[buflen++] = c;
         }
       /* The character that ended the token is not part of it: give it
          back, so that in input like key=value; the caller still finds the
          '=' and the ';'.  If no token character was read at all, c is the
          offending character reported above; it stays consumed, so that the
          parser always makes progress.  Giving back UEOF is a no-op.  */
       if (buflen > 0)
         phase4_ungetc (c);
     }

   return conv_from_ucs4 (buffer, buflen);
 }


 /* Read a .strings file from a stream, and dispatch to the various
    abstract_catalog_reader_class_ty methods.
    FILE is the stream to read; REAL_FILENAME is used in error messages and
    as the file name of the reported positions.  POP and LOGICAL_FILENAME are
    not used by this function.
    The file is read as a sequence of
         "key" = "value";
    or
         "key";
    entries; each entry is reported through po_callback_message, and the
    comments before it through the po_callback_comment* functions.  Parsing
    stops at the end of input or at the first syntax error, which is reported
    through po_xerror.  */
 static void
 stringtable_parse (abstract_catalog_reader_ty *pop, FILE *file,
                    const char *real_filename, const char *logical_filename)
 {
   fp = file;
   real_file_name = real_filename;
   gram_pos.file_name = xstrdup (real_file_name);
   gram_pos.line_number = 1;
   encoding = enc_undetermined;
   expect_fuzzy_msgstr_as_c_comment = false;
   expect_fuzzy_msgstr_as_cxx_comment = false;
   /* A previous parse that stopped at a syntax error can have left
      characters in the pushback buffers.  They belong to that file, not to
      this one.  */
   phase1_pushback_length = 0;
   phase2_pushback_length = 0;

   for (;;)
     {
       char *msgid;
       lex_pos_ty msgid_pos;
       char *msgstr;
       lex_pos_ty msgstr_pos;
       int c;

       /* Prepare for next msgid/msgstr pair.  */
       special_comment_reset ();
       next_is_obsolete = false;
       next_is_fuzzy = false;
       fuzzy_msgstr = NULL;

       /* Read the key and all the comments preceding it.  */
       msgid = read_string (&msgid_pos);
       if (msgid == NULL)
         break;

       special_comment_finish ();

       /* Skip whitespace.  */
       do
         c = phase4_getc ();
       while (is_whitespace (c));

       /* Expect a '=' or ';'.  */
       if (c == UEOF)
         {
           po_xerror (PO_SEVERITY_ERROR, NULL,
                      real_file_name, gram_pos.line_number, (size_t)(-1), false,
                      _("warning: unterminated key/value pair"));
           break;
         }
       if (c == ';')
         {
           /* "key"; is an abbreviation for "key"=""; and does not
              necessarily designate an untranslated entry.  */
           msgstr = xstrdup ("");
           msgstr_pos = msgid_pos;
           po_callback_message (NULL, msgid, &msgid_pos, NULL,
                                msgstr, strlen (msgstr) + 1, &msgstr_pos,
                                NULL, NULL, NULL,
                                false, next_is_obsolete);
         }
       else if (c == '=')
         {
           /* Read the value.  */
           msgstr = read_string (&msgstr_pos);
           if (msgstr == NULL)
             {
               po_xerror (PO_SEVERITY_ERROR, NULL,
                          real_file_name, gram_pos.line_number, (size_t)(-1),
                          false, _("warning: unterminated key/value pair"));
               break;
             }

           /* Skip whitespace.  But for fuzzy key/value pairs, look for the
              tentative msgstr in the form of a C style comment.  */
           expect_fuzzy_msgstr_as_c_comment = next_is_fuzzy;
           do
             {
               c = phase4_getc ();
               /* Only the first such comment counts.  */
               if (fuzzy_msgstr != NULL)
                 expect_fuzzy_msgstr_as_c_comment = false;
             }
           while (is_whitespace (c));
           expect_fuzzy_msgstr_as_c_comment = false;

           /* Expect a ';'.  */
           if (c == ';')
             {
               /* But for fuzzy key/value pairs, look for the tentative msgstr
                  in the form of a C++ style comment. */
               if (fuzzy_msgstr == NULL && next_is_fuzzy)
                 {
                   do
                     c = phase3_getc ();
                   while (c == ' ');
                   phase3_ungetc (c);

                   /* Reading one character through phase 4 processes a
                      comment that starts here; the character itself is
                      given back.  */
                   expect_fuzzy_msgstr_as_cxx_comment = true;
                   c = phase4_getc ();
                   phase4_ungetc (c);
                   expect_fuzzy_msgstr_as_cxx_comment = false;
                 }
               if (fuzzy_msgstr != NULL && strcmp (msgstr, msgid) == 0)
                 {
                   /* The value is just a copy of the key; the tentative
                      translation takes its place.  The value read from the
                      file is not referenced any more.  */
                   free (msgstr);
                   msgstr = fuzzy_msgstr;
                 }

               /* A key/value pair.  */
               po_callback_message (NULL, msgid, &msgid_pos, NULL,
                                    msgstr, strlen (msgstr) + 1, &msgstr_pos,
                                    NULL, NULL, NULL,
                                    false, next_is_obsolete);
             }
           else
             {
               po_xerror (PO_SEVERITY_ERROR, NULL,
                          real_file_name, gram_pos.line_number, (size_t)(-1),
                          false,
                          _("warning: syntax error, expected ';' after string"));
               break;
             }
         }
       else
         {
           po_xerror (PO_SEVERITY_ERROR, NULL,
                      real_file_name, gram_pos.line_number, (size_t)(-1), false,
                      _("warning: syntax error, expected '=' or ';' after string"));
           break;
         }
     }

   fp = NULL;
   real_file_name = NULL;
   gram_pos.line_number = 0;
 }

 /* Description of the .strings input format: the file is parsed by
    stringtable_parse, and the strings it reports are UTF-8 encoded (every
    string passed on is produced by conv_from_ucs4).  */
 const struct catalog_input_format input_format_stringtable =
 {
   stringtable_parse,                    /* parse */
   true                                  /* produces_utf8 */
 };
	/* Reading NeXTstep/GNUstep .strings files.
	Copyright (C) 2003, 2005-2007, 2009, 2019-2020 Free Software Foundation, Inc.
	Written by Bruno Haible <bruno@clisp.org>, 2003.

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation; either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see <https://www.gnu.org/licenses/>. */

	#ifdef HAVE_CONFIG_H
	# include <config.h>
	#endif

	/* Specification. */
	#include "read-stringtable.h"

	#include <assert.h>
	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#include "attribute.h"
	#include "error.h"
	#include "error-progname.h"
	#include "read-catalog-abstract.h"
	#include "xalloc.h"
	#include "xvasprintf.h"
	#include "po-xerror.h"
	#include "unistr.h"
	#include "gettext.h"

	#define _(str) gettext (str)

	/* The format of NeXTstep/GNUstep .strings files is documented in
	gnustep-base-1.8.0/Tools/make_strings/Using.txt
	and in the comments of method propertyListFromStringsFileFormat in
	gnustep-base-1.8.0/Source/NSString.m
	In summary, it's a Objective-C like file with pseudo-assignments of the form
	"key" = "value";
	where the key is the msgid and the value is the msgstr.

	The implementation of the parser of .strings files is in
	gnustep-base-1.8.0/Source/NSString.m
	function GSPropertyListFromStringsFormat
	(indirectly called from NSBundle's method localizedStringForKey).

	A test case is in
	gnustep-base-1.8.0/Testing/English.lproj/NXStringTable.example
	*/

	/* Handling of comments: We copy all comments from the .strings file to
	the PO file. This is not really needed; it's a service for translators
	who don't like PO files and prefer to maintain the .strings file. */


	/* Real filename, used in error messages about the input file.
	   NOTE(review): this part of the file repeats definitions that already
	   appear earlier in it -- confirm which copy is meant to stay.  */
	static const char *real_file_name;

	/* File name and line number of the current read position.  Defined in
	   another translation unit.  */
	extern lex_pos_ty gram_pos;

	/* The input file stream. */
	static FILE *fp;


	/* Phase 1: Read a byte.
	Max. 4 pushback characters. */

	static unsigned char phase1_pushback[4];
	static int phase1_pushback_length;

	static int
	phase1_getc ()
	{
	int c;

	if (phase1_pushback_length)
	return phase1_pushback[--phase1_pushback_length];

	c = getc (fp);

	if (c == EOF)
	{
	if (ferror (fp))
	{
	const char *errno_description = strerror (errno);
	po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
	xasprintf ("%s: %s",
	xasprintf (_("error while reading \"%s\""),
	real_file_name),
	errno_description));
	}
	return EOF;
	}

	return c;
	}

	/* Push the byte C back, so that the next phase1_getc call returns it
	   again.  Pushing back EOF is a no-op.
	   The pushback buffer holds as many bytes as phase1_pushback has
	   elements; exceeding that is a programming error and aborts the
	   program instead of writing past the end of the array.  */
	static void
	phase1_ungetc (int c)
	{
	  if (c == EOF)
	    return;

	  if (phase1_pushback_length
	      >= (int) (sizeof phase1_pushback / sizeof phase1_pushback[0]))
	    abort ();
	  phase1_pushback[phase1_pushback_length++] = c;
	}


	/* Phase 2: Read an UCS-4 character.
	Max. 2 pushback characters. */

	/* End-of-file indicator for functions returning an UCS-4 character.
	   Parenthesized so that the expansion is a single operand in whatever
	   expression it appears.  */
	#define UEOF (-1)

	/* Characters given back through phase2_ungetc, most recently pushed
	   last.  */
	static int phase2_pushback[4];
	static int phase2_pushback_length;

	/* The input file can be in Unicode encoding (UCS-2BE, UCS-2LE, UTF-8,
	   each with a BOM!), or otherwise the locale-dependent default encoding
	   is used.  Since we don't want to depend on the locale here, we use
	   ISO-8859-1 instead.  */
	enum enc
	{
	  enc_undetermined,
	  enc_ucs2be,
	  enc_ucs2le,
	  enc_utf8,
	  enc_iso8859_1
	};

	/* Encoding of the current input file.  It is enc_undetermined until
	   phase2_getc has looked at the first bytes of the file.  */
	static enum enc encoding;

	/* Return the next character of the input as an UCS-4 value, or UEOF at
	   end of input.  A character given back through phase2_ungetc is
	   returned first.
	   On the first read of a file, the encoding is chosen from the byte
	   order mark at the start of the input: FE FF selects UCS-2BE, FF FE
	   selects UCS-2LE, EF BB BF selects UTF-8; without a BOM the bytes are
	   given back to phase 1 and the file is read as ISO-8859-1.
	   If the input ends in the middle of a multi-byte character, UEOF is
	   returned.  */
	static int
	phase2_getc ()
	{
	  if (phase2_pushback_length != 0)
	    {
	      phase2_pushback_length--;
	      return phase2_pushback[phase2_pushback_length];
	    }

	  if (encoding == enc_undetermined)
	    {
	      /* Determine the input file's encoding from its first bytes.  */
	      int b0, b1;

	      b0 = phase1_getc ();
	      if (b0 == EOF)
	        return UEOF;
	      b1 = phase1_getc ();
	      if (b1 == EOF)
	        {
	          /* A one-byte file cannot carry a BOM.  */
	          phase1_ungetc (b0);
	          encoding = enc_iso8859_1;
	        }
	      else if (b0 == 0xfe && b1 == 0xff)
	        encoding = enc_ucs2be;
	      else if (b0 == 0xff && b1 == 0xfe)
	        encoding = enc_ucs2le;
	      else
	        {
	          int b2 = phase1_getc ();

	          if (b0 == 0xef && b1 == 0xbb && b2 == 0xbf)
	            encoding = enc_utf8;
	          else
	            {
	              /* No BOM: give everything back, in reverse order.  When
	                 b2 is EOF, phase1_ungetc ignores it.  */
	              phase1_ungetc (b2);
	              phase1_ungetc (b1);
	              phase1_ungetc (b0);
	              encoding = enc_iso8859_1;
	            }
	        }
	    }

	  switch (encoding)
	    {
	    case enc_ucs2be:
	    case enc_ucs2le:
	      /* Read an UCS-2 encoded character: two bytes, in the byte order
	         selected by the BOM.  */
	      {
	        int first, second;

	        first = phase1_getc ();
	        if (first == EOF)
	          return UEOF;
	        second = phase1_getc ();
	        if (second == EOF)
	          return UEOF;
	        if (encoding == enc_ucs2be)
	          return (first << 8) + second;
	        return first + (second << 8);
	      }

	    case enc_utf8:
	      /* Read an UTF-8 encoded character.  */
	      {
	        /* min_lead[k - 1] is the smallest lead byte for which byte
	           number k of the sequence is read.  */
	        static const unsigned char min_lead[] =
	          { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
	        unsigned char buf[6];
	        unsigned int count;
	        ucs4_t uc;
	        int c;

	        c = phase1_getc ();
	        if (c == EOF)
	          return UEOF;
	        buf[0] = c;

	        /* count is the number of bytes stored in buf so far.  */
	        for (count = 1; count < 6; count++)
	          {
	            if (buf[0] < min_lead[count - 1])
	              break;
	            /* The second byte is read whatever it is; every further
	               byte is read only if the byte before it is a
	               continuation byte (0x80..0xBF).  */
	            if (count >= 2 && (buf[count - 1] ^ 0x80) >= 0x40)
	              break;
	            c = phase1_getc ();
	            if (c == EOF)
	              return UEOF;
	            buf[count] = c;
	          }

	        u8_mbtouc (&uc, buf, count);
	        return uc;
	      }

	    case enc_iso8859_1:
	      /* Read an ISO-8859-1 encoded character: the byte value is the
	         character code.  */
	      {
	        int c = phase1_getc ();

	        return (c == EOF ? UEOF : c);
	      }

	    default:
	      abort ();
	    }
	}

	/* Hand the character C back so that the next phase2_getc call returns it.
	   UEOF is never stored; pushing it back is a no-op.  */
	static void
	phase2_ungetc (int c)
	{
	  if (c == UEOF)
	    return;

	  phase2_pushback[phase2_pushback_length] = c;
	  phase2_pushback_length++;
	}


	/* Phase 3: Read an UCS-4 character, with line number handling.  */

	/* Return the next character from phase 2, bumping gram_pos.line_number
	   each time a newline is delivered.  */
	static int
	phase3_getc ()
	{
	  int ch = phase2_getc ();

	  if (ch != '\n')
	    return ch;

	  gram_pos.line_number++;
	  return ch;
	}

	/* Undo a phase3_getc: hand C back to phase 2 and, when C is a newline,
	   take back the line number increment that reading it caused.  */
	static void
	phase3_ungetc (int c)
	{
	  if (c == '\n')
	    gram_pos.line_number -= 1;

	  phase2_ungetc (c);
	}


	/* Convert from UCS-4 to UTF-8.
	   BUFFER holds BUFLEN code points.  Return a newly allocated,
	   NUL-terminated UTF-8 string.

	   The code points come straight from the input file (raw UCS-2 units,
	   \uXXXX and octal escapes), so they are not guaranteed to be encodable.
	   A value that u8_uctomb refuses is replaced by U+FFFD REPLACEMENT
	   CHARACTER rather than tripping an assertion on malformed input.  */
	static char *
	conv_from_ucs4 (const int *buffer, size_t buflen)
	{
	  unsigned char *utf8_string;
	  unsigned char *q;
	  size_t pos;

	  /* Each UCS-4 word needs 6 bytes at worst.  */
	  utf8_string = XNMALLOC (6 * buflen + 1, unsigned char);

	  q = utf8_string;
	  for (pos = 0; pos < buflen; pos++)
	    {
	      unsigned int uc = buffer[pos];
	      int n = u8_uctomb (q, uc, 6);

	      if (n <= 0)
	        /* Not encodable; emit the replacement character instead, which
	           always fits in the 6 bytes reserved per input word.  */
	        n = u8_uctomb (q, 0xfffd, 6);
	      assert (n > 0);
	      q += n;
	    }
	  *q = '\0';
	  assert ((size_t) (q - utf8_string) <= 6 * buflen);

	  return (char *) utf8_string;
	}


	/* Parse a string enclosed in double-quotes.  Input is UCS-4 encoded.
	   STRING points to LENGTH code points; the first one must be the opening
	   double-quote.  Return the string in UTF-8 encoding, or NULL if the input
	   doesn't represent a valid string enclosed in double-quotes (empty input,
	   no opening quote, or no closing quote before the end of the input).

	   Escapes understood after a backslash:
	     - one to three octal digits,
	     - 'u' or 'U' followed by up to four hexadecimal digits,
	     - a b t r n v f with their usual C meaning,
	     - any other character stands for itself.
	   Parsing stops at the closing double-quote; anything after it is not
	   examined.  */
	static char *
	parse_escaped_string (const int *string, size_t length)
	{
	  /* Scratch buffer for the decoded code points, kept across calls so that
	     it only ever grows.  */
	  static int *buffer;
	  static size_t bufmax;
	  static size_t buflen;
	  const int *string_limit = string + length;
	  int c;

	  if (string == string_limit)
	    return NULL;
	  c = *string++;
	  if (c != '"')
	    return NULL;
	  buflen = 0;
	  for (;;)
	    {
	      if (string == string_limit)
	        return NULL;
	      c = *string++;
	      if (c == '"')
	        break;
	      if (c == '\\')
	        {
	          if (string == string_limit)
	            return NULL;
	          c = *string++;
	          if (c >= '0' && c <= '7')
	            {
	              /* Octal escape: the digit just read plus at most two more.  */
	              unsigned int n = 0;
	              int j = 0;
	              for (;;)
	                {
	                  n = n * 8 + (c - '0');
	                  if (++j == 3)
	                    break;
	                  if (string == string_limit)
	                    break;
	                  c = *string;
	                  if (!(c >= '0' && c <= '7'))
	                    break;
	                  string++;
	                }
	              c = n;
	            }
	          else if (c == 'u' || c == 'U')
	            {
	              /* Unicode escape: up to four hexadecimal digits.  */
	              unsigned int n = 0;
	              int j;
	              for (j = 0; j < 4; j++)
	                {
	                  if (string == string_limit)
	                    break;
	                  c = *string;
	                  if (c >= '0' && c <= '9')
	                    n = n * 16 + (c - '0');
	                  else if (c >= 'A' && c <= 'F')
	                    n = n * 16 + (c - 'A' + 10);
	                  else if (c >= 'a' && c <= 'f')
	                    n = n * 16 + (c - 'a' + 10);
	                  else
	                    break;
	                  string++;
	                }
	              c = n;
	            }
	          else
	            switch (c)
	              {
	              case 'a': c = '\a'; break;
	              case 'b': c = '\b'; break;
	              case 't': c = '\t'; break;
	              case 'r': c = '\r'; break;
	              case 'n': c = '\n'; break;
	              case 'v': c = '\v'; break;
	              case 'f': c = '\f'; break;
	              }
	        }
	      if (buflen >= bufmax)
	        {
	          bufmax = 2 * bufmax + 10;
	          buffer = xrealloc (buffer, bufmax * sizeof (int));
	        }
	      buffer[buflen++] = c;
	    }

	  return conv_from_ucs4 (buffer, buflen);
	}


	/* Accumulating flag comments.  */

	/* Comma separated list of the flags collected so far for the upcoming
	   message, as a heap allocated string, or NULL when there is none.  */
	static char *special_comment;

	/* Throw away any flags collected so far.  */
	static inline void
	special_comment_reset ()
	{
	  /* free (NULL) does nothing, so no guard is needed.  */
	  free (special_comment);
	  special_comment = NULL;
	}

	/* Append FLAG to the list of collected flags.  The first flag is stored
	   as is; every later one is joined to the list with ", ".  */
	static void
	special_comment_add (const char *flag)
	{
	  size_t old_len;
	  size_t flag_len;

	  if (special_comment == NULL)
	    {
	      special_comment = xstrdup (flag);
	      return;
	    }

	  old_len = strlen (special_comment);
	  flag_len = strlen (flag);
	  /* Room for the old text, the separator, the flag and the final NUL.  */
	  special_comment = xrealloc (special_comment, old_len + 2 + flag_len + 1);
	  memcpy (special_comment + old_len, ", ", 2);
	  memcpy (special_comment + old_len + 2, flag, flag_len + 1);
	}

	/* Report the collected flags, if any, through po_callback_comment_special
	   and then forget them.  */
	static inline void
	special_comment_finish ()
	{
	  if (special_comment == NULL)
	    return;

	  po_callback_comment_special (special_comment);
	  free (special_comment);
	  special_comment = NULL;
	}


	/* Accumulating comments.  */

	/* Code points of the comment line being collected: BUFFER has room for
	   BUFMAX entries, of which the first BUFLEN are in use.  */
	static int *buffer = NULL;
	static size_t bufmax = 0;
	static size_t buflen = 0;
	/* Set when a "Flag: unmatched" comment line has been seen.  */
	static bool next_is_obsolete = false;
	/* Set when a "Flag: untranslated" comment line has been seen.  */
	static bool next_is_fuzzy = false;
	/* Fuzzy msgstr recovered from a comment of the form = "escaped string",
	   or NULL.  */
	static char *fuzzy_msgstr = NULL;
	/* While true, a C style resp. C++ style comment is examined for being a
	   fuzzy msgstr instead of a regular comment.  */
	static bool expect_fuzzy_msgstr_as_c_comment = false;
	static bool expect_fuzzy_msgstr_as_cxx_comment = false;

	/* Begin collecting a new comment line: empty the buffer but keep its
	   storage.  */
	static inline void
	comment_start ()
	{
	  buflen = 0;
	}

	/* Append the code point C to the comment line being collected, enlarging
	   the buffer first when it is full.  */
	static inline void
	comment_add (int c)
	{
	  if (bufmax <= buflen)
	    {
	      size_t grown = 2 * bufmax + 10;

	      buffer = xrealloc (buffer, grown * sizeof (int));
	      bufmax = grown;
	    }

	  buffer[buflen] = c;
	  buflen++;
	}

	/* Finish the comment line collected in the buffer and dispatch it.

	   CHARS_TO_REMOVE is the number of trailing code points to discard first
	   (the newline, or the stars and slash closing a C comment).  Trailing
	   blanks and tabs are then dropped as well.

	   When TEST_FOR_FUZZY_MSGSTR is true and the line has the form
	       = "escaped string"
	   optionally followed by a semicolon, the decoded string is stored in
	   fuzzy_msgstr and nothing else happens.  Otherwise the line is converted
	   to UTF-8 and classified:
	     "Flag: untranslated"   adds the flag "fuzzy" and sets next_is_fuzzy,
	     "Flag: unmatched"      sets next_is_obsolete,
	     "Flag: <text>"         adds <text> to the special comment,
	     "Comment: <text>"      is reported as an extracted comment,
	     "File: <name>:<number>" is reported as a file position,
	     anything else          is reported as a plain comment.  */
	static inline void
	comment_line_end (size_t chars_to_remove, bool test_for_fuzzy_msgstr)
	{
	  char *line;

	  buflen -= chars_to_remove;
	  /* Drop trailing white space, but not EOLs.  */
	  while (buflen >= 1
	         && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
	    --buflen;

	  /* At special positions we interpret a comment of the form
	       = "escaped string"
	     with an optional trailing semicolon as being the fuzzy msgstr, not a
	     regular comment.  */
	  if (test_for_fuzzy_msgstr
	      && buflen > 2 && buffer[0] == '=' && buffer[1] == ' '
	      && (fuzzy_msgstr =
	          parse_escaped_string (buffer + 2,
	                                buflen - (buffer[buflen - 1] == ';') - 2)))
	    return;

	  line = conv_from_ucs4 (buffer, buflen);

	  if (strcmp (line, "Flag: untranslated") == 0)
	    {
	      special_comment_add ("fuzzy");
	      next_is_fuzzy = true;
	    }
	  else if (strcmp (line, "Flag: unmatched") == 0)
	    next_is_obsolete = true;
	  else if (strlen (line) >= 6 && memcmp (line, "Flag: ", 6) == 0)
	    special_comment_add (line + 6);
	  else if (strlen (line) >= 9 && memcmp (line, "Comment: ", 9) == 0)
	    /* A comment extracted from the source.  */
	    po_callback_comment_dot (line + 9);
	  else
	    {
	      char *last_colon;
	      unsigned long number;
	      char *endp;

	      if (strlen (line) >= 6 && memcmp (line, "File: ", 6) == 0
	          && (last_colon = strrchr (line + 6, ':')) != NULL
	          && *(last_colon + 1) != '\0'
	          && (number = strtoul (last_colon + 1, &endp, 10), *endp == '\0'))
	        {
	          /* A "File: <filename>:<number>" type comment.  */
	          *last_colon = '\0';
	          po_callback_comment_filepos (line + 6, number);
	        }
	      else
	        po_callback_comment (line);
	    }

	  /* LINE was allocated by conv_from_ucs4 for this call only.
	     special_comment_add copies the text it is given, and
	     special_comment_finish likewise frees its string right after the
	     po_callback_comment_special call, so release the line here instead
	     of leaking one allocation per comment line.
	     NOTE(review): relies on po_callback_comment, po_callback_comment_dot
	     and po_callback_comment_filepos copying their argument -- confirm.  */
	  free (line);
	}


	/* Phase 4: Replace each comment that is not inside a string with a space
	   character.  */

	/* Return the next character from phase 3 with comments filtered out.
	   A C style comment is delivered as a single ' ', a C++ style comment as
	   a single '\n'; a '/' that starts neither is returned unchanged.  The
	   text of the comment is passed, line by line, to comment_line_end.  An
	   unterminated C style comment ends silently at end of input.  */
	static int
	phase4_getc ()
	{
	  int c;

	  c = phase3_getc ();
	  if (c != '/')
	    return c;
	  c = phase3_getc ();
	  switch (c)
	    {
	    default:
	      phase3_ungetc (c);
	      return '/';

	    case '*':
	      /* C style comment.  */
	      {
	        bool last_was_star;
	        size_t trailing_stars;
	        bool seen_newline;

	        comment_start ();
	        last_was_star = false;
	        trailing_stars = 0;
	        seen_newline = false;
	        /* Drop additional stars at the beginning of the comment.  */
	        for (;;)
	          {
	            c = phase3_getc ();
	            if (c != '*')
	              break;
	            /* Remembered so that a slash right after the opening stars
	               closes the comment.  */
	            last_was_star = true;
	          }
	        phase3_ungetc (c);
	        for (;;)
	          {
	            c = phase3_getc ();
	            if (c == UEOF)
	              break;
	            /* We skip all leading white space, but not EOLs.  */
	            if (!(buflen == 0 && (c == ' ' || c == '\t')))
	              comment_add (c);
	            switch (c)
	              {
	              case '\n':
	                seen_newline = true;
	                /* The newline itself is not part of the line.  */
	                comment_line_end (1, false);
	                comment_start ();
	                last_was_star = false;
	                trailing_stars = 0;
	                continue;

	              case '*':
	                last_was_star = true;
	                trailing_stars++;
	                continue;

	              case '/':
	                if (last_was_star)
	                  {
	                    /* Drop additional stars at the end of the comment.
	                       Only a comment without any newline can be a fuzzy
	                       msgstr.  */
	                    comment_line_end (trailing_stars + 1,
	                                      expect_fuzzy_msgstr_as_c_comment
	                                      && !seen_newline);
	                    break;
	                  }
	                FALLTHROUGH;

	              default:
	                last_was_star = false;
	                trailing_stars = 0;
	                continue;
	              }
	            /* Only reached from the end-of-comment case above.  */
	            break;
	          }
	        return ' ';
	      }

	    case '/':
	      /* C++ style comment.  */
	      comment_start ();
	      for (;;)
	        {
	          c = phase3_getc ();
	          if (c == '\n' || c == UEOF)
	            break;
	          /* We skip all leading white space, but not EOLs.  */
	          if (!(buflen == 0 && (c == ' ' || c == '\t')))
	            comment_add (c);
	        }
	      comment_line_end (0, expect_fuzzy_msgstr_as_cxx_comment);
	      return '\n';
	    }
	}

	/* Hand C back to the reader.  Phase 4 keeps no pushback of its own; the
	   character goes straight to phase 3, which also takes care of the line
	   number.  */
	static inline void
	phase4_ungetc (int c)
	{
	  phase3_ungetc (c);
	}


	/* Return true if a character is considered as whitespace.
	   These are space, tab, carriage return, newline, form feed and
	   backspace.  Vertical tab is not in the set.  */
	static bool
	is_whitespace (int c)
	{
	  switch (c)
	    {
	    case ' ': case '\t': case '\r': case '\n': case '\f': case '\b':
	      return true;
	    default:
	      return false;
	    }
	}

	/* Return true if a character needs quoting, i.e. cannot be used in unquoted
	   tokens.  ASCII letters, ASCII digits and the punctuation characters
	   listed below may appear unquoted; every other value, including
	   whitespace, '"', '=' and ';', needs quoting.  */
	static bool
	is_quotable (int c)
	{
	  if ((c >= '0' && c <= '9')
	      || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
	    return false;
	  switch (c)
	    {
	    case '!': case '#': case '$': case '%': case '&': case '*':
	    case '+': case '-': case '.': case '/': case ':': case '?':
	    case '@': case '|': case '~': case '_': case '^':
	      return false;
	    default:
	      return true;
	    }
	}


	/* Read a key or value string.
	Return the string in UTF-8 encoding, or NULL if no string is seen.
	Return the start position of the string in pos. /
	static char *
	read_string (lex_pos_ty *pos)
	{
	static int *buffer;
	static size_t bufmax;
	static size_t buflen;
	int c;

	/* Skip whitespace before the string. */
	do
	c = phase4_getc ();
	while (is_whitespace (c));

	if (c == UEOF)
	/* No more string. */
	return NULL;

	*pos = gram_pos;
	buflen = 0;
	if (c == '"')
	{
	/* Read a string enclosed in double-quotes. */
	for (;;)
	{
	c = phase3_getc ();
	if (c == UEOF \|\| c == '"')
	break;
	if (c == '\\')
	{
	c = phase3_getc ();
	if (c == UEOF)
	break;
	if (c >= '0' && c <= '7')
	{
	unsigned int n = 0;
	int j = 0;
	for (;;)
	{
	n = n * 8 + (c - '0');
	if (++j == 3)
	break;
	c = phase3_getc ();
	if (!(c >= '0' && c <= '7'))
	{
	phase3_ungetc (c);
	break;
	}
	}
	c = n;
	}
	else if (c == 'u' \|\| c == 'U')
	{
	unsigned int n = 0;
	int j;
	for (j = 0; j < 4; j++)
	{
	c = phase3_getc ();
	if (c >= '0' && c <= '9')
	n = n * 16 + (c - '0');
	else if (c >= 'A' && c <= 'F')
	n = n * 16 + (c - 'A' + 10);
	else if (c >= 'a' && c <= 'f')
	n = n * 16 + (c - 'a' + 10);
	else
	{
	phase3_ungetc (c);
	break;
	}
	}
	c = n;
	}
	else
	switch (c)
	{
	case 'a': c = '\a'; break;
	case 'b': c = '\b'; break;
	case 't': c = '\t'; break;
	case 'r': c = '\r'; break;
	case 'n': c = '\n'; break;
	case 'v': c = '\v'; break;
	case 'f': c = '\f'; break;
	}
	}
	if (buflen >= bufmax)
	{
	bufmax = 2 * bufmax + 10;
	buffer = xrealloc (buffer, bufmax * sizeof (int));
	}
	buffer[buflen++] = c;
	}
	if (c == UEOF)
	po_xerror (PO_SEVERITY_ERROR, NULL,
	real_file_name, gram_pos.line_number, (size_t)(-1), false,
	_("warning: unterminated string"));
	}
	else
	{
	/* Read a token outside quotes. */
	if (is_quotable (c))
	po_xerror (PO_SEVERITY_ERROR, NULL,
	real_file_name, gram_pos.line_number, (size_t)(-1), false,
	_("warning: syntax error"));
	for (; c != UEOF && !is_quotable (c); c = phase4_getc ())
	{
	if (buflen >= bufmax)
	{
	bufmax = 2 * bufmax + 10;
	buffer = xrealloc (buffer, bufmax * sizeof (int));
	}
	buffer[buflen++] = c;
	}
	}

	return conv_from_ucs4 (buffer, buflen);
	}


	/* Read a .strings file from a stream, and dispatch to the various
	   abstract_catalog_reader_class_ty methods.

	   FILE is the stream to read and REAL_FILENAME the name used in
	   diagnostics and in the recorded positions.  POP and LOGICAL_FILENAME
	   are not used by this parser.

	   The file is read as a sequence of
	       key = value ;      or      key ;
	   entries.  Each one is reported through po_callback_message; comments
	   in front of it are reported through the comment callbacks as they are
	   read.  Parsing stops at end of input or at the first syntax error,
	   which is reported through po_xerror.

	   Strings that were read but never handed to po_callback_message are
	   freed here; msgid and msgstr passed to the callback are left alone.  */
	static void
	stringtable_parse (abstract_catalog_reader_ty *pop, FILE *file,
	                   const char *real_filename, const char *logical_filename)
	{
	  fp = file;
	  real_file_name = real_filename;
	  /* Not freed here: the positions copied from gram_pos below carry this
	     pointer.  */
	  gram_pos.file_name = xstrdup (real_file_name);
	  gram_pos.line_number = 1;
	  encoding = enc_undetermined;
	  expect_fuzzy_msgstr_as_c_comment = false;
	  expect_fuzzy_msgstr_as_cxx_comment = false;

	  for (;;)
	    {
	      char *msgid;
	      lex_pos_ty msgid_pos;
	      char *msgstr;
	      lex_pos_ty msgstr_pos;
	      int c;

	      /* Prepare for next msgid/msgstr pair.  */
	      special_comment_reset ();
	      next_is_obsolete = false;
	      next_is_fuzzy = false;
	      fuzzy_msgstr = NULL;

	      /* Read the key and all the comments preceding it.  */
	      msgid = read_string (&msgid_pos);
	      if (msgid == NULL)
	        break;

	      special_comment_finish ();

	      /* Skip whitespace.  */
	      do
	        c = phase4_getc ();
	      while (is_whitespace (c));

	      /* Expect a '=' or ';'.  */
	      if (c == UEOF)
	        {
	          po_xerror (PO_SEVERITY_ERROR, NULL,
	                     real_file_name, gram_pos.line_number, (size_t)(-1), false,
	                     _("warning: unterminated key/value pair"));
	          free (msgid);
	          break;
	        }
	      if (c == ';')
	        {
	          /* "key"; is an abbreviation for "key"=""; and does not
	             necessarily designate an untranslated entry.  */
	          msgstr = xstrdup ("");
	          msgstr_pos = msgid_pos;
	          po_callback_message (NULL, msgid, &msgid_pos, NULL,
	                               msgstr, strlen (msgstr) + 1, &msgstr_pos,
	                               NULL, NULL, NULL,
	                               false, next_is_obsolete);
	        }
	      else if (c == '=')
	        {
	          /* Read the value.  */
	          msgstr = read_string (&msgstr_pos);
	          if (msgstr == NULL)
	            {
	              po_xerror (PO_SEVERITY_ERROR, NULL,
	                         real_file_name, gram_pos.line_number, (size_t)(-1),
	                         false, _("warning: unterminated key/value pair"));
	              free (msgid);
	              break;
	            }

	          /* Skip whitespace.  But for fuzzy key/value pairs, look for the
	             tentative msgstr in the form of a C style comment.  */
	          expect_fuzzy_msgstr_as_c_comment = next_is_fuzzy;
	          do
	            {
	              c = phase4_getc ();
	              if (fuzzy_msgstr != NULL)
	                expect_fuzzy_msgstr_as_c_comment = false;
	            }
	          while (is_whitespace (c));
	          expect_fuzzy_msgstr_as_c_comment = false;

	          /* Expect a ';'.  */
	          if (c == ';')
	            {
	              /* But for fuzzy key/value pairs, look for the tentative msgstr
	                 in the form of a C++ style comment.  */
	              if (fuzzy_msgstr == NULL && next_is_fuzzy)
	                {
	                  do
	                    c = phase3_getc ();
	                  while (c == ' ');
	                  phase3_ungetc (c);

	                  /* Reading one character through phase 4 processes a
	                     comment that starts here; the character itself is
	                     handed back.  */
	                  expect_fuzzy_msgstr_as_cxx_comment = true;
	                  c = phase4_getc ();
	                  phase4_ungetc (c);
	                  expect_fuzzy_msgstr_as_cxx_comment = false;
	                }
	              if (fuzzy_msgstr != NULL)
	                {
	                  if (strcmp (msgstr, msgid) == 0)
	                    {
	                      /* The value only repeats the key; the tentative
	                         msgstr takes its place.  */
	                      free (msgstr);
	                      msgstr = fuzzy_msgstr;
	                    }
	                  else
	                    /* The tentative msgstr is not needed.  */
	                    free (fuzzy_msgstr);
	                  fuzzy_msgstr = NULL;
	                }

	              /* A key/value pair.  */
	              po_callback_message (NULL, msgid, &msgid_pos, NULL,
	                                   msgstr, strlen (msgstr) + 1, &msgstr_pos,
	                                   NULL, NULL, NULL,
	                                   false, next_is_obsolete);
	            }
	          else
	            {
	              po_xerror (PO_SEVERITY_ERROR, NULL,
	                         real_file_name, gram_pos.line_number, (size_t)(-1),
	                         false,
	                         _("warning: syntax error, expected ';' after string"));
	              free (msgid);
	              free (msgstr);
	              free (fuzzy_msgstr);
	              fuzzy_msgstr = NULL;
	              break;
	            }
	        }
	      else
	        {
	          po_xerror (PO_SEVERITY_ERROR, NULL,
	                     real_file_name, gram_pos.line_number, (size_t)(-1), false,
	                     _("warning: syntax error, expected '=' or ';' after string"));
	          free (msgid);
	          break;
	        }
	    }

	  fp = NULL;
	  real_file_name = NULL;
	  gram_pos.line_number = 0;
	}

	/* Descriptor of the .strings input format.  Its parser entry point is
	   stringtable_parse, and it is flagged as producing UTF-8: every string
	   the parser hands out has been converted by conv_from_ucs4.  */
	const struct catalog_input_format input_format_stringtable =
	{
	stringtable_parse, /* parse */
	true /* produces_utf8 */
	};