mingw/gettext/gettext-tools/src/x-java.c - kiwivm - Git at Google

 /* xgettext Java backend.
    Copyright (C) 2003, 2005-2009, 2018-2020 Free Software Foundation, Inc.
    Written by Bruno Haible <bruno@clisp.org>, 2003.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

 #ifdef HAVE_CONFIG_H
 # include "config.h"
 #endif

 /* Specification.  */
 #include "x-java.h"

 #include <errno.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 #include "attribute.h"
 #include "message.h"
 #include "rc-str-list.h"
 #include "xgettext.h"
 #include "xg-pos.h"
 #include "xg-encoding.h"
 #include "xg-mixed-string.h"
 #include "xg-arglist-context.h"
 #include "xg-arglist-callshape.h"
 #include "xg-arglist-parser.h"
 #include "xg-message.h"
 #include "error.h"
 #include "error-progname.h"
 #include "xalloc.h"
 #include "mem-hash-map.h"
 #include "po-charset.h"
 #include "unistr.h"
 #include "unictype.h"
 #include "gettext.h"

 #define _(s) gettext(s)

 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))


 /* The Java syntax is defined in the
      Java Language Specification
      (available from https://docs.oracle.com/javase/specs/),
      chapter 3 "Lexical Structure".  */


 /* ====================== Keyword set customization.  ====================== */

 /* If true extract all strings.  Set by x_java_extract_all ().  */
 static bool extract_all = false;

 /* Table of keywords, filled by x_java_keyword () through
    insert_keyword_callshape ().  x_java_keyword () calls hash_init () on it
    when keywords.table is still NULL.  */
 static hash_table keywords;
 /* True as long as init_keywords () still has to install the built-in
    keywords.  Cleared by x_java_keyword (NULL) and by init_keywords ()
    itself.  */
 static bool default_keywords = true;


 /* Turn on the file-level extract_all flag ("extract all strings").  */
 void
 x_java_extract_all ()
 {
   extract_all = true;
 }


 /* Register one keyword specification.
    NAME == NULL adds nothing; it only disables the installation of the
    built-in default keywords.  Otherwise NAME is split by
    split_keywordspec () into an identifier part and a call shape, and that
    pair is inserted into the keywords table.  */
 void
 x_java_keyword (const char *name)
 {
   const char *spec_end;
   const char *first_colon;
   struct callshape shape;

   if (name == NULL)
     {
       default_keywords = false;
       return;
     }

   /* Create the table lazily, on the first real keyword.  */
   if (keywords.table == NULL)
     hash_init (&keywords, 100);

   split_keywordspec (name, &spec_end, &shape);

   /* The characters between name and spec_end should form a valid Java
      identifier sequence with dots.  A colon inside that range means an
      invalid parse in split_keywordspec (); such a specification is not
      inserted.  */
   first_colon = strchr (name, ':');
   if (first_colon != NULL && first_colon < spec_end)
     return;

   insert_keyword_callshape (&keywords, name, spec_end - name, &shape);
 }

 /* Complete the keywords hash table.
    Called after argument processing, before each file is processed.
    Installs the built-in keywords unless default_keywords has been cleared,
    and then clears it so that the installation happens at most once.  */
 static void
 init_keywords ()
 {
   /* When adding new keywords here, also update the documentation in
      xgettext.texi!  */
   static const char *const builtin_keywords[] =
     {
       "GettextResource.gettext:2",        /* static method */
       "GettextResource.ngettext:2,3",     /* static method */
       "GettextResource.pgettext:2c,3",    /* static method */
       "GettextResource.npgettext:2c,3,4", /* static method */
       "gettext",
       "ngettext:1,2",
       "pgettext:1c,2",
       "npgettext:1c,2,3",
       "getString"                         /* ResourceBundle.getString */
     };
   size_t k;

   if (!default_keywords)
     return;

   for (k = 0;
        k < sizeof (builtin_keywords) / sizeof (builtin_keywords[0]);
        k++)
     x_java_keyword (builtin_keywords[k]);

   default_keywords = false;
 }

 /* Record the format-string flags of the Java functions known to xgettext.
    Each entry has the form "function:argument-number:flag" and is handed to
    xgettext_record_flag () in the order listed.  */
 void
 init_flag_table_java ()
 {
   static const char *const flag_specs[] =
     {
       "GettextResource.gettext:2:pass-java-format",
       "GettextResource.gettext:2:pass-java-printf-format",
       "GettextResource.ngettext:2:pass-java-format",
       "GettextResource.ngettext:2:pass-java-printf-format",
       "GettextResource.ngettext:3:pass-java-format",
       "GettextResource.ngettext:3:pass-java-printf-format",
       "GettextResource.pgettext:3:pass-java-format",
       "GettextResource.pgettext:3:pass-java-printf-format",
       "GettextResource.npgettext:3:pass-java-format",
       "GettextResource.npgettext:3:pass-java-printf-format",
       "GettextResource.npgettext:4:pass-java-format",
       "GettextResource.npgettext:4:pass-java-printf-format",
       "gettext:1:pass-java-format",
       "gettext:1:pass-java-printf-format",
       "ngettext:1:pass-java-format",
       "ngettext:1:pass-java-printf-format",
       "ngettext:2:pass-java-format",
       "ngettext:2:pass-java-printf-format",
       "pgettext:2:pass-java-format",
       "pgettext:2:pass-java-printf-format",
       "npgettext:2:pass-java-format",
       "npgettext:2:pass-java-printf-format",
       "npgettext:3:pass-java-format",
       "npgettext:3:pass-java-printf-format",
       "getString:1:pass-java-format",
       "getString:1:pass-java-printf-format",
       "MessageFormat:1:java-format",
       "MessageFormat.format:1:java-format",
       "String.format:1:java-printf-format",
       "printf:1:java-printf-format" /* PrintStream.printf */
     };
   size_t k;

   for (k = 0; k < sizeof (flag_specs) / sizeof (flag_specs[0]); k++)
     xgettext_record_flag (flag_specs[k]);
 }


 /* ======================== Reading of characters.  ======================== */

 /* The input file stream.  */
 static FILE *fp;


 /* Fetch the next single-byte character from the input file.
    Pushback can consist of an unlimited number of 'u' followed by up to 4
    other characters.  */

 /* Special coding of multiple 'u's in the pushback buffer.
    A run of COUNT consecutive 'u' characters occupies one slot whose value is
    0x1000 + COUNT; phase1_getc () treats every slot value >= MULTIPLE_U (0)
    as such a run.  */
 #define MULTIPLE_U(count) (0x1000 + (count))

 /* Pushback stack of phase 1.  The element pushed last is at index
    phase1_pushback_length - 1.  */
 static int phase1_pushback[5];
 static unsigned int phase1_pushback_length;

 /* Return the next byte of the input, or EOF.
    Characters pushed back through phase1_ungetc () are delivered first, the
    most recently pushed one first.  A read error on the stream is reported
    through error () with EXIT_FAILURE.  */
 static int
 phase1_getc ()
 {
   int c;

   if (phase1_pushback_length != 0)
     {
       unsigned int top = phase1_pushback_length - 1;
       int pushed = phase1_pushback[top];

       if (pushed < MULTIPLE_U (0))
         {
           /* An ordinary character: pop it.  */
           phase1_pushback_length = top;
           return pushed;
         }

       /* A run of 'u's.  Deliver one of them.  The slot stays on the stack,
          with its count decremented, as long as it held more than one.  */
       if (pushed > MULTIPLE_U (1))
         phase1_pushback[top] = pushed - 1;
       else
         phase1_pushback_length = top;
       return 'u';
     }

   c = getc (fp);
   if (c == EOF && ferror (fp))
     error (EXIT_FAILURE, errno,
            _("error while reading \"%s\""), real_file_name);

   return c;
 }

 /* Push C back so that the next phase1_getc () returns it.
    Supports any number of 'u' and up to 4 arbitrary characters of pushback.
    Pushing EOF is a no-op.  Aborts when the pushback stack is full.  */
 static void
 phase1_ungetc (int c)
 {
   if (c == EOF)
     return;

   if (c == 'u')
     {
       /* Consecutive 'u's share one slot: just bump the count of the run
          that is already on top of the stack, if there is one.  */
       if (phase1_pushback_length > 0
           && phase1_pushback[phase1_pushback_length - 1] >= MULTIPLE_U (0))
         {
           phase1_pushback[phase1_pushback_length - 1]++;
           return;
         }
       /* Otherwise start a new run of length 1.  */
       c = MULTIPLE_U (1);
     }

   if (phase1_pushback_length == SIZEOF (phase1_pushback))
     abort ();
   phase1_pushback[phase1_pushback_length++] = c;
 }


 /* Fetch the next single-byte character or Unicode character from the file.
    (Here, as in the Java Language Specification, when we say "Unicode
    character", we actually mean "UTF-16 encoding unit".)  */

 /* Return value of phase 2, 3, 4 when EOF is reached.  */
 #define P2_EOF 0xffff

 /* Convert a UTF-16 code point to a return value that can be distinguished
    from a single-byte return value.  */
 #define UNICODE(code) (0x10000 + (code))

 /* Test a return value of phase 2, 3, 4 whether it designates a UTF-16 code
    point.  */
 #define IS_UNICODE(p2_result) ((p2_result) >= 0x10000)

 /* Extract the UTF-16 code of a return value that satisfies IS_UNICODE.  */
 #define UTF16_VALUE(p2_result) ((p2_result) - 0x10000)

 /* Reduces a return value of phase 2, 3, 4 by unmasking the UNICODE bit,
    so that it can be more easily compared against an ASCII character.
    (RED (c) == 'x')  is equivalent to  (c == 'x' || c == UNICODE ('x')).  */
 #define RED(p2_result) ((p2_result) & 0xffff)

 /* Pushback stack of phase 2; it holds at most one element.  */
 static int phase2_pushback[1];
 static int phase2_pushback_length;

 /* Return the next character of phase 2:
      - P2_EOF at end of input,
      - UNICODE (n) for a backslash followed by one or more 'u' and exactly
        four hexadecimal digits with value n,
      - otherwise the byte delivered by phase 1, unchanged.
    A backslash that does not start such a complete sequence is returned as
    '\\', and everything read after it is pushed back to phase 1.  */
 static int
 phase2_getc ()
 {
   int c;

   if (phase2_pushback_length)
     return phase2_pushback[--phase2_pushback_length];

   c = phase1_getc ();
   if (c == EOF)
     return P2_EOF;
   if (c == '\\')
     {
       c = phase1_getc ();
       if (c == 'u')
         {
           /* Number of 'u' characters seen after the backslash.  */
           unsigned int u_count = 1;
           /* The hexadecimal digits read so far, kept for pushing them back.  */
           unsigned char buf[4];
           /* Value accumulated from the hexadecimal digits.  */
           unsigned int n;
           int i;

           /* Skip over any further 'u's.  */
           for (;;)
             {
               c = phase1_getc ();
               if (c != 'u')
                 break;
               u_count++;
             }
           phase1_ungetc (c);

           n = 0;
           for (i = 0; i < 4; i++)
             {
               c = phase1_getc ();

               if (c >= '0' && c <= '9')
                 n = (n << 4) + (c - '0');
               else if (c >= 'A' && c <= 'F')
                 n = (n << 4) + (c - 'A' + 10);
               else if (c >= 'a' && c <= 'f')
                 n = (n << 4) + (c - 'a' + 10);
               else
                 {
                   /* Not a hexadecimal digit (possibly EOF): undo.  Push back
                      in reverse reading order - the offending character, the
                      digits read so far, then the 'u's - so that phase 1
                      delivers them again in their original order.  */
                   phase1_ungetc (c);
                   while (--i >= 0)
                     phase1_ungetc (buf[i]);
                   for (; u_count > 0; u_count--)
                     phase1_ungetc ('u');
                   return '\\';
                 }

               buf[i] = c;
             }
           return UNICODE (n);
         }
       /* A backslash not followed by 'u'.  */
       phase1_ungetc (c);
       return '\\';
     }
   return c;
 }

 /* Push C back so that the next phase2_getc () returns it.
    Supports only one pushback character; aborts if one is already pending.
    Pushing P2_EOF is a no-op.  */
 static void
 phase2_ungetc (int c)
 {
   if (c == P2_EOF)
     return;

   if (phase2_pushback_length == SIZEOF (phase2_pushback))
     abort ();
   phase2_pushback[phase2_pushback_length] = c;
   phase2_pushback_length++;
 }


 /* Fetch the next single-byte character or Unicode character from the file.
    With line number handling.
    Convert line terminators to '\n' or UNICODE ('\n').  */

 /* Pushback stack of phase 3; it holds at most two elements.  The element
    pushed last is at index phase3_pushback_length - 1.  */
 static int phase3_pushback[2];
 static int phase3_pushback_length;

 /* Return the next character of phase 3.
    Line terminators (CR, LF, CR LF) read from phase 2 are converted to '\n'
    or UNICODE ('\n').  line_number is incremented whenever the plain '\n'
    is returned, also when it comes from the pushback stack; it is not
    incremented for UNICODE ('\n').  */
 static int
 phase3_getc ()
 {
   int c;

   if (phase3_pushback_length != 0)
     {
       phase3_pushback_length--;
       c = phase3_pushback[phase3_pushback_length];
       if (c == '\n')
         ++line_number;
       return c;
     }

   c = phase2_getc ();

   /* Handle line terminators.  */
   switch (RED (c))
     {
     case '\r':
       {
         int next = phase2_getc ();

         /* Swallow a directly following LF, so that CR/LF counts once.  */
         if (RED (next) != '\n')
           phase2_ungetc (next);

         /* Neither the CR nor the LF was a plain byte: the terminator was
            written entirely with Unicode escapes.  */
         if (c != '\r' && next != '\n')
           return UNICODE ('\n');
       }
       /* Seen line terminator CR or CR/LF.  */
       break;

     case '\n':
       if (c != '\n')
         return UNICODE ('\n');
       /* Seen line terminator LF.  */
       break;

     default:
       return c;
     }

   ++line_number;
   return '\n';
 }

 /* Push C back so that the next phase3_getc () returns it.
    Supports 2 characters of pushback; aborts beyond that.
    Pushing P2_EOF is a no-op.  Pushing back a plain '\n' decrements
    line_number, mirroring the increment done by phase3_getc ().  */
 static void
 phase3_ungetc (int c)
 {
   if (c == P2_EOF)
     return;

   if (c == '\n')
     line_number--;

   if (phase3_pushback_length == SIZEOF (phase3_pushback))
     abort ();
   phase3_pushback[phase3_pushback_length] = c;
   phase3_pushback_length++;
 }


 /* ========================= Accumulating strings.  ======================== */

 /* See xg-mixed-string.h for the main API.  */

 /* Append C, a return value of phase 2, 3, 4, to the buffer BP: as a Unicode
    character if IS_UNICODE (c), otherwise as a single byte.  */
 static void
 mixed_string_buffer_append (struct mixed_string_buffer *bp, int c)
 {
   if (!IS_UNICODE (c))
     {
       /* A single byte.  */
       mixed_string_buffer_append_char (bp, (unsigned char) c);
       return;
     }

   /* A Unicode character.  */
   mixed_string_buffer_append_unicode (bp, UTF16_VALUE (c));
 }


 /* ======================== Accumulating comments.  ======================== */


 /* Accumulating a single comment line.  */

 /* Buffer holding the comment line that is currently being read.  It is
    (re)initialized by comment_start () and consumed by comment_line_end ().  */
 static struct mixed_string_buffer comment_buffer;

 /* Begin a new comment line: initialize comment_buffer with the lc_comment
    context and the current logical_file_name and line_number.  */
 static inline void
 comment_start ()
 {
   mixed_string_buffer_init (&comment_buffer, lc_comment,
                             logical_file_name, line_number);
 }

 /* Return true if nothing has been added to the current comment line yet,
    i.e. if comment_buffer is empty.  */
 static inline bool
 comment_at_start ()
 {
   return mixed_string_buffer_is_empty (&comment_buffer);
 }

 /* Append C, a return value of phase 3, to the current comment line.  */
 static inline void
 comment_add (int c)
 {
   mixed_string_buffer_append (&comment_buffer, c);
 }

 /* Finish the current comment line and pass it to savable_comment_add ().
    CHARS_TO_REMOVE is the number of characters at the end of the accumulated
    line that do not belong to the comment text (the newline, or the closing
    star and slash of a C style comment).  After dropping them, trailing
    spaces and tabs are stripped as well.  */
 static inline void
 comment_line_end (size_t chars_to_remove)
 {
   char *buffer =
     mixed_string_contents_free1 (mixed_string_buffer_result (&comment_buffer));
   size_t buflen = strlen (buffer);

   /* strlen () stops at the first NUL byte, so the measured length can be
      smaller than the number of characters that were accumulated (e.g. when
      the comment contains a NUL).  Clamp instead of letting the unsigned
      subtraction wrap around, which would make the accesses below go far
      out of bounds.  */
   if (buflen >= chars_to_remove)
     buflen -= chars_to_remove;
   else
     buflen = 0;
   while (buflen >= 1
          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
     --buflen;
   buffer[buflen] = '\0';
   savable_comment_add (buffer);
 }


 /* These are for tracking whether comments count as immediately before
    keyword.
    last_comment_line is set by phase4_getc () to the value of line_number
    when a comment is processed; last_non_comment_line is set by phase5_get ()
    to the line number of the latest character that was neither whitespace
    nor part of a comment.  */
 static int last_comment_line;
 static int last_non_comment_line;


 /* Replace each comment that is not inside a character constant or string
    literal with a space or newline character.  */

 /* Return the next character of phase 4.
    This is the next character of phase 3, except that a comment is replaced:
    a C style comment yields ' ' and a C++ style comment yields '\n'.
    The text of the comment is passed, line by line and without leading
    spaces and tabs, to savable_comment_add () through comment_line_end ().
    last_comment_line is updated for every comment.  */
 static int
 phase4_getc ()
 {
   int c0;
   int c;
   bool last_was_star;

   c0 = phase3_getc ();
   if (RED (c0) != '/')
     return c0;
   c = phase3_getc ();
   switch (RED (c))
     {
     default:
       /* A slash that does not start a comment.  */
       phase3_ungetc (c);
       return c0;

     case '*':
       /* C style comment.  */
       comment_start ();
       last_was_star = false;
       for (;;)
         {
           c = phase3_getc ();
           /* At EOF inside the comment, the line accumulated so far is not
              passed to comment_line_end ().  */
           if (c == P2_EOF)
             break;
           /* We skip all leading white space, but not EOLs.  */
           if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
             comment_add (c);
           switch (RED (c))
             {
             case '\n':
               /* End of one line of the comment; the 1 drops the newline
                  that was just added.  */
               comment_line_end (1);
               comment_start ();
               last_was_star = false;
               continue;

             case '*':
               last_was_star = true;
               continue;

             case '/':
               if (last_was_star)
                 {
                   /* End of the comment; the 2 drops the star and the slash
                      that were added.  This break leaves only the inner
                      switch; the break after it leaves the loop.  */
                   comment_line_end (2);
                   break;
                 }
               FALLTHROUGH;

             default:
               last_was_star = false;
               continue;
             }
           break;
         }
       last_comment_line = line_number;
       return ' ';

     case '/':
       /* C++ style comment.  */
       last_comment_line = line_number;
       comment_start ();
       for (;;)
         {
           c = phase3_getc ();
           if (RED (c) == '\n' || c == P2_EOF)
             break;
           /* We skip all leading white space, but not EOLs.  */
           if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
             comment_add (c);
         }
       phase3_ungetc (c); /* push back the newline, to decrement line_number */
       comment_line_end (0);
       phase3_getc (); /* read the newline again */
       return '\n';
     }
 }

 /* Push C back for phase 4.  The character is handed to phase3_ungetc (),
    so it shares the pushback stack of phase 3.
    Supports only one pushback character.  */
 static void
 phase4_ungetc (int c)
 {
   phase3_ungetc (c);
 }


 /* ========================== Reading of tokens.  ========================== */

 /* The kinds of tokens.  */
 enum token_type_ty
 {
   token_type_eof,
   token_type_lparen,            /* ( */
   token_type_rparen,            /* ) */
   token_type_lbrace,            /* { */
   token_type_rbrace,            /* } */
   token_type_comma,             /* , */
   token_type_dot,               /* . */
   token_type_string_literal,    /* "abc", """text block""" */
   token_type_number,            /* 1.23 */
   token_type_symbol,            /* identifier, keyword, null */
   token_type_plus,              /* + */
   token_type_other              /* character literal, misc. operator */
 };
 typedef enum token_type_ty token_type_ty;

 /* A token.  Which of the payload fields are meaningful depends on 'type',
    as noted at each field.  */
 typedef struct token_ty token_ty;
 struct token_ty
 {
   token_type_ty type;
   char *string;                         /* for token_type_symbol */
   mixed_string_ty *mixed_string;        /* for token_type_string_literal */
   refcounted_string_list_ty *comment;   /* for token_type_string_literal */
   int line_number;                      /* line on which the token starts */
 };


 /* Free the memory pointed to by a 'struct token_ty'.
    Only the payload that belongs to the token's type is released: the
    string of a symbol, or the mixed string and the comment reference of a
    string literal.  TP itself is not freed.  */
 static inline void
 free_token (token_ty *tp)
 {
   if (tp->type == token_type_symbol)
     free (tp->string);
   if (tp->type == token_type_string_literal)
     {
       /* A mixed_string_ty refers to further storage (its array of
          segments).  mixed_string_free () from xg-mixed-string.h releases
          all of it, whereas a plain free () would release only the
          top-level struct and leak the rest.  */
       mixed_string_free (tp->mixed_string);
       drop_reference (tp->comment);
     }
 }


 /* Read an escape sequence inside a string literal or character literal.
    The caller has already consumed the backslash.  Returns the denoted
    character as UNICODE (value).  At end of input, or for an unrecognized
    escape character (which is then pushed back), returns UNICODE ('\\').  */
 static inline int
 do_getc_escaped ()
 {
   int c;
   int low;
   int value;

   /* Use phase 3, because phase 4 elides comments.  */
   c = phase3_getc ();
   if (c == P2_EOF)
     return UNICODE ('\\');

   low = RED (c);

   if (low >= '0' && low <= '7')
     {
       /* Octal escape: one to three digits, where a third digit is only
          allowed when the first digit is at most 3.  */
       int more_digits = (low < '4' ? 2 : 1);

       value = low - '0';
       while (more_digits-- > 0)
         {
           int d = phase3_getc ();
           int dlow = RED (d);

           if (!(dlow >= '0' && dlow <= '7'))
             {
               phase3_ungetc (d);
               break;
             }
           value = (value << 3) + (dlow - '0');
         }
       return UNICODE (value);
     }

   switch (low)
     {
     case 'b':
       value = 0x08;
       break;
     case 't':
       value = 0x09;
       break;
     case 'n':
       value = 0x0a;
       break;
     case 'f':
       value = 0x0c;
       break;
     case 'r':
       value = 0x0d;
       break;
     case '"':
     case '\'':
     case '\\':
       /* These escapes denote the character itself.  */
       value = low;
       break;
     default:
       /* Invalid escape sequence.  */
       phase3_ungetc (c);
       value = '\\';
       break;
     }

   return UNICODE (value);
 }

 /* Read a string literal or character literal.  */
 static void
 accumulate_escaped (struct mixed_string_buffer *literal, int delimiter)
 {
   int c;

   for (;;)
     {
       /* Use phase 3, because phase 4 elides comments.  */
       c = phase3_getc ();
       if (c == P2_EOF || RED (c) == delimiter)
         break;
       if (RED (c) == '\n')
         {
           phase3_ungetc (c);
           error_with_progname = false;
           if (delimiter == '\'')
             error (0, 0, _("%s:%d: warning: unterminated character constant"),
                    logical_file_name, line_number);
           else
             error (0, 0, _("%s:%d: warning: unterminated string constant"),
                    logical_file_name, line_number);
           error_with_progname = true;
           break;
         }
       if (RED (c) == '\\')
         c = do_getc_escaped ();
       mixed_string_buffer_append (literal, c);
     }
 }


 /* Strip the common indentation of the non-blank lines of the given string and
    remove all trailing whitespace of all lines.
    Like the Java method String.stripIndent does.
    <https://docs.oracle.com/en/java/javase/13/docs/api/java.base/java/lang/String.html#stripIndent()>
    MS is modified in place: the contents of its segments are slid towards
    the start of each segment and the segment lengths are reduced.  The
    number of segments does not change; a segment may end up with length 0.
    The function works in two passes: the first one determines the minimum
    indentation, the second one removes it together with the trailing
    whitespace.  */
 static void
 strip_indent (mixed_string_ty *ms)
 {
   size_t nsegments = ms->nsegments;
   /* Pass 1: Determine the smallest indentation among the non-blank lines
      and the last line.  */
   size_t minimum_indentation = SIZE_MAX;
   {
     /* Number of whitespace characters seen so far at the start of the
        current line.  */
     size_t curr_line_indentation = 0;
     /* True as long as the current line consists of whitespace only.  */
     bool curr_line_blank = true;
     size_t i;

     for (i = 0; i < nsegments; i++)
       {
         struct mixed_string_segment *segment = ms->segments[i];

         if (segment->type == utf8_encoded
             || (segment->type == source_encoded
                 && xgettext_current_source_encoding == po_charset_utf8))
           {
             /* Consider Unicode whitespace characters.  */
             size_t seglength = segment->length;
             size_t j;

             for (j = 0; j < seglength; )
               {
                 ucs4_t uc;
                 int bytes =
                   u8_mbtouc (&uc, (const uint8_t *) &segment->contents[j],
                              seglength - j);
                 j += bytes;
                 if (uc == 0x000a)
                   {
                     /* Newline.  */
                     if (!curr_line_blank)
                       if (minimum_indentation > curr_line_indentation)
                         minimum_indentation = curr_line_indentation;
                     curr_line_indentation = 0;
                     curr_line_blank = true;
                   }
                 else if (uc_is_java_whitespace (uc))
                   {
                     /* Whitespace character.  */
                     if (curr_line_blank)
                       /* Every whitespace character counts as 1, even the TAB
                          character.  */
                       curr_line_indentation++;
                   }
                 else
                   {
                     /* Other character.  */
                     curr_line_blank = false;
                   }
               }
           }
         else
           {
             /* When the encoding is not UTF-8, consider only ASCII whitespace
                characters.  */
             size_t seglength = segment->length;
             size_t j;

             for (j = 0; j < seglength; j++)
               {
                 char c = segment->contents[j];
                 if (c == '\n')
                   {
                     /* Newline.  */
                     if (!curr_line_blank)
                       if (minimum_indentation > curr_line_indentation)
                         minimum_indentation = curr_line_indentation;
                     curr_line_indentation = 0;
                     curr_line_blank = true;
                   }
                 else if (c == ' '
                          || (c >= 0x09 && c <= 0x0d)
                          || (c >= 0x1c && c <= 0x1f))
                   {
                     /* Whitespace character.  */
                     if (curr_line_blank)
                       /* Every whitespace character counts as 1, even the TAB
                          character.  */
                       curr_line_indentation++;
                   }
                 else
                   {
                     /* Other character.  */
                     curr_line_blank = false;
                   }
               }
           }
       }
     /* The indentation of the last line matters even if it is blank.  */
     if (minimum_indentation > curr_line_indentation)
       minimum_indentation = curr_line_indentation;
   }

   /* The same loop as above, but this time remove the leading
      minimum_indentation whitespace characters and all trailing whitespace
      characters from every line.  */
   {
     /* Position (segment index, offset in the already compacted contents of
        that segment) at which the current line starts.  */
     size_t start_of_curr_line_i = 0;
     size_t start_of_curr_line_j = 0;
     /* Position just after the last non-whitespace character of the current
        line, or the start of the line if there was none so far.  */
     size_t start_of_trailing_whitespace_i = 0;
     size_t start_of_trailing_whitespace_j = 0;
     /* Number of leading whitespace characters that still have to be seen on
        the current line before its indentation can be removed.  */
     size_t whitespace_to_remove = minimum_indentation;
     size_t i;

     for (i = 0; i < nsegments; i++)
       {
         struct mixed_string_segment *segment = ms->segments[i];
         /* Perform a sliding copy from segment->contents[from_j] to
            segment->contents[to_j].  0 <= to_j <= from_j.  */
         size_t to_j;

         if (segment->type == utf8_encoded
             || (segment->type == source_encoded
                 && xgettext_current_source_encoding == po_charset_utf8))
           {
             /* Consider Unicode whitespace characters.  */
             size_t seglength = segment->length;
             size_t from_j;

             for (to_j = from_j = 0; from_j < seglength; )
               {
                 ucs4_t uc;
                 int bytes =
                   u8_mbtouc (&uc, (const uint8_t *) &segment->contents[from_j],
                              seglength - from_j);
                 if (uc == 0x000a)
                   {
                     /* Newline.  */
                     if (whitespace_to_remove > 0)
                       {
                         /* It was a blank line with fewer than minimum_indentation
                            whitespace characters.  Remove all this whitespace.  */
                         if (start_of_curr_line_i < i)
                           {
                             /* The line started in an earlier segment:
                                truncate that segment and empty the segments
                                in between.  */
                             size_t k;
                             ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
                             for (k = start_of_curr_line_i + 1; k < i; k++)
                               ms->segments[k]->length = 0;
                             to_j = 0;
                           }
                         else
                           to_j = start_of_curr_line_j;
                       }
                     else
                       {
                         /* Remove the trailing whitespace characters from the
                            current line.  */
                         if (start_of_trailing_whitespace_i < i)
                           {
                             size_t k;
                             ms->segments[start_of_trailing_whitespace_i]->length = start_of_trailing_whitespace_j;
                             for (k = start_of_trailing_whitespace_i + 1; k < i; k++)
                               ms->segments[k]->length = 0;
                             to_j = 0;
                           }
                         else
                           to_j = start_of_trailing_whitespace_j;
                       }
                   }
                 /* Copy the current character down to its final place.  */
                 if (to_j < from_j)
                   memmove (&segment->contents[to_j], &segment->contents[from_j], bytes);
                 from_j += bytes;
                 to_j += bytes;
                 if (uc == 0x000a)
                   {
                     /* Newline.  */
                     start_of_curr_line_i = i;
                     start_of_curr_line_j = to_j;
                     start_of_trailing_whitespace_i = i;
                     start_of_trailing_whitespace_j = to_j;
                     whitespace_to_remove = minimum_indentation;
                   }
                 else if (uc_is_java_whitespace (uc))
                   {
                     /* Whitespace character.  */
                     if (whitespace_to_remove > 0
                         && --whitespace_to_remove == 0)
                       {
                         /* Remove the leading minimum_indentation whitespace
                            characters from the current line.  */
                         if (start_of_curr_line_i < i)
                           {
                             size_t k;
                             ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
                             for (k = start_of_curr_line_i + 1; k < i; k++)
                               ms->segments[k]->length = 0;
                             to_j = 0;
                           }
                         else
                           to_j = start_of_curr_line_j;
                       }
                   }
                 else
                   {
                     /* Other character.  */
                     /* A non-blank line has at least minimum_indentation
                        leading whitespace characters, by construction of
                        minimum_indentation.  */
                     if (whitespace_to_remove > 0)
                       abort ();
                     start_of_trailing_whitespace_i = i;
                     start_of_trailing_whitespace_j = to_j;
                   }
               }
           }
         else
           {
             /* When the encoding is not UTF-8, consider only ASCII whitespace
                characters.  */
             size_t seglength = segment->length;
             size_t from_j;

             for (to_j = from_j = 0; from_j < seglength; )
               {
                 char c = segment->contents[from_j++];
                 if (c == '\n')
                   {
                     /* Newline.  */
                     if (whitespace_to_remove > 0)
                       {
                         /* It was a blank line with fewer than minimum_indentation
                            whitespace characters.  Remove all this whitespace.  */
                         if (start_of_curr_line_i < i)
                           {
                             size_t k;
                             ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
                             for (k = start_of_curr_line_i + 1; k < i; k++)
                               ms->segments[k]->length = 0;
                             to_j = 0;
                           }
                         else
                           to_j = start_of_curr_line_j;
                       }
                     else
                       {
                         /* Remove the trailing whitespace characters from the
                            current line.  */
                         if (start_of_trailing_whitespace_i < i)
                           {
                             size_t k;
                             ms->segments[start_of_trailing_whitespace_i]->length = start_of_trailing_whitespace_j;
                             for (k = start_of_trailing_whitespace_i + 1; k < i; k++)
                               ms->segments[k]->length = 0;
                             to_j = 0;
                           }
                         else
                           to_j = start_of_trailing_whitespace_j;
                       }
                   }
                 /* Copy the current character down to its final place.  */
                 segment->contents[to_j++] = c;
                 if (c == '\n')
                   {
                     /* Newline.  */
                     start_of_curr_line_i = i;
                     start_of_curr_line_j = to_j;
                     start_of_trailing_whitespace_i = i;
                     start_of_trailing_whitespace_j = to_j;
                     whitespace_to_remove = minimum_indentation;
                   }
                 else if (c == ' '
                          || (c >= 0x09 && c <= 0x0d)
                          || (c >= 0x1c && c <= 0x1f))
                   {
                     /* Whitespace character.  */
                     if (whitespace_to_remove > 0
                         && --whitespace_to_remove == 0)
                       {
                         /* Remove the leading minimum_indentation whitespace
                            characters from the current line.  */
                         if (start_of_curr_line_i < i)
                           {
                             size_t k;
                             ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
                             for (k = start_of_curr_line_i + 1; k < i; k++)
                               ms->segments[k]->length = 0;
                             to_j = 0;
                           }
                         else
                           to_j = start_of_curr_line_j;
                       }
                   }
                 else
                   {
                     /* Other character.  */
                     if (whitespace_to_remove > 0)
                       abort ();
                     start_of_trailing_whitespace_i = i;
                     start_of_trailing_whitespace_j = to_j;
                   }
               }
           }
         if (i + 1 == nsegments)
           {
             /* Handle the last line.  */
             if (whitespace_to_remove > 0)
               {
                 /* It was a blank line with fewer than minimum_indentation
                    whitespace characters.  Remove all this whitespace.  */
                 if (start_of_curr_line_i < i)
                   {
                     size_t k;
                     ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
                     for (k = start_of_curr_line_i + 1; k < i; k++)
                       ms->segments[k]->length = 0;
                     to_j = 0;
                   }
                 else
                   to_j = start_of_curr_line_j;
               }
             else
               {
                 /* Remove the trailing whitespace characters from the
                    current line.  */
                 if (start_of_trailing_whitespace_i < i)
                   {
                     size_t k;
                     ms->segments[start_of_trailing_whitespace_i]->length = start_of_trailing_whitespace_j;
                     for (k = start_of_trailing_whitespace_i + 1; k < i; k++)
                       ms->segments[k]->length = 0;
                     to_j = 0;
                   }
                 else
                   to_j = start_of_trailing_whitespace_j;
               }
           }
         /* What has been copied down is the new content of this segment.  */
         segment->length = to_j;
       }
   }
 }


 /* Combine characters into tokens.  Discard whitespace.  */

 /* Stack of tokens pushed back by phase5_unget; phase5_get pops from it
    before reading any new input.  */
 static token_ty phase5_pushback[3];
 /* Number of tokens currently held in phase5_pushback.  Reset to 0 by
    extract_java.  */
 static int phase5_pushback_length;

 /* Read the next token and store it in *TP.
    If tokens have been pushed back with phase5_unget, the most recently
    pushed one is returned unchanged.  Otherwise characters are read
    (through phase4_getc, and through phase3_getc inside string and text
    block literals) and combined into a single token.  Whitespace is skipped.
    At end of input, tp->type is set to token_type_eof.
    Fields filled in, depending on the token type:
      - tp->line_number: always, for freshly read tokens;
      - tp->string: a freshly allocated copy of the identifier, for
        token_type_symbol; NULL otherwise;
      - tp->mixed_string and tp->comment: for token_type_string_literal.  */
 static void
 phase5_get (token_ty *tp)
 {
   int c;

   /* Pushed-back tokens are delivered first, in last-in-first-out order.  */
   if (phase5_pushback_length)
     {
       *tp = phase5_pushback[--phase5_pushback_length];
       return;
     }
   tp->string = NULL;

   for (;;)
     {
       /* Re-sampled on every iteration, so that after skipping whitespace
          the token carries the line number current when its first
          character is read.  */
       tp->line_number = line_number;
       c = phase4_getc ();

       if (c == P2_EOF)
         {
           tp->type = token_type_eof;
           return;
         }

       switch (RED (c))
         {
         case '\n':
           /* At a line end, drop the accumulated savable comments when
              last_non_comment_line is greater than last_comment_line.  */
           if (last_non_comment_line > last_comment_line)
             savable_comment_reset ();
           FALLTHROUGH;
         case ' ':
         case '\t':
         case '\f':
           /* Ignore whitespace and comments.  */
           continue;
         }

       /* From here on C starts a real token; remember the line it is on.  */
       last_non_comment_line = tp->line_number;

       switch (RED (c))
         {
         case '(':
           tp->type = token_type_lparen;
           return;

         case ')':
           tp->type = token_type_rparen;
           return;

         case '{':
           tp->type = token_type_lbrace;
           return;

         case '}':
           tp->type = token_type_rbrace;
           return;

         case ',':
           tp->type = token_type_comma;
           return;

         case '.':
           /* A '.' followed by a digit starts a number (e.g. ".5");
              otherwise it is the dot operator.  */
           c = phase4_getc ();
           if (!(RED (c) >= '0' && RED (c) <= '9'))
             {
               phase4_ungetc (c);
               tp->type = token_type_dot;
               return;
             }
           FALLTHROUGH;

         case '0': case '1': case '2': case '3': case '4':
         case '5': case '6': case '7': case '8': case '9':
           {
             /* Don't need to verify the complicated syntax of integers and
                floating-point numbers.  We assume a valid Java input.
                The simplified syntax that we recognize as number is: any
                sequence of alphanumeric characters, additionally '+' and '-'
                immediately after 'e' or 'E' except in hexadecimal numbers.  */
             bool hexadecimal = false;

             for (;;)
               {
                 c = phase4_getc ();
                 if (RED (c) >= '0' && RED (c) <= '9')
                   continue;
                 if ((RED (c) >= 'A' && RED (c) <= 'Z')
                     || (RED (c) >= 'a' && RED (c) <= 'z'))
                   {
                     /* Any 'x' or 'X' switches to hexadecimal mode, in which
                        'e' and 'E' are treated as ordinary digits.  */
                     if (RED (c) == 'X' || RED (c) == 'x')
                       hexadecimal = true;
                     if ((RED (c) == 'E' || RED (c) == 'e') && !hexadecimal)
                       {
                         /* Swallow an optional exponent sign.  */
                         c = phase4_getc ();
                         if (!(RED (c) == '+' || RED (c) == '-'))
                           phase4_ungetc (c);
                       }
                     continue;
                   }
                 if (RED (c) == '.')
                   continue;
                 break;
               }
             /* C is the first character that does not belong to the number.  */
             phase4_ungetc (c);
             tp->type = token_type_number;
             return;
           }

         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
         case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
         case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
         case 'V': case 'W': case 'X': case 'Y': case 'Z':
         case '_':
         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
         case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
         case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
         case 'v': case 'w': case 'x': case 'y': case 'z':
           /* Although Java allows identifiers containing many Unicode
              characters, we recognize only identifiers consisting of ASCII
              characters.  This avoids conversion hassles w.r.t. the --keyword
              arguments, and shouldn't be a big problem in practice.  */
           {
             /* Scratch buffer, kept across calls and grown on demand.  */
             static char *buffer;
             static int bufmax;
             int bufpos = 0;
             for (;;)
               {
                 if (bufpos >= bufmax)
                   {
                     bufmax = 2 * bufmax + 10;
                     buffer = xrealloc (buffer, bufmax);
                   }
                 buffer[bufpos++] = RED (c);
                 c = phase4_getc ();
                 if (!((RED (c) >= 'A' && RED (c) <= 'Z')
                       || (RED (c) >= 'a' && RED (c) <= 'z')
                       || (RED (c) >= '0' && RED (c) <= '9')
                       || RED (c) == '_'))
                   break;
               }
             phase4_ungetc (c);
             /* Make room for the terminating NUL.  */
             if (bufpos >= bufmax)
               {
                 bufmax = 2 * bufmax + 10;
                 buffer = xrealloc (buffer, bufmax);
               }
             buffer[bufpos] = '\0';
             /* The token gets its own copy, since BUFFER is reused.  */
             tp->string = xstrdup (buffer);
             tp->type = token_type_symbol;
             return;
           }

         case '"':
           {
             /* Look ahead for two more double-quotes, which would make this
                the opening delimiter of a text block.  Inside literals the
                characters are read with phase3_getc instead of phase4_getc
                (NOTE(review): presumably so that comment markers inside
                literals are not interpreted -- confirm against phase 4).  */
             int c2 = phase3_getc ();
             if (c2 == '"')
               {
                 int c3 = phase3_getc ();
                 if (c3 == '"')
                   {
                     /* Text block.  Specification:
                        <https://docs.oracle.com/javase/specs/jls/se13/preview/text-blocks.html>  */
                     struct mixed_string_buffer block;
                     unsigned int consecutive_unescaped_doublequotes;
                     mixed_string_ty *block_content;

                     /* Parse the part up to and including the first newline.
                        Only spaces, tabs and form feeds are accepted between
                        the opening delimiter and that newline.  */
                     for (;;)
                       {
                         int ic = phase3_getc ();
                         if (ic == P2_EOF)
                           {
                             error_with_progname = false;
                             error (0, 0, _("%s:%d: warning: unterminated text block"),
                                    logical_file_name, line_number);
                             error_with_progname = true;
                             tp->type = token_type_other;
                             return;
                           }
                         if (RED (ic) == ' ' || RED (ic) == '\t' || RED (ic) == '\f')
                           ;
                         else if (RED (ic) == '\n')
                           break;
                         else
                           {
                             error_with_progname = false;
                             error (0, 0, _("%s:%d: warning: invalid syntax in text block"),
                                    logical_file_name, line_number);
                             error_with_progname = true;
                             tp->type = token_type_other;
                             return;
                           }
                       }

                     /* Parse the part after the first newline.  */
                     mixed_string_buffer_init (&block, lc_string,
                                               logical_file_name, line_number);
                     consecutive_unescaped_doublequotes = 0;
                     for (;;)
                       {
                         int ic = phase3_getc ();
                         if (RED (ic) == '"')
                           {
                             /* Hold back double-quotes until we know whether
                                they form the closing delimiter.  */
                             consecutive_unescaped_doublequotes++;
                             if (consecutive_unescaped_doublequotes == 3)
                               break;
                           }
                         else
                           {
                             /* Fewer than three double-quotes in a row are
                                part of the content: emit the held-back ones.  */
                             while (consecutive_unescaped_doublequotes > 0)
                               {
                                 mixed_string_buffer_append (&block, '"');
                                 consecutive_unescaped_doublequotes--;
                               }
                             if (ic == P2_EOF)
                               {
                                 /* Warn, but still return what has been
                                    collected so far as a string literal.  */
                                 error_with_progname = false;
                                 error (0, 0, _("%s:%d: warning: unterminated text block"),
                                        logical_file_name, block.line_number);
                                 error_with_progname = true;
                                 break;
                               }
                             /* A double-quote that results from an escape
                                sequence is appended directly and therefore
                                never counts towards the closing delimiter.  */
                             if (RED (ic) == '\\')
                               ic = do_getc_escaped ();
                             mixed_string_buffer_append (&block, ic);
                           }
                       }
                     block_content = mixed_string_buffer_result (&block);

                     /* Remove the common indentation from the content.  */
                     strip_indent (block_content);

                     tp->mixed_string = block_content;
                     tp->comment = add_reference (savable_comment);
                     tp->type = token_type_string_literal;
                     return;
                   }
                 /* Not a text block: restore the look-ahead characters in
                    reverse order of reading.  */
                 phase3_ungetc (c3);
               }
             phase3_ungetc (c2);
           }
           /* String literal.  */
           {
             struct mixed_string_buffer literal;

             mixed_string_buffer_init (&literal, lc_string,
                                       logical_file_name, line_number);
             accumulate_escaped (&literal, '"');
             tp->mixed_string = mixed_string_buffer_result (&literal);
             tp->comment = add_reference (savable_comment);
             tp->type = token_type_string_literal;
             return;
           }

         case '\'':
           /* Character literal.  Its contents are parsed, so that the closing
              quote is consumed, and then thrown away.  */
           {
             struct mixed_string_buffer literal;

             mixed_string_buffer_init (&literal, lc_outside,
                                       logical_file_name, line_number);
             accumulate_escaped (&literal, '\'');
             mixed_string_buffer_destroy (&literal);
             tp->type = token_type_other;
             return;
           }

         case '+':
           /* Only a lone '+' yields token_type_plus, the token that
              phase6_get looks for between string literals.  */
           c = phase4_getc ();
           if (RED (c) == '+')
             /* Operator ++ */
             tp->type = token_type_other;
           else if (RED (c) == '=')
             /* Operator += */
             tp->type = token_type_other;
           else
             {
               /* Operator + */
               phase4_ungetc (c);
               tp->type = token_type_plus;
             }
           return;

         default:
           /* Misc. operator.  */
           tp->type = token_type_other;
           return;
         }
     }
 }

 /* Push the token *TP back, so that a later phase5_get delivers it again.
    End-of-file tokens are silently dropped.  At most 3 tokens can be pending
    at any time; exceeding that limit aborts the program.  */
 static void
 phase5_unget (token_ty *tp)
 {
   /* Nothing to remember for EOF.  */
   if (tp->type == token_type_eof)
     return;

   /* Overflowing the pushback stack is a programming error.  */
   if (phase5_pushback_length == SIZEOF (phase5_pushback))
     abort ();

   phase5_pushback[phase5_pushback_length] = *tp;
   phase5_pushback_length++;
 }


 /* Compile-time optimization of string literal concatenation.
    Combine "string1" + ... + "stringN" to the concatenated string if
      - the token before this expression is not ')' (because then the first
        string could be part of a cast expression),
      - the token after this expression is not '.' (because then the last
        string could be part of a method call expression).  */

 /* Stack of tokens pushed back by phase6_unget; phase6_get pops from it
    before asking phase 5 for a new token.  */
 static token_ty phase6_pushback[2];
 /* Number of tokens currently held in phase6_pushback.  Reset to 0 by
    extract_java.  */
 static int phase6_pushback_length;

 /* Type of the token most recently obtained from phase 5 and returned by
    phase6_get.  It is not updated when phase6_get delivers a pushed-back
    token.  Initialized to token_type_eof by extract_java.  */
 static token_type_ty phase6_last;

 /* Read the next token into *TP, folding "string1" + ... + "stringN" into a
    single string literal token.
    A pushed-back token, if any, is returned as is.  Otherwise a token is
    taken from phase 5.  If it is a string literal and the previous token was
    not ')', then every following '+' "string" pair is merged into it, as long
    as the token after that string is not '.'.  The type of the returned token
    is recorded in phase6_last.  */
 static void
 phase6_get (token_ty *tp)
 {
   bool keep_merging;

   if (phase6_pushback_length != 0)
     {
       phase6_pushback_length--;
       *tp = phase6_pushback[phase6_pushback_length];
       return;
     }

   phase5_get (tp);

   if (tp->type == token_type_string_literal
       && phase6_last != token_type_rparen)
     {
       mixed_string_ty *joined = tp->mixed_string;

       keep_merging = true;
       while (keep_merging)
         {
           token_ty operator_token;

           keep_merging = false;

           phase5_get (&operator_token);
           if (operator_token.type == token_type_plus)
             {
               token_ty operand;

               phase5_get (&operand);
               if (operand.type == token_type_string_literal)
                 {
                   token_ty lookahead;

                   /* The token after the operand is only inspected; it goes
                      back to phase 5 whatever the outcome.  */
                   phase5_get (&lookahead);
                   phase5_unget (&lookahead);

                   if (lookahead.type != token_type_dot)
                     {
                       /* Absorb the operand, then release it and the '+'.  */
                       joined =
                         mixed_string_concat_free1 (joined,
                                                    operand.mixed_string);
                       free_token (&operand);
                       free_token (&operator_token);
                       keep_merging = true;
                       continue;
                     }
                 }
               /* Not mergeable: hand the operand back.  */
               phase5_unget (&operand);
             }
           /* Hand back the token that followed the string.  */
           phase5_unget (&operator_token);
         }

       tp->mixed_string = joined;
     }

   phase6_last = tp->type;
 }

 /* Push the token *TP back, so that a later phase6_get delivers it again.
    End-of-file tokens are silently dropped.  At most 2 tokens can be pending
    at any time; exceeding that limit aborts the program.  */
 static void
 phase6_unget (token_ty *tp)
 {
   /* Nothing to remember for EOF.  */
   if (tp->type == token_type_eof)
     return;

   /* Overflowing the pushback stack is a programming error.  */
   if (phase6_pushback_length == SIZEOF (phase6_pushback))
     abort ();

   phase6_pushback[phase6_pushback_length] = *tp;
   phase6_pushback_length++;
 }


 /* Fetch the next token into *TP.  This is the lexer entry point used by
    extract_parenthesized; it forwards to phase6_get, the last token phase.  */
 static void
 x_java_lex (token_ty *tp)
 {
   phase6_get (tp);
 }

 /* Push the token *TP back, so that a later x_java_lex call delivers it
    again.  Forwards to phase6_unget, which ignores end-of-file tokens.
    Supports 2 tokens of pushback.  */
 static void
 x_java_unlex (token_ty *tp)
 {
   phase6_unget (tp);
 }


 /* ========================= Extracting strings.  ========================== */


 /* Context lookup table.  Set by extract_java from its FLAG_TABLE argument
    and queried in extract_parenthesized, by (dotted) symbol name, through
    flag_context_list_table_lookup.  */
 static flag_context_list_table_ty *flag_context_list_table;


 /* The file is broken into tokens.  Scan the token stream, looking for
    a keyword, followed by a left paren, followed by a string.  When we
    see this sequence, we have something to remember.  We assume we are
    looking at a valid Java program, and leave the complaints about
    the grammar to the compiler.

      Normal handling: Look for
        keyword ( ... msgid ... )
      Plural handling: Look for
        keyword ( ... msgid ... msgid_plural ... )

    We use recursion because the arguments before msgid or between msgid
    and msgid_plural can contain subexpressions of the same form.  */


 /* Extract messages until the next balanced closing parenthesis or brace,
    depending on TERMINATOR.
    Extracted messages are added to MLP.
    TERMINATOR is token_type_rparen or token_type_rbrace for a nested
    invocation; with any other value (extract_java passes token_type_eof)
    closing parentheses and braces are skipped and only end of input ends
    the scan.
    OUTER_CONTEXT is the context of the enclosing expression, and
    CONTEXT_ITER yields the per-argument contexts that are combined with it.
    ARGPARSER collects the string arguments seen at this nesting level; it is
    handed to arglist_parser_done before every return.
    Return true upon eof, false upon closing parenthesis or brace.  */
 static bool
 extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
                        flag_context_ty outer_context,
                        flag_context_list_iterator_ty context_iter,
                        struct arglist_parser *argparser)
 {
   /* Current argument number.  */
   int arg = 1;
   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
   int state;
   /* Parameters of the keyword just seen.  Defined only in state 1.  */
   const struct callshapes *next_shapes = NULL;
   /* Context iterator that will be used if the next token is a '('.  */
   flag_context_list_iterator_ty next_context_iter =
     passthrough_context_list_iterator;
   /* Current context.  */
   flag_context_ty inner_context =
     inherited_context (outer_context,
                        flag_context_list_iterator_advance (&context_iter));

   /* Start state is 0.  */
   state = 0;

   for (;;)
     {
       token_ty token;

       x_java_lex (&token);
       switch (token.type)
         {
         case token_type_symbol:
           {
             /* Combine symbol1 . ... . symbolN to a single string, so that
                we can recognize static function calls like
                GettextResource.gettext.  The information present for
                symbolI.....symbolN has precedence over the information for
                symbolJ.....symbolN with J > I.  */
             char *sum = token.string;
             size_t sum_len = strlen (sum);
             const char *dottedname;
             flag_context_list_ty *context_list;

             /* Greedily append ". symbol" pairs to SUM.  */
             for (;;)
               {
                 token_ty token2;

                 x_java_lex (&token2);
                 if (token2.type == token_type_dot)
                   {
                     token_ty token3;

                     x_java_lex (&token3);
                     if (token3.type == token_type_symbol)
                       {
                         char *addend = token3.string;
                         size_t addend_len = strlen (addend);

                         /* Room for SUM, the '.', ADDEND and the NUL.  */
                         sum =
                           (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
                         sum[sum_len] = '.';
                         memcpy (sum + sum_len + 1, addend, addend_len + 1);
                         sum_len += 1 + addend_len;

                         free_token (&token3);
                         free_token (&token2);
                         continue;
                       }
                     /* A '.' not followed by a symbol: give both tokens
                        back, the later one first.  */
                     x_java_unlex (&token3);
                   }
                 x_java_unlex (&token2);
                 break;
               }

             /* Look up the keyword: try the full dotted name first, then
                the name with one more leading component removed each time.  */
             for (dottedname = sum;;)
               {
                 void *keyword_value;

                 if (hash_find_entry (&keywords, dottedname, strlen (dottedname),
                                      &keyword_value)
                     == 0)
                   {
                     next_shapes = (const struct callshapes *) keyword_value;
                     state = 1;
                     break;
                   }

                 dottedname = strchr (dottedname, '.');
                 if (dottedname == NULL)
                   {
                     state = 0;
                     break;
                   }
                 dottedname++;
               }

             /* Look up the flag context list with the same longest-name-first
                strategy.  CONTEXT_LIST ends up NULL if no suffix matches.  */
             for (dottedname = sum;;)
               {
                 context_list =
                   flag_context_list_table_lookup (
                     flag_context_list_table,
                     dottedname, strlen (dottedname));
                 if (context_list != NULL)
                   break;

                 dottedname = strchr (dottedname, '.');
                 if (dottedname == NULL)
                   break;
                 dottedname++;
               }
             next_context_iter = flag_context_list_iterator (context_list);

             /* SUM is token.string, possibly reallocated; this releases the
                symbol token's string.  */
             free (sum);
             continue;
           }

         case token_type_lparen:
           /* Recurse into the parenthesized group.  If a keyword was just
              seen (state 1), the group is its argument list and is parsed
              with the keyword's call shapes.  */
           if (extract_parenthesized (mlp, token_type_rparen,
                                      inner_context, next_context_iter,
                                      arglist_parser_alloc (mlp,
                                                            state ? next_shapes : NULL)))
             {
               /* EOF inside the group: propagate it upwards.  */
               arglist_parser_done (argparser, arg);
               return true;
             }
           next_context_iter = null_context_list_iterator;
           state = 0;
           continue;

         case token_type_rparen:
           if (terminator == token_type_rparen)
             {
               arglist_parser_done (argparser, arg);
               return false;
             }
           /* Mismatched closer inside a brace group: warn and go on.  With
              any other terminator the ')' is skipped silently.  */
           if (terminator == token_type_rbrace)
             {
               error_with_progname = false;
               error (0, 0,
                      _("%s:%d: warning: ')' found where '}' was expected"),
                      logical_file_name, token.line_number);
               error_with_progname = true;
             }
           next_context_iter = null_context_list_iterator;
           state = 0;
           continue;

         case token_type_lbrace:
           /* A brace group starts with a null context and without any
              keyword call shapes.  */
           if (extract_parenthesized (mlp, token_type_rbrace,
                                      null_context, null_context_list_iterator,
                                      arglist_parser_alloc (mlp, NULL)))
             {
               /* EOF inside the group: propagate it upwards.  */
               arglist_parser_done (argparser, arg);
               return true;
             }
           next_context_iter = null_context_list_iterator;
           state = 0;
           continue;

         case token_type_rbrace:
           if (terminator == token_type_rbrace)
             {
               arglist_parser_done (argparser, arg);
               return false;
             }
           /* Mismatched closer inside a parenthesized group: warn and go on.
              With any other terminator the '}' is skipped silently.  */
           if (terminator == token_type_rparen)
             {
               error_with_progname = false;
               error (0, 0,
                      _("%s:%d: warning: '}' found where ')' was expected"),
                      logical_file_name, token.line_number);
               error_with_progname = true;
             }
           next_context_iter = null_context_list_iterator;
           state = 0;
           continue;

         case token_type_comma:
           /* Next argument: advance the argument number and compute the
              context that applies to it.  */
           arg++;
           inner_context =
             inherited_context (outer_context,
                                flag_context_list_iterator_advance (
                                  &context_iter));
           next_context_iter = passthrough_context_list_iterator;
           state = 0;
           continue;

         case token_type_string_literal:
           {
             lex_pos_ty pos;

             pos.file_name = logical_file_name;
             pos.line_number = token.line_number;

             if (extract_all)
               {
                 /* Record every string literal as a message, regardless of
                    keywords.  */
                 char *string = mixed_string_contents (token.mixed_string);
                 mixed_string_free (token.mixed_string);
                 remember_a_message (mlp, NULL, string, true, false,
                                     inner_context, &pos,
                                     NULL, token.comment, true);
               }
             else
               /* Let the argument list parser decide later whether this
                  argument is a msgid.  NOTE(review): token.mixed_string is
                  not freed here, so the parser presumably takes ownership
                  of it -- confirm in xg-arglist-parser.  */
               arglist_parser_remember (argparser, arg, token.mixed_string,
                                        inner_context,
                                        pos.file_name, pos.line_number,
                                        token.comment, true);
           }
           /* Release this token's reference to the comment list.  */
           drop_reference (token.comment);
           next_context_iter = null_context_list_iterator;
           state = 0;
           continue;

         case token_type_eof:
           arglist_parser_done (argparser, arg);
           return true;

         case token_type_dot:
         case token_type_number:
         case token_type_plus:
         case token_type_other:
           /* Any other token ends a pending "keyword (" sequence.  */
           next_context_iter = null_context_list_iterator;
           state = 0;
           continue;

         default:
           abort ();
         }
     }
 }


 /* Scan the Java source available on the stream F and add the messages found
    in it to the message list of the first domain of MDLP.
    REAL_FILENAME and LOGICAL_FILENAME are stored in the lexer's file-level
    variables; a private copy of LOGICAL_FILENAME is made.
    FLAG_TABLE becomes the context lookup table used during extraction.  */
 void
 extract_java (FILE *f,
               const char *real_filename, const char *logical_filename,
               flag_context_list_table_ty *flag_table,
               msgdomain_list_ty *mdlp)
 {
   message_list_ty *messages = mdlp->item[0]->messages;
   bool reached_eof;

   /* Bind the lexer to this input.  */
   fp = f;
   real_file_name = real_filename;
   logical_file_name = xstrdup (logical_filename);
   line_number = 1;

   /* Every phase starts with an empty pushback buffer.  */
   phase1_pushback_length = 0;
   phase2_pushback_length = 0;
   phase3_pushback_length = 0;
   phase5_pushback_length = 0;
   phase6_pushback_length = 0;

   /* No comment and no token has been seen yet.  */
   last_comment_line = -1;
   last_non_comment_line = -1;
   phase6_last = token_type_eof;

   flag_context_list_table = flag_table;

   init_keywords ();

   /* Eat tokens until eof is seen.  Whenever extract_parenthesized comes
      back without having reached eof, start it again with a fresh argument
      list parser.  */
   do
     reached_eof =
       extract_parenthesized (messages, token_type_eof,
                              null_context, null_context_list_iterator,
                              arglist_parser_alloc (messages, NULL));
   while (!reached_eof);

   /* Detach the lexer from the input.  */
   fp = NULL;
   real_file_name = NULL;
   logical_file_name = NULL;
   line_number = 0;
 }