blob: e89ab95f477603d119ab6cee13d4c1c1011815eb [file] [log] [blame]
/* xgettext Java backend.
Copyright (C) 2003, 2005-2009, 2018-2020 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2003.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
/* Specification. */
#include "x-java.h"
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "attribute.h"
#include "message.h"
#include "rc-str-list.h"
#include "xgettext.h"
#include "xg-pos.h"
#include "xg-encoding.h"
#include "xg-mixed-string.h"
#include "xg-arglist-context.h"
#include "xg-arglist-callshape.h"
#include "xg-arglist-parser.h"
#include "xg-message.h"
#include "error.h"
#include "error-progname.h"
#include "xalloc.h"
#include "mem-hash-map.h"
#include "po-charset.h"
#include "unistr.h"
#include "unictype.h"
#include "gettext.h"
#define _(s) gettext(s)
#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
/* The Java syntax is defined in the
Java Language Specification
(available from https://docs.oracle.com/javase/specs/),
chapter 3 "Lexical Structure". */
/* ====================== Keyword set customization. ====================== */
/* If true extract all strings. */
static bool extract_all = false;
static hash_table keywords;
static bool default_keywords = true;
/* Turn on the "extract all strings" mode by setting the file-level
   extract_all flag.  */
void
x_java_extract_all ()
{
  extract_all = true;
}
/* Register the keyword specification NAME in the keywords table.
   A NULL NAME instead disables the installation of the default keywords.  */
void
x_java_keyword (const char *name)
{
  const char *spec_end;
  struct callshape shape;
  const char *first_colon;

  if (name == NULL)
    {
      default_keywords = false;
      return;
    }

  /* Create the table lazily, on the first real keyword.  */
  if (keywords.table == NULL)
    hash_init (&keywords, 100);

  split_keywordspec (name, &spec_end, &shape);

  /* The characters between name and spec_end should form a valid Java
     identifier sequence with dots.
     A colon inside that range means an invalid parse in
     split_keywordspec(); such a specification is ignored.  */
  first_colon = strchr (name, ':');
  if (first_colon != NULL && first_colon < spec_end)
    return;

  insert_keyword_callshape (&keywords, name, spec_end - name, &shape);
}
/* Finish initializing the keywords hash table.
   Called after argument processing, before each file is processed.
   The built-in keywords are registered at most once: the default_keywords
   flag is cleared afterwards.  */
static void
init_keywords ()
{
  /* When adding new keywords here, also update the documentation in
     xgettext.texi! */
  static const char *const builtin_specs[] =
    {
      "GettextResource.gettext:2",        /* static method */
      "GettextResource.ngettext:2,3",     /* static method */
      "GettextResource.pgettext:2c,3",    /* static method */
      "GettextResource.npgettext:2c,3,4", /* static method */
      "gettext",
      "ngettext:1,2",
      "pgettext:1c,2",
      "npgettext:1c,2,3",
      "getString"                         /* ResourceBundle.getString */
    };
  size_t k;

  if (!default_keywords)
    return;

  for (k = 0; k < sizeof (builtin_specs) / sizeof (builtin_specs[0]); k++)
    x_java_keyword (builtin_specs[k]);
  default_keywords = false;
}
/* Record the format-string flags of the known Java functions by passing
   each flag specification, in order, to xgettext_record_flag.  */
void
init_flag_table_java ()
{
  static const char *const flag_specs[] =
    {
      "GettextResource.gettext:2:pass-java-format",
      "GettextResource.gettext:2:pass-java-printf-format",
      "GettextResource.ngettext:2:pass-java-format",
      "GettextResource.ngettext:2:pass-java-printf-format",
      "GettextResource.ngettext:3:pass-java-format",
      "GettextResource.ngettext:3:pass-java-printf-format",
      "GettextResource.pgettext:3:pass-java-format",
      "GettextResource.pgettext:3:pass-java-printf-format",
      "GettextResource.npgettext:3:pass-java-format",
      "GettextResource.npgettext:3:pass-java-printf-format",
      "GettextResource.npgettext:4:pass-java-format",
      "GettextResource.npgettext:4:pass-java-printf-format",
      "gettext:1:pass-java-format",
      "gettext:1:pass-java-printf-format",
      "ngettext:1:pass-java-format",
      "ngettext:1:pass-java-printf-format",
      "ngettext:2:pass-java-format",
      "ngettext:2:pass-java-printf-format",
      "pgettext:2:pass-java-format",
      "pgettext:2:pass-java-printf-format",
      "npgettext:2:pass-java-format",
      "npgettext:2:pass-java-printf-format",
      "npgettext:3:pass-java-format",
      "npgettext:3:pass-java-printf-format",
      "getString:1:pass-java-format",
      "getString:1:pass-java-printf-format",
      "MessageFormat:1:java-format",
      "MessageFormat.format:1:java-format",
      "String.format:1:java-printf-format",
      "printf:1:java-printf-format"       /* PrintStream.printf */
    };
  size_t k;

  for (k = 0; k < sizeof (flag_specs) / sizeof (flag_specs[0]); k++)
    xgettext_record_flag (flag_specs[k]);
}
/* ======================== Reading of characters. ======================== */
/* The input file stream. */
static FILE *fp;
/* Fetch the next single-byte character from the input file.
Pushback can consist of an unlimited number of 'u' followed by up to 4
other characters. */
/* Special coding of multiple 'u's in the pushback buffer. */
#define MULTIPLE_U(count) (0x1000 + (count))
static int phase1_pushback[5];
static unsigned int phase1_pushback_length;
/* Return the next byte of the input: first from the phase 1 pushback
   buffer, otherwise from the file FP.  Returns EOF at end of file; a read
   error is reported through error() with EXIT_FAILURE.  */
static int
phase1_getc ()
{
  int ch;

  if (phase1_pushback_length > 0)
    {
      int top = phase1_pushback[--phase1_pushback_length];

      if (top < MULTIPLE_U (0))
        /* An ordinary pushed-back character.  */
        return top;

      /* A run of 'u' characters, stored as a count: hand out one 'u' and
         put the remaining count, if any, back on the stack.  */
      if (top > MULTIPLE_U (1))
        phase1_pushback[phase1_pushback_length++] = top - 1;
      return 'u';
    }

  ch = getc (fp);
  if (ch == EOF && ferror (fp))
    error (EXIT_FAILURE, errno,
           _("error while reading \"%s\""), real_file_name);
  return ch;
}
/* Push the character C back onto the phase 1 input.  EOF is ignored.
   Supports any number of 'u' and up to 4 arbitrary characters of pushback:
   consecutive 'u's share a single slot that holds their count.
   Aborts if the pushback buffer is full.  */
static void
phase1_ungetc (int c)
{
  int entry;

  if (c == EOF)
    return;

  if (c == 'u')
    {
      /* Extend an existing run of 'u's on top of the stack, if any.  */
      if (phase1_pushback_length > 0
          && phase1_pushback[phase1_pushback_length - 1] >= MULTIPLE_U (0))
        {
          phase1_pushback[phase1_pushback_length - 1]++;
          return;
        }
      entry = MULTIPLE_U (1);
    }
  else
    entry = c;

  if (phase1_pushback_length == SIZEOF (phase1_pushback))
    abort ();
  phase1_pushback[phase1_pushback_length++] = entry;
}
/* Fetch the next single-byte character or Unicode character from the file.
(Here, as in the Java Language Specification, when we say "Unicode
character", we actually mean "UTF-16 encoding unit".) */
/* Return value of phase 2, 3, 4 when EOF is reached. */
#define P2_EOF 0xffff
/* Convert a UTF-16 code unit to a return value that can be distinguished
from a single-byte return value. */
#define UNICODE(code) (0x10000 + (code))
/* Test a return value of phase 2, 3, 4 whether it designates a UTF-16 code
unit. */
#define IS_UNICODE(p2_result) ((p2_result) >= 0x10000)
/* Extract the UTF-16 code of a return value that satisfies IS_UNICODE. */
#define UTF16_VALUE(p2_result) ((p2_result) - 0x10000)
/* Reduces a return value of phase 2, 3, 4 by unmasking the UNICODE bit,
so that it can be more easily compared against an ASCII character.
(RED (c) == 'x') is equivalent to (c == 'x' || c == UNICODE ('x')). */
#define RED(p2_result) ((p2_result) & 0xffff)
static int phase2_pushback[1];
static int phase2_pushback_length;
/* Return the next phase 2 character: a pushed-back value, P2_EOF at end of
   input, UNICODE (n) for a backslash-u escape with exactly four hex digits,
   or the plain byte otherwise.  A backslash that does not start a valid
   escape is returned as '\\' and everything read after it is pushed back
   to phase 1.  */
static int
phase2_getc ()
{
  int c;
  unsigned int u_count;
  unsigned char digits[4];
  unsigned int code;
  int ndigits;

  if (phase2_pushback_length != 0)
    return phase2_pushback[--phase2_pushback_length];

  c = phase1_getc ();
  if (c == EOF)
    return P2_EOF;
  if (c != '\\')
    return c;

  c = phase1_getc ();
  if (c != 'u')
    {
      phase1_ungetc (c);
      return '\\';
    }

  /* Any number of 'u's may follow the backslash; count them so that they
     can be restored if the escape turns out to be invalid.  */
  u_count = 1;
  while ((c = phase1_getc ()) == 'u')
    u_count++;
  phase1_ungetc (c);

  code = 0;
  for (ndigits = 0; ndigits < 4; ndigits++)
    {
      unsigned int value;

      c = phase1_getc ();
      if (c >= '0' && c <= '9')
        value = c - '0';
      else if (c >= 'A' && c <= 'F')
        value = c - 'A' + 10;
      else if (c >= 'a' && c <= 'f')
        value = c - 'a' + 10;
      else
        {
          /* Not a hex digit: undo everything after the backslash, in
             reverse order of reading.  */
          phase1_ungetc (c);
          while (ndigits > 0)
            phase1_ungetc (digits[--ndigits]);
          while (u_count > 0)
            {
              phase1_ungetc ('u');
              u_count--;
            }
          return '\\';
        }
      code = (code << 4) + value;
      digits[ndigits] = c;
    }
  return UNICODE (code);
}
/* Push the phase 2 value C back.  P2_EOF is ignored.
   Supports only one pushback character; aborts if the slot is taken.  */
static void
phase2_ungetc (int c)
{
  if (c == P2_EOF)
    return;
  if (phase2_pushback_length == SIZEOF (phase2_pushback))
    abort ();
  phase2_pushback[phase2_pushback_length] = c;
  phase2_pushback_length++;
}
/* Fetch the next single-byte character or Unicode character from the file.
With line number handling.
Convert line terminators to '\n' or UNICODE ('\n'). */
static int phase3_pushback[2];
static int phase3_pushback_length;
/* Return the next phase 3 character.  CR, CR/LF and LF are all mapped to
   '\n' (incrementing line_number) or to UNICODE ('\n') (which does not
   count as a line).  Pushed-back characters are returned first; a
   pushed-back '\n' increments line_number again.  */
static int
phase3_getc ()
{
  int c;
  int reduced;
  bool counts_as_line;

  if (phase3_pushback_length != 0)
    {
      c = phase3_pushback[--phase3_pushback_length];
      if (c == '\n')
        ++line_number;
      return c;
    }

  c = phase2_getc ();
  reduced = RED (c);
  if (reduced != '\r' && reduced != '\n')
    return c;

  /* Handle line terminators.  */
  if (reduced == '\r')
    {
      /* Swallow a directly following LF, so that CR/LF is one terminator.  */
      int next = phase2_getc ();

      if (RED (next) != '\n')
        phase2_ungetc (next);
      counts_as_line = (c == '\r' || next == '\n');
    }
  else
    counts_as_line = (c == '\n');

  if (!counts_as_line)
    return UNICODE ('\n');
  ++line_number;
  return '\n';
}
/* Push the phase 3 value C back.  P2_EOF is ignored.  Pushing back '\n'
   decrements line_number, mirroring the increment done when it is read.
   Supports 2 characters of pushback; aborts when the buffer is full.  */
static void
phase3_ungetc (int c)
{
  if (c == P2_EOF)
    return;
  if (c == '\n')
    --line_number;
  if (phase3_pushback_length == SIZEOF (phase3_pushback))
    abort ();
  phase3_pushback[phase3_pushback_length] = c;
  phase3_pushback_length++;
}
/* ========================= Accumulating strings. ======================== */
/* See xg-mixed-string.h for the main API. */
/* Append the phase 2/3/4 value C to the buffer BP: as a Unicode character
   when C carries the UNICODE marker, otherwise as a single byte.  */
static void
mixed_string_buffer_append (struct mixed_string_buffer *bp, int c)
{
  if (!IS_UNICODE (c))
    {
      /* A single byte in the source encoding.  */
      mixed_string_buffer_append_char (bp, (unsigned char) c);
      return;
    }
  /* A Unicode character.  */
  mixed_string_buffer_append_unicode (bp, UTF16_VALUE (c));
}
/* ======================== Accumulating comments. ======================== */
/* Accumulating a single comment line. */
static struct mixed_string_buffer comment_buffer;
/* Begin accumulating a new comment line: (re)initialize comment_buffer
   for the current logical file and line number.  */
static inline void
comment_start ()
{
  mixed_string_buffer_init (&comment_buffer,
                            lc_comment,
                            logical_file_name,
                            line_number);
}
/* Tell whether nothing has been added to the current comment line yet.  */
static inline bool
comment_at_start ()
{
  bool nothing_yet = mixed_string_buffer_is_empty (&comment_buffer);

  return nothing_yet;
}
/* Append the phase 3 value C (byte or Unicode character) to the current
   comment line.  */
static inline void
comment_add (int c)
{
  struct mixed_string_buffer *line = &comment_buffer;

  mixed_string_buffer_append (line, c);
}
/* Finish the current comment line: drop its last CHARS_TO_REMOVE
   characters, then any trailing spaces and tabs, and hand the result to
   savable_comment_add.  The caller must have added at least
   CHARS_TO_REMOVE characters to the line.  */
static inline void
comment_line_end (size_t chars_to_remove)
{
  char *text =
    mixed_string_contents_free1 (mixed_string_buffer_result (&comment_buffer));
  size_t end = strlen (text) - chars_to_remove;

  for (; end >= 1; end--)
    {
      char last = text[end - 1];

      if (last != ' ' && last != '\t')
        break;
    }
  text[end] = '\0';
  savable_comment_add (text);
}
/* These are for tracking whether comments count as immediately before
keyword. */
static int last_comment_line;
static int last_non_comment_line;
/* Return the next phase 4 character.  A comment that is not inside a
   character constant or string literal is replaced with a single ' '
   (C style) or '\n' (C++ style); its text is saved line by line through
   comment_line_end.  Supports the same values as phase 3 otherwise.  */
static int
phase4_getc ()
{
  int first = phase3_getc ();
  int c;
  int kind;

  if (RED (first) != '/')
    return first;

  c = phase3_getc ();
  kind = RED (c);

  if (kind == '*')
    {
      /* C style comment.  */
      bool star_pending = false;

      comment_start ();
      for (;;)
        {
          int r;

          c = phase3_getc ();
          if (c == P2_EOF)
            break;
          r = RED (c);
          /* We skip all leading white space, but not EOLs.  */
          if (!(comment_at_start () && (r == ' ' || r == '\t')))
            comment_add (c);

          if (r == '\n')
            {
              /* Drop the newline itself and start a fresh line.  */
              comment_line_end (1);
              comment_start ();
              star_pending = false;
            }
          else if (r == '*')
            star_pending = true;
          else if (r == '/' && star_pending)
            {
              /* Drop the closing star and slash.  */
              comment_line_end (2);
              break;
            }
          else
            star_pending = false;
        }
      last_comment_line = line_number;
      return ' ';
    }

  if (kind == '/')
    {
      /* C++ style comment.  */
      last_comment_line = line_number;
      comment_start ();
      for (;;)
        {
          c = phase3_getc ();
          if (RED (c) == '\n' || c == P2_EOF)
            break;
          /* We skip all leading white space, but not EOLs.  */
          if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
            comment_add (c);
        }
      phase3_ungetc (c); /* push back the newline, to decrement line_number */
      comment_line_end (0);
      phase3_getc (); /* read the newline again */
      return '\n';
    }

  /* A lone slash: not a comment.  */
  phase3_ungetc (c);
  return first;
}
/* Push the phase 4 value C back by forwarding it to the phase 3 pushback.
   Supports only one pushback character.  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
/* ========================== Reading of tokens. ========================== */
/* The kinds of tokens produced by phase 5.  */
typedef enum token_type_ty
{
  token_type_eof,
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_lbrace,            /* { */
  token_type_rbrace,            /* } */
  token_type_comma,             /* , */
  token_type_dot,               /* . */
  token_type_string_literal,    /* "abc", """text block""" */
  token_type_number,            /* 1.23 */
  token_type_symbol,            /* identifier, keyword, null */
  token_type_plus,              /* + */
  token_type_other              /* character literal, misc. operator */
} token_type_ty;
/* A token, as passed between phase 5, phase 6 and the extractor.  */
typedef struct token_ty
{
  token_type_ty type;
  char *string;                         /* for token_type_symbol */
  mixed_string_ty *mixed_string;        /* for token_type_string_literal */
  refcounted_string_list_ty *comment;   /* for token_type_string_literal */
  int line_number;
} token_ty;
/* Free the memory pointed to by a 'struct token_ty' (not the struct itself).
   A symbol token owns its name string.  A string literal token owns its
   mixed string and holds one reference to the comment list.  */
static inline void
free_token (token_ty *tp)
{
  if (tp->type == token_type_symbol)
    free (tp->string);
  if (tp->type == token_type_string_literal)
    {
      /* A mixed_string_ty owns an array of separately allocated segments
         (see ms->segments[i] in strip_indent), so a plain free() would
         release only the top-level struct and leak the segments.
         mixed_string_free is the matching destructor, expected from
         xg-mixed-string.h.  */
      mixed_string_free (tp->mixed_string);
      drop_reference (tp->comment);
    }
}
/* Read an escape sequence inside a string literal or character literal,
   the backslash having been consumed already.  Returns the designated
   character as a UNICODE value.  At EOF or for an invalid escape, returns
   UNICODE ('\\'); the offending character is pushed back.  */
static inline int
do_getc_escaped ()
{
  /* Use phase 3, because phase 4 elides comments.  */
  int c = phase3_getc ();
  int r;
  int value;

  if (c == P2_EOF)
    return UNICODE ('\\');

  r = RED (c);
  if (r >= '0' && r <= '7')
    {
      /* Octal escape: up to three digits if the first one is 0..3,
         otherwise up to two digits.  */
      int more = (r - '0' < 4 ? 2 : 1);

      value = r - '0';
      while (more > 0)
        {
          c = phase3_getc ();
          if (!(RED (c) >= '0' && RED (c) <= '7'))
            {
              phase3_ungetc (c);
              break;
            }
          value = (value << 3) + (RED (c) - '0');
          more--;
        }
      return UNICODE (value);
    }

  switch (r)
    {
    case 'b':
      value = 0x08;
      break;
    case 't':
      value = 0x09;
      break;
    case 'n':
      value = 0x0a;
      break;
    case 'f':
      value = 0x0c;
      break;
    case 'r':
      value = 0x0d;
      break;
    case '"':
      value = '"';
      break;
    case '\'':
      value = '\'';
      break;
    case '\\':
      value = '\\';
      break;
    default:
      /* Invalid escape sequence.  */
      phase3_ungetc (c);
      value = '\\';
      break;
    }
  return UNICODE (value);
}
/* Read a string literal or character literal. */
static void
accumulate_escaped (struct mixed_string_buffer *literal, int delimiter)
{
int c;
for (;;)
{
/* Use phase 3, because phase 4 elides comments. */
c = phase3_getc ();
if (c == P2_EOF || RED (c) == delimiter)
break;
if (RED (c) == '\n')
{
phase3_ungetc (c);
error_with_progname = false;
if (delimiter == '\'')
error (0, 0, _("%s:%d: warning: unterminated character constant"),
logical_file_name, line_number);
else
error (0, 0, _("%s:%d: warning: unterminated string constant"),
logical_file_name, line_number);
error_with_progname = true;
break;
}
if (RED (c) == '\\')
c = do_getc_escaped ();
mixed_string_buffer_append (literal, c);
}
}
/* Strip the common indentation of the non-blank lines of the given string and
remove all trailing whitespace of all lines.
Like the Java method String.stripIndent does.
<https://docs.oracle.com/en/java/javase/13/docs/api/java.base/java/lang/String.html#stripIndent()>
MS is modified in place: segment contents are compacted and segment lengths
are reduced; no segment is added or removed.
The function makes two passes over all segments of MS:
1. determine minimum_indentation, the smallest number of leading whitespace
characters among the non-blank lines and the last line;
2. perform a sliding copy that drops, on every line, the leading
minimum_indentation whitespace characters and the trailing whitespace.
Segments that are UTF-8 (either utf8_encoded, or source_encoded while the
source encoding is UTF-8) are decoded with u8_mbtouc and classified with
uc_is_java_whitespace; in other segments only ASCII whitespace bytes are
recognized. */
static void
strip_indent (mixed_string_ty *ms)
{
size_t nsegments = ms->nsegments;
/* SIZE_MAX means: no line has contributed an indentation yet. */
size_t minimum_indentation = SIZE_MAX;
/* Pass 1: compute minimum_indentation. */
{
/* Number of whitespace characters seen so far at the start of the current
line; only incremented while the line is still blank. */
size_t curr_line_indentation = 0;
/* True as long as the current line consists only of whitespace. */
bool curr_line_blank = true;
size_t i;
for (i = 0; i < nsegments; i++)
{
struct mixed_string_segment *segment = ms->segments[i];
if (segment->type == utf8_encoded
|| (segment->type == source_encoded
&& xgettext_current_source_encoding == po_charset_utf8))
{
/* Consider Unicode whitespace characters. */
size_t seglength = segment->length;
size_t j;
for (j = 0; j < seglength; )
{
ucs4_t uc;
/* Decode one character; bytes is used to advance to the next one. */
int bytes =
u8_mbtouc (&uc, (const uint8_t *) &segment->contents[j],
seglength - j);
j += bytes;
if (uc == 0x000a)
{
/* Newline. */
/* Only non-blank lines take part in the minimum. */
if (!curr_line_blank)
if (minimum_indentation > curr_line_indentation)
minimum_indentation = curr_line_indentation;
curr_line_indentation = 0;
curr_line_blank = true;
}
else if (uc_is_java_whitespace (uc))
{
/* Whitespace character. */
if (curr_line_blank)
/* Every whitespace character counts as 1, even the TAB
character. */
curr_line_indentation++;
}
else
{
/* Other character. */
curr_line_blank = false;
}
}
}
else
{
/* When the encoding is not UTF-8, consider only ASCII whitespace
characters. */
size_t seglength = segment->length;
size_t j;
for (j = 0; j < seglength; j++)
{
char c = segment->contents[j];
if (c == '\n')
{
/* Newline. */
if (!curr_line_blank)
if (minimum_indentation > curr_line_indentation)
minimum_indentation = curr_line_indentation;
curr_line_indentation = 0;
curr_line_blank = true;
}
else if (c == ' '
|| (c >= 0x09 && c <= 0x0d)
|| (c >= 0x1c && c <= 0x1f))
{
/* Whitespace character. */
if (curr_line_blank)
/* Every whitespace character counts as 1, even the TAB
character. */
curr_line_indentation++;
}
else
{
/* Other character. */
curr_line_blank = false;
}
}
}
}
/* The indentation of the last line matters even if it is blank. */
if (minimum_indentation > curr_line_indentation)
minimum_indentation = curr_line_indentation;
}
/* Pass 2: the same loop as above, but this time remove the leading
minimum_indentation whitespace characters and all trailing whitespace
characters from every line. */
{
/* Position (segment index, byte offset in the already compacted contents)
where the current line starts. */
size_t start_of_curr_line_i = 0;
size_t start_of_curr_line_j = 0;
/* Position just after the last non-whitespace character of the current
line, i.e. where its trailing whitespace starts. */
size_t start_of_trailing_whitespace_i = 0;
size_t start_of_trailing_whitespace_j = 0;
/* Number of leading whitespace characters of the current line that still
have to be seen before the indentation can be cut off. */
size_t whitespace_to_remove = minimum_indentation;
size_t i;
for (i = 0; i < nsegments; i++)
{
struct mixed_string_segment *segment = ms->segments[i];
/* Perform a sliding copy from segment->contents[from_j] to
segment->contents[to_j]. 0 <= to_j <= from_j. */
size_t to_j;
if (segment->type == utf8_encoded
|| (segment->type == source_encoded
&& xgettext_current_source_encoding == po_charset_utf8))
{
/* Consider Unicode whitespace characters. */
size_t seglength = segment->length;
size_t from_j;
for (to_j = from_j = 0; from_j < seglength; )
{
ucs4_t uc;
int bytes =
u8_mbtouc (&uc, (const uint8_t *) &segment->contents[from_j],
seglength - from_j);
if (uc == 0x000a)
{
/* Newline. */
if (whitespace_to_remove > 0)
{
/* It was a blank line with fewer than minimum_indentation
whitespace characters. Remove all this whitespace. */
if (start_of_curr_line_i < i)
{
/* The line started in an earlier segment: truncate that
segment at the line start and empty the ones in between. */
size_t k;
ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
for (k = start_of_curr_line_i + 1; k < i; k++)
ms->segments[k]->length = 0;
to_j = 0;
}
else
to_j = start_of_curr_line_j;
}
else
{
/* Remove the trailing whitespace characters from the
current line. */
if (start_of_trailing_whitespace_i < i)
{
size_t k;
ms->segments[start_of_trailing_whitespace_i]->length = start_of_trailing_whitespace_j;
for (k = start_of_trailing_whitespace_i + 1; k < i; k++)
ms->segments[k]->length = 0;
to_j = 0;
}
else
to_j = start_of_trailing_whitespace_j;
}
}
/* Copy the current character down to its compacted position; source and
destination may overlap, hence memmove. */
if (to_j < from_j)
memmove (&segment->contents[to_j], &segment->contents[from_j], bytes);
from_j += bytes;
to_j += bytes;
if (uc == 0x000a)
{
/* Newline. */
/* A new line starts right after the newline just copied. */
start_of_curr_line_i = i;
start_of_curr_line_j = to_j;
start_of_trailing_whitespace_i = i;
start_of_trailing_whitespace_j = to_j;
whitespace_to_remove = minimum_indentation;
}
else if (uc_is_java_whitespace (uc))
{
/* Whitespace character. */
if (whitespace_to_remove > 0
&& --whitespace_to_remove == 0)
{
/* Remove the leading minimum_indentation whitespace
characters from the current line. */
if (start_of_curr_line_i < i)
{
size_t k;
ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
for (k = start_of_curr_line_i + 1; k < i; k++)
ms->segments[k]->length = 0;
to_j = 0;
}
else
to_j = start_of_curr_line_j;
}
}
else
{
/* Other character. */
/* Pass 1 took the minimum over all non-blank lines, so a non-blank
line cannot have fewer leading whitespace characters than that. */
if (whitespace_to_remove > 0)
abort ();
start_of_trailing_whitespace_i = i;
start_of_trailing_whitespace_j = to_j;
}
}
}
else
{
/* When the encoding is not UTF-8, consider only ASCII whitespace
characters. */
size_t seglength = segment->length;
size_t from_j;
for (to_j = from_j = 0; from_j < seglength; )
{
char c = segment->contents[from_j++];
if (c == '\n')
{
/* Newline. */
if (whitespace_to_remove > 0)
{
/* It was a blank line with fewer than minimum_indentation
whitespace characters. Remove all this whitespace. */
if (start_of_curr_line_i < i)
{
size_t k;
ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
for (k = start_of_curr_line_i + 1; k < i; k++)
ms->segments[k]->length = 0;
to_j = 0;
}
else
to_j = start_of_curr_line_j;
}
else
{
/* Remove the trailing whitespace characters from the
current line. */
if (start_of_trailing_whitespace_i < i)
{
size_t k;
ms->segments[start_of_trailing_whitespace_i]->length = start_of_trailing_whitespace_j;
for (k = start_of_trailing_whitespace_i + 1; k < i; k++)
ms->segments[k]->length = 0;
to_j = 0;
}
else
to_j = start_of_trailing_whitespace_j;
}
}
/* Store the current byte at its compacted position. */
segment->contents[to_j++] = c;
if (c == '\n')
{
/* Newline. */
start_of_curr_line_i = i;
start_of_curr_line_j = to_j;
start_of_trailing_whitespace_i = i;
start_of_trailing_whitespace_j = to_j;
whitespace_to_remove = minimum_indentation;
}
else if (c == ' '
|| (c >= 0x09 && c <= 0x0d)
|| (c >= 0x1c && c <= 0x1f))
{
/* Whitespace character. */
if (whitespace_to_remove > 0
&& --whitespace_to_remove == 0)
{
/* Remove the leading minimum_indentation whitespace
characters from the current line. */
if (start_of_curr_line_i < i)
{
size_t k;
ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
for (k = start_of_curr_line_i + 1; k < i; k++)
ms->segments[k]->length = 0;
to_j = 0;
}
else
to_j = start_of_curr_line_j;
}
}
else
{
/* Other character. */
if (whitespace_to_remove > 0)
abort ();
start_of_trailing_whitespace_i = i;
start_of_trailing_whitespace_j = to_j;
}
}
}
if (i + 1 == nsegments)
{
/* Handle the last line. */
/* It is not terminated by a newline, so apply the same end-of-line
treatment here, after the last segment has been scanned. */
if (whitespace_to_remove > 0)
{
/* It was a blank line with fewer than minimum_indentation
whitespace characters. Remove all this whitespace. */
if (start_of_curr_line_i < i)
{
size_t k;
ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
for (k = start_of_curr_line_i + 1; k < i; k++)
ms->segments[k]->length = 0;
to_j = 0;
}
else
to_j = start_of_curr_line_j;
}
else
{
/* Remove the trailing whitespace characters from the
current line. */
if (start_of_trailing_whitespace_i < i)
{
size_t k;
ms->segments[start_of_trailing_whitespace_i]->length = start_of_trailing_whitespace_j;
for (k = start_of_trailing_whitespace_i + 1; k < i; k++)
ms->segments[k]->length = 0;
to_j = 0;
}
else
to_j = start_of_trailing_whitespace_j;
}
}
/* The segment now ends where the sliding copy stopped. */
segment->length = to_j;
}
}
}
/* Combine characters into tokens. Discard whitespace. */
static token_ty phase5_pushback[3];
static int phase5_pushback_length;
/* Read the next token into *TP.
A token pushed back with phase5_unget is returned first. Otherwise
characters are read through phase 4 (phase 3 inside literals), whitespace
and comments are skipped, and TP->type is set to the kind of token found.
TP->string is set for symbols (a fresh copy made with xstrdup);
TP->mixed_string and TP->comment are set for string literals and text
blocks. TP->line_number is the line on which the token starts.
Also maintains last_non_comment_line and resets the savable comment at a
newline when the last non-comment token came after the last comment. */
static void
phase5_get (token_ty *tp)
{
int c;
if (phase5_pushback_length)
{
*tp = phase5_pushback[--phase5_pushback_length];
return;
}
tp->string = NULL;
for (;;)
{
tp->line_number = line_number;
c = phase4_getc ();
if (c == P2_EOF)
{
tp->type = token_type_eof;
return;
}
switch (RED (c))
{
case '\n':
if (last_non_comment_line > last_comment_line)
savable_comment_reset ();
FALLTHROUGH;
case ' ':
case '\t':
case '\f':
/* Ignore whitespace and comments. */
continue;
}
last_non_comment_line = tp->line_number;
switch (RED (c))
{
case '(':
tp->type = token_type_lparen;
return;
case ')':
tp->type = token_type_rparen;
return;
case '{':
tp->type = token_type_lbrace;
return;
case '}':
tp->type = token_type_rbrace;
return;
case ',':
tp->type = token_type_comma;
return;
case '.':
/* A dot followed by a digit starts a number; otherwise it is the
member access operator. */
c = phase4_getc ();
if (!(RED (c) >= '0' && RED (c) <= '9'))
{
phase4_ungetc (c);
tp->type = token_type_dot;
return;
}
FALLTHROUGH;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
{
/* Don't need to verify the complicated syntax of integers and
floating-point numbers. We assume a valid Java input.
The simplified syntax that we recognize as number is: any
sequence of alphanumeric characters, additionally '+' and '-'
immediately after 'e' or 'E' except in hexadecimal numbers. */
bool hexadecimal = false;
for (;;)
{
c = phase4_getc ();
if (RED (c) >= '0' && RED (c) <= '9')
continue;
if ((RED (c) >= 'A' && RED (c) <= 'Z')
|| (RED (c) >= 'a' && RED (c) <= 'z'))
{
if (RED (c) == 'X' || RED (c) == 'x')
hexadecimal = true;
if ((RED (c) == 'E' || RED (c) == 'e') && !hexadecimal)
{
/* Consume the sign of the exponent, if there is one. */
c = phase4_getc ();
if (!(RED (c) == '+' || RED (c) == '-'))
phase4_ungetc (c);
}
continue;
}
if (RED (c) == '.')
continue;
break;
}
phase4_ungetc (c);
tp->type = token_type_number;
return;
}
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
case 'V': case 'W': case 'X': case 'Y': case 'Z':
case '_':
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
case 'v': case 'w': case 'x': case 'y': case 'z':
/* Although Java allows identifiers containing many Unicode
characters, we recognize only identifiers consisting of ASCII
characters. This avoids conversion hassles w.r.t. the --keyword
arguments, and shouldn't be a big problem in practice. */
{
/* Scratch buffer, kept across calls and grown on demand. */
static char *buffer;
static int bufmax;
int bufpos = 0;
for (;;)
{
if (bufpos >= bufmax)
{
bufmax = 2 * bufmax + 10;
buffer = xrealloc (buffer, bufmax);
}
buffer[bufpos++] = RED (c);
c = phase4_getc ();
if (!((RED (c) >= 'A' && RED (c) <= 'Z')
|| (RED (c) >= 'a' && RED (c) <= 'z')
|| (RED (c) >= '0' && RED (c) <= '9')
|| RED (c) == '_'))
break;
}
phase4_ungetc (c);
/* Make room for the terminating NUL. */
if (bufpos >= bufmax)
{
bufmax = 2 * bufmax + 10;
buffer = xrealloc (buffer, bufmax);
}
buffer[bufpos] = '\0';
tp->string = xstrdup (buffer);
tp->type = token_type_symbol;
return;
}
case '"':
{
/* Look ahead for two more double-quotes, which open a text block.
If they are not there, push back what was read and parse an
ordinary string literal below. */
int c2 = phase3_getc ();
if (c2 == '"')
{
int c3 = phase3_getc ();
if (c3 == '"')
{
/* Text block. Specification:
<https://docs.oracle.com/javase/specs/jls/se13/preview/text-blocks.html> */
struct mixed_string_buffer block;
unsigned int consecutive_unescaped_doublequotes;
mixed_string_ty *block_content;
/* Parse the part up to and including the first newline. */
for (;;)
{
int ic = phase3_getc ();
if (ic == P2_EOF)
{
error_with_progname = false;
error (0, 0, _("%s:%d: warning: unterminated text block"),
logical_file_name, line_number);
error_with_progname = true;
tp->type = token_type_other;
return;
}
if (RED (ic) == ' ' || RED (ic) == '\t' || RED (ic) == '\f')
;
else if (RED (ic) == '\n')
break;
else
{
error_with_progname = false;
error (0, 0, _("%s:%d: warning: invalid syntax in text block"),
logical_file_name, line_number);
error_with_progname = true;
tp->type = token_type_other;
return;
}
}
/* Parse the part after the first newline. */
mixed_string_buffer_init (&block, lc_string,
logical_file_name, line_number);
consecutive_unescaped_doublequotes = 0;
for (;;)
{
int ic = phase3_getc ();
if (RED (ic) == '"')
{
/* Hold back double-quotes until it is known whether three
in a row close the block. */
consecutive_unescaped_doublequotes++;
if (consecutive_unescaped_doublequotes == 3)
break;
}
else
{
/* The held-back double-quotes were part of the content. */
while (consecutive_unescaped_doublequotes > 0)
{
mixed_string_buffer_append (&block, '"');
consecutive_unescaped_doublequotes--;
}
if (ic == P2_EOF)
{
error_with_progname = false;
error (0, 0, _("%s:%d: warning: unterminated text block"),
logical_file_name, block.line_number);
error_with_progname = true;
break;
}
if (RED (ic) == '\\')
ic = do_getc_escaped ();
mixed_string_buffer_append (&block, ic);
}
}
block_content = mixed_string_buffer_result (&block);
/* Remove the common indentation from the content. */
strip_indent (block_content);
tp->mixed_string = block_content;
tp->comment = add_reference (savable_comment);
tp->type = token_type_string_literal;
return;
}
phase3_ungetc (c3);
}
phase3_ungetc (c2);
}
/* String literal. */
{
struct mixed_string_buffer literal;
mixed_string_buffer_init (&literal, lc_string,
logical_file_name, line_number);
accumulate_escaped (&literal, '"');
tp->mixed_string = mixed_string_buffer_result (&literal);
tp->comment = add_reference (savable_comment);
tp->type = token_type_string_literal;
return;
}
case '\'':
/* Character literal. */
{
/* The contents are parsed only to find the end of the literal and
are then discarded. */
struct mixed_string_buffer literal;
mixed_string_buffer_init (&literal, lc_outside,
logical_file_name, line_number);
accumulate_escaped (&literal, '\'');
mixed_string_buffer_destroy (&literal);
tp->type = token_type_other;
return;
}
case '+':
c = phase4_getc ();
if (RED (c) == '+')
/* Operator ++ */
tp->type = token_type_other;
else if (RED (c) == '=')
/* Operator += */
tp->type = token_type_other;
else
{
/* Operator + */
phase4_ungetc (c);
tp->type = token_type_plus;
}
return;
default:
/* Misc. operator. */
tp->type = token_type_other;
return;
}
}
}
/* Push the token *TP back so that the next phase5_get returns it.
   An EOF token is ignored.  Supports 3 tokens of pushback; aborts when
   the buffer is full.  */
static void
phase5_unget (token_ty *tp)
{
  if (tp->type == token_type_eof)
    return;
  if (phase5_pushback_length == SIZEOF (phase5_pushback))
    abort ();
  phase5_pushback[phase5_pushback_length] = *tp;
  phase5_pushback_length++;
}
/* Compile-time optimization of string literal concatenation.
Combine "string1" + ... + "stringN" to the concatenated string if
- the token before this expression is not ')' (because then the first
string could be part of a cast expression),
- the token after this expression is not '.' (because then the last
string could be part of a method call expression). */
static token_ty phase6_pushback[2];
static int phase6_pushback_length;
static token_type_ty phase6_last;
/* Read the next token into *TP, folding "string1" + ... + "stringN" into
   a single string literal token.  Folding starts only when the previous
   token was not ')' and absorbs a further operand only when the token
   after it is not '.'.  Tokens pushed back with phase6_unget are returned
   first, unchanged.  */
static void
phase6_get (token_ty *tp)
{
  if (phase6_pushback_length != 0)
    {
      phase6_pushback_length--;
      *tp = phase6_pushback[phase6_pushback_length];
      return;
    }

  phase5_get (tp);
  if (tp->type == token_type_string_literal && phase6_last != token_type_rparen)
    {
      mixed_string_ty *joined = tp->mixed_string;
      bool absorbed;

      do
        {
          token_ty op;

          absorbed = false;
          phase5_get (&op);
          if (op.type == token_type_plus)
            {
              token_ty rhs;

              phase5_get (&rhs);
              if (rhs.type == token_type_string_literal)
                {
                  token_ty lookahead;

                  phase5_get (&lookahead);
                  absorbed = (lookahead.type != token_type_dot);
                  if (absorbed)
                    joined =
                      mixed_string_concat_free1 (joined, rhs.mixed_string);
                  /* The lookahead token is never consumed here.  */
                  phase5_unget (&lookahead);
                  if (absorbed)
                    {
                      free_token (&rhs);
                      free_token (&op);
                    }
                }
              if (!absorbed)
                phase5_unget (&rhs);
            }
          if (!absorbed)
            phase5_unget (&op);
        }
      while (absorbed);

      tp->mixed_string = joined;
    }
  phase6_last = tp->type;
}
/* Push the token *TP back so that the next phase6_get returns it.
   An EOF token is ignored.  Supports 2 tokens of pushback; aborts when
   the buffer is full.  */
static void
phase6_unget (token_ty *tp)
{
  if (tp->type == token_type_eof)
    return;
  if (phase6_pushback_length == SIZEOF (phase6_pushback))
    abort ();
  phase6_pushback[phase6_pushback_length] = *tp;
  phase6_pushback_length++;
}
/* Read the next token into *TP.  This is the lexer entry point used by
   the extractor; it forwards to phase 6.  */
static void
x_java_lex (token_ty *tp)
{
  phase6_get (tp);
}
/* Push the token *TP back to the lexer by forwarding it to the phase 6
   pushback.  Supports 2 tokens of pushback.  */
static void
x_java_unlex (token_ty *tp)
{
  phase6_unget (tp);
}
/* ========================= Extracting strings. ========================== */
/* Context lookup table. */
static flag_context_list_table_ty *flag_context_list_table;
/* The file is broken into tokens. Scan the token stream, looking for
a keyword, followed by a left paren, followed by a string. When we
see this sequence, we have something to remember. We assume we are
looking at a valid Java program, and leave the complaints about
the grammar to the compiler.
Normal handling: Look for
keyword ( ... msgid ... )
Plural handling: Look for
keyword ( ... msgid ... msgid_plural ... )
We use recursion because the arguments before msgid or between msgid
and msgid_plural can contain subexpressions of the same form. */
/* Extract messages until the next balanced closing parenthesis or brace,
depending on TERMINATOR (token_type_rparen or token_type_rbrace; with any
other value, such as the token_type_eof passed by extract_java, closing
delimiters are skipped silently and only eof ends the scan).
Extracted messages are added to MLP.
OUTER_CONTEXT is the flag context of the enclosing construct. CONTEXT_ITER
is advanced once per argument; its result is combined with OUTER_CONTEXT
to form the context of the current argument.
ARGPARSER receives the string literals seen at this nesting level (unless
extract_all is set); arglist_parser_done is called on it before every
return.
Return true upon eof, false upon closing parenthesis or brace. */
static bool
extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
flag_context_ty outer_context,
flag_context_list_iterator_ty context_iter,
struct arglist_parser *argparser)
{
/* Current argument number. */
int arg = 1;
/* 0 when no keyword has been seen. 1 right after a keyword is seen. */
int state;
/* Parameters of the keyword just seen. Defined only in state 1. */
const struct callshapes *next_shapes = NULL;
/* Context iterator that will be used if the next token is a '('. */
flag_context_list_iterator_ty next_context_iter =
passthrough_context_list_iterator;
/* Current context. */
flag_context_ty inner_context =
inherited_context (outer_context,
flag_context_list_iterator_advance (&context_iter));
/* Start state is 0. */
state = 0;
/* Each iteration consumes one token (the symbol case may consume a whole
dotted name). Every case either returns or continues the loop. */
for (;;)
{
token_ty token;
x_java_lex (&token);
switch (token.type)
{
case token_type_symbol:
{
/* Combine symbol1 . ... . symbolN to a single string, so that
we can recognize static function calls like
GettextResource.gettext. The information present for
symbolI.....symbolN has precedence over the information for
symbolJ.....symbolN with J > I. */
char *sum = token.string;
size_t sum_len = strlen (sum);
const char *dottedname;
flag_context_list_ty *context_list;
/* Greedily append ". symbol" pairs to SUM. Tokens that do not
continue the dotted name are pushed back. */
for (;;)
{
token_ty token2;
x_java_lex (&token2);
if (token2.type == token_type_dot)
{
token_ty token3;
x_java_lex (&token3);
if (token3.type == token_type_symbol)
{
char *addend = token3.string;
size_t addend_len = strlen (addend);
/* Room for the old name, the '.', the addend and the
terminating NUL. */
sum =
(char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
sum[sum_len] = '.';
memcpy (sum + sum_len + 1, addend, addend_len + 1);
sum_len += 1 + addend_len;
free_token (&token3);
free_token (&token2);
continue;
}
x_java_unlex (&token3);
}
x_java_unlex (&token2);
break;
}
/* Look up the keyword table, first with the full dotted name, then
with each suffix that starts after a '.'. The first hit wins and
puts us in state 1; no hit at all means state 0. */
for (dottedname = sum;;)
{
void *keyword_value;
if (hash_find_entry (&keywords, dottedname, strlen (dottedname),
&keyword_value)
== 0)
{
next_shapes = (const struct callshapes *) keyword_value;
state = 1;
break;
}
dottedname = strchr (dottedname, '.');
if (dottedname == NULL)
{
state = 0;
break;
}
dottedname++;
}
/* Same suffix search in the flag context table. CONTEXT_LIST stays
NULL if no suffix is found. */
for (dottedname = sum;;)
{
context_list =
flag_context_list_table_lookup (
flag_context_list_table,
dottedname, strlen (dottedname));
if (context_list != NULL)
break;
dottedname = strchr (dottedname, '.');
if (dottedname == NULL)
break;
dottedname++;
}
next_context_iter = flag_context_list_iterator (context_list);
/* SUM started out as token.string and was possibly reallocated;
it is released here. */
free (sum);
continue;
}
case token_type_lparen:
/* Process the parenthesized group recursively with a newly allocated
argument parser. The call shapes of a keyword are used only if
the '(' directly follows that keyword (state 1). */
if (extract_parenthesized (mlp, token_type_rparen,
inner_context, next_context_iter,
arglist_parser_alloc (mlp,
state ? next_shapes : NULL)))
{
/* Eof was hit inside the group: finish our own parser and
propagate the eof indication. */
arglist_parser_done (argparser, arg);
return true;
}
next_context_iter = null_context_list_iterator;
state = 0;
continue;
case token_type_rparen:
if (terminator == token_type_rparen)
{
arglist_parser_done (argparser, arg);
return false;
}
/* A ')' while a '}' is awaited is reported as a warning and then
skipped. */
if (terminator == token_type_rbrace)
{
error_with_progname = false;
error (0, 0,
_("%s:%d: warning: ')' found where '}' was expected"),
logical_file_name, token.line_number);
error_with_progname = true;
}
next_context_iter = null_context_list_iterator;
state = 0;
continue;
case token_type_lbrace:
/* Process the brace-delimited group recursively, starting from the
null context and with an argument parser without call shapes. */
if (extract_parenthesized (mlp, token_type_rbrace,
null_context, null_context_list_iterator,
arglist_parser_alloc (mlp, NULL)))
{
/* Eof was hit inside the group: finish our own parser and
propagate the eof indication. */
arglist_parser_done (argparser, arg);
return true;
}
next_context_iter = null_context_list_iterator;
state = 0;
continue;
case token_type_rbrace:
if (terminator == token_type_rbrace)
{
arglist_parser_done (argparser, arg);
return false;
}
/* A '}' while a ')' is awaited is reported as a warning and then
skipped. */
if (terminator == token_type_rparen)
{
error_with_progname = false;
error (0, 0,
_("%s:%d: warning: '}' found where ')' was expected"),
logical_file_name, token.line_number);
error_with_progname = true;
}
next_context_iter = null_context_list_iterator;
state = 0;
continue;
case token_type_comma:
/* Next argument: bump the argument number and compute its context
from the next element of CONTEXT_ITER. */
arg++;
inner_context =
inherited_context (outer_context,
flag_context_list_iterator_advance (
&context_iter));
next_context_iter = passthrough_context_list_iterator;
state = 0;
continue;
case token_type_string_literal:
{
lex_pos_ty pos;
pos.file_name = logical_file_name;
pos.line_number = token.line_number;
if (extract_all)
{
/* Every string literal is a message: convert the mixed string
to a plain string, free the mixed string, and record the
message directly in MLP. */
char *string = mixed_string_contents (token.mixed_string);
mixed_string_free (token.mixed_string);
remember_a_message (mlp, NULL, string, true, false,
inner_context, &pos,
NULL, token.comment, true);
}
else
/* Hand the literal to the argument parser as argument number ARG.
NOTE(review): token.mixed_string is not freed on this path, so
presumably the parser takes ownership of it - confirm. */
arglist_parser_remember (argparser, arg, token.mixed_string,
inner_context,
pos.file_name, pos.line_number,
token.comment, true);
}
/* The token's own reference to its comment is no longer needed. */
drop_reference (token.comment);
next_context_iter = null_context_list_iterator;
state = 0;
continue;
case token_type_eof:
arglist_parser_done (argparser, arg);
return true;
/* All other tokens merely cancel a pending keyword and context. */
case token_type_dot:
case token_type_number:
case token_type_plus:
case token_type_other:
next_context_iter = null_context_list_iterator;
state = 0;
continue;
default:
/* Unknown token type: cannot happen. */
abort ();
}
}
}
/* Scan the Java source file read from the stream F and add the messages
   found in it to the first message list of MDLP.
   REAL_FILENAME and LOGICAL_FILENAME are stored in the file-level variables
   real_file_name and logical_file_name (the latter as a fresh copy) for the
   duration of the scan.
   FLAG_TABLE becomes the context lookup table used by
   extract_parenthesized.  */
void
extract_java (FILE *f,
              const char *real_filename, const char *logical_filename,
              flag_context_list_table_ty *flag_table,
              msgdomain_list_ty *mdlp)
{
  message_list_ty *messages = mdlp->item[0]->messages;
  bool reached_eof;

  /* Input stream and position.  */
  fp = f;
  real_file_name = real_filename;
  logical_file_name = xstrdup (logical_filename);
  line_number = 1;

  /* Reset the pushback buffers of the lower lexer phases.  */
  phase1_pushback_length = 0;
  phase2_pushback_length = 0;
  phase3_pushback_length = 0;

  /* No comment and no code line has been seen yet.  */
  last_comment_line = -1;
  last_non_comment_line = -1;

  /* Reset the token-level phases.  */
  phase5_pushback_length = 0;
  phase6_pushback_length = 0;
  phase6_last = token_type_eof;

  flag_context_list_table = flag_table;

  init_keywords ();

  /* Eat tokens until eof is seen.  Whenever extract_parenthesized returns
     false, i.e. it stopped at a closing delimiter rather than at eof,
     simply start it again with a fresh argument parser.  */
  do
    reached_eof =
      extract_parenthesized (messages, token_type_eof,
                             null_context, null_context_list_iterator,
                             arglist_parser_alloc (messages, NULL));
  while (!reached_eof);

  /* Leave the file-level scanner state cleared.  */
  fp = NULL;
  real_file_name = NULL;
  logical_file_name = NULL;
  line_number = 0;
}