blob: a3f70086b41742771046aa2939d14942b88d23e9 [file] [log] [blame] [edit]
/* xgettext sh backend.
Copyright (C) 2003, 2005-2009, 2014, 2018-2020 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2003.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
/* Specification. */
#include "x-sh.h"
#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "attribute.h"
#include "message.h"
#include "xgettext.h"
#include "xg-pos.h"
#include "xg-mixed-string.h"
#include "xg-arglist-context.h"
#include "xg-arglist-callshape.h"
#include "xg-arglist-parser.h"
#include "xg-message.h"
#include "error.h"
#include "error-progname.h"
#include "xalloc.h"
#include "mem-hash-map.h"
#include "../../gettext-runtime/src/escapes.h"
#include "gettext.h"
#define _(s) gettext(s)
/* Number of elements of the array A.  A must be a real array, not a
   pointer; the argument is parenthesized so that any array-valued
   expression can be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
/* The sh syntax is defined in POSIX:2001, see
http://www.opengroup.org/onlinepubs/007904975/utilities/xcu_chap02.html
Summary of sh syntax:
- Input is broken into words, which are then subject to
- tilde expansion ~...
- command substitution `...`
- variable substitution $var
- arithmetic substitution $((...))
- field splitting at whitespace (IFS)
- wildcard pattern expansion *?
- quote removal
- Strings are enclosed in "..."; command substitution, variable
substitution and arithmetic substitution are performed here as well.
- '...' is a string without substitutions.
- The list of resulting words is split into commands by semicolon and
newline.
- '#' at the beginning of a word introduces a comment until end of line.
The parser is implemented in bash-2.05b/parse.y. */
/* ====================== Keyword set customization. ====================== */
/* If true extract all strings. */
static bool extract_all = false;
/* Table of keywords, filled through insert_keyword_callshape() in
x_sh_keyword() and consulted by read_command() when it sees a command name.
It is initialized lazily, when the first keyword is registered. */
static hash_table keywords;
/* True as long as the built-in keywords still have to be registered.
Cleared by x_sh_keyword (NULL) and by init_keywords() once it has run. */
static bool default_keywords = true;
/* Request that every constant string word be recorded as a message
   (see the extract_all test in read_command), instead of only the
   arguments of registered keywords.  */
void
x_sh_extract_all (void)
{
  extract_all = true;
}
/* Register one keyword specification, e.g. "ngettext:1,2".
   A NULL NAME registers nothing; it only switches off the built-in
   default keywords.  */
void
x_sh_keyword (const char *name)
{
  struct callshape shape;
  const char *name_end;
  const char *first_colon;

  if (name == NULL)
    {
      default_keywords = false;
      return;
    }

  /* The table is created on first use.  */
  if (keywords.table == NULL)
    hash_init (&keywords, 100);

  split_keywordspec (name, &name_end, &shape);

  /* The characters between name and name_end should form a valid C
     identifier.  A colon in that range means an invalid parse in
     split_keywordspec(); such a specification is silently dropped.  */
  first_colon = strchr (name, ':');
  if (first_colon != NULL && first_colon < name_end)
    return;

  insert_keyword_callshape (&keywords, name, name_end - name, &shape);
}
/* Finish initializing the keywords hash table.
   Called after argument processing, before each file is processed.
   Registers the built-in keywords at most once, and only if they have
   not been disabled through x_sh_keyword (NULL).  */
static void
init_keywords ()
{
  /* When adding new keywords here, also update the documentation in
     xgettext.texi!
     Note: There is also special handling for 'gettext' and 'ngettext'
     in read_command, below.  */
  static const char *const builtin_specs[] =
    {
      "gettext",
      "ngettext:1,2",
      "eval_gettext",
      "eval_ngettext:1,2",
      "eval_pgettext:1c,2",
      "eval_npgettext:1c,2,3"
    };
  size_t i;

  if (!default_keywords)
    return;

  for (i = 0; i < sizeof builtin_specs / sizeof builtin_specs[0]; i++)
    x_sh_keyword (builtin_specs[i]);

  default_keywords = false;
}
/* Record the format-string flags of the built-in sh keywords by passing
   each specification to xgettext_record_flag(), in the order listed.  */
void
init_flag_table_sh ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-sh-format",
      "ngettext:1:pass-sh-format",
      "ngettext:2:pass-sh-format",
      "eval_gettext:1:sh-format",
      "eval_ngettext:1:sh-format",
      "eval_ngettext:2:sh-format",
      "eval_pgettext:2:sh-format",
      "eval_npgettext:2:sh-format",
      "eval_npgettext:3:sh-format"
    };
  size_t i;

  for (i = 0; i < sizeof flag_specs / sizeof flag_specs[0]; i++)
    xgettext_record_flag (flag_specs[i]);
}
/* ======================== Reading of characters. ======================== */
/* The input file stream.  */
static FILE *fp;

/* Fetch the next character from the input file.
   Returns EOF at end of input.  A read error is fatal: it is reported
   through error() with EXIT_FAILURE.  Every newline read increments
   line_number.  */
static int
do_getc ()
{
  int ch = getc (fp);

  switch (ch)
    {
    case EOF:
      if (ferror (fp))
        error (EXIT_FAILURE, errno,
               _("error while reading \"%s\""), real_file_name);
      break;

    case '\n':
      line_number++;
      break;

    default:
      break;
    }

  return ch;
}
/* Put back the last fetched character, not EOF.
   A newline that is put back is un-counted again, mirroring do_getc.  */
static void
do_ungetc (int c)
{
  line_number -= (c == '\n');
  ungetc (c, fp);
}
/* Remove backslash followed by newline from the input stream.  */

/* Pushback stack used by phase1_ungetc / phase1_getc.
   Two slots are required: while parsing $'...', read_word handles a "\x"
   that is not followed by a hexadecimal digit by pushing back first the
   offending character and then the 'x'.  With a single slot the second
   push ran into the abort () in phase1_ungetc.  */
static int phase1_pushback[2];
static int phase1_pushback_length;

/* Return the next input character, with every backslash-newline pair
   removed.  Pushed-back characters are returned first, most recently
   pushed one first.  Returns EOF at end of input.  */
static int
phase1_getc ()
{
  int c;

  if (phase1_pushback_length)
    {
      c = phase1_pushback[--phase1_pushback_length];
      /* phase1_ungetc un-counted this newline; count it again.  */
      if (c == '\n')
        ++line_number;
      return c;
    }
  for (;;)
    {
      c = do_getc ();
      if (c != '\\')
        return c;
      c = do_getc ();
      if (c != '\n')
        {
          /* A backslash not followed by newline is an ordinary character;
             give back the lookahead and deliver the backslash.  */
          if (c != EOF)
            do_ungetc (c);
          return '\\';
        }
      /* Backslash-newline: drop both characters and read on.  */
    }
}

/* Push back C so that the next phase1_getc returns it.
   Supports 2 characters of pushback; pushing EOF is a no-op.  */
static void
phase1_ungetc (int c)
{
  switch (c)
    {
    case EOF:
      break;

    case '\n':
      --line_number;
      FALLTHROUGH;
    default:
      if (phase1_pushback_length
          == sizeof phase1_pushback / sizeof phase1_pushback[0])
        abort ();
      phase1_pushback[phase1_pushback_length++] = c;
      break;
    }
}
/* ========================== Reading of tokens. ========================== */
/* A token consists of a sequence of characters.
The characters are kept in a growable buffer that is not NUL-terminated;
string_of_token() makes a NUL-terminated copy. */
struct token
{
int allocated; /* number of chars allocated for 'chars' */
int charcount; /* number of chars in use, <= allocated */
char *chars; /* the token's constituents */
};
/* Initialize a 'struct token' to the empty token, with room for a few
   characters already allocated.  */
static inline void
init_token (struct token *tp)
{
  enum { INITIAL_CAPACITY = 10 };

  tp->charcount = 0;
  tp->allocated = INITIAL_CAPACITY;
  tp->chars = XNMALLOC (INITIAL_CAPACITY, char);
}
/* Free the memory pointed to by a 'struct token'.
   The struct itself is not freed.  Its fields are reset so that it no
   longer carries a dangling pointer: an accidental second free_token is
   then harmless.  */
static inline void
free_token (struct token *tp)
{
  free (tp->chars);
  tp->chars = NULL;
  tp->allocated = 0;
  tp->charcount = 0;
}
/* Ensure there is enough room in the token for one more character.
   The capacity is doubled when the buffer is exactly full.  */
static inline void
grow_token (struct token *tp)
{
  if (tp->charcount != tp->allocated)
    return;

  tp->allocated = 2 * tp->allocated;
  tp->chars = xrealloc (tp->chars, tp->allocated * sizeof *tp->chars);
}
/* Convert a struct token * to a char*.
   Returns a freshly allocated, NUL-terminated copy of the token's
   characters.  */
static char *
string_of_token (const struct token *tp)
{
  int len = tp->charcount;
  char *copy = XNMALLOC (len + 1, char);

  memcpy (copy, tp->chars, len);
  copy[len] = '\0';
  return copy;
}
/* ========================= Accumulating messages ========================= */
/* The message list that extracted messages are added to. Set by
extract_sh() for the file being processed. */
static message_list_ty *mlp;
/* ========================= Accumulating comments ========================= */
/* Growable buffer holding the text of the comment line being read.  */
static char *buffer;
static size_t bufmax;   /* number of bytes allocated for buffer */
static size_t buflen;   /* number of bytes in use */

/* Make sure that buffer[buflen] may be written.  */
static inline void
comment_ensure_room ()
{
  if (buflen >= bufmax)
    {
      bufmax = 2 * bufmax + 10;
      buffer = xrealloc (buffer, bufmax);
    }
}

/* Begin accumulating a new comment line.  */
static inline void
comment_start ()
{
  buflen = 0;
}

/* Append the character C to the current comment line.  */
static inline void
comment_add (int c)
{
  comment_ensure_room ();
  buffer[buflen++] = c;
}

/* Finish the current comment line: strip trailing spaces and tabs,
   NUL-terminate it and hand it to savable_comment_add().  */
static inline void
comment_line_end ()
{
  for (; buflen >= 1; --buflen)
    {
      char last = buffer[buflen - 1];

      if (!(last == ' ' || last == '\t'))
        break;
    }

  comment_ensure_room ();
  buffer[buflen] = '\0';
  savable_comment_add (buffer);
}
/* These are for tracking whether comments count as immediately before
keyword.
read_word() stores the current line number in last_comment_line when it
meets a '#' comment, and in last_non_comment_line when it has read a word;
at a newline, the saved comments are discarded if
last_non_comment_line > last_comment_line. Both start at -1 for each file. */
static int last_comment_line;
static int last_non_comment_line;
/* ========================= Debackslashification ========================== */
/* This state tracks the effect of backquotes, double-quotes and single-quotes
on the parsing of backslashes. We make a single pass through the input
file, keeping the state up to date. This is much faster than accumulating
strings and processing them with explicit debackslashification, like the
shell does it. */
/* The number of nested `...` or "`...`" constructs. Assumed to be <= 32.
It is used as a shift count on 'unsigned int' values in the saw_*_backquote
functions and in phase2_getc, so deeper nesting is not supported. */
static unsigned int nested_backquotes;
/* A bit mask indicating which of the currently open `...` or "`...`"
constructs is with double-quotes: "`...`".
A bit value of 1 stands for "`...`", a bit value of 0 stands for `...`.
Bit position 0 designates the outermost backquotes nesting,
bit position 1 the second-outermost backquotes nesting,
...
bit position (nested_backquotes-1) the innermost backquotes nesting. */
static unsigned int open_doublequotes_mask;
/* A bit indicating whether a double-quote is currently open inside the
innermost backquotes nesting. */
static bool open_doublequote;
/* A bit indicating whether a single-quote is currently open inside the
innermost backquotes nesting. */
static bool open_singlequote;
/* The expected terminator of the currently open single-quote.
Usually '\'', but can be '"' for i18n-quotes.
Only meaningful while open_singlequote is true. */
static char open_singlequote_terminator;
/* Functions to update the state.  */

/* Enter a new `...` nesting level.  Whether the level was opened from
   inside "..." is remembered in open_doublequotes_mask; inside the new
   level no double-quote is open.  Aborts if a single-quote is open.  */
static inline void
saw_opening_backquote ()
{
  if (open_singlequote)
    abort ();

  if (open_doublequote)
    {
      unsigned int level_bit = (unsigned int) 1 << nested_backquotes;

      open_doublequotes_mask |= level_bit;
    }
  open_doublequote = false;
  nested_backquotes++;
}
/* Leave the innermost `...` nesting level and restore the double-quote
   state that was in effect when that level was opened.  */
static inline void
saw_closing_backquote ()
{
  unsigned int level = --nested_backquotes;
  unsigned int level_bit = (unsigned int) 1 << level;

  open_doublequote = (open_doublequotes_mask & level_bit) != 0;
  /* Keep only the bits of the levels that are still open.  */
  open_doublequotes_mask &= level_bit - 1;
  open_singlequote = false; /* just for safety */
}
/* Record an opening double-quote.  Aborts if any quote is already
   open.  */
static inline void
saw_opening_doublequote ()
{
  if (!open_singlequote && !open_doublequote)
    open_doublequote = true;
  else
    abort ();
}
/* Record a closing double-quote.  Aborts unless a double-quote, and no
   single-quote, is open.  */
static inline void
saw_closing_doublequote ()
{
  if (open_doublequote && !open_singlequote)
    open_doublequote = false;
  else
    abort ();
}
/* Record an opening single-quote, expecting '\'' as its terminator.
   Aborts if any quote is already open.  */
static inline void
saw_opening_singlequote ()
{
  if (!open_doublequote && !open_singlequote)
    {
      open_singlequote_terminator = '\'';
      open_singlequote = true;
    }
  else
    abort ();
}
/* Record a closing single-quote.  Aborts unless a single-quote, and no
   double-quote, is open.  */
static inline void
saw_closing_singlequote ()
{
  if (open_singlequote && !open_doublequote)
    open_singlequote = false;
  else
    abort ();
}
/* ========================== Reading of commands ========================== */
/* We are only interested in constant strings. Other words need not be
represented precisely.
The last four values are produced by read_word() to signal the end of a
command; read_command() returns them to its caller. */
enum word_type
{
t_string, /* constant string */
t_assignment, /* variable assignment */
t_other, /* other string */
t_separator, /* command separator: semicolon or newline */
t_redirect, /* redirection: one of < > >| << <<- >> <> <& >& */
t_backquote, /* closing '`' pseudo word */
t_paren, /* closing ')' pseudo word */
t_eof /* EOF marker */
};
/* A word as returned by read_word(). The 'token' and
'line_number_at_start' fields are valid only when type == t_string; in
that case 'token' is heap-allocated and released by free_word(). */
struct word
{
enum word_type type;
struct token *token; /* for t_string */
int line_number_at_start; /* for t_string */
};
/* Free the memory pointed to by a 'struct word'.
   Only words of type t_string own a token; for all other types this is
   a no-op.  */
static inline void
free_word (struct word *wp)
{
  if (wp->type != t_string)
    return;

  free_token (wp->token);
  free (wp->token);
}
/* Convert a t_string token to a char*.
   Returns a freshly allocated, NUL-terminated copy of the word's
   characters.  Aborts if the word is not of type t_string.  */
static char *
string_of_word (const struct word *wp)
{
  if (wp->type != t_string)
    abort ();

  return string_of_token (wp->token);
}
/* Convert a t_string token to a char*, ignoring the first OFFSET bytes. */
static char *
substring_of_word (const struct word *wp, size_t offset)
{
char *str;
int n;
if (!(wp->type == t_string))
abort ();
n = wp->token->charcount;
if (!(offset <= n))
abort ();
str = XNMALLOC (n - offset + 1, char);
memcpy (str, wp->token->chars + offset, n - offset);
str[n - offset] = '\0';
return str;
}
/* Whitespace recognition.
   True exactly for an unquoted space, tab or newline.  */
static inline bool
is_whitespace (int c)
{
  switch (c)
    {
    case ' ':
    case '\t':
    case '\n':
      return true;

    default:
      return false;
    }
}
/* Operator character recognition.
   True exactly for the unquoted characters | & ; < > ( )  */
static inline bool
is_operator_start (int c)
{
  switch (c)
    {
    case '|':
    case '&':
    case ';':
    case '<':
    case '>':
    case '(':
    case ')':
      return true;

    default:
      return false;
    }
}
/* Denotation of a quoted character.
The distinction between quoted and unquoted character is important only for
the special, whitespace and operator characters; it is irrelevant for
alphanumeric characters, '\\' and many others.
QUOTED (c) lies outside the 'unsigned char' range; code that stores such a
value into a token casts it to 'unsigned char'. */
#define QUOTED(c) (UCHAR_MAX + 1 + (c))
/* Values in the 'unsigned char' range are implicitly unquoted. Among these,
the following are important:
'"' opening or closing double quote
'\'' opening or closing single quote
'$' the unknown result of a dollar expansion
'`' does not occur - replaced with OPENING_BACKQUOTE or
CLOSING_BACKQUOTE
*/
/* Pseudo characters returned by phase2_getc() for a backquote. They lie
above both the 'unsigned char' range and the QUOTED range. */
#define OPENING_BACKQUOTE (2 * (UCHAR_MAX + 1) + '`')
#define CLOSING_BACKQUOTE (3 * (UCHAR_MAX + 1) + '`')
/* 2 characters of pushback are supported.
In this file, 2 characters are pushed back when read_word(), in the middle
of a word, looks ahead past an unquoted '<' or '>' that is not followed by
'(' and then pushes back both the lookahead and the operator character.
NOTE(review): an earlier version of this comment said that 2 characters of
pushback occur only when the first is an 'x'; the only 'x' pushback visible
here (in the $'...' parsing) goes through phase1_ungetc, not phase2_ungetc -
confirm that the old statement is obsolete. */
static int phase2_pushback[2];
static int phase2_pushback_length;
/* Return the next character, with backslashes removed.
The result is QUOTED(c) for some unsigned char c, if the next character
is escaped sufficiently often to make it a regular constituent character,
or simply an 'unsigned char' if it has its special meaning (of special,
whitespace or operator character), or OPENING_BACKQUOTE, CLOSING_BACKQUOTE,
EOF.
It's the caller's responsibility to update the state. */
static int
phase2_getc ()
{
int c;
/* Serve pushed-back characters first. phase2_ungetc() un-counted a pushed
back newline, so count it again here. */
if (phase2_pushback_length)
{
c = phase2_pushback[--phase2_pushback_length];
if (c == '\n')
++line_number;
return c;
}
c = phase1_getc ();
if (c == EOF)
return c;
/* A single-quote is special (returned unquoted) unless it occurs inside
"..." or inside a single-quote construct terminated by another
character. */
if (c == '\'')
return ((open_doublequote
|| (open_singlequote && open_singlequote_terminator != c))
? QUOTED (c)
: c);
if (open_singlequote)
{
/* Inside '...' only the terminator is special. */
if (c == open_singlequote_terminator)
return c;
}
else
{
if (c == '"' || c == '$')
return c;
/* A bare backquote closes the innermost open nesting if there is one,
otherwise it opens a new one. */
if (c == '`')
return (nested_backquotes > 0 ? CLOSING_BACKQUOTE : OPENING_BACKQUOTE);
}
if (c == '\\')
{
/* Number of debackslashification passes that are active at the
current point. */
unsigned int debackslashify =
nested_backquotes + (open_singlequote ? 0 : 1);
/* Normal number of backslashes that yield a single backslash in the
final output. */
unsigned int expected_count =
(unsigned int) 1 << debackslashify;
/* Number of backslashes found. */
unsigned int count;
/* Read further backslashes, up to expected_count in total. When the loop
ends early, c is the first character after the backslashes (it may be
EOF). */
for (count = 1; count < expected_count; count++)
{
c = phase1_getc ();
if (c != '\\')
break;
}
if (count == expected_count)
return '\\';
/* The count of backslashes is > 0 and < expected_count, therefore the
result depends on c, the first character after the backslashes.
Note: The formulas below don't necessarily have a logic; they were
empirically determined such that 1. the xgettext-sh-1 test succeeds,
2. the behaviour for count == 0 would correspond to the one without
any backslash. */
if (c == '\'')
{
if (!open_singlequote && count > (expected_count >> 1))
{
phase1_ungetc (c);
return '\\';
}
else
return ((open_doublequote
|| (open_singlequote
? open_singlequote_terminator != c
: count == (expected_count >> 1)))
? QUOTED (c)
: c);
}
else if (c == '"')
{
/* Each debackslashification pass converts \\ to \ and \" to ";
passes corresponding to `...` drop a lone " whereas passes
corresponding to "`...`" leave it alone. Therefore, the
minimum number of backslashes needed to get one double-quote
in the end is open_doublequotes_mask + 1. */
if (open_singlequote)
{
if (count > open_doublequotes_mask)
{
phase1_ungetc (c);
return '\\';
}
else
return (open_singlequote_terminator != c ? QUOTED (c) : c);
}
else
{
if (count > open_doublequotes_mask)
return QUOTED (c);
else
/* Some of the count values <= open_doublequotes_mask are
actually invalid here, but we assume a syntactically
correct input file anyway. */
return c;
}
}
else if (c == '`')
{
/* FIXME: This code looks fishy. */
if (count == expected_count - 1)
return c;
else
/* Some of the count values < expected_count - 1 are
actually invalid here, but we assume a syntactically
correct input file anyway. */
if (nested_backquotes > 0 && !open_singlequote
&& count >= (expected_count >> 2))
return OPENING_BACKQUOTE;
else
return CLOSING_BACKQUOTE;
}
else if (c == '$')
{
if (open_singlequote)
return QUOTED (c);
if (count >= (expected_count >> 1))
return QUOTED (c);
else
return c;
}
else
{
/* When not followed by a quoting character or backslash or dollar,
a backslash survives a debackslashification pass unmodified.
Therefore each debackslashification pass performs a
count := (count + 1) >> 1
operation. Therefore the minimum number of backslashes needed
to get one backslash in the end is (expected_count >> 1) + 1. */
if (open_doublequote || open_singlequote)
{
if (count > 0)
{
phase1_ungetc (c);
return '\\';
}
else
return QUOTED (c);
}
else
{
if (count > (expected_count >> 1))
{
phase1_ungetc (c);
return '\\';
}
else if (count > 0)
return QUOTED (c);
else
return c;
}
}
}
/* Any other character: inside quotes it is a regular constituent,
outside quotes it keeps its possible special meaning. */
return (open_singlequote || open_doublequote ? QUOTED (c) : c);
}
/* Push back C so that the next phase2_getc returns it.
   Supports 2 characters of pushback; pushing EOF is a no-op.  A pushed
   back newline is un-counted from line_number.  */
static void
phase2_ungetc (int c)
{
  if (c == EOF)
    return;

  if (c == '\n')
    --line_number;

  if (phase2_pushback_length == SIZEOF (phase2_pushback))
    abort ();
  phase2_pushback[phase2_pushback_length++] = c;
}
/* Context lookup table. Set by extract_sh() from its flag_table argument
and consulted by read_command() for each command name. */
static flag_context_list_table_ty *flag_context_list_table;
/* Forward declaration of local functions.
read_command_list is needed by read_word() for nested command
substitutions, before its definition further down. */
static enum word_type read_command_list (int looking_for,
flag_context_ty outer_context);
/* Read the next word.
'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
or '\0'.
The result is stored in *WP. WP->token is allocated only when the resulting
type is t_string; for every other type no token is left allocated.
'context' is passed on to nested command lists and used for messages
recorded for the $"..." syntax. */
static void
read_word (struct word *wp, int looking_for, flag_context_ty context)
{
int c;
bool all_unquoted_digits;
bool all_unquoted_name_characters;
/* Skip leading whitespace and comments. A newline ends the command. */
do
{
c = phase2_getc ();
if (c == '#')
{
/* Skip a comment up to end of line. */
last_comment_line = line_number;
comment_start ();
for (;;)
{
c = phase1_getc ();
if (c == EOF || c == '\n')
break;
/* We skip all leading white space, but not EOLs. */
if (!(buflen == 0 && (c == ' ' || c == '\t')))
comment_add (c);
}
comment_line_end ();
}
if (c == '\n')
{
/* Comments assumed to be grouped with a message must immediately
precede it, with no non-whitespace token on a line between
both. */
if (last_non_comment_line > last_comment_line)
savable_comment_reset ();
wp->type = t_separator;
return;
}
}
while (is_whitespace (c));
if (c == EOF)
{
wp->type = t_eof;
return;
}
if (c == '<' || c == '>')
{
/* Recognize the redirection operators < > >| << <<- >> <> <& >&
But <( and >) are handled below, not here. */
int c2 = phase2_getc ();
if (c2 != '(')
{
if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
{
if (c == '<' && c2 == '<')
{
int c3 = phase2_getc ();
if (c3 != '-')
phase2_ungetc (c3);
}
}
else
phase2_ungetc (c2);
wp->type = t_redirect;
return;
}
else
phase2_ungetc (c2);
}
if (c == CLOSING_BACKQUOTE)
{
if (looking_for == CLOSING_BACKQUOTE)
{
saw_closing_backquote ();
wp->type = t_backquote;
last_non_comment_line = line_number;
return;
}
else if (looking_for == ')')
{
/* The input is invalid syntax, such as `a<(`
Push back the closing backquote and pretend that we have seen a
closing parenthesis. */
phase2_ungetc (c);
wp->type = t_paren;
last_non_comment_line = line_number;
return;
}
else
/* We shouldn't be reading a CLOSING_BACKQUOTE when
looking_for == '\0'. */
abort ();
}
if (looking_for == ')' && c == ')')
{
wp->type = t_paren;
last_non_comment_line = line_number;
return;
}
if (is_operator_start (c))
{
wp->type = (c == ';' ? t_separator : t_other);
return;
}
/* From here on we are reading a real word. Start out assuming that it is
a constant string; the type is downgraded as soon as something
non-constant is seen, and then the characters are no longer collected. */
wp->type = t_string;
wp->token = XMALLOC (struct token);
init_token (wp->token);
wp->line_number_at_start = line_number;
/* True while all characters in the token seen so far are digits. */
all_unquoted_digits = true;
/* True while all characters in the token seen so far form a "name":
all characters are unquoted underscores, digits, or alphabetics from the
portable character set, and the first character is not a digit. Cf.
<https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_235>
*/
all_unquoted_name_characters = true;
for (;; c = phase2_getc ())
{
if (c == EOF)
break;
if (all_unquoted_digits && (c == '<' || c == '>'))
{
/* Recognize the redirection operators < > >| << <<- >> <> <& >&
prefixed with a nonempty sequence of unquoted digits. */
int c2 = phase2_getc ();
if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
{
if (c == '<' && c2 == '<')
{
int c3 = phase2_getc ();
if (c3 != '-')
phase2_ungetc (c3);
}
}
else
phase2_ungetc (c2);
wp->type = t_redirect;
/* The digits collected so far are not needed. */
free_token (wp->token);
free (wp->token);
last_non_comment_line = line_number;
return;
}
all_unquoted_digits = all_unquoted_digits && (c >= '0' && c <= '9');
/* NAME= at the start of a word makes it a variable assignment. */
if (all_unquoted_name_characters && wp->token->charcount > 0 && c == '=')
{
wp->type = t_assignment;
continue;
}
all_unquoted_name_characters =
all_unquoted_name_characters
&& ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_'
|| (wp->token->charcount > 0 && c >= '0' && c <= '9'));
if (c == '$')
{
int c2;
/* An unquoted dollar indicates we are not inside '...'. */
if (open_singlequote)
abort ();
/* After reading a dollar, we know that there is no pushed back
character from an earlier lookahead. */
if (phase2_pushback_length > 0)
abort ();
/* Therefore we can use phase1 without interfering with phase2.
We need to recognize $( outside and inside double-quotes.
It would be incorrect to do
c2 = phase2_getc ();
if (c2 == '(' || c2 == QUOTED ('('))
because that would also trigger for $\(. */
c2 = phase1_getc ();
if (c2 == '(')
{
bool saved_open_doublequote;
int c3;
phase1_ungetc (c2);
/* The entire inner command or arithmetic expression is read
ignoring possible surrounding double-quotes. */
saved_open_doublequote = open_doublequote;
open_doublequote = false;
c2 = phase2_getc ();
if (c2 != '(')
abort ();
c3 = phase2_getc ();
if (c3 == '(')
{
/* Arithmetic expression (Bash syntax). Skip until the
matching closing parenthesis. */
unsigned int depth = 2;
do
{
c = phase2_getc ();
if (c == '(')
depth++;
else if (c == ')')
if (--depth == 0)
break;
}
while (c != EOF);
}
else
{
/* Command substitution (Bash syntax). */
phase2_ungetc (c3);
read_command_list (')', context);
}
open_doublequote = saved_open_doublequote;
}
else
{
phase1_ungetc (c2);
c2 = phase2_getc ();
if (c2 == '\'' && !open_singlequote)
{
/* Bash builtin for string with ANSI-C escape sequences. */
for (;;)
{
/* We have to use phase1 throughout this loop,
because phase2 does debackslashification,
which is undesirable when parsing ANSI-C
escape sequences. */
c = phase1_getc ();
if (c == EOF)
break;
if (c == '\'')
break;
if (c == '\\')
{
c = phase1_getc ();
switch (c)
{
default:
/* Unknown escape: keep the backslash and re-read the
character after it as an ordinary character. */
phase1_ungetc (c);
c = '\\';
break;
case '\\':
break;
case '\'':
break;
case '"':
break;
case 'a':
c = '\a';
break;
case 'b':
c = '\b';
break;
case 'e':
case 'E':
c = 0x1b; /* ESC */
break;
case 'f':
c = '\f';
break;
case 'n':
c = '\n';
break;
case 'r':
c = '\r';
break;
case 't':
c = '\t';
break;
case 'v':
c = '\v';
break;
case 'x':
/* \xH or \xHH: one or two hexadecimal digits. */
c = phase1_getc ();
if ((c >= '0' && c <= '9')
|| (c >= 'A' && c <= 'F')
|| (c >= 'a' && c <= 'f'))
{
int n;
if (c >= '0' && c <= '9')
n = c - '0';
else if (c >= 'A' && c <= 'F')
n = 10 + c - 'A';
else if (c >= 'a' && c <= 'f')
n = 10 + c - 'a';
else
abort ();
c = phase1_getc ();
if ((c >= '0' && c <= '9')
|| (c >= 'A' && c <= 'F')
|| (c >= 'a' && c <= 'f'))
{
if (c >= '0' && c <= '9')
n = n * 16 + c - '0';
else if (c >= 'A' && c <= 'F')
n = n * 16 + 10 + c - 'A';
else if (c >= 'a' && c <= 'f')
n = n * 16 + 10 + c - 'a';
else
abort ();
}
else
phase1_ungetc (c);
c = n;
}
else
{
/* Not a hex escape: keep the backslash and re-read both
the 'x' and the character after it. Note that this
pushes back two characters through phase1_ungetc when
c is not EOF. */
phase1_ungetc (c);
phase1_ungetc ('x');
c = '\\';
}
break;
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7':
{
/* Octal escape with one to three digits. */
int n = c - '0';
c = phase1_getc ();
if (c >= '0' && c <= '7')
{
n = n * 8 + c - '0';
c = phase1_getc ();
if (c >= '0' && c <= '7')
n = n * 8 + c - '0';
else
phase1_ungetc (c);
}
else
phase1_ungetc (c);
c = n;
}
break;
}
}
if (wp->type == t_string)
{
grow_token (wp->token);
wp->token->chars[wp->token->charcount++] =
(unsigned char) c;
}
}
/* The result is a literal string. Don't change wp->type. */
continue;
}
else if (c2 == '"' && !open_doublequote)
{
/* Bash builtin for internationalized string. */
lex_pos_ty pos;
struct token string;
/* The $"..." construct is tracked like a single-quote whose
terminator is '"'. */
saw_opening_singlequote ();
open_singlequote_terminator = '"';
pos.file_name = logical_file_name;
pos.line_number = line_number;
init_token (&string);
for (;;)
{
c = phase2_getc ();
if (c == EOF)
break;
if (c == '"')
{
saw_closing_singlequote ();
break;
}
grow_token (&string);
string.chars[string.charcount++] = (unsigned char) c;
}
/* The contents are recorded as a message right away, independently
of any keyword. */
remember_a_message (mlp, NULL, string_of_token (&string),
false, false, context, &pos,
NULL, savable_comment, false);
free_token (&string);
error_with_progname = false;
error (0, 0, _("%s:%lu: warning: the syntax $\"...\" is deprecated due to security reasons; use eval_gettext instead"),
pos.file_name, (unsigned long) pos.line_number);
error_with_progname = true;
/* The result at runtime is not constant. Therefore we
change wp->type. */
}
else
phase2_ungetc (c2);
}
wp->type = t_other;
continue;
}
if (c == '\'')
{
if (!open_singlequote)
{
/* Handle an opening single quote. */
saw_opening_singlequote ();
}
else
{
/* Handle a closing single quote. */
saw_closing_singlequote ();
}
continue;
}
if (c == '"')
{
if (open_singlequote && open_singlequote_terminator == '"')
{
/* Handle a closing i18n quote. */
saw_closing_singlequote ();
}
else if (!open_doublequote)
{
/* Handle an opening double quote. */
saw_opening_doublequote ();
}
else
{
/* Handle a closing double quote. */
saw_closing_doublequote ();
}
continue;
}
if (c == OPENING_BACKQUOTE)
{
/* Handle an opening backquote. */
saw_opening_backquote ();
read_command_list (CLOSING_BACKQUOTE, context);
wp->type = t_other;
continue;
}
/* A closing backquote ends the word; it is pushed back after the loop. */
if (c == CLOSING_BACKQUOTE)
break;
if (c == '<' || c == '>')
{
int c2;
/* An unquoted c indicates we are not inside '...' nor "...". */
if (open_singlequote || open_doublequote)
abort ();
c2 = phase2_getc ();
if (c2 == '(')
{
/* Process substitution (Bash syntax). */
read_command_list (')', context);
wp->type = t_other;
continue;
}
else
phase2_ungetc (c2);
}
/* Unquoted whitespace or an operator character ends the word. */
if (!open_singlequote && !open_doublequote
&& (is_whitespace (c) || is_operator_start (c)))
break;
/* Collect the character, as long as the word is still a constant
string. A QUOTED value is reduced to its character by the cast. */
if (wp->type == t_string)
{
grow_token (wp->token);
wp->token->chars[wp->token->charcount++] = (unsigned char) c;
}
}
/* Give back the character that terminated the word (no-op for EOF). */
phase2_ungetc (c);
/* Only t_string words keep their token. */
if (wp->type != t_string)
{
free_token (wp->token);
free (wp->token);
}
last_non_comment_line = line_number;
}
/* Read the next command.
'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
or '\0'.
'outer_context' is the flag context of the place where the command occurs;
it is combined with the per-argument contexts of the command's keyword.
Returns the type of the word that terminated the command. */
static enum word_type
read_command (int looking_for, flag_context_ty outer_context)
{
/* Read the words that make up the command.
Here we completely ignore field splitting at whitespace and wildcard
expansions; i.e. we assume that the source is written in such a way that
every word in the program determines exactly one word in the resulting
command.
But we do not require that the 'gettext'/'ngettext' command is the
first in the command; this is because 1. we want to allow for prefixes
like "$verbose" that may expand to nothing, and 2. it's a big effort
to know where a command starts in a $(for ...) or $(case ...) compound
command. */
int arg = 0; /* Current argument number. */
bool arg_of_redirect = false; /* True right after a redirection operator. */
bool must_expand_arg_strings = false; /* True if need to expand escape
sequences in arguments. */
flag_context_list_iterator_ty context_iter;
const struct callshapes *shapes = NULL;
struct arglist_parser *argparser = NULL;
for (;;)
{
struct word inner;
flag_context_ty inner_context;
/* context_iter is only read when arg > 0, i.e. after it has been set in
the function position below. */
if (arg == 0)
inner_context = null_context;
else
inner_context =
inherited_context (outer_context,
flag_context_list_iterator_advance (
&context_iter));
read_word (&inner, looking_for, inner_context);
/* Recognize end of command. */
if (inner.type == t_separator
|| inner.type == t_backquote || inner.type == t_paren
|| inner.type == t_eof)
{
if (argparser != NULL)
arglist_parser_done (argparser, arg);
return inner.type;
}
if (extract_all)
{
if (inner.type == t_string)
{
lex_pos_ty pos;
pos.file_name = logical_file_name;
pos.line_number = inner.line_number_at_start;
remember_a_message (mlp, NULL, string_of_word (&inner), false,
false, inner_context, &pos,
NULL, savable_comment, false);
}
}
if (arg_of_redirect)
{
/* Ignore arguments of redirection operators. */
arg_of_redirect = false;
}
else if (inner.type == t_redirect)
{
/* Ignore this word and the following one. */
arg_of_redirect = true;
}
else
{
bool matters_for_argparser = true;
if (argparser == NULL)
{
/* This is the function position. */
arg = 0;
if (inner.type == t_assignment)
{
/* An assignment just sets an environment variable.
Ignore it. */
/* Don't increment arg in this round. */
matters_for_argparser = false;
}
else if (inner.type == t_string)
{
char *function_name = string_of_word (&inner);
if (strcmp (function_name, "env") == 0)
{
/* The 'env' command just introduces more assignments.
Ignore it. */
/* Don't increment arg in this round. */
matters_for_argparser = false;
}
else
{
/* Look up the command name among the keywords; 'shapes' stays
NULL when it is not a keyword. */
void *keyword_value;
if (hash_find_entry (&keywords,
function_name,
strlen (function_name),
&keyword_value)
== 0)
shapes = (const struct callshapes *) keyword_value;
argparser = arglist_parser_alloc (mlp, shapes);
context_iter =
flag_context_list_iterator (
flag_context_list_table_lookup (
flag_context_list_table,
function_name, strlen (function_name)));
}
free (function_name);
}
else
context_iter = null_context_list_iterator;
}
else
{
/* These are the argument positions. */
if (inner.type == t_string)
{
/* The options -c/--context and -e are recognized only for the
'gettext' and 'ngettext' commands themselves. */
bool accepts_context =
((argparser->keyword_len == 7
&& memcmp (argparser->keyword, "gettext", 7) == 0)
|| (argparser->keyword_len == 8
&& memcmp (argparser->keyword, "ngettext", 8) == 0));
bool accepts_expand =
((argparser->keyword_len == 7
&& memcmp (argparser->keyword, "gettext", 7) == 0)
|| (argparser->keyword_len == 8
&& memcmp (argparser->keyword, "ngettext", 8) == 0));
if (accepts_context && argparser->next_is_msgctxt)
{
/* This word is the argument of a preceding -c / --context. */
char *s = string_of_word (&inner);
mixed_string_ty *ms =
mixed_string_alloc_simple (s, lc_string,
logical_file_name,
inner.line_number_at_start);
free (s);
argparser->next_is_msgctxt = false;
arglist_parser_remember_msgctxt (argparser, ms,
inner_context,
logical_file_name,
inner.line_number_at_start);
matters_for_argparser = false;
}
else if (accepts_context
&& ((inner.token->charcount == 2
&& memcmp (inner.token->chars, "-c", 2) == 0)
|| (inner.token->charcount == 9
&& memcmp (inner.token->chars, "--context", 9) == 0)))
{
argparser->next_is_msgctxt = true;
matters_for_argparser = false;
}
else if (accepts_context
&& (inner.token->charcount >= 10
&& memcmp (inner.token->chars, "--context=", 10) == 0))
{
/* --context=VALUE: the context follows the 10-character prefix. */
char *s = substring_of_word (&inner, 10);
mixed_string_ty *ms =
mixed_string_alloc_simple (s, lc_string,
logical_file_name,
inner.line_number_at_start);
free (s);
argparser->next_is_msgctxt = false;
arglist_parser_remember_msgctxt (argparser, ms,
inner_context,
logical_file_name,
inner.line_number_at_start);
matters_for_argparser = false;
}
else if (accepts_expand
&& inner.token->charcount == 2
&& memcmp (inner.token->chars, "-e", 2) == 0)
{
must_expand_arg_strings = true;
matters_for_argparser = false;
}
else
{
char *s = string_of_word (&inner);
mixed_string_ty *ms;
/* When '-e' was specified, expand escape sequences in s. */
if (accepts_expand && must_expand_arg_strings)
{
bool expands_backslash_c =
(argparser->keyword_len == 7
&& memcmp (argparser->keyword, "gettext", 7) == 0);
bool backslash_c = false;
char *expanded =
(char *)
expand_escapes (s, expands_backslash_c ? &backslash_c : NULL);
/* We can ignore the value of expands_backslash_c, because
here we don't support the gettext '-s' option. */
/* expand_escapes may return its argument unchanged; free s only
when a different string was returned. */
if (expanded != s)
free (s);
s = expanded;
}
ms = mixed_string_alloc_simple (s, lc_string,
logical_file_name,
inner.line_number_at_start);
free (s);
arglist_parser_remember (argparser, arg, ms,
inner_context,
logical_file_name,
inner.line_number_at_start,
savable_comment, false);
}
}
if (matters_for_argparser)
if (arglist_parser_decidedp (argparser, arg))
{
/* Stop looking for arguments of the last function_name. */
/* FIXME: What about context_iter? */
arglist_parser_done (argparser, arg);
shapes = NULL;
argparser = NULL;
}
}
if (matters_for_argparser)
arg++;
}
/* Releases the token of a t_string word; no-op for other types. */
free_word (&inner);
}
}
/* Read a list of commands.
   'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
   or '\0'.
   Commands are read one after the other for as long as they end in a
   separator.
   Returns the type of the word that terminated the command list.  */
static enum word_type
read_command_list (int looking_for, flag_context_ty outer_context)
{
  enum word_type terminator;

  do
    terminator = read_command (looking_for, outer_context);
  while (terminator == t_separator);

  return terminator;
}
/* Scan the shell script read from F and add the messages found to the
   first message list of MDLP.
   REAL_FILENAME is used in read-error messages, LOGICAL_FILENAME in the
   recorded source positions; FLAG_TABLE supplies the per-keyword flag
   contexts.  */
void
extract_sh (FILE *f,
            const char *real_filename, const char *logical_filename,
            flag_context_list_table_ty *flag_table,
            msgdomain_list_ty *mdlp)
{
  /* Input, output and lookup tables.  */
  fp = f;
  mlp = mdlp->item[0]->messages;
  flag_context_list_table = flag_table;

  /* Position tracking.  */
  real_file_name = real_filename;
  logical_file_name = xstrdup (logical_filename);
  line_number = 1;
  last_comment_line = -1;
  last_non_comment_line = -1;

  /* Reader state.  */
  phase1_pushback_length = 0;
  phase2_pushback_length = 0;
  nested_backquotes = 0;
  open_doublequotes_mask = 0;
  open_doublequote = false;
  open_singlequote = false;

  init_keywords ();

  /* Eat tokens until eof is seen.  */
  read_command_list ('\0', null_context);

  /* Leave no reference to the finished file behind.  */
  fp = NULL;
  real_file_name = NULL;
  logical_file_name = NULL;
  line_number = 0;
}