| /* xgettext Vala backend. |
| Copyright (C) 2013-2014, 2018-2020 Free Software Foundation, Inc. |
| |
| This file was written by Daiki Ueno <ueno@gnu.org>, 2013. |
| |
| This program is free software: you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 3 of the License, or |
| (at your option) any later version. |
| |
| This program is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
| |
| #ifdef HAVE_CONFIG_H |
| # include "config.h" |
| #endif |
| |
| /* Specification. */ |
| #include "x-vala.h" |
| |
| #include <assert.h> |
| #include <errno.h> |
| #include <stdbool.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include "attribute.h" |
| #include "message.h" |
| #include "rc-str-list.h" |
| #include "xgettext.h" |
| #include "xg-pos.h" |
| #include "xg-encoding.h" |
| #include "xg-mixed-string.h" |
| #include "xg-arglist-context.h" |
| #include "xg-arglist-callshape.h" |
| #include "xg-arglist-parser.h" |
| #include "xg-message.h" |
| #include "error.h" |
| #include "error-progname.h" |
| #include "xalloc.h" |
| #include "xvasprintf.h" |
| #include "mem-hash-map.h" |
| #include "po-charset.h" |
| #include "gettext.h" |
| |
| #define _(s) gettext(s) |
| |
| #define SIZEOF(a) (sizeof(a) / sizeof(a[0])) |
| |
| /* The Vala syntax is defined in the Vala Reference Manual |
| https://www.vala-project.org/doc/vala/. |
| See also vala/valascanner.vala. */ |
| |
| /* ====================== Keyword set customization. ====================== */ |
| |
| /* If true extract all strings. */ |
| static bool extract_all = false; |
| |
| static hash_table keywords; |
| static bool default_keywords = true; |
| |
| |
| void |
| x_vala_extract_all () |
| { |
| extract_all = true; |
| } |
| |
| |
| static void |
| add_keyword (const char *name, hash_table *keywords) |
| { |
| if (name == NULL) |
| default_keywords = false; |
| else |
| { |
| const char *end; |
| struct callshape shape; |
| const char *colon; |
| |
| if (keywords->table == NULL) |
| hash_init (keywords, 100); |
| |
| split_keywordspec (name, &end, &shape); |
| |
| /* The characters between name and end should form a valid C identifier. |
| A colon means an invalid parse in split_keywordspec(). */ |
| colon = strchr (name, ':'); |
| if (colon == NULL || colon >= end) |
| insert_keyword_callshape (keywords, name, end - name, &shape); |
| } |
| } |
| |
| void |
| x_vala_keyword (const char *name) |
| { |
| add_keyword (name, &keywords); |
| } |
| |
| static void |
| init_keywords () |
| { |
| if (default_keywords) |
| { |
| /* When adding new keywords here, also update the documentation in |
| xgettext.texi! */ |
| x_vala_keyword ("dgettext:2"); |
| x_vala_keyword ("dcgettext:2"); |
| x_vala_keyword ("ngettext:1,2"); |
| x_vala_keyword ("dngettext:2,3"); |
| x_vala_keyword ("dpgettext:2g"); |
| x_vala_keyword ("dpgettext2:2c,3"); |
| x_vala_keyword ("_"); |
| x_vala_keyword ("Q_"); |
| x_vala_keyword ("N_"); |
| x_vala_keyword ("NC_:1c,2"); |
| |
| default_keywords = false; |
| } |
| } |
| |
| void |
| init_flag_table_vala () |
| { |
| xgettext_record_flag ("dgettext:2:pass-c-format"); |
| xgettext_record_flag ("dcgettext:2:pass-c-format"); |
| xgettext_record_flag ("ngettext:1:pass-c-format"); |
| xgettext_record_flag ("ngettext:2:pass-c-format"); |
| xgettext_record_flag ("dngettext:2:pass-c-format"); |
| xgettext_record_flag ("dngettext:3:pass-c-format"); |
| xgettext_record_flag ("dpgettext:2:pass-c-format"); |
| xgettext_record_flag ("dpgettext2:3:pass-c-format"); |
| xgettext_record_flag ("_:1:pass-c-format"); |
| xgettext_record_flag ("Q_:1:pass-c-format"); |
| xgettext_record_flag ("N_:1:pass-c-format"); |
| xgettext_record_flag ("NC_:2:pass-c-format"); |
| |
| /* Vala leaves string formatting to Glib functions and thus the |
| format string is exactly same as C. See also |
| vapi/glib-2.0.vapi. */ |
| xgettext_record_flag ("printf:1:c-format"); |
| xgettext_record_flag ("vprintf:1:c-format"); |
| } |
| |
| |
| /* ======================== Reading of characters. ======================== */ |
| |
| /* The input file stream. */ |
| static FILE *fp; |
| |
| |
| /* 1. line_number handling. */ |
| |
| #define MAX_PHASE1_PUSHBACK 16 |
| static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK]; |
| static int phase1_pushback_length; |
| |
| |
| static int |
| phase1_getc () |
| { |
| int c; |
| |
| if (phase1_pushback_length) |
| c = phase1_pushback[--phase1_pushback_length]; |
| else |
| { |
| c = getc (fp); |
| if (c == EOF) |
| { |
| if (ferror (fp)) |
| error (EXIT_FAILURE, errno, _("error while reading \"%s\""), |
| real_file_name); |
| return EOF; |
| } |
| } |
| |
| if (c == '\n') |
| ++line_number; |
| return c; |
| } |
| |
| |
| /* Supports 2 characters of pushback. */ |
| static void |
| phase1_ungetc (int c) |
| { |
| if (c != EOF) |
| { |
| if (c == '\n') |
| --line_number; |
| |
| if (phase1_pushback_length == SIZEOF (phase1_pushback)) |
| abort (); |
| phase1_pushback[phase1_pushback_length++] = c; |
| } |
| } |
| |
| |
| /* These are for tracking whether comments count as immediately before |
| keyword. */ |
| static int last_comment_line; |
| static int last_non_comment_line; |
| |
| /* Accumulating comments. */ |
| |
| static char *buffer; |
| static size_t bufmax; |
| static size_t buflen; |
| |
| static inline void |
| comment_start () |
| { |
| buflen = 0; |
| } |
| |
| static inline void |
| comment_add (int c) |
| { |
| if (buflen >= bufmax) |
| { |
| bufmax = 2 * bufmax + 10; |
| buffer = xrealloc (buffer, bufmax); |
| } |
| buffer[buflen++] = c; |
| } |
| |
| static inline void |
| comment_line_end (size_t chars_to_remove) |
| { |
| buflen -= chars_to_remove; |
| while (buflen >= 1 |
| && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) |
| --buflen; |
| if (chars_to_remove == 0 && buflen >= bufmax) |
| { |
| bufmax = 2 * bufmax + 10; |
| buffer = xrealloc (buffer, bufmax); |
| } |
| buffer[buflen] = '\0'; |
| savable_comment_add (buffer); |
| } |
| |
| |
| /* 2. Replace each comment that is not inside a character constant or |
| string literal with a space character. */ |
| |
| static int |
| phase2_getc () |
| { |
| int c; |
| bool last_was_star; |
| |
| c = phase1_getc (); |
| if (c != '/') |
| return c; |
| c = phase1_getc (); |
| switch (c) |
| { |
| default: |
| phase1_ungetc (c); |
| return '/'; |
| |
| case '*': |
| /* C comment. */ |
| comment_start (); |
| last_was_star = false; |
| for (;;) |
| { |
| c = phase1_getc (); |
| if (c == EOF) |
| break; |
| /* We skip all leading white space, but not EOLs. */ |
| if (!(buflen == 0 && (c == ' ' || c == '\t'))) |
| comment_add (c); |
| switch (c) |
| { |
| case '\n': |
| comment_line_end (1); |
| comment_start (); |
| last_was_star = false; |
| continue; |
| |
| case '*': |
| last_was_star = true; |
| continue; |
| |
| case '/': |
| if (last_was_star) |
| { |
| comment_line_end (2); |
| break; |
| } |
| FALLTHROUGH; |
| |
| default: |
| last_was_star = false; |
| continue; |
| } |
| break; |
| } |
| last_comment_line = line_number; |
| return ' '; |
| |
| case '/': |
| /* C++ or ISO C 99 comment. */ |
| comment_start (); |
| for (;;) |
| { |
| c = phase1_getc (); |
| if (c == '\n' || c == EOF) |
| break; |
| /* We skip all leading white space, but not EOLs. */ |
| if (!(buflen == 0 && (c == ' ' || c == '\t'))) |
| comment_add (c); |
| } |
| comment_line_end (0); |
| last_comment_line = line_number; |
| return '\n'; |
| } |
| } |
| |
| |
| static void |
| phase2_ungetc (int c) |
| { |
| phase1_ungetc (c); |
| } |
| |
| |
| /* ========================== Reading of tokens. ========================== */ |
| |
| enum token_type_ty |
| { |
| token_type_character_constant, /* 'x' */ |
| token_type_eof, |
| token_type_lparen, /* ( */ |
| token_type_rparen, /* ) */ |
| token_type_lbrace, /* { */ |
| token_type_rbrace, /* } */ |
| token_type_assign, /* = += -= *= /= %= <<= >>= &= |= ^= */ |
| token_type_return, /* return */ |
| token_type_plus, /* + */ |
| token_type_arithmetic_operator, /* - * / % << >> & | ^ */ |
| token_type_equality_test_operator, /* == < > >= <= != */ |
| token_type_logic_operator, /* ! && || */ |
| token_type_comma, /* , */ |
| token_type_question, /* ? */ |
| token_type_colon, /* : */ |
| token_type_number, /* 2.7 */ |
| token_type_string_literal, /* "abc" */ |
| token_type_string_template, /* @"abc" */ |
| token_type_regex_literal, /* /.../ */ |
| token_type_symbol, /* if else etc. */ |
| token_type_other |
| }; |
| typedef enum token_type_ty token_type_ty; |
| |
| typedef struct token_ty token_ty; |
| struct token_ty |
| { |
| token_type_ty type; |
| char *string; /* for token_type_symbol */ |
| mixed_string_ty *mixed_string; /* for token_type_string_literal */ |
| refcounted_string_list_ty *comment; /* for token_type_string_literal */ |
| int line_number; |
| }; |
| |
| /* Free the memory pointed to by a 'struct token_ty'. */ |
| static inline void |
| free_token (token_ty *tp) |
| { |
| if (tp->type == token_type_symbol) |
| free (tp->string); |
| if (tp->type == token_type_string_literal) |
| { |
| mixed_string_free (tp->mixed_string); |
| drop_reference (tp->comment); |
| } |
| } |
| |
| |
| /* Return value of phase7_getc when EOF is reached. */ |
| #define P7_EOF (-1) |
| #define P7_STRING_END (-2) |
| |
| /* Replace escape sequences within character strings with their single |
| character equivalents. */ |
| #define P7_QUOTES (-3) |
| #define P7_QUOTE (-4) |
| #define P7_NEWLINE (-5) |
| |
| /* Convert an UTF-16 or UTF-32 code point to a return value that can be |
| distinguished from a single-byte return value. */ |
| #define UNICODE(code) (0x100 + (code)) |
| |
| /* Test a return value of phase7_getuc whether it designates an UTF-16 or |
| UTF-32 code point. */ |
| #define IS_UNICODE(p7_result) ((p7_result) >= 0x100) |
| |
| /* Extract the UTF-16 or UTF-32 code of a return value that satisfies |
| IS_UNICODE. */ |
| #define UNICODE_VALUE(p7_result) ((p7_result) - 0x100) |
| |
| |
| static int |
| phase7_getc () |
| { |
| int c, n, j; |
| |
| /* Use phase 1, because phase 2 elides comments. */ |
| c = phase1_getc (); |
| |
| /* Return a magic newline indicator, so that we can distinguish |
| between the user requesting a newline in the string (e.g. using |
| "\n" or "\012") from the user failing to terminate the string or |
| character constant. The ANSI C standard says: 3.1.3.4 Character |
| Constants contain "any character except single quote, backslash or |
| newline; or an escape sequence" and 3.1.4 String Literals contain |
| "any character except double quote, backslash or newline; or an |
| escape sequence". |
| |
| Most compilers give a fatal error in this case, however gcc is |
| stupidly silent, even though this is a very common typo. OK, so |
| "gcc --pedantic" will tell me, but that gripes about too much other |
| stuff. Could I have a "gcc -Wnewline-in-string" option, or |
| better yet a "gcc -fno-newline-in-string" option, please? Gcc is |
| also inconsistent between string literals and character constants: |
| you may not embed newlines in character constants; try it, you get |
| a useful diagnostic. --PMiller */ |
| if (c == '\n') |
| return P7_NEWLINE; |
| |
| if (c == '"') |
| return P7_QUOTES; |
| if (c == '\'') |
| return P7_QUOTE; |
| if (c != '\\') |
| return c; |
| c = phase1_getc (); |
| switch (c) |
| { |
| default: |
| /* Unknown escape sequences really should be an error, but just |
| ignore them, and let the real compiler complain. */ |
| phase1_ungetc (c); |
| return '\\'; |
| |
| case '"': |
| case '\'': |
| case '\\': |
| case '$': |
| return c; |
| |
| case 'b': |
| return '\b'; |
| |
| case 'f': |
| return '\f'; |
| case 'n': |
| return '\n'; |
| case 'r': |
| return '\r'; |
| case 't': |
| return '\t'; |
| case 'v': |
| return '\v'; |
| |
| case 'x': |
| c = phase1_getc (); |
| switch (c) |
| { |
| default: |
| phase1_ungetc (c); |
| phase1_ungetc ('x'); |
| return '\\'; |
| |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': |
| case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': |
| break; |
| } |
| n = 0; |
| for (;;) |
| { |
| switch (c) |
| { |
| default: |
| phase1_ungetc (c); |
| return n; |
| |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| n = n * 16 + c - '0'; |
| break; |
| |
| case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': |
| n = n * 16 + 10 + c - 'A'; |
| break; |
| |
| case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': |
| n = n * 16 + 10 + c - 'a'; |
| break; |
| } |
| c = phase1_getc (); |
| } |
| return n; |
| |
| case '0': |
| n = 0; |
| for (j = 0; j < 3; ++j) |
| { |
| n = n * 8 + c - '0'; |
| c = phase1_getc (); |
| switch (c) |
| { |
| default: |
| break; |
| |
| case '0': case '1': case '2': case '3': |
| case '4': case '5': case '6': case '7': |
| continue; |
| } |
| break; |
| } |
| phase1_ungetc (c); |
| return n; |
| |
| case 'u': |
| { |
| unsigned char buf[8]; |
| |
| n = 0; |
| for (j = 0; j < 4; j++) |
| { |
| int c1 = phase1_getc (); |
| |
| if (c1 >= '0' && c1 <= '9') |
| n = (n << 4) + (c1 - '0'); |
| else if (c1 >= 'A' && c1 <= 'F') |
| n = (n << 4) + (c1 - 'A' + 10); |
| else if (c1 >= 'a' && c1 <= 'f') |
| n = (n << 4) + (c1 - 'a' + 10); |
| else |
| { |
| phase1_ungetc (c1); |
| while (--j >= 0) |
| phase1_ungetc (buf[j]); |
| phase1_ungetc (c); |
| return '\\'; |
| } |
| |
| buf[j] = c1; |
| } |
| |
| if (n < 0x110000) |
| return UNICODE (n); |
| |
| error_with_progname = false; |
| error (0, 0, _("%s:%d: warning: invalid Unicode character"), |
| logical_file_name, line_number); |
| error_with_progname = true; |
| |
| while (--j >= 0) |
| phase1_ungetc (buf[j]); |
| phase1_ungetc (c); |
| return '\\'; |
| } |
| } |
| } |
| |
| |
| static void |
| phase7_ungetc (int c) |
| { |
| phase1_ungetc (c); |
| } |
| |
| |
| /* 3. Parse each resulting logical line as preprocessing tokens and |
| white space. Preprocessing tokens and Vala tokens don't always |
| match. */ |
| |
| static token_ty phase3_pushback[2]; |
| static int phase3_pushback_length; |
| |
| |
| static token_type_ty last_token_type; |
| |
| static void |
| phase3_scan_regex () |
| { |
| int c; |
| |
| for (;;) |
| { |
| c = phase1_getc (); |
| if (c == '/') |
| break; |
| if (c == '\\') |
| { |
| c = phase1_getc (); |
| if (c != EOF) |
| continue; |
| } |
| if (c == EOF) |
| { |
| error_with_progname = false; |
| error (0, 0, |
| _("%s:%d: warning: regular expression literal terminated too early"), |
| logical_file_name, line_number); |
| error_with_progname = true; |
| return; |
| } |
| } |
| |
| c = phase2_getc (); |
| if (!(c == 'i' || c == 's' || c == 'm' || c == 'x')) |
| phase2_ungetc (c); |
| } |
| |
| static void |
| phase3_get (token_ty *tp) |
| { |
| static char *buffer; |
| static int bufmax; |
| int bufpos; |
| |
| #undef APPEND |
| #define APPEND(c) \ |
| do \ |
| { \ |
| if (bufpos >= bufmax) \ |
| { \ |
| bufmax = 2 * bufmax + 10; \ |
| buffer = xrealloc (buffer, bufmax); \ |
| } \ |
| buffer[bufpos++] = c; \ |
| } \ |
| while (0) |
| |
| if (phase3_pushback_length) |
| { |
| *tp = phase3_pushback[--phase3_pushback_length]; |
| last_token_type = tp->type; |
| return; |
| } |
| |
| for (;;) |
| { |
| bool template; |
| bool verbatim; |
| int c; |
| |
| tp->line_number = line_number; |
| c = phase2_getc (); |
| |
| switch (c) |
| { |
| case EOF: |
| tp->type = last_token_type = token_type_eof; |
| return; |
| |
| case '\n': |
| if (last_non_comment_line > last_comment_line) |
| savable_comment_reset (); |
| FALLTHROUGH; |
| case ' ': |
| case '\f': |
| case '\t': |
| /* Ignore whitespace and comments. */ |
| continue; |
| default: |
| break; |
| } |
| |
| last_non_comment_line = tp->line_number; |
| template = false; |
| verbatim = false; |
| |
| switch (c) |
| { |
| case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': |
| case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': |
| case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': |
| case 'V': case 'W': case 'X': case 'Y': case 'Z': |
| case '_': |
| case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': |
| case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': |
| case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': |
| case 'v': case 'w': case 'x': case 'y': case 'z': |
| bufpos = 0; |
| for (;;) |
| { |
| APPEND (c); |
| c = phase2_getc (); |
| switch (c) |
| { |
| case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': |
| case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': |
| case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': |
| case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': |
| case 'Y': case 'Z': |
| case '_': |
| case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': |
| case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': |
| case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': |
| case 's': case 't': case 'u': case 'v': case 'w': case 'x': |
| case 'y': case 'z': |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| continue; |
| |
| default: |
| phase2_ungetc (c); |
| break; |
| } |
| break; |
| } |
| APPEND (0); |
| if (strcmp (buffer, "return") == 0) |
| tp->type = last_token_type = token_type_return; |
| else |
| { |
| tp->string = xstrdup (buffer); |
| tp->type = last_token_type = token_type_symbol; |
| } |
| return; |
| |
| case '.': |
| c = phase2_getc (); |
| phase2_ungetc (c); |
| switch (c) |
| { |
| default: |
| tp->string = xstrdup ("."); |
| tp->type = last_token_type = token_type_symbol; |
| return; |
| |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| c = '.'; |
| break; |
| } |
| FALLTHROUGH; |
| |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| /* The preprocessing number token is more "generous" than the C |
| number tokens. This is mostly due to token pasting (another |
| thing we can ignore here). */ |
| bufpos = 0; |
| for (;;) |
| { |
| APPEND (c); |
| c = phase2_getc (); |
| switch (c) |
| { |
| case 'e': |
| case 'E': |
| APPEND (c); |
| c = phase2_getc (); |
| if (c != '+' && c != '-') |
| { |
| phase2_ungetc (c); |
| break; |
| } |
| continue; |
| |
| case 'A': case 'B': case 'C': case 'D': case 'F': |
| case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': |
| case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': |
| case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': |
| case 'Y': case 'Z': |
| case 'a': case 'b': case 'c': case 'd': case 'f': |
| case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': |
| case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': |
| case 's': case 't': case 'u': case 'v': case 'w': case 'x': |
| case 'y': case 'z': |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| case '.': |
| continue; |
| |
| default: |
| phase2_ungetc (c); |
| break; |
| } |
| break; |
| } |
| APPEND (0); |
| tp->type = last_token_type = token_type_number; |
| return; |
| |
| case '\'': |
| for (;;) |
| { |
| c = phase7_getc (); |
| if (c == P7_NEWLINE) |
| { |
| error_with_progname = false; |
| error (0, 0, _("%s:%d: warning: unterminated character constant"), |
| logical_file_name, line_number - 1); |
| error_with_progname = true; |
| phase7_ungetc ('\n'); |
| break; |
| } |
| if (c == EOF || c == P7_QUOTE) |
| break; |
| } |
| tp->type = last_token_type = token_type_character_constant; |
| return; |
| |
| /* Vala provides strings in three different formats. |
| |
| Usual string literals: |
| "..." |
| Verbatim string literals: |
| """...""" (where ... can include newlines and double quotes) |
| String templates. |
| @"...", @"""...""" |
| |
| Note that, with the current implementation string |
| templates are not subject to translation, because they are |
| inspected at compile time. For example, the following code |
| |
| string bar = "bar"; |
| string foo = _(@"foo $bar"); |
| |
| will be translated into the C code, like: |
| |
| _(g_strconcat ("foo ", "bar", NULL)); */ |
| case '@': |
| c = phase2_getc (); |
| if (c != '"') |
| { |
| phase2_ungetc (c); |
| tp->type = last_token_type = token_type_other; |
| return; |
| } |
| template = true; |
| FALLTHROUGH; |
| case '"': |
| { |
| struct mixed_string_buffer msb; |
| int c2 = phase1_getc (); |
| |
| if (c2 == '"') |
| { |
| int c3 = phase1_getc (); |
| if (c3 == '"') |
| verbatim = true; |
| else |
| { |
| phase1_ungetc (c3); |
| phase1_ungetc (c2); |
| } |
| } |
| else |
| phase2_ungetc (c2); |
| |
| /* Start accumulating the string. */ |
| mixed_string_buffer_init (&msb, lc_string, |
| logical_file_name, line_number); |
| if (verbatim) |
| for (;;) |
| { |
| c = phase1_getc (); |
| |
| /* Keep line_number in sync. */ |
| msb.line_number = line_number; |
| |
| if (c == '"') |
| { |
| int c2 = phase1_getc (); |
| if (c2 == '"') |
| { |
| int c3 = phase1_getc (); |
| if (c3 == '"') |
| break; |
| phase1_ungetc (c3); |
| } |
| phase1_ungetc (c2); |
| } |
| if (c == EOF) |
| break; |
| mixed_string_buffer_append_char (&msb, c); |
| } |
| else |
| for (;;) |
| { |
| c = phase7_getc (); |
| |
| /* Keep line_number in sync. */ |
| msb.line_number = line_number; |
| |
| if (c == P7_NEWLINE) |
| { |
| error_with_progname = false; |
| error (0, 0, |
| _("%s:%d: warning: unterminated string literal"), |
| logical_file_name, line_number - 1); |
| error_with_progname = true; |
| phase7_ungetc ('\n'); |
| break; |
| } |
| if (c == P7_QUOTES) |
| break; |
| if (c == EOF) |
| break; |
| if (c == P7_QUOTE) |
| c = '\''; |
| if (IS_UNICODE (c)) |
| { |
| assert (UNICODE_VALUE (c) >= 0 |
| && UNICODE_VALUE (c) < 0x110000); |
| mixed_string_buffer_append_unicode (&msb, |
| UNICODE_VALUE (c)); |
| } |
| else |
| mixed_string_buffer_append_char (&msb, c); |
| } |
| /* Done accumulating the string. */ |
| if (template) |
| { |
| tp->type = token_type_string_template; |
| mixed_string_buffer_destroy (&msb); |
| } |
| else |
| { |
| tp->type = token_type_string_literal; |
| tp->mixed_string = mixed_string_buffer_result (&msb); |
| tp->comment = add_reference (savable_comment); |
| } |
| last_token_type = tp->type; |
| return; |
| } |
| |
| case '/': |
| switch (last_token_type) |
| { |
| case token_type_lparen: |
| case token_type_lbrace: |
| case token_type_assign: |
| case token_type_return: |
| case token_type_plus: |
| case token_type_arithmetic_operator: |
| case token_type_equality_test_operator: |
| case token_type_logic_operator: |
| case token_type_comma: |
| case token_type_question: |
| case token_type_colon: |
| phase3_scan_regex (); |
| tp->type = last_token_type = token_type_regex_literal; |
| break; |
| default: |
| { |
| int c2 = phase2_getc (); |
| if (c2 == '=') |
| tp->type = last_token_type = token_type_assign; |
| else |
| { |
| phase2_ungetc (c2); |
| tp->type = last_token_type = token_type_arithmetic_operator; |
| } |
| break; |
| } |
| } |
| return; |
| |
| case '(': |
| tp->type = last_token_type = token_type_lparen; |
| return; |
| |
| case ')': |
| tp->type = last_token_type = token_type_rparen; |
| return; |
| |
| case '{': |
| tp->type = last_token_type = token_type_lbrace; |
| return; |
| |
| case '}': |
| tp->type = last_token_type = token_type_rbrace; |
| return; |
| |
| case '+': |
| { |
| int c2 = phase2_getc (); |
| switch (c2) |
| { |
| case '+': |
| tp->type = last_token_type = token_type_other; |
| break; |
| case '=': |
| tp->type = last_token_type = token_type_assign; |
| break; |
| default: |
| phase2_ungetc (c2); |
| tp->type = last_token_type = token_type_plus; |
| break; |
| } |
| return; |
| } |
| |
| case '-': |
| { |
| int c2 = phase2_getc (); |
| switch (c2) |
| { |
| case '-': |
| tp->type = last_token_type = token_type_other; |
| break; |
| case '=': |
| tp->type = last_token_type = token_type_assign; |
| break; |
| default: |
| phase2_ungetc (c2); |
| tp->type = last_token_type = token_type_arithmetic_operator; |
| break; |
| } |
| return; |
| } |
| |
| case '%': |
| case '^': |
| { |
| int c2 = phase2_getc (); |
| if (c2 == '=') |
| tp->type = last_token_type = token_type_assign; |
| else |
| { |
| phase2_ungetc (c2); |
| tp->type = last_token_type = token_type_logic_operator; |
| } |
| return; |
| } |
| |
| case '=': |
| { |
| int c2 = phase2_getc (); |
| switch (c2) |
| { |
| case '=': |
| tp->type = last_token_type = token_type_equality_test_operator; |
| break; |
| case '>': |
| tp->type = last_token_type = token_type_other; |
| break; |
| default: |
| phase2_ungetc (c2); |
| tp->type = last_token_type = token_type_assign; |
| break; |
| } |
| return; |
| } |
| |
| case '!': |
| { |
| int c2 = phase2_getc (); |
| if (c2 == '=') |
| tp->type = last_token_type = token_type_equality_test_operator; |
| else |
| { |
| phase2_ungetc (c2); |
| tp->type = last_token_type = token_type_logic_operator; |
| } |
| return; |
| } |
| |
| case '>': |
| case '<': |
| { |
| int c2 = phase2_getc (); |
| if (c2 == '=') |
| tp->type = last_token_type = token_type_equality_test_operator; |
| else if (c2 == c) |
| { |
| int c3 = phase2_getc (); |
| if (c3 == '=') |
| tp->type = last_token_type = token_type_assign; |
| else |
| { |
| phase2_ungetc (c2); |
| phase2_ungetc (c3); |
| tp->type = last_token_type = token_type_other; |
| } |
| } |
| else |
| { |
| phase2_ungetc (c2); |
| tp->type = last_token_type = token_type_equality_test_operator; |
| } |
| return; |
| } |
| |
| case ',': |
| tp->type = last_token_type = token_type_comma; |
| return; |
| |
| case ':': |
| tp->type = last_token_type = token_type_colon; |
| return; |
| |
| case '&': |
| case '|': |
| { |
| int c2 = phase2_getc (); |
| if (c2 == c) |
| tp->type = last_token_type = token_type_logic_operator; |
| else if (c2 == '=') |
| tp->type = last_token_type = token_type_assign; |
| else |
| { |
| phase2_ungetc (c2); |
| tp->type = last_token_type = token_type_arithmetic_operator; |
| } |
| return; |
| } |
| |
| case '?': |
| { |
| int c2 = phase2_getc (); |
| if (c2 == '?') |
| tp->type = last_token_type = token_type_logic_operator; |
| else |
| { |
| phase2_ungetc (c2); |
| tp->type = last_token_type = token_type_question; |
| } |
| return; |
| } |
| |
| default: |
| tp->type = last_token_type = token_type_other; |
| return; |
| } |
| } |
| #undef APPEND |
| } |
| |
| static void |
| phase3_unget (token_ty *tp) |
| { |
| if (tp->type != token_type_eof) |
| { |
| if (phase3_pushback_length == SIZEOF (phase3_pushback)) |
| abort (); |
| phase3_pushback[phase3_pushback_length++] = *tp; |
| } |
| } |
| |
| |
| /* String concatenation with '+'. */ |
| |
| static void |
| x_vala_lex (token_ty *tp) |
| { |
| phase3_get (tp); |
| if (tp->type == token_type_string_literal) |
| { |
| mixed_string_ty *sum = tp->mixed_string; |
| |
| for (;;) |
| { |
| token_ty token2; |
| |
| phase3_get (&token2); |
| if (token2.type == token_type_plus) |
| { |
| token_ty token3; |
| |
| phase3_get (&token3); |
| if (token3.type == token_type_string_literal) |
| { |
| sum = mixed_string_concat_free1 (sum, token3.mixed_string); |
| |
| free_token (&token3); |
| free_token (&token2); |
| continue; |
| } |
| phase3_unget (&token3); |
| } |
| phase3_unget (&token2); |
| break; |
| } |
| tp->mixed_string = sum; |
| } |
| } |
| |
| |
| /* ========================= Extracting strings. ========================== */ |
| |
| |
| /* Context lookup table. */ |
| static flag_context_list_table_ty *flag_context_list_table; |
| |
| |
| /* The file is broken into tokens. Scan the token stream, looking for |
| a keyword, followed by a left paren, followed by a string. When we |
| see this sequence, we have something to remember. We assume we are |
| looking at a valid Vala program, and leave the complaints about the |
| grammar to the compiler. |
| |
| Normal handling: Look for |
| keyword ( ... msgid ... ) |
| keyword msgid |
| Plural handling: Look for |
| keyword ( ... msgid ... msgid_plural ... ) |
| |
| We use recursion because the arguments before msgid or between msgid |
| and msgid_plural can contain subexpressions of the same form. */ |
| |
| /* Extract messages until the next balanced closing parenthesis or bracket. |
| Extracted messages are added to MLP. |
| DELIM can be either token_type_rparen or token_type_rbracket, or |
| token_type_eof to accept both. |
| Return true upon eof, false upon closing parenthesis or bracket. */ |
| static bool |
| extract_balanced (message_list_ty *mlp, token_type_ty delim, |
| flag_context_ty outer_context, |
| flag_context_list_iterator_ty context_iter, |
| struct arglist_parser *argparser) |
| { |
| /* Current argument number. */ |
| int arg = 1; |
| /* 0 when no keyword has been seen. 1 right after a keyword is seen. */ |
| int state; |
| /* Parameters of the keyword just seen. Defined only in state 1. */ |
| const struct callshapes *next_shapes = NULL; |
| /* Context iterator that will be used if the next token is a '('. */ |
| flag_context_list_iterator_ty next_context_iter = |
| passthrough_context_list_iterator; |
| /* Current context. */ |
| flag_context_ty inner_context = |
| inherited_context (outer_context, |
| flag_context_list_iterator_advance (&context_iter)); |
| |
| /* Start state is 0. */ |
| state = 0; |
| |
| for (;;) |
| { |
| token_ty token; |
| |
| x_vala_lex (&token); |
| |
| switch (token.type) |
| { |
| case token_type_symbol: |
| { |
| void *keyword_value; |
| |
| if (hash_find_entry (&keywords, token.string, strlen (token.string), |
| &keyword_value) |
| == 0) |
| { |
| next_shapes = (const struct callshapes *) keyword_value; |
| state = 1; |
| } |
| else |
| state = 0; |
| } |
| next_context_iter = |
| flag_context_list_iterator ( |
| flag_context_list_table_lookup ( |
| flag_context_list_table, |
| token.string, strlen (token.string))); |
| free (token.string); |
| continue; |
| |
| case token_type_lparen: |
| if (extract_balanced (mlp, token_type_rparen, |
| inner_context, next_context_iter, |
| arglist_parser_alloc (mlp, |
| state ? next_shapes : NULL))) |
| { |
| arglist_parser_done (argparser, arg); |
| return true; |
| } |
| next_context_iter = null_context_list_iterator; |
| state = 0; |
| break; |
| |
| case token_type_rparen: |
| if (delim == token_type_rparen || delim == token_type_eof) |
| { |
| arglist_parser_done (argparser, arg); |
| return false; |
| } |
| |
| next_context_iter = null_context_list_iterator; |
| state = 0; |
| continue; |
| |
| case token_type_comma: |
| arg++; |
| inner_context = |
| inherited_context (outer_context, |
| flag_context_list_iterator_advance ( |
| &context_iter)); |
| next_context_iter = passthrough_context_list_iterator; |
| state = 0; |
| continue; |
| |
| case token_type_eof: |
| arglist_parser_done (argparser, arg); |
| return true; |
| |
| case token_type_string_literal: |
| { |
| lex_pos_ty pos; |
| |
| pos.file_name = logical_file_name; |
| pos.line_number = token.line_number; |
| |
| if (extract_all) |
| { |
| char *string = mixed_string_contents (token.mixed_string); |
| mixed_string_free (token.mixed_string); |
| remember_a_message (mlp, NULL, string, true, false, |
| inner_context, &pos, |
| NULL, token.comment, false); |
| } |
| else |
| { |
| /* A string immediately after a symbol means a function call. */ |
| if (state) |
| { |
| struct arglist_parser *tmp_argparser; |
| tmp_argparser = arglist_parser_alloc (mlp, next_shapes); |
| |
| arglist_parser_remember (tmp_argparser, 1, |
| token.mixed_string, inner_context, |
| pos.file_name, pos.line_number, |
| token.comment, false); |
| arglist_parser_done (tmp_argparser, 1); |
| } |
| else |
| arglist_parser_remember (argparser, arg, |
| token.mixed_string, inner_context, |
| pos.file_name, pos.line_number, |
| token.comment, false); |
| } |
| } |
| drop_reference (token.comment); |
| next_context_iter = null_context_list_iterator; |
| state = 0; |
| continue; |
| |
| case token_type_character_constant: |
| case token_type_lbrace: |
| case token_type_rbrace: |
| case token_type_assign: |
| case token_type_return: |
| case token_type_plus: |
| case token_type_arithmetic_operator: |
| case token_type_equality_test_operator: |
| case token_type_logic_operator: |
| case token_type_question: |
| case token_type_colon: |
| case token_type_number: |
| case token_type_string_template: |
| case token_type_regex_literal: |
| case token_type_other: |
| next_context_iter = null_context_list_iterator; |
| state = 0; |
| continue; |
| |
| default: |
| abort (); |
| } |
| } |
| } |
| |
| void |
| extract_vala (FILE *f, |
| const char *real_filename, const char *logical_filename, |
| flag_context_list_table_ty *flag_table, |
| msgdomain_list_ty *mdlp) |
| { |
| message_list_ty *mlp = mdlp->item[0]->messages; |
| |
| fp = f; |
| real_file_name = real_filename; |
| logical_file_name = xstrdup (logical_filename); |
| line_number = 1; |
| |
| phase1_pushback_length = 0; |
| |
| last_comment_line = -1; |
| last_non_comment_line = -1; |
| |
| phase3_pushback_length = 0; |
| last_token_type = token_type_other; |
| |
| flag_context_list_table = flag_table; |
| |
| init_keywords (); |
| |
| /* Eat tokens until eof is seen. When extract_parenthesized returns |
| due to an unbalanced closing parenthesis, just restart it. */ |
| while (!extract_balanced (mlp, token_type_eof, |
| null_context, null_context_list_iterator, |
| arglist_parser_alloc (mlp, NULL))) |
| ; |
| |
| fp = NULL; |
| real_file_name = NULL; |
| logical_file_name = NULL; |
| line_number = 0; |
| } |