| /* GNU gettext - internationalization aids |
| Copyright (C) 1995-2009, 2011, 2019 Free Software Foundation, Inc. |
| |
| This file was written by Peter Miller <millerp@canb.auug.org.au>. |
| Multibyte character handling by Bruno Haible <haible@clisp.cons.org>. |
| |
| This program is free software: you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 3 of the License, or |
| (at your option) any later version. |
| |
| This program is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
| |
| |
| #ifdef HAVE_CONFIG_H |
| # include "config.h" |
| #endif |
| |
| /* Specification. */ |
| #include "po-lex.h" |
| |
| #include <errno.h> |
| #include <limits.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <stdarg.h> |
| |
| #if HAVE_ICONV |
| # include <iconv.h> |
| #endif |
| |
| #include "c-ctype.h" |
| #include "uniwidth.h" |
| #include "gettext.h" |
| #include "po-charset.h" |
| #include "xalloc.h" |
| #include "error.h" |
| #include "error-progname.h" |
| #include "xvasprintf.h" |
| #include "po-error.h" |
| #include "po-xerror.h" |
| #include "pos.h" |
| #include "message.h" |
| #include "str-list.h" |
| #include "po-gram-gen2.h" |
| |
| #define _(str) gettext(str) |
| |
| #if HAVE_ICONV |
| # include "unistr.h" |
| #endif |
| |
| #if HAVE_DECL_GETC_UNLOCKED |
| # undef getc |
| # define getc getc_unlocked |
| #endif |
| |
| |
| /* Current position within the PO file. */ |
| lex_pos_ty gram_pos; |
| int gram_pos_column; |
| |
| |
| /* Error handling during the parsing of a PO file. |
| These functions can access gram_pos and gram_pos_column. */ |
| |
| /* VARARGS1 */ |
| void |
| po_gram_error (const char *fmt, ...) |
| { |
| va_list ap; |
| char *buffer; |
| |
| va_start (ap, fmt); |
| if (vasprintf (&buffer, fmt, ap) < 0) |
| error (EXIT_FAILURE, 0, _("memory exhausted")); |
| va_end (ap); |
| po_xerror (PO_SEVERITY_ERROR, NULL, gram_pos.file_name, gram_pos.line_number, |
| gram_pos_column + 1, false, buffer); |
| free (buffer); |
| |
| if (error_message_count >= gram_max_allowed_errors) |
| po_error (EXIT_FAILURE, 0, _("too many errors, aborting")); |
| } |
| |
| /* VARARGS2 */ |
| void |
| po_gram_error_at_line (const lex_pos_ty *pp, const char *fmt, ...) |
| { |
| va_list ap; |
| char *buffer; |
| |
| va_start (ap, fmt); |
| if (vasprintf (&buffer, fmt, ap) < 0) |
| error (EXIT_FAILURE, 0, _("memory exhausted")); |
| va_end (ap); |
| po_xerror (PO_SEVERITY_ERROR, NULL, pp->file_name, pp->line_number, |
| (size_t)(-1), false, buffer); |
| free (buffer); |
| |
| if (error_message_count >= gram_max_allowed_errors) |
| po_error (EXIT_FAILURE, 0, _("too many errors, aborting")); |
| } |
| |
| |
| /* The lowest level of PO file parsing converts bytes to multibyte characters. |
| This is needed |
| 1. for C compatibility: ISO C 99 section 5.1.1.2 says that the first |
| translation phase maps bytes to characters. |
| 2. to keep track of the current column, for the sake of precise error |
| location. Emacs compile.el interprets the column in error messages |
| by default as a screen column number, not as character number. |
| 3. to avoid skipping backslash-newline in the midst of a multibyte |
| character. If XY is a multibyte character, X \ newline Y is invalid. |
| */ |
| |
| /* Multibyte character data type. */ |
| /* Note this depends on po_lex_charset and po_lex_iconv, which get set |
| while the file is being parsed. */ |
| |
/* Capacity, in bytes, of the byte buffer of one multibyte character.  */
#define MBCHAR_BUF_SIZE 24

/* One multibyte character as read from the input.  */
struct mbchar
{
  /* Number of bytes of the current character; 0 is reserved for the
     EOF marker, a real character has bytes > 0.  */
  size_t bytes;
#if HAVE_ICONV
  /* True if uc holds a valid Unicode character.  */
  bool uc_valid;
  /* The current character; meaningful only if uc_valid.  */
  ucs4_t uc;
#endif
  /* The bytes themselves; only the first 'bytes' entries are in use.  */
  char buf[MBCHAR_BUF_SIZE];
};

/* An array type of length one, so that a multibyte character is passed
   by reference automatically.  */
typedef struct mbchar mbchar_t[1];
| |
/* Copy N bytes from SRC to DST, one byte at a time in ascending order.
   A replacement for memcpy intended for very small N (typically <= 1).
   Does nothing when N is 0.  */
static inline void
memcpy_small (void *dst, const void *src, size_t n)
{
  const char *from = (const char *) src;
  char *to = (char *) dst;

  for (; n > 0; n--)
    *to++ = *from++;
}
| |
| /* EOF (not a real character) is represented with bytes = 0 and |
| uc_valid = false. */ |
| static inline bool |
| mb_iseof (const mbchar_t mbc) |
| { |
| return (mbc->bytes == 0); |
| } |
| |
| /* Access the current character. */ |
| static inline const char * |
| mb_ptr (const mbchar_t mbc) |
| { |
| return mbc->buf; |
| } |
| static inline size_t |
| mb_len (const mbchar_t mbc) |
| { |
| return mbc->bytes; |
| } |
| |
| /* Comparison of characters. */ |
| |
| static inline bool |
| mb_iseq (const mbchar_t mbc, char sc) |
| { |
| /* Note: It is wrong to compare only mbc->uc, because when the encoding is |
| SHIFT_JIS, mbc->buf[0] == '\\' corresponds to mbc->uc == 0x00A5, but we |
| want to treat it as an escape character, although it looks like a Yen |
| sign. */ |
| #if HAVE_ICONV && 0 |
| if (mbc->uc_valid) |
| return (mbc->uc == sc); /* wrong! */ |
| else |
| #endif |
| return (mbc->bytes == 1 && mbc->buf[0] == sc); |
| } |
| |
| static inline bool |
| mb_isnul (const mbchar_t mbc) |
| { |
| #if HAVE_ICONV |
| if (mbc->uc_valid) |
| return (mbc->uc == 0); |
| else |
| #endif |
| return (mbc->bytes == 1 && mbc->buf[0] == 0); |
| } |
| |
| static inline int |
| mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2) |
| { |
| #if HAVE_ICONV |
| if (mbc1->uc_valid && mbc2->uc_valid) |
| return (int) mbc1->uc - (int) mbc2->uc; |
| else |
| #endif |
| return (mbc1->bytes == mbc2->bytes |
| ? memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) |
| : mbc1->bytes < mbc2->bytes |
| ? (memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) > 0 ? 1 : -1) |
| : (memcmp (mbc1->buf, mbc2->buf, mbc2->bytes) >= 0 ? 1 : -1)); |
| } |
| |
| static inline bool |
| mb_equal (const mbchar_t mbc1, const mbchar_t mbc2) |
| { |
| #if HAVE_ICONV |
| if (mbc1->uc_valid && mbc2->uc_valid) |
| return mbc1->uc == mbc2->uc; |
| else |
| #endif |
| return (mbc1->bytes == mbc2->bytes |
| && memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) == 0); |
| } |
| |
| /* <ctype.h>, <wctype.h> classification. */ |
| |
| static inline bool |
| mb_isascii (const mbchar_t mbc) |
| { |
| #if HAVE_ICONV |
| if (mbc->uc_valid) |
| return (mbc->uc >= 0x0000 && mbc->uc <= 0x007F); |
| else |
| #endif |
| return (mbc->bytes == 1 |
| #if CHAR_MIN < 0x00 /* to avoid gcc warning */ |
| && mbc->buf[0] >= 0x00 |
| #endif |
| #if CHAR_MAX > 0x7F /* to avoid gcc warning */ |
| && mbc->buf[0] <= 0x7F |
| #endif |
| ); |
| } |
| |
| /* Extra <wchar.h> function. */ |
| |
| /* Unprintable characters appear as a small box of width 1. */ |
| #define MB_UNPRINTABLE_WIDTH 1 |
| |
| static int |
| mb_width (const mbchar_t mbc) |
| { |
| #if HAVE_ICONV |
| if (mbc->uc_valid) |
| { |
| ucs4_t uc = mbc->uc; |
| const char *encoding = |
| (po_lex_iconv != (iconv_t)(-1) ? po_lex_charset : ""); |
| int w = uc_width (uc, encoding); |
| /* For unprintable characters, arbitrarily return 0 for control |
| characters (except tab) and MB_UNPRINTABLE_WIDTH otherwise. */ |
| if (w >= 0) |
| return w; |
| if (uc >= 0x0000 && uc <= 0x001F) |
| { |
| if (uc == 0x0009) |
| return 8 - (gram_pos_column & 7); |
| return 0; |
| } |
| if ((uc >= 0x007F && uc <= 0x009F) || (uc >= 0x2028 && uc <= 0x2029)) |
| return 0; |
| return MB_UNPRINTABLE_WIDTH; |
| } |
| else |
| #endif |
| { |
| if (mbc->bytes == 1) |
| { |
| if ( |
| #if CHAR_MIN < 0x00 /* to avoid gcc warning */ |
| mbc->buf[0] >= 0x00 && |
| #endif |
| mbc->buf[0] <= 0x1F) |
| { |
| if (mbc->buf[0] == 0x09) |
| return 8 - (gram_pos_column & 7); |
| return 0; |
| } |
| if (mbc->buf[0] == 0x7F) |
| return 0; |
| } |
| return MB_UNPRINTABLE_WIDTH; |
| } |
| } |
| |
| /* Output. */ |
| static inline void |
| mb_putc (const mbchar_t mbc, FILE *stream) |
| { |
| fwrite (mbc->buf, 1, mbc->bytes, stream); |
| } |
| |
| /* Assignment. */ |
| static inline void |
| mb_setascii (mbchar_t mbc, char sc) |
| { |
| mbc->bytes = 1; |
| #if HAVE_ICONV |
| mbc->uc_valid = 1; |
| mbc->uc = sc; |
| #endif |
| mbc->buf[0] = sc; |
| } |
| |
| /* Copying a character. */ |
| static inline void |
| mb_copy (mbchar_t new_mbc, const mbchar_t old_mbc) |
| { |
| memcpy_small (&new_mbc->buf[0], &old_mbc->buf[0], old_mbc->bytes); |
| new_mbc->bytes = old_mbc->bytes; |
| #if HAVE_ICONV |
| if ((new_mbc->uc_valid = old_mbc->uc_valid)) |
| new_mbc->uc = old_mbc->uc; |
| #endif |
| } |
| |
| |
| /* Multibyte character input. */ |
| |
| /* Number of characters that can be pushed back. |
| We need 1 for lex_getc, plus 1 for lex_ungetc. */ |
| #define NPUSHBACK 2 |
| |
| /* Data type of a multibyte character input stream. */ |
| struct mbfile |
| { |
| FILE *fp; |
| bool eof_seen; |
| int have_pushback; |
| unsigned int bufcount; |
| char buf[MBCHAR_BUF_SIZE]; |
| struct mbchar pushback[NPUSHBACK]; |
| }; |
| |
| /* We want to pass multibyte streams by reference automatically, |
| therefore we use an array type. */ |
| typedef struct mbfile mbfile_t[1]; |
| |
/* Whether invalid multibyte sequences in the input shall be signalled
   (true) or silently tolerated (false).  */
static bool signal_eilseq = false;
| |
| static inline void |
| mbfile_init (mbfile_t mbf, FILE *stream) |
| { |
| mbf->fp = stream; |
| mbf->eof_seen = false; |
| mbf->have_pushback = 0; |
| mbf->bufcount = 0; |
| } |
| |
| /* Read the next multibyte character from mbf and put it into mbc. |
| If a read error occurs, errno is set and ferror (mbf->fp) becomes true. */ |
| static void |
| mbfile_getc (mbchar_t mbc, mbfile_t mbf) |
| { |
| size_t bytes; |
| |
| /* If EOF has already been seen, don't use getc. This matters if |
| mbf->fp is connected to an interactive tty. */ |
| if (mbf->eof_seen) |
| goto eof; |
| |
| /* Return character pushed back, if there is one. */ |
| if (mbf->have_pushback > 0) |
| { |
| mbf->have_pushback--; |
| mb_copy (mbc, &mbf->pushback[mbf->have_pushback]); |
| return; |
| } |
| |
| /* Before using iconv, we need at least one byte. */ |
| if (mbf->bufcount == 0) |
| { |
| int c = getc (mbf->fp); |
| if (c == EOF) |
| { |
| mbf->eof_seen = true; |
| goto eof; |
| } |
| mbf->buf[0] = (unsigned char) c; |
| mbf->bufcount++; |
| } |
| |
| #if HAVE_ICONV |
| if (po_lex_iconv != (iconv_t)(-1)) |
| { |
| /* Use iconv on an increasing number of bytes. Read only as many |
| bytes from mbf->fp as needed. This is needed to give reasonable |
| interactive behaviour when mbf->fp is connected to an interactive |
| tty. */ |
| for (;;) |
| { |
| unsigned char scratchbuf[64]; |
| const char *inptr = &mbf->buf[0]; |
| size_t insize = mbf->bufcount; |
| char *outptr = (char *) &scratchbuf[0]; |
| size_t outsize = sizeof (scratchbuf); |
| |
| size_t res = iconv (po_lex_iconv, |
| (ICONV_CONST char **) &inptr, &insize, |
| &outptr, &outsize); |
| /* We expect that a character has been produced if and only if |
| some input bytes have been consumed. */ |
| if ((insize < mbf->bufcount) != (outsize < sizeof (scratchbuf))) |
| abort (); |
| if (outsize == sizeof (scratchbuf)) |
| { |
| /* No character has been produced. Must be an error. */ |
| if (res != (size_t)(-1)) |
| abort (); |
| |
| if (errno == EILSEQ) |
| { |
| /* An invalid multibyte sequence was encountered. */ |
| /* Return a single byte. */ |
| if (signal_eilseq) |
| po_gram_error (_("invalid multibyte sequence")); |
| bytes = 1; |
| mbc->uc_valid = false; |
| break; |
| } |
| else if (errno == EINVAL) |
| { |
| /* An incomplete multibyte character. */ |
| int c; |
| |
| if (mbf->bufcount == MBCHAR_BUF_SIZE) |
| { |
| /* An overlong incomplete multibyte sequence was |
| encountered. */ |
| /* Return a single byte. */ |
| bytes = 1; |
| mbc->uc_valid = false; |
| break; |
| } |
| |
| /* Read one more byte and retry iconv. */ |
| c = getc (mbf->fp); |
| if (c == EOF) |
| { |
| mbf->eof_seen = true; |
| if (ferror (mbf->fp)) |
| goto eof; |
| if (signal_eilseq) |
| po_gram_error (_("incomplete multibyte sequence at end of file")); |
| bytes = mbf->bufcount; |
| mbc->uc_valid = false; |
| break; |
| } |
| mbf->buf[mbf->bufcount++] = (unsigned char) c; |
| if (c == '\n') |
| { |
| if (signal_eilseq) |
| po_gram_error (_("incomplete multibyte sequence at end of line")); |
| bytes = mbf->bufcount - 1; |
| mbc->uc_valid = false; |
| break; |
| } |
| } |
| else |
| { |
| const char *errno_description = strerror (errno); |
| po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false, |
| xasprintf ("%s: %s", |
| _("iconv failure"), |
| errno_description)); |
| } |
| } |
| else |
| { |
| size_t outbytes = sizeof (scratchbuf) - outsize; |
| bytes = mbf->bufcount - insize; |
| |
| /* We expect that one character has been produced. */ |
| if (bytes == 0) |
| abort (); |
| if (outbytes == 0) |
| abort (); |
| /* Convert it from UTF-8 to UCS-4. */ |
| if (u8_mbtoucr (&mbc->uc, scratchbuf, outbytes) < (int) outbytes) |
| { |
| /* scratchbuf contains an out-of-range Unicode character |
| (> 0x10ffff). */ |
| if (signal_eilseq) |
| po_gram_error (_("invalid multibyte sequence")); |
| mbc->uc_valid = false; |
| break; |
| } |
| mbc->uc_valid = true; |
| break; |
| } |
| } |
| } |
| else |
| #endif |
| { |
| if (po_lex_weird_cjk |
| /* Special handling of encodings with CJK structure. */ |
| && (unsigned char) mbf->buf[0] >= 0x80) |
| { |
| if (mbf->bufcount == 1) |
| { |
| /* Read one more byte. */ |
| int c = getc (mbf->fp); |
| if (c == EOF) |
| { |
| if (ferror (mbf->fp)) |
| { |
| mbf->eof_seen = true; |
| goto eof; |
| } |
| } |
| else |
| { |
| mbf->buf[1] = (unsigned char) c; |
| mbf->bufcount++; |
| } |
| } |
| if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30) |
| /* Return a double byte. */ |
| bytes = 2; |
| else |
| /* Return a single byte. */ |
| bytes = 1; |
| } |
| else |
| { |
| /* Return a single byte. */ |
| bytes = 1; |
| } |
| #if HAVE_ICONV |
| mbc->uc_valid = false; |
| #endif |
| } |
| |
| /* Return the multibyte sequence mbf->buf[0..bytes-1]. */ |
| memcpy_small (&mbc->buf[0], &mbf->buf[0], bytes); |
| mbc->bytes = bytes; |
| |
| mbf->bufcount -= bytes; |
| if (mbf->bufcount > 0) |
| { |
| /* It's not worth calling memmove() for so few bytes. */ |
| unsigned int count = mbf->bufcount; |
| char *p = &mbf->buf[0]; |
| |
| do |
| { |
| *p = *(p + bytes); |
| p++; |
| } |
| while (--count > 0); |
| } |
| return; |
| |
| eof: |
| /* An mbchar_t with bytes == 0 is used to indicate EOF. */ |
| mbc->bytes = 0; |
| #if HAVE_ICONV |
| mbc->uc_valid = false; |
| #endif |
| return; |
| } |
| |
| static void |
| mbfile_ungetc (const mbchar_t mbc, mbfile_t mbf) |
| { |
| if (mbf->have_pushback >= NPUSHBACK) |
| abort (); |
| mb_copy (&mbf->pushback[mbf->have_pushback], mbc); |
| mbf->have_pushback++; |
| } |
| |
| |
| /* Lexer variables. */ |
| |
| static mbfile_t mbf; |
| unsigned int gram_max_allowed_errors = 20; |
| static bool po_lex_obsolete; |
| static bool po_lex_previous; |
| static bool pass_comments = false; |
| bool pass_obsolete_entries = false; |
| |
| |
| /* Prepare lexical analysis. */ |
| void |
| lex_start (FILE *fp, const char *real_filename, const char *logical_filename) |
| { |
| /* Ignore the logical_filename, because PO file entries already have |
| their file names attached. But use real_filename for error messages. */ |
| gram_pos.file_name = xstrdup (real_filename); |
| |
| mbfile_init (mbf, fp); |
| |
| gram_pos.line_number = 1; |
| gram_pos_column = 0; |
| signal_eilseq = true; |
| po_lex_obsolete = false; |
| po_lex_previous = false; |
| po_lex_charset_init (); |
| } |
| |
| /* Terminate lexical analysis. */ |
| void |
| lex_end () |
| { |
| mbf->fp = NULL; |
| gram_pos.file_name = NULL; |
| gram_pos.line_number = 0; |
| gram_pos_column = 0; |
| signal_eilseq = false; |
| po_lex_obsolete = false; |
| po_lex_previous = false; |
| po_lex_charset_close (); |
| } |
| |
| |
| /* Read a single character, dealing with backslash-newline. |
| Also keep track of the current line number and column number. */ |
| static void |
| lex_getc (mbchar_t mbc) |
| { |
| for (;;) |
| { |
| mbfile_getc (mbc, mbf); |
| |
| if (mb_iseof (mbc)) |
| { |
| if (ferror (mbf->fp)) |
| bomb: |
| { |
| const char *errno_description = strerror (errno); |
| po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false, |
| xasprintf ("%s: %s", |
| xasprintf (_("error while reading \"%s\""), |
| gram_pos.file_name), |
| errno_description)); |
| } |
| break; |
| } |
| |
| if (mb_iseq (mbc, '\n')) |
| { |
| gram_pos.line_number++; |
| gram_pos_column = 0; |
| break; |
| } |
| |
| gram_pos_column += mb_width (mbc); |
| |
| if (mb_iseq (mbc, '\\')) |
| { |
| mbchar_t mbc2; |
| |
| mbfile_getc (mbc2, mbf); |
| |
| if (mb_iseof (mbc2)) |
| { |
| if (ferror (mbf->fp)) |
| goto bomb; |
| break; |
| } |
| |
| if (!mb_iseq (mbc2, '\n')) |
| { |
| mbfile_ungetc (mbc2, mbf); |
| break; |
| } |
| |
| gram_pos.line_number++; |
| gram_pos_column = 0; |
| } |
| else |
| break; |
| } |
| } |
| |
| |
| static void |
| lex_ungetc (const mbchar_t mbc) |
| { |
| if (!mb_iseof (mbc)) |
| { |
| if (mb_iseq (mbc, '\n')) |
| /* Decrement the line number, but don't care about the column. */ |
| gram_pos.line_number--; |
| else |
| /* Decrement the column number. Also works well enough for tabs. */ |
| gram_pos_column -= mb_width (mbc); |
| |
| mbfile_ungetc (mbc, mbf); |
| } |
| } |
| |
| |
| static int |
| keyword_p (const char *s) |
| { |
| if (!po_lex_previous) |
| { |
| if (!strcmp (s, "domain")) |
| return DOMAIN; |
| if (!strcmp (s, "msgid")) |
| return MSGID; |
| if (!strcmp (s, "msgid_plural")) |
| return MSGID_PLURAL; |
| if (!strcmp (s, "msgstr")) |
| return MSGSTR; |
| if (!strcmp (s, "msgctxt")) |
| return MSGCTXT; |
| } |
| else |
| { |
| /* Inside a "#|" context, the keywords have a different meaning. */ |
| if (!strcmp (s, "msgid")) |
| return PREV_MSGID; |
| if (!strcmp (s, "msgid_plural")) |
| return PREV_MSGID_PLURAL; |
| if (!strcmp (s, "msgctxt")) |
| return PREV_MSGCTXT; |
| } |
| po_gram_error_at_line (&gram_pos, _("keyword \"%s\" unknown"), s); |
| return NAME; |
| } |
| |
| |
| static int |
| control_sequence () |
| { |
| mbchar_t mbc; |
| int val; |
| int max; |
| |
| lex_getc (mbc); |
| if (mb_len (mbc) == 1) |
| switch (mb_ptr (mbc) [0]) |
| { |
| case 'n': |
| return '\n'; |
| |
| case 't': |
| return '\t'; |
| |
| case 'b': |
| return '\b'; |
| |
| case 'r': |
| return '\r'; |
| |
| case 'f': |
| return '\f'; |
| |
| case 'v': |
| return '\v'; |
| |
| case 'a': |
| return '\a'; |
| |
| case '\\': |
| case '"': |
| return mb_ptr (mbc) [0]; |
| |
| case '0': case '1': case '2': case '3': |
| case '4': case '5': case '6': case '7': |
| val = 0; |
| max = 0; |
| for (;;) |
| { |
| char c = mb_ptr (mbc) [0]; |
| /* Warning: not portable, can't depend on '0'..'7' ordering. */ |
| val = val * 8 + (c - '0'); |
| if (++max == 3) |
| break; |
| lex_getc (mbc); |
| if (mb_len (mbc) == 1) |
| switch (mb_ptr (mbc) [0]) |
| { |
| case '0': case '1': case '2': case '3': |
| case '4': case '5': case '6': case '7': |
| continue; |
| |
| default: |
| break; |
| } |
| lex_ungetc (mbc); |
| break; |
| } |
| return val; |
| |
| case 'x': |
| lex_getc (mbc); |
| if (mb_iseof (mbc) || mb_len (mbc) != 1 |
| || !c_isxdigit (mb_ptr (mbc) [0])) |
| break; |
| |
| val = 0; |
| for (;;) |
| { |
| char c = mb_ptr (mbc) [0]; |
| val *= 16; |
| if (c_isdigit (c)) |
| /* Warning: not portable, can't depend on '0'..'9' ordering */ |
| val += c - '0'; |
| else if (c_isupper (c)) |
| /* Warning: not portable, can't depend on 'A'..'F' ordering */ |
| val += c - 'A' + 10; |
| else |
| /* Warning: not portable, can't depend on 'a'..'f' ordering */ |
| val += c - 'a' + 10; |
| |
| lex_getc (mbc); |
| if (mb_len (mbc) == 1) |
| switch (mb_ptr (mbc) [0]) |
| { |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': |
| case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': |
| continue; |
| |
| default: |
| break; |
| } |
| lex_ungetc (mbc); |
| break; |
| } |
| return val; |
| |
| /* FIXME: \u and \U are not handled. */ |
| } |
| lex_ungetc (mbc); |
| po_gram_error (_("invalid control sequence")); |
| return ' '; |
| } |
| |
| |
| /* Return the next token in the PO file. The return codes are defined |
| in "po-gram-gen2.h". Associated data is put in 'po_gram_lval'. */ |
| int |
| po_gram_lex () |
| { |
| static char *buf; |
| static size_t bufmax; |
| mbchar_t mbc; |
| size_t bufpos; |
| |
| for (;;) |
| { |
| lex_getc (mbc); |
| |
| if (mb_iseof (mbc)) |
| /* Yacc want this for end of file. */ |
| return 0; |
| |
| if (mb_len (mbc) == 1) |
| switch (mb_ptr (mbc) [0]) |
| { |
| case '\n': |
| po_lex_obsolete = false; |
| po_lex_previous = false; |
| /* Ignore whitespace, not relevant for the grammar. */ |
| break; |
| |
| case ' ': |
| case '\t': |
| case '\r': |
| case '\f': |
| case '\v': |
| /* Ignore whitespace, not relevant for the grammar. */ |
| break; |
| |
| case '#': |
| lex_getc (mbc); |
| if (mb_iseq (mbc, '~')) |
| /* A pseudo-comment beginning with #~ is found. This is |
| not a comment. It is the format for obsolete entries. |
| We simply discard the "#~" prefix. The following |
| characters are expected to be well formed. */ |
| { |
| po_lex_obsolete = true; |
| /* A pseudo-comment beginning with #~| denotes a previous |
| untranslated string in an obsolete entry. This does not |
| make much sense semantically, and is implemented here |
| for completeness only. */ |
| lex_getc (mbc); |
| if (mb_iseq (mbc, '|')) |
| po_lex_previous = true; |
| else |
| lex_ungetc (mbc); |
| break; |
| } |
| if (mb_iseq (mbc, '|')) |
| /* A pseudo-comment beginning with #| is found. This is |
| the previous untranslated string. We discard the "#|" |
| prefix, but change the keywords and string returns |
| accordingly. */ |
| { |
| po_lex_previous = true; |
| break; |
| } |
| |
| /* Accumulate comments into a buffer. If we have been asked |
| to pass comments, generate a COMMENT token, otherwise |
| discard it. */ |
| signal_eilseq = false; |
| if (pass_comments) |
| { |
| bufpos = 0; |
| for (;;) |
| { |
| while (bufpos + mb_len (mbc) >= bufmax) |
| { |
| bufmax += 100; |
| buf = xrealloc (buf, bufmax); |
| } |
| if (mb_iseof (mbc) || mb_iseq (mbc, '\n')) |
| break; |
| |
| memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc)); |
| bufpos += mb_len (mbc); |
| |
| lex_getc (mbc); |
| } |
| buf[bufpos] = '\0'; |
| |
| po_gram_lval.string.string = buf; |
| po_gram_lval.string.pos = gram_pos; |
| po_gram_lval.string.obsolete = po_lex_obsolete; |
| po_lex_obsolete = false; |
| signal_eilseq = true; |
| return COMMENT; |
| } |
| else |
| { |
| /* We do this in separate loop because collecting large |
| comments while they get not passed to the upper layers |
| is not very efficient. */ |
| while (!mb_iseof (mbc) && !mb_iseq (mbc, '\n')) |
| lex_getc (mbc); |
| po_lex_obsolete = false; |
| signal_eilseq = true; |
| } |
| break; |
| |
| case '"': |
| /* Accumulate a string. */ |
| bufpos = 0; |
| for (;;) |
| { |
| lex_getc (mbc); |
| while (bufpos + mb_len (mbc) >= bufmax) |
| { |
| bufmax += 100; |
| buf = xrealloc (buf, bufmax); |
| } |
| if (mb_iseof (mbc)) |
| { |
| po_gram_error_at_line (&gram_pos, |
| _("end-of-file within string")); |
| break; |
| } |
| if (mb_iseq (mbc, '\n')) |
| { |
| po_gram_error_at_line (&gram_pos, |
| _("end-of-line within string")); |
| break; |
| } |
| if (mb_iseq (mbc, '"')) |
| break; |
| if (mb_iseq (mbc, '\\')) |
| { |
| buf[bufpos++] = control_sequence (); |
| continue; |
| } |
| |
| /* Add mbc to the accumulator. */ |
| memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc)); |
| bufpos += mb_len (mbc); |
| } |
| buf[bufpos] = '\0'; |
| |
| /* Strings cannot contain the msgctxt separator, because it cannot |
| be faithfully represented in the msgid of a .mo file. */ |
| if (strchr (buf, MSGCTXT_SEPARATOR) != NULL) |
| po_gram_error_at_line (&gram_pos, |
| _("context separator <EOT> within string")); |
| |
| /* FIXME: Treatment of embedded \000 chars is incorrect. */ |
| po_gram_lval.string.string = xstrdup (buf); |
| po_gram_lval.string.pos = gram_pos; |
| po_gram_lval.string.obsolete = po_lex_obsolete; |
| return (po_lex_previous ? PREV_STRING : STRING); |
| |
| case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': |
| case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': |
| case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': |
| case 's': case 't': case 'u': case 'v': case 'w': case 'x': |
| case 'y': case 'z': |
| case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': |
| case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': |
| case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': |
| case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': |
| case 'Y': case 'Z': |
| case '_': case '$': |
| bufpos = 0; |
| for (;;) |
| { |
| char c = mb_ptr (mbc) [0]; |
| if (bufpos + 1 >= bufmax) |
| { |
| bufmax += 100; |
| buf = xrealloc (buf, bufmax); |
| } |
| buf[bufpos++] = c; |
| lex_getc (mbc); |
| if (mb_len (mbc) == 1) |
| switch (mb_ptr (mbc) [0]) |
| { |
| default: |
| break; |
| case 'a': case 'b': case 'c': case 'd': case 'e': |
| case 'f': case 'g': case 'h': case 'i': case 'j': |
| case 'k': case 'l': case 'm': case 'n': case 'o': |
| case 'p': case 'q': case 'r': case 's': case 't': |
| case 'u': case 'v': case 'w': case 'x': case 'y': |
| case 'z': |
| case 'A': case 'B': case 'C': case 'D': case 'E': |
| case 'F': case 'G': case 'H': case 'I': case 'J': |
| case 'K': case 'L': case 'M': case 'N': case 'O': |
| case 'P': case 'Q': case 'R': case 'S': case 'T': |
| case 'U': case 'V': case 'W': case 'X': case 'Y': |
| case 'Z': |
| case '_': case '$': |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| continue; |
| } |
| break; |
| } |
| lex_ungetc (mbc); |
| |
| buf[bufpos] = '\0'; |
| |
| { |
| int k = keyword_p (buf); |
| if (k == NAME) |
| { |
| po_gram_lval.string.string = xstrdup (buf); |
| po_gram_lval.string.pos = gram_pos; |
| po_gram_lval.string.obsolete = po_lex_obsolete; |
| } |
| else |
| { |
| po_gram_lval.pos.pos = gram_pos; |
| po_gram_lval.pos.obsolete = po_lex_obsolete; |
| } |
| return k; |
| } |
| |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| bufpos = 0; |
| for (;;) |
| { |
| char c = mb_ptr (mbc) [0]; |
| if (bufpos + 1 >= bufmax) |
| { |
| bufmax += 100; |
| buf = xrealloc (buf, bufmax + 1); |
| } |
| buf[bufpos++] = c; |
| lex_getc (mbc); |
| if (mb_len (mbc) == 1) |
| switch (mb_ptr (mbc) [0]) |
| { |
| default: |
| break; |
| |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| continue; |
| } |
| break; |
| } |
| lex_ungetc (mbc); |
| |
| buf[bufpos] = '\0'; |
| |
| po_gram_lval.number.number = atol (buf); |
| po_gram_lval.number.pos = gram_pos; |
| po_gram_lval.number.obsolete = po_lex_obsolete; |
| return NUMBER; |
| |
| case '[': |
| po_gram_lval.pos.pos = gram_pos; |
| po_gram_lval.pos.obsolete = po_lex_obsolete; |
| return '['; |
| |
| case ']': |
| po_gram_lval.pos.pos = gram_pos; |
| po_gram_lval.pos.obsolete = po_lex_obsolete; |
| return ']'; |
| |
| default: |
| /* This will cause a syntax error. */ |
| return JUNK; |
| } |
| else |
| /* This will cause a syntax error. */ |
| return JUNK; |
| } |
| } |
| |
| |
| /* po_gram_lex() can return comments as COMMENT. Switch this on or off. */ |
| void |
| po_lex_pass_comments (bool flag) |
| { |
| pass_comments = flag; |
| } |
| |
| |
| /* po_gram_lex() can return obsolete entries as if they were normal entries. |
| Switch this on or off. */ |
| void |
| po_lex_pass_obsolete_entries (bool flag) |
| { |
| pass_obsolete_entries = flag; |
| } |