| /* Copyright (C) 1996-2014 Free Software Foundation, Inc. |
| This file is part of the GNU C Library. |
| Contributed by Ulrich Drepper <drepper@gnu.org>, 1996. |
| |
| This program is free software; you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published |
| by the Free Software Foundation; version 2 of the License, or |
| (at your option) any later version. |
| |
| This program is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with this program; if not, see <http://www.gnu.org/licenses/>. */ |
| |
| #include "third_party/glibc_locales/programs/charmap.h" |
| |
| #include <assert.h> |
| #include <ctype.h> |
| #include <errno.h> |
| #include <error.h> |
| #include <libintl.h> |
| #include <limits.h> |
| #include <stdint.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include "third_party/glibc_locales/programs/charmap-dir.h" |
| #include "third_party/glibc_locales/programs/linereader.h" |
| #include "third_party/glibc_locales/programs/localedef.h" |
| |
| /* Define the lookup function. */ |
| #include "third_party/glibc_locales/programs/charmap-kw.h" |
| |
/* Forward declarations of the file-local helpers defined further down.  */

/* Parse an already opened charmap file into a newly allocated map.  */
static struct charmap_t *parse_charmap(struct linereader *cmfile, int verbose,
                                       int be_quiet);

/* Record a display width for the characters FROM .. TO.  */
static void new_width(struct linereader *cmfile, struct charmap_t *result,
                      const char *from, const char *to,
                      unsigned long int width);

/* Enter a single character, or a range of characters, into the map.  */
static void charmap_new_char(struct linereader *lr, struct charmap_t *cm,
                             size_t nbytes, unsigned char *bytes,
                             const char *from, const char *to,
                             int decimal_ellipsis, int step);

/* Set to true by charmap_read() when its ASCII compatibility test fails.  */
bool enc_not_ascii_compatible;

/* Only present when the configuration defines NEED_NULL_POINTER.  */
#ifdef NEED_NULL_POINTER
static const char *null_pointer;
#endif
| |
| static struct linereader *cmlr_open(const char *directory, const char *name, |
| kw_hash_fct_t hf) { |
| FILE *fp; |
| |
| fp = charmap_open(directory, name); |
| if (fp == NULL) |
| return NULL; |
| else { |
| size_t dlen = strlen(directory); |
| int add_slash = (dlen == 0 || directory[dlen - 1] != '/'); |
| size_t nlen = strlen(name); |
| char *pathname; |
| char *p; |
| |
| pathname = alloca(dlen + add_slash + nlen + 1); |
| p = stpcpy(pathname, directory); |
| if (add_slash) *p++ = '/'; |
| stpcpy(p, name); |
| |
| return lr_create(fp, pathname, hf); |
| } |
| } |
| |
| struct charmap_t *charmap_read(const char *filename, int verbose, |
| int error_not_found, int be_quiet, |
| int use_default) { |
| struct charmap_t *result = NULL; |
| |
| if (filename != NULL) { |
| struct linereader *cmfile; |
| |
| /* First try the name as found in the parameter. */ |
| cmfile = lr_open(filename, charmap_hash); |
| if (cmfile == NULL) { |
| /* No successful. So start looking through the directories |
| in the I18NPATH if this is a simple name. */ |
| if (strchr(filename, '/') == NULL) { |
| char *i18npath = getenv("I18NPATH"); |
| if (i18npath != NULL && *i18npath != '\0') { |
| const size_t pathlen = strlen(i18npath); |
| char i18npathbuf[pathlen + 1]; |
| char path[pathlen + sizeof("/charmaps")]; |
| char *next; |
| i18npath = memcpy(i18npathbuf, i18npath, pathlen + 1); |
| |
| while (cmfile == NULL && (next = strsep(&i18npath, ":")) != NULL) { |
| stpcpy(stpcpy(path, next), "/charmaps"); |
| cmfile = cmlr_open(path, filename, charmap_hash); |
| |
| if (cmfile == NULL) /* Try without the "/charmaps" part. */ |
| cmfile = cmlr_open(next, filename, charmap_hash); |
| } |
| } |
| |
| if (cmfile == NULL) /* Try the default directory. */ |
| cmfile = cmlr_open(CHARMAP_PATH, filename, charmap_hash); |
| } |
| } |
| |
| if (cmfile != NULL) result = parse_charmap(cmfile, verbose, be_quiet); |
| |
| if (result == NULL && error_not_found) |
| WITH_CUR_LOCALE( |
| error(0, errno, "character map file `%s' not found", filename)); |
| } |
| |
| if (result == NULL && filename != NULL && strchr(filename, '/') == NULL) { |
| /* OK, one more try. We also accept the names given to the |
| character sets in the files. Sometimes they differ from the |
| file name. */ |
| CHARMAP_DIR *dir; |
| |
| dir = charmap_opendir(CHARMAP_PATH); |
| if (dir != NULL) { |
| const char *dirent; |
| |
| while ((dirent = charmap_readdir(dir)) != NULL) { |
| char **aliases; |
| char **p; |
| int found; |
| |
| aliases = charmap_aliases(CHARMAP_PATH, dirent); |
| found = 0; |
| for (p = aliases; *p; p++) |
| if (strcasecmp(*p, filename) == 0) { |
| found = 1; |
| break; |
| } |
| charmap_free_aliases(aliases); |
| |
| if (found) { |
| struct linereader *cmfile; |
| |
| cmfile = cmlr_open(CHARMAP_PATH, dirent, charmap_hash); |
| if (cmfile != NULL) result = parse_charmap(cmfile, verbose, be_quiet); |
| |
| break; |
| } |
| } |
| |
| charmap_closedir(dir); |
| } |
| } |
| |
| if (result == NULL && DEFAULT_CHARMAP != NULL) { |
| struct linereader *cmfile; |
| |
| cmfile = cmlr_open(CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash); |
| if (cmfile != NULL) result = parse_charmap(cmfile, verbose, be_quiet); |
| |
| if (result == NULL) |
| WITH_CUR_LOCALE(error(4, errno, |
| "default character map file `%s' not found", |
| DEFAULT_CHARMAP)); |
| } |
| |
| if (result != NULL && result->code_set_name == NULL) |
| /* The input file does not specify a code set name. This |
| shouldn't happen but we should cope with it. */ |
| result->code_set_name = basename(filename); |
| |
| /* Test of ASCII compatibility of locale encoding. |
| |
| Verify that the encoding to be used in a locale is ASCII compatible, |
| at least for the graphic characters, excluding the control characters, |
| '$' and '@'. This constraint comes from an ISO C 99 restriction. |
| |
| ISO C 99 section 7.17.(2) (about wchar_t): |
| the null character shall have the code value zero and each member of |
| the basic character set shall have a code value equal to its value |
| when used as the lone character in an integer character constant. |
| ISO C 99 section 5.2.1.(3): |
| Both the basic source and basic execution character sets shall have |
| the following members: the 26 uppercase letters of the Latin alphabet |
| A B C D E F G H I J K L M N O P Q R S T U V W X Y Z |
| the 26 lowercase letters of the Latin alphabet |
| a b c d e f g h i j k l m n o p q r s t u v w x y z |
| the 10 decimal digits |
| 0 1 2 3 4 5 6 7 8 9 |
| the following 29 graphic characters |
| ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~ |
| the space character, and control characters representing horizontal |
| tab, vertical tab, and form feed. |
| |
| Therefore, for all members of the "basic character set", the 'char' code |
| must have the same value as the 'wchar_t' code, which in glibc is the |
| same as the Unicode code, which for all of the enumerated characters |
| is identical to the ASCII code. */ |
| if (result != NULL && use_default) { |
| static const char basic_charset[] = { |
| 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
| 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
| 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', |
| 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', |
| 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', |
| '8', '9', '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', |
| ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', |
| ']', '^', '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'}; |
| int failed = 0; |
| const char *p = basic_charset; |
| |
| do { |
| struct charseq *seq = charmap_find_symbol(result, p, 1); |
| |
| if (seq == NULL || seq->ucs4 != (uint32_t)*p) failed = 1; |
| } while (*p++ != '\0'); |
| |
| if (failed) { |
| WITH_CUR_LOCALE(fprintf(stderr, |
| "character map `%s' is not ASCII compatible, " |
| "locale not ISO C compliant\n", |
| result->code_set_name)); |
| enc_not_ascii_compatible = true; |
| } |
| } |
| |
| return result; |
| } |
| |
| static struct charmap_t *parse_charmap(struct linereader *cmfile, int verbose, |
| int be_quiet) { |
| struct charmap_t *result; |
| int state; |
| enum token_t expected_tok = tok_error; |
| const char *expected_str = NULL; |
| char *from_name = NULL; |
| char *to_name = NULL; |
| enum token_t ellipsis = 0; |
| int step = 1; |
| |
| /* We don't want symbolic names in string to be translated. */ |
| cmfile->translate_strings = 0; |
| |
| /* Allocate room for result. */ |
| result = (struct charmap_t *)xmalloc(sizeof(struct charmap_t)); |
| memset(result, '\0', sizeof(struct charmap_t)); |
| /* The default DEFAULT_WIDTH is 1. */ |
| result->width_default = 1; |
| |
| #define obstack_chunk_alloc malloc |
| #define obstack_chunk_free free |
| obstack_init(&result->mem_pool); |
| |
| if (init_hash(&result->char_table, 256) || |
| init_hash(&result->byte_table, 256)) { |
| free(result); |
| return NULL; |
| } |
| |
| /* We use a state machine to describe the charmap description file |
| format. */ |
| state = 1; |
| while (1) { |
| /* What's on? */ |
| struct token *now = lr_token(cmfile, NULL, NULL, NULL, verbose); |
| enum token_t nowtok = now->tok; |
| struct token *arg; |
| |
| if (nowtok == tok_eof) break; |
| |
| switch (state) { |
| case 1: |
| /* The beginning. We expect the special declarations, EOL or |
| `CHARMAP'. */ |
| if (nowtok == tok_eol) /* Ignore empty lines. */ |
| continue; |
| |
| if (nowtok == tok_charmap) { |
| from_name = NULL; |
| to_name = NULL; |
| |
| /* We have to set up the real work. Fill in some |
| default values. */ |
| if (result->mb_cur_max == 0) result->mb_cur_max = 1; |
| if (result->mb_cur_min == 0) result->mb_cur_min = result->mb_cur_max; |
| if (result->mb_cur_min > result->mb_cur_max) { |
| if (!be_quiet) |
| WITH_CUR_LOCALE(error( |
| 0, 0, "%s: <mb_cur_max> must be greater than <mb_cur_min>\n", |
| cmfile->fname)); |
| |
| result->mb_cur_min = result->mb_cur_max; |
| } |
| |
| lr_ignore_rest(cmfile, 1); |
| |
| state = 2; |
| continue; |
| } |
| |
| if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max && |
| nowtok != tok_mb_cur_min && nowtok != tok_escape_char && |
| nowtok != tok_comment_char && nowtok != tok_g0esc && |
| nowtok != tok_g1esc && nowtok != tok_g2esc && nowtok != tok_g3esc && |
| nowtok != tok_repertoiremap && nowtok != tok_include) { |
| lr_error(cmfile, "syntax error in prolog: %s", "invalid definition"); |
| |
| lr_ignore_rest(cmfile, 0); |
| continue; |
| } |
| |
| /* We know that we need an argument. */ |
| arg = lr_token(cmfile, NULL, NULL, NULL, verbose); |
| |
| switch (nowtok) { |
| case tok_code_set_name: |
| case tok_repertoiremap: |
| if (arg->tok != tok_ident && arg->tok != tok_string) { |
| badarg: |
| lr_error(cmfile, "syntax error in prolog: %s", "bad argument"); |
| |
| lr_ignore_rest(cmfile, 0); |
| continue; |
| } |
| |
| if (nowtok == tok_code_set_name) |
| result->code_set_name = obstack_copy0( |
| &result->mem_pool, arg->val.str.startmb, arg->val.str.lenmb); |
| else |
| result->repertoiremap = obstack_copy0( |
| &result->mem_pool, arg->val.str.startmb, arg->val.str.lenmb); |
| |
| lr_ignore_rest(cmfile, 1); |
| continue; |
| |
| case tok_mb_cur_max: |
| case tok_mb_cur_min: |
| if (arg->tok != tok_number) goto badarg; |
| |
| if (verbose && |
| ((nowtok == tok_mb_cur_max && result->mb_cur_max != 0) || |
| (nowtok == tok_mb_cur_max && result->mb_cur_max != 0))) |
| lr_error(cmfile, "duplicate definition of <%s>", |
| nowtok == tok_mb_cur_min ? "mb_cur_min" : "mb_cur_max"); |
| |
| if (arg->val.num < 1) { |
| lr_error(cmfile, "value for <%s> must be 1 or greater", |
| nowtok == tok_mb_cur_min ? "mb_cur_min" : "mb_cur_max"); |
| |
| lr_ignore_rest(cmfile, 0); |
| continue; |
| } |
| if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0 && |
| (int)arg->val.num < result->mb_cur_min) || |
| (nowtok == tok_mb_cur_min && result->mb_cur_max != 0 && |
| (int)arg->val.num > result->mb_cur_max)) { |
| lr_error(cmfile, |
| "value of <%s> must be greater or equal than the value " |
| "of <%s>", |
| "mb_cur_max", "mb_cur_min"); |
| |
| lr_ignore_rest(cmfile, 0); |
| continue; |
| } |
| |
| if (nowtok == tok_mb_cur_max) |
| result->mb_cur_max = arg->val.num; |
| else |
| result->mb_cur_min = arg->val.num; |
| |
| lr_ignore_rest(cmfile, 1); |
| continue; |
| |
| case tok_escape_char: |
| case tok_comment_char: |
| if (arg->tok != tok_ident) goto badarg; |
| |
| if (arg->val.str.lenmb != 1) { |
| lr_error( |
| cmfile, "argument to <%s> must be a single character", |
| nowtok == tok_escape_char ? "escape_char" : "comment_char"); |
| |
| lr_ignore_rest(cmfile, 0); |
| continue; |
| } |
| |
| if (nowtok == tok_escape_char) |
| cmfile->escape_char = *arg->val.str.startmb; |
| else |
| cmfile->comment_char = *arg->val.str.startmb; |
| |
| lr_ignore_rest(cmfile, 1); |
| continue; |
| |
| case tok_g0esc: |
| case tok_g1esc: |
| case tok_g2esc: |
| case tok_g3esc: |
| case tok_escseq: |
| lr_ignore_rest(cmfile, 0); /* XXX */ |
| continue; |
| |
| case tok_include: |
| lr_error(cmfile, |
| "character sets with locking states are not supported"); |
| exit(4); |
| |
| default: |
| /* Cannot happen. */ |
| assert(0); |
| } |
| break; |
| |
| case 2: |
| /* We have seen `CHARMAP' and now are in the body. Each line |
| must have the format "%s %s %s\n" or "%s...%s %s %s\n". */ |
| if (nowtok == tok_eol) /* Ignore empty lines. */ |
| continue; |
| |
| if (nowtok == tok_end) { |
| expected_tok = tok_charmap; |
| expected_str = "CHARMAP"; |
| state = 90; |
| continue; |
| } |
| |
| if (nowtok != tok_bsymbol && nowtok != tok_ucs4) { |
| lr_error(cmfile, "syntax error in %s definition: %s", "CHARMAP", |
| "no symbolic name given"); |
| |
| lr_ignore_rest(cmfile, 0); |
| continue; |
| } |
| |
| /* If the previous line was not completely correct free the |
| used memory. */ |
| if (from_name != NULL) obstack_free(&result->mem_pool, from_name); |
| |
| if (nowtok == tok_bsymbol) |
| from_name = (char *)obstack_copy0( |
| &result->mem_pool, now->val.str.startmb, now->val.str.lenmb); |
| else { |
| obstack_printf(&result->mem_pool, "U%08X", cmfile->token.val.ucs4); |
| obstack_1grow(&result->mem_pool, '\0'); |
| from_name = (char *)obstack_finish(&result->mem_pool); |
| } |
| to_name = NULL; |
| |
| state = 3; |
| continue; |
| |
| case 3: |
| /* We have two possibilities: We can see an ellipsis or an |
| encoding value. */ |
| if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4 || |
| nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2 || |
| nowtok == tok_ellipsis2_2) { |
| ellipsis = nowtok; |
| if (nowtok == tok_ellipsis4_2) { |
| step = 2; |
| nowtok = tok_ellipsis4; |
| } else if (nowtok == tok_ellipsis2_2) { |
| step = 2; |
| nowtok = tok_ellipsis2; |
| } |
| state = 4; |
| continue; |
| } |
| /* FALLTHROUGH */ |
| |
| case 5: |
| if (nowtok != tok_charcode) { |
| lr_error(cmfile, "syntax error in %s definition: %s", "CHARMAP", |
| "invalid encoding given"); |
| |
| lr_ignore_rest(cmfile, 0); |
| |
| state = 2; |
| continue; |
| } |
| |
| if (now->val.charcode.nbytes < result->mb_cur_min) |
| lr_error(cmfile, "too few bytes in character encoding"); |
| else if (now->val.charcode.nbytes > result->mb_cur_max) |
| lr_error(cmfile, "too many bytes in character encoding"); |
| else |
| charmap_new_char(cmfile, result, now->val.charcode.nbytes, |
| now->val.charcode.bytes, from_name, to_name, |
| ellipsis != tok_ellipsis2, step); |
| |
| /* Ignore trailing comment silently. */ |
| lr_ignore_rest(cmfile, 0); |
| |
| from_name = NULL; |
| to_name = NULL; |
| ellipsis = tok_none; |
| step = 1; |
| |
| state = 2; |
| continue; |
| |
| case 4: |
| if (nowtok != tok_bsymbol && nowtok != tok_ucs4) { |
| lr_error(cmfile, "syntax error in %s definition: %s", "CHARMAP", |
| "no symbolic name given for end of range"); |
| |
| lr_ignore_rest(cmfile, 0); |
| continue; |
| } |
| |
| /* Copy the to-name in a safe place. */ |
| if (nowtok == tok_bsymbol) |
| to_name = (char *)obstack_copy0(&result->mem_pool, |
| cmfile->token.val.str.startmb, |
| cmfile->token.val.str.lenmb); |
| else { |
| obstack_printf(&result->mem_pool, "U%08X", cmfile->token.val.ucs4); |
| obstack_1grow(&result->mem_pool, '\0'); |
| to_name = (char *)obstack_finish(&result->mem_pool); |
| } |
| |
| state = 5; |
| continue; |
| |
| case 90: |
| if (nowtok != expected_tok) |
| lr_error(cmfile, "%1$s: definition does not end with `END %1$s'", |
| expected_str); |
| |
| lr_ignore_rest(cmfile, nowtok == expected_tok); |
| state = 91; |
| continue; |
| |
| case 91: |
| /* Waiting for WIDTH... */ |
| if (nowtok == tok_eol) /* Ignore empty lines. */ |
| continue; |
| |
| if (nowtok == tok_width_default) { |
| state = 92; |
| continue; |
| } |
| |
| if (nowtok == tok_width) { |
| lr_ignore_rest(cmfile, 1); |
| state = 93; |
| continue; |
| } |
| |
| if (nowtok == tok_width_variable) { |
| lr_ignore_rest(cmfile, 1); |
| state = 98; |
| continue; |
| } |
| |
| lr_error(cmfile, |
| "only WIDTH definitions are allowed to follow the CHARMAP " |
| "definition"); |
| |
| lr_ignore_rest(cmfile, 0); |
| continue; |
| |
| case 92: |
| if (nowtok != tok_number) |
| lr_error(cmfile, "value for %s must be an integer", "WIDTH_DEFAULT"); |
| else |
| result->width_default = now->val.num; |
| |
| lr_ignore_rest(cmfile, nowtok == tok_number); |
| |
| state = 91; |
| continue; |
| |
| case 93: |
| /* We now expect `END WIDTH' or lines of the format "%s %d\n" or |
| "%s...%s %d\n". */ |
| if (nowtok == tok_eol) /* ignore empty lines. */ |
| continue; |
| |
| if (nowtok == tok_end) { |
| expected_tok = tok_width; |
| expected_str = "WIDTH"; |
| state = 90; |
| continue; |
| } |
| |
| if (nowtok != tok_bsymbol && nowtok != tok_ucs4) { |
| lr_error(cmfile, "syntax error in %s definition: %s", "WIDTH", |
| "no symbolic name given"); |
| |
| lr_ignore_rest(cmfile, 0); |
| continue; |
| } |
| |
| if (from_name != NULL) obstack_free(&result->mem_pool, from_name); |
| |
| if (nowtok == tok_bsymbol) |
| from_name = (char *)obstack_copy0( |
| &result->mem_pool, now->val.str.startmb, now->val.str.lenmb); |
| else { |
| obstack_printf(&result->mem_pool, "U%08X", cmfile->token.val.ucs4); |
| obstack_1grow(&result->mem_pool, '\0'); |
| from_name = (char *)obstack_finish(&result->mem_pool); |
| } |
| |
| to_name = NULL; |
| |
| state = 94; |
| continue; |
| |
| case 94: |
| if (nowtok == tok_ellipsis3) { |
| state = 95; |
| continue; |
| } |
| |
| case 96: |
| if (nowtok != tok_number) |
| lr_error(cmfile, "value for %s must be an integer", "WIDTH"); |
| else { |
| /* Store width for chars. */ |
| new_width(cmfile, result, from_name, to_name, now->val.num); |
| |
| from_name = NULL; |
| to_name = NULL; |
| } |
| |
| lr_ignore_rest(cmfile, nowtok == tok_number); |
| |
| state = 93; |
| continue; |
| |
| case 95: |
| if (nowtok != tok_bsymbol && nowtok != tok_ucs4) { |
| lr_error(cmfile, "syntax error in %s definition: %s", "WIDTH", |
| "no symbolic name given for end of range"); |
| |
| lr_ignore_rest(cmfile, 0); |
| |
| state = 93; |
| continue; |
| } |
| |
| if (nowtok == tok_bsymbol) |
| to_name = (char *)obstack_copy0( |
| &result->mem_pool, now->val.str.startmb, now->val.str.lenmb); |
| else { |
| obstack_printf(&result->mem_pool, "U%08X", cmfile->token.val.ucs4); |
| obstack_1grow(&result->mem_pool, '\0'); |
| to_name = (char *)obstack_finish(&result->mem_pool); |
| } |
| |
| state = 96; |
| continue; |
| |
| case 98: |
| /* We now expect `END WIDTH_VARIABLE' or lines of the format |
| "%s\n" or "%s...%s\n". */ |
| if (nowtok == tok_eol) /* ignore empty lines. */ |
| continue; |
| |
| if (nowtok == tok_end) { |
| expected_tok = tok_width_variable; |
| expected_str = "WIDTH_VARIABLE"; |
| state = 90; |
| continue; |
| } |
| |
| if (nowtok != tok_bsymbol && nowtok != tok_ucs4) { |
| lr_error(cmfile, "syntax error in %s definition: %s", |
| "WIDTH_VARIABLE", "no symbolic name given"); |
| |
| lr_ignore_rest(cmfile, 0); |
| |
| continue; |
| } |
| |
| if (from_name != NULL) obstack_free(&result->mem_pool, from_name); |
| |
| if (nowtok == tok_bsymbol) |
| from_name = (char *)obstack_copy0( |
| &result->mem_pool, now->val.str.startmb, now->val.str.lenmb); |
| else { |
| obstack_printf(&result->mem_pool, "U%08X", cmfile->token.val.ucs4); |
| obstack_1grow(&result->mem_pool, '\0'); |
| from_name = (char *)obstack_finish(&result->mem_pool); |
| } |
| to_name = NULL; |
| |
| state = 99; |
| continue; |
| |
| case 99: |
| if (nowtok == tok_ellipsis3) state = 100; |
| |
| /* Store info. */ |
| from_name = NULL; |
| |
| /* Warn */ |
| state = 98; |
| continue; |
| |
| case 100: |
| if (nowtok != tok_bsymbol && nowtok != tok_ucs4) { |
| lr_error(cmfile, "syntax error in %s definition: %s", |
| "WIDTH_VARIABLE", "no symbolic name given for end of range"); |
| lr_ignore_rest(cmfile, 0); |
| continue; |
| } |
| |
| if (nowtok == tok_bsymbol) |
| to_name = (char *)obstack_copy0( |
| &result->mem_pool, now->val.str.startmb, now->val.str.lenmb); |
| else { |
| obstack_printf(&result->mem_pool, "U%08X", cmfile->token.val.ucs4); |
| obstack_1grow(&result->mem_pool, '\0'); |
| to_name = (char *)obstack_finish(&result->mem_pool); |
| } |
| |
| /* XXX Enter value into table. */ |
| |
| lr_ignore_rest(cmfile, 1); |
| |
| state = 98; |
| continue; |
| |
| default: |
| WITH_CUR_LOCALE(error(5, 0, "%s: error in state machine", __FILE__)); |
| /* NOTREACHED */ |
| } |
| break; |
| } |
| |
| if (state != 91 && !be_quiet) |
| WITH_CUR_LOCALE(error(0, 0, "%s: premature end of file", cmfile->fname)); |
| |
| lr_close(cmfile); |
| |
| return result; |
| } |
| |
| static void new_width(struct linereader *cmfile, struct charmap_t *result, |
| const char *from, const char *to, |
| unsigned long int width) { |
| struct charseq *from_val; |
| struct charseq *to_val; |
| |
| from_val = charmap_find_value(result, from, strlen(from)); |
| if (from_val == NULL) { |
| lr_error(cmfile, "unknown character `%s'", from); |
| return; |
| } |
| |
| if (to == NULL) |
| to_val = from_val; |
| else { |
| to_val = charmap_find_value(result, to, strlen(to)); |
| if (to_val == NULL) { |
| lr_error(cmfile, "unknown character `%s'", to); |
| return; |
| } |
| |
| /* Make sure the number of bytes for the end points of the range |
| is correct. */ |
| if (from_val->nbytes != to_val->nbytes) { |
| lr_error(cmfile, |
| "number of bytes for byte sequence of beginning and end of " |
| "range not the same: %d vs %d", |
| from_val->nbytes, to_val->nbytes); |
| return; |
| } |
| } |
| |
| if (result->nwidth_rules >= result->nwidth_rules_max) { |
| size_t new_size = result->nwidth_rules + 32; |
| struct width_rule *new_rules = (struct width_rule *)obstack_alloc( |
| &result->mem_pool, (new_size * sizeof(struct width_rule))); |
| |
| memcpy(new_rules, result->width_rules, |
| result->nwidth_rules_max * sizeof(struct width_rule)); |
| |
| result->width_rules = new_rules; |
| result->nwidth_rules_max = new_size; |
| } |
| |
| result->width_rules[result->nwidth_rules].from = from_val; |
| result->width_rules[result->nwidth_rules].to = to_val; |
| result->width_rules[result->nwidth_rules].width = (unsigned int)width; |
| ++result->nwidth_rules; |
| } |
| |
| struct charseq *charmap_find_value(const struct charmap_t *cm, const char *name, |
| size_t len) { |
| void *result; |
| |
| return (find_entry((hash_table *)&cm->char_table, name, len, &result) < 0 |
| ? NULL |
| : (struct charseq *)result); |
| } |
| |
| static void charmap_new_char(struct linereader *lr, struct charmap_t *cm, |
| size_t nbytes, unsigned char *bytes, |
| const char *from, const char *to, |
| int decimal_ellipsis, int step) { |
| hash_table *ht = &cm->char_table; |
| hash_table *bt = &cm->byte_table; |
| struct obstack *ob = &cm->mem_pool; |
| char *from_end; |
| char *to_end; |
| const char *cp; |
| int prefix_len, len1, len2; |
| unsigned int from_nr, to_nr, cnt; |
| struct charseq *newp; |
| |
| len1 = strlen(from); |
| |
| if (to == NULL) { |
| newp = (struct charseq *)obstack_alloc(ob, sizeof(*newp) + nbytes); |
| newp->nbytes = nbytes; |
| memcpy(newp->bytes, bytes, nbytes); |
| newp->name = from; |
| |
| newp->ucs4 = UNINITIALIZED_CHAR_VALUE; |
| if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9)) { |
| /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where |
| xxxx and xxxxxxxx are hexadecimal numbers. In this case |
| we use the value of xxxx or xxxxxxxx as the UCS4 value of |
| this character and we don't have to consult the repertoire |
| map. |
| |
| If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx |
| and xxxxxxxx also give the code point in UCS4 but this must |
| be in the private, i.e., unassigned, area. This should be |
| used for characters which do not (yet) have an equivalent |
| in ISO 10646 and Unicode. */ |
| char *endp; |
| |
| errno = 0; |
| newp->ucs4 = strtoul(from + 1, &endp, 16); |
| if (endp - from != len1 || |
| (newp->ucs4 == ~((uint32_t)0) && errno == ERANGE) || |
| newp->ucs4 >= 0x80000000) |
| /* This wasn't successful. Signal this name cannot be a |
| correct UCS value. */ |
| newp->ucs4 = UNINITIALIZED_CHAR_VALUE; |
| } |
| |
| insert_entry(ht, from, len1, newp); |
| insert_entry(bt, newp->bytes, nbytes, newp); |
| /* Please note that it isn't a bug if a symbol is defined more |
| than once. All later definitions are simply discarded. */ |
| return; |
| } |
| |
| /* We have a range: the names must have names with equal prefixes |
| and an equal number of digits, where the second number is greater |
| or equal than the first. */ |
| len2 = strlen(to); |
| |
| if (len1 != len2) { |
| illegal_range: |
| lr_error(lr, "invalid names for character range"); |
| return; |
| } |
| |
| cp = &from[len1 - 1]; |
| if (decimal_ellipsis) |
| while (isdigit(*cp) && cp >= from) --cp; |
| else |
| while (isxdigit(*cp) && cp >= from) { |
| if (!isdigit(*cp) && !isupper(*cp)) |
| lr_error(lr, |
| "hexadecimal range format should use only capital characters"); |
| --cp; |
| } |
| |
| prefix_len = (cp - from) + 1; |
| |
| if (cp == &from[len1 - 1] || strncmp(from, to, prefix_len) != 0) |
| goto illegal_range; |
| |
| errno = 0; |
| from_nr = strtoul(&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16); |
| if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE) || |
| ((to_nr = strtoul(&to[prefix_len], &to_end, |
| decimal_ellipsis ? 10 : 16)) == UINT_MAX && |
| errno == ERANGE) || |
| *to_end != '\0') { |
| lr_error(lr, "<%s> and <%s> are invalid names for range", from, to); |
| return; |
| } |
| |
| if (from_nr > to_nr) { |
| lr_error(lr, "upper limit in range is smaller than lower limit"); |
| return; |
| } |
| |
| for (cnt = from_nr; cnt <= to_nr; cnt += step) { |
| char *name_end; |
| obstack_printf(ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X", prefix_len, |
| from, len1 - prefix_len, cnt); |
| obstack_1grow(ob, '\0'); |
| name_end = obstack_finish(ob); |
| |
| newp = (struct charseq *)obstack_alloc(ob, sizeof(*newp) + nbytes); |
| newp->nbytes = nbytes; |
| memcpy(newp->bytes, bytes, nbytes); |
| newp->name = name_end; |
| |
| newp->ucs4 = UNINITIALIZED_CHAR_VALUE; |
| if ((name_end[0] == 'U' || name_end[0] == 'P') && |
| (len1 == 5 || len1 == 9)) { |
| /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where |
| xxxx and xxxxxxxx are hexadecimal numbers. In this case |
| we use the value of xxxx or xxxxxxxx as the UCS4 value of |
| this character and we don't have to consult the repertoire |
| map. |
| |
| If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx |
| and xxxxxxxx also give the code point in UCS4 but this must |
| be in the private, i.e., unassigned, area. This should be |
| used for characters which do not (yet) have an equivalent |
| in ISO 10646 and Unicode. */ |
| char *endp; |
| |
| errno = 0; |
| newp->ucs4 = strtoul(name_end + 1, &endp, 16); |
| if (endp - name_end != len1 || |
| (newp->ucs4 == ~((uint32_t)0) && errno == ERANGE) || |
| newp->ucs4 >= 0x80000000) |
| /* This wasn't successful. Signal this name cannot be a |
| correct UCS value. */ |
| newp->ucs4 = UNINITIALIZED_CHAR_VALUE; |
| } |
| |
| insert_entry(ht, name_end, len1, newp); |
| insert_entry(bt, newp->bytes, nbytes, newp); |
| /* Please note we don't examine the return value since it is no error |
| if we have two definitions for a symbol. */ |
| |
| /* Increment the value in the byte sequence. */ |
| if (++bytes[nbytes - 1] == '\0') { |
| int b = nbytes - 2; |
| |
| do |
| if (b < 0) { |
| lr_error(lr, "resulting bytes for range not representable."); |
| return; |
| } |
| while (++bytes[b--] == 0); |
| } |
| } |
| } |
| |
| struct charseq *charmap_find_symbol(const struct charmap_t *cm, |
| const char *bytes, size_t nbytes) { |
| void *result; |
| |
| return (find_entry((hash_table *)&cm->byte_table, bytes, nbytes, &result) < 0 |
| ? NULL |
| : (struct charseq *)result); |
| } |