| /* |
| tre-parse.c - Regexp parser |
| |
| This software is released under a BSD-style license. |
| See the file LICENSE for details and copyright. |
| |
| */ |
| |
| /* |
| This parser is just a simple recursive descent parser for POSIX.2 |
| regexps. The parser supports both the obsolete default syntax and |
| the "extended" syntax, and some nonstandard extensions. |
| */ |
| |
| |
| #ifdef HAVE_CONFIG_H |
| #include <config.h> |
| #endif /* HAVE_CONFIG_H */ |
| #include <string.h> |
| // #include <assert.h> |
| #include <limits.h> |
| |
| #include "xmalloc.h" |
| #include "tre-mem.h" |
| #include "tre-ast.h" |
| #include "tre-stack.h" |
| #include "tre-parse.h" |
| |
| #define assert(a) R_assert(a) |
| |
| /* Characters with special meanings in regexp syntax. */ |
| #define CHAR_PIPE L'|' |
| #define CHAR_LPAREN L'(' |
| #define CHAR_RPAREN L')' |
| #define CHAR_LBRACE L'{' |
| #define CHAR_RBRACE L'}' |
| #define CHAR_LBRACKET L'[' |
| #define CHAR_RBRACKET L']' |
| #define CHAR_MINUS L'-' |
| #define CHAR_STAR L'*' |
| #define CHAR_QUESTIONMARK L'?' |
| #define CHAR_PLUS L'+' |
| #define CHAR_PERIOD L'.' |
| #define CHAR_COLON L':' |
| #define CHAR_EQUAL L'=' |
| #define CHAR_COMMA L',' |
| #define CHAR_CARET L'^' |
| #define CHAR_DOLLAR L'$' |
| #define CHAR_BACKSLASH L'\\' |
| #define CHAR_HASH L'#' |
| #define CHAR_TILDE L'~' |
| |
| |
| /* Some macros for expanding \w, \s, etc. */ |
| static const struct tre_macro_struct { |
| const char c; |
| const char *expansion; |
| } tre_macros[] = |
| { {'t', "\t"}, {'n', "\n"}, {'r', "\r"}, |
| {'f', "\f"}, {'a', "\a"}, {'e', "\033"}, |
| {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"}, |
| {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"}, |
| { 0, NULL } |
| }; |
| |
| |
| /* Expands a macro delimited by `regex' and `regex_end' to `buf', which |
| must have at least `len' items. Sets buf[0] to zero if the there |
| is no match in `tre_macros'. */ |
| static void |
| tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end, |
| tre_char_t *buf, size_t buf_len) |
| { |
| int i; |
| |
| buf[0] = 0; |
| if (regex >= regex_end) |
| return; |
| |
| for (i = 0; tre_macros[i].expansion; i++) |
| { |
| if (tre_macros[i].c == *regex) |
| { |
| unsigned int j; |
| DPRINT(("Expanding macro '%c' => '%s'\n", |
| tre_macros[i].c, tre_macros[i].expansion)); |
| // R_change: - 1 is r62537 |
| for (j = 0; tre_macros[i].expansion[j] && j < buf_len - 1; j++) |
| buf[j] = tre_macros[i].expansion[j]; |
| buf[j] = 0; |
| break; |
| } |
| } |
| } |
| |
| static reg_errcode_t |
| tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i, |
| tre_ast_node_t ***items) |
| { |
| reg_errcode_t status; |
| tre_ast_node_t **array = *items; |
| /* Allocate more space if necessary. */ |
| if (*i >= *max_i) |
| { |
| tre_ast_node_t **new_items; |
| DPRINT(("out of array space, i = %d\n", *i)); |
| /* If the array is already 1024 items large, give up -- there's |
| probably an error in the regexp (e.g. not a '\0' terminated |
| string and missing ']') */ |
| if (*max_i > 1024) |
| return REG_ESPACE; |
| *max_i *= 2; |
| new_items = xrealloc(array, sizeof(*items) * *max_i); |
| if (new_items == NULL) |
| return REG_ESPACE; |
| *items = array = new_items; |
| } |
| array[*i] = tre_ast_new_literal(mem, min, max, -1); |
| status = array[*i] == NULL ? REG_ESPACE : REG_OK; |
| (*i)++; |
| return status; |
| } |
| |
| |
| /* Expands a character class to character ranges. */ |
| static reg_errcode_t |
| tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items, |
| int *i, int *max_i, int cflags) |
| { |
| reg_errcode_t status = REG_OK; |
| tre_cint_t c; |
| int j, min = -1, max = 0; |
| //assert(TRE_MB_CUR_MAX == 1); It is the ctx->cur_max that matters |
| |
| DPRINT((" expanding class to character ranges\n")); |
| for (j = 0; (j < 256) && (status == REG_OK); j++) |
| { |
| c = (tre_cint_t) j; |
| if (tre_isctype(c, class) |
| || ((cflags & REG_ICASE) |
| && (tre_isctype(tre_tolower(c), class) |
| || tre_isctype(tre_toupper(c), class)))) |
| { |
| if (min < 0) |
| min = c; |
| max = c; |
| } |
| else if (min >= 0) |
| { |
| DPRINT((" range %c (%d) to %c (%d)\n", min, min, max, max)); |
| status = tre_new_item(mem, min, max, i, max_i, items); |
| min = -1; |
| } |
| } |
| if (min >= 0 && status == REG_OK) |
| status = tre_new_item(mem, min, max, i, max_i, items); |
| return status; |
| } |
| |
| |
| static int |
| tre_compare_items(const void *a, const void *b) |
| { |
| const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a; |
| const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b; |
| tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj; |
| long a_min = l_a->code_min, b_min = l_b->code_min; |
| |
| if (a_min < b_min) |
| return -1; |
| else if (a_min > b_min) |
| return 1; |
| else |
| return 0; |
| } |
| |
| #ifndef TRE_USE_SYSTEM_WCTYPE |
| |
| /* isalnum() and the rest may be macros, so wrap them to functions. */ |
| int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); } |
| int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); } |
| |
| #ifdef tre_isascii |
| int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); } |
| #else /* !tre_isascii */ |
| int tre_isascii_func(tre_cint_t c) { return !(c >> 7); } |
| #endif /* !tre_isascii */ |
| |
| #ifdef tre_isblank |
| int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); } |
| #else /* !tre_isblank */ |
| int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); } |
| #endif /* !tre_isblank */ |
| |
| int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); } |
| int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); } |
| int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); } |
| int tre_islower_func(tre_cint_t c) { return tre_islower(c); } |
| int tre_isprint_func(tre_cint_t c) |
| { |
| #ifdef TRE_WCHAR |
| /* Windows has \t as printable via iswprint in all locales. By POSIX |
| and ?regex, we need \t to be non-printable in the C locale, so we |
| cannot use iswprint. By C99, iswprint(L'\t') should be the same as |
| isprint('\t'). In Windows, in C locale, isprint('\t') is false, |
| hence this workaround. */ |
| if (c == L'\t') return isprint('\t'); |
| #endif |
| return tre_isprint(c); /* Windows has \t as printable */ |
| } |
| int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); } |
| int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); } |
| int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); } |
| int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); } |
| |
| struct { |
| char *name; |
| int (*func)(tre_cint_t); |
| } tre_ctype_map[] = { |
| { "alnum", &tre_isalnum_func }, |
| { "alpha", &tre_isalpha_func }, |
| #ifdef tre_isascii |
| { "ascii", &tre_isascii_func }, |
| #endif /* tre_isascii */ |
| #ifdef tre_isblank |
| { "blank", &tre_isblank_func }, |
| #endif /* tre_isblank */ |
| { "cntrl", &tre_iscntrl_func }, |
| { "digit", &tre_isdigit_func }, |
| { "graph", &tre_isgraph_func }, |
| { "lower", &tre_islower_func }, |
| { "print", &tre_isprint_func }, |
| { "punct", &tre_ispunct_func }, |
| { "space", &tre_isspace_func }, |
| { "upper", &tre_isupper_func }, |
| { "xdigit", &tre_isxdigit_func }, |
| { NULL, NULL} |
| }; |
| |
| tre_ctype_t tre_ctype(const char *name) |
| { |
| int i; |
| for (i = 0; tre_ctype_map[i].name != NULL; i++) |
| { |
| if (strcmp(name, tre_ctype_map[i].name) == 0) |
| return tre_ctype_map[i].func; |
| } |
| return (tre_ctype_t)0; |
| } |
| #endif /* !TRE_USE_SYSTEM_WCTYPE */ |
| |
| /* Maximum number of character classes that can occur in a negated bracket |
| expression. */ |
| #define MAX_NEG_CLASSES 64 |
| |
| /* Maximum length of character class names. */ |
| #define MAX_CLASS_NAME |
| |
| #define REST(re) (int)(ctx->re_end - (re)), (re) |
| |
| static reg_errcode_t |
| tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate, |
| tre_ctype_t neg_classes[], int *num_neg_classes, |
| tre_ast_node_t ***items, int *num_items, |
| int *items_size) |
| { |
| const tre_char_t *re = ctx->re; |
| reg_errcode_t status = REG_OK; |
| tre_ctype_t class = (tre_ctype_t)0; |
| int i = *num_items; |
| int max_i = *items_size; |
| int skip; |
| |
| /* Build an array of the items in the bracket expression. */ |
| while (status == REG_OK) |
| { |
| skip = 0; |
| if (re == ctx->re_end) |
| { |
| status = REG_EBRACK; |
| } |
| else if (*re == CHAR_RBRACKET && re > ctx->re) |
| { |
| DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re))); |
| re++; |
| break; |
| } |
| else |
| { |
| tre_cint_t min = 0, max = 0; |
| |
| class = (tre_ctype_t)0; |
| if (re + 2 < ctx->re_end |
| && *(re + 1) == CHAR_MINUS && *(re + 2) != CHAR_RBRACKET) |
| { |
| DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re))); |
| min = *re; |
| max = *(re + 2); |
| re += 3; |
| /* XXX - Should use collation order instead of encoding values |
| in character ranges. */ |
| if (min > max) |
| status = REG_ERANGE; |
| } |
| else if (re + 1 < ctx->re_end |
| && *re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD) |
| status = REG_ECOLLATE; |
| else if (re + 1 < ctx->re_end |
| && *re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL) |
| status = REG_ECOLLATE; |
| else if (re + 1 < ctx->re_end |
| && *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON) |
| { |
| char tmp_str[64]; |
| const tre_char_t *endptr = re + 2; |
| size_t len; |
| DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n", REST(re))); |
| while (endptr < ctx->re_end && *endptr != CHAR_COLON) |
| endptr++; |
| if (endptr != ctx->re_end) |
| { |
| len = MIN(endptr - re - 2, 63); |
| #ifdef TRE_WCHAR |
| { |
| tre_char_t tmp_wcs[64]; |
| wcsncpy(tmp_wcs, re + 2, len); |
| tmp_wcs[len] = L'\0'; |
| #if defined HAVE_WCSRTOMBS |
| { |
| mbstate_t state; |
| const tre_char_t *src = tmp_wcs; |
| memset(&state, '\0', sizeof(state)); |
| len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state); |
| } |
| #elif defined HAVE_WCSTOMBS |
| len = wcstombs(tmp_str, tmp_wcs, 63); |
| #endif /* defined HAVE_WCSTOMBS */ |
| } |
| #else /* !TRE_WCHAR */ |
| strncpy(tmp_str, (const char*)re + 2, len); |
| #endif /* !TRE_WCHAR */ |
| tmp_str[len] = '\0'; |
| DPRINT((" class name: %s\n", tmp_str)); |
| class = tre_ctype(tmp_str); |
| if (!class) |
| status = REG_ECTYPE; |
| /* Optimize character classes for 8 bit character sets. */ |
| if (status == REG_OK && ctx->cur_max == 1) |
| { |
| status = tre_expand_ctype(ctx->mem, class, items, |
| &i, &max_i, ctx->cflags); |
| class = (tre_ctype_t)0; |
| skip = 1; |
| } |
| re = endptr + 2; |
| } |
| else |
| status = REG_ECTYPE; |
| min = 0; |
| max = TRE_CHAR_MAX; |
| } |
| else |
| { |
| DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); |
| if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET |
| && ctx->re != re) |
| /* Two ranges are not allowed to share and endpoint. */ |
| status = REG_ERANGE; |
| min = max = *re++; |
| } |
| |
| if (status != REG_OK) |
| break; |
| |
| if (class && negate) |
| if (*num_neg_classes >= MAX_NEG_CLASSES) |
| status = REG_ESPACE; |
| else |
| neg_classes[(*num_neg_classes)++] = class; |
| else if (!skip) |
| { |
| status = tre_new_item(ctx->mem, min, max, &i, &max_i, items); |
| if (status != REG_OK) |
| break; |
| ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class; |
| } |
| |
| /* Add opposite-case counterpoints if REG_ICASE is present. |
| This is broken if there are more than two "same" characters. */ |
| if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip) |
| { |
| tre_cint_t cmin, ccurr; |
| |
| DPRINT(("adding opposite-case counterpoints\n")); |
| while (min <= max) |
| { |
| if (tre_islower(min)) |
| { |
| cmin = ccurr = tre_toupper(min++); |
| while (tre_islower(min) && tre_toupper(min) == ccurr + 1 |
| && min <= max) |
| ccurr = tre_toupper(min++); |
| status = tre_new_item(ctx->mem, cmin, ccurr, |
| &i, &max_i, items); |
| } |
| else if (tre_isupper(min)) |
| { |
| cmin = ccurr = tre_tolower(min++); |
| while (tre_isupper(min) && tre_tolower(min) == ccurr + 1 |
| && min <= max) |
| ccurr = tre_tolower(min++); |
| status = tre_new_item(ctx->mem, cmin, ccurr, |
| &i, &max_i, items); |
| } |
| else min++; |
| if (status != REG_OK) |
| break; |
| } |
| if (status != REG_OK) |
| break; |
| } |
| } |
| } |
| *num_items = i; |
| *items_size = max_i; |
| ctx->re = re; |
| return status; |
| } |
| |
| static reg_errcode_t |
| tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) |
| { |
| tre_ast_node_t *node = NULL; |
| int negate = 0; |
| reg_errcode_t status = REG_OK; |
| tre_ast_node_t **items, *u, *n; |
| int i = 0, j, max_i = 32, curr_max, curr_min; |
| tre_ctype_t neg_classes[MAX_NEG_CLASSES]; |
| int num_neg_classes = 0; |
| |
| /* Start off with an array of `max_i' elements. */ |
| items = xmalloc(sizeof(*items) * max_i); |
| if (items == NULL) |
| return REG_ESPACE; |
| |
| if (*ctx->re == CHAR_CARET) |
| { |
| DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re))); |
| negate = 1; |
| ctx->re++; |
| } |
| |
| status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes, |
| &items, &i, &max_i); |
| |
| if (status != REG_OK) |
| goto parse_bracket_done; |
| |
| /* Sort the array if we need to negate it. */ |
| if (negate) |
| qsort(items, (unsigned)i, sizeof(*items), tre_compare_items); |
| |
| curr_max = curr_min = 0; |
| /* Build a union of the items in the array, negated if necessary. */ |
| for (j = 0; j < i && status == REG_OK; j++) |
| { |
| int min, max; |
| tre_literal_t *l = items[j]->obj; |
| min = (int) l->code_min; |
| max = (int) l->code_max; |
| |
| DPRINT(("item: %d - %d, class %p, curr_max = %d\n", |
| (int)l->code_min, (int)l->code_max, (void *)l->u.class, curr_max)); |
| |
| if (negate) |
| { |
| if (min < curr_max) |
| { |
| /* Overlap. */ |
| curr_max = MAX(max + 1, curr_max); |
| DPRINT(("overlap, curr_max = %d\n", curr_max)); |
| l = NULL; |
| } |
| else |
| { |
| /* No overlap. */ |
| curr_max = min - 1; |
| if (curr_max >= curr_min) |
| { |
| DPRINT(("no overlap\n")); |
| l->code_min = curr_min; |
| l->code_max = curr_max; |
| } |
| else |
| { |
| DPRINT(("no overlap, zero room\n")); |
| l = NULL; |
| } |
| curr_min = curr_max = max + 1; |
| } |
| } |
| |
| if (l != NULL) |
| { |
| int k; |
| DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max)); |
| l->position = ctx->position; |
| if (num_neg_classes > 0) |
| { |
| l->neg_classes = tre_mem_alloc(ctx->mem, |
| (sizeof(l->neg_classes) |
| * (num_neg_classes + 1))); |
| if (l->neg_classes == NULL) |
| { |
| status = REG_ESPACE; |
| break; |
| } |
| for (k = 0; k < num_neg_classes; k++) |
| l->neg_classes[k] = neg_classes[k]; |
| l->neg_classes[k] = (tre_ctype_t)0; |
| } |
| else |
| l->neg_classes = NULL; |
| if (node == NULL) |
| node = items[j]; |
| else |
| { |
| u = tre_ast_new_union(ctx->mem, node, items[j]); |
| if (u == NULL) |
| status = REG_ESPACE; |
| node = u; |
| } |
| } |
| } |
| |
| if (status != REG_OK) |
| goto parse_bracket_done; |
| |
| if (negate) |
| { |
| int k; |
| DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX)); |
| n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position); |
| if (n == NULL) |
| status = REG_ESPACE; |
| else |
| { |
| tre_literal_t *l = n->obj; |
| if (num_neg_classes > 0) |
| { |
| l->neg_classes = tre_mem_alloc(ctx->mem, |
| (sizeof(l->neg_classes) |
| * (num_neg_classes + 1))); |
| if (l->neg_classes == NULL) |
| { |
| status = REG_ESPACE; |
| goto parse_bracket_done; |
| } |
| for (k = 0; k < num_neg_classes; k++) |
| l->neg_classes[k] = neg_classes[k]; |
| l->neg_classes[k] = (tre_ctype_t)0; |
| } |
| else |
| l->neg_classes = NULL; |
| if (node == NULL) |
| node = n; |
| else |
| { |
| u = tre_ast_new_union(ctx->mem, node, n); |
| if (u == NULL) |
| status = REG_ESPACE; |
| node = u; |
| } |
| } |
| } |
| |
| if (status != REG_OK) |
| goto parse_bracket_done; |
| |
| #ifdef TRE_DEBUG |
| tre_ast_print(node); |
| #endif /* TRE_DEBUG */ |
| |
| parse_bracket_done: |
| xfree(items); |
| ctx->position++; |
| *result = node; |
| return status; |
| } |
| |
| |
| /* Parses a positive decimal integer. Returns -1 if the string does not |
| contain a valid number. */ |
| static int |
| tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end) |
| { |
| int num = -1; |
| const tre_char_t *r = *regex; |
| while (r < regex_end && *r >= L'0' && *r <= L'9') |
| { |
| if (num < 0) |
| num = 0; |
| num = num * 10 + *r - L'0'; |
| r++; |
| } |
| *regex = r; |
| return num; |
| } |
| |
| |
| static reg_errcode_t |
| tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) |
| { |
| int min, max, i; |
| int cost_ins, cost_del, cost_subst, cost_max; |
| int limit_ins, limit_del, limit_subst, limit_err; |
| const tre_char_t *r = ctx->re; |
| const tre_char_t *start; |
| int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0; |
| int approx = 0; |
| int costs_set = 0; |
| int counts_set = 0; |
| |
| cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET; |
| limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET; |
| |
| /* Parse number (minimum repetition count). */ |
| min = -1; |
| if (r < ctx->re_end && *r >= L'0' && *r <= L'9') { |
| DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r))); |
| min = tre_parse_int(&r, ctx->re_end); |
| } |
| |
| /* Parse comma and second number (maximum repetition count). */ |
| max = min; |
| if (r < ctx->re_end && *r == CHAR_COMMA) |
| { |
| r++; |
| DPRINT(("tre_parse: max count: '%.*" STRF "'\n", REST(r))); |
| max = tre_parse_int(&r, ctx->re_end); |
| } |
| |
| /* Check that the repeat counts are sane. */ |
| if ((max >= 0 && min > max) || max > RE_DUP_MAX) |
| return REG_BADBR; |
| |
| |
| /* |
| '{' |
| optionally followed immediately by a number == minimum repcount |
| optionally followed by , then a number == maximum repcount |
| + then a number == maximum insertion count |
| - then a number == maximum deletion count |
| # then a number == maximum substitution count |
| ~ then a number == maximum number of errors |
| Any of +, -, # or ~ without followed by a number means that |
| the maximum count/number of errors is infinite. |
| |
| An equation of the form |
| Xi + Yd + Zs < C |
| can be specified to set costs and the cost limit to a value |
| different from the default value: |
| - X is the cost of an insertion |
| - Y is the cost of a deletion |
| - Z is the cost of a substitution |
| - C is the maximum cost |
| |
| If no count limit or cost is set for an operation, the operation |
| is not allowed at all. |
| */ |
| |
| |
| do { |
| int done; |
| start = r; |
| |
| /* Parse count limit settings */ |
| done = 0; |
| if (!counts_set) |
| while (r + 1 < ctx->re_end && !done) |
| { |
| switch (*r) |
| { |
| case CHAR_PLUS: /* Insert limit */ |
| DPRINT(("tre_parse: ins limit: '%.*" STRF "'\n", REST(r))); |
| r++; |
| limit_ins = tre_parse_int(&r, ctx->re_end); |
| if (limit_ins < 0) |
| limit_ins = INT_MAX; |
| counts_set = 1; |
| break; |
| case CHAR_MINUS: /* Delete limit */ |
| DPRINT(("tre_parse: del limit: '%.*" STRF "'\n", REST(r))); |
| r++; |
| limit_del = tre_parse_int(&r, ctx->re_end); |
| if (limit_del < 0) |
| limit_del = INT_MAX; |
| counts_set = 1; |
| break; |
| case CHAR_HASH: /* Substitute limit */ |
| DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r))); |
| r++; |
| limit_subst = tre_parse_int(&r, ctx->re_end); |
| if (limit_subst < 0) |
| limit_subst = INT_MAX; |
| counts_set = 1; |
| break; |
| case CHAR_TILDE: /* Maximum number of changes */ |
| DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r))); |
| r++; |
| limit_err = tre_parse_int(&r, ctx->re_end); |
| if (limit_err < 0) |
| limit_err = INT_MAX; |
| approx = 1; |
| break; |
| case CHAR_COMMA: |
| r++; |
| break; |
| case L' ': |
| r++; |
| break; |
| case L'}': |
| done = 1; |
| break; |
| default: |
| done = 1; |
| break; |
| } |
| } |
| |
| /* Parse cost restriction equation. */ |
| done = 0; |
| if (!costs_set) |
| while (r + 1 < ctx->re_end && !done) |
| { |
| switch (*r) |
| { |
| case CHAR_PLUS: |
| case L' ': |
| r++; |
| break; |
| case L'<': |
| DPRINT(("tre_parse: max cost: '%.*" STRF "'\n", REST(r))); |
| r++; |
| while (*r == L' ') |
| r++; |
| cost_max = tre_parse_int(&r, ctx->re_end); |
| if (cost_max < 0) |
| cost_max = INT_MAX; |
| else |
| cost_max--; |
| approx = 1; |
| break; |
| case CHAR_COMMA: |
| r++; |
| done = 1; |
| break; |
| default: |
| if (*r >= L'0' && *r <= L'9') |
| { |
| #ifdef TRE_DEBUG |
| const tre_char_t *sr = r; |
| #endif /* TRE_DEBUG */ |
| int cost = tre_parse_int(&r, ctx->re_end); |
| /* XXX - make sure r is not past end. */ |
| switch (*r) |
| { |
| case L'i': /* Insert cost */ |
| DPRINT(("tre_parse: ins cost: '%.*" STRF "'\n", |
| REST(sr))); |
| r++; |
| cost_ins = cost; |
| costs_set = 1; |
| break; |
| case L'd': /* Delete cost */ |
| DPRINT(("tre_parse: del cost: '%.*" STRF "'\n", |
| REST(sr))); |
| r++; |
| cost_del = cost; |
| costs_set = 1; |
| break; |
| case L's': /* Substitute cost */ |
| DPRINT(("tre_parse: subst cost: '%.*" STRF "'\n", |
| REST(sr))); |
| r++; |
| cost_subst = cost; |
| costs_set = 1; |
| break; |
| default: |
| return REG_BADBR; |
| } |
| } |
| else |
| { |
| done = 1; |
| break; |
| } |
| } |
| } |
| } while (start != r); |
| |
| /* Missing }. */ |
| if (r >= ctx->re_end) |
| return REG_EBRACE; |
| |
| /* Empty contents of {}. */ |
| if (r == ctx->re) |
| return REG_BADBR; |
| |
| /* Parse the ending '}' or '\}'.*/ |
| if (ctx->cflags & REG_EXTENDED) |
| { |
| if (r >= ctx->re_end || *r != CHAR_RBRACE) |
| return REG_BADBR; |
| r++; |
| } |
| else |
| { |
| if (r + 1 >= ctx->re_end |
| || *r != CHAR_BACKSLASH |
| || *(r + 1) != CHAR_RBRACE) |
| return REG_BADBR; |
| r += 2; |
| } |
| |
| |
| /* Parse trailing '?' marking minimal repetition. */ |
| if (r < ctx->re_end) |
| { |
| if (*r == CHAR_QUESTIONMARK) |
| { |
| minimal = !(ctx->cflags & REG_UNGREEDY); |
| r++; |
| } |
| else if (*r == CHAR_STAR || *r == CHAR_PLUS) |
| { |
| /* These are reserved for future extensions. */ |
| return REG_BADRPT; |
| } |
| } |
| |
| /* Create the AST node(s). */ |
| if (min == 0 && max == 0) |
| { |
| *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); |
| if (*result == NULL) |
| return REG_ESPACE; |
| } |
| else |
| { |
| if (min < 0 && max < 0) |
| /* Only approximate parameters set, no repetitions. */ |
| min = max = 1; |
| |
| *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal); |
| if (!*result) |
| return REG_ESPACE; |
| |
| /* If approximate matching parameters are set, add them to the |
| iteration node. */ |
| if (approx || costs_set || counts_set) |
| { |
| int *params; |
| tre_iteration_t *iter = (*result)->obj; |
| |
| if (costs_set || counts_set) |
| { |
| if (limit_ins == TRE_PARAM_UNSET) |
| { |
| if (cost_ins == TRE_PARAM_UNSET) |
| limit_ins = 0; |
| else |
| limit_ins = INT_MAX; |
| } |
| |
| if (limit_del == TRE_PARAM_UNSET) |
| { |
| if (cost_del == TRE_PARAM_UNSET) |
| limit_del = 0; |
| else |
| limit_del = INT_MAX; |
| } |
| |
| if (limit_subst == TRE_PARAM_UNSET) |
| { |
| if (cost_subst == TRE_PARAM_UNSET) |
| limit_subst = 0; |
| else |
| limit_subst = INT_MAX; |
| } |
| } |
| |
| if (cost_max == TRE_PARAM_UNSET) |
| cost_max = INT_MAX; |
| if (limit_err == TRE_PARAM_UNSET) |
| limit_err = INT_MAX; |
| |
| ctx->have_approx = 1; |
| params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST); |
| if (!params) |
| return REG_ESPACE; |
| for (i = 0; i < TRE_PARAM_LAST; i++) |
| params[i] = TRE_PARAM_UNSET; |
| params[TRE_PARAM_COST_INS] = cost_ins; |
| params[TRE_PARAM_COST_DEL] = cost_del; |
| params[TRE_PARAM_COST_SUBST] = cost_subst; |
| params[TRE_PARAM_COST_MAX] = cost_max; |
| params[TRE_PARAM_MAX_INS] = limit_ins; |
| params[TRE_PARAM_MAX_DEL] = limit_del; |
| params[TRE_PARAM_MAX_SUBST] = limit_subst; |
| params[TRE_PARAM_MAX_ERR] = limit_err; |
| iter->params = params; |
| } |
| } |
| |
| DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], " |
| "limits [%d,%d,%d, total %d]\n", |
| min, max, cost_ins, cost_del, cost_subst, cost_max, |
| limit_ins, limit_del, limit_subst, limit_err)); |
| |
| |
| ctx->re = r; |
| return REG_OK; |
| } |
| |
| typedef enum { |
| PARSE_RE = 0, |
| PARSE_ATOM, |
| PARSE_MARK_FOR_SUBMATCH, |
| PARSE_BRANCH, |
| PARSE_PIECE, |
| PARSE_CATENATION, |
| PARSE_POST_CATENATION, |
| PARSE_UNION, |
| PARSE_POST_UNION, |
| PARSE_POSTFIX, |
| PARSE_RESTORE_CFLAGS |
| } tre_parse_re_stack_symbol_t; |
| |
| |
| reg_errcode_t |
| tre_parse(tre_parse_ctx_t *ctx) |
| { |
| tre_ast_node_t *result = NULL; |
| tre_parse_re_stack_symbol_t symbol; |
| reg_errcode_t status = REG_OK; |
| tre_stack_t *stack = ctx->stack; |
| int bottom = tre_stack_num_objects(stack); |
| int depth = 0; |
| int temporary_cflags = 0; |
| |
| DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n", |
| ctx->len, ctx->re, ctx->len)); |
| |
| if (!ctx->nofirstsub) |
| { |
| STACK_PUSH(stack, int, ctx->submatch_id); |
| STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH); |
| ctx->submatch_id++; |
| } |
| STACK_PUSH(stack, int, PARSE_RE); |
| ctx->re_start = ctx->re; |
| ctx->re_end = ctx->re + ctx->len; |
| |
| |
| /* The following is basically just a recursive descent parser. I use |
| an explicit stack instead of recursive functions mostly because of |
| two reasons: compatibility with systems which have an overflowable |
| call stack, and efficiency (both in lines of code and speed). */ |
| while (tre_stack_num_objects(stack) > bottom && status == REG_OK) |
| { |
| if (status != REG_OK) |
| break; |
| symbol = tre_stack_pop_int(stack); |
| switch (symbol) |
| { |
| case PARSE_RE: |
| /* Parse a full regexp. A regexp is one or more branches, |
| separated by the union operator `|'. */ |
| #ifdef REG_LITERAL |
| if (!(ctx->cflags & REG_LITERAL) |
| && ctx->cflags & REG_EXTENDED) |
| #endif /* REG_LITERAL */ |
| STACK_PUSHX(stack, int, PARSE_UNION); |
| STACK_PUSHX(stack, int, PARSE_BRANCH); |
| break; |
| |
| case PARSE_BRANCH: |
| /* Parse a branch. A branch is one or more pieces, concatenated. |
| A piece is an atom possibly followed by a postfix operator. */ |
| STACK_PUSHX(stack, int, PARSE_CATENATION); |
| STACK_PUSHX(stack, int, PARSE_PIECE); |
| break; |
| |
| case PARSE_PIECE: |
| /* Parse a piece. A piece is an atom possibly followed by one |
| or more postfix operators. */ |
| #ifdef REG_LITERAL |
| if (!(ctx->cflags & REG_LITERAL)) |
| #endif /* REG_LITERAL */ |
| STACK_PUSHX(stack, int, PARSE_POSTFIX); |
| STACK_PUSHX(stack, int, PARSE_ATOM); |
| break; |
| |
| case PARSE_CATENATION: |
| /* If the expression has not ended, parse another piece. */ |
| { |
| tre_char_t c; |
| if (ctx->re >= ctx->re_end) |
| break; |
| c = *ctx->re; |
| #ifdef REG_LITERAL |
| if (!(ctx->cflags & REG_LITERAL)) |
| { |
| #endif /* REG_LITERAL */ |
| if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) |
| break; |
| if ((ctx->cflags & REG_EXTENDED |
| && c == CHAR_RPAREN && depth > 0) |
| || (!(ctx->cflags & REG_EXTENDED) |
| && (c == CHAR_BACKSLASH |
| && *(ctx->re + 1) == CHAR_RPAREN))) |
| { |
| if (!(ctx->cflags & REG_EXTENDED) && depth == 0) |
| status = REG_EPAREN; |
| DPRINT(("tre_parse: group end: '%.*" STRF "'\n", |
| REST(ctx->re))); |
| depth--; |
| if (!(ctx->cflags & REG_EXTENDED)) |
| ctx->re += 2; |
| break; |
| } |
| #ifdef REG_LITERAL |
| } |
| #endif /* REG_LITERAL */ |
| |
| #ifdef REG_RIGHT_ASSOC |
| if (ctx->cflags & REG_RIGHT_ASSOC) |
| { |
| /* Right associative concatenation. */ |
| STACK_PUSHX(stack, voidptr, result); |
| STACK_PUSHX(stack, int, PARSE_POST_CATENATION); |
| STACK_PUSHX(stack, int, PARSE_CATENATION); |
| STACK_PUSHX(stack, int, PARSE_PIECE); |
| } |
| else |
| #endif /* REG_RIGHT_ASSOC */ |
| { |
| /* Default case, left associative concatenation. */ |
| STACK_PUSHX(stack, int, PARSE_CATENATION); |
| STACK_PUSHX(stack, voidptr, result); |
| STACK_PUSHX(stack, int, PARSE_POST_CATENATION); |
| STACK_PUSHX(stack, int, PARSE_PIECE); |
| } |
| break; |
| } |
| |
| case PARSE_POST_CATENATION: |
| { |
| tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); |
| tre_ast_node_t *tmp_node; |
| tmp_node = tre_ast_new_catenation(ctx->mem, tree, result); |
| if (!tmp_node) |
| return REG_ESPACE; |
| result = tmp_node; |
| break; |
| } |
| |
| case PARSE_UNION: |
| if (ctx->re >= ctx->re_end) |
| break; |
| #ifdef REG_LITERAL |
| if (ctx->cflags & REG_LITERAL) |
| break; |
| #endif /* REG_LITERAL */ |
| switch (*ctx->re) |
| { |
| case CHAR_PIPE: |
| DPRINT(("tre_parse: union: '%.*" STRF "'\n", |
| REST(ctx->re))); |
| STACK_PUSHX(stack, int, PARSE_UNION); |
| STACK_PUSHX(stack, voidptr, result); |
| STACK_PUSHX(stack, int, PARSE_POST_UNION); |
| STACK_PUSHX(stack, int, PARSE_BRANCH); |
| ctx->re++; |
| break; |
| |
| case CHAR_RPAREN: |
| ctx->re++; |
| break; |
| |
| default: |
| break; |
| } |
| break; |
| |
| case PARSE_POST_UNION: |
| { |
| tre_ast_node_t *tmp_node; |
| tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); |
| tmp_node = tre_ast_new_union(ctx->mem, tree, result); |
| if (!tmp_node) |
| return REG_ESPACE; |
| result = tmp_node; |
| break; |
| } |
| |
| case PARSE_POSTFIX: |
| /* Parse postfix operators. */ |
| if (ctx->re >= ctx->re_end) |
| break; |
| #ifdef REG_LITERAL |
| if (ctx->cflags & REG_LITERAL) |
| break; |
| #endif /* REG_LITERAL */ |
| switch (*ctx->re) |
| { |
| case CHAR_PLUS: |
| case CHAR_QUESTIONMARK: |
| if (!(ctx->cflags & REG_EXTENDED)) |
| break; |
| /*FALLTHROUGH*/ |
| case CHAR_STAR: |
| { |
| tre_ast_node_t *tmp_node; |
| int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0; |
| int rep_min = 0; |
| int rep_max = -1; |
| #ifdef TRE_DEBUG |
| const tre_char_t *tmp_re; |
| #endif |
| |
| if (*ctx->re == CHAR_PLUS) |
| rep_min = 1; |
| if (*ctx->re == CHAR_QUESTIONMARK) |
| rep_max = 1; |
| #ifdef TRE_DEBUG |
| tmp_re = ctx->re; |
| #endif |
| |
| if (ctx->re + 1 < ctx->re_end) |
| { |
| if (*(ctx->re + 1) == CHAR_QUESTIONMARK) |
| { |
| minimal = !(ctx->cflags & REG_UNGREEDY); |
| ctx->re++; |
| } |
| else if (*(ctx->re + 1) == CHAR_STAR |
| || *(ctx->re + 1) == CHAR_PLUS) |
| { |
| /* These are reserved for future extensions. */ |
| return REG_BADRPT; |
| } |
| } |
| |
| DPRINT(("tre_parse: %s star: '%.*" STRF "'\n", |
| minimal ? " minimal" : "greedy", REST(tmp_re))); |
| ctx->re++; |
| tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max, |
| minimal); |
| if (tmp_node == NULL) |
| return REG_ESPACE; |
| result = tmp_node; |
| STACK_PUSHX(stack, int, PARSE_POSTFIX); |
| } |
| break; |
| |
| case CHAR_BACKSLASH: |
| /* "\{" is special without REG_EXTENDED */ |
| if (!(ctx->cflags & REG_EXTENDED) |
| && ctx->re + 1 < ctx->re_end |
| && *(ctx->re + 1) == CHAR_LBRACE) |
| { |
| ctx->re++; |
| goto parse_brace; |
| } |
| else |
| break; |
| |
| case CHAR_LBRACE: |
| /* "{" is literal without REG_EXTENDED */ |
| if (!(ctx->cflags & REG_EXTENDED)) |
| break; |
| |
| parse_brace: |
| DPRINT(("tre_parse: bound: '%.*" STRF "'\n", |
| REST(ctx->re))); |
| ctx->re++; |
| |
| status = tre_parse_bound(ctx, &result); |
| if (status != REG_OK) |
| return status; |
| STACK_PUSHX(stack, int, PARSE_POSTFIX); |
| break; |
| } |
| break; |
| |
| case PARSE_ATOM: |
| /* Parse an atom. An atom is a regular expression enclosed in `()', |
| an empty set of `()', a bracket expression, `.', `^', `$', |
| a `\' followed by a character, or a single character. */ |
| |
| /* End of regexp? (empty string). */ |
| if (ctx->re >= ctx->re_end) |
| goto parse_literal; |
| |
| #ifdef REG_LITERAL |
| if (ctx->cflags & REG_LITERAL) |
| goto parse_literal; |
| #endif /* REG_LITERAL */ |
| |
| switch (*ctx->re) |
| { |
| case CHAR_LPAREN: /* parenthesized subexpression */ |
| |
| /* Handle "(?...)" extensions. They work in a way similar |
| to Perls corresponding extensions. */ |
| if (ctx->cflags & REG_EXTENDED |
| && *(ctx->re + 1) == CHAR_QUESTIONMARK) |
| { |
| int new_cflags = ctx->cflags; |
| int bit = 1; |
| DPRINT(("tre_parse: extension: '%.*" STRF "\n", |
| REST(ctx->re))); |
| ctx->re += 2; |
| while (/*CONSTCOND*/(void)1,1) |
| { |
| if (*ctx->re == L'i') |
| { |
| DPRINT(("tre_parse: icase: '%.*" STRF "\n", |
| REST(ctx->re))); |
| if (bit) |
| new_cflags |= REG_ICASE; |
| else |
| new_cflags &= ~REG_ICASE; |
| ctx->re++; |
| } |
| else if (*ctx->re == L'n') |
| { |
| DPRINT(("tre_parse: newline: '%.*" STRF "\n", |
| REST(ctx->re))); |
| if (bit) |
| new_cflags |= REG_NEWLINE; |
| else |
| new_cflags &= ~REG_NEWLINE; |
| ctx->re++; |
| } |
| #ifdef REG_RIGHT_ASSOC |
| else if (*ctx->re == L'r') |
| { |
| DPRINT(("tre_parse: right assoc: '%.*" STRF "\n", |
| REST(ctx->re))); |
| if (bit) |
| new_cflags |= REG_RIGHT_ASSOC; |
| else |
| new_cflags &= ~REG_RIGHT_ASSOC; |
| ctx->re++; |
| } |
| #endif /* REG_RIGHT_ASSOC */ |
| #ifdef REG_UNGREEDY |
| else if (*ctx->re == L'U') |
| { |
| DPRINT(("tre_parse: ungreedy: '%.*" STRF "\n", |
| REST(ctx->re))); |
| if (bit) |
| new_cflags |= REG_UNGREEDY; |
| else |
| new_cflags &= ~REG_UNGREEDY; |
| ctx->re++; |
| } |
| #endif /* REG_UNGREEDY */ |
| else if (*ctx->re == CHAR_MINUS) |
| { |
| DPRINT(("tre_parse: turn off: '%.*" STRF "\n", |
| REST(ctx->re))); |
| ctx->re++; |
| bit = 0; |
| } |
| else if (*ctx->re == CHAR_COLON) |
| { |
| DPRINT(("tre_parse: no group: '%.*" STRF "\n", |
| REST(ctx->re))); |
| ctx->re++; |
| depth++; |
| break; |
| } |
| else if (*ctx->re == CHAR_HASH) |
| { |
| DPRINT(("tre_parse: comment: '%.*" STRF "\n", |
| REST(ctx->re))); |
| /* A comment can contain any character except a |
| right parenthesis */ |
| while (*ctx->re != CHAR_RPAREN |
| && ctx->re < ctx->re_end) |
| ctx->re++; |
| if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end) |
| { |
| ctx->re++; |
| break; |
| } |
| else |
| return REG_BADPAT; |
| } |
| else if (*ctx->re == CHAR_RPAREN) |
| { |
| ctx->re++; |
| break; |
| } |
| else |
| return REG_BADPAT; |
| } |
| |
| /* Turn on the cflags changes for the rest of the |
| enclosing group. */ |
| STACK_PUSHX(stack, int, ctx->cflags); |
| STACK_PUSHX(stack, int, PARSE_RESTORE_CFLAGS); |
| STACK_PUSHX(stack, int, PARSE_RE); |
| ctx->cflags = new_cflags; |
| break; |
| } |
| |
| if (ctx->cflags & REG_EXTENDED |
| || (ctx->re > ctx->re_start |
| && *(ctx->re - 1) == CHAR_BACKSLASH)) |
| { |
| depth++; |
| if (ctx->re + 2 < ctx->re_end |
| && *(ctx->re + 1) == CHAR_QUESTIONMARK |
| && *(ctx->re + 2) == CHAR_COLON) |
| { |
| DPRINT(("tre_parse: group begin: '%.*" STRF |
| "', no submatch\n", REST(ctx->re))); |
| /* Don't mark for submatching. */ |
| ctx->re += 3; |
| STACK_PUSHX(stack, int, PARSE_RE); |
| } |
| else |
| { |
| DPRINT(("tre_parse: group begin: '%.*" STRF |
| "', submatch %d\n", REST(ctx->re), |
| ctx->submatch_id)); |
| ctx->re++; |
| /* First parse a whole RE, then mark the resulting tree |
| for submatching. */ |
| STACK_PUSHX(stack, int, ctx->submatch_id); |
| STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); |
| STACK_PUSHX(stack, int, PARSE_RE); |
| ctx->submatch_id++; |
| } |
| } |
| else |
| goto parse_literal; |
| break; |
| |
| case CHAR_RPAREN: /* end of current subexpression */ |
| if ((ctx->cflags & REG_EXTENDED && depth > 0) |
| || (!(ctx->cflags & REG_EXTENDED) && ctx->re > ctx->re_start |
| && *(ctx->re - 1) == CHAR_BACKSLASH)) |
| { |
| DPRINT(("tre_parse: empty: '%.*" STRF "'\n", |
| REST(ctx->re))); |
| /* We were expecting an atom, but instead the current |
| subexpression was closed. POSIX leaves the meaning of |
| this to be implementation-defined. We interpret this as |
| an empty expression (which matches an empty string). */ |
| result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); |
| if (result == NULL) |
| return REG_ESPACE; |
| if (!(ctx->cflags & REG_EXTENDED)) |
| ctx->re--; |
| } |
| else |
| goto parse_literal; |
| break; |
| |
| case CHAR_LBRACKET: /* bracket expression */ |
| DPRINT(("tre_parse: bracket: '%.*" STRF "'\n", |
| REST(ctx->re))); |
| ctx->re++; |
| status = tre_parse_bracket(ctx, &result); |
| if (status != REG_OK) |
| return status; |
| break; |
| |
| case CHAR_BACKSLASH: |
| /* If this is "\(" or "\)" chew off the backslash and |
| try again. */ |
| if (!(ctx->cflags & REG_EXTENDED) |
| && ctx->re + 1 < ctx->re_end |
| && (*(ctx->re + 1) == CHAR_LPAREN |
| || *(ctx->re + 1) == CHAR_RPAREN)) |
| { |
| ctx->re++; |
| STACK_PUSHX(stack, int, PARSE_ATOM); |
| break; |
| } |
| |
| /* If a macro is used, parse the expanded macro recursively. */ |
| { |
| tre_char_t buf[64]; |
| tre_expand_macro(ctx->re + 1, ctx->re_end, |
| buf, elementsof(buf)); |
| if (buf[0] != 0) |
| { |
| tre_parse_ctx_t subctx; |
| memcpy(&subctx, ctx, sizeof(subctx)); |
| subctx.re = buf; |
| subctx.len = tre_strlen(buf); |
| subctx.nofirstsub = 1; |
| status = tre_parse(&subctx); |
| if (status != REG_OK) |
| return status; |
| ctx->re += 2; |
| ctx->position = subctx.position; |
| result = subctx.result; |
| break; |
| } |
| } |
| |
| if (ctx->re + 1 >= ctx->re_end) |
| /* Trailing backslash. */ |
| return REG_EESCAPE; |
| |
| #ifdef REG_LITERAL |
| if (*(ctx->re + 1) == L'Q') |
| { |
| DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n", |
| REST(ctx->re))); |
| ctx->cflags |= REG_LITERAL; |
| temporary_cflags |= REG_LITERAL; |
| ctx->re += 2; |
| STACK_PUSHX(stack, int, PARSE_ATOM); |
| break; |
| } |
| #endif /* REG_LITERAL */ |
| |
| DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re))); |
| ctx->re++; |
| switch (*ctx->re) |
| { |
| case L'b': |
| result = tre_ast_new_literal(ctx->mem, ASSERTION, |
| ASSERT_AT_WB, -1); |
| ctx->re++; |
| break; |
| case L'B': |
| result = tre_ast_new_literal(ctx->mem, ASSERTION, |
| ASSERT_AT_WB_NEG, -1); |
| ctx->re++; |
| break; |
| case L'<': |
| result = tre_ast_new_literal(ctx->mem, ASSERTION, |
| ASSERT_AT_BOW, -1); |
| ctx->re++; |
| break; |
| case L'>': |
| result = tre_ast_new_literal(ctx->mem, ASSERTION, |
| ASSERT_AT_EOW, -1); |
| ctx->re++; |
| break; |
| case L'x': |
| ctx->re++; |
| if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end) |
| { |
| /* 8 bit hex char. */ |
| char tmp[3] = {0, 0, 0}; |
| long val; |
| DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n", |
| REST(ctx->re - 2))); |
| |
| if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end) |
| { |
| tmp[0] = (char)ctx->re[0]; |
| ctx->re++; |
| } |
| if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end) |
| { |
| tmp[1] = (char)ctx->re[0]; |
| ctx->re++; |
| } |
| val = strtol(tmp, NULL, 16); |
| result = tre_ast_new_literal(ctx->mem, (int)val, |
| (int)val, ctx->position); |
| ctx->position++; |
| break; |
| } |
| else if (ctx->re < ctx->re_end) |
| { |
| /* Wide char. */ |
| char tmp[32]; |
| long val; |
| int i = 0; |
| ctx->re++; |
| while (ctx->re_end - ctx->re >= 0) |
| { |
| if (ctx->re[0] == CHAR_RBRACE) |
| break; |
| if (tre_isxdigit(ctx->re[0])) |
| { |
| tmp[i] = (char)ctx->re[0]; |
| i++; |
| ctx->re++; |
| continue; |
| } |
| return REG_EBRACE; |
| } |
| ctx->re++; |
| tmp[i] = 0; |
| val = strtol(tmp, NULL, 16); |
| result = tre_ast_new_literal(ctx->mem, (int)val, (int)val, |
| ctx->position); |
| ctx->position++; |
| break; |
| } |
| /*FALLTHROUGH*/ |
| |
| default: |
| if (tre_isdigit(*ctx->re)) |
| { |
| /* Back reference. */ |
| int val = *ctx->re - L'0'; |
| DPRINT(("tre_parse: backref: '%.*" STRF "'\n", |
| REST(ctx->re - 1))); |
| result = tre_ast_new_literal(ctx->mem, BACKREF, val, |
| ctx->position); |
| if (result == NULL) |
| return REG_ESPACE; |
| ctx->position++; |
| ctx->max_backref = MAX(val, ctx->max_backref); |
| ctx->re++; |
| } |
| else |
| { |
| /* Escaped character. */ |
| DPRINT(("tre_parse: escaped: '%.*" STRF "'\n", |
| REST(ctx->re - 1))); |
| result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, |
| ctx->position); |
| ctx->position++; |
| ctx->re++; |
| } |
| break; |
| } |
| if (result == NULL) |
| return REG_ESPACE; |
| break; |
| |
| case CHAR_PERIOD: /* the any-symbol */ |
| DPRINT(("tre_parse: any: '%.*" STRF "'\n", |
| REST(ctx->re))); |
| if (ctx->cflags & REG_NEWLINE) |
| { |
| tre_ast_node_t *tmp1; |
| tre_ast_node_t *tmp2; |
| tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1, |
| ctx->position); |
| if (!tmp1) |
| return REG_ESPACE; |
| tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX, |
| ctx->position + 1); |
| if (!tmp2) |
| return REG_ESPACE; |
| result = tre_ast_new_union(ctx->mem, tmp1, tmp2); |
| if (!result) |
| return REG_ESPACE; |
| ctx->position += 2; |
| } |
| else |
| { |
| result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, |
| ctx->position); |
| if (!result) |
| return REG_ESPACE; |
| ctx->position++; |
| } |
| ctx->re++; |
| break; |
| |
| case CHAR_CARET: /* beginning of line assertion */ |
| /* '^' has a special meaning everywhere in EREs, and in the |
| beginning of the RE and after \( is BREs. */ |
| if (ctx->cflags & REG_EXTENDED |
| || (ctx->re - 2 >= ctx->re_start |
| && *(ctx->re - 2) == CHAR_BACKSLASH |
| && *(ctx->re - 1) == CHAR_LPAREN) |
| || ctx->re == ctx->re_start) |
| { |
| DPRINT(("tre_parse: BOL: '%.*" STRF "'\n", |
| REST(ctx->re))); |
| result = tre_ast_new_literal(ctx->mem, ASSERTION, |
| ASSERT_AT_BOL, -1); |
| if (result == NULL) |
| return REG_ESPACE; |
| ctx->re++; |
| } |
| else |
| goto parse_literal; |
| break; |
| |
| case CHAR_DOLLAR: /* end of line assertion. */ |
| /* '$' is special everywhere in EREs, and in the end of the |
| string and before \) is BREs. */ |
| if (ctx->cflags & REG_EXTENDED |
| || (ctx->re + 2 < ctx->re_end |
| && *(ctx->re + 1) == CHAR_BACKSLASH |
| && *(ctx->re + 2) == CHAR_RPAREN) |
| || ctx->re + 1 == ctx->re_end) |
| { |
| DPRINT(("tre_parse: EOL: '%.*" STRF "'\n", |
| REST(ctx->re))); |
| result = tre_ast_new_literal(ctx->mem, ASSERTION, |
| ASSERT_AT_EOL, -1); |
| if (result == NULL) |
| return REG_ESPACE; |
| ctx->re++; |
| } |
| else |
| goto parse_literal; |
| break; |
| |
| default: |
| parse_literal: |
| |
| if (temporary_cflags && ctx->re + 1 < ctx->re_end |
| && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E') |
| { |
| DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n", |
| REST(ctx->re))); |
| ctx->cflags &= ~temporary_cflags; |
| temporary_cflags = 0; |
| ctx->re += 2; |
| STACK_PUSHX(stack, int, PARSE_PIECE); |
| break; |
| } |
| |
| |
| /* We are expecting an atom. If the subexpression (or the whole |
| regexp) ends here, we interpret it as an empty expression |
| (which matches an empty string). */ |
| if ( |
| #ifdef REG_LITERAL |
| !(ctx->cflags & REG_LITERAL) && |
| #endif /* REG_LITERAL */ |
| (ctx->re >= ctx->re_end |
| || *ctx->re == CHAR_STAR |
| || (ctx->cflags & REG_EXTENDED |
| && (*ctx->re == CHAR_PIPE |
| || *ctx->re == CHAR_LBRACE |
| || *ctx->re == CHAR_PLUS |
| || *ctx->re == CHAR_QUESTIONMARK)) |
| /* Test for "\)" in BRE mode. */ |
| || (!(ctx->cflags & REG_EXTENDED) |
| && ctx->re + 1 < ctx->re_end |
| && *ctx->re == CHAR_BACKSLASH |
| && *(ctx->re + 1) == CHAR_LBRACE))) |
| { |
| DPRINT(("tre_parse: empty: '%.*" STRF "'\n", |
| REST(ctx->re))); |
| result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); |
| if (!result) |
| return REG_ESPACE; |
| break; |
| } |
| |
| /* R change */ |
| if ((ctx->cflags & REG_LITERAL) && !ctx->re[0]) { |
| DPRINT(("tre_parse: literal empty: '%.*" STRF "'\n", |
| REST(ctx->re))); |
| result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); |
| if (!result) |
| return REG_ESPACE; |
| break; |
| } |
| /* end R change */ |
| |
| DPRINT(("tre_parse: literal: '%.*" STRF "'\n", |
| REST(ctx->re))); |
| /* Note that we can't use an tre_isalpha() test here, since there |
| may be characters which are alphabetic but neither upper or |
| lower case. */ |
| if (ctx->cflags & REG_ICASE |
| && (tre_isupper(*ctx->re) || tre_islower(*ctx->re))) |
| { |
| tre_ast_node_t *tmp1; |
| tre_ast_node_t *tmp2; |
| |
| /* XXX - Can there be more than one opposite-case |
| counterpoints for some character in some locale? Or |
| more than two characters which all should be regarded |
| the same character if case is ignored? If yes, there |
| does not seem to be a portable way to detect it. I guess |
| that at least for multi-character collating elements there |
| could be several opposite-case counterpoints, but they |
| cannot be supported portably anyway. */ |
| tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re), |
| tre_toupper(*ctx->re), |
| ctx->position); |
| if (!tmp1) |
| return REG_ESPACE; |
| tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re), |
| tre_tolower(*ctx->re), |
| ctx->position); |
| if (!tmp2) |
| return REG_ESPACE; |
| result = tre_ast_new_union(ctx->mem, tmp1, tmp2); |
| if (!result) |
| return REG_ESPACE; |
| } |
| else |
| { |
| result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, |
| ctx->position); |
| if (!result) |
| return REG_ESPACE; |
| } |
| ctx->position++; |
| ctx->re++; |
| break; |
| } |
| break; |
| |
| case PARSE_MARK_FOR_SUBMATCH: |
| { |
| int submatch_id = tre_stack_pop_int(stack); |
| |
| if (result->submatch_id >= 0) |
| { |
| tre_ast_node_t *n, *tmp_node; |
| n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); |
| if (n == NULL) |
| return REG_ESPACE; |
| tmp_node = tre_ast_new_catenation(ctx->mem, n, result); |
| if (tmp_node == NULL) |
| return REG_ESPACE; |
| tmp_node->num_submatches = result->num_submatches; |
| result = tmp_node; |
| } |
| result->submatch_id = submatch_id; |
| result->num_submatches++; |
| break; |
| } |
| |
| case PARSE_RESTORE_CFLAGS: |
| ctx->cflags = tre_stack_pop_int(stack); |
| break; |
| |
| default: |
| assert(0); |
| break; |
| } |
| } |
| |
| /* Check for missing closing parentheses. */ |
| if (depth > 0) |
| return REG_EPAREN; |
| |
| if (status == REG_OK) |
| ctx->result = result; |
| |
| return status; |
| } |
| |
| /* EOF */ |