| /* source: nestlex.c */ |
| /* Copyright Gerhard Rieger and contributors (see file CHANGES) */ |
| /* Published under the GNU General Public License V.2, see file COPYING */ |
| |
| /* a function for lexical scanning of nested character patterns */ |
| |
| #include "config.h" |
| #include "mytypes.h" |
| |
| #include "sysincludes.h" |
| |
| static int _nestlex(const char **addr, |
| char **token, |
| ptrdiff_t *len, |
| const char *ends[], |
| const char *hquotes[], |
| const char *squotes[], |
| const char *nests[], |
| bool dropquotes, |
| bool c_esc, |
| bool html_esc |
| ); |
| |
| /* sub: scan a string and copy its value to output string |
| end scanning when an unescaped, unnested string from ends array is found |
| does not copy the end pattern |
| does not write a trailing \0 to token |
| allows escaping with \ and quoting (\ and quotes are removed) |
| allows nesting with div. parens |
| returns -1 if out string was too small |
| returns 1 if addr ended unexpectedly |
| returns 0 if token could be extracted successfully |
| */ |
| int nestlex(const char **addr, /* input string; aft points to end token */ |
| char **token, /* output token; aft points to first unwritten |
| char (caller might want to set it to \0) */ |
| size_t *len, /* remaining bytes in token space (incl. \0) */ |
| const char *ends[], /* list of end strings */ |
| const char *hquotes[],/* list of strings that quote (hard qu.) */ |
| const char *squotes[],/* list of strings that quote softly */ |
| const char *nests[],/* list of strings that start nesting; |
| every second one is matching end */ |
| bool dropquotes, /* drop the outermost quotes */ |
| bool c_esc, /* solve C char escapes: \n \t \0 etc */ |
| bool html_esc /* solve HTML char escapes: %0d %08 etc */ |
| ) { |
| return |
| _nestlex(addr, token, (ptrdiff_t *)len, ends, hquotes, squotes, nests, |
| dropquotes, c_esc, html_esc); |
| } |
| |
| static int _nestlex(const char **addr, |
| char **token, |
| ptrdiff_t *len, |
| const char *ends[], |
| const char *hquotes[], |
| const char *squotes[], |
| const char *nests[], |
| bool dropquotes, |
| bool c_esc, |
| bool html_esc |
| ) { |
| const char *in = *addr; /* pointer into input string */ |
| const char **endx; /* loops over end patterns */ |
| const char **quotx; /* loops over quote patterns */ |
| const char **nestx; /* loops over nest patterns */ |
| char *out = *token; /* pointer into output token */ |
| char c; |
| int i; |
| int result; |
| |
| while (true) { |
| |
| /* is this end of input string? */ |
| if (*in == 0) { |
| |
| break; /* end of string */ |
| } |
| |
| /* first check the end patterns (e.g. for ']') */ |
| endx = ends; i = 0; |
| while (*endx) { |
| if (!strncmp(in, *endx, strlen(*endx))) { |
| /* this end pattern matches */ |
| *addr = in; |
| *token = out; |
| return 0; |
| } |
| ++endx; |
| } |
| |
| /* check for hard quoting pattern */ |
| quotx = hquotes; |
| while (hquotes && *quotx) { |
| if (!strncmp(in, *quotx, strlen(*quotx))) { |
| /* this quote pattern matches */ |
| const char *endnest[2]; |
| if (dropquotes) { |
| /* we strip this quote */ |
| in += strlen(*quotx); |
| } else { |
| for (i = strlen(*quotx); i > 0; --i) { |
| *out++ = *in++; |
| if (--*len <= 0) { *addr = in; *token = out; return -1; } |
| } |
| } |
| /* we call _nestlex recursively */ |
| endnest[0] = *quotx; |
| endnest[1] = NULL; |
| result = |
| _nestlex(&in, &out, len, endnest, NULL/*hquotes*/, |
| NULL/*squotes*/, NULL/*nests*/, |
| false, c_esc, html_esc); |
| if (result == 0 && dropquotes) { |
| /* we strip this quote */ |
| in += strlen(*quotx); |
| } else if (result < 0) { |
| *addr = in; *token = out; return result; |
| } else { |
| /* we copy the trailing quote */ |
| for (i = strlen(*quotx); i > 0; --i) { |
| *out++ = *in++; |
| if (--*len <= 0) { *addr = in; *token = out; return -1; } |
| } |
| } |
| |
| break; |
| } |
| ++quotx; |
| } |
| if (hquotes && *quotx != NULL) { |
| /* there was a quote; string might continue with hard quote */ |
| continue; |
| } |
| |
| /* check for soft quoting pattern */ |
| quotx = squotes; |
| while (squotes && *quotx) { |
| if (!strncmp(in, *quotx, strlen(*quotx))) { |
| /* this quote pattern matches */ |
| /* we strip this quote */ |
| /* we call _nestlex recursively */ |
| const char *endnest[2]; |
| if (dropquotes) { |
| /* we strip this quote */ |
| in += strlen(*quotx); |
| } else { |
| for (i = strlen(*quotx); i > 0; --i) { |
| *out++ = *in++; |
| if (--*len <= 0) { *addr = in; *token = out; return -1; } |
| } |
| } |
| endnest[0] = *quotx; |
| endnest[1] = NULL; |
| result = |
| _nestlex(&in, &out, len, endnest, hquotes, |
| squotes, nests, |
| false, c_esc, html_esc); |
| |
| if (result == 0 && dropquotes) { |
| /* we strip the trailing quote */ |
| if (!in[0] || strncmp(in, *quotx, strlen(*quotx))) return 1; |
| in += strlen(*quotx); |
| } else if (result < 0) { |
| *addr = in; *token = out; return result; |
| } else { |
| /* we copy the trailing quote */ |
| for (i = strlen(*quotx); i > 0; --i) { |
| *out++ = *in++; |
| if (--*len <= 0) { *addr = in; *token = out; return -1; } |
| } |
| } |
| break; |
| } |
| ++quotx; |
| } |
| if (squotes && *quotx != NULL) { |
| /* there was a soft quote; string might continue with any quote */ |
| continue; |
| } |
| |
| /* check patterns that start a nested clause */ |
| nestx = nests; i = 0; |
| while (nests && *nestx) { |
| if (!strncmp(in, *nestx, strlen(*nestx))) { |
| /* this nest pattern matches */ |
| const char *endnest[2]; |
| endnest[0] = nestx[1]; |
| endnest[1] = NULL; |
| |
| for (i = strlen(nestx[1]); i > 0; --i) { |
| *out++ = *in++; |
| if (--*len <= 0) { *addr = in; *token = out; return -1; } |
| } |
| |
| result = |
| _nestlex(&in, &out, len, endnest, hquotes, squotes, nests, |
| false, c_esc, html_esc); |
| if (result == 0) { |
| /* copy endnest */ |
| i = strlen(nestx[1]); while (i > 0) { |
| *out++ = *in++; |
| if (--*len <= 0) { |
| *addr = in; |
| *token = out; |
| return -1; |
| } |
| --i; |
| } |
| } else if (result < 0) { |
| *addr = in; *token = out; return result; |
| } |
| break; |
| } |
| nestx += 2; /* skip matching end pattern in table */ |
| } |
| if (nests && *nestx) { |
| /* we handled a nested expression, continue loop */ |
| continue; |
| } |
| |
| /* "normal" data, possibly escaped */ |
| c = *in++; |
| if (c == '\\') { |
| /* found a plain \ escaped part */ |
| c = *in++; |
| if (c == 0) { /* Warn("trailing '\\'");*/ break; } |
| if (c_esc) { /* solve C char escapes: \n \t \0 etc */ |
| switch (c) { |
| case '0': c = '\0'; break; |
| case 'a': c = '\a'; break; |
| case 'b': c = '\b'; break; |
| case 'f': c = '\f'; break; |
| case 'n': c = '\n'; break; |
| case 'r': c = '\r'; break; |
| case 't': c = '\t'; break; |
| case 'v': c = '\v'; break; |
| #if LATER |
| case 'x': !!! 1 to 2 hex digits; break; |
| case 'u': !!! 4 hex digits?; break; |
| case 'U': !!! 8 hex digits?; break; |
| #endif |
| default: break; |
| } |
| } |
| *out++ = c; |
| --*len; |
| if (*len <= 0) { |
| *addr = in; |
| *token = out; |
| return -1; /* output overflow */ |
| } |
| continue; |
| } |
| |
| /* just a simple char */ |
| *out++ = c; |
| --*len; |
| if (*len <= 0) { |
| *addr = in; |
| *token = out; |
| return -1; /* output overflow */ |
| } |
| |
| } |
| /* never come here? */ |
| |
| *addr = in; |
| *token = out; |
| return 0; /* OK */ |
| } |