blob: 19d338117a6e82bd88a48e9b9fdbd6f1080c839f [file] [log] [blame]
/* source: nestlex.c */
/* Copyright Gerhard Rieger and contributors (see file CHANGES) */
/* Published under the GNU General Public License V.2, see file COPYING */
/* a function for lexical scanning of nested character patterns */
#include "config.h"
#include "mytypes.h"
#include "sysincludes.h"
/* forward declaration of the internal scanner that is defined further down in
   this file; nestlex() hands its arguments over to it. In contrast to
   nestlex(), the remaining output space is passed as a signed ptrdiff_t so
   the scanner can test it with "<= 0". */
static int _nestlex(const char **addr,
char **token,
ptrdiff_t *len,
const char *ends[],
const char *hquotes[],
const char *squotes[],
const char *nests[],
bool dropquotes,
bool c_esc,
bool html_esc
);
/* sub: scan a string and copy its value to the output string
   scanning ends when an unescaped, unnested string from the ends array is
   found
   does not copy the end pattern
   does not write a trailing \0 to token
   allows escaping with \ and quoting (\ and quotes are removed)
   allows nesting with various kinds of parentheses
   returns -1 if the output string was too small
   returns 1 if addr ended unexpectedly
   returns 0 if the token could be extracted successfully
*/
int nestlex(const char **addr,      /* in: string to scan; out: position of the
                                       end pattern that stopped the scan */
            char **token,           /* in: output buffer; out: first unwritten
                                       char (caller may store a \0 there) */
            size_t *len,            /* space left in token, including room
                                       for a terminating \0 */
            const char *ends[],     /* NULL terminated list of end strings */
            const char *hquotes[],  /* NULL terminated list of hard quotes */
            const char *squotes[],  /* NULL terminated list of soft quotes */
            const char *nests[],    /* NULL terminated list of nesting pairs:
                                       opening string followed by its
                                       matching closing string */
            bool dropquotes,        /* remove the outermost quotes */
            bool c_esc,             /* resolve C escapes like \n \t \0 */
            bool html_esc           /* resolve HTML escapes like %0d %08 */
            ) {
   /* the scanner proper does its bookkeeping on a signed counter, so the
      caller's size_t counter is handed over through a pointer cast */
   ptrdiff_t *remaining = (ptrdiff_t *)len;

   return _nestlex(addr, token, remaining,
                   ends, hquotes, squotes, nests,
                   dropquotes, c_esc, html_esc);
}
/* sub: copy n bytes from *in to *out, advancing both pointers and decrementing
   *len once per byte.
   returns -1 as soon as *len has dropped to 0 or below (output exhausted),
   returns 0 when all n bytes were copied and space is left */
static int nestlex_copy(const char **in, char **out, ptrdiff_t *len,
                        size_t n) {
   while (n > 0) {
      *(*out)++ = *(*in)++;
      if (--*len <= 0) {
         return -1;
      }
      --n;
   }
   return 0;
}

/* sub: consume the closing pattern of a quoted or nested clause at *in.
   with strip set the pattern is skipped, otherwise it is copied to *out.
   returns 1 if the input does not continue with the pattern (typically
   because the input ended inside the clause); nothing is consumed then
   returns -1 on output overflow
   returns 0 on success */
static int nestlex_close(const char **in, char **out, ptrdiff_t *len,
                         const char *pattern, bool strip) {
   size_t patlen = strlen(pattern);

   if (**in == '\0' || strncmp(*in, pattern, patlen) != 0) {
      return 1;
   }
   if (strip) {
      *in += patlen;
      return 0;
   }
   return nestlex_copy(in, out, len, patlen);
}

/* sub: the scanner behind nestlex(); calls itself for quoted and nested
   clauses, using the closing pattern of the clause as the only end pattern.
   addr:       in: start of input; out: position where scanning stopped
   token:      in: output buffer; out: first unwritten char (no \0 is written)
   len:        remaining bytes in the output buffer; decremented per byte
               written
   ends:       NULL terminated list of end patterns (may be NULL)
   hquotes:    NULL terminated list of hard quotes (may be NULL); between hard
               quotes only the closing quote and \ escapes are recognized
   squotes:    NULL terminated list of soft quotes (may be NULL); between soft
               quotes quotes and nests are still recognized
   nests:      NULL terminated list of pairs: opening pattern, closing pattern
               (may be NULL); nest patterns are always copied to the output
   dropquotes: strip the quotes found on this level instead of copying them
   c_esc:      translate \0 \a \b \f \n \r \t \v to the respective control
               character; \x, \u and \U are not implemented
   html_esc:   only handed on to recursive calls, not evaluated here
   returns 0 if an end pattern was found (*addr points to it) or the input
             ended outside of any quote or nest (*addr points to the \0)
   returns 1 if the input ended inside a quote or nest
   returns -1 if the output buffer is exhausted
   in all cases *addr and *token are updated to the current positions, and
   *addr never points behind the terminating \0 of the input */
static int _nestlex(const char **addr,
                    char **token,
                    ptrdiff_t *len,
                    const char *ends[],
                    const char *hquotes[],
                    const char *squotes[],
                    const char *nests[],
                    bool dropquotes,
                    bool c_esc,
                    bool html_esc
                    ) {
   const char *in = *addr;      /* pointer into input string */
   char *out = *token;          /* pointer into output token */
   const char **patx;           /* loops over a pattern list */
   bool matched;                /* a quote or nest was handled in this round */
   int pass;
   int result;
   char c;

   while (*in != '\0') {

      /* first check the end patterns (e.g. for ']') */
      for (patx = ends; ends != NULL && *patx != NULL; ++patx) {
         if (strncmp(in, *patx, strlen(*patx)) == 0) {
            /* this end pattern matches */
            *addr = in;
            *token = out;
            return 0;
         }
      }

      /* check the quoting patterns: hard quotes first, then soft quotes.
         the only difference is what stays active inside the quotes: nothing
         for hard quotes, all quotes and nests for soft quotes */
      matched = false;
      for (pass = 0; pass < 2 && !matched; ++pass) {
         bool hard = (pass == 0);
         const char **quotes = hard ? hquotes : squotes;

         for (patx = quotes; quotes != NULL && *patx != NULL; ++patx) {
            const char *endnest[2];
            size_t quotlen = strlen(*patx);

            if (strncmp(in, *patx, quotlen) != 0) {
               continue;
            }
            /* this quote pattern matches */
            matched = true;

            /* leading quote: strip or copy */
            if (dropquotes) {
               in += quotlen;
            } else if (nestlex_copy(&in, &out, len, quotlen) < 0) {
               *addr = in;
               *token = out;
               return -1;
            }

            /* scan the quoted text recursively up to the closing quote */
            endnest[0] = *patx;
            endnest[1] = NULL;
            result = _nestlex(&in, &out, len, endnest,
                              hard ? NULL : hquotes,
                              hard ? NULL : squotes,
                              hard ? NULL : nests,
                              false, c_esc, html_esc);

            /* the recursion also returns 0 when the input just ended, so the
               closing quote must be verified before it is stripped or copied;
               otherwise we would read behind the terminating \0 */
            if (result == 0) {
               result = nestlex_close(&in, &out, len, *patx, dropquotes);
            }
            if (result != 0) {
               *addr = in;
               *token = out;
               return result;
            }
            break;
         }
      }
      if (matched) {
         /* there was a quote; string might continue with another quote */
         continue;
      }

      /* check patterns that start a nested clause */
      for (patx = nests; nests != NULL && *patx != NULL; patx += 2) {
         const char *endnest[2];
         size_t openlen = strlen(patx[0]);

         if (strncmp(in, patx[0], openlen) != 0) {
            continue;
         }
         /* this nest pattern matches */
         matched = true;

         /* copy the opening pattern; its own length counts here, not the
            length of the closing pattern */
         if (nestlex_copy(&in, &out, len, openlen) < 0) {
            *addr = in;
            *token = out;
            return -1;
         }

         endnest[0] = patx[1];
         endnest[1] = NULL;
         result = _nestlex(&in, &out, len, endnest, hquotes, squotes, nests,
                           false, c_esc, html_esc);
         if (result == 0) {
            /* copy the closing pattern, provided it is really there */
            result = nestlex_close(&in, &out, len, patx[1], false);
         }
         if (result != 0) {
            *addr = in;
            *token = out;
            return result;
         }
         break;
      }
      if (matched) {
         /* we handled a nested expression, continue loop */
         continue;
      }

      /* "normal" data, possibly escaped */
      c = *in++;
      if (c == '\\') {
         if (*in == '\0') {
            /* trailing '\': drop it and stay on the terminating \0 */
            break;
         }
         c = *in++;
         if (c_esc) {   /* solve C char escapes: \n \t \0 etc */
            switch (c) {
            case '0': c = '\0'; break;
            case 'a': c = '\a'; break;
            case 'b': c = '\b'; break;
            case 'f': c = '\f'; break;
            case 'n': c = '\n'; break;
            case 'r': c = '\r'; break;
            case 't': c = '\t'; break;
            case 'v': c = '\v'; break;
            /* \x, \u and \U are not implemented; like every other
               escaped char they stand for themselves */
            default: break;
            }
         }
      }
      *out++ = c;
      if (--*len <= 0) {
         *addr = in;
         *token = out;
         return -1;     /* output overflow */
      }
   }

   /* end of input string */
   *addr = in;
   *token = out;
   return 0;
}