/* blob: 84b6fb874a47f704b8227e95621843af698b5a10 */
/* Pattern Matchers for Regular Expressions.
Copyright (C) 1992, 1998, 2000, 2005-2006, 2010, 2013 Free Software
Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
/* Specification. */
#include "libgrep.h"
#include <ctype.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <regex.h>
#include "error.h"
#include "exitfail.h"
#include "xalloc.h"
/* IN_CTYPE_DOMAIN (C) tells whether C may be handed to the <ctype.h>
   classification macros.  When STDC_HEADERS is defined, or when isascii is
   neither defined as a macro nor announced through HAVE_ISASCII, every value
   is accepted; otherwise C must satisfy isascii.  */
#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
# define IN_CTYPE_DOMAIN(c) 1
#else
# define IN_CTYPE_DOMAIN(c) isascii(c)
#endif
/* Nonzero if C is in the ctype domain and isalnum accepts it.  C may be
   evaluated more than once, so it must not have side effects.  */
#define ISALNUM(C) (IN_CTYPE_DOMAIN (C) && isalnum (C))
/* Nonzero if C can be part of a word: an alphanumeric character or '_'.
   C may be evaluated more than once, so it must not have side effects.  */
#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
/* One compiled alternative of a search pattern, bundled with the match
   registers used when searching with it.  */
struct patterns
{
/* The alternative, as compiled by the regex library.  */
struct re_pattern_buffer regexbuf;
/* Match registers handed to re_search / re_match for this alternative.
   They are kept next to the compiled buffer because the regex library's
   search interface expects the caller to supply the register storage.  */
struct re_registers regs;
};
/* A complete compiled search pattern: the matching options recorded at
   compile time plus one 'struct patterns' per alternative.  */
struct compiled_regex {
/* If true, a match only counts when it is delimited by non-word characters
   (or the line boundaries).  */
bool match_words;
/* If true, a match only counts when it spans an entire line.  */
bool match_lines;
/* The byte that terminates a line in the buffers being searched.  */
char eolbyte;
/* The Regex compiled patterns: an array of PCOUNT elements, grown with
   xrealloc as alternatives are compiled.  */
struct patterns *patterns;
/* Number of valid elements in PATTERNS.  */
size_t pcount;
};
static void *
compile (const char *pattern, size_t pattern_size,
bool match_icase, bool match_words, bool match_lines, char eolbyte,
reg_syntax_t syntax)
{
struct compiled_regex *cregex;
cregex = XMALLOC (struct compiled_regex);
memset (cregex, '\0', sizeof (struct compiled_regex));
cregex->match_words = match_words;
cregex->match_lines = match_lines;
cregex->eolbyte = eolbyte;
cregex->patterns = NULL;
cregex->pcount = 0;
re_set_syntax (syntax);
/* For GNU regex compiler we have to pass the patterns separately to detect
errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
GNU regex should have raised a syntax error. The same for backref, where
the backref should have been local to each pattern. */
{
const char *sep;
size_t total = pattern_size;
const char *motif = pattern;
do
{
size_t len;
const char *err;
sep = (const char *) memchr (motif, '\n', total);
if (sep)
{
len = sep - motif;
sep++;
total -= (len + 1);
}
else
{
len = total;
total = 0;
}
cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns));
memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns));
if ((err = re_compile_pattern (motif, len,
&cregex->patterns[cregex->pcount].regexbuf)) != NULL)
error (exit_failure, 0, "%s", err);
cregex->pcount++;
motif = sep;
}
while (sep && total != 0);
}
return cregex;
}
/* Compile PATTERN (PATTERN_SIZE bytes) for the 'grep' matcher, i.e. with the
   RE_SYNTAX_GREP syntax bits combined with RE_HAT_LISTS_NOT_NEWLINE.  All
   other arguments are forwarded unchanged to compile (), whose result is
   returned.  */
static void *
Gcompile (const char *pattern, size_t pattern_size,
          bool match_icase, bool match_words, bool match_lines, char eolbyte)
{
  const reg_syntax_t grep_syntax = RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE;
  void *compiled;

  compiled = compile (pattern, pattern_size,
                      match_icase, match_words, match_lines, eolbyte,
                      grep_syntax);
  return compiled;
}
/* Compile PATTERN (PATTERN_SIZE bytes) for the 'egrep' matcher, i.e. with
   the RE_SYNTAX_POSIX_EGREP syntax bits.  All other arguments are forwarded
   unchanged to compile (), whose result is returned.  */
static void *
Ecompile (const char *pattern, size_t pattern_size,
          bool match_icase, bool match_words, bool match_lines, char eolbyte)
{
  const reg_syntax_t egrep_syntax = RE_SYNTAX_POSIX_EGREP;
  void *compiled;

  compiled = compile (pattern, pattern_size,
                      match_icase, match_words, match_lines, eolbyte,
                      egrep_syntax);
  return compiled;
}
/* Compile PATTERN (PATTERN_SIZE bytes) for the 'awk' matcher, i.e. with the
   RE_SYNTAX_AWK syntax bits.  All other arguments are forwarded unchanged to
   compile (), whose result is returned.  */
static void *
AWKcompile (const char *pattern, size_t pattern_size,
            bool match_icase, bool match_words, bool match_lines, char eolbyte)
{
  const reg_syntax_t awk_syntax = RE_SYNTAX_AWK;
  void *compiled;

  compiled = compile (pattern, pattern_size,
                      match_icase, match_words, match_lines, eolbyte,
                      awk_syntax);
  return compiled;
}
/* Search BUF (BUF_SIZE bytes) for a match of COMPILED_PATTERN, which must
   point to a 'struct compiled_regex'.

   BUF is processed line by line, lines being delimited by the eolbyte stored
   in the compiled pattern.  On each line, every alternative of the pattern
   is tried in turn.

   If EXACT is true, the first regex match found is reported as is, without
   applying the match_lines / match_words restrictions: the return value is
   the offset of the match from the start of BUF and *MATCH_SIZE receives its
   length.

   If EXACT is false, a regex match only counts if it spans the whole line
   (when match_lines is set) or is delimited by non-word characters or the
   line boundaries (when match_words is set).  The return value is then the
   offset from the start of BUF of the line containing the match, and
   *MATCH_SIZE receives the length of that line, not counting the
   terminating eolbyte.

   If nothing matches, (size_t) -1 is returned and *MATCH_SIZE is left
   untouched.  */
static size_t
EGexecute (const void *compiled_pattern,
           const char *buf, size_t buf_size,
           size_t *match_size, bool exact)
{
  struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern;
  char eol = cregex->eolbyte;
  register const char *buflim = buf + buf_size;
  register const char *beg;
  register const char *end;

  for (beg = buf; beg < buflim; beg = end)
    {
      size_t i;

      end = (const char *) memchr (beg, eol, buflim - beg);
      if (end == NULL)
        end = buflim;
      /* Here, either end < buflim && *end == eol, or end == buflim.  */

      for (i = 0; i < cregex->pcount; i++)
        {
          int start, len;

          cregex->patterns[i].regexbuf.not_eol = 0;
          if (0 <= (start = re_search (&cregex->patterns[i].regexbuf, beg,
                                       end - beg, 0,
                                       end - beg, &cregex->patterns[i].regs)))
            {
              len = cregex->patterns[i].regs.end[0] - start;
              if (exact)
                {
                  *match_size = len;
                  /* START is relative to the current line; convert it to an
                     offset from BUF, like the value returned at 'success'.
                     For a match on the first line both are the same.  */
                  return (beg - buf) + start;
                }
              if (cregex->match_lines)
                {
                  if (len == end - beg) /* implies start == 0 */
                    goto success;
                }
              else if (cregex->match_words)
                {
                  /* If -w, check if the match aligns with word boundaries.
                     We do this iteratively because:
                     (a) the line may contain more than one occurrence of the
                         pattern, and
                     (b) Several alternatives in the pattern might be valid at
                         a given point, and we may need to consider a shorter
                         one to find a word boundary.  */
                  while (start >= 0)
                    {
                      if ((start == 0
                           || !IS_WORD_CONSTITUENT ((unsigned char) beg[start - 1]))
                          && (start + len == end - beg
                              || !IS_WORD_CONSTITUENT ((unsigned char) beg[start + len])))
                        goto success;
                      if (len > 0)
                        {
                          /* Try a shorter length anchored at the same place.
                             The text is truncated before the real end of
                             line, hence not_eol.  */
                          --len;
                          cregex->patterns[i].regexbuf.not_eol = 1;
                          len = re_match (&cregex->patterns[i].regexbuf, beg,
                                          start + len, start,
                                          &cregex->patterns[i].regs);
                        }
                      if (len <= 0)
                        {
                          /* Try looking further on.  */
                          if (start == end - beg)
                            break;
                          ++start;
                          cregex->patterns[i].regexbuf.not_eol = 0;
                          start = re_search (&cregex->patterns[i].regexbuf, beg,
                                             end - beg,
                                             start, end - beg - start,
                                             &cregex->patterns[i].regs);
                          /* Only meaningful if START >= 0; otherwise the
                             loop condition terminates the search.  */
                          len = cregex->patterns[i].regs.end[0] - start;
                        }
                    }
                }
              else
                goto success;
            }
        }

      /* No alternative matched on this line; step over the line
         terminator, if there is one.  */
      if (end < buflim)
        end++;
    }
  return (size_t) -1;

 success:
  *match_size = end - beg;
  return beg - buf;
}
static void
EGfree (void *compiled_pattern)
{
struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern;
free (cregex->patterns);
free (cregex);
}
/* Matcher for POSIX Basic Regular Expressions.
   The initializers are positional: Gcompile compiles the pattern, while
   EGexecute searches with the compiled pattern and EGfree releases it.
   The member order is dictated by matcher_t, which is declared outside this
   file (presumably in libgrep.h).  */
matcher_t matcher_grep =
{
Gcompile,
EGexecute,
EGfree
};
/* Matcher for POSIX Extended Regular Expressions.
   The initializers are positional: Ecompile compiles the pattern, while
   EGexecute searches with the compiled pattern and EGfree releases it.
   The member order is dictated by matcher_t, which is declared outside this
   file (presumably in libgrep.h).  */
matcher_t matcher_egrep =
{
Ecompile,
EGexecute,
EGfree
};
/* Matcher for AWK Regular Expressions.
   The initializers are positional: AWKcompile compiles the pattern, while
   EGexecute searches with the compiled pattern and EGfree releases it.
   The member order is dictated by matcher_t, which is declared outside this
   file (presumably in libgrep.h).  */
matcher_t matcher_awk =
{
AWKcompile,
EGexecute,
EGfree
};