blob: 4b91ce1db35a70b7832a0b336abaa7ee6560f9b3 [file] [log] [blame]
/*
tre-internal.h - TRE internal definitions
This software is released under a BSD-style license.
See the file LICENSE for details and copyright.
*/
#ifndef TRE_INTERNAL_H
#define TRE_INTERNAL_H 1
#ifdef HAVE_WCHAR_H
#include <wchar.h>
#endif /* HAVE_WCHAR_H */
#ifdef HAVE_WCTYPE_H
#include <wctype.h>
#endif /* HAVE_WCTYPE_H */
#include <ctype.h>
#include "tre.h"
/* DPRINT(msg): debug trace hook.  `msg' must be a complete, parenthesized
   printf() argument list, e.g. DPRINT(("n = %d\n", n)).  When TRE_DEBUG is
   defined the message is printed and stdout is flushed immediately;
   otherwise the macro expands to an empty statement and `msg' is never
   evaluated.  Both forms are single statements, safe in if/else. */
#ifndef TRE_DEBUG
#define DPRINT(msg) do { } while(/*CONSTCOND*/(void)0,0)
#else /* TRE_DEBUG */
#include <stdio.h>
#define DPRINT(msg) do {printf msg; fflush(stdout);} while(/*CONSTCOND*/(void)0,0)
#endif /* TRE_DEBUG */
/* Number of elements in the array `x'.  `x' must be a real array object:
   an array that has already decayed to a pointer gives a meaningless
   result.  The parameter is parenthesized in the expansion so that any
   array-valued expression can be passed safely. */
#define elementsof(x) ( sizeof(x) / sizeof((x)[0]) )
/* tre_mbrtowc(pwc, s, n, ps): multibyte to wide character conversion.
   Expands to mbrtowc() when HAVE_MBRTOWC is defined, otherwise to mbtowc()
   when HAVE_MBTOWC is defined (the `ps' argument is then ignored).  If
   neither is defined the macro is left undefined. */
#if defined(HAVE_MBRTOWC)
#define tre_mbrtowc(pwc, s, n, ps) (mbrtowc((pwc), (s), (n), (ps)))
#elif defined(HAVE_MBTOWC)
#define tre_mbrtowc(pwc, s, n, ps) (mbtowc((pwc), (s), (n)))
#endif /* HAVE_MBRTOWC / HAVE_MBTOWC */
/* TRE_MBSTATE is defined only when both TRE_MULTIBYTE and HAVE_MBSTATE_T
   are defined. */
#ifdef TRE_MULTIBYTE
#ifdef HAVE_MBSTATE_T
#define TRE_MBSTATE
#endif /* HAVE_MBSTATE_T */
#endif /* TRE_MULTIBYTE */
/* Define the character types and functions.

   Depending on TRE_WCHAR, tre_cint_t, TRE_CHAR_MAX, TRE_MB_CUR_MAX and the
   tre_is*() / tre_to*() / tre_strlen() names are mapped either to the wide
   character (<wctype.h>, <wchar.h>) or to the 8 bit (<ctype.h>,
   <string.h>) library routines. */
#ifdef TRE_WCHAR

/* Wide characters. */
typedef wint_t tre_cint_t;

/* Workaround problem seen on AIX, (2010 & 2015), e.g.,
   https://stat.ethz.ch/pipermail/r-devel/2015-October/071902.html
   WCHAR_MAX = UINT32_MAX on AIX and that is "not possible to work"
   Solaris-sparcv9 WCHAR_MAX = INT32_MAX
   Linux amd64 WCHAR_MAX = INT32_MAX
*/
/*
   [U]INT32_MAX need to be declared: this is a C99 header which we assume
*/
#include <stdint.h>
#if WCHAR_MAX == UINT32_MAX
# define TRE_CHAR_MAX INT32_MAX
#else
# define TRE_CHAR_MAX WCHAR_MAX
#endif

#ifdef TRE_MULTIBYTE
#define TRE_MB_CUR_MAX MB_CUR_MAX
#else /* !TRE_MULTIBYTE */
#define TRE_MB_CUR_MAX 1
#endif /* !TRE_MULTIBYTE */

#define tre_isalnum iswalnum
#define tre_isalpha iswalpha
#ifdef HAVE_ISWBLANK
#define tre_isblank iswblank
#endif /* HAVE_ISWBLANK */
#define tre_iscntrl iswcntrl
#define tre_isdigit iswdigit
#define tre_isgraph iswgraph
#define tre_islower iswlower
#define tre_isprint iswprint
#define tre_ispunct iswpunct
#define tre_isspace iswspace
#define tre_isupper iswupper
#define tre_isxdigit iswxdigit

#define tre_tolower towlower
#define tre_toupper towupper
#define tre_strlen wcslen

#else /* !TRE_WCHAR */

/* 8 bit characters. */
typedef short tre_cint_t;
#define TRE_CHAR_MAX 255
#define TRE_MB_CUR_MAX 1

#define tre_isalnum isalnum
#define tre_isalpha isalpha
#ifdef HAVE_ISASCII
#define tre_isascii isascii
#endif /* HAVE_ISASCII */
#ifdef HAVE_ISBLANK
#define tre_isblank isblank
#endif /* HAVE_ISBLANK */
#define tre_iscntrl iscntrl
#define tre_isdigit isdigit
#define tre_isgraph isgraph
#define tre_islower islower
#define tre_isprint isprint
#define tre_ispunct ispunct
#define tre_isspace isspace
#define tre_isupper isupper
#define tre_isxdigit isxdigit

/* The expansions are fully parenthesized so that the cast result behaves
   as a single primary expression (e.g. under `sizeof' or a postfix
   operator), and the tre_strlen() argument is parenthesized so that the
   cast applies to the whole argument expression. */
#define tre_tolower(c) ((tre_cint_t)(tolower(c)))
#define tre_toupper(c) ((tre_cint_t)(toupper(c)))
/* NOTE(review): strlen() needs <string.h>, which this header does not
   include itself; presumably the including file provides it. */
#define tre_strlen(s) (strlen((const char*)(s)))

#endif /* !TRE_WCHAR */
/* Select the character class implementation.
   The system iswctype()/wctype() pair is used only for wide character
   builds where both functions were detected.
   _WIN32 opt-out is R addition - iswctype is missing "blank" */
#if defined(TRE_WCHAR) && defined(HAVE_ISWCTYPE) && defined(HAVE_WCTYPE) && !defined(_WIN32)
#define TRE_USE_SYSTEM_WCTYPE 1
#endif

#ifndef TRE_USE_SYSTEM_WCTYPE

/* Define our own versions of iswctype() and wctype(): a character class
   is a pointer to a predicate function, and testing a character against
   a class is a call through that pointer. */
typedef int (*tre_ctype_t)(tre_cint_t);
#define tre_isctype(c, type) ( (type)(c) )
tre_ctype_t tre_ctype(const char *name);

#else /* TRE_USE_SYSTEM_WCTYPE */

/* Use system provided iswctype() and wctype(). */
typedef wctype_t tre_ctype_t;
#define tre_isctype(c, type) iswctype(c, type)
#define tre_ctype(s) wctype(s)

#endif /* TRE_USE_SYSTEM_WCTYPE */
/* String type selector; the tre_tnfa_run_*() matchers declared in this
   header take it next to their untyped `string' argument. */
typedef enum {
  STR_WIDE,
  STR_BYTE,
  STR_MBS,
  STR_USER
} tre_str_type_t;
/* Returns number of bytes to add to (char *)ptr to make it
   properly aligned for the type.  The result is 0 when `ptr' is already a
   multiple of sizeof(type).

   `ptr' is parenthesized before the cast so that the whole pointer
   expression is converted (e.g. ALIGN(p + 1, long) uses the address of
   p + 1, not the address of p plus one byte).  Note that `ptr' is
   evaluated twice, so it must not have side effects. */
/* R change: was (long) but that is shorter than pointer on Win64 */
#define ALIGN(ptr, type) \
  ((((size_t)(ptr)) % sizeof(type)) \
   ? (sizeof(type) - (((size_t)(ptr)) % sizeof(type))) \
   : 0)
/* Generic maximum and minimum.  Any previous definition of either name is
   discarded first.  On equal arguments both macros yield `a'.  Being
   macros, they may evaluate each argument twice. */
#undef MAX
#define MAX(a, b) (((a) >= (b)) ? (a) : (b))

#undef MIN
#define MIN(a, b) (((a) <= (b)) ? (a) : (b))
/* STRF: printf conversion for TRE strings, without the leading `%':
   "ls" for wide character builds, "s" otherwise. */
#ifndef TRE_WCHAR
#define STRF "s"
#else /* TRE_WCHAR */
#define STRF "ls"
#endif /* TRE_WCHAR */
/* TNFA transition type.  A TNFA state is an array of transitions,
   the terminator is a transition with NULL `state'. */
struct tnfa_transition {
  /* Inclusive range [code_min, code_max] of accepted characters. */
  tre_cint_t code_min;
  tre_cint_t code_max;

  /* Destination state: pointer and ID number. */
  struct tnfa_transition *state;
  int state_id;

  /* Tags: a -1 terminated array, or NULL. */
  int *tags;

  /* Matching parameter settings, or NULL. */
  int *params;

  /* Bitmap of ASSERT_* flags. */
  int assertions;

  /* Parameter of the assertion. */
  union {
    /* Character class assertion. */
    tre_ctype_t class;
    /* Back reference assertion. */
    int backref;
  } u;

  /* Negative character class assertions. */
  tre_ctype_t *neg_classes;
};
typedef struct tnfa_transition tre_tnfa_transition_t;
/* Assertions: bit flags stored in the `assertions' bitmap of a
   transition. */
#define ASSERT_AT_BOL          0x001  /* Beginning of line. */
#define ASSERT_AT_EOL          0x002  /* End of line. */
#define ASSERT_CHAR_CLASS      0x004  /* Character class in `class'. */
#define ASSERT_CHAR_CLASS_NEG  0x008  /* Character classes in `neg_classes'. */
#define ASSERT_AT_BOW          0x010  /* Beginning of word. */
#define ASSERT_AT_EOW          0x020  /* End of word. */
#define ASSERT_AT_WB           0x040  /* Word boundary. */
#define ASSERT_AT_WB_NEG       0x080  /* Not a word boundary. */
#define ASSERT_BACKREF         0x100  /* A back reference in `backref'. */
/* Same value as the highest flag above. */
#define ASSERT_LAST            0x100
/* R_assert(e): replacement for assert().  Unless NDEBUG is defined, a
   false `e' calls Rf_error() with the text of the expression, the source
   file name and the line number; with NDEBUG the macro does nothing and
   `e' is not evaluated. */
/* Stand-in declaration (important: just `const char *str' is not enough,
   the variadic `...' is required!) */
extern void Rf_error(const char *str, ...);
#ifndef NDEBUG
/* The line below requires an ANSI C preprocessor (stringify operator) */
#define R_assert(e) ((e) ? (void) 0 : Rf_error("assertion '%s' failed in executing regexp: file '%s', line %d\n", #e, __FILE__, __LINE__))
#else /* NDEBUG */
#define R_assert(e) ((void) 0)
#endif /* NDEBUG */
/* Tag directions.  The enumerators rely on the default numbering:
   TRE_TAG_MINIMIZE is 0 and TRE_TAG_MAXIMIZE is 1. */
typedef enum {
  TRE_TAG_MINIMIZE,
  TRE_TAG_MAXIMIZE
} tre_tag_direction_t;
/* Parameters that can be changed dynamically while matching.  The
   enumerators are numbered consecutively from 0; TRE_PARAM_LAST (9) is
   the number of parameters that precede it. */
typedef enum {
  TRE_PARAM_COST_INS,    /* 0 */
  TRE_PARAM_COST_DEL,    /* 1 */
  TRE_PARAM_COST_SUBST,  /* 2 */
  TRE_PARAM_COST_MAX,    /* 3 */
  TRE_PARAM_MAX_INS,     /* 4 */
  TRE_PARAM_MAX_DEL,     /* 5 */
  TRE_PARAM_MAX_SUBST,   /* 6 */
  TRE_PARAM_MAX_ERR,     /* 7 */
  TRE_PARAM_DEPTH,       /* 8 */
  TRE_PARAM_LAST         /* 9 */
} tre_param_t;
/* Special matching parameter values.  The negative constants are
   parenthesized so that they expand to a single primary expression in
   every context. */
/* Unset matching parameter */
#define TRE_PARAM_UNSET (-1)
/* Signifies the default matching parameter value. */
#define TRE_PARAM_DEFAULT (-2)
/* Instructions to compute submatch register values from tag values
   after a successful match. */
typedef struct tre_submatch_data {
  int so_tag;    /* Tag that gives rm_so (submatch start offset). */
  int eo_tag;    /* Tag that gives rm_eo (submatch end offset). */
  int *parents;  /* List of submatches this submatch is contained in. */
} tre_submatch_data_t;
/* TNFA definition.  The member order is part of the layout and is kept
   as is; the blank lines below are visual grouping only. */
typedef struct tnfa {
  tre_tnfa_transition_t *transitions;
  unsigned int num_transitions;
  tre_tnfa_transition_t *initial;
  tre_tnfa_transition_t *final;

  tre_submatch_data_t *submatch_data;
  char *firstpos_chars;
  int first_char;
  unsigned int num_submatches;

  tre_tag_direction_t *tag_directions;
  int *minimal_tags;
  int num_tags;
  int num_minimals;
  int end_tag;

  int num_states;
  int cflags;
  int have_backrefs;
  int have_approx;
  int params_depth;
} tre_tnfa_t;
/* Compile / free entry points and the submatch fill helper.  Only the
   declarations live here; the definitions are not part of this header. */

/* Takes the `preg' object, a regex of `n' tre_char_t units and the
   compilation flags; returns an int status. */
int tre_compile(regex_t *preg, const tre_char_t *regex, size_t n,
                int cflags);

/* Counterpart of tre_compile() operating on the same `preg' object. */
void tre_free(regex_t *preg);

/* NOTE(review): presumably fills the `nmatch' entries of `pmatch' from
   the `tags' of a finished match -- confirm against the definition. */
void tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
                     const tre_tnfa_t *tnfa, int *tags, int match_eo);
/* Parallel TNFA matcher.  Takes the automaton, the subject `string' with
   its length `len' and string `type', an array for the match tags, the
   execution flags, and an out-parameter for the match end offset; returns
   a reg_errcode_t.  (Declared once; the header previously repeated this
   prototype verbatim.) */
reg_errcode_t
tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
                      tre_str_type_t type, int *match_tags, int eflags,
                      int *match_end_ofs);
/* Backtracking TNFA matcher; takes the same parameter list as
   tre_tnfa_run_parallel() and also returns a reg_errcode_t. */
reg_errcode_t tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa,
                                     const void *string, int len,
                                     tre_str_type_t type, int *match_tags,
                                     int eflags, int *match_end_ofs);
#ifdef TRE_APPROX
/* Approximate TNFA matcher, declared only when TRE_APPROX is defined.
   On top of the arguments shared with the other matchers it takes a
   regamatch_t result pointer and a regaparams_t passed by value. */
reg_errcode_t tre_tnfa_run_approx(const tre_tnfa_t *tnfa,
                                  const void *string, int len,
                                  tre_str_type_t type, int *match_tags,
                                  regamatch_t *match, regaparams_t params,
                                  int eflags, int *match_end_ofs);
#endif /* TRE_APPROX */
#endif /* TRE_INTERNAL_H */
/* EOF */