| # regular expression test set |
| # Lines are at least three fields, separated by one or more tabs. "" stands |
| # for an empty field. First field is an RE. Second field is flags. If |
| # C flag given, regcomp() is expected to fail, and the third field is the |
| # error name (minus the leading REG_). |
| # |
| # Otherwise it is expected to succeed, and the third field is the string to |
| # try matching it against. If there is no fourth field, the match is |
| # expected to fail. If there is a fourth field, it is the substring that |
| # the RE is expected to match. If there is a fifth field, it is a comma- |
| # separated list of what the subexpressions should match, with - indicating |
| # no match for that one. In both the fourth and fifth fields, a (sub)field |
| # starting with @ indicates that the (sub)expression is expected to match |
| # a null string followed by the stuff after the @; this provides a way to |
| # test where null strings match. The character `N' in REs and strings |
| # is newline, `S' is space, `T' is tab, `Z' is NUL. |
| # |
| # The full list of flags: |
| # - placeholder, does nothing |
| # b RE is a BRE, not an ERE |
| # & try it as both an ERE and a BRE |
| # C regcomp() error expected, third field is error name |
| # i REG_ICASE |
| # m ("mundane") REG_NOSPEC |
| # s REG_NOSUB (not really testable) |
| # n REG_NEWLINE |
| # ^ REG_NOTBOL |
| # $ REG_NOTEOL |
| # # REG_STARTEND (see below) |
| # p REG_PEND |
| # |
| # For REG_STARTEND, the start/end offsets are those of the substring |
| # enclosed in (). |
| |
| # basics |
| a & a a |
| abc & abc abc |
| abc|de - abc abc |
| a|b|c - abc a |
| |
| # parentheses and perversions thereof |
| a(b)c - abc abc |
| a\(b\)c b abc abc |
| a( C EPAREN |
| a( b a( a( |
| a\( - a( a( |
| a\( bC EPAREN |
| a\(b bC EPAREN |
| a(b C EPAREN |
| a(b b a(b a(b |
| # gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly) |
| a) - a) a) |
| ) - ) ) |
| # end gagging (in a just world, those *should* give EPAREN) |
| a) b a) a) |
| a\) bC EPAREN |
| \) bC EPAREN |
| a()b - ab ab |
| a\(\)b b ab ab |
| |
| # anchoring and REG_NEWLINE |
| ^abc$ & abc abc |
| a^b - a^b |
| a^b b a^b a^b |
| a$b - a$b |
| a$b b a$b a$b |
| ^ & abc @abc |
| $ & abc @ |
| ^$ & "" @ |
| $^ - "" @ |
| \($\)\(^\) b "" @ |
| # stop retching, those are legitimate (although disgusting) |
| ^^ - "" @ |
| $$ - "" @ |
| b$ & abNc |
| b$ &n abNc b |
| ^b$ & aNbNc |
| ^b$ &n aNbNc b |
| ^$ &n aNNb @Nb |
| ^$ n abc |
| ^$ n abcN @ |
| $^ n aNNb @Nb |
| \($\)\(^\) bn aNNb @Nb |
| ^^ n^ aNNb @Nb |
| $$ n aNNb @NN |
| ^a ^ a |
| a$ $ a |
| ^a ^n aNb |
| ^b ^n aNb b |
| a$ $n bNa |
| b$ $n bNa b |
| a*(^b$)c* - b b |
| a*\(^b$\)c* b b b |
| |
| # certain syntax errors and non-errors |
| | C EMPTY |
| | b | | |
| * C BADRPT |
| * b * * |
| + C BADRPT |
| ? C BADRPT |
| "" &C EMPTY |
| () - abc @abc |
| \(\) b abc @abc |
| a||b C EMPTY |
| |ab C EMPTY |
| ab| C EMPTY |
| (|a)b C EMPTY |
| (a|)b C EMPTY |
| (*a) C BADRPT |
| (+a) C BADRPT |
| (?a) C BADRPT |
| ({1}a) C BADRPT |
| \(\{1\}a\) bC BADRPT |
| (a|*b) C BADRPT |
| (a|+b) C BADRPT |
| (a|?b) C BADRPT |
| (a|{1}b) C BADRPT |
| ^* C BADRPT |
| ^* b * * |
| ^+ C BADRPT |
| ^? C BADRPT |
| ^{1} C BADRPT |
| ^\{1\} bC BADRPT |
| |
| # metacharacters, backslashes |
| a.c & abc abc |
| a[bc]d & abd abd |
| a\*c & a*c a*c |
| a\\b & a\b a\b |
| a\\\*b & a\*b a\*b |
| # The following test is wrong. Using \b in a BRE or ERE is undefined. |
| # a\bc & abc abc |
| a\ &C EESCAPE |
| a\\bc & a\bc a\bc |
| \{ bC BADRPT |
| a\[b & a[b a[b |
| a[b &C EBRACK |
| # trailing $ is a peculiar special case for the BRE code |
| a$ & a a |
| a$ & a$ |
| a\$ & a |
| a\$ & a$ a$ |
| a\\$ & a |
| a\\$ & a$ |
| a\\$ & a\$ |
| a\\$ & a\ a\ |
| |
| # back references, ugh |
| a\(b\)\2c bC ESUBREG |
| a\(b\1\)c bC ESUBREG |
| a\(b*\)c\1d b abbcbbd abbcbbd bb |
| a\(b*\)c\1d b abbcbd |
| a\(b*\)c\1d b abbcbbbd |
| ^\(.\)\1 b abc |
| a\([bc]\)\1d b abcdabbd abbd b |
| a\(\([bc]\)\2\)*d b abbccd abbccd |
| a\(\([bc]\)\2\)*d b abbcbd |
| # actually, this next one probably ought to fail, but the spec is unclear |
| a\(\(b\)*\2\)*d b abbbd abbbd |
| # here is a case that no NFA implementation does right |
| \(ab*\)[ab]*\1 b ababaaa ababaaa a |
| # check out normal matching in the presence of back refs |
| \(a\)\1bcd b aabcd aabcd |
| \(a\)\1bc*d b aabcd aabcd |
| \(a\)\1bc*d b aabd aabd |
| \(a\)\1bc*d b aabcccd aabcccd |
| \(a\)\1bc*[ce]d b aabcccd aabcccd |
| ^\(a\)\1b\(c\)*cd$ b aabcccd aabcccd |
| |
| # ordinary repetitions |
| ab*c & abc abc |
| ab+c - abc abc |
| ab?c - abc abc |
| a\(*\)b b a*b a*b |
| a\(**\)b b ab ab |
| a\(***\)b bC BADRPT |
| *a b *a *a |
| **a b a a |
| ***a bC BADRPT |
| |
| # the dreaded bounded repetitions |
| # The following two tests are not correct: |
| #{ & { { |
| #{abc & {abc {abc |
| # '{' is always a special char outside bracket expressions. So test only BRE: |
| { b { { |
| {abc b {abc {abc |
| {1 C BADRPT |
| {1} C BADRPT |
| # Same reason as for the two tests above: |
| #a{b & a{b a{b |
| a{b b a{b a{b |
| a{1}b - ab ab |
| a\{1\}b b ab ab |
| a{1,}b - ab ab |
| a\{1,\}b b ab ab |
| a{1,2}b - aab aab |
| a\{1,2\}b b aab aab |
| a{1 C EBRACE |
| a\{1 bC EBRACE |
| a{1a C EBRACE |
| a\{1a bC EBRACE |
| a{1a} C BADBR |
| a\{1a\} bC BADBR |
| # These four tests check for undefined behavior. Our implementation does |
| # something different. |
| #a{,2} - a{,2} a{,2} |
| #a\{,2\} bC BADBR |
| #a{,} - a{,} a{,} |
| #a\{,\} bC BADBR |
| a{1,x} C BADBR |
| a\{1,x\} bC BADBR |
| a{1,x C EBRACE |
| a\{1,x bC EBRACE |
| # These two tests probably fail due to an arbitrary limit on the number of |
| # repetitions in the other implementation. |
| #a{300} C BADBR |
| #a\{300\} bC BADBR |
| a{1,0} C BADBR |
| a\{1,0\} bC BADBR |
| ab{0,0}c - abcac ac |
| ab\{0,0\}c b abcac ac |
| ab{0,1}c - abcac abc |
| ab\{0,1\}c b abcac abc |
| ab{0,3}c - abbcac abbc |
| ab\{0,3\}c b abbcac abbc |
| ab{1,1}c - acabc abc |
| ab\{1,1\}c b acabc abc |
| ab{1,3}c - acabc abc |
| ab\{1,3\}c b acabc abc |
| ab{2,2}c - abcabbc abbc |
| ab\{2,2\}c b abcabbc abbc |
| ab{2,4}c - abcabbc abbc |
| ab\{2,4\}c b abcabbc abbc |
| ((a{1,10}){1,10}){1,10} - a a a,a |
| |
| # multiple repetitions |
| # Wow, there is a serious disconnect here. The ERE grammar is like this: |
| # ERE_expression : one_char_or_coll_elem_ERE |
| # | '^' |
| # | '$' |
| # | '(' extended_reg_exp ')' |
| # | ERE_expression ERE_dupl_symbol |
| # ; |
| # where ERE_dupl_symbol is any of the repetition methods. It is clear from |
| # this that consecutive repetition is OK. On top of this, the one test not |
| # marked as failing must fail. For BREs the situation is different, so we |
| # use the four tests. |
| #a** &C BADRPT |
| a** bC BADRPT |
| #a++ C BADRPT |
| #a?? C BADRPT |
| #a*+ C BADRPT |
| #a*? C BADRPT |
| #a+* C BADRPT |
| #a+? C BADRPT |
| #a?* C BADRPT |
| #a?+ C BADRPT |
| #a{1}{1} C BADRPT |
| #a*{1} C BADRPT |
| #a+{1} C BADRPT |
| #a?{1} C BADRPT |
| #a{1}* C BADRPT |
| #a{1}+ C BADRPT |
| #a{1}? C BADRPT |
| #a*{b} - a{b} a{b} |
| a\{1\}\{1\} bC BADRPT |
| a*\{1\} bC BADRPT |
| a\{1\}* bC BADRPT |
| |
| # brackets, and numerous perversions thereof |
| a[b]c & abc abc |
| a[ab]c & abc abc |
| a[^ab]c & adc adc |
| a[]b]c & a]c a]c |
| a[[b]c & a[c a[c |
| a[-b]c & a-c a-c |
| a[^]b]c & adc adc |
| a[^-b]c & adc adc |
| a[b-]c & a-c a-c |
| a[b &C EBRACK |
| a[] &C EBRACK |
| a[1-3]c & a2c a2c |
| a[3-1]c &C ERANGE |
| a[1-3-5]c &C ERANGE |
| a[[.-.]--]c & a-c a-c |
| # I don't think the error value should be ERANGE since a[1-] would be |
| # valid, too. Expect EBRACK. |
| #a[1- &C ERANGE |
| a[1- &C EBRACK |
| a[[. &C EBRACK |
| a[[.x &C EBRACK |
| a[[.x. &C EBRACK |
| a[[.x.] &C EBRACK |
| a[[.x.]] & ax ax |
| a[[.x,.]] &C ECOLLATE |
| # This test is invalid. "one" is not a collating symbol in any standardized |
| # locale. |
| # a[[.one.]]b & a1b a1b |
| a[[.notdef.]]b &C ECOLLATE |
| a[[.].]]b & a]b a]b |
| a[[:alpha:]]c & abc abc |
| a[[:notdef:]]c &C ECTYPE |
| a[[: &C EBRACK |
| a[[:alpha &C EBRACK |
| a[[:alpha:] &C EBRACK |
| a[[:alpha,:] &C ECTYPE |
| a[[:]:]]b &C ECTYPE |
| a[[:-:]]b &C ECTYPE |
| a[[:alph:]] &C ECTYPE |
| a[[:alphabet:]] &C ECTYPE |
| [[:alnum:]]+ - -%@a0X- a0X |
| [[:alpha:]]+ - -%@aX0- aX |
| [[:blank:]]+ - aSSTb SST |
| [[:cntrl:]]+ - aNTb NT |
| [[:digit:]]+ - a019b 019 |
| [[:graph:]]+ - Sa%bS a%b |
| [[:lower:]]+ - AabC ab |
| [[:print:]]+ - NaSbN aSb |
| [[:punct:]]+ - S%-&T %-& |
| [[:space:]]+ - aSNTb SNT |
| [[:upper:]]+ - aBCd BC |
| [[:xdigit:]]+ - p0f3Cq 0f3C |
| a[[=b=]]c & abc abc |
| a[[= &C EBRACK |
| a[[=b &C EBRACK |
| a[[=b= &C EBRACK |
| a[[=b=] &C EBRACK |
| a[[=b,=]] &C ECOLLATE |
| # This test is invalid. "one" is not a collating symbol in any standardized |
| # locale. |
| #a[[=one=]]b & a1b a1b |
| |
| # complexities |
| a(((b)))c - abc abc |
| a(b|(c))d - abd abd |
| a(b*|c)d - abbd abbd |
| # just gotta have one DFA-buster, of course |
| a[ab]{20} - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab |
| # and an inline expansion in case somebody gets tricky |
| a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab |
| # and in case somebody just slips in an NFA... |
| a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night) - aaaaabaaaabaaaabaaaabweeknights aaaaabaaaabaaaabaaaabweeknights |
| # fish for anomalies as the number of states passes 32 |
| 12345678901234567890123456789 - a12345678901234567890123456789b 12345678901234567890123456789 |
| 123456789012345678901234567890 - a123456789012345678901234567890b 123456789012345678901234567890 |
| 1234567890123456789012345678901 - a1234567890123456789012345678901b 1234567890123456789012345678901 |
| 12345678901234567890123456789012 - a12345678901234567890123456789012b 12345678901234567890123456789012 |
| 123456789012345678901234567890123 - a123456789012345678901234567890123b 123456789012345678901234567890123 |
| # and one really big one, beyond any plausible word width |
| 1234567890123456789012345678901234567890123456789012345678901234567890 - a1234567890123456789012345678901234567890123456789012345678901234567890b 1234567890123456789012345678901234567890123456789012345678901234567890 |
| # fish for problems as brackets go past 8 |
| [ab][cd][ef][gh][ij][kl][mn] - xacegikmoq acegikm |
| [ab][cd][ef][gh][ij][kl][mn][op] - xacegikmoq acegikmo |
| [ab][cd][ef][gh][ij][kl][mn][op][qr] - xacegikmoqy acegikmoq |
| [ab][cd][ef][gh][ij][kl][mn][op][q] - xacegikmoqy acegikmoq |
| |
| # subtleties of matching |
| abc & xabcy abc |
| a\(b\)?c\1d b acd |
| aBc i Abc Abc |
| a[Bc]*d i abBCcd abBCcd |
| 0[[:upper:]]1 &i 0a1 0a1 |
| 0[[:lower:]]1 &i 0A1 0A1 |
| a[^b]c &i abc |
| a[^b]c &i aBc |
| a[^b]c &i adc adc |
| [a]b[c] - abc abc |
| [a]b[a] - aba aba |
| [abc]b[abc] - abc abc |
| [abc]b[abd] - abd abd |
| a(b?c)+d - accd accd |
| (wee|week)(knights|night) - weeknights weeknights |
| (we|wee|week|frob)(knights|night|day) - weeknights weeknights |
| a[bc]d - xyzaaabcaababdacd abd |
| a[ab]c - aaabc abc |
| abc s abc abc |
| () s abc @abc |
| a* & b @b |
| |
| # Let's have some fun -- try to match a C comment. |
| # first the obvious, which looks okay at first glance... |
| /\*.*\*/ - /*x*/ /*x*/ |
| # but... |
| /\*.*\*/ - /*x*/y/*z*/ /*x*/y/*z*/ |
| # okay, we must not match */ inside; try to do that... |
| /\*([^*]|\*[^/])*\*/ - /*x*/ /*x*/ |
| /\*([^*]|\*[^/])*\*/ - /*x*/y/*z*/ /*x*/ |
| # but... |
| /\*([^*]|\*[^/])*\*/ - /*x**/y/*z*/ /*x**/y/*z*/ |
| # and a still fancier version, which does it right (I think)... |
| /\*([^*]|\*+[^*/])*\*+/ - /*x*/ /*x*/ |
| /\*([^*]|\*+[^*/])*\*+/ - /*x*/y/*z*/ /*x*/ |
| /\*([^*]|\*+[^*/])*\*+/ - /*x**/y/*z*/ /*x**/ |
| /\*([^*]|\*+[^*/])*\*+/ - /*x****/y/*z*/ /*x****/ |
| /\*([^*]|\*+[^*/])*\*+/ - /*x**x*/y/*z*/ /*x**x*/ |
| /\*([^*]|\*+[^*/])*\*+/ - /*x***x/y/*z*/ /*x***x/y/*z*/ |
| |
| # subexpressions |
| .* - abc abc - |
| a(b)(c)d - abcd abcd b,c |
| a(((b)))c - abc abc b,b,b |
| a(b|(c))d - abd abd b,- |
| a(b*|c|e)d - abbd abbd bb |
| a(b*|c|e)d - acd acd c |
| a(b*|c|e)d - ad ad @d |
| a(b?)c - abc abc b |
| a(b?)c - ac ac @c |
| a(b+)c - abc abc b |
| a(b+)c - abbbc abbbc bbb |
| a(b*)c - ac ac @c |
| (a|ab)(bc([de]+)f|cde) - abcdef abcdef a,bcdef,de |
| # the regression tester only asks for 9 subexpressions |
| a(b)(c)(d)(e)(f)(g)(h)(i)(j)k - abcdefghijk abcdefghijk b,c,d,e,f,g,h,i,j |
| a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l - abcdefghijkl abcdefghijkl b,c,d,e,f,g,h,i,j,k |
| a([bc]?)c - abc abc b |
| a([bc]?)c - ac ac @c |
| a([bc]+)c - abc abc b |
| a([bc]+)c - abcc abcc bc |
| a([bc]+)bc - abcbc abcbc bc |
| a(bb+|b)b - abb abb b |
| a(bbb+|bb+|b)b - abb abb b |
| a(bbb+|bb+|b)b - abbb abbb bb |
| a(bbb+|bb+|b)bb - abbb abbb b |
| (.*).* - abcdef abcdef abcdef |
| (a*)* - bc @b @b |
| |
| # do we get the right subexpression when it is used more than once? |
| a(b|c)*d - ad ad - |
| a(b|c)*d - abcd abcd c |
| a(b|c)+d - abd abd b |
| a(b|c)+d - abcd abcd c |
| a(b|c?)+d - ad ad @d |
| a(b|c?)+d - abcd abcd c |
| a(b|c){0,0}d - ad ad - |
| a(b|c){0,1}d - ad ad - |
| a(b|c){0,1}d - abd abd b |
| a(b|c){0,2}d - ad ad - |
| a(b|c){0,2}d - abcd abcd c |
| a(b|c){0,}d - ad ad - |
| a(b|c){0,}d - abcd abcd c |
| a(b|c){1,1}d - abd abd b |
| a(b|c){1,1}d - acd acd c |
| a(b|c){1,2}d - abd abd b |
| a(b|c){1,2}d - abcd abcd c |
| a(b|c){1,}d - abd abd b |
| a(b|c){1,}d - abcd abcd c |
| a(b|c){2,2}d - acbd acbd b |
| a(b|c){2,2}d - abcd abcd c |
| a(b|c){2,4}d - abcd abcd c |
| a(b|c){2,4}d - abcbd abcbd b |
| a(b|c){2,4}d - abcbcd abcbcd c |
| a(b|c){2,}d - abcd abcd c |
| a(b|c){2,}d - abcbd abcbd b |
| a(b+|((c)*))+d - abd abd b,-,- |
| a(b+|((c)*))+d - abcd abcd c,c,c |
| |
| # check out the STARTEND option |
| [abc] &# a(b)c b |
| [abc] &# a(d)c |
| [abc] &# a(bc)d b |
| [abc] &# a(dc)d c |
| . &# a()c |
| b.*c &# b(bc)c bc |
| b.* &# b(bc)c bc |
| .*c &# b(bc)c bc |
| |
| # plain strings, with the NOSPEC flag |
| abc m abc abc |
| abc m xabcy abc |
| abc m xyz |
| a*b m aba*b a*b |
| a*b m ab |
| "" mC EMPTY |
| |
| # cases involving NULs |
| aZb & a a |
| aZb &p a |
| aZb &p# (aZb) aZb |
| aZ*b &p# (ab) ab |
| a.b &# (aZb) aZb |
| a.* &# (aZb)c aZb |
| |
| # word boundaries (ick) |
| [[:<:]]a & a a |
| [[:<:]]a & ba |
| [[:<:]]a & -a a |
| a[[:>:]] & a a |
| a[[:>:]] & ab |
| a[[:>:]] & a- a |
| [[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc abc |
| [[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc-q abc |
| [[:<:]]a.c[[:>:]] & axc-dayc-dazce-abc axc |
| [[:<:]]b.c[[:>:]] & a_bxc-byc_d-bzc-q bzc |
| [[:<:]].x..[[:>:]] & y_xa_-_xb_y-_xc_-axdc _xc_ |
| [[:<:]]a_b[[:>:]] & x_a_b |
| |
| # past problems, and suspected problems |
| (A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A]) - A1 A1 |
| abcdefghijklmnop i abcdefghijklmnop abcdefghijklmnop |
| abcdefghijklmnopqrstuv i abcdefghijklmnopqrstuv abcdefghijklmnopqrstuv |
| (ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO ])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN]) - CC11 CC11 |
| CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a - CC11 CC11 |
| Char \([a-z0-9_]*\)\[.* b Char xyz[k Char xyz[k xyz |
| a?b - ab ab |
| -\{0,1\}[0-9]*$ b -5 -5 |
| a*a*a*a*a*a*a* & aaaaaa aaaaaa |
| (\b){0} - x @x - |
| \(\b\)\{0,0\} b abc @abc - |
| a(\b){0}c - ac ac - |
| a(.*)b(\1){0}c - abc abc @bc,- |
| a(.*)b(\1){0}c - axbc axbc x,- |
| |
| a\(\(b*\)\)c\1d b abbcbbd abbcbbd bb,bb |
| a\(\([bc]\)\)\2d b abcdabbd abbd b,b |
| a\(\(\(\([bc]\)\)\3\)\)*d b abbccd abbccd cc,cc,c,c |
| a(b)(c)d - abcd abcd b,c |
| a(((b)))c - abc abc b,b,b |
| a(((b|(((c))))))d - abd abd b,b,b,-,-,- |
| a(((b*|c|e)))d - abbd abbd bb,bb,bb |
| a((b|c)){0,0}d - ad ad -,- |
| a((b|c)){0,1}d - abd abd b,b |
| a((b|c)){0,2}d - abcd abcd c,c |
| a((b+|((c)*)))+d - abd abd b,b,-,- |
| a((b+|((c)*)))+d - abcd abcd c,c,c,c |
| (((\b))){0} - x @x -,-,- |
| a(((.*)))b((\2)){0}c - abc abc @bc,@bc,@bc,-,- |
| a(((.*)))b((\1)){0}c - axbc axbc x,x,x,-,- |
| |
| \b & SaT @aT |
| \b & aT @aT |
| a.*\b & abT ab |
| \b & STSS |
| \B & abc @bc |
| \B & aSbTc |
| \B & SaT @SaT |
| \B & aSTSb @TSb |
| |
| o$($|.) - oN |
| o$($|.) - op |
| o$($|.) - o o |