1# $OpenBSD: tests,v 1.5 2004/11/29 16:50:31 otto Exp $ 2# $NetBSD: tests,v 1.5 1995/04/20 22:40:00 cgd Exp $ 3 4# regular expression test set 5# Lines are at least three fields, separated by one or more tabs. "" stands 6# for an empty field. First field is an RE. Second field is flags. If 7# C flag given, regcomp() is expected to fail, and the third field is the 8# error name (minus the leading REG_). 9# 10# Otherwise it is expected to succeed, and the third field is the string to 11# try matching it against. If there is no fourth field, the match is 12# expected to fail. If there is a fourth field, it is the substring that 13# the RE is expected to match. If there is a fifth field, it is a comma- 14# separated list of what the subexpressions should match, with - indicating 15# no match for that one. In both the fourth and fifth fields, a (sub)field 16# starting with @ indicates that the (sub)expression is expected to match 17# a null string followed by the stuff after the @; this provides a way to 18# test where null strings match. The character `N' in REs and strings 19# is newline, `S' is space, `T' is tab, `Z' is NUL. 20# 21# The full list of flags: 22# - placeholder, does nothing 23# b RE is a BRE, not an ERE 24# & try it as both an ERE and a BRE 25# C regcomp() error expected, third field is error name 26# i REG_ICASE 27# m ("mundane") REG_NOSPEC 28# s REG_NOSUB (not really testable) 29# n REG_NEWLINE 30# ^ REG_NOTBOL 31# $ REG_NOTEOL 32# # REG_STARTEND (see below) 33# p REG_PEND 34# 35# For REG_STARTEND, the start/end offsets are those of the substring 36# enclosed in (). 37 38# basics 39a & a a 40abc & abc abc 41abc|de - abc abc 42a|b|c - abc a 43 44# parentheses and perversions thereof 45a(b)c - abc abc 46a\(b\)c b abc abc 47a( C EPAREN 48a( b a( a( 49a\( - a( a( 50a\( bC EPAREN 51a\(b bC EPAREN 52a(b C EPAREN 53a(b b a(b a(b 54# gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly) 55a) - a) a) 56) - ) ) 57# end gagging (in a just world, those *should* give EPAREN) 58a) b a) a) 59a\) bC EPAREN 60\) bC EPAREN 61a()b - ab ab 62a\(\)b b ab ab 63 64# anchoring and REG_NEWLINE 65^abc$ & abc abc 66a^b - a^b 67a^b b a^b a^b 68a$b - a$b 69a$b b a$b a$b 70^ & abc @abc 71$ & abc @ 72^$ & "" @ 73$^ - "" @ 74\($\)\(^\) b "" @ 75# stop retching, those are legitimate (although disgusting) 76^^ - "" @ 77$$ - "" @ 78b$ & abNc 79b$ &n abNc b 80^b$ & aNbNc 81^b$ &n aNbNc b 82^$ &n aNNb @Nb 83^$ n abc 84^$ n abcN @ 85$^ n aNNb @Nb 86\($\)\(^\) bn aNNb @Nb 87^^ n^ aNNb @Nb 88$$ n aNNb @NN 89^a ^ a 90a$ $ a 91^a ^n aNb 92^b ^n aNb b 93a$ $n bNa 94b$ $n bNa b 95a*(^b$)c* - b b 96a*\(^b$\)c* b b b 97 98# certain syntax errors and non-errors 99| C EMPTY 100| b | | 101* C BADRPT 102* b * * 103+ C BADRPT 104? C BADRPT 105"" &C EMPTY 106() - abc @abc 107\(\) b abc @abc 108a||b C EMPTY 109|ab C EMPTY 110ab| C EMPTY 111(|a)b C EMPTY 112(a|)b C EMPTY 113(*a) C BADRPT 114(+a) C BADRPT 115(?a) C BADRPT 116({1}a) C BADRPT 117\(\{1\}a\) bC BADRPT 118(a|*b) C BADRPT 119(a|+b) C BADRPT 120(a|?b) C BADRPT 121(a|{1}b) C BADRPT 122^* C BADRPT 123^* b * * 124^+ C BADRPT 125^? C BADRPT 126^{1} C BADRPT 127^\{1\} bC BADRPT 128 129# metacharacters, backslashes 130a.c & abc abc 131a[bc]d & abd abd 132a\*c & a*c a*c 133a\\b & a\b a\b 134a\\\*b & a\*b a\*b 135a\bc & abc abc 136a\ &C EESCAPE 137a\\bc & a\bc a\bc 138\{ bC BADRPT 139a\[b & a[b a[b 140a[b &C EBRACK 141# trailing $ is a peculiar special case for the BRE code 142a$ & a a 143a$ & a$ 144a\$ & a 145a\$ & a$ a$ 146a\\$ & a 147a\\$ & a$ 148a\\$ & a\$ 149a\\$ & a\ a\ 150 151# back references, ugh 152a\(b\)\2c bC ESUBREG 153a\(b\1\)c bC ESUBREG 154a\(b*\)c\1d b abbcbbd abbcbbd bb 155a\(b*\)c\1d b abbcbd 156a\(b*\)c\1d b abbcbbbd 157^\(.\)\1 b abc 158a\([bc]\)\1d b abcdabbd abbd b 159a\(\([bc]\)\2\)*d b abbccd abbccd 160a\(\([bc]\)\2\)*d b abbcbd 161# actually, this next one probably ought to fail, but the spec is unclear 162a\(\(b\)*\2\)*d b abbbd abbbd 163# here is a case that no NFA implementation does right 164\(ab*\)[ab]*\1 b ababaaa ababaaa a 165# check out normal matching in the presence of back refs 166\(a\)\1bcd b aabcd aabcd 167\(a\)\1bc*d b aabcd aabcd 168\(a\)\1bc*d b aabd aabd 169\(a\)\1bc*d b aabcccd aabcccd 170\(a\)\1bc*[ce]d b aabcccd aabcccd 171^\(a\)\1b\(c\)*cd$ b aabcccd aabcccd 172\(b*\)\(a*\1\)* b ab a 173\([^_]*\)\(_*\1\)* b foo_foo_bar_bar_bar_baz foo_foo foo,_foo 174\([^_]*\)\(_*\1\)* b bar_bar_bar_baz bar_bar_bar bar,_bar 175\([^_]*\)\(_*\1\)* b foo_bar_baz foo foo 176\(.*\)\1 b "" "" 177\(.*\)\1 b a "" 178\(.*\)\1 b aa aa 179\(.*\)\1 b aaa aa 180\(.*\)\1 b aaaa aaaa 181\([^_]*\)\1 b "" "" 182\([^_]*\)\1 b a "" 183\([^_]*\)\1 b aa aa 184\([^_]*\)\1 b aaa aa 185\([^_]*\)\1 b aaaa aaaa 186foo\(.*\)bar\1 b foolbarl foolbarl l 187foo\(.*\)bar\1 b foobar foobar "" 188\(\(.\)b\)*\1 b aba 189\(\(.\)b\)*\1 b abba 190\(\(.\)b\)*\1 b abbba 191\(\(.\)b\)*\1 b abbbba bbbb bb,b 192\(\(.\)b\)*\1 b abbbbba abbbbb bb,b 193\(\(.\)b\)*\1 b abbbbbba abbbbb bb,b 194\(\(.\)b\)*\1 b abbbbbbbbbbbbbba abbbbbbbbbbbbb bb,b 195\(\(.\)b\)*\1 b abbbbbbbbbbbbbbba abbbbbbbbbbbbbbb bb,b 196 197# ordinary repetitions 198ab*c & abc abc 199ab+c - abc abc 200ab?c - abc abc 201a\(*\)b b a*b a*b 202a\(**\)b b ab ab 203a\(***\)b bC BADRPT 204*a b *a *a 205**a b a a 206***a bC BADRPT 207 208# the dreaded bounded repetitions 209{ & { { 210{abc & {abc {abc 211{1 C BADRPT 212{1} C BADRPT 213a{b & a{b a{b 214a{1}b - ab ab 215a\{1\}b b ab ab 216a{1,}b - ab ab 217a\{1,\}b b ab ab 218a{1,2}b - aab aab 219a\{1,2\}b b aab aab 220a{1 C EBRACE 221a\{1 bC EBRACE 222a{1a C EBRACE 223a\{1a bC EBRACE 224a{1a} C BADBR 225a\{1a\} bC BADBR 226a{,2} - a{,2} a{,2} 227a\{,2\} bC BADBR 228a{,} - a{,} a{,} 229a\{,\} bC BADBR 230a{1,x} C BADBR 231a\{1,x\} bC BADBR 232a{1,x C EBRACE 233a\{1,x bC EBRACE 234a{300} C BADBR 235a\{300\} bC BADBR 236a{1,0} C BADBR 237a\{1,0\} bC BADBR 238ab{0,0}c - abcac ac 239ab\{0,0\}c b abcac ac 240ab{0,1}c - abcac abc 241ab\{0,1\}c b abcac abc 242ab{0,3}c - abbcac abbc 243ab\{0,3\}c b abbcac abbc 244ab{1,1}c - acabc abc 245ab\{1,1\}c b acabc abc 246ab{1,3}c - acabc abc 247ab\{1,3\}c b acabc abc 248ab{2,2}c - abcabbc abbc 249ab\{2,2\}c b abcabbc abbc 250ab{2,4}c - abcabbc abbc 251ab\{2,4\}c b abcabbc abbc 252((a{1,10}){1,10}){1,10} - a a a,a 253 254# multiple repetitions 255a** &C BADRPT 256a++ C BADRPT 257a?? C BADRPT 258a*+ C BADRPT 259a*? C BADRPT 260a+* C BADRPT 261a+? C BADRPT 262a?* C BADRPT 263a?+ C BADRPT 264a{1}{1} C BADRPT 265a*{1} C BADRPT 266a+{1} C BADRPT 267a?{1} C BADRPT 268a{1}* C BADRPT 269a{1}+ C BADRPT 270a{1}? C BADRPT 271a*{b} - a{b} a{b} 272a\{1\}\{1\} bC BADRPT 273a*\{1\} bC BADRPT 274a\{1\}* bC BADRPT 275 276# brackets, and numerous perversions thereof 277a[b]c & abc abc 278a[ab]c & abc abc 279a[^ab]c & adc adc 280a[]b]c & a]c a]c 281a[[b]c & a[c a[c 282a[-b]c & a-c a-c 283a[^]b]c & adc adc 284a[^-b]c & adc adc 285a[b-]c & a-c a-c 286a[b &C EBRACK 287a[] &C EBRACK 288a[1-3]c & a2c a2c 289a[3-1]c &C ERANGE 290a[1-3-5]c &C ERANGE 291a[[.-.]--]c & a-c a-c 292a[1- &C ERANGE 293a[[. &C EBRACK 294a[[.x &C EBRACK 295a[[.x. &C EBRACK 296a[[.x.] &C EBRACK 297a[[.x.]] & ax ax 298a[[.x,.]] &C ECOLLATE 299a[[.one.]]b & a1b a1b 300a[[.notdef.]]b &C ECOLLATE 301a[[.].]]b & a]b a]b 302a[[:alpha:]]c & abc abc 303a[[:notdef:]]c &C ECTYPE 304a[[: &C EBRACK 305a[[:alpha &C EBRACK 306a[[:alpha:] &C EBRACK 307a[[:alpha,:] &C ECTYPE 308a[[:]:]]b &C ECTYPE 309a[[:-:]]b &C ECTYPE 310a[[:alph:]] &C ECTYPE 311a[[:alphabet:]] &C ECTYPE 312[[:alnum:]]+ - -%@a0X- a0X 313[[:alpha:]]+ - -%@aX0- aX 314[[:blank:]]+ - aSSTb SST 315[[:cntrl:]]+ - aNTb NT 316[[:digit:]]+ - a019b 019 317[[:graph:]]+ - Sa%bS a%b 318[[:lower:]]+ - AabC ab 319[[:print:]]+ - NaSbN aSb 320[[:punct:]]+ - S%-&T %-& 321[[:space:]]+ - aSNTb SNT 322[[:upper:]]+ - aBCd BC 323[[:xdigit:]]+ - p0f3Cq 0f3C 324a[[=b=]]c & abc abc 325a[[= &C EBRACK 326a[[=b &C EBRACK 327a[[=b= &C EBRACK 328a[[=b=] &C EBRACK 329a[[=b,=]] &C ECOLLATE 330a[[=one=]]b & a1b a1b 331 332# complexities 333a(((b)))c - abc abc 334a(b|(c))d - abd abd 335a(b*|c)d - abbd abbd 336# just gotta have one DFA-buster, of course 337a[ab]{20} - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab 338# and an inline expansion in case somebody gets tricky 339a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab 340# and in case somebody just slips in an NFA... 341a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night) - aaaaabaaaabaaaabaaaabweeknights aaaaabaaaabaaaabaaaabweeknights 342# fish for anomalies as the number of states passes 32 34312345678901234567890123456789 - a12345678901234567890123456789b 12345678901234567890123456789 344123456789012345678901234567890 - a123456789012345678901234567890b 123456789012345678901234567890 3451234567890123456789012345678901 - a1234567890123456789012345678901b 1234567890123456789012345678901 34612345678901234567890123456789012 - a12345678901234567890123456789012b 12345678901234567890123456789012 347123456789012345678901234567890123 - a123456789012345678901234567890123b 123456789012345678901234567890123 348# and one really big one, beyond any plausible word width 3491234567890123456789012345678901234567890123456789012345678901234567890 - a1234567890123456789012345678901234567890123456789012345678901234567890b 1234567890123456789012345678901234567890123456789012345678901234567890 350# fish for problems as brackets go past 8 351[ab][cd][ef][gh][ij][kl][mn] - xacegikmoq acegikm 352[ab][cd][ef][gh][ij][kl][mn][op] - xacegikmoq acegikmo 353[ab][cd][ef][gh][ij][kl][mn][op][qr] - xacegikmoqy acegikmoq 354[ab][cd][ef][gh][ij][kl][mn][op][q] - xacegikmoqy acegikmoq 355 356# subtleties of matching 357abc & xabcy abc 358a\(b\)?c\1d b acd 359aBc i Abc Abc 360a[Bc]*d i abBCcd abBCcd 3610[[:upper:]]1 &i 0a1 0a1 3620[[:lower:]]1 &i 0A1 0A1 363a[^b]c &i abc 364a[^b]c &i aBc 365a[^b]c &i adc adc 366[a]b[c] - abc abc 367[a]b[a] - aba aba 368[abc]b[abc] - abc abc 369[abc]b[abd] - abd abd 370a(b?c)+d - accd accd 371(wee|week)(knights|night) - weeknights weeknights 372(we|wee|week|frob)(knights|night|day) - weeknights weeknights 373a[bc]d - xyzaaabcaababdacd abd 374a[ab]c - aaabc abc 375abc s abc abc 376a* & b @b 377 378# Let's have some fun -- try to match a C comment. 379# first the obvious, which looks okay at first glance... 380/\*.*\*/ - /*x*/ /*x*/ 381# but... 382/\*.*\*/ - /*x*/y/*z*/ /*x*/y/*z*/ 383# okay, we must not match */ inside; try to do that... 384/\*([^*]|\*[^/])*\*/ - /*x*/ /*x*/ 385/\*([^*]|\*[^/])*\*/ - /*x*/y/*z*/ /*x*/ 386# but... 387/\*([^*]|\*[^/])*\*/ - /*x**/y/*z*/ /*x**/y/*z*/ 388# and a still fancier version, which does it right (I think)... 389/\*([^*]|\*+[^*/])*\*+/ - /*x*/ /*x*/ 390/\*([^*]|\*+[^*/])*\*+/ - /*x*/y/*z*/ /*x*/ 391/\*([^*]|\*+[^*/])*\*+/ - /*x**/y/*z*/ /*x**/ 392/\*([^*]|\*+[^*/])*\*+/ - /*x****/y/*z*/ /*x****/ 393/\*([^*]|\*+[^*/])*\*+/ - /*x**x*/y/*z*/ /*x**x*/ 394/\*([^*]|\*+[^*/])*\*+/ - /*x***x/y/*z*/ /*x***x/y/*z*/ 395 396# subexpressions 397a(b)(c)d - abcd abcd b,c 398a(((b)))c - abc abc b,b,b 399a(b|(c))d - abd abd b,- 400a(b*|c|e)d - abbd abbd bb 401a(b*|c|e)d - acd acd c 402a(b*|c|e)d - ad ad @d 403a(b?)c - abc abc b 404a(b?)c - ac ac @c 405a(b+)c - abc abc b 406a(b+)c - abbbc abbbc bbb 407a(b*)c - ac ac @c 408(a|ab)(bc([de]+)f|cde) - abcdef abcdef a,bcdef,de 409# the regression tester only asks for 9 subexpressions 410a(b)(c)(d)(e)(f)(g)(h)(i)(j)k - abcdefghijk abcdefghijk b,c,d,e,f,g,h,i,j 411a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l - abcdefghijkl abcdefghijkl b,c,d,e,f,g,h,i,j,k 412a([bc]?)c - abc abc b 413a([bc]?)c - ac ac @c 414a([bc]+)c - abc abc b 415a([bc]+)c - abcc abcc bc 416a([bc]+)bc - abcbc abcbc bc 417a(bb+|b)b - abb abb b 418a(bbb+|bb+|b)b - abb abb b 419a(bbb+|bb+|b)b - abbb abbb bb 420a(bbb+|bb+|b)bb - abbb abbb b 421(.*).* - abcdef abcdef abcdef 422(a*)* - bc @b @b 423 424# do we get the right subexpression when it is used more than once? 425a(b|c)*d - ad ad - 426a(b|c)*d - abcd abcd c 427a(b|c)+d - abd abd b 428a(b|c)+d - abcd abcd c 429a(b|c?)+d - ad ad @d 430a(b|c?)+d - abcd abcd @d 431a(b|c){0,0}d - ad ad - 432a(b|c){0,1}d - ad ad - 433a(b|c){0,1}d - abd abd b 434a(b|c){0,2}d - ad ad - 435a(b|c){0,2}d - abcd abcd c 436a(b|c){0,}d - ad ad - 437a(b|c){0,}d - abcd abcd c 438a(b|c){1,1}d - abd abd b 439a(b|c){1,1}d - acd acd c 440a(b|c){1,2}d - abd abd b 441a(b|c){1,2}d - abcd abcd c 442a(b|c){1,}d - abd abd b 443a(b|c){1,}d - abcd abcd c 444a(b|c){2,2}d - acbd acbd b 445a(b|c){2,2}d - abcd abcd c 446a(b|c){2,4}d - abcd abcd c 447a(b|c){2,4}d - abcbd abcbd b 448a(b|c){2,4}d - abcbcd abcbcd c 449a(b|c){2,}d - abcd abcd c 450a(b|c){2,}d - abcbd abcbd b 451a(b+|((c)*))+d - abd abd @d,@d,- 452a(b+|((c)*))+d - abcd abcd @d,@d,- 453 454# check out the STARTEND option 455[abc] &# a(b)c b 456[abc] &# a(d)c 457[abc] &# a(bc)d b 458[abc] &# a(dc)d c 459. &# a()c 460b.*c &# b(bc)c bc 461b.* &# b(bc)c bc 462.*c &# b(bc)c bc 463 464# plain strings, with the NOSPEC flag 465abc m abc abc 466abc m xabcy abc 467abc m xyz 468a*b m aba*b a*b 469a*b m ab 470"" mC EMPTY 471 472# cases involving NULs 473aZb & a a 474aZb &p a 475aZb &p# (aZb) aZb 476aZ*b &p# (ab) ab 477a.b &# (aZb) aZb 478a.* &# (aZb)c aZb 479 480# word boundaries (ick) 481[[:<:]]a & a a 482[[:<:]]a & ba 483[[:<:]]a & -a a 484a[[:>:]] & a a 485a[[:>:]] & ab 486a[[:>:]] & a- a 487[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc abc 488[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc-q abc 489[[:<:]]a.c[[:>:]] & axc-dayc-dazce-abc axc 490[[:<:]]b.c[[:>:]] & a_bxc-byc_d-bzc-q bzc 491[[:<:]].x..[[:>:]] & y_xa_-_xb_y-_xc_-axdc _xc_ 492[[:<:]]a_b[[:>:]] & x_a_b 493 494# past problems, and suspected problems 495(A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A]) - A1 A1 496abcdefghijklmnop i abcdefghijklmnop abcdefghijklmnop 497abcdefghijklmnopqrstuv i abcdefghijklmnopqrstuv abcdefghijklmnopqrstuv 498(ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO ])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN]) - CC11 CC11 499CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a - CC11 CC11 500Char \([a-z0-9_]*\)\[.* b Char xyz[k Char xyz[k xyz 501a?b - ab ab 502-\{0,1\}[0-9]*$ b -5 -5 503 504