1strhas(s: string, c: int): ref Val 2{ 3 for(i := 0; i < len s; i++) 4 if(s[i] == c) 5 return true; 6 return false; 7} 8 9rsplit(r: string): (string, string) 10{ 11 esc := 0; 12 i := 1; # skip '/' 13 for(;;){ 14 c := r[i++]; 15 if(!esc && c == '/') 16 break; 17 esc = !esc && c == '\\'; 18 } 19 return (r[1: i-1], r[i: ]); 20} 21 22badflags(f: string): int 23{ 24 g := i := m := 0; 25 for(j := 0; j < len f; j++){ 26 case(f[j]){ 27 'g' => 28 g++; 29 'i' => 30 i++; 31 'm' => 32 m++; 33 * => 34 return 1; 35 } 36 } 37 return g > 1 || i > 1 || m > 1; 38} 39 40regexpvals(ex: ref Exec, v: ref Val, o: ref Ecmascript->Obj): (string, string, int) 41{ 42 if(v != nil){ 43 if(v.ty == TRegExp) 44 return (v.rev.p, v.rev.f, v.rev.i); 45 o = v.obj; 46 } 47 p := toString(ex, esget(ex, o, "source", 0)); 48 f := ""; 49 if(toBoolean(ex, esget(ex, o, "global", 0)) == true) 50 f += "g"; 51 if(toBoolean(ex, esget(ex, o, "ignoreCase", 0)) == true) 52 f += "i"; 53 if(toBoolean(ex, esget(ex, o, "multiline", 0)) == true) 54 f += "m"; 55 i := toInt32(ex, esget(ex, o, "lastIndex", 0)); 56 return (p, f, i); 57} 58 59nregexp(ex: ref Exec, nil: ref Ecmascript->Obj, args: array of ref Val): ref Ecmascript->Obj 60{ 61 pat := biarg(args, 0); 62 flags := biarg(args, 1); 63 (p, f) := ("", ""); 64 if(isregexp(pat)){ 65 if(flags == undefined) 66 (p, f, nil) = regexpvals(ex, pat, nil); 67 else 68 runtime(ex, TypeError, "flags defined"); 69 } 70 else{ 71 if(pat == undefined) 72 p = ""; 73 else 74 p = toString(ex, pat); 75 if(flags == undefined) 76 f = ""; 77 else 78 f = toString(ex, flags); 79 } 80 o := nobj(ex, nil, array[] of { regexpval(p, f, 0) }); 81 if(badflags(f)) 82 runtime(ex, SyntaxError, "bad regexp flags"); 83 regex = ex; 84 (re, err) := compile(p, 1); 85 if(re == nil || err != nil) 86 runtime(ex, SyntaxError, "bad regexp pattern"); 87 o.re = re; 88 return o; 89} 90 91cregexp(ex: ref Exec, f, nil: ref Ecmascript->Obj, args: array of ref Val): ref Val 92{ 93 pat := biarg(args, 0); 94 flags := biarg(args, 1); 95 if(isregexp(pat) && flags == undefined) 96 return pat; 97 return objval(nregexp(ex, f, args)); 98} 99 100cregexpprotoexec(ex: ref Exec, f, this: ref Ecmascript->Obj, args: array of ref Val): ref Val 101{ 102 m: array of (int, int); 103 104 regexpcheck(ex, this, f); 105 s := toString(ex, biarg(args, 0)); 106 l := len s; 107 i := toInt32(ex, esget(ex, this, "lastIndex", 0)); 108 e := 0; 109 glob := esget(ex, this, "global", 0); 110 multiline := esget(ex, this, "multiline", 0); 111 ignorecase := esget(ex, this, "ignoreCase", 0); 112 if(glob == false) 113 i = 0; 114 for(;;){ 115 if(i < 0 || i >= l){ 116 esput(ex, this, "lastIndex", numval(real 0), 0); 117 return null; 118 } 119 regex = ex; 120 m = executese(this.re, s, (i, len s), i == 0, 1, multiline == true, ignorecase == true); 121 if(m != nil) 122 break; 123 i++; 124 i = -1; # no need to loop with executese 125 } 126 (i, e) = m[0]; 127 if(glob == true) 128 esput(ex, this, "lastIndex", numval(real e), 0); 129 n := len m; 130 av := array[n] of ref Val; 131 for(j := 0; j < n; j++){ 132 (a, b) := m[j]; 133 if(a < 0) 134 av[j] = undefined; 135 else 136 av[j] = strval(s[a: b]); 137 } 138 a := narray(ex, nil, av); 139 esput(ex, a, "index", numval(real i), 0); 140 esput(ex, a, "input", strval(s), 0); 141 return objval(a); 142} 143 144cregexpprototest(ex: ref Exec, f, this: ref Ecmascript->Obj, args: array of ref Val): ref Val 145{ 146 regexpcheck(ex, this, f); 147 v := cregexpprotoexec(ex, f, this, args); 148 if(!isnull(v)) 149 return true; 150 return false; 151} 152 153cregexpprototoString(ex: ref Exec, f, this: ref Ecmascript->Obj, nil: array of ref Val): ref Val 154{ 155 regexpcheck(ex, this, f); 156 (p, fl, nil) := regexpvals(ex, nil, this); 157 return strval("/" + p + "/" + fl); 158} 159 160regexpcheck(ex: ref Exec, o: ref Ecmascript->Obj, f: ref Obj) 161{ 162 if(f == nil) 163 s := "exec"; 164 else 165 s = f.val.str; 166 if(!isregexpobj(o)) 167 runtime(ex, TypeError, "RegExp.prototype." + s + " called on non-RegExp object"); 168} 169 170cstrprotomatch(ex: ref Exec, nil, this: ref Ecmascript->Obj, args: array of ref Val): ref Val 171{ 172 v := biarg(args, 0); 173 if(!isregexp(v)) 174 re := nregexp(ex, nil, args); 175 else if(v.ty == TObj) 176 re = v.obj; 177 else 178 re = nobj(ex, nil, args); 179 s := toString(ex, this.val); 180 glob := esget(ex, re, "global", 0); 181 av := array[1] of ref Val; 182 av[0] = strval(s); 183 if(glob == false) 184 return cregexpprotoexec(ex, nil, re, av); 185 li := 0; 186 esput(ex, re, "lastIndex", numval(real li), 0); 187 ms: list of ref Val; 188 for(;;){ 189 v = cregexpprotoexec(ex, nil, re, av); 190 if(isnull(v)) 191 break; 192 ms = esget(ex, v.obj, "0", 0) :: ms; 193 ni := int toUint32(ex, esget(ex, re, "lastIndex", 0)); 194 if(ni == li) 195 esput(ex, re, "lastIndex", numval(real ++li), 0); 196 else 197 li = ni; 198 } 199 n := len ms; 200 av = array[n] of ref Val; 201 for(j := n-1; j >= 0; j--){ 202 av[j] = hd ms; 203 ms = tl ms; 204 } 205 return objval(narray(ex, nil, av)); 206} 207 208cstrprotoreplace(ex: ref Exec, nil, this: ref Ecmascript->Obj, args: array of ref Val): ref Val 209{ 210 re: ref Ecmascript->Obj; 211 212 v := biarg(args, 0); 213 rege := isregexp(v); 214 if(!rege){ 215 if(args == nil) 216 re = nregexp(ex, nil, args); 217 else 218 re = nregexp(ex, nil, args[0:1]); 219 } 220 else if(v.ty == TObj) 221 re = v.obj; 222 else 223 re = nobj(ex, nil, args); 224 s := toString(ex, this.val); 225 if(rege) 226 glob := esget(ex, re, "global", 0); 227 else 228 glob = false; 229 av := array[1] of ref Val; 230 av[0] = strval(s); 231 ms: list of ref Val; 232 li := 0; 233 if(glob == true) 234 esput(ex, re, "lastIndex", numval(real li), 0); 235 for(;;){ 236 v = cregexpprotoexec(ex, nil, re, av); 237 if(!isnull(v)) 238 ms = v :: ms; 239 if(isnull(v) || glob == false) 240 break; 241 ni := int toUint32(ex, esget(ex, re, "lastIndex", 0)); 242 if(ni == li) 243 esput(ex, re, "lastIndex", numval(real ++li), 0); 244 else 245 li = ni; 246 } 247 if(ms == nil) 248 return strval(s); 249 ms = rev(ms); 250 if(rege) 251 lcp := int toUint32(ex, esget(ex, (hd ms).obj, "length", 0))-1; 252 else 253 lcp = 0; 254 v = biarg(args, 1); 255 if(isobj(v) && isfuncobj(v.obj)){ 256 ns := s; 257 n := len ms; 258 args = array[lcp+3] of ref Val; 259 o := inc := 0; 260 for(i := 0; i < n; i++){ 261 a := (hd ms).obj; 262 ms = tl ms; 263 for(j := 0; j <= lcp; j++) 264 args[j] = esget(ex, a, string j, 0); 265 ss := toString(ex, args[0]); 266 o = offset(ss, s, o); 267 args[lcp+1] = numval(real o); 268 args[lcp+2] = strval(s); 269 rs := toString(ex, getValue(ex, escall(ex, v.obj, nil, args, 0))); 270 ns = repl(ns, o+inc, o+inc+len ss, rs); 271 o += len ss; 272 inc += len rs - len ss; 273 } 274 return strval(ns); 275 } 276 else{ 277 ps := toString(ex, v); 278 lps := len ps; 279 ns := s; 280 n := len ms; 281 o := inc := 0; 282 for(i := 0; i < n; i++){ 283 a := (hd ms).obj; 284 ms = tl ms; 285 ss := toString(ex, esget(ex, a, "0", 0)); 286 o = offset(ss, s, o); 287 rs := ""; 288 for(j := 0; j < lps; j++){ 289 if(ps[j] == '$' && j < lps-1){ 290 j++; 291 case(c := ps[j]){ 292 '$' => 293 rs += "$"; 294 '&' => 295 rs += ss; 296 '`' => 297 rs += s[0: o]; 298 ''' => 299 rs += s[o+len ss: ]; 300 '0' to '9' => 301 if(j < lps-1 && isdigit(ps[j+1])) 302 c = 10*(c-'0')+ps[++j]-'0'; 303 else 304 c = c-'0'; 305 if(c >= 1 && c <= lcp) 306 rs += toString(ex, esget(ex, a, string c, 0)); 307 } 308 } 309 else 310 rs += ps[j: j+1]; 311 } 312 ns = repl(ns, o+inc, o+inc+len ss, rs); 313 o += len ss; 314 inc += len rs - len ss; 315 } 316 return strval(ns); 317 } 318} 319 320cstrprotosearch(ex: ref Exec, nil, this: ref Ecmascript->Obj, args: array of ref Val): ref Val 321{ 322 v := biarg(args, 0); 323 if(!isregexp(v)) 324 re := nregexp(ex, nil, args); 325 else if(v.ty == TObj) 326 re = v.obj; 327 else 328 re = nobj(ex, nil, args); 329 s := toString(ex, this.val); 330 glob := esget(ex, re, "global", 0); 331 esput(ex, re, "global", false, 0); 332 av := array[1] of ref Val; 333 av[0] = strval(s); 334 v = cregexpprotoexec(ex, nil, re, av); 335 if(isnull(v)) 336 r := -1; 337 else{ 338 ss := toString(ex, esget(ex, v.obj, "0", 0)); 339 r = offset(ss, s, 0); 340 } 341 esput(ex, re, "global", glob, 0); 342 return numval(real r); 343} 344 345offset(ss: string, s: string, m: int): int 346{ 347 nn := len ss; 348 n := len s; 349 for(i := m; i <= n-nn; i++){ 350 if(s[i: i+nn] == ss) 351 return i; 352 } 353 return -1; 354} 355 356repl(s: string, a: int, b: int, ns: string): string 357{ 358 return s[0: a] + ns + s[b: ]; 359} 360 361rev(ls: list of ref Val): list of ref Val 362{ 363 ns: list of ref Val; 364 365 for( ; ls != nil; ls = tl ls) 366 ns = hd ls :: ns; 367 return ns; 368} 369 370######################################################################### 371# regex.b originally 372 373# normally imported identifiers 374 375# internal identifiers, not normally imported 376 377ALT, CAT, DOT, SET, HAT, DOL, NUL, PCLO, CLO, OPT, LPN, RPN, LPN0, RPN0, LPN1, RPN1, LPN2, RPN2, BEET, BEEF, MNCLO, LCP, IDLE: con (1<<16)+iota; 378 379# syntax 380 381# RE ALT regular expression 382# NUL 383# ALT CAT alternation 384# CAT | ALT 385# 386# CAT DUP catenation 387# DUP CAT 388# 389# DUP PRIM possibly duplicated primary 390# PCLO 391# CLO 392# OPT 393# 394# PCLO PRIM + 1 or more 395# CLO PRIM * 0 or more 396# OPT PRIM ? 0 or 1 397# 398# PRIM ( RE ) 399# () 400# DOT any character 401# CHAR a single character 402# ESC escape sequence 403# [ SET ] character set 404# NUL null string 405# HAT beginning of string 406# DOL end of string 407# 408 409regex: ref Exec; 410 411NIL : con -1; # a refRex constant 412NONE: con -2; # ditto, for an un-set value 413BAD: con 1<<16; # a non-character 414HUGE: con (1<<31) - 1; 415 416# the data structures of re.m would like to be ref-linked, but are 417# circular (see fn walk), thus instead of pointers we use indexes 418# into an array (arena) of nodes of the syntax tree of a regular expression. 419# from a storage-allocation standpoint, this replaces many small 420# allocations of one size with one big one of variable size. 421 422ReStr: adt { 423 s : string; 424 i : int; # cursor postion 425 n : int; # number of chars left; -1 on error 426 peek : fn(s: self ref ReStr): int; 427 next : fn(s: self ref ReStr): int; 428 unput: fn(s: self ref ReStr); 429}; 430 431ReStr.peek(s: self ref ReStr): int 432{ 433 if(s.n <= 0) 434 return BAD; 435 return s.s[s.i]; 436} 437 438ReStr.next(s: self ref ReStr): int 439{ 440 if(s.n <= 0) 441 syntax("bad regular expression"); 442 s.n--; 443 return s.s[s.i++]; 444} 445 446ReStr.unput(s: self ref ReStr) 447{ 448 s.n++; 449 s.i--; 450} 451 452newRe(kind: int, left, right: refRex, set: ref Set, ar: ref Arena, pno: int, greedy: int): refRex 453{ 454 ar.rex[ar.ptr] = Rex(kind, left, right, set, pno, greedy, nil); 455 return ar.ptr++; 456} 457 458# parse a regex by recursive descent to get a syntax tree 459 460re(s: ref ReStr, ar: ref Arena): refRex 461{ 462 left := cat(s, ar); 463 if(left==NIL || s.peek()!='|') 464 return left; 465 s.next(); 466 right := re(s, ar); 467 if(right == NIL) 468 return NIL; 469 return newRe(ALT, left, right, nil, ar, 0, 0); 470} 471 472cat(s: ref ReStr, ar: ref Arena): refRex 473{ 474 left := dup(s, ar); 475 if(left == NIL) 476 return left; 477 right := cat(s, ar); 478 if(right == NIL) 479 return left; 480 return newRe(CAT, left, right, nil, ar, 0, 0); 481} 482 483dup(s: ref ReStr, ar: ref Arena): refRex 484{ 485 n1, n2: int; 486 487 case s.peek() { 488 BAD or ')' or ']' or '|' or '?' or '*' or '+' => 489 return NIL; 490 } 491 prim: refRex; 492 case kind:=s.next() { 493 '(' => if(ar.pno < 0) { 494 if(s.peek() == ')') { 495 s.next(); 496 prim = newRe(NUL, NONE, NONE, nil, ar, 0, 0); 497 } else { 498 prim = re(s, ar); 499 if(prim==NIL || s.next()!=')') 500 syntax("( with no )"); 501 } 502 } else { 503 pno := ++ar.pno; 504 lp := newRe(LPN, NONE, NONE, nil, ar, pno, 0); 505 rp := newRe(RPN, NONE, NONE, nil, ar, pno, 0); 506 if(s.peek() == ')') { 507 s.next(); 508 prim = newRe(CAT, lp, rp, nil, ar, 0, 0); 509 } else { 510 if(s.peek() == '?'){ 511 s.next(); 512 case s.next(){ 513 ':' => ar.rex[lp].kind = LPN0; 514 ar.rex[rp].kind = RPN0; 515 '=' => ar.rex[lp].kind = LPN1; 516 ar.rex[rp].kind = RPN1; 517 '!' => ar.rex[lp].kind = LPN2; 518 ar.rex[rp].kind = RPN2; 519 * => syntax("bad char after ?"); 520 } 521 } 522 prim = re(s, ar); 523 if(prim==NIL || s.next()!=')') 524 syntax("( with no )"); 525 else { 526 prim = newRe(CAT, prim, rp, nil, ar, 0, 0); 527 prim = newRe(CAT, lp, prim, nil, ar, 0, 0); 528 } 529 } 530 } 531 '[' => prim = newRe(SET, NONE, NONE, newSet(s), ar, 0, 0); 532 * => case kind { 533 '.' => kind = DOT; 534 '^' => kind = HAT; 535 '$' => kind = DOL; 536 } 537 (c, set, op) := esc(s, kind, 0); 538 if(set != nil) 539 prim = newRe(SET, NONE, NONE, set, ar, 0, 0); 540 else if(op == LCP){ 541 if(c > ar.pno) 542 syntax("\num too big"); 543 prim = newRe(LCP, NONE, NONE, nil, ar, 0, 0); 544 ar.rex[prim].ns = ref Nstate(c, c); 545 } 546 else 547 prim = newRe(c, NONE, NONE, nil, ar, 0, 0); 548 } 549 case s.peek() { 550 '*' => kind = CLO; 551 '+' => kind = PCLO; 552 '?' => kind = OPT; 553 '{' => s.next(); 554 (n1, n2) = drange(s); 555 kind = MNCLO; 556 if(s.peek() != '}') 557 syntax("{ with no }"); 558 * => return prim; 559 } 560 s.next(); 561 greedy := 1; 562 if(s.peek() == '?'){ 563 # non-greedy op 564 greedy = 0; 565 s.next(); 566 } 567 prim = newRe(kind, prim, NONE, nil, ar, 0, greedy); 568 if(kind == MNCLO) 569 ns := ar.rex[prim].ns = ref Nstate(n1, n2); 570 return prim; 571} 572 573esc(s: ref ReStr, char: int, inset: int): (int, ref Set, int) 574{ 575 set: ref Set; 576 577 op := 0; 578 if(char == '\\') { 579 char = s.next(); 580 case char { 581 'b' => 582 if(inset) 583 char = '\b'; 584 else 585 char = BEET; 586 'B' => if(inset) 587 syntax("\\B in set"); 588 else 589 char = BEEF; 590 'f' => char = '\u000c'; 591 'n' => char = '\n'; 592 'r' => char = '\r'; 593 't' => char = '\t'; 594 'v' => char = '\v'; 595 '0' to '9' => 596 s.unput(); 597 char = digits(s); 598 if(char == 0) 599 char = '\0'; 600 else if(inset) 601 syntax("\num in set"); 602 else 603 op = LCP; 604 'x' => char = hexdigits(s, 2); 605 'u' => char = hexdigits(s, 4); 606 'c' => char = s.next()%32; 607 'd' or 'D' => 608 set = newset('0', '9'); 609 if(char == 'D') 610 set.neg = 1; 611 's' or 'S' => 612 set = newset(' ', ' '); 613 addsets(set, "\t\v\u000c\u00a0\n\r\u2028\u2029"); 614 if(char == 'S') 615 set.neg = 1; 616 'w' or 'W' => 617 set = newset('0', '9'); 618 addset(set, 'a', 'z'); 619 addset(set, 'A', 'Z'); 620 addset(set, '_', '_'); 621 if(char == 'W') 622 set.neg = 1; 623 * => 624 ; 625 } 626 } 627 if(char == -1){ 628 if(inset) 629 syntax("bad set"); 630 else 631 syntax("bad character"); 632 } 633 return (char, set, op); 634} 635 636isdigit(c: int): int 637{ 638 return c >= '0' && c <= '9'; 639} 640 641islower(c: int): int 642{ 643 return c >= 'a' && c <= 'z'; 644} 645 646isupper(c: int): int 647{ 648 return c >= 'A' && c <= 'Z'; 649} 650 651isalpha(c: int): int 652{ 653 return islower(c) || isupper(c); 654} 655 656hexdigit(c: int): int 657{ 658 if(isdigit(c)) 659 return c-'0'; 660 if('a' <= c && c <= 'f') 661 return c-'a'+10; 662 if('A' <= c && c <= 'F') 663 return c-'A'+10; 664 return -1; 665} 666 667digits(s: ref ReStr): int 668{ 669 n := 0; 670 while(isdigit(s.peek())) 671 n = 10*n + s.next() -'0'; 672 return n; 673} 674 675hexdigits(s: ref ReStr, n: int): int 676{ 677 x := 0; 678 for(i := 0; i < n; i++){ 679 v := hexdigit(s.next()); 680 if(v < 0) 681 return -1; 682 x = 16*x+v; 683 } 684 return x; 685} 686 687drange(s: ref ReStr): (int, int) 688{ 689 n1 := n2 := -1; 690 if(isdigit(s.peek())) 691 n1 = digits(s); 692 if(s.peek() == ','){ 693 s.next(); 694 if(isdigit(s.peek())) 695 n2 = digits(s); 696 else 697 n2 = HUGE; 698 } 699 else 700 n2 = n1; 701 if(n1 < 0 || n1 > n2) 702 syntax("bad number range"); 703 return (n1, n2); 704} 705 706# walk the tree adjusting pointers to refer to 707# next state of the finite state machine 708 709walk(r: refRex, succ: refRex, ar: ref Arena) 710{ 711 if(r==NONE) 712 return; 713 rex := ar.rex[r]; 714 case rex.kind { 715 ALT => walk(rex.left, succ, ar); 716 walk(rex.right, succ, ar); 717 return; 718 CAT => walk(rex.left, rex.right, ar); 719 walk(rex.right, succ, ar); 720 ar.rex[r] = ar.rex[rex.left]; # optimization 721 return; 722 CLO or PCLO => 723 end := newRe(OPT, r, succ, nil, ar, 0, rex.greedy); # here's the circularity 724 walk(rex.left, end, ar); 725 OPT => walk(rex.left, succ, ar); 726 MNCLO => 727 ar.ptr++; 728 walk(rex.left, r, ar); 729 LCP => 730 ar.rex[r].left = newRe(IDLE, NONE, succ, nil, ar, 0, 0); 731 } 732 ar.rex[r].right = succ; 733} 734 735prtree(r: refRex, ar: ref Arena, done: list of int, ind: string): list of int 736{ 737 sys->print("%s", ind); 738 if(r==NIL){ 739 sys->print("NIL\n"); 740 return done; 741 } 742 if(r==NONE){ 743 sys->print("NONE\n"); 744 return done; 745 } 746 printed := 0; 747 for(li := done; li != nil; li = tl li){ 748 if(hd li == r){ 749 printed = 1; 750 break; 751 } 752 } 753 rex := ar.rex[r]; 754 op := ""; 755 z := "Z"; 756 case rex.kind{ 757 ALT => op = "|"; 758 CAT => op = "and"; 759 DOT => op = "."; 760 SET => op = "[]"; 761 HAT => op = "^"; 762 DOL => op = "$"; 763 NUL => op = "NUL"; 764 PCLO => op = "+"; 765 CLO => op = "*"; 766 OPT => op = "?"; 767 LPN => op = "("; 768 RPN => op = ")"; 769 LPN0 => op = "?:"; 770 RPN0 => op = ":?"; 771 LPN1 => op = "?="; 772 RPN1 => op = "=?"; 773 LPN2 => op = "?!"; 774 RPN2 => op = "!?"; 775 BEET => op = "\\b"; 776 BEEF => op = "\\B"; 777 MNCLO => op = "{}"; 778 LCP => op = "n"; 779 IDLE => op = "i"; 780 * => z[0] = rex.kind; op = z; 781 } 782 if(printed){ 783 sys->print("node %d (%d)\n", r, r); 784 return done; 785 } 786 else{ 787 if(rex.ns != nil) 788 sys->print("%s [%d-%d] (%d)\n", op, rex.ns.m, rex.ns.n, r); 789 else 790 sys->print("%s (%d)\n", op, r); 791 done = r :: done; 792 ind += " "; 793 done = prtree(rex.left, ar, done, ind); 794 done = prtree(rex.right, ar, done, ind); 795 return done; 796 } 797} 798 799compile(e: string, flag: int): (Re, string) 800{ 801 if(e == nil) 802 return (nil, "missing expression"); 803 s := ref ReStr(e, 0, len e); 804 ar := ref Arena(array[2*s.n] of Rex, 0, 0, (flag&1)-1); 805 start := ar.start = re(s, ar); 806 if(start==NIL || s.n!=0) 807 syntax("invalid regular expression"); 808 walk(start, NIL, ar); 809 # prtree(start, ar, nil, ""); 810 if(ar.pno < 0) 811 ar.pno = 0; 812 return (ar, nil); 813} 814 815# todo: queue for epsilon and advancing transitions 816 817Num: adt{ 818 ns: ref Nstate; 819 m: int; 820 n: int; 821}; 822Gaz: adt { 823 pno: int; 824 beg: int; 825 end: int; 826}; 827Trace: adt { 828 cre: refRex; # cursor in Re 829 trans: int; # 0 epsilon transition, 1 advancing transition 830 beg: int; # where this trace began; 831 end: int; # where this trace ended if success (-1 by default) 832 gaz: list of Gaz; 833 ns: list of ref Num; 834}; 835Queue: adt { 836 ptr: int; 837 q: array of Trace; 838}; 839 840execute(re: Re, s: string): array of (int, int) 841{ 842 return executese(re, s, (-1,-1), 1, 1, 1, 0); 843} 844 845executese(re: Re, s: string, range: (int, int), bol: int, eol: int, multiline: int, ignorecase: int): array of (int,int) 846{ 847 if(re==nil) 848 return nil; 849 (s0, s1) := range; 850 if(s0 < 0) 851 s0 = 0; 852 if(s1 < 0) 853 s1 = len s; 854 match := 0; 855 todo := ref Queue(0, array[2*re.ptr] of Trace); 856 for(i:=s0; i<=s1; i++) { 857 if(!match) # no leftmost match yet 858 todo.q[todo.ptr++] = Trace(re.start, 0, i, -1, nil, nil); 859 for(k:=0; k<todo.ptr; k++) { 860 q := todo.q[k]; 861 if(q.trans) 862 continue; 863 rex := re.rex[q.cre]; 864 next0 := next1 := next2 := NONE; 865 case rex.kind { 866 NUL => 867 next1 = rex.right; 868 DOT => 869 if(i<len s && !islt(s[i])) 870 next2 = rex.right; 871 HAT => 872 if(i == s0 && bol) 873 next1 = rex.right; 874 else if(multiline && i > 0 && islt(s[i-1])) 875 next1 = rex.right; 876 DOL => 877 if(i == s1 && eol) 878 next1 = rex.right; 879 else if(multiline && i < s1 && islt(s[i])) 880 next1 = rex.right; 881 SET => 882 if(i<len s && member(s[i], rex.set, ignorecase)) 883 next2 = rex.right; 884 CAT or 885 PCLO => 886 next1 = rex.left; 887 ALT or 888 CLO or 889 OPT => 890 if(rex.kind == ALT || rex.greedy){ 891 next0 = rex.left; 892 next1 = rex.right; 893 } 894 else{ 895 next0 = rex.right; 896 next1 = rex.left; 897 } 898 LPN => 899 next1 = rex.right; 900 q.gaz = Gaz(rex.pno,i,-1)::q.gaz; 901 RPN => 902 next1 = rex.right; 903 for(r:=q.gaz; ; r=tl r) { 904 (pno,beg1,end1) := hd r; 905 if(rex.pno==pno && end1==-1) { 906 q.gaz = Gaz(pno,beg1,i)::q.gaz; 907 break; 908 } 909 } 910 LPN0 or RPN0 or RPN1 or RPN2 => 911 next1 = rex.right; 912 LPN1 => 913 (rpn, nxt, nre) := storetree(q.cre, re); 914 m := executese(nre, s, (i, -1), bol, eol, multiline, ignorecase); 915 if(m != nil && m[0].t0 == i){ 916 next1 = nxt; 917 for(j := 1; j < len m; j++) 918 if(m[j].t0 >= 0) 919 q.gaz = Gaz(j, m[j].t0, m[j].t1)::q.gaz; 920 } 921 restoretree(LPN1, rpn, nxt, nre); 922 LPN2 => 923 (rpn, nxt, nre) := storetree(q.cre, re); 924 m := executese(nre, s, (i, -1), bol, eol, multiline, ignorecase); 925 if(m == nil || m[0].t0 != i) 926 next1 = nxt; 927 restoretree(LPN2, rpn, nxt, nre); 928 MNCLO => 929 num: ref Num; 930 931 (q.ns, num) = nextn(q.cre, q.ns, rex.ns.m, rex.ns.n, re); 932 if(num.m > 0) 933 next1 = rex.left; 934 else if(num.n > 0){ 935 if(rex.greedy){ 936 next0 = rex.left; 937 next1 = rex.right; 938 } 939 else{ 940 next0 = rex.right; 941 next1 = rex.left; 942 } 943 } 944 else{ 945 next1 = rex.right; 946 (num.m, num.n) = (-1, -1); 947 } 948 LCP => 949 pno := rex.ns.m; 950 (beg1, end1) := lcpar(q.gaz, pno); 951 l := end1-beg1; 952 if(beg1 < 0) # undefined so succeeds 953 next1 = rex.right; 954 else if(i+l <= s1 && eqstr(s[beg1: end1], s[i: i+l], ignorecase)){ 955 (q.ns, nil) = nextn(rex.left, q.ns, l, l, re); 956 next1 = rex.left; # idle 957 } 958 IDLE => 959 num: ref Num; 960 961 (q.ns, num) = nextn(q.cre, q.ns, -1, -1, re); 962 if(num.m >= 0) 963 next2 = q.cre; 964 else{ 965 next1 = rex.right; 966 (num.m, num.n) = (-1, -1); 967 } 968 BEET => 969 if(iswordc(s, i-1) != iswordc(s, i)) 970 next1 = rex.right; 971 BEEF => 972 if(iswordc(s, i-1) == iswordc(s, i)) 973 next1 = rex.right; 974 * => 975 if(i<len s && (rex.kind==s[i] || (ignorecase && eqcase(rex.kind, s[i])))) 976 next2 = rex.right; 977 } 978 l := k; 979 if(next0 != NONE) { 980 if(next0 != NIL) 981 (k, l) = insert(next0, 0, q.beg, -1, q.gaz, q.ns, todo, k, l); 982 else{ 983 match = 1; 984 (k, l) = insert(NIL, 2, q.beg, i, q.gaz, nil, todo, k, l); 985 } 986 } 987 if(next1 != NONE) { 988 if(next1 != NIL) 989 (k, l) = insert(next1, 0, q.beg, -1, q.gaz, q.ns, todo, k, l); 990 else{ 991 match = 1; 992 (k, l) = insert(NIL, 2, q.beg, i, q.gaz, nil, todo, k, l); 993 } 994 } 995 if(next2 != NONE) { 996 if(next2 != NIL) 997 (k, l) = insert(next2, 1, q.beg, -1, q.gaz, q.ns, todo, k, l); 998 else{ 999 match = 1; 1000 (k, l) = insert(NIL, 2, q.beg, i+1, q.gaz, nil, todo, k, l); 1001 } 1002 } 1003 } 1004 if(!atoe(todo) && match) 1005 break; 1006 } 1007 if(todo.ptr == 0) 1008 return nil; 1009 if(todo.ptr > 1) 1010 rfatal(sys->sprint("todo.ptr = %d", todo.ptr)); 1011 if(todo.q[0].trans != 2) 1012 rfatal(sys->sprint("trans = %d", todo.q[0].trans)); 1013 if(todo.q[0].cre != NIL) 1014 rfatal(sys->sprint("cre = %d", todo.q[0].cre)); 1015 beg := todo.q[0].beg; 1016 end := todo.q[0].end; 1017 gaz := todo.q[0].gaz; 1018 if(beg == -1) 1019 return nil; 1020 result := array[re.pno+1] of { 0 => (beg,end), * => (-1,-1) }; 1021 for( ; gaz!=nil; gaz=tl gaz) { 1022 (pno, beg1, end1) := hd gaz; 1023 (rbeg, nil) := result[pno]; 1024 if(rbeg==-1 && (beg1|end1)!=-1) 1025 result[pno] = (beg1,end1); 1026 } 1027 return result; 1028} 1029 1030better(newbeg, newend, oldbeg, oldend: int): int 1031{ 1032 return oldbeg==-1 || newbeg<oldbeg || 1033 newbeg==oldbeg && newend>oldend; 1034} 1035 1036insert(next: refRex, trans: int, tbeg: int, tend: int, tgaz: list of Gaz, tns: list of ref Num, todo: ref Queue, k: int, l: int): (int, int) 1037{ 1038# sys->print("insert %d eps=%d beg=%d end=%d (k, l) = (%d %d) => ", next, trans, tbeg, tend, k, l); 1039 for(j:=0; j<todo.ptr; j++){ 1040 if(todo.q[j].trans == trans){ 1041 if(todo.q[j].cre == next){ 1042 if(better(todo.q[j].beg, todo.q[j].end, tbeg, tend)) 1043 return (k, l); 1044 else if(better(tbeg, tend, todo.q[j].beg, todo.q[j].end)) 1045 break; 1046 else if(j < k) 1047 return (k, l); 1048 else 1049 break; 1050 } 1051 } 1052 } 1053 if(j < k){ 1054 k--; 1055 l--; 1056 } 1057 if(j < todo.ptr){ 1058 todo.q[j: ] = todo.q[j+1: todo.ptr]; 1059 todo.ptr--; 1060 } 1061 todo.q[l+2: ] = todo.q[l+1: todo.ptr]; 1062 todo.ptr++; 1063 todo.q[l+1] = Trace(next, trans, tbeg, tend, tgaz, tns); 1064# for(j=0; j < todo.ptr; j++) sys->print("%d(%d) ", todo.q[j].cre, todo.q[j].trans); sys->print("\n"); 1065 return (k, l+1); 1066} 1067 1068# remove epsilon transitions and move advancing transitions to epsilon ones 1069atoe(todo: ref Queue): int 1070{ 1071 n := 0; 1072 for(j := 0; j < todo.ptr; j++){ 1073 if(todo.q[j].trans){ 1074 if(todo.q[j].trans == 1){ 1075 todo.q[j].trans = 0; 1076 n++; 1077 } 1078 } 1079 else{ 1080 todo.q[j: ] = todo.q[j+1: todo.ptr]; 1081 todo.ptr--; 1082 j--; 1083 } 1084 } 1085 return n; 1086} 1087 1088nextn(re: int, ln: list of ref Num, m: int, n: int, ar: ref Arena): (list of ref Num, ref Num) 1089{ 1090 num: ref Num; 1091 1092 ns := ar.rex[re].ns; 1093 for(l := ln; l != nil; l = tl l){ 1094 if((hd l).ns == ns){ 1095 num = hd l; 1096 break; 1097 } 1098 } 1099 if(num == nil) 1100 ln = (num = ref Num(ns, -1, -1)) :: ln; 1101 if(num.m == -1 && num.n == -1) 1102 (num.m, num.n) = (m, n); 1103 else 1104 (nil, nil) = (--num.m, --num.n); 1105 return (ln, num); 1106} 1107 1108ASCII : con 128; 1109WORD : con 32; 1110 1111mem(c: int, set: ref Set): int 1112{ 1113 return (set.ascii[c/WORD]>>c%WORD)&1; 1114} 1115 1116member(char: int, set: ref Set, ignorecase: int): int 1117{ 1118 if(set.subset != nil){ 1119 for(l := set.subset; l != nil; l = tl l) 1120 if(member(char, hd l, ignorecase)) 1121 return !set.neg; 1122 } 1123 if(char < 128){ 1124 if(ignorecase) 1125 return (mem(tolower(char), set) || mem(toupper(char), set))^set.neg; 1126 else 1127 return ((set.ascii[char/WORD]>>char%WORD)&1)^set.neg; 1128 } 1129 for(l:=set.unicode; l!=nil; l=tl l) { 1130 (beg, end) := hd l; 1131 if(char>=beg && char<=end) 1132 return !set.neg; 1133 } 1134 return set.neg; 1135} 1136 1137newSet(s: ref ReStr): ref Set 1138{ 1139 op: int; 1140 set0: ref Set; 1141 1142 set := ref Set(0, array[ASCII/WORD] of {* => 0}, nil, nil); 1143 if(s.peek() == '^') { 1144 set.neg = 1; 1145 s.next(); 1146 } 1147 while(s.n > 0) { 1148 char1 := s.next(); 1149 if(char1 == ']') 1150 return set; 1151 (char1, set0, op) = esc(s, char1, 1); 1152 if(set0 != nil) 1153 mergeset(set, set0); 1154 char2 := char1; 1155 if(s.peek() == '-') { 1156 if(set0 != nil) 1157 syntax("set in range"); 1158 s.next(); 1159 char2 = s.next(); 1160 if(char2 == ']') 1161 break; 1162 (char2, set0, op) = esc(s, char2, 1); 1163 if(set0 != nil) 1164 syntax("set in range"); 1165 if(char2 < char1) 1166 break; 1167 } 1168 addset(set, char1, char2); 1169 } 1170 syntax("bad set"); 1171 return nil; 1172} 1173 1174addset(set: ref Set, c1: int, c2: int) 1175{ 1176 for(c := c1; c <= c2; c++){ 1177 if(c < ASCII) 1178 set.ascii[c/WORD] |= 1<<c%WORD; 1179 else{ 1180 set.unicode = (c, c2) :: set.unicode; 1181 break; 1182 } 1183 } 1184} 1185 1186addsets(set: ref Set, s: string) 1187{ 1188 for(i := 0; i < len s; i++) 1189 addset(set, s[i], s[i]); 1190} 1191 1192mergeset(set: ref Set, set0: ref Set) 1193{ 1194 if(!set0.neg){ 1195 for(i := 0; i < ASCII/WORD; i++) 1196 set.ascii[i] |= set0.ascii[i]; 1197 for(l := set0.unicode; l != nil; l = tl l) 1198 set.unicode = hd l :: set.unicode; 1199 } 1200 else 1201 set.subset = set0 :: set.subset; 1202} 1203 1204newset(c1: int, c2: int): ref Set 1205{ 1206 set := ref Set(0, array[ASCII/WORD] of {* => 0}, nil, nil); 1207 addset(set, c1, c2); 1208 return set; 1209} 1210 1211storetree(lpn: int, re: ref Arena): (int, int, ref Arena) 1212{ 1213 rpn: int; 1214 1215 rex := re.rex[lpn]; 1216 k := rex.kind; 1217 l := 1; 1218 for(;;){ 1219 rpn = rex.right; 1220 rex = re.rex[rpn]; 1221 if(rex.kind == k) 1222 l++; 1223 else if(rex.kind == k+1 && --l == 0) 1224 break; 1225 } 1226 re.rex[lpn].kind = LPN; 1227 re.rex[rpn].kind = RPN; 1228 nxt := re.rex[rpn].right; 1229 re.rex[rpn].right = NIL; 1230 nre := ref *re; 1231 nre.start = lpn; 1232 return (rpn, nxt, nre); 1233} 1234 1235restoretree(lop: int, rpn: int, nxt: int, re: ref Arena) 1236{ 1237 lpn := re.start; 1238 re.rex[lpn].kind = lop; 1239 re.rex[rpn].kind = lop+1; 1240 re.rex[rpn].right = nxt; 1241} 1242 1243iswordc(s: string, i: int): int 1244{ 1245 if(i < 0 || i >= len s) 1246 return 0; 1247 c := s[i]; 1248 return isdigit(c) || isalpha(c) || c == '_'; 1249} 1250 1251lcpar(gaz: list of Gaz, pno: int): (int, int) 1252{ 1253 for(r := gaz; r != nil; r = tl r) { 1254 (pno1, beg1, end1) := hd r; 1255 if(pno == pno1) 1256 return (beg1, end1); 1257 } 1258 return (-1, -1); 1259} 1260 1261eqstr(s: string, t: string, ic: int): int 1262{ 1263 if(!ic) 1264 return s == t; 1265 if(len s != len t) 1266 return 0; 1267 for(i := 0; i < len s; i++) 1268 if(!eqcase(s[i], t[i])) 1269 return 0; 1270 return 1; 1271} 1272 1273eqcase(c1: int, c2: int): int 1274{ 1275 return toupper(c1) == toupper(c2); 1276} 1277 1278syntax(s: string) 1279{ 1280 runtime(regex, SyntaxError, s); 1281} 1282 1283rfatal(s: string) 1284{ 1285 runtime(regex, InternalError, s); 1286} 1287