xref: /inferno-os/appl/lib/ecmascript/regexp.b (revision 7ef44d652ae9e5e1f5b3465d73684e4a54de73c0)
1strhas(s: string, c: int): ref Val
2{
3	for(i := 0; i < len s; i++)
4		if(s[i] == c)
5			return true;
6	return false;
7}
8
9rsplit(r: string): (string, string)
10{
11	esc := 0;
12	i := 1;	# skip '/'
13	for(;;){
14		c := r[i++];
15		if(!esc && c == '/')
16			break;
17		esc = !esc && c == '\\';
18	}
19	return (r[1: i-1], r[i: ]);
20}
21
22badflags(f: string): int
23{
24	g := i := m := 0;
25	for(j := 0; j < len f; j++){
26		case(f[j]){
27			'g' =>
28				g++;
29			'i' =>
30				i++;
31			'm' =>
32				m++;
33			* =>
34				return 1;
35		}
36	}
37	return g > 1 || i > 1 || m > 1;
38}
39
40regexpvals(ex: ref Exec, v: ref Val, o: ref Ecmascript->Obj): (string, string, int)
41{
42	if(v != nil){
43		if(v.ty == TRegExp)
44			return (v.rev.p, v.rev.f, v.rev.i);
45		o = v.obj;
46	}
47	p := toString(ex, esget(ex, o, "source", 0));
48	f := "";
49	if(toBoolean(ex, esget(ex, o, "global", 0)) == true)
50		f += "g";
51	if(toBoolean(ex, esget(ex, o, "ignoreCase", 0)) == true)
52		f += "i";
53	if(toBoolean(ex, esget(ex, o, "multiline", 0)) == true)
54		f += "m";
55	i := toInt32(ex, esget(ex, o, "lastIndex", 0));
56	return (p, f, i);
57}
58
59nregexp(ex: ref Exec, nil: ref Ecmascript->Obj, args: array of ref Val): ref Ecmascript->Obj
60{
61	pat := biarg(args, 0);
62	flags := biarg(args, 1);
63	(p, f) := ("", "");
64	if(isregexp(pat)){
65		if(flags == undefined)
66			(p, f, nil) = regexpvals(ex, pat, nil);
67		else
68			runtime(ex, TypeError, "flags defined");
69	}
70	else{
71		if(pat == undefined)
72			p = "";
73		else
74			p = toString(ex, pat);
75		if(flags == undefined)
76			f = "";
77		else
78			f = toString(ex, flags);
79	}
80	o := nobj(ex, nil, array[] of { regexpval(p, f, 0) });
81	if(badflags(f))
82		runtime(ex, SyntaxError, "bad regexp flags");
83	regex = ex;
84	(re, err) := compile(p, 1);
85	if(re == nil || err != nil)
86		runtime(ex, SyntaxError, "bad regexp pattern");
87	o.re = re;
88	return o;
89}
90
91cregexp(ex: ref Exec, f, nil: ref Ecmascript->Obj, args: array of ref Val): ref Val
92{
93	pat := biarg(args, 0);
94	flags := biarg(args, 1);
95	if(isregexp(pat) && flags == undefined)
96		return pat;
97	return objval(nregexp(ex, f, args));
98}
99
100cregexpprotoexec(ex: ref Exec, f, this: ref Ecmascript->Obj, args: array of ref Val): ref Val
101{
102	m: array of (int, int);
103
104	regexpcheck(ex, this, f);
105	s := toString(ex, biarg(args, 0));
106	l := len s;
107	i := toInt32(ex, esget(ex, this, "lastIndex", 0));
108	e := 0;
109	glob := esget(ex, this, "global", 0);
110	multiline := esget(ex, this, "multiline", 0);
111	ignorecase := esget(ex, this, "ignoreCase", 0);
112	if(glob == false)
113		i = 0;
114	for(;;){
115		if(i < 0 || i >= l){
116			esput(ex, this, "lastIndex", numval(real 0), 0);
117			return null;
118		}
119		regex = ex;
120		m = executese(this.re, s, (i, len s), i == 0, 1, multiline == true, ignorecase == true);
121		if(m != nil)
122			break;
123		i++;
124		i = -1;	# no need to loop with executese
125	}
126	(i, e) = m[0];
127	if(glob == true)
128		esput(ex, this, "lastIndex", numval(real e), 0);
129	n := len m;
130	av := array[n] of ref Val;
131	for(j := 0; j < n; j++){
132		(a, b) := m[j];
133		if(a < 0)
134			av[j] = undefined;
135		else
136			av[j] = strval(s[a: b]);
137	}
138	a := narray(ex, nil, av);
139	esput(ex, a, "index", numval(real i), 0);
140	esput(ex, a, "input", strval(s), 0);
141	return objval(a);
142}
143
144cregexpprototest(ex: ref Exec, f, this: ref Ecmascript->Obj, args: array of ref Val): ref Val
145{
146	regexpcheck(ex, this, f);
147	v := cregexpprotoexec(ex, f, this, args);
148	if(!isnull(v))
149		return true;
150	return false;
151}
152
153cregexpprototoString(ex: ref Exec, f, this: ref Ecmascript->Obj, nil: array of ref Val): ref Val
154{
155	regexpcheck(ex, this, f);
156	(p, fl, nil) := regexpvals(ex, nil, this);
157	return strval("/" + p + "/" + fl);
158}
159
160regexpcheck(ex: ref Exec, o: ref Ecmascript->Obj, f: ref Obj)
161{
162	if(f == nil)
163		s := "exec";
164	else
165		s = f.val.str;
166	if(!isregexpobj(o))
167		runtime(ex, TypeError, "RegExp.prototype." + s + " called on non-RegExp object");
168}
169
170cstrprotomatch(ex: ref Exec, nil, this: ref Ecmascript->Obj, args: array of ref Val): ref Val
171{
172	v := biarg(args, 0);
173	if(!isregexp(v))
174		re := nregexp(ex, nil, args);
175	else if(v.ty == TObj)
176		re = v.obj;
177	else
178		re = nobj(ex, nil, args);
179	s := toString(ex, this.val);
180	glob := esget(ex, re, "global", 0);
181	av := array[1] of ref Val;
182	av[0] = strval(s);
183	if(glob == false)
184		return cregexpprotoexec(ex, nil, re, av);
185	li := 0;
186	esput(ex, re, "lastIndex", numval(real li), 0);
187	ms: list of ref Val;
188	for(;;){
189		v = cregexpprotoexec(ex, nil, re, av);
190		if(isnull(v))
191			break;
192		ms = esget(ex, v.obj, "0", 0) :: ms;
193		ni := int toUint32(ex, esget(ex, re, "lastIndex", 0));
194		if(ni == li)
195			esput(ex, re, "lastIndex", numval(real ++li), 0);
196		else
197			li = ni;
198	}
199	n := len ms;
200	av = array[n] of ref Val;
201	for(j := n-1; j >= 0; j--){
202		av[j] = hd ms;
203		ms = tl ms;
204	}
205	return objval(narray(ex, nil, av));
206}
207
208cstrprotoreplace(ex: ref Exec, nil, this: ref Ecmascript->Obj, args: array of ref Val): ref Val
209{
210	re: ref Ecmascript->Obj;
211
212	v := biarg(args, 0);
213	rege := isregexp(v);
214	if(!rege){
215		if(args == nil)
216			re = nregexp(ex, nil, args);
217		else
218			re = nregexp(ex, nil, args[0:1]);
219	}
220	else if(v.ty == TObj)
221		re = v.obj;
222	else
223		re = nobj(ex, nil, args);
224	s := toString(ex, this.val);
225	if(rege)
226		glob := esget(ex, re, "global", 0);
227	else
228		glob = false;
229	av := array[1] of ref Val;
230	av[0] = strval(s);
231	ms: list of ref Val;
232	li := 0;
233	if(glob == true)
234		esput(ex, re, "lastIndex", numval(real li), 0);
235	for(;;){
236		v = cregexpprotoexec(ex, nil, re, av);
237		if(!isnull(v))
238			ms = v :: ms;
239		if(isnull(v) || glob == false)
240			break;
241		ni := int toUint32(ex, esget(ex, re, "lastIndex", 0));
242		if(ni == li)
243			esput(ex, re, "lastIndex", numval(real ++li), 0);
244		else
245			li = ni;
246	}
247	if(ms == nil)
248		return strval(s);
249	ms = rev(ms);
250	if(rege)
251		lcp := int toUint32(ex, esget(ex, (hd ms).obj, "length", 0))-1;
252	else
253		lcp = 0;
254	v = biarg(args, 1);
255	if(isobj(v) && isfuncobj(v.obj)){
256		ns := s;
257		n := len ms;
258		args = array[lcp+3] of ref Val;
259		o := inc := 0;
260		for(i := 0; i < n; i++){
261			a := (hd ms).obj;
262			ms = tl ms;
263			for(j := 0; j <= lcp; j++)
264				args[j] = esget(ex, a, string j, 0);
265			ss := toString(ex, args[0]);
266			o = offset(ss, s, o);
267			args[lcp+1] = numval(real o);
268			args[lcp+2] = strval(s);
269			rs := toString(ex, getValue(ex, escall(ex, v.obj, nil, args, 0)));
270			ns = repl(ns, o+inc, o+inc+len ss, rs);
271			o += len ss;
272			inc += len rs - len ss;
273		}
274		return strval(ns);
275	}
276	else{
277		ps := toString(ex, v);
278		lps := len ps;
279		ns := s;
280		n := len ms;
281		o := inc := 0;
282		for(i := 0; i < n; i++){
283			a := (hd ms).obj;
284			ms = tl ms;
285			ss := toString(ex, esget(ex, a, "0", 0));
286			o = offset(ss, s, o);
287			rs := "";
288			for(j := 0; j < lps; j++){
289				if(ps[j] == '$' && j < lps-1){
290					j++;
291					case(c := ps[j]){
292						'$' =>
293							rs += "$";
294						'&' =>
295							rs += ss;
296						'`' =>
297							rs += s[0: o];
298						''' =>
299							rs += s[o+len ss: ];
300						'0' to '9' =>
301							if(j < lps-1 && isdigit(ps[j+1]))
302								c = 10*(c-'0')+ps[++j]-'0';
303							else
304								c = c-'0';
305							if(c >= 1 && c <= lcp)
306								rs += toString(ex, esget(ex, a, string c, 0));
307					}
308				}
309				else
310					rs += ps[j: j+1];
311			}
312			ns = repl(ns, o+inc, o+inc+len ss, rs);
313			o += len ss;
314			inc += len rs - len ss;
315		}
316		return strval(ns);
317	}
318}
319
320cstrprotosearch(ex: ref Exec, nil, this: ref Ecmascript->Obj, args: array of ref Val): ref Val
321{
322	v := biarg(args, 0);
323	if(!isregexp(v))
324		re := nregexp(ex, nil, args);
325	else if(v.ty == TObj)
326		re = v.obj;
327	else
328		re = nobj(ex, nil, args);
329	s := toString(ex, this.val);
330	glob := esget(ex, re, "global", 0);
331	esput(ex, re, "global", false, 0);
332	av := array[1] of ref Val;
333	av[0] = strval(s);
334	v = cregexpprotoexec(ex, nil, re, av);
335	if(isnull(v))
336		r := -1;
337	else{
338		ss := toString(ex, esget(ex, v.obj, "0", 0));
339		r = offset(ss, s, 0);
340	}
341	esput(ex, re, "global", glob, 0);
342	return numval(real r);
343}
344
345offset(ss: string, s: string, m: int): int
346{
347	nn := len ss;
348	n := len s;
349	for(i := m; i <= n-nn; i++){
350		if(s[i: i+nn] == ss)
351			return i;
352	}
353	return -1;
354}
355
356repl(s: string, a: int, b: int, ns: string): string
357{
358	return s[0: a] + ns + s[b: ];
359}
360
361rev(ls: list of ref Val): list of ref Val
362{
363	ns: list of ref Val;
364
365	for( ; ls != nil; ls = tl ls)
366		ns = hd ls :: ns;
367	return ns;
368}
369
370#########################################################################
371# regex.b originally
372
373# normally imported identifiers
374
375# internal identifiers, not normally imported
376
377ALT, CAT, DOT, SET, HAT, DOL, NUL, PCLO, CLO, OPT, LPN, RPN, LPN0, RPN0, LPN1, RPN1, LPN2, RPN2, BEET, BEEF, MNCLO, LCP, IDLE: con (1<<16)+iota;
378
379# syntax
380
381# RE	ALT		regular expression
382#	NUL
383# ALT	CAT		alternation
384# 	CAT | ALT
385#
386# CAT	DUP		catenation
387# 	DUP CAT
388#
389# DUP	PRIM		possibly duplicated primary
390# 	PCLO
391# 	CLO
392# 	OPT
393#
394# PCLO	PRIM +		1 or more
395# CLO	PRIM *		0 or more
396# OPT	PRIM ?		0 or 1
397#
398# PRIM	( RE )
399#	()
400# 	DOT		any character
401# 	CHAR		a single character
402#	ESC		escape sequence
403# 	[ SET ]		character set
404# 	NUL		null string
405# 	HAT		beginning of string
406# 	DOL		end of string
407#
408
409regex: ref Exec;
410
411NIL : con -1;		# a refRex constant
412NONE: con -2;		# ditto, for an un-set value
413BAD: con 1<<16;		# a non-character
414HUGE: con (1<<31) - 1;
415
416# the data structures of re.m would like to be ref-linked, but are
417# circular (see fn walk), thus instead of pointers we use indexes
418# into an array (arena) of nodes of the syntax tree of a regular expression.
419# from a storage-allocation standpoint, this replaces many small
420# allocations of one size with one big one of variable size.
421
422ReStr: adt {
423	s : string;
424	i : int;	# cursor postion
425	n : int;	# number of chars left; -1 on error
426	peek : fn(s: self ref ReStr): int;
427	next : fn(s: self ref ReStr): int;
428	unput: fn(s: self ref ReStr);
429};
430
431ReStr.peek(s: self ref ReStr): int
432{
433	if(s.n <= 0)
434		return BAD;
435	return s.s[s.i];
436}
437
438ReStr.next(s: self ref ReStr): int
439{
440	if(s.n <= 0)
441		syntax("bad regular expression");
442	s.n--;
443	return s.s[s.i++];
444}
445
446ReStr.unput(s: self ref ReStr)
447{
448	s.n++;
449	s.i--;
450}
451
452newRe(kind: int, left, right: refRex, set: ref Set, ar: ref Arena, pno: int, greedy: int): refRex
453{
454	ar.rex[ar.ptr] = Rex(kind, left, right, set, pno, greedy, nil);
455	return ar.ptr++;
456}
457
458# parse a regex by recursive descent to get a syntax tree
459
460re(s: ref ReStr, ar: ref Arena): refRex
461{
462	left := cat(s, ar);
463	if(left==NIL || s.peek()!='|')
464		return left;
465	s.next();
466	right := re(s, ar);
467	if(right == NIL)
468		return NIL;
469	return newRe(ALT, left, right, nil, ar, 0, 0);
470}
471
472cat(s: ref ReStr, ar: ref Arena): refRex
473{
474	left := dup(s, ar);
475	if(left == NIL)
476		return left;
477	right := cat(s, ar);
478	if(right == NIL)
479		return left;
480	return newRe(CAT, left, right, nil, ar, 0, 0);
481}
482
483dup(s: ref ReStr, ar: ref Arena): refRex
484{
485	n1, n2: int;
486
487	case s.peek() {
488	BAD or ')' or ']' or '|' or '?' or '*' or '+' =>
489		return NIL;
490	}
491	prim: refRex;
492	case kind:=s.next() {
493	'(' =>	if(ar.pno < 0) {
494			if(s.peek() == ')') {
495				s.next();
496				prim = newRe(NUL, NONE, NONE, nil, ar, 0, 0);
497			} else {
498				prim = re(s, ar);
499				if(prim==NIL || s.next()!=')')
500					syntax("( with no )");
501			}
502		} else {
503			pno := ++ar.pno;
504			lp := newRe(LPN, NONE, NONE, nil, ar, pno, 0);
505			rp := newRe(RPN, NONE, NONE, nil, ar, pno, 0);
506			if(s.peek() == ')') {
507				s.next();
508				prim = newRe(CAT, lp, rp, nil, ar, 0, 0);
509			} else {
510				if(s.peek() == '?'){
511					s.next();
512					case s.next(){
513						':' => ar.rex[lp].kind = LPN0;
514							ar.rex[rp].kind = RPN0;
515						'=' => ar.rex[lp].kind = LPN1;
516							ar.rex[rp].kind = RPN1;
517						'!' => ar.rex[lp].kind = LPN2;
518							ar.rex[rp].kind = RPN2;
519						* => syntax("bad char after ?");
520					}
521				}
522				prim = re(s, ar);
523				if(prim==NIL || s.next()!=')')
524					syntax("( with no )");
525				else {
526					prim = newRe(CAT, prim, rp, nil, ar, 0, 0);
527					prim = newRe(CAT, lp, prim, nil, ar, 0, 0);
528				}
529			}
530		}
531	'[' =>	prim = newRe(SET, NONE, NONE, newSet(s), ar, 0, 0);
532	* =>	case kind {
533		'.' =>		kind = DOT;
534		'^' =>	kind = HAT;
535		'$' =>	kind = DOL;
536		}
537		(c, set, op) := esc(s, kind, 0);
538		if(set != nil)
539			prim = newRe(SET, NONE, NONE, set, ar, 0, 0);
540		else if(op == LCP){
541			if(c > ar.pno)
542				syntax("\num too big");
543			prim = newRe(LCP, NONE, NONE, nil, ar, 0, 0);
544			ar.rex[prim].ns = ref Nstate(c, c);
545		}
546		else
547			prim = newRe(c, NONE, NONE, nil, ar, 0, 0);
548	}
549	case s.peek() {
550	'*' =>	kind = CLO;
551	'+' =>	kind = PCLO;
552	'?' =>	kind = OPT;
553	'{' =>	s.next();
554		(n1, n2) = drange(s);
555		kind = MNCLO;
556		if(s.peek() != '}')
557			syntax("{ with no }");
558	* =>	return prim;
559	}
560	s.next();
561	greedy := 1;
562	if(s.peek() == '?'){
563		# non-greedy op
564		greedy = 0;
565		s.next();
566	}
567	prim = newRe(kind, prim, NONE, nil, ar, 0, greedy);
568	if(kind == MNCLO)
569		ns := ar.rex[prim].ns = ref Nstate(n1, n2);
570	return prim;
571}
572
573esc(s: ref ReStr, char: int, inset: int): (int, ref Set, int)
574{
575	set: ref Set;
576
577	op := 0;
578	if(char == '\\') {
579		char = s.next();
580		case char {
581		'b' =>
582				if(inset)
583					char = '\b';
584				else
585					char = BEET;
586		'B' =>	if(inset)
587					syntax("\\B in set");
588				else
589					char = BEEF;
590		'f' =>		char = '\u000c';
591		'n' =>	char = '\n';
592		'r' =>		char = '\r';
593		't' =>		char = '\t';
594		'v' =>	char = '\v';
595		'0' to '9' =>
596				s.unput();
597				char = digits(s);
598				if(char == 0)
599					char = '\0';
600				else if(inset)
601					syntax("\num in set");
602				else
603					op = LCP;
604		'x' =>	char = hexdigits(s, 2);
605		'u' =>	char = hexdigits(s, 4);
606		'c' =>	char = s.next()%32;
607		'd' or 'D' =>
608				set = newset('0', '9');
609				if(char == 'D')
610					set.neg = 1;
611		's' or 'S' =>
612				set = newset(' ', ' ');
613				addsets(set, "\t\v\u000c\u00a0\n\r\u2028\u2029");
614				if(char == 'S')
615					set.neg = 1;
616		'w' or 'W' =>
617				set = newset('0', '9');
618				addset(set, 'a', 'z');
619				addset(set, 'A', 'Z');
620				addset(set, '_', '_');
621				if(char == 'W')
622					set.neg = 1;
623		* =>
624				;
625		}
626	}
627	if(char == -1){
628		if(inset)
629			syntax("bad set");
630		else
631			syntax("bad character");
632	}
633	return (char, set, op);
634}
635
636isdigit(c: int): int
637{
638	return c >= '0' && c <= '9';
639}
640
641islower(c: int): int
642{
643	return c >= 'a' && c <= 'z';
644}
645
646isupper(c: int): int
647{
648	return c >= 'A' && c <= 'Z';
649}
650
651isalpha(c: int): int
652{
653	return islower(c) || isupper(c);
654}
655
656hexdigit(c: int): int
657{
658	if(isdigit(c))
659		return c-'0';
660	if('a' <= c && c <= 'f')
661		return c-'a'+10;
662	if('A' <= c && c <= 'F')
663		return c-'A'+10;
664	return -1;
665}
666
667digits(s: ref ReStr): int
668{
669	n := 0;
670	while(isdigit(s.peek()))
671		n = 10*n + s.next() -'0';
672	return n;
673}
674
675hexdigits(s: ref ReStr, n: int): int
676{
677	x := 0;
678	for(i := 0; i < n; i++){
679		v := hexdigit(s.next());
680		if(v < 0)
681			return -1;
682		x = 16*x+v;
683	}
684	return x;
685}
686
687drange(s: ref ReStr): (int, int)
688{
689	n1 := n2 := -1;
690	if(isdigit(s.peek()))
691		n1 = digits(s);
692	if(s.peek() == ','){
693		s.next();
694		if(isdigit(s.peek()))
695			n2 = digits(s);
696		else
697			n2 = HUGE;
698	}
699	else
700		n2 = n1;
701	if(n1 < 0 || n1 > n2)
702		syntax("bad number range");
703	return (n1, n2);
704}
705
706# walk the tree adjusting pointers to refer to
707# next state of the finite state machine
708
709walk(r: refRex, succ: refRex, ar: ref Arena)
710{
711	if(r==NONE)
712		return;
713	rex := ar.rex[r];
714	case rex.kind {
715	ALT =>	walk(rex.left, succ, ar);
716		walk(rex.right, succ, ar);
717		return;
718	CAT =>	walk(rex.left, rex.right, ar);
719		walk(rex.right, succ, ar);
720		ar.rex[r] = ar.rex[rex.left];	# optimization
721		return;
722	CLO or PCLO =>
723		end := newRe(OPT, r, succ, nil, ar, 0, rex.greedy); # here's the circularity
724		walk(rex.left, end, ar);
725	OPT =>	walk(rex.left, succ, ar);
726	MNCLO =>
727		ar.ptr++;
728		walk(rex.left, r, ar);
729	LCP =>
730		ar.rex[r].left = newRe(IDLE, NONE, succ, nil, ar, 0, 0);
731	}
732	ar.rex[r].right = succ;
733}
734
735prtree(r: refRex, ar: ref Arena, done: list of int, ind: string): list of int
736{
737	sys->print("%s", ind);
738	if(r==NIL){
739		sys->print("NIL\n");
740		return done;
741	}
742	if(r==NONE){
743		sys->print("NONE\n");
744		return done;
745	}
746	printed := 0;
747	for(li := done; li != nil; li = tl li){
748		if(hd li == r){
749			printed = 1;
750			break;
751		}
752	}
753	rex := ar.rex[r];
754	op := "";
755	z := "Z";
756	case rex.kind{
757		ALT => op = "|";
758		CAT => op = "and";
759		DOT => op = ".";
760		SET => op = "[]";
761		HAT => op = "^";
762		DOL => op = "$";
763		NUL => op = "NUL";
764		PCLO => op = "+";
765		CLO => op = "*";
766		OPT => op = "?";
767		LPN => op = "(";
768		RPN => op = ")";
769		LPN0 => op = "?:";
770		RPN0 => op = ":?";
771		LPN1 => op = "?=";
772		RPN1 => op = "=?";
773		LPN2 => op = "?!";
774		RPN2 => op = "!?";
775		BEET => op = "\\b";
776		BEEF => op = "\\B";
777		MNCLO => op = "{}";
778		LCP => op = "n";
779		IDLE => op = "i";
780		* => z[0] = rex.kind; op = z;
781	}
782	if(printed){
783		sys->print("node %d (%d)\n", r, r);
784		return done;
785	}
786	else{
787		if(rex.ns != nil)
788			sys->print("%s [%d-%d] (%d)\n", op, rex.ns.m, rex.ns.n, r);
789		else
790			sys->print("%s (%d)\n", op, r);
791		done = r :: done;
792		ind += "  ";
793		done = prtree(rex.left, ar, done, ind);
794		done  = prtree(rex.right, ar, done, ind);
795		return done;
796	}
797}
798
799compile(e: string, flag: int): (Re, string)
800{
801	if(e == nil)
802		return (nil, "missing expression");
803	s := ref ReStr(e, 0, len e);
804	ar := ref Arena(array[2*s.n] of Rex, 0, 0, (flag&1)-1);
805	start := ar.start = re(s, ar);
806	if(start==NIL || s.n!=0)
807		syntax("invalid regular expression");
808	walk(start, NIL, ar);
809	# prtree(start, ar, nil, "");
810	if(ar.pno < 0)
811		ar.pno = 0;
812	return (ar, nil);
813}
814
815# todo: queue for epsilon and advancing transitions
816
817Num: adt{
818	ns: ref Nstate;
819	m: int;
820	n: int;
821};
822Gaz: adt {
823	pno: int;
824	beg: int;
825	end: int;
826};
827Trace: adt {
828	cre: refRex;		# cursor in Re
829	trans: int;		# 0 epsilon transition, 1 advancing transition
830	beg: int;		# where this trace began;
831	end: int;		# where this trace ended if success (-1 by default)
832	gaz: list of Gaz;
833	ns: list of ref Num;
834};
835Queue: adt {
836	ptr: int;
837	q: array of Trace;
838};
839
840execute(re: Re, s: string): array of (int, int)
841{
842	return executese(re, s, (-1,-1), 1, 1, 1, 0);
843}
844
845executese(re: Re, s: string, range: (int, int), bol: int, eol: int, multiline: int, ignorecase: int): array of (int,int)
846{
847	if(re==nil)
848		return nil;
849	(s0, s1) := range;
850	if(s0 < 0)
851		s0 = 0;
852	if(s1 < 0)
853		s1 = len s;
854	match := 0;
855	todo := ref Queue(0, array[2*re.ptr] of Trace);
856	for(i:=s0; i<=s1; i++) {
857		if(!match)		# no leftmost match yet
858			todo.q[todo.ptr++] = Trace(re.start, 0, i, -1, nil, nil);
859		for(k:=0; k<todo.ptr; k++) {
860			q := todo.q[k];
861			if(q.trans)
862				continue;
863			rex := re.rex[q.cre];
864			next0 := next1 := next2 := NONE;
865			case rex.kind {
866			NUL =>
867				next1 = rex.right;
868			DOT =>
869				if(i<len s && !islt(s[i]))
870					next2 = rex.right;
871			HAT =>
872				if(i == s0 && bol)
873					next1 = rex.right;
874				else if(multiline && i > 0 && islt(s[i-1]))
875					next1 = rex.right;
876			DOL =>
877				if(i == s1 && eol)
878					next1 = rex.right;
879				else if(multiline && i < s1 && islt(s[i]))
880					next1 = rex.right;
881			SET =>
882				if(i<len s && member(s[i], rex.set, ignorecase))
883					next2 = rex.right;
884			CAT or
885			PCLO =>
886				next1 = rex.left;
887			ALT or
888			CLO or
889			OPT =>
890				if(rex.kind == ALT || rex.greedy){
891					next0 = rex.left;
892					next1 = rex.right;
893				}
894				else{
895					next0 = rex.right;
896					next1 = rex.left;
897				}
898			LPN =>
899				next1 = rex.right;
900				q.gaz = Gaz(rex.pno,i,-1)::q.gaz;
901			RPN =>
902				next1 = rex.right;
903				for(r:=q.gaz; ; r=tl r) {
904					(pno,beg1,end1) := hd r;
905					if(rex.pno==pno && end1==-1) {
906						q.gaz = Gaz(pno,beg1,i)::q.gaz;
907						break;
908					}
909				}
910			LPN0 or RPN0 or RPN1 or RPN2 =>
911				next1 = rex.right;
912			LPN1 =>
913				(rpn, nxt, nre) := storetree(q.cre, re);
914				m := executese(nre, s, (i, -1), bol, eol, multiline, ignorecase);
915				if(m != nil && m[0].t0 == i){
916					next1 = nxt;
917					for(j := 1; j < len m; j++)
918						if(m[j].t0 >= 0)
919							q.gaz = Gaz(j, m[j].t0, m[j].t1)::q.gaz;
920				}
921				restoretree(LPN1, rpn, nxt, nre);
922			LPN2 =>
923				(rpn, nxt, nre) := storetree(q.cre, re);
924				m := executese(nre, s, (i, -1), bol, eol, multiline, ignorecase);
925				if(m == nil || m[0].t0 != i)
926					next1 = nxt;
927				restoretree(LPN2, rpn, nxt, nre);
928			MNCLO =>
929				num: ref Num;
930
931				(q.ns, num) = nextn(q.cre, q.ns, rex.ns.m, rex.ns.n, re);
932				if(num.m > 0)
933					next1 = rex.left;
934				else if(num.n > 0){
935					if(rex.greedy){
936						next0 = rex.left;
937						next1 = rex.right;
938					}
939					else{
940						next0 = rex.right;
941						next1 = rex.left;
942					}
943				}
944				else{
945					next1 = rex.right;
946					(num.m, num.n) = (-1, -1);
947				}
948			LCP =>
949				pno := rex.ns.m;
950				(beg1, end1) := lcpar(q.gaz, pno);
951				l := end1-beg1;
952				if(beg1 < 0)	# undefined so succeeds
953					next1 = rex.right;
954				else if(i+l <= s1 && eqstr(s[beg1: end1], s[i: i+l], ignorecase)){
955					(q.ns, nil) = nextn(rex.left, q.ns, l, l, re);
956					next1 = rex.left;	# idle
957				}
958			IDLE =>
959				num: ref Num;
960
961				(q.ns, num) = nextn(q.cre, q.ns, -1, -1, re);
962				if(num.m >= 0)
963					next2 = q.cre;
964				else{
965					next1 = rex.right;
966					(num.m, num.n) = (-1, -1);
967				}
968			BEET =>
969				if(iswordc(s, i-1) != iswordc(s, i))
970					next1 = rex.right;
971			BEEF =>
972				if(iswordc(s, i-1) == iswordc(s, i))
973					next1 = rex.right;
974			* =>
975				if(i<len s && (rex.kind==s[i] || (ignorecase && eqcase(rex.kind, s[i]))))
976					next2 = rex.right;
977			}
978			l := k;
979			if(next0 != NONE) {
980				if(next0 != NIL)
981					(k, l) = insert(next0, 0, q.beg, -1, q.gaz, q.ns, todo, k, l);
982				else{
983					match = 1;
984					(k, l) = insert(NIL, 2, q.beg, i, q.gaz, nil, todo, k, l);
985				}
986			}
987			if(next1 != NONE) {
988				if(next1 != NIL)
989					(k, l) = insert(next1, 0, q.beg, -1, q.gaz, q.ns, todo, k, l);
990				else{
991					match = 1;
992					(k, l) = insert(NIL, 2, q.beg, i, q.gaz, nil, todo, k, l);
993				}
994			}
995			if(next2 != NONE) {
996				if(next2 != NIL)
997					(k, l) = insert(next2, 1, q.beg, -1, q.gaz, q.ns, todo, k, l);
998				else{
999					match = 1;
1000					(k, l) = insert(NIL, 2, q.beg, i+1, q.gaz, nil, todo, k, l);
1001				}
1002			}
1003		}
1004		if(!atoe(todo) && match)
1005			break;
1006	}
1007	if(todo.ptr == 0)
1008		return nil;
1009	if(todo.ptr > 1)
1010		rfatal(sys->sprint("todo.ptr = %d", todo.ptr));
1011	if(todo.q[0].trans != 2)
1012		rfatal(sys->sprint("trans = %d", todo.q[0].trans));
1013	if(todo.q[0].cre != NIL)
1014		rfatal(sys->sprint("cre = %d", todo.q[0].cre));
1015	beg := todo.q[0].beg;
1016	end := todo.q[0].end;
1017	gaz := todo.q[0].gaz;
1018	if(beg == -1)
1019		return nil;
1020	result := array[re.pno+1] of { 0 => (beg,end), * => (-1,-1) };
1021	for( ; gaz!=nil; gaz=tl gaz) {
1022		(pno, beg1, end1) := hd gaz;
1023		(rbeg, nil) := result[pno];
1024		if(rbeg==-1 && (beg1|end1)!=-1)
1025			result[pno] = (beg1,end1);
1026	}
1027	return result;
1028}
1029
1030better(newbeg, newend, oldbeg, oldend: int): int
1031{
1032	return oldbeg==-1 || newbeg<oldbeg ||
1033	       newbeg==oldbeg && newend>oldend;
1034}
1035
1036insert(next: refRex, trans: int, tbeg: int, tend: int, tgaz: list of Gaz, tns: list of ref Num, todo: ref Queue, k: int, l: int): (int, int)
1037{
1038# sys->print("insert %d eps=%d beg=%d end=%d (k, l) = (%d %d) => ", next, trans, tbeg, tend, k, l);
1039	for(j:=0; j<todo.ptr; j++){
1040		if(todo.q[j].trans == trans){
1041			if(todo.q[j].cre == next){
1042				if(better(todo.q[j].beg, todo.q[j].end, tbeg, tend))
1043					return (k, l);
1044				else if(better(tbeg, tend, todo.q[j].beg, todo.q[j].end))
1045					break;
1046				else if(j < k)
1047					return (k, l);
1048				else
1049					break;
1050			}
1051		}
1052	}
1053	if(j < k){
1054		k--;
1055		l--;
1056	}
1057	if(j < todo.ptr){
1058		todo.q[j: ] = todo.q[j+1: todo.ptr];
1059		todo.ptr--;
1060	}
1061	todo.q[l+2: ] = todo.q[l+1: todo.ptr];
1062	todo.ptr++;
1063	todo.q[l+1] = Trace(next, trans, tbeg, tend, tgaz, tns);
1064# for(j=0; j < todo.ptr; j++) sys->print("%d(%d) ", todo.q[j].cre, todo.q[j].trans); sys->print("\n");
1065	return (k, l+1);
1066}
1067
1068# remove epsilon transitions and move advancing transitions to epsilon ones
1069atoe(todo: ref Queue): int
1070{
1071	n := 0;
1072	for(j := 0; j < todo.ptr; j++){
1073		if(todo.q[j].trans){
1074			if(todo.q[j].trans == 1){
1075				todo.q[j].trans = 0;
1076				n++;
1077			}
1078		}
1079		else{
1080			todo.q[j: ] = todo.q[j+1: todo.ptr];
1081			todo.ptr--;
1082			j--;
1083		}
1084	}
1085	return n;
1086}
1087
1088nextn(re: int, ln: list of ref Num, m: int, n: int, ar: ref Arena): (list of ref Num, ref Num)
1089{
1090	num: ref Num;
1091
1092	ns := ar.rex[re].ns;
1093	for(l := ln; l != nil; l = tl l){
1094		if((hd l).ns == ns){
1095			num = hd l;
1096			break;
1097		}
1098	}
1099	if(num == nil)
1100		ln = (num = ref Num(ns, -1, -1)) :: ln;
1101	if(num.m == -1 && num.n == -1)
1102		(num.m, num.n) = (m, n);
1103	else
1104		(nil, nil) = (--num.m, --num.n);
1105	return (ln, num);
1106}
1107
1108ASCII : con 128;
1109WORD : con 32;
1110
1111mem(c: int, set: ref Set): int
1112{
1113	return (set.ascii[c/WORD]>>c%WORD)&1;
1114}
1115
1116member(char: int, set: ref Set, ignorecase: int): int
1117{
1118	if(set.subset != nil){
1119		for(l := set.subset; l != nil; l = tl l)
1120			if(member(char, hd l, ignorecase))
1121				return !set.neg;
1122	}
1123	if(char < 128){
1124		if(ignorecase)
1125			return (mem(tolower(char), set) || mem(toupper(char), set))^set.neg;
1126		else
1127			return ((set.ascii[char/WORD]>>char%WORD)&1)^set.neg;
1128	}
1129	for(l:=set.unicode; l!=nil; l=tl l) {
1130		(beg, end) := hd l;
1131		if(char>=beg && char<=end)
1132			return !set.neg;
1133	}
1134	return set.neg;
1135}
1136
1137newSet(s: ref ReStr): ref Set
1138{
1139	op: int;
1140	set0: ref Set;
1141
1142	set := ref Set(0, array[ASCII/WORD] of {* => 0}, nil, nil);
1143	if(s.peek() == '^') {
1144		set.neg = 1;
1145		s.next();
1146	}
1147	while(s.n > 0) {
1148		char1 := s.next();
1149		if(char1 == ']')
1150			return set;
1151		(char1, set0, op) = esc(s, char1, 1);
1152		if(set0 != nil)
1153			mergeset(set, set0);
1154		char2 := char1;
1155		if(s.peek() == '-') {
1156			if(set0 != nil)
1157				syntax("set in range");
1158			s.next();
1159			char2 = s.next();
1160			if(char2 == ']')
1161				break;
1162			(char2, set0, op) = esc(s, char2, 1);
1163			if(set0 != nil)
1164				syntax("set in range");
1165			if(char2 < char1)
1166				break;
1167		}
1168		addset(set, char1, char2);
1169	}
1170	syntax("bad set");
1171	return nil;
1172}
1173
1174addset(set: ref Set, c1: int, c2: int)
1175{
1176	for(c := c1; c <= c2; c++){
1177		if(c < ASCII)
1178			set.ascii[c/WORD] |= 1<<c%WORD;
1179		else{
1180			set.unicode = (c, c2) :: set.unicode;
1181			break;
1182		}
1183	}
1184}
1185
1186addsets(set: ref Set, s: string)
1187{
1188	for(i := 0; i < len s; i++)
1189		addset(set, s[i], s[i]);
1190}
1191
1192mergeset(set: ref Set, set0: ref Set)
1193{
1194	if(!set0.neg){
1195		for(i := 0; i < ASCII/WORD; i++)
1196			set.ascii[i] |= set0.ascii[i];
1197		for(l := set0.unicode; l != nil; l = tl l)
1198			set.unicode = hd l :: set.unicode;
1199	}
1200	else
1201		set.subset = set0 :: set.subset;
1202}
1203
1204newset(c1: int, c2: int): ref Set
1205{
1206	set := ref Set(0, array[ASCII/WORD] of {* => 0}, nil, nil);
1207	addset(set, c1, c2);
1208	return set;
1209}
1210
1211storetree(lpn: int, re: ref Arena): (int, int, ref Arena)
1212{
1213	rpn: int;
1214
1215	rex := re.rex[lpn];
1216	k := rex.kind;
1217	l := 1;
1218	for(;;){
1219		rpn = rex.right;
1220		rex = re.rex[rpn];
1221		if(rex.kind == k)
1222			l++;
1223		else if(rex.kind == k+1 && --l == 0)
1224			break;
1225	}
1226	re.rex[lpn].kind = LPN;
1227	re.rex[rpn].kind = RPN;
1228	nxt := re.rex[rpn].right;
1229	re.rex[rpn].right = NIL;
1230	nre := ref *re;
1231	nre.start = lpn;
1232	return (rpn, nxt, nre);
1233}
1234
1235restoretree(lop: int, rpn: int, nxt: int, re: ref Arena)
1236{
1237	lpn := re.start;
1238	re.rex[lpn].kind = lop;
1239	re.rex[rpn].kind = lop+1;
1240	re.rex[rpn].right = nxt;
1241}
1242
1243iswordc(s: string, i: int): int
1244{
1245	if(i < 0 || i >= len s)
1246		return 0;
1247	c := s[i];
1248	return isdigit(c) || isalpha(c) || c == '_';
1249}
1250
1251lcpar(gaz: list of Gaz, pno: int): (int, int)
1252{
1253	for(r := gaz; r != nil; r = tl r) {
1254		(pno1, beg1, end1) := hd r;
1255		if(pno == pno1)
1256			return (beg1, end1);
1257	}
1258	return (-1, -1);
1259}
1260
1261eqstr(s: string, t: string, ic: int): int
1262{
1263	if(!ic)
1264		return s == t;
1265	if(len s != len t)
1266		return 0;
1267	for(i := 0; i < len s; i++)
1268		if(!eqcase(s[i], t[i]))
1269			return 0;
1270	return 1;
1271}
1272
1273eqcase(c1: int, c2: int): int
1274{
1275	return toupper(c1) == toupper(c2);
1276}
1277
1278syntax(s: string)
1279{
1280	runtime(regex, SyntaxError, s);
1281}
1282
1283rfatal(s: string)
1284{
1285	runtime(regex, InternalError, s);
1286}
1287