xref: /inferno-os/appl/lib/regex.b (revision d0e1d143ef6f03c75c008c7ec648859dd260cbab)
1implement Regex;
2
3include "regex.m";
4
5# syntax
6
7# RE	ALT		regular expression
8#	NUL
9# ALT	CAT		alternation
10# 	CAT | ALT
11#
12# CAT	DUP		catenation
13# 	DUP CAT
14#
15# DUP	PRIM		possibly duplicated primary
16# 	PCLO
17# 	CLO
18# 	OPT
19#
20# PCLO	PRIM +		1 or more
21# CLO	PRIM *		0 or more
22# OPT	PRIM ?		0 or 1
23#
24# PRIM	( RE )
25#	()
26# 	DOT		any character
27# 	CHAR		a single character
28#	ESC		escape sequence
29# 	[ SET ]		character set
30# 	NUL		null string
31# 	HAT		beginning of string
32# 	DOL		end of string
33#
34
35NIL : con -1;		# a refRex constant
36NONE: con -2;		# ditto, for an un-set value
37BAD: con 1<<16;		# a non-character
38HUGE: con (1<<31) - 1;
39
40# the data structures of re.m would like to be ref-linked, but are
41# circular (see fn walk), thus instead of pointers we use indexes
42# into an array (arena) of nodes of the syntax tree of a regular expression.
43# from a storage-allocation standpoint, this replaces many small
44# allocations of one size with one big one of variable size.
45
46ReStr: adt {
47	s : string;
48	i : int;	# cursor postion
49	n : int;	# number of chars left; -1 on error
50	peek : fn(s: self ref ReStr): int;
51	next : fn(s: self ref ReStr): int;
52};
53
54ReStr.peek(s: self ref ReStr): int
55{
56	if(s.n <= 0)
57		return BAD;
58	return s.s[s.i];
59}
60
61ReStr.next(s: self ref ReStr): int
62{
63	if(s.n <= 0)
64		return BAD;
65	s.n--;
66	return s.s[s.i++];
67}
68
69newRe(kind: int, left, right: refRex, set: ref Set, ar: ref Arena, pno: int): refRex
70{
71	ar.rex[ar.ptr] = Rex(kind, left, right, set, pno);
72	return ar.ptr++;
73}
74
75# parse a regex by recursive descent to get a syntax tree
76
77re(s: ref ReStr, ar: ref Arena): refRex
78{
79	left := cat(s, ar);
80	if(left==NIL || s.peek()!='|')
81		return left;
82	s.next();
83	right := re(s, ar);
84	if(right == NIL)
85		return NIL;
86	return newRe(ALT, left, right, nil, ar, 0);
87}
88
89cat(s: ref ReStr, ar: ref Arena): refRex
90{
91	left := dup(s, ar);
92	if(left == NIL)
93		return left;
94	right := cat(s, ar);
95	if(right == NIL)
96		return left;
97	return newRe(CAT, left, right, nil, ar, 0);
98}
99
100dup(s: ref ReStr, ar: ref Arena): refRex
101{
102	case s.peek() {
103	BAD or ')' or ']' or '|' or '?' or '*' or '+' =>
104		return NIL;
105	}
106	prim: refRex;
107	case kind:=s.next() {
108	'(' =>	if(ar.pno < 0) {
109			if(s.peek() == ')') {
110				s.next();
111				prim = newRe(NUL, NONE, NONE, nil, ar, 0);
112			} else {
113				prim = re(s, ar);
114				if(prim==NIL || s.next()!=')')
115					s.n = -1;
116			}
117		} else {
118			pno := ++ar.pno;
119			lp := newRe(LPN, NONE, NONE, nil, ar, pno);
120			rp := newRe(RPN, NONE, NONE, nil, ar, pno);
121			if(s.peek() == ')') {
122				s.next();
123				prim = newRe(CAT, lp, rp, nil, ar, 0);
124
125			} else {
126				prim = re(s, ar);
127				if(prim==NIL || s.next()!=')')
128					s.n = -1;
129				else {
130					prim = newRe(CAT, prim, rp, nil, ar, 0);
131					prim = newRe(CAT, lp, prim, nil, ar, 0);
132				}
133			}
134		}
135	'[' =>	prim = newRe(SET, NONE, NONE, newSet(s), ar, 0);
136	* =>	case kind {
137		'.' =>	kind = DOT;
138		'^' =>	kind = HAT;
139		'$' =>	kind = DOL;
140		}
141		prim = newRe(esc(s, kind), NONE, NONE, nil, ar, 0);
142	}
143	case s.peek() {
144	'*' =>	kind = CLO;
145	'+' =>	kind = PCLO;
146	'?' =>	kind = OPT;
147	* =>	return prim;
148	}
149	s.next();
150	return newRe(kind, prim, NONE, nil, ar, 0);
151}
152
153esc(s: ref ReStr, char: int): int
154{
155	if(char == '\\') {
156		char = s.next();
157		case char {
158		BAD =>	s.n = -1;
159		'n' =>	char = '\n';
160		}
161	}
162	return char;
163}
164
165# walk the tree adjusting pointers to refer to
166# next state of the finite state machine
167
168walk(r: refRex, succ: refRex, ar: ref Arena)
169{
170	if(r==NONE)
171		return;
172	rex := ar.rex[r];
173	case rex.kind {
174	ALT =>	walk(rex.left, succ, ar);
175		walk(rex.right, succ, ar);
176		return;
177	CAT =>	walk(rex.left, rex.right, ar);
178		walk(rex.right, succ, ar);
179		ar.rex[r] = ar.rex[rex.left];	# optimization
180		return;
181	CLO or PCLO =>
182		end := newRe(OPT, r, succ, nil, ar, 0); # here's the circularity
183		walk(rex.left, end, ar);
184	OPT =>	walk(rex.left, succ, ar);
185	}
186	ar.rex[r].right = succ;
187}
188
189compile(e: string, flag: int): (Re, string)
190{
191	if(e == nil)
192		return (nil, "missing expression");
193	s := ref ReStr(e, 0, len e);
194	ar := ref Arena(array[2*s.n] of Rex, 0, 0, (flag&1)-1);
195	start := ar.start = re(s, ar);
196	if(start==NIL || s.n!=0)
197		return (nil, "invalid regular expression");
198	walk(start, NIL, ar);
199	if(ar.pno < 0)
200		ar.pno = 0;
201	return (ar, nil);
202}
203
204# todo1, todo2: queues for epsilon and advancing transitions
205Gaz: adt {
206	pno: int;
207	beg: int;
208	end: int;
209};
210Trace: adt {
211	cre: refRex;		# cursor in Re
212	beg: int;		# where this trace began;
213	gaz: list of Gaz;
214};
215Queue: adt {
216	ptr: int;
217	q: array of Trace;
218};
219
220execute(re: Re, s: string): array of (int, int)
221{
222	return executese(re, s, (-1,-1), 1, 1);
223}
224
225executese(re: Re, s: string, range: (int, int), bol: int, eol: int): array of (int,int)
226{
227	if(re==nil)
228		return nil;
229	(s0, s1) := range;
230	if(s0 < 0)
231		s0 = 0;
232	if(s1 < 0)
233		s1 = len s;
234	gaz : list of Gaz;
235	(beg, end) := (-1, -1);
236	todo1 := ref Queue(0, array[re.ptr] of Trace);
237	todo2 := ref Queue(0, array[re.ptr] of Trace);
238	for(i:=s0; i<=s1; i++) {
239		small2 := HUGE;		# earliest possible match if advance
240		if(beg == -1)		# no leftmost match yet
241			todo1.q[todo1.ptr++] = Trace(re.start, i, nil);
242		for(k:=0; k<todo1.ptr; k++) {
243			q := todo1.q[k];
244			rex := re.rex[q.cre];
245			next1 := next2 := NONE;
246			case rex.kind {
247			NUL =>
248				next1 = rex.right;
249			DOT =>
250				if(i<len s && s[i]!='\n')
251					next2 = rex.right;
252			HAT =>
253				if(i == s0 && bol)
254					next1 = rex.right;
255			DOL =>
256				if(i == s1 && eol)
257					next1 = rex.right;
258			SET =>
259				if(i<len s && member(s[i], rex.set))
260					next2 = rex.right;
261			CAT or
262			PCLO =>
263				next1 = rex.left;
264			ALT or
265			CLO or
266			OPT =>
267				next1 = rex.right;
268				k = insert(rex.left, q.beg, q.gaz, todo1, k);
269			LPN =>
270				next1 = rex.right;
271				q.gaz = Gaz(rex.pno,i,-1)::q.gaz;
272			RPN =>
273				next1 = rex.right;
274				for(r:=q.gaz; ; r=tl r) {
275					(pno,beg1,end1) := hd r;
276					if(rex.pno==pno && end1==-1) {
277						q.gaz = Gaz(pno,beg1,i)::q.gaz;
278						break;
279					}
280				}
281			* =>
282				if(i<len s && rex.kind==s[i])
283					next2 = rex.right;
284			}
285			if(next1 != NONE) {
286				if(next1 != NIL)
287					k =insert(next1, q.beg, q.gaz, todo1, k);
288				else if(better(q.beg, i, beg, end))
289					(gaz, beg, end) = (q.gaz, q.beg, i);
290			}
291			if(next2 != NONE) {
292				if(next2 != NIL) {
293					if(q.beg < small2)
294						small2 = q.beg;
295					insert(next2, q.beg, q.gaz, todo2, 0);
296				 } else if(better(q.beg, i+1, beg, end))
297					(gaz, beg, end) = (q.gaz, q.beg, i+1);
298			}
299
300		}
301		if(beg!=-1 && beg<small2)	# nothing better possible
302			break;
303		(todo1,todo2) = (todo2, todo1);
304		todo2.ptr = 0;
305	}
306	if(beg == -1)
307		return nil;
308	result := array[re.pno+1] of { 0 => (beg,end), * => (-1,-1) };
309	for( ; gaz!=nil; gaz=tl gaz) {
310		(pno, beg1, end1) := hd gaz;
311		(rbeg, nil) := result[pno];
312		if(rbeg==-1 && (beg1|end1)!=-1)
313			result[pno] = (beg1,end1);
314	}
315	return result;
316}
317
318better(newbeg, newend, oldbeg, oldend: int): int
319{
320	return oldbeg==-1 || newbeg<oldbeg ||
321	       newbeg==oldbeg && newend>oldend;
322}
323
324insert(next: refRex, tbeg: int, tgaz: list of Gaz, todo: ref Queue, k: int): int
325{
326	for(j:=0; j<todo.ptr; j++)
327		if(todo.q[j].cre == next)
328			if(todo.q[j].beg <= tbeg)
329			 	return k;
330			else
331				break;
332	if(j < k)
333		k--;
334	if(j < todo.ptr)
335		todo.ptr--;
336	for( ; j<todo.ptr; j++)
337		todo.q[j] = todo.q[j+1];
338	todo.q[todo.ptr++] = Trace(next, tbeg, tgaz);
339	return k;
340}
341
342ASCII : con 128;
343WORD : con 32;
344
345member(char: int, set: ref Set): int
346{
347	if(char < 128)
348		return ((set.ascii[char/WORD]>>char%WORD)&1)^set.neg;
349	for(l:=set.unicode; l!=nil; l=tl l) {
350		(beg, end) := hd l;
351		if(char>=beg && char<=end)
352			return !set.neg;
353	}
354	return set.neg;
355}
356
357newSet(s: ref ReStr): ref Set
358{
359	set := ref Set(0, array[ASCII/WORD] of {* => 0}, nil);
360	if(s.peek() == '^') {
361		set.neg = 1;
362		s.next();
363	}
364	while(s.n > 0) {
365		char1 := s.next();
366		if(char1 == ']')
367			return set;
368		char1 = esc(s, char1);
369		char2 := char1;
370		if(s.peek() == '-') {
371			s.next();
372			char2 = s.next();
373			if(char2 == ']')
374				break;
375			char2 = esc(s, char2);
376			if(char2 < char1)
377				break;
378		}
379		for( ; char1<=char2; char1++)
380			if(char1 < ASCII)
381				set.ascii[char1/WORD] |= 1<<char1%WORD;
382			else {
383				set.unicode = (char1,char2)::set.unicode;
384				break;
385			}
386	}
387	s.n = -1;
388	return nil;
389}
390