xref: /plan9/sys/src/libregexp/regexec.c (revision 219b2ee8daee37f4aad58d63f21287faa8e4ffdc)
1 #include <u.h>
2 #include <libc.h>
3 #include "regexp.h"
4 #include "regcomp.h"
5 
6 
7 /*
8  *  return	0 if no match
9  *		>0 if a match
10  *		<0 if we ran out of _relist space
11  */
12 static int
13 regexec1(Reprog *progp,	/* program to run */
14 	char *bol,	/* string to run machine on */
15 	Resub *mp,	/* subexpression elements */
16 	int ms,		/* number of elements at mp */
17 	Reljunk *j
18 )
19 {
20 	int flag=0;
21 	Reinst *inst;
22 	Relist *tlp;
23 	char *s;
24 	int i, checkstart;
25 	Rune r, *rp, *ep;
26 	int n;
27 	Relist* tl;		/* This list, next list */
28 	Relist* nl;
29 	Relist* tle;		/* ends of this and next list */
30 	Relist* nle;
31 	int match;
32 	char *p;
33 
34 	match = 0;
35 	checkstart = j->starttype;
36 	if(mp)
37 		for(i=0; i<ms; i++) {
38 			mp[i].sp = 0;
39 			mp[i].ep = 0;
40 		}
41 	j->relist[0][0].inst = 0;
42 	j->relist[1][0].inst = 0;
43 
44 	/* Execute machine once for each character, including terminal NUL */
45 	s = j->starts;
46 	do{
47 		/* fast check for first char */
48 		if(checkstart) {
49 			switch(j->starttype) {
50 			case RUNE:
51 				p = utfrune(s, j->startchar);
52 				if(p == 0)
53 					return match;
54 				s = p;
55 				break;
56 			case BOL:
57 				if(s == bol)
58 					break;
59 				p = utfrune(s, '\n');
60 				if(p == 0)
61 					return match;
62 				s = p;
63 				break;
64 			}
65 		}
66 		r = *(uchar*)s;
67 		if(r < Runeself)
68 			n = 1;
69 		else
70 			n = chartorune(&r, s);
71 
72 		/* switch run lists */
73 		tl = j->relist[flag];
74 		tle = j->reliste[flag];
75 		nl = j->relist[flag^=1];
76 		nle = j->reliste[flag];
77 		nl->inst = 0;
78 
79 		/* Add first instruction to current list */
80 		if(match == 0)
81 			_renewemptythread(tl, progp->startinst, s);
82 
83 		/* Execute machine until current list is empty */
84 		for(tlp=tl; tlp->inst; tlp++){	/* assignment = */
85 			if(s == j->eol)
86 				break;
87 
88 			for(inst = tlp->inst; ; inst = inst->next){
89 				switch(inst->type){
90 				case RUNE:	/* regular character */
91 					if(inst->r == r){
92 						if(_renewthread(nl, inst->next, &tlp->se)==nle)
93 							return -1;
94 					}
95 					break;
96 				case LBRA:
97 					tlp->se.m[inst->subid].sp = s;
98 					continue;
99 				case RBRA:
100 					tlp->se.m[inst->subid].ep = s;
101 					continue;
102 				case ANY:
103 					if(r != '\n')
104 						if(_renewthread(nl, inst->next, &tlp->se)==nle)
105 							return -1;
106 					break;
107 				case ANYNL:
108 					if(_renewthread(nl, inst->next, &tlp->se)==nle)
109 							return -1;
110 					break;
111 				case BOL:
112 					if(s == bol || *(s-1) == '\n')
113 						continue;
114 					break;
115 				case EOL:
116 					if(r == 0 || r == '\n')
117 						continue;
118 					break;
119 				case CCLASS:
120 					ep = inst->cp->end;
121 					for(rp = inst->cp->spans; rp < ep; rp += 2)
122 						if(r >= rp[0] && r <= rp[1]){
123 							if(_renewthread(nl, inst->next, &tlp->se)==nle)
124 								return -1;
125 							break;
126 						}
127 					break;
128 				case NCCLASS:
129 					ep = inst->cp->end;
130 					for(rp = inst->cp->spans; rp < ep; rp += 2)
131 						if(r >= rp[0] && r <= rp[1])
132 							break;
133 					if(rp == ep)
134 						if(_renewthread(nl, inst->next, &tlp->se)==nle)
135 							return -1;
136 					break;
137 				case OR:
138 					/* evaluate right choice later */
139 					if(_renewthread(tlp, inst->right, &tlp->se) == tle)
140 						return -1;
141 					/* efficiency: advance and re-evaluate */
142 					continue;
143 				case END:	/* Match! */
144 					match = 1;
145 					tlp->se.m[0].ep = s;
146 					if(mp != 0)
147 						_renewmatch(mp, ms, &tlp->se);
148 					break;
149 				}
150 				break;
151 			}
152 		}
153 		checkstart = j->starttype && nl->inst==0;
154 		s += n;
155 	}while(r);
156 	return match;
157 }
158 
159 static int
160 regexec2(Reprog *progp,	/* program to run */
161 	char *bol,	/* string to run machine on */
162 	Resub *mp,	/* subexpression elements */
163 	int ms,		/* number of elements at mp */
164 	Reljunk *j
165 )
166 {
167 	Relist relist0[BIGLISTSIZE], relist1[BIGLISTSIZE];
168 
169 	/* mark space */
170 	j->relist[0] = relist0;
171 	j->relist[1] = relist1;
172 	j->reliste[0] = relist0 + nelem(relist0) - 2;
173 	j->reliste[1] = relist1 + nelem(relist1) - 2;
174 
175 	return regexec1(progp, bol, mp, ms, j);
176 }
177 
178 extern int
179 regexec(Reprog *progp,	/* program to run */
180 	char *bol,	/* string to run machine on */
181 	Resub *mp,	/* subexpression elements */
182 	int ms)		/* number of elements at mp */
183 {
184 	Reljunk j;
185 	Relist relist0[LISTSIZE], relist1[LISTSIZE];
186 	int rv;
187 
188 	/*
189  	 *  use user-specified starting/ending location if specified
190 	 */
191 	j.starts = bol;
192 	j.eol = 0;
193 	if(mp && ms>0){
194 		if(mp->sp)
195 			j.starts = mp->sp;
196 		if(mp->ep)
197 			j.eol = mp->ep;
198 	}
199 	j.starttype = 0;
200 	j.startchar = 0;
201 	if(progp->startinst->type == RUNE && progp->startinst->r < Runeself) {
202 		j.starttype = RUNE;
203 		j.startchar = progp->startinst->r;
204 	}
205 	if(progp->startinst->type == BOL)
206 		j.starttype = BOL;
207 
208 	/* mark space */
209 	j.relist[0] = relist0;
210 	j.relist[1] = relist1;
211 	j.reliste[0] = relist0 + nelem(relist0) - 2;
212 	j.reliste[1] = relist1 + nelem(relist1) - 2;
213 
214 	rv = regexec1(progp, bol, mp, ms, &j);
215 	if(rv >= 0)
216 		return rv;
217 	rv = regexec2(progp, bol, mp, ms, &j);
218 	if(rv >= 0)
219 		return rv;
220 	return -1;
221 }
222