xref: /plan9/sys/src/cmd/grep/comp.c (revision 82726826a7b3d40fb66339b4b0e95b60314f98b9)
1 #include	"grep.h"
2 
3 /*
4  * incremental compiler.
5  * add the branch c to the
6  * state s.
7  */
8 void
increment(State * s,int c)9 increment(State *s, int c)
10 {
11 	int i;
12 	State *t, **tt;
13 	Re *re1, *re2;
14 
15 	nfollow = 0;
16 	gen++;
17 	matched = 0;
18 	for(i=0; i<s->count; i++)
19 		fol1(s->re[i], c);
20 	qsort(follow, nfollow, sizeof(*follow), fcmp);
21 	for(tt=&state0; t = *tt;) {
22 		if(t->count > nfollow) {
23 			tt = &t->linkleft;
24 			goto cont;
25 		}
26 		if(t->count < nfollow) {
27 			tt = &t->linkright;
28 			goto cont;
29 		}
30 		for(i=0; i<nfollow; i++) {
31 			re1 = t->re[i];
32 			re2 = follow[i];
33 			if(re1 > re2) {
34 				tt = &t->linkleft;
35 				goto cont;
36 			}
37 			if(re1 < re2) {
38 				tt = &t->linkright;
39 				goto cont;
40 			}
41 		}
42 		if(!!matched && !t->match) {
43 			tt = &t->linkleft;
44 			goto cont;
45 		}
46 		if(!matched && !!t->match) {
47 			tt = &t->linkright;
48 			goto cont;
49 		}
50 		s->next[c] = t;
51 		return;
52 	cont:;
53 	}
54 
55 	t = sal(nfollow);
56 	*tt = t;
57 	for(i=0; i<nfollow; i++) {
58 		re1 = follow[i];
59 		t->re[i] = re1;
60 	}
61 	s->next[c] = t;
62 	t->match = matched;
63 }
64 
65 int
fcmp(void * va,void * vb)66 fcmp(void *va, void *vb)
67 {
68 	Re **aa, **bb;
69 	Re *a, *b;
70 
71 	aa = va;
72 	bb = vb;
73 	a = *aa;
74 	b = *bb;
75 	if(a > b)
76 		return 1;
77 	if(a < b)
78 		return -1;
79 	return 0;
80 }
81 
82 void
fol1(Re * r,int c)83 fol1(Re *r, int c)
84 {
85 	Re *r1;
86 
87 loop:
88 	if(r->gen == gen)
89 		return;
90 	if(nfollow >= maxfollow)
91 		error("nfollow");
92 	r->gen = gen;
93 	switch(r->type) {
94 	default:
95 		error("fol1");
96 
97 	case Tcase:
98 		if(c >= 0 && c < 256)
99 		if(r1 = r->cases[c])
100 			follow[nfollow++] = r1;
101 		if(r = r->next)
102 			goto loop;
103 		break;
104 
105 	case Talt:
106 	case Tor:
107 		fol1(r->alt, c);
108 		r = r->next;
109 		goto loop;
110 
111 	case Tbegin:
112 		if(c == '\n' || c == Cbegin)
113 			follow[nfollow++] = r->next;
114 		break;
115 
116 	case Tend:
117 		if(c == '\n') {
118 			if(r->next == 0) {
119 				matched = 1;
120 				break;
121 			}
122 			r = r->next;
123 			goto loop;
124 		}
125 		break;
126 
127 	case Tclass:
128 		if(c >= r->lo && c <= r->hi)
129 			follow[nfollow++] = r->next;
130 		break;
131 	}
132 }
133 
134 Rune	tab1[] =
135 {
136 	0x007f,
137 	0x07ff,
138 };
139 Rune	tab2[] =
140 {
141 	0x003f,
142 	0x0fff,
143 };
144 
145 Re2
rclass(Rune p0,Rune p1)146 rclass(Rune p0, Rune p1)
147 {
148 	char xc0[6], xc1[6];
149 	int i, n, m;
150 	Re2 x;
151 
152 	if(p0 > p1)
153 		return re2char(0xff, 0xff);	// no match
154 
155 	/*
156 	 * bust range into same length
157 	 * character sequences
158 	 */
159 	for(i=0; i<nelem(tab1); i++) {
160 		m = tab1[i];
161 		if(p0 <= m && p1 > m)
162 			return re2or(rclass(p0, m), rclass(m+1, p1));
163 	}
164 
165 	/*
166 	 * bust range into part of a single page
167 	 * or into full pages
168 	 */
169 	for(i=0; i<nelem(tab2); i++) {
170 		m = tab2[i];
171 		if((p0 & ~m) != (p1 & ~m)) {
172 			if((p0 & m) != 0)
173 				return re2or(rclass(p0, p0|m), rclass((p0|m)+1, p1));
174 			if((p1 & m) != m)
175 				return re2or(rclass(p0, (p1&~m)-1), rclass(p1&~m, p1));
176 		}
177 	}
178 
179 	n = runetochar(xc0, &p0);
180 	i = runetochar(xc1, &p1);
181 	if(i != n)
182 		error("length");
183 
184 	x = re2char(xc0[0], xc1[0]);
185 	for(i=1; i<n; i++)
186 		x = re2cat(x, re2char(xc0[i], xc1[i]));
187 	return x;
188 }
189 
190 int
pcmp(void * va,void * vb)191 pcmp(void *va, void *vb)
192 {
193 	int n;
194 	Rune *a, *b;
195 
196 	a = va;
197 	b = vb;
198 
199 	n = a[0] - b[0];
200 	if(n)
201 		return n;
202 	return a[1] - b[1];
203 }
204 
205 /*
206  * convert character chass into
207  * run-pair ranges of matches.
208  * this is 10646/utf specific and
209  * needs to be changed for some
210  * other input character set.
211  * this is the key to a fast
212  * regular search of characters
213  * by looking at sequential bytes.
214  */
215 Re2
re2class(char * s)216 re2class(char *s)
217 {
218 	Rune pairs[200+2], *p, *q, ov;
219 	int nc;
220 	Re2 x;
221 
222 	nc = 0;
223 	if(*s == '^') {
224 		nc = 1;
225 		s++;
226 	}
227 
228 	p = pairs;
229 	s += chartorune(p, s);
230 	for(;;) {
231 		if(*p == '\\')
232 			s += chartorune(p, s);
233 		if(*p == 0)
234 			break;
235 		p[1] = *p;
236 		p += 2;
237 		if(p >= pairs + nelem(pairs) - 2)
238 			error("class too big");
239 		s += chartorune(p, s);
240 		if(*p != '-')
241 			continue;
242 		s += chartorune(p, s);
243 		if(*p == '\\')
244 			s += chartorune(p, s);
245 		if(*p == 0)
246 			break;
247 		p[-1] = *p;
248 		s += chartorune(p, s);
249 	}
250 	*p = 0;
251 	qsort(pairs, (p-pairs)/2, 2*sizeof(*pairs), pcmp);
252 
253 	q = pairs;
254 	for(p=pairs+2; *p; p+=2) {
255 		if(p[0] > p[1])
256 			continue;
257 		if(p[0] > q[1] || p[1] < q[0]) {
258 			q[2] = p[0];
259 			q[3] = p[1];
260 			q += 2;
261 			continue;
262 		}
263 		if(p[0] < q[0])
264 			q[0] = p[0];
265 		if(p[1] > q[1])
266 			q[1] = p[1];
267 	}
268 	q[2] = 0;
269 
270 	p = pairs;
271 	if(nc) {
272 		x = rclass(0, p[0]-1);
273 		ov = p[1]+1;
274 		for(p+=2; *p; p+=2) {
275 			x = re2or(x, rclass(ov, p[0]-1));
276 			ov = p[1]+1;
277 		}
278 		x = re2or(x, rclass(ov, Runemask));
279 	} else {
280 		x = rclass(p[0], p[1]);
281 		for(p+=2; *p; p+=2)
282 			x = re2or(x, rclass(p[0], p[1]));
283 	}
284 	return x;
285 }
286