1*4246b616SDavid du Colombier #include <u.h>
2*4246b616SDavid du Colombier #include <libc.h>
3*4246b616SDavid du Colombier #include <bin.h>
4*4246b616SDavid du Colombier #include <bio.h>
5*4246b616SDavid du Colombier #include <regexp.h>
6*4246b616SDavid du Colombier #include "/sys/src/libregexp/regcomp.h"
7*4246b616SDavid du Colombier #include "dfa.h"
8*4246b616SDavid du Colombier
9*4246b616SDavid du Colombier void rdump(Reprog*);
10*4246b616SDavid du Colombier void dump(Dreprog*);
11*4246b616SDavid du Colombier
12*4246b616SDavid du Colombier /*
13*4246b616SDavid du Colombier * Standard NFA determinization and DFA minimization.
14*4246b616SDavid du Colombier */
15*4246b616SDavid du Colombier typedef struct Deter Deter;
16*4246b616SDavid du Colombier typedef struct Reiset Reiset;
17*4246b616SDavid du Colombier
18*4246b616SDavid du Colombier void ddump(Deter*);
19*4246b616SDavid du Colombier
20*4246b616SDavid du Colombier /* state of determinization */
21*4246b616SDavid du Colombier struct Deter
22*4246b616SDavid du Colombier {
23*4246b616SDavid du Colombier jmp_buf kaboom; /* jmp on error */
24*4246b616SDavid du Colombier
25*4246b616SDavid du Colombier Bin *bin; /* bin for temporary allocations */
26*4246b616SDavid du Colombier
27*4246b616SDavid du Colombier Reprog *p; /* program being determinized */
28*4246b616SDavid du Colombier uint ninst; /* number of instructions in program */
29*4246b616SDavid du Colombier
30*4246b616SDavid du Colombier Reiset *alloc; /* chain of all Reisets */
31*4246b616SDavid du Colombier Reiset **last;
32*4246b616SDavid du Colombier
33*4246b616SDavid du Colombier Reiset **hash; /* hash of all Reisets */
34*4246b616SDavid du Colombier uint nhash;
35*4246b616SDavid du Colombier
36*4246b616SDavid du Colombier Reiset *tmp; /* temporaries for walk */
37*4246b616SDavid du Colombier uchar *bits;
38*4246b616SDavid du Colombier
39*4246b616SDavid du Colombier Rune *c; /* ``interesting'' characters */
40*4246b616SDavid du Colombier uint nc;
41*4246b616SDavid du Colombier };
42*4246b616SDavid du Colombier
43*4246b616SDavid du Colombier /* set of Reinsts: perhaps we should use a bit list instead of the indices? */
44*4246b616SDavid du Colombier struct Reiset
45*4246b616SDavid du Colombier {
46*4246b616SDavid du Colombier uint *inst; /* indices of instructions in set */
47*4246b616SDavid du Colombier uint ninst; /* size of set */
48*4246b616SDavid du Colombier
49*4246b616SDavid du Colombier Reiset *next; /* d.alloc chain */
50*4246b616SDavid du Colombier Reiset *hash; /* d.hash chain */
51*4246b616SDavid du Colombier Reiset **delta; /* where to go on each interesting char */
52*4246b616SDavid du Colombier uint id; /* assigned id during minimization */
53*4246b616SDavid du Colombier uint isfinal; /* is an accepting (final) state */
54*4246b616SDavid du Colombier };
55*4246b616SDavid du Colombier
56*4246b616SDavid du Colombier static Reiset*
ralloc(Deter * d,int ninst)57*4246b616SDavid du Colombier ralloc(Deter *d, int ninst)
58*4246b616SDavid du Colombier {
59*4246b616SDavid du Colombier Reiset *t;
60*4246b616SDavid du Colombier
61*4246b616SDavid du Colombier t = binalloc(&d->bin, sizeof(Reiset)+2*d->nc*sizeof(Reiset*)+sizeof(uint)*ninst, 0);
62*4246b616SDavid du Colombier if(t == nil)
63*4246b616SDavid du Colombier longjmp(d->kaboom, 1);
64*4246b616SDavid du Colombier t->delta = (Reiset**)&t[1];
65*4246b616SDavid du Colombier t->inst = (uint*)&t->delta[2*d->nc];
66*4246b616SDavid du Colombier return t;
67*4246b616SDavid du Colombier }
68*4246b616SDavid du Colombier
69*4246b616SDavid du Colombier /* find the canonical form a given Reiset */
70*4246b616SDavid du Colombier static Reiset*
findreiset(Deter * d,Reiset * s)71*4246b616SDavid du Colombier findreiset(Deter *d, Reiset *s)
72*4246b616SDavid du Colombier {
73*4246b616SDavid du Colombier int i, szinst;
74*4246b616SDavid du Colombier uint h;
75*4246b616SDavid du Colombier Reiset *t;
76*4246b616SDavid du Colombier
77*4246b616SDavid du Colombier h = 0;
78*4246b616SDavid du Colombier for(i=0; i<s->ninst; i++)
79*4246b616SDavid du Colombier h = h*1000003 + s->inst[i];
80*4246b616SDavid du Colombier h %= d->nhash;
81*4246b616SDavid du Colombier
82*4246b616SDavid du Colombier szinst = s->ninst*sizeof(s->inst[0]);
83*4246b616SDavid du Colombier for(t=d->hash[h]; t; t=t->hash)
84*4246b616SDavid du Colombier if(t->ninst==s->ninst && memcmp(t->inst, s->inst, szinst)==0)
85*4246b616SDavid du Colombier return t;
86*4246b616SDavid du Colombier
87*4246b616SDavid du Colombier t = ralloc(d, s->ninst);
88*4246b616SDavid du Colombier t->hash = d->hash[h];
89*4246b616SDavid du Colombier d->hash[h] = t;
90*4246b616SDavid du Colombier
91*4246b616SDavid du Colombier *d->last = t;
92*4246b616SDavid du Colombier d->last = &t->next;
93*4246b616SDavid du Colombier t->next = 0;
94*4246b616SDavid du Colombier
95*4246b616SDavid du Colombier t->ninst = s->ninst;
96*4246b616SDavid du Colombier memmove(t->inst, s->inst, szinst);
97*4246b616SDavid du Colombier
98*4246b616SDavid du Colombier /* delta is filled in later */
99*4246b616SDavid du Colombier
100*4246b616SDavid du Colombier return t;
101*4246b616SDavid du Colombier }
102*4246b616SDavid du Colombier
103*4246b616SDavid du Colombier /* convert bits to a real reiset */
104*4246b616SDavid du Colombier static Reiset*
bits2reiset(Deter * d,uchar * bits)105*4246b616SDavid du Colombier bits2reiset(Deter *d, uchar *bits)
106*4246b616SDavid du Colombier {
107*4246b616SDavid du Colombier int k;
108*4246b616SDavid du Colombier Reiset *s;
109*4246b616SDavid du Colombier
110*4246b616SDavid du Colombier s = d->tmp;
111*4246b616SDavid du Colombier s->ninst = 0;
112*4246b616SDavid du Colombier for(k=0; k<d->ninst; k++)
113*4246b616SDavid du Colombier if(bits[k])
114*4246b616SDavid du Colombier s->inst[s->ninst++] = k;
115*4246b616SDavid du Colombier return findreiset(d, s);
116*4246b616SDavid du Colombier }
117*4246b616SDavid du Colombier
118*4246b616SDavid du Colombier /* add n to state set; if n < k, need to go around again */
119*4246b616SDavid du Colombier static int
add(int n,uchar * bits,int k)120*4246b616SDavid du Colombier add(int n, uchar *bits, int k)
121*4246b616SDavid du Colombier {
122*4246b616SDavid du Colombier if(bits[n])
123*4246b616SDavid du Colombier return 0;
124*4246b616SDavid du Colombier bits[n] = 1;
125*4246b616SDavid du Colombier return n < k;
126*4246b616SDavid du Colombier }
127*4246b616SDavid du Colombier
128*4246b616SDavid du Colombier /* update bits to follow all the empty (non-character-related) transitions possible */
129*4246b616SDavid du Colombier static void
followempty(Deter * d,uchar * bits,int bol,int eol)130*4246b616SDavid du Colombier followempty(Deter *d, uchar *bits, int bol, int eol)
131*4246b616SDavid du Colombier {
132*4246b616SDavid du Colombier int again, k;
133*4246b616SDavid du Colombier Reinst *i;
134*4246b616SDavid du Colombier
135*4246b616SDavid du Colombier do{
136*4246b616SDavid du Colombier again = 0;
137*4246b616SDavid du Colombier for(i=d->p->firstinst, k=0; k < d->ninst; i++, k++){
138*4246b616SDavid du Colombier if(!bits[k])
139*4246b616SDavid du Colombier continue;
140*4246b616SDavid du Colombier switch(i->type){
141*4246b616SDavid du Colombier case RBRA:
142*4246b616SDavid du Colombier case LBRA:
143*4246b616SDavid du Colombier again |= add(i->next - d->p->firstinst, bits, k);
144*4246b616SDavid du Colombier break;
145*4246b616SDavid du Colombier case OR:
146*4246b616SDavid du Colombier again |= add(i->left - d->p->firstinst, bits, k);
147*4246b616SDavid du Colombier again |= add(i->right - d->p->firstinst, bits, k);
148*4246b616SDavid du Colombier break;
149*4246b616SDavid du Colombier case BOL:
150*4246b616SDavid du Colombier if(bol)
151*4246b616SDavid du Colombier again |= add(i->next - d->p->firstinst, bits, k);
152*4246b616SDavid du Colombier break;
153*4246b616SDavid du Colombier case EOL:
154*4246b616SDavid du Colombier if(eol)
155*4246b616SDavid du Colombier again |= add(i->next - d->p->firstinst, bits, k);
156*4246b616SDavid du Colombier break;
157*4246b616SDavid du Colombier }
158*4246b616SDavid du Colombier }
159*4246b616SDavid du Colombier }while(again);
160*4246b616SDavid du Colombier
161*4246b616SDavid du Colombier /*
162*4246b616SDavid du Colombier * Clear bits for useless transitions. We could do this during
163*4246b616SDavid du Colombier * the switch above, but then we have no guarantee of termination
164*4246b616SDavid du Colombier * if we get a loop in the regexp.
165*4246b616SDavid du Colombier */
166*4246b616SDavid du Colombier for(i=d->p->firstinst, k=0; k < d->ninst; i++, k++){
167*4246b616SDavid du Colombier if(!bits[k])
168*4246b616SDavid du Colombier continue;
169*4246b616SDavid du Colombier switch(i->type){
170*4246b616SDavid du Colombier case RBRA:
171*4246b616SDavid du Colombier case LBRA:
172*4246b616SDavid du Colombier case OR:
173*4246b616SDavid du Colombier case BOL:
174*4246b616SDavid du Colombier case EOL:
175*4246b616SDavid du Colombier bits[k] = 0;
176*4246b616SDavid du Colombier break;
177*4246b616SDavid du Colombier }
178*4246b616SDavid du Colombier }
179*4246b616SDavid du Colombier }
180*4246b616SDavid du Colombier
181*4246b616SDavid du Colombier /*
182*4246b616SDavid du Colombier * Where does s go if it sees rune r?
183*4246b616SDavid du Colombier * Eol is true if a $ matches the string at the position just after r.
184*4246b616SDavid du Colombier */
185*4246b616SDavid du Colombier static Reiset*
transition(Deter * d,Reiset * s,Rune r,uint eol)186*4246b616SDavid du Colombier transition(Deter *d, Reiset *s, Rune r, uint eol)
187*4246b616SDavid du Colombier {
188*4246b616SDavid du Colombier int k;
189*4246b616SDavid du Colombier uchar *bits;
190*4246b616SDavid du Colombier Reinst *i, *inst0;
191*4246b616SDavid du Colombier Rune *rp, *ep;
192*4246b616SDavid du Colombier
193*4246b616SDavid du Colombier bits = d->bits;
194*4246b616SDavid du Colombier memset(bits, 0, d->ninst);
195*4246b616SDavid du Colombier
196*4246b616SDavid du Colombier inst0 = d->p->firstinst;
197*4246b616SDavid du Colombier for(k=0; k < s->ninst; k++){
198*4246b616SDavid du Colombier i = inst0 + s->inst[k];
199*4246b616SDavid du Colombier switch(i->type){
200*4246b616SDavid du Colombier default:
201*4246b616SDavid du Colombier werrstr("bad reprog: got type %d", i->type);
202*4246b616SDavid du Colombier longjmp(d->kaboom, 1);
203*4246b616SDavid du Colombier case RBRA:
204*4246b616SDavid du Colombier case LBRA:
205*4246b616SDavid du Colombier case OR:
206*4246b616SDavid du Colombier case BOL:
207*4246b616SDavid du Colombier case EOL:
208*4246b616SDavid du Colombier werrstr("internal error: got type %d", i->type);
209*4246b616SDavid du Colombier longjmp(d->kaboom, 1);
210*4246b616SDavid du Colombier
211*4246b616SDavid du Colombier case RUNE:
212*4246b616SDavid du Colombier if(r == i->r)
213*4246b616SDavid du Colombier bits[i->next - inst0] = 1;
214*4246b616SDavid du Colombier break;
215*4246b616SDavid du Colombier case ANY:
216*4246b616SDavid du Colombier if(r != L'\n')
217*4246b616SDavid du Colombier bits[i->next - inst0] = 1;
218*4246b616SDavid du Colombier break;
219*4246b616SDavid du Colombier case ANYNL:
220*4246b616SDavid du Colombier bits[i->next - inst0] = 1;
221*4246b616SDavid du Colombier break;
222*4246b616SDavid du Colombier case NCCLASS:
223*4246b616SDavid du Colombier if(r == L'\n')
224*4246b616SDavid du Colombier break;
225*4246b616SDavid du Colombier /* fall through */
226*4246b616SDavid du Colombier case CCLASS:
227*4246b616SDavid du Colombier ep = i->cp->end;
228*4246b616SDavid du Colombier for(rp = i->cp->spans; rp < ep; rp += 2)
229*4246b616SDavid du Colombier if(rp[0] <= r && r <= rp[1])
230*4246b616SDavid du Colombier break;
231*4246b616SDavid du Colombier if((rp < ep) ^! (i->type == CCLASS))
232*4246b616SDavid du Colombier bits[i->next - inst0] = 1;
233*4246b616SDavid du Colombier break;
234*4246b616SDavid du Colombier case END:
235*4246b616SDavid du Colombier break;
236*4246b616SDavid du Colombier }
237*4246b616SDavid du Colombier }
238*4246b616SDavid du Colombier
239*4246b616SDavid du Colombier followempty(d, bits, r=='\n', eol);
240*4246b616SDavid du Colombier return bits2reiset(d, bits);
241*4246b616SDavid du Colombier }
242*4246b616SDavid du Colombier
243*4246b616SDavid du Colombier static int
countinst(Reprog * pp)244*4246b616SDavid du Colombier countinst(Reprog *pp)
245*4246b616SDavid du Colombier {
246*4246b616SDavid du Colombier int n;
247*4246b616SDavid du Colombier Reinst *l;
248*4246b616SDavid du Colombier
249*4246b616SDavid du Colombier n = 0;
250*4246b616SDavid du Colombier l = pp->firstinst;
251*4246b616SDavid du Colombier while(l++->type)
252*4246b616SDavid du Colombier n++;
253*4246b616SDavid du Colombier return n;
254*4246b616SDavid du Colombier }
255*4246b616SDavid du Colombier
256*4246b616SDavid du Colombier static void
set(Deter * d,u32int ** tab,Rune r)257*4246b616SDavid du Colombier set(Deter *d, u32int **tab, Rune r)
258*4246b616SDavid du Colombier {
259*4246b616SDavid du Colombier u32int *u;
260*4246b616SDavid du Colombier
261*4246b616SDavid du Colombier if((u = tab[r/4096]) == nil){
262*4246b616SDavid du Colombier u = binalloc(&d->bin, 4096/8, 1);
263*4246b616SDavid du Colombier if(u == nil)
264*4246b616SDavid du Colombier longjmp(d->kaboom, 1);
265*4246b616SDavid du Colombier tab[r/4096] = u;
266*4246b616SDavid du Colombier }
267*4246b616SDavid du Colombier u[(r%4096)/32] |= 1<<(r%32);
268*4246b616SDavid du Colombier }
269*4246b616SDavid du Colombier
270*4246b616SDavid du Colombier /*
271*4246b616SDavid du Colombier * Compute the list of important characters.
272*4246b616SDavid du Colombier * Other characters behave like the ones that surround them.
273*4246b616SDavid du Colombier */
274*4246b616SDavid du Colombier static void
findchars(Deter * d,Reprog * p)275*4246b616SDavid du Colombier findchars(Deter *d, Reprog *p)
276*4246b616SDavid du Colombier {
277*4246b616SDavid du Colombier u32int *tab[65536/4096], *u, x;
278*4246b616SDavid du Colombier Reinst *i;
279*4246b616SDavid du Colombier Rune *rp, *ep;
280*4246b616SDavid du Colombier int k, m, n, a;
281*4246b616SDavid du Colombier
282*4246b616SDavid du Colombier memset(tab, 0, sizeof tab);
283*4246b616SDavid du Colombier set(d, tab, 0);
284*4246b616SDavid du Colombier set(d, tab, 0xFFFF);
285*4246b616SDavid du Colombier for(i=p->firstinst; i->type; i++){
286*4246b616SDavid du Colombier switch(i->type){
287*4246b616SDavid du Colombier case ANY:
288*4246b616SDavid du Colombier set(d, tab, L'\n'-1);
289*4246b616SDavid du Colombier set(d, tab, L'\n');
290*4246b616SDavid du Colombier set(d, tab, L'\n'+1);
291*4246b616SDavid du Colombier break;
292*4246b616SDavid du Colombier case RUNE:
293*4246b616SDavid du Colombier set(d, tab, i->r-1);
294*4246b616SDavid du Colombier set(d, tab, i->r);
295*4246b616SDavid du Colombier set(d, tab, i->r+1);
296*4246b616SDavid du Colombier break;
297*4246b616SDavid du Colombier case NCCLASS:
298*4246b616SDavid du Colombier set(d, tab, L'\n'-1);
299*4246b616SDavid du Colombier set(d, tab, L'\n');
300*4246b616SDavid du Colombier set(d, tab, L'\n'+1);
301*4246b616SDavid du Colombier /* fall through */
302*4246b616SDavid du Colombier case CCLASS:
303*4246b616SDavid du Colombier ep = i->cp->end;
304*4246b616SDavid du Colombier for(rp = i->cp->spans; rp < ep; rp += 2){
305*4246b616SDavid du Colombier set(d, tab, rp[0]-1);
306*4246b616SDavid du Colombier set(d, tab, rp[0]);
307*4246b616SDavid du Colombier set(d, tab, rp[1]);
308*4246b616SDavid du Colombier set(d, tab, rp[1]+1);
309*4246b616SDavid du Colombier }
310*4246b616SDavid du Colombier break;
311*4246b616SDavid du Colombier }
312*4246b616SDavid du Colombier }
313*4246b616SDavid du Colombier
314*4246b616SDavid du Colombier n = 0;
315*4246b616SDavid du Colombier for(k=0; k<nelem(tab); k++){
316*4246b616SDavid du Colombier if((u = tab[k]) == nil)
317*4246b616SDavid du Colombier continue;
318*4246b616SDavid du Colombier for(m=0; m<4096/32; m++){
319*4246b616SDavid du Colombier if((x = u[m]) == 0)
320*4246b616SDavid du Colombier continue;
321*4246b616SDavid du Colombier for(a=0; a<32; a++)
322*4246b616SDavid du Colombier if(x&(1<<a))
323*4246b616SDavid du Colombier n++;
324*4246b616SDavid du Colombier }
325*4246b616SDavid du Colombier }
326*4246b616SDavid du Colombier
327*4246b616SDavid du Colombier d->c = binalloc(&d->bin, (n+1)*sizeof(Rune), 0);
328*4246b616SDavid du Colombier if(d->c == 0)
329*4246b616SDavid du Colombier longjmp(d->kaboom, 1);
330*4246b616SDavid du Colombier d->nc = n;
331*4246b616SDavid du Colombier
332*4246b616SDavid du Colombier n = 0;
333*4246b616SDavid du Colombier for(k=0; k<nelem(tab); k++){
334*4246b616SDavid du Colombier if((u = tab[k]) == nil)
335*4246b616SDavid du Colombier continue;
336*4246b616SDavid du Colombier for(m=0; m<4096/32; m++){
337*4246b616SDavid du Colombier if((x = u[m]) == 0)
338*4246b616SDavid du Colombier continue;
339*4246b616SDavid du Colombier for(a=0; a<32; a++)
340*4246b616SDavid du Colombier if(x&(1<<a))
341*4246b616SDavid du Colombier d->c[n++] = k*4096+m*32+a;
342*4246b616SDavid du Colombier }
343*4246b616SDavid du Colombier }
344*4246b616SDavid du Colombier
345*4246b616SDavid du Colombier d->c[n] = 0;
346*4246b616SDavid du Colombier if(n != d->nc)
347*4246b616SDavid du Colombier abort();
348*4246b616SDavid du Colombier }
349*4246b616SDavid du Colombier
350*4246b616SDavid du Colombier /*
351*4246b616SDavid du Colombier * convert the Deter and Reisets into a Dreprog.
352*4246b616SDavid du Colombier * if dp and c are nil, just return the count of Drecases needed.
353*4246b616SDavid du Colombier */
354*4246b616SDavid du Colombier static int
buildprog(Deter * d,Reiset ** id2set,int nid,Dreprog * dp,Drecase * c)355*4246b616SDavid du Colombier buildprog(Deter *d, Reiset **id2set, int nid, Dreprog *dp, Drecase *c)
356*4246b616SDavid du Colombier {
357*4246b616SDavid du Colombier int i, j, id, n, nn;
358*4246b616SDavid du Colombier Dreinst *di;
359*4246b616SDavid du Colombier Reiset *s;
360*4246b616SDavid du Colombier
361*4246b616SDavid du Colombier nn = 0;
362*4246b616SDavid du Colombier di = 0;
363*4246b616SDavid du Colombier for(i=0; i<nid; i++){
364*4246b616SDavid du Colombier s = id2set[i];
365*4246b616SDavid du Colombier if(c){
366*4246b616SDavid du Colombier di = &dp->inst[i];
367*4246b616SDavid du Colombier di->isfinal = s->isfinal;
368*4246b616SDavid du Colombier }
369*4246b616SDavid du Colombier n = 0;
370*4246b616SDavid du Colombier id = -1;
371*4246b616SDavid du Colombier for(j=0; j<2*d->nc; j++){
372*4246b616SDavid du Colombier if(s->delta[j]->id != id){
373*4246b616SDavid du Colombier id = s->delta[j]->id;
374*4246b616SDavid du Colombier if(c){
375*4246b616SDavid du Colombier c[n].start = ((j/d->nc)<<16) | d->c[j%d->nc];
376*4246b616SDavid du Colombier c[n].next = &dp->inst[id];
377*4246b616SDavid du Colombier }
378*4246b616SDavid du Colombier n++;
379*4246b616SDavid du Colombier }
380*4246b616SDavid du Colombier }
381*4246b616SDavid du Colombier if(c){
382*4246b616SDavid du Colombier if(n == 1 && c[0].next == di)
383*4246b616SDavid du Colombier di->isloop = 1;
384*4246b616SDavid du Colombier di->c = c;
385*4246b616SDavid du Colombier di->nc = n;
386*4246b616SDavid du Colombier c += n;
387*4246b616SDavid du Colombier }
388*4246b616SDavid du Colombier nn += n;
389*4246b616SDavid du Colombier }
390*4246b616SDavid du Colombier return nn;
391*4246b616SDavid du Colombier }
392*4246b616SDavid du Colombier
393*4246b616SDavid du Colombier Dreprog*
dregcvt(Reprog * p)394*4246b616SDavid du Colombier dregcvt(Reprog *p)
395*4246b616SDavid du Colombier {
396*4246b616SDavid du Colombier uchar *bits;
397*4246b616SDavid du Colombier uint again, n, nid, id;
398*4246b616SDavid du Colombier Deter d;
399*4246b616SDavid du Colombier Reiset **id2set, *s, *t, *start[4];
400*4246b616SDavid du Colombier Dreprog *dp;
401*4246b616SDavid du Colombier Drecase *c;
402*4246b616SDavid du Colombier
403*4246b616SDavid du Colombier memset(&d, 0, sizeof d);
404*4246b616SDavid du Colombier
405*4246b616SDavid du Colombier if(setjmp(d.kaboom)){
406*4246b616SDavid du Colombier binfree(&d.bin);
407*4246b616SDavid du Colombier return nil;
408*4246b616SDavid du Colombier }
409*4246b616SDavid du Colombier
410*4246b616SDavid du Colombier d.p = p;
411*4246b616SDavid du Colombier d.ninst = countinst(p);
412*4246b616SDavid du Colombier
413*4246b616SDavid du Colombier d.last = &d.alloc;
414*4246b616SDavid du Colombier
415*4246b616SDavid du Colombier n = d.ninst;
416*4246b616SDavid du Colombier /* round up to power of two; this loop is the least of our efficiency problems */
417*4246b616SDavid du Colombier while(n&(n-1))
418*4246b616SDavid du Colombier n++;
419*4246b616SDavid du Colombier d.nhash = n;
420*4246b616SDavid du Colombier d.hash = binalloc(&d.bin, d.nhash*sizeof(Reinst*), 1);
421*4246b616SDavid du Colombier
422*4246b616SDavid du Colombier /* get list of important runes */
423*4246b616SDavid du Colombier findchars(&d, p);
424*4246b616SDavid du Colombier
425*4246b616SDavid du Colombier #ifdef DUMP
426*4246b616SDavid du Colombier print("relevant chars are: «%S»\n", d.c+1);
427*4246b616SDavid du Colombier #endif
428*4246b616SDavid du Colombier
429*4246b616SDavid du Colombier d.bits = bits = binalloc(&d.bin, d.ninst, 0);
430*4246b616SDavid du Colombier d.tmp = ralloc(&d, d.ninst);
431*4246b616SDavid du Colombier
432*4246b616SDavid du Colombier /*
433*4246b616SDavid du Colombier * Convert to DFA
434*4246b616SDavid du Colombier */
435*4246b616SDavid du Colombier
436*4246b616SDavid du Colombier /* 4 start states, depending on initial bol, eol */
437*4246b616SDavid du Colombier for(n=0; n<4; n++){
438*4246b616SDavid du Colombier memset(bits, 0, d.ninst);
439*4246b616SDavid du Colombier bits[p->startinst - p->firstinst] = 1;
440*4246b616SDavid du Colombier followempty(&d, bits, n&1, n&2);
441*4246b616SDavid du Colombier start[n] = bits2reiset(&d, bits);
442*4246b616SDavid du Colombier }
443*4246b616SDavid du Colombier
444*4246b616SDavid du Colombier /* explore the reiset space */
445*4246b616SDavid du Colombier for(s=d.alloc; s; s=s->next)
446*4246b616SDavid du Colombier for(n=0; n<2*d.nc; n++)
447*4246b616SDavid du Colombier s->delta[n] = transition(&d, s, d.c[n%d.nc], n/d.nc);
448*4246b616SDavid du Colombier
449*4246b616SDavid du Colombier #ifdef DUMP
450*4246b616SDavid du Colombier nid = 0;
451*4246b616SDavid du Colombier for(s=d.alloc; s; s=s->next)
452*4246b616SDavid du Colombier s->id = nid++;
453*4246b616SDavid du Colombier ddump(&d);
454*4246b616SDavid du Colombier #endif
455*4246b616SDavid du Colombier
456*4246b616SDavid du Colombier /*
457*4246b616SDavid du Colombier * Minimize.
458*4246b616SDavid du Colombier */
459*4246b616SDavid du Colombier
460*4246b616SDavid du Colombier /* first class division is final or not */
461*4246b616SDavid du Colombier for(s=d.alloc; s; s=s->next){
462*4246b616SDavid du Colombier s->isfinal = 0;
463*4246b616SDavid du Colombier for(n=0; n<s->ninst; n++)
464*4246b616SDavid du Colombier if(p->firstinst[s->inst[n]].type == END)
465*4246b616SDavid du Colombier s->isfinal = 1;
466*4246b616SDavid du Colombier s->id = s->isfinal;
467*4246b616SDavid du Colombier }
468*4246b616SDavid du Colombier
469*4246b616SDavid du Colombier /* divide states with different transition tables in id space */
470*4246b616SDavid du Colombier nid = 2;
471*4246b616SDavid du Colombier do{
472*4246b616SDavid du Colombier again = 0;
473*4246b616SDavid du Colombier for(s=d.alloc; s; s=s->next){
474*4246b616SDavid du Colombier id = -1;
475*4246b616SDavid du Colombier for(t=s->next; t; t=t->next){
476*4246b616SDavid du Colombier if(s->id != t->id)
477*4246b616SDavid du Colombier continue;
478*4246b616SDavid du Colombier for(n=0; n<2*d.nc; n++){
479*4246b616SDavid du Colombier /* until we finish the for(t) loop, s->id and id are same */
480*4246b616SDavid du Colombier if((s->delta[n]->id == t->delta[n]->id)
481*4246b616SDavid du Colombier || (s->delta[n]->id == s->id && t->delta[n]->id == id)
482*4246b616SDavid du Colombier || (s->delta[n]->id == id && t->delta[n]->id == s->id))
483*4246b616SDavid du Colombier continue;
484*4246b616SDavid du Colombier break;
485*4246b616SDavid du Colombier }
486*4246b616SDavid du Colombier if(n == 2*d.nc)
487*4246b616SDavid du Colombier continue;
488*4246b616SDavid du Colombier if(id == -1)
489*4246b616SDavid du Colombier id = nid++;
490*4246b616SDavid du Colombier t->id = id;
491*4246b616SDavid du Colombier again = 1;
492*4246b616SDavid du Colombier }
493*4246b616SDavid du Colombier }
494*4246b616SDavid du Colombier }while(again);
495*4246b616SDavid du Colombier
496*4246b616SDavid du Colombier #ifdef DUMP
497*4246b616SDavid du Colombier ddump(&d);
498*4246b616SDavid du Colombier #endif
499*4246b616SDavid du Colombier
500*4246b616SDavid du Colombier /* build dreprog */
501*4246b616SDavid du Colombier id2set = binalloc(&d.bin, nid*sizeof(Reiset*), 1);
502*4246b616SDavid du Colombier if(id2set == nil)
503*4246b616SDavid du Colombier longjmp(d.kaboom, 1);
504*4246b616SDavid du Colombier for(s=d.alloc; s; s=s->next)
505*4246b616SDavid du Colombier id2set[s->id] = s;
506*4246b616SDavid du Colombier
507*4246b616SDavid du Colombier n = buildprog(&d, id2set, nid, nil, nil);
508*4246b616SDavid du Colombier dp = mallocz(sizeof(Dreprog)+nid*sizeof(Dreinst)+n*sizeof(Drecase), 1);
509*4246b616SDavid du Colombier if(dp == nil)
510*4246b616SDavid du Colombier longjmp(d.kaboom, 1);
511*4246b616SDavid du Colombier c = (Drecase*)&dp->inst[nid];
512*4246b616SDavid du Colombier buildprog(&d, id2set, nid, dp, c);
513*4246b616SDavid du Colombier
514*4246b616SDavid du Colombier for(n=0; n<4; n++)
515*4246b616SDavid du Colombier dp->start[n] = &dp->inst[start[n]->id];
516*4246b616SDavid du Colombier dp->ninst = nid;
517*4246b616SDavid du Colombier
518*4246b616SDavid du Colombier binfree(&d.bin);
519*4246b616SDavid du Colombier return dp;
520*4246b616SDavid du Colombier }
521*4246b616SDavid du Colombier
522*4246b616SDavid du Colombier int
dregexec(Dreprog * p,char * s,int bol)523*4246b616SDavid du Colombier dregexec(Dreprog *p, char *s, int bol)
524*4246b616SDavid du Colombier {
525*4246b616SDavid du Colombier Rune r;
526*4246b616SDavid du Colombier ulong rr;
527*4246b616SDavid du Colombier Dreinst *i;
528*4246b616SDavid du Colombier Drecase *c, *ec;
529*4246b616SDavid du Colombier int best, n;
530*4246b616SDavid du Colombier char *os;
531*4246b616SDavid du Colombier
532*4246b616SDavid du Colombier i = p->start[(bol ? 1 : 0) | (s[1]=='\n' ? 2 : 0)];
533*4246b616SDavid du Colombier best = -1;
534*4246b616SDavid du Colombier os = s;
535*4246b616SDavid du Colombier for(; *s; s+=n){
536*4246b616SDavid du Colombier if(i->isfinal)
537*4246b616SDavid du Colombier best = s - os;
538*4246b616SDavid du Colombier if(i->isloop){
539*4246b616SDavid du Colombier if(i->isfinal)
540*4246b616SDavid du Colombier return strlen(os);
541*4246b616SDavid du Colombier else
542*4246b616SDavid du Colombier return best;
543*4246b616SDavid du Colombier }
544*4246b616SDavid du Colombier if((*s&0xFF) < Runeself){
545*4246b616SDavid du Colombier r = *s;
546*4246b616SDavid du Colombier n = 1;
547*4246b616SDavid du Colombier }else
548*4246b616SDavid du Colombier n = chartorune(&r, s);
549*4246b616SDavid du Colombier c = i->c;
550*4246b616SDavid du Colombier ec = c+i->nc;
551*4246b616SDavid du Colombier rr = r;
552*4246b616SDavid du Colombier if(s[n] == '\n' || s[n] == '\0')
553*4246b616SDavid du Colombier rr |= 0x10000;
554*4246b616SDavid du Colombier for(; c<ec; c++){
555*4246b616SDavid du Colombier if(c->start > rr){
556*4246b616SDavid du Colombier i = c[-1].next;
557*4246b616SDavid du Colombier goto Out;
558*4246b616SDavid du Colombier }
559*4246b616SDavid du Colombier }
560*4246b616SDavid du Colombier i = ec[-1].next;
561*4246b616SDavid du Colombier Out:;
562*4246b616SDavid du Colombier }
563*4246b616SDavid du Colombier if(i->isfinal)
564*4246b616SDavid du Colombier best = s - os;
565*4246b616SDavid du Colombier return best;
566*4246b616SDavid du Colombier }
567*4246b616SDavid du Colombier
568*4246b616SDavid du Colombier
569*4246b616SDavid du Colombier #ifdef DUMP
570*4246b616SDavid du Colombier void
ddump(Deter * d)571*4246b616SDavid du Colombier ddump(Deter *d)
572*4246b616SDavid du Colombier {
573*4246b616SDavid du Colombier int i, id;
574*4246b616SDavid du Colombier Reiset *s;
575*4246b616SDavid du Colombier
576*4246b616SDavid du Colombier for(s=d->alloc; s; s=s->next){
577*4246b616SDavid du Colombier print("%d ", s->id);
578*4246b616SDavid du Colombier id = -1;
579*4246b616SDavid du Colombier for(i=0; i<2*d->nc; i++){
580*4246b616SDavid du Colombier if(id != s->delta[i]->id){
581*4246b616SDavid du Colombier if(i==0)
582*4246b616SDavid du Colombier print(" [");
583*4246b616SDavid du Colombier else if(i/d->nc)
584*4246b616SDavid du Colombier print(" [%C$", d->c[i%d->nc]);
585*4246b616SDavid du Colombier else
586*4246b616SDavid du Colombier print(" [%C", d->c[i%d->nc]);
587*4246b616SDavid du Colombier print(" %d]", s->delta[i]->id);
588*4246b616SDavid du Colombier id = s->delta[i]->id;
589*4246b616SDavid du Colombier }
590*4246b616SDavid du Colombier }
591*4246b616SDavid du Colombier print("\n");
592*4246b616SDavid du Colombier }
593*4246b616SDavid du Colombier }
594*4246b616SDavid du Colombier
595*4246b616SDavid du Colombier void
rdump(Reprog * pp)596*4246b616SDavid du Colombier rdump(Reprog *pp)
597*4246b616SDavid du Colombier {
598*4246b616SDavid du Colombier Reinst *l;
599*4246b616SDavid du Colombier Rune *p;
600*4246b616SDavid du Colombier
601*4246b616SDavid du Colombier l = pp->firstinst;
602*4246b616SDavid du Colombier do{
603*4246b616SDavid du Colombier print("%ld:\t0%o\t%ld\t%ld", l-pp->firstinst, l->type,
604*4246b616SDavid du Colombier l->left-pp->firstinst, l->right-pp->firstinst);
605*4246b616SDavid du Colombier if(l->type == RUNE)
606*4246b616SDavid du Colombier print("\t%C\n", l->r);
607*4246b616SDavid du Colombier else if(l->type == CCLASS || l->type == NCCLASS){
608*4246b616SDavid du Colombier print("\t[");
609*4246b616SDavid du Colombier if(l->type == NCCLASS)
610*4246b616SDavid du Colombier print("^");
611*4246b616SDavid du Colombier for(p = l->cp->spans; p < l->cp->end; p += 2)
612*4246b616SDavid du Colombier if(p[0] == p[1])
613*4246b616SDavid du Colombier print("%C", p[0]);
614*4246b616SDavid du Colombier else
615*4246b616SDavid du Colombier print("%C-%C", p[0], p[1]);
616*4246b616SDavid du Colombier print("]\n");
617*4246b616SDavid du Colombier } else
618*4246b616SDavid du Colombier print("\n");
619*4246b616SDavid du Colombier }while(l++->type);
620*4246b616SDavid du Colombier }
621*4246b616SDavid du Colombier
622*4246b616SDavid du Colombier void
dump(Dreprog * pp)623*4246b616SDavid du Colombier dump(Dreprog *pp)
624*4246b616SDavid du Colombier {
625*4246b616SDavid du Colombier int i, j;
626*4246b616SDavid du Colombier Dreinst *l;
627*4246b616SDavid du Colombier
628*4246b616SDavid du Colombier print("start %ld %ld %ld %ld\n",
629*4246b616SDavid du Colombier pp->start[0]-pp->inst,
630*4246b616SDavid du Colombier pp->start[1]-pp->inst,
631*4246b616SDavid du Colombier pp->start[2]-pp->inst,
632*4246b616SDavid du Colombier pp->start[3]-pp->inst);
633*4246b616SDavid du Colombier
634*4246b616SDavid du Colombier for(i=0; i<pp->ninst; i++){
635*4246b616SDavid du Colombier l = &pp->inst[i];
636*4246b616SDavid du Colombier print("%d:", i);
637*4246b616SDavid du Colombier for(j=0; j<l->nc; j++){
638*4246b616SDavid du Colombier print(" [");
639*4246b616SDavid du Colombier if(j == 0)
640*4246b616SDavid du Colombier if(l->c[j].start != 1)
641*4246b616SDavid du Colombier abort();
642*4246b616SDavid du Colombier if(j != 0)
643*4246b616SDavid du Colombier print("%C%s", l->c[j].start&0xFFFF, (l->c[j].start&0x10000) ? "$" : "");
644*4246b616SDavid du Colombier print("-");
645*4246b616SDavid du Colombier if(j != l->nc-1)
646*4246b616SDavid du Colombier print("%C%s", (l->c[j+1].start&0xFFFF)-1, (l->c[j+1].start&0x10000) ? "$" : "");
647*4246b616SDavid du Colombier print("] %ld", l->c[j].next - pp->inst);
648*4246b616SDavid du Colombier }
649*4246b616SDavid du Colombier if(l->isfinal)
650*4246b616SDavid du Colombier print(" final");
651*4246b616SDavid du Colombier if(l->isloop)
652*4246b616SDavid du Colombier print(" loop");
653*4246b616SDavid du Colombier print("\n");
654*4246b616SDavid du Colombier }
655*4246b616SDavid du Colombier }
656*4246b616SDavid du Colombier
657*4246b616SDavid du Colombier
658*4246b616SDavid du Colombier void
main(int argc,char ** argv)659*4246b616SDavid du Colombier main(int argc, char **argv)
660*4246b616SDavid du Colombier {
661*4246b616SDavid du Colombier int i;
662*4246b616SDavid du Colombier Reprog *p;
663*4246b616SDavid du Colombier Dreprog *dp;
664*4246b616SDavid du Colombier
665*4246b616SDavid du Colombier i = 1;
666*4246b616SDavid du Colombier p = regcomp(argv[i]);
667*4246b616SDavid du Colombier if(p == 0){
668*4246b616SDavid du Colombier print("=== %s: bad regexp\n", argv[i]);
669*4246b616SDavid du Colombier }
670*4246b616SDavid du Colombier // print("=== %s\n", argv[i]);
671*4246b616SDavid du Colombier // rdump(p);
672*4246b616SDavid du Colombier dp = dregcvt(p);
673*4246b616SDavid du Colombier print("=== dfa\n");
674*4246b616SDavid du Colombier dump(dp);
675*4246b616SDavid du Colombier
676*4246b616SDavid du Colombier for(i=2; i<argc; i++)
677*4246b616SDavid du Colombier print("match %d\n", dregexec(dp, argv[i], 0));
678*4246b616SDavid du Colombier exits(0);
679*4246b616SDavid du Colombier }
680*4246b616SDavid du Colombier #endif
681*4246b616SDavid du Colombier
682*4246b616SDavid du Colombier void
Bprintdfa(Biobuf * b,Dreprog * p)683*4246b616SDavid du Colombier Bprintdfa(Biobuf *b, Dreprog *p)
684*4246b616SDavid du Colombier {
685*4246b616SDavid du Colombier int i, j, nc;
686*4246b616SDavid du Colombier
687*4246b616SDavid du Colombier Bprint(b, "# dreprog\n");
688*4246b616SDavid du Colombier nc = 0;
689*4246b616SDavid du Colombier for(i=0; i<p->ninst; i++)
690*4246b616SDavid du Colombier nc += p->inst[i].nc;
691*4246b616SDavid du Colombier Bprint(b, "%d %d %ld %ld %ld %ld\n", p->ninst, nc,
692*4246b616SDavid du Colombier p->start[0]-p->inst, p->start[1]-p->inst,
693*4246b616SDavid du Colombier p->start[2]-p->inst, p->start[3]-p->inst);
694*4246b616SDavid du Colombier for(i=0; i<p->ninst; i++){
695*4246b616SDavid du Colombier Bprint(b, "%d %d %d", p->inst[i].isfinal, p->inst[i].isloop, p->inst[i].nc);
696*4246b616SDavid du Colombier for(j=0; j<p->inst[i].nc; j++)
697*4246b616SDavid du Colombier Bprint(b, " %d %ld", p->inst[i].c[j].start, p->inst[i].c[j].next-p->inst);
698*4246b616SDavid du Colombier Bprint(b, "\n");
699*4246b616SDavid du Colombier }
700*4246b616SDavid du Colombier }
701*4246b616SDavid du Colombier
702*4246b616SDavid du Colombier static char*
egetline(Biobuf * b,int c,jmp_buf jb)703*4246b616SDavid du Colombier egetline(Biobuf *b, int c, jmp_buf jb)
704*4246b616SDavid du Colombier {
705*4246b616SDavid du Colombier char *p;
706*4246b616SDavid du Colombier
707*4246b616SDavid du Colombier p = Brdline(b, c);
708*4246b616SDavid du Colombier if(p == nil)
709*4246b616SDavid du Colombier longjmp(jb, 1);
710*4246b616SDavid du Colombier p[Blinelen(b)-1] = '\0';
711*4246b616SDavid du Colombier return p;
712*4246b616SDavid du Colombier }
713*4246b616SDavid du Colombier
714*4246b616SDavid du Colombier static void
egetc(Biobuf * b,int c,jmp_buf jb)715*4246b616SDavid du Colombier egetc(Biobuf *b, int c, jmp_buf jb)
716*4246b616SDavid du Colombier {
717*4246b616SDavid du Colombier if(Bgetc(b) != c)
718*4246b616SDavid du Colombier longjmp(jb, 1);
719*4246b616SDavid du Colombier }
720*4246b616SDavid du Colombier
721*4246b616SDavid du Colombier static int
egetnum(Biobuf * b,int want,jmp_buf jb)722*4246b616SDavid du Colombier egetnum(Biobuf *b, int want, jmp_buf jb)
723*4246b616SDavid du Colombier {
724*4246b616SDavid du Colombier int c;
725*4246b616SDavid du Colombier int n, first;
726*4246b616SDavid du Colombier
727*4246b616SDavid du Colombier n = 0;
728*4246b616SDavid du Colombier first = 1;
729*4246b616SDavid du Colombier while((c = Bgetc(b)) != Beof){
730*4246b616SDavid du Colombier if(c < '0' || c > '9'){
731*4246b616SDavid du Colombier if(want == 0){
732*4246b616SDavid du Colombier Bungetc(b);
733*4246b616SDavid du Colombier c = 0;
734*4246b616SDavid du Colombier }
735*4246b616SDavid du Colombier if(first || c != want){
736*4246b616SDavid du Colombier werrstr("format error");
737*4246b616SDavid du Colombier longjmp(jb, 1);
738*4246b616SDavid du Colombier }
739*4246b616SDavid du Colombier return n;
740*4246b616SDavid du Colombier }
741*4246b616SDavid du Colombier n = n*10 + c - '0';
742*4246b616SDavid du Colombier first = 0;
743*4246b616SDavid du Colombier }
744*4246b616SDavid du Colombier werrstr("unexpected eof");
745*4246b616SDavid du Colombier longjmp(jb, 1);
746*4246b616SDavid du Colombier return -1;
747*4246b616SDavid du Colombier }
748*4246b616SDavid du Colombier
749*4246b616SDavid du Colombier Dreprog*
Breaddfa(Biobuf * b)750*4246b616SDavid du Colombier Breaddfa(Biobuf *b)
751*4246b616SDavid du Colombier {
752*4246b616SDavid du Colombier char *s;
753*4246b616SDavid du Colombier int ninst, nc;
754*4246b616SDavid du Colombier jmp_buf jb;
755*4246b616SDavid du Colombier Dreprog *p;
756*4246b616SDavid du Colombier Drecase *c;
757*4246b616SDavid du Colombier Dreinst *l;
758*4246b616SDavid du Colombier int j, k;
759*4246b616SDavid du Colombier
760*4246b616SDavid du Colombier p = nil;
761*4246b616SDavid du Colombier if(setjmp(jb)){
762*4246b616SDavid du Colombier free(p);
763*4246b616SDavid du Colombier return nil;
764*4246b616SDavid du Colombier }
765*4246b616SDavid du Colombier
766*4246b616SDavid du Colombier s = egetline(b, '\n', jb);
767*4246b616SDavid du Colombier if(strcmp(s, "# dreprog") != 0){
768*4246b616SDavid du Colombier werrstr("format error");
769*4246b616SDavid du Colombier longjmp(jb, 1);
770*4246b616SDavid du Colombier }
771*4246b616SDavid du Colombier
772*4246b616SDavid du Colombier ninst = egetnum(b, ' ', jb);
773*4246b616SDavid du Colombier nc = egetnum(b, ' ', jb);
774*4246b616SDavid du Colombier
775*4246b616SDavid du Colombier p = mallocz(sizeof(Dreprog)+ninst*sizeof(Dreinst)+nc*sizeof(Drecase), 1);
776*4246b616SDavid du Colombier if(p == nil)
777*4246b616SDavid du Colombier longjmp(jb, 1);
778*4246b616SDavid du Colombier c = (Drecase*)&p->inst[ninst];
779*4246b616SDavid du Colombier
780*4246b616SDavid du Colombier p->start[0] = &p->inst[egetnum(b, ' ', jb)];
781*4246b616SDavid du Colombier p->start[1] = &p->inst[egetnum(b, ' ', jb)];
782*4246b616SDavid du Colombier p->start[2] = &p->inst[egetnum(b, ' ', jb)];
783*4246b616SDavid du Colombier p->start[3] = &p->inst[egetnum(b, '\n', jb)];
784*4246b616SDavid du Colombier
785*4246b616SDavid du Colombier for(j=0; j<ninst; j++){
786*4246b616SDavid du Colombier l = &p->inst[j];
787*4246b616SDavid du Colombier l->isfinal = egetnum(b, ' ', jb);
788*4246b616SDavid du Colombier l->isloop = egetnum(b, ' ', jb);
789*4246b616SDavid du Colombier l->nc = egetnum(b, 0, jb);
790*4246b616SDavid du Colombier l->c = c;
791*4246b616SDavid du Colombier for(k=0; k<l->nc; k++){
792*4246b616SDavid du Colombier egetc(b, ' ', jb);
793*4246b616SDavid du Colombier c->start = egetnum(b, ' ', jb);
794*4246b616SDavid du Colombier c->next = &p->inst[egetnum(b, 0, jb)];
795*4246b616SDavid du Colombier c++;
796*4246b616SDavid du Colombier }
797*4246b616SDavid du Colombier egetc(b, '\n', jb);
798*4246b616SDavid du Colombier }
799*4246b616SDavid du Colombier return p;
800*4246b616SDavid du Colombier }
801