xref: /netbsd-src/external/gpl2/lvm2/dist/libdm/regex/matcher.c (revision 56a34939419542e88b386b2229be7565f4f45461)
1*56a34939Shaad /*	$NetBSD: matcher.c,v 1.1.1.1 2008/12/22 00:18:36 haad Exp $	*/
2*56a34939Shaad 
3*56a34939Shaad /*
4*56a34939Shaad  * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
5*56a34939Shaad  * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6*56a34939Shaad  *
7*56a34939Shaad  * This file is part of the device-mapper userspace tools.
8*56a34939Shaad  *
9*56a34939Shaad  * This copyrighted material is made available to anyone wishing to use,
10*56a34939Shaad  * modify, copy, or redistribute it subject to the terms and conditions
11*56a34939Shaad  * of the GNU Lesser General Public License v.2.1.
12*56a34939Shaad  *
13*56a34939Shaad  * You should have received a copy of the GNU Lesser General Public License
14*56a34939Shaad  * along with this program; if not, write to the Free Software Foundation,
15*56a34939Shaad  * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16*56a34939Shaad  */
17*56a34939Shaad 
18*56a34939Shaad #include "dmlib.h"
19*56a34939Shaad #include "parse_rx.h"
20*56a34939Shaad #include "ttree.h"
21*56a34939Shaad #include "assert.h"
22*56a34939Shaad 
23*56a34939Shaad struct dfa_state {
24*56a34939Shaad 	int final;
25*56a34939Shaad 	struct dfa_state *lookup[256];
26*56a34939Shaad };
27*56a34939Shaad 
28*56a34939Shaad struct state_queue {
29*56a34939Shaad 	struct dfa_state *s;
30*56a34939Shaad 	dm_bitset_t bits;
31*56a34939Shaad 	struct state_queue *next;
32*56a34939Shaad };
33*56a34939Shaad 
34*56a34939Shaad struct dm_regex {		/* Instance variables for the lexer */
35*56a34939Shaad 	struct dfa_state *start;
36*56a34939Shaad 	unsigned num_nodes;
37*56a34939Shaad 	int nodes_entered;
38*56a34939Shaad 	struct rx_node **nodes;
39*56a34939Shaad 	struct dm_pool *scratch, *mem;
40*56a34939Shaad };
41*56a34939Shaad 
42*56a34939Shaad #define TARGET_TRANS '\0'
43*56a34939Shaad 
_count_nodes(struct rx_node * rx)44*56a34939Shaad static int _count_nodes(struct rx_node *rx)
45*56a34939Shaad {
46*56a34939Shaad 	int r = 1;
47*56a34939Shaad 
48*56a34939Shaad 	if (rx->left)
49*56a34939Shaad 		r += _count_nodes(rx->left);
50*56a34939Shaad 
51*56a34939Shaad 	if (rx->right)
52*56a34939Shaad 		r += _count_nodes(rx->right);
53*56a34939Shaad 
54*56a34939Shaad 	return r;
55*56a34939Shaad }
56*56a34939Shaad 
_fill_table(struct dm_regex * m,struct rx_node * rx)57*56a34939Shaad static void _fill_table(struct dm_regex *m, struct rx_node *rx)
58*56a34939Shaad {
59*56a34939Shaad 	assert((rx->type != OR) || (rx->left && rx->right));
60*56a34939Shaad 
61*56a34939Shaad 	if (rx->left)
62*56a34939Shaad 		_fill_table(m, rx->left);
63*56a34939Shaad 
64*56a34939Shaad 	if (rx->right)
65*56a34939Shaad 		_fill_table(m, rx->right);
66*56a34939Shaad 
67*56a34939Shaad 	m->nodes[m->nodes_entered++] = rx;
68*56a34939Shaad }
69*56a34939Shaad 
_create_bitsets(struct dm_regex * m)70*56a34939Shaad static void _create_bitsets(struct dm_regex *m)
71*56a34939Shaad {
72*56a34939Shaad 	int i;
73*56a34939Shaad 
74*56a34939Shaad 	for (i = 0; i < m->num_nodes; i++) {
75*56a34939Shaad 		struct rx_node *n = m->nodes[i];
76*56a34939Shaad 		n->firstpos = dm_bitset_create(m->scratch, m->num_nodes);
77*56a34939Shaad 		n->lastpos = dm_bitset_create(m->scratch, m->num_nodes);
78*56a34939Shaad 		n->followpos = dm_bitset_create(m->scratch, m->num_nodes);
79*56a34939Shaad 	}
80*56a34939Shaad }
81*56a34939Shaad 
_calc_functions(struct dm_regex * m)82*56a34939Shaad static void _calc_functions(struct dm_regex *m)
83*56a34939Shaad {
84*56a34939Shaad 	int i, j, final = 1;
85*56a34939Shaad 	struct rx_node *rx, *c1, *c2;
86*56a34939Shaad 
87*56a34939Shaad 	for (i = 0; i < m->num_nodes; i++) {
88*56a34939Shaad 		rx = m->nodes[i];
89*56a34939Shaad 		c1 = rx->left;
90*56a34939Shaad 		c2 = rx->right;
91*56a34939Shaad 
92*56a34939Shaad 		if (dm_bit(rx->charset, TARGET_TRANS))
93*56a34939Shaad 			rx->final = final++;
94*56a34939Shaad 
95*56a34939Shaad 		switch (rx->type) {
96*56a34939Shaad 		case CAT:
97*56a34939Shaad 			if (c1->nullable)
98*56a34939Shaad 				dm_bit_union(rx->firstpos,
99*56a34939Shaad 					  c1->firstpos, c2->firstpos);
100*56a34939Shaad 			else
101*56a34939Shaad 				dm_bit_copy(rx->firstpos, c1->firstpos);
102*56a34939Shaad 
103*56a34939Shaad 			if (c2->nullable)
104*56a34939Shaad 				dm_bit_union(rx->lastpos,
105*56a34939Shaad 					  c1->lastpos, c2->lastpos);
106*56a34939Shaad 			else
107*56a34939Shaad 				dm_bit_copy(rx->lastpos, c2->lastpos);
108*56a34939Shaad 
109*56a34939Shaad 			rx->nullable = c1->nullable && c2->nullable;
110*56a34939Shaad 			break;
111*56a34939Shaad 
112*56a34939Shaad 		case PLUS:
113*56a34939Shaad 			dm_bit_copy(rx->firstpos, c1->firstpos);
114*56a34939Shaad 			dm_bit_copy(rx->lastpos, c1->lastpos);
115*56a34939Shaad 			rx->nullable = c1->nullable;
116*56a34939Shaad 			break;
117*56a34939Shaad 
118*56a34939Shaad 		case OR:
119*56a34939Shaad 			dm_bit_union(rx->firstpos, c1->firstpos, c2->firstpos);
120*56a34939Shaad 			dm_bit_union(rx->lastpos, c1->lastpos, c2->lastpos);
121*56a34939Shaad 			rx->nullable = c1->nullable || c2->nullable;
122*56a34939Shaad 			break;
123*56a34939Shaad 
124*56a34939Shaad 		case QUEST:
125*56a34939Shaad 		case STAR:
126*56a34939Shaad 			dm_bit_copy(rx->firstpos, c1->firstpos);
127*56a34939Shaad 			dm_bit_copy(rx->lastpos, c1->lastpos);
128*56a34939Shaad 			rx->nullable = 1;
129*56a34939Shaad 			break;
130*56a34939Shaad 
131*56a34939Shaad 		case CHARSET:
132*56a34939Shaad 			dm_bit_set(rx->firstpos, i);
133*56a34939Shaad 			dm_bit_set(rx->lastpos, i);
134*56a34939Shaad 			rx->nullable = 0;
135*56a34939Shaad 			break;
136*56a34939Shaad 
137*56a34939Shaad 		default:
138*56a34939Shaad 			log_error("Internal error: Unknown calc node type");
139*56a34939Shaad 		}
140*56a34939Shaad 
141*56a34939Shaad 		/*
142*56a34939Shaad 		 * followpos has it's own switch
143*56a34939Shaad 		 * because PLUS and STAR do the
144*56a34939Shaad 		 * same thing.
145*56a34939Shaad 		 */
146*56a34939Shaad 		switch (rx->type) {
147*56a34939Shaad 		case CAT:
148*56a34939Shaad 			for (j = 0; j < m->num_nodes; j++) {
149*56a34939Shaad 				if (dm_bit(c1->lastpos, j)) {
150*56a34939Shaad 					struct rx_node *n = m->nodes[j];
151*56a34939Shaad 					dm_bit_union(n->followpos,
152*56a34939Shaad 						  n->followpos, c2->firstpos);
153*56a34939Shaad 				}
154*56a34939Shaad 			}
155*56a34939Shaad 			break;
156*56a34939Shaad 
157*56a34939Shaad 		case PLUS:
158*56a34939Shaad 		case STAR:
159*56a34939Shaad 			for (j = 0; j < m->num_nodes; j++) {
160*56a34939Shaad 				if (dm_bit(rx->lastpos, j)) {
161*56a34939Shaad 					struct rx_node *n = m->nodes[j];
162*56a34939Shaad 					dm_bit_union(n->followpos,
163*56a34939Shaad 						  n->followpos, rx->firstpos);
164*56a34939Shaad 				}
165*56a34939Shaad 			}
166*56a34939Shaad 			break;
167*56a34939Shaad 		}
168*56a34939Shaad 	}
169*56a34939Shaad }
170*56a34939Shaad 
_create_dfa_state(struct dm_pool * mem)171*56a34939Shaad static struct dfa_state *_create_dfa_state(struct dm_pool *mem)
172*56a34939Shaad {
173*56a34939Shaad 	return dm_pool_zalloc(mem, sizeof(struct dfa_state));
174*56a34939Shaad }
175*56a34939Shaad 
_create_state_queue(struct dm_pool * mem,struct dfa_state * dfa,dm_bitset_t bits)176*56a34939Shaad static struct state_queue *_create_state_queue(struct dm_pool *mem,
177*56a34939Shaad 					       struct dfa_state *dfa,
178*56a34939Shaad 					       dm_bitset_t bits)
179*56a34939Shaad {
180*56a34939Shaad 	struct state_queue *r = dm_pool_alloc(mem, sizeof(*r));
181*56a34939Shaad 
182*56a34939Shaad 	if (!r) {
183*56a34939Shaad 		stack;
184*56a34939Shaad 		return NULL;
185*56a34939Shaad 	}
186*56a34939Shaad 
187*56a34939Shaad 	r->s = dfa;
188*56a34939Shaad 	r->bits = dm_bitset_create(mem, bits[0]);	/* first element is the size */
189*56a34939Shaad 	dm_bit_copy(r->bits, bits);
190*56a34939Shaad 	r->next = 0;
191*56a34939Shaad 	return r;
192*56a34939Shaad }
193*56a34939Shaad 
_calc_states(struct dm_regex * m,struct rx_node * rx)194*56a34939Shaad static int _calc_states(struct dm_regex *m, struct rx_node *rx)
195*56a34939Shaad {
196*56a34939Shaad 	unsigned iwidth = (m->num_nodes / DM_BITS_PER_INT) + 1;
197*56a34939Shaad 	struct ttree *tt = ttree_create(m->scratch, iwidth);
198*56a34939Shaad 	struct state_queue *h, *t, *tmp;
199*56a34939Shaad 	struct dfa_state *dfa, *ldfa;
200*56a34939Shaad 	int i, a, set_bits = 0, count = 0;
201*56a34939Shaad 	dm_bitset_t bs, dfa_bits;
202*56a34939Shaad 
203*56a34939Shaad 	if (!tt)
204*56a34939Shaad 		return_0;
205*56a34939Shaad 
206*56a34939Shaad 	if (!(bs = dm_bitset_create(m->scratch, m->num_nodes)))
207*56a34939Shaad 		return_0;
208*56a34939Shaad 
209*56a34939Shaad 	/* create first state */
210*56a34939Shaad 	dfa = _create_dfa_state(m->mem);
211*56a34939Shaad 	m->start = dfa;
212*56a34939Shaad 	ttree_insert(tt, rx->firstpos + 1, dfa);
213*56a34939Shaad 
214*56a34939Shaad 	/* prime the queue */
215*56a34939Shaad 	h = t = _create_state_queue(m->scratch, dfa, rx->firstpos);
216*56a34939Shaad 	while (h) {
217*56a34939Shaad 		/* pop state off front of the queue */
218*56a34939Shaad 		dfa = h->s;
219*56a34939Shaad 		dfa_bits = h->bits;
220*56a34939Shaad 		h = h->next;
221*56a34939Shaad 
222*56a34939Shaad 		/* iterate through all the inputs for this state */
223*56a34939Shaad 		dm_bit_clear_all(bs);
224*56a34939Shaad 		for (a = 0; a < 256; a++) {
225*56a34939Shaad 			/* iterate through all the states in firstpos */
226*56a34939Shaad 			for (i = dm_bit_get_first(dfa_bits);
227*56a34939Shaad 			     i >= 0; i = dm_bit_get_next(dfa_bits, i)) {
228*56a34939Shaad 				if (dm_bit(m->nodes[i]->charset, a)) {
229*56a34939Shaad 					if (a == TARGET_TRANS)
230*56a34939Shaad 						dfa->final = m->nodes[i]->final;
231*56a34939Shaad 
232*56a34939Shaad 					dm_bit_union(bs, bs,
233*56a34939Shaad 						  m->nodes[i]->followpos);
234*56a34939Shaad 					set_bits = 1;
235*56a34939Shaad 				}
236*56a34939Shaad 			}
237*56a34939Shaad 
238*56a34939Shaad 			if (set_bits) {
239*56a34939Shaad 				ldfa = ttree_lookup(tt, bs + 1);
240*56a34939Shaad 				if (!ldfa) {
241*56a34939Shaad 					/* push */
242*56a34939Shaad 					ldfa = _create_dfa_state(m->mem);
243*56a34939Shaad 					ttree_insert(tt, bs + 1, ldfa);
244*56a34939Shaad 					tmp =
245*56a34939Shaad 					    _create_state_queue(m->scratch,
246*56a34939Shaad 								ldfa, bs);
247*56a34939Shaad 					if (!h)
248*56a34939Shaad 						h = t = tmp;
249*56a34939Shaad 					else {
250*56a34939Shaad 						t->next = tmp;
251*56a34939Shaad 						t = tmp;
252*56a34939Shaad 					}
253*56a34939Shaad 
254*56a34939Shaad 					count++;
255*56a34939Shaad 				}
256*56a34939Shaad 
257*56a34939Shaad 				dfa->lookup[a] = ldfa;
258*56a34939Shaad 				set_bits = 0;
259*56a34939Shaad 				dm_bit_clear_all(bs);
260*56a34939Shaad 			}
261*56a34939Shaad 		}
262*56a34939Shaad 	}
263*56a34939Shaad 
264*56a34939Shaad 	log_debug("Matcher built with %d dfa states", count);
265*56a34939Shaad 	return 1;
266*56a34939Shaad }
267*56a34939Shaad 
dm_regex_create(struct dm_pool * mem,const char ** patterns,unsigned num_patterns)268*56a34939Shaad struct dm_regex *dm_regex_create(struct dm_pool *mem, const char **patterns,
269*56a34939Shaad 				 unsigned num_patterns)
270*56a34939Shaad {
271*56a34939Shaad 	char *all, *ptr;
272*56a34939Shaad 	int i;
273*56a34939Shaad 	size_t len = 0;
274*56a34939Shaad 	struct rx_node *rx;
275*56a34939Shaad 	struct dm_pool *scratch = dm_pool_create("regex matcher", 10 * 1024);
276*56a34939Shaad 	struct dm_regex *m;
277*56a34939Shaad 
278*56a34939Shaad 	if (!scratch)
279*56a34939Shaad 		return_NULL;
280*56a34939Shaad 
281*56a34939Shaad 	if (!(m = dm_pool_alloc(mem, sizeof(*m)))) {
282*56a34939Shaad 		dm_pool_destroy(scratch);
283*56a34939Shaad 		return_NULL;
284*56a34939Shaad 	}
285*56a34939Shaad 
286*56a34939Shaad 	memset(m, 0, sizeof(*m));
287*56a34939Shaad 
288*56a34939Shaad 	/* join the regexps together, delimiting with zero */
289*56a34939Shaad 	for (i = 0; i < num_patterns; i++)
290*56a34939Shaad 		len += strlen(patterns[i]) + 8;
291*56a34939Shaad 
292*56a34939Shaad 	ptr = all = dm_pool_alloc(scratch, len + 1);
293*56a34939Shaad 
294*56a34939Shaad 	if (!all)
295*56a34939Shaad 		goto_bad;
296*56a34939Shaad 
297*56a34939Shaad 	for (i = 0; i < num_patterns; i++) {
298*56a34939Shaad 		ptr += sprintf(ptr, "(.*(%s)%c)", patterns[i], TARGET_TRANS);
299*56a34939Shaad 		if (i < (num_patterns - 1))
300*56a34939Shaad 			*ptr++ = '|';
301*56a34939Shaad 	}
302*56a34939Shaad 
303*56a34939Shaad 	/* parse this expression */
304*56a34939Shaad 	if (!(rx = rx_parse_tok(scratch, all, ptr))) {
305*56a34939Shaad 		log_error("Couldn't parse regex");
306*56a34939Shaad 		goto bad;
307*56a34939Shaad 	}
308*56a34939Shaad 
309*56a34939Shaad 	m->mem = mem;
310*56a34939Shaad 	m->scratch = scratch;
311*56a34939Shaad 	m->num_nodes = _count_nodes(rx);
312*56a34939Shaad 	m->nodes = dm_pool_alloc(scratch, sizeof(*m->nodes) * m->num_nodes);
313*56a34939Shaad 
314*56a34939Shaad 	if (!m->nodes)
315*56a34939Shaad 		goto_bad;
316*56a34939Shaad 
317*56a34939Shaad 	_fill_table(m, rx);
318*56a34939Shaad 	_create_bitsets(m);
319*56a34939Shaad 	_calc_functions(m);
320*56a34939Shaad 	_calc_states(m, rx);
321*56a34939Shaad 	dm_pool_destroy(scratch);
322*56a34939Shaad 	m->scratch = NULL;
323*56a34939Shaad 
324*56a34939Shaad 	return m;
325*56a34939Shaad 
326*56a34939Shaad       bad:
327*56a34939Shaad 	dm_pool_destroy(scratch);
328*56a34939Shaad 	dm_pool_free(mem, m);
329*56a34939Shaad 	return NULL;
330*56a34939Shaad }
331*56a34939Shaad 
_step_matcher(int c,struct dfa_state * cs,int * r)332*56a34939Shaad static struct dfa_state *_step_matcher(int c, struct dfa_state *cs, int *r)
333*56a34939Shaad {
334*56a34939Shaad 	if (!(cs = cs->lookup[(unsigned char) c]))
335*56a34939Shaad 		return NULL;
336*56a34939Shaad 
337*56a34939Shaad 	if (cs->final && (cs->final > *r))
338*56a34939Shaad 		*r = cs->final;
339*56a34939Shaad 
340*56a34939Shaad 	return cs;
341*56a34939Shaad }
342*56a34939Shaad 
dm_regex_match(struct dm_regex * regex,const char * s)343*56a34939Shaad int dm_regex_match(struct dm_regex *regex, const char *s)
344*56a34939Shaad {
345*56a34939Shaad 	struct dfa_state *cs = regex->start;
346*56a34939Shaad 	int r = 0;
347*56a34939Shaad 
348*56a34939Shaad 	if (!(cs = _step_matcher(HAT_CHAR, cs, &r)))
349*56a34939Shaad 		goto out;
350*56a34939Shaad 
351*56a34939Shaad 	for (; *s; s++)
352*56a34939Shaad 		if (!(cs = _step_matcher(*s, cs, &r)))
353*56a34939Shaad 			goto out;
354*56a34939Shaad 
355*56a34939Shaad 	_step_matcher(DOLLAR_CHAR, cs, &r);
356*56a34939Shaad 
357*56a34939Shaad       out:
358*56a34939Shaad 	/* subtract 1 to get back to zero index */
359*56a34939Shaad 	return r - 1;
360*56a34939Shaad }
361