xref: /openbsd-src/usr.bin/lex/ccl.c (revision be691f3bb6417f04a68938fadbcaee2d5795e764)
1 /*	$OpenBSD: ccl.c,v 1.8 2015/11/19 22:55:13 tedu Exp $	*/
2 
3 /* ccl - routines for character classes */
4 
5 /*  Copyright (c) 1990 The Regents of the University of California. */
6 /*  All rights reserved. */
7 
8 /*  This code is derived from software contributed to Berkeley by */
9 /*  Vern Paxson. */
10 
11 /*  The United States Government has rights in this work pursuant */
12 /*  to contract no. DE-AC03-76SF00098 between the United States */
13  /* Department of Energy and the University of California. */
14 
15 /*  This file is part of flex. */
16 
17 /*  Redistribution and use in source and binary forms, with or without */
18 /*  modification, are permitted provided that the following conditions */
19 /*  are met: */
20 
21 /*  1. Redistributions of source code must retain the above copyright */
22 /*     notice, this list of conditions and the following disclaimer. */
23 /*  2. Redistributions in binary form must reproduce the above copyright */
24 /*     notice, this list of conditions and the following disclaimer in the */
25 /*     documentation and/or other materials provided with the distribution. */
26 
27 /*  Neither the name of the University nor the names of its contributors */
28 /*  may be used to endorse or promote products derived from this software */
29 /*  without specific prior written permission. */
30 
31 /*  THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */
32 /*  IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */
33 /*  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */
34 /*  PURPOSE. */
35 
36 #include "flexdef.h"
37 
38 /* return true if the chr is in the ccl. Takes negation into account. */
39 static bool
40 ccl_contains(const int cclp, const int ch)
41 {
42 	int ind, len, i;
43 
44 	len = ccllen[cclp];
45 	ind = cclmap[cclp];
46 
47 	for (i = 0; i < len; ++i)
48 		if (ccltbl[ind + i] == ch)
49 			return !cclng[cclp];
50 
51 	return cclng[cclp];
52 }
53 
54 
55 /* ccladd - add a single character to a ccl */
56 
57 void
58 ccladd(cclp, ch)
59 	int cclp;
60 	int ch;
61 {
62 	int ind, len, newpos, i;
63 
64 	check_char(ch);
65 
66 	len = ccllen[cclp];
67 	ind = cclmap[cclp];
68 
69 	/* check to see if the character is already in the ccl */
70 
71 	for (i = 0; i < len; ++i)
72 		if (ccltbl[ind + i] == ch)
73 			return;
74 
75 	/* mark newlines */
76 	if (ch == nlch)
77 		ccl_has_nl[cclp] = true;
78 
79 	newpos = ind + len;
80 
81 	if (newpos >= current_max_ccl_tbl_size) {
82 		current_max_ccl_tbl_size += MAX_CCL_TBL_SIZE_INCREMENT;
83 
84 		++num_reallocs;
85 
86 		ccltbl = reallocate_Character_array(ccltbl,
87 		    current_max_ccl_tbl_size);
88 	}
89 	ccllen[cclp] = len + 1;
90 	ccltbl[newpos] = ch;
91 }
92 
93 /* dump_cclp - same thing as list_character_set, but for cclps.  */
94 
95 static void
96 dump_cclp(FILE * file, int cclp)
97 {
98 	int i;
99 
100 	putc('[', file);
101 
102 	for (i = 0; i < csize; ++i) {
103 		if (ccl_contains(cclp, i)) {
104 			int start_char = i;
105 
106 			putc(' ', file);
107 
108 			fputs(readable_form(i), file);
109 
110 			while (++i < csize && ccl_contains(cclp, i));
111 
112 			if (i - 1 > start_char)
113 				/* this was a run */
114 				fprintf(file, "-%s",
115 				    readable_form(i - 1));
116 
117 			putc(' ', file);
118 		}
119 	}
120 
121 	putc(']', file);
122 }
123 
124 
125 
126 /* ccl_set_diff - create a new ccl as the set difference of the two given ccls. */
127 int
128 ccl_set_diff(int a, int b)
129 {
130 	int d, ch;
131 
132 	/* create new class  */
133 	d = cclinit();
134 
135 	/*
136 	 * In order to handle negation, we spin through all possible chars,
137 	 * addding each char in a that is not in b. (This could be O(n^2),
138 	 * but n is small and bounded.)
139 	 */
140 	for (ch = 0; ch < csize; ++ch)
141 		if (ccl_contains(a, ch) && !ccl_contains(b, ch))
142 			ccladd(d, ch);
143 
144 	/* debug */
145 	if (0) {
146 		fprintf(stderr, "ccl_set_diff (");
147 		fprintf(stderr, "\n    ");
148 		dump_cclp(stderr, a);
149 		fprintf(stderr, "\n    ");
150 		dump_cclp(stderr, b);
151 		fprintf(stderr, "\n    ");
152 		dump_cclp(stderr, d);
153 		fprintf(stderr, "\n)\n");
154 	}
155 	return d;
156 }
157 
158 /* ccl_set_union - create a new ccl as the set union of the two given ccls. */
159 int
160 ccl_set_union(int a, int b)
161 {
162 	int d, i;
163 
164 	/* create new class  */
165 	d = cclinit();
166 
167 	/* Add all of a */
168 	for (i = 0; i < ccllen[a]; ++i)
169 		ccladd(d, ccltbl[cclmap[a] + i]);
170 
171 	/* Add all of b */
172 	for (i = 0; i < ccllen[b]; ++i)
173 		ccladd(d, ccltbl[cclmap[b] + i]);
174 
175 	/* debug */
176 	if (0) {
177 		fprintf(stderr, "ccl_set_union (%d + %d = %d", a, b, d);
178 		fprintf(stderr, "\n    ");
179 		dump_cclp(stderr, a);
180 		fprintf(stderr, "\n    ");
181 		dump_cclp(stderr, b);
182 		fprintf(stderr, "\n    ");
183 		dump_cclp(stderr, d);
184 		fprintf(stderr, "\n)\n");
185 	}
186 	return d;
187 }
188 
189 
190 /* cclinit - return an empty ccl */
191 
192 int
193 cclinit()
194 {
195 	if (++lastccl >= current_maxccls) {
196 		current_maxccls += MAX_CCLS_INCREMENT;
197 
198 		++num_reallocs;
199 
200 		cclmap =
201 		    reallocate_integer_array(cclmap, current_maxccls);
202 		ccllen =
203 		    reallocate_integer_array(ccllen, current_maxccls);
204 		cclng = reallocate_integer_array(cclng, current_maxccls);
205 		ccl_has_nl =
206 		    reallocate_bool_array(ccl_has_nl,
207 		    current_maxccls);
208 	}
209 	if (lastccl == 1)
210 		/* we're making the first ccl */
211 		cclmap[lastccl] = 0;
212 
213 	else
214 		/*
215 		 * The new pointer is just past the end of the last ccl.
216 		 * Since the cclmap points to the \first/ character of a ccl,
217 		 * adding the length of the ccl to the cclmap pointer will
218 		 * produce a cursor to the first free space.
219 		 */
220 		cclmap[lastccl] =
221 		    cclmap[lastccl - 1] + ccllen[lastccl - 1];
222 
223 	ccllen[lastccl] = 0;
224 	cclng[lastccl] = 0;	/* ccl's start out life un-negated */
225 	ccl_has_nl[lastccl] = false;
226 
227 	return lastccl;
228 }
229 
230 
231 /* cclnegate - negate the given ccl */
232 
233 void
234 cclnegate(cclp)
235 	int cclp;
236 {
237 	cclng[cclp] = 1;
238 	ccl_has_nl[cclp] = !ccl_has_nl[cclp];
239 }
240 
241 
242 /* list_character_set - list the members of a set of characters in CCL form
243  *
244  * Writes to the given file a character-class representation of those
245  * characters present in the given CCL.  A character is present if it
246  * has a non-zero value in the cset array.
247  */
248 
249 void
250 list_character_set(file, cset)
251 	FILE *file;
252 	int cset[];
253 {
254 	int i;
255 
256 	putc('[', file);
257 
258 	for (i = 0; i < csize; ++i) {
259 		if (cset[i]) {
260 			int start_char = i;
261 
262 			putc(' ', file);
263 
264 			fputs(readable_form(i), file);
265 
266 			while (++i < csize && cset[i]);
267 
268 			if (i - 1 > start_char)
269 				/* this was a run */
270 				fprintf(file, "-%s",
271 				    readable_form(i - 1));
272 
273 			putc(' ', file);
274 		}
275 	}
276 
277 	putc(']', file);
278 }
279 
280 /** Determines if the range [c1-c2] is unambiguous in a case-insensitive
281  * scanner.  Specifically, if a lowercase or uppercase character, x, is in the
282  * range [c1-c2], then we require that UPPERCASE(x) and LOWERCASE(x) must also
283  * be in the range. If not, then this range is ambiguous, and the function
284  * returns false.  For example, [@-_] spans [a-z] but not [A-Z].  Beware that
285  * [a-z] will be labeled ambiguous because it does not include [A-Z].
286  *
287  * @param c1 the lower end of the range
288  * @param c2 the upper end of the range
289  * @return true if [c1-c2] is not ambiguous for a caseless scanner.
290  */
291 bool
292 range_covers_case(int c1, int c2)
293 {
294 	int i, o;
295 
296 	for (i = c1; i <= c2; i++) {
297 		if (has_case(i)) {
298 			o = reverse_case(i);
299 			if (o < c1 || c2 < o)
300 				return false;
301 		}
302 	}
303 	return true;
304 }
305 
306 /** Reverse the case of a character, if possible.
307  * @return c if case-reversal does not apply.
308  */
309 int
310 reverse_case(int c)
311 {
312 	return isupper(c) ? tolower(c) : (islower(c) ? toupper(c) : c);
313 }
314 
315 /** Return true if c is uppercase or lowercase. */
316 bool
317 has_case(int c)
318 {
319 	return (isupper(c) || islower(c)) ? true : false;
320 }
321