1 /* $OpenBSD: ccl.c,v 1.8 2015/11/19 22:55:13 tedu Exp $ */ 2 3 /* ccl - routines for character classes */ 4 5 /* Copyright (c) 1990 The Regents of the University of California. */ 6 /* All rights reserved. */ 7 8 /* This code is derived from software contributed to Berkeley by */ 9 /* Vern Paxson. */ 10 11 /* The United States Government has rights in this work pursuant */ 12 /* to contract no. DE-AC03-76SF00098 between the United States */ 13 /* Department of Energy and the University of California. */ 14 15 /* This file is part of flex. */ 16 17 /* Redistribution and use in source and binary forms, with or without */ 18 /* modification, are permitted provided that the following conditions */ 19 /* are met: */ 20 21 /* 1. Redistributions of source code must retain the above copyright */ 22 /* notice, this list of conditions and the following disclaimer. */ 23 /* 2. Redistributions in binary form must reproduce the above copyright */ 24 /* notice, this list of conditions and the following disclaimer in the */ 25 /* documentation and/or other materials provided with the distribution. */ 26 27 /* Neither the name of the University nor the names of its contributors */ 28 /* may be used to endorse or promote products derived from this software */ 29 /* without specific prior written permission. */ 30 31 /* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */ 32 /* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */ 33 /* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */ 34 /* PURPOSE. */ 35 36 #include "flexdef.h" 37 38 /* return true if the chr is in the ccl. Takes negation into account. */ 39 static bool 40 ccl_contains(const int cclp, const int ch) 41 { 42 int ind, len, i; 43 44 len = ccllen[cclp]; 45 ind = cclmap[cclp]; 46 47 for (i = 0; i < len; ++i) 48 if (ccltbl[ind + i] == ch) 49 return !cclng[cclp]; 50 51 return cclng[cclp]; 52 } 53 54 55 /* ccladd - add a single character to a ccl */ 56 57 void 58 ccladd(cclp, ch) 59 int cclp; 60 int ch; 61 { 62 int ind, len, newpos, i; 63 64 check_char(ch); 65 66 len = ccllen[cclp]; 67 ind = cclmap[cclp]; 68 69 /* check to see if the character is already in the ccl */ 70 71 for (i = 0; i < len; ++i) 72 if (ccltbl[ind + i] == ch) 73 return; 74 75 /* mark newlines */ 76 if (ch == nlch) 77 ccl_has_nl[cclp] = true; 78 79 newpos = ind + len; 80 81 if (newpos >= current_max_ccl_tbl_size) { 82 current_max_ccl_tbl_size += MAX_CCL_TBL_SIZE_INCREMENT; 83 84 ++num_reallocs; 85 86 ccltbl = reallocate_Character_array(ccltbl, 87 current_max_ccl_tbl_size); 88 } 89 ccllen[cclp] = len + 1; 90 ccltbl[newpos] = ch; 91 } 92 93 /* dump_cclp - same thing as list_character_set, but for cclps. */ 94 95 static void 96 dump_cclp(FILE * file, int cclp) 97 { 98 int i; 99 100 putc('[', file); 101 102 for (i = 0; i < csize; ++i) { 103 if (ccl_contains(cclp, i)) { 104 int start_char = i; 105 106 putc(' ', file); 107 108 fputs(readable_form(i), file); 109 110 while (++i < csize && ccl_contains(cclp, i)); 111 112 if (i - 1 > start_char) 113 /* this was a run */ 114 fprintf(file, "-%s", 115 readable_form(i - 1)); 116 117 putc(' ', file); 118 } 119 } 120 121 putc(']', file); 122 } 123 124 125 126 /* ccl_set_diff - create a new ccl as the set difference of the two given ccls. */ 127 int 128 ccl_set_diff(int a, int b) 129 { 130 int d, ch; 131 132 /* create new class */ 133 d = cclinit(); 134 135 /* 136 * In order to handle negation, we spin through all possible chars, 137 * addding each char in a that is not in b. (This could be O(n^2), 138 * but n is small and bounded.) 139 */ 140 for (ch = 0; ch < csize; ++ch) 141 if (ccl_contains(a, ch) && !ccl_contains(b, ch)) 142 ccladd(d, ch); 143 144 /* debug */ 145 if (0) { 146 fprintf(stderr, "ccl_set_diff ("); 147 fprintf(stderr, "\n "); 148 dump_cclp(stderr, a); 149 fprintf(stderr, "\n "); 150 dump_cclp(stderr, b); 151 fprintf(stderr, "\n "); 152 dump_cclp(stderr, d); 153 fprintf(stderr, "\n)\n"); 154 } 155 return d; 156 } 157 158 /* ccl_set_union - create a new ccl as the set union of the two given ccls. */ 159 int 160 ccl_set_union(int a, int b) 161 { 162 int d, i; 163 164 /* create new class */ 165 d = cclinit(); 166 167 /* Add all of a */ 168 for (i = 0; i < ccllen[a]; ++i) 169 ccladd(d, ccltbl[cclmap[a] + i]); 170 171 /* Add all of b */ 172 for (i = 0; i < ccllen[b]; ++i) 173 ccladd(d, ccltbl[cclmap[b] + i]); 174 175 /* debug */ 176 if (0) { 177 fprintf(stderr, "ccl_set_union (%d + %d = %d", a, b, d); 178 fprintf(stderr, "\n "); 179 dump_cclp(stderr, a); 180 fprintf(stderr, "\n "); 181 dump_cclp(stderr, b); 182 fprintf(stderr, "\n "); 183 dump_cclp(stderr, d); 184 fprintf(stderr, "\n)\n"); 185 } 186 return d; 187 } 188 189 190 /* cclinit - return an empty ccl */ 191 192 int 193 cclinit() 194 { 195 if (++lastccl >= current_maxccls) { 196 current_maxccls += MAX_CCLS_INCREMENT; 197 198 ++num_reallocs; 199 200 cclmap = 201 reallocate_integer_array(cclmap, current_maxccls); 202 ccllen = 203 reallocate_integer_array(ccllen, current_maxccls); 204 cclng = reallocate_integer_array(cclng, current_maxccls); 205 ccl_has_nl = 206 reallocate_bool_array(ccl_has_nl, 207 current_maxccls); 208 } 209 if (lastccl == 1) 210 /* we're making the first ccl */ 211 cclmap[lastccl] = 0; 212 213 else 214 /* 215 * The new pointer is just past the end of the last ccl. 216 * Since the cclmap points to the \first/ character of a ccl, 217 * adding the length of the ccl to the cclmap pointer will 218 * produce a cursor to the first free space. 219 */ 220 cclmap[lastccl] = 221 cclmap[lastccl - 1] + ccllen[lastccl - 1]; 222 223 ccllen[lastccl] = 0; 224 cclng[lastccl] = 0; /* ccl's start out life un-negated */ 225 ccl_has_nl[lastccl] = false; 226 227 return lastccl; 228 } 229 230 231 /* cclnegate - negate the given ccl */ 232 233 void 234 cclnegate(cclp) 235 int cclp; 236 { 237 cclng[cclp] = 1; 238 ccl_has_nl[cclp] = !ccl_has_nl[cclp]; 239 } 240 241 242 /* list_character_set - list the members of a set of characters in CCL form 243 * 244 * Writes to the given file a character-class representation of those 245 * characters present in the given CCL. A character is present if it 246 * has a non-zero value in the cset array. 247 */ 248 249 void 250 list_character_set(file, cset) 251 FILE *file; 252 int cset[]; 253 { 254 int i; 255 256 putc('[', file); 257 258 for (i = 0; i < csize; ++i) { 259 if (cset[i]) { 260 int start_char = i; 261 262 putc(' ', file); 263 264 fputs(readable_form(i), file); 265 266 while (++i < csize && cset[i]); 267 268 if (i - 1 > start_char) 269 /* this was a run */ 270 fprintf(file, "-%s", 271 readable_form(i - 1)); 272 273 putc(' ', file); 274 } 275 } 276 277 putc(']', file); 278 } 279 280 /** Determines if the range [c1-c2] is unambiguous in a case-insensitive 281 * scanner. Specifically, if a lowercase or uppercase character, x, is in the 282 * range [c1-c2], then we require that UPPERCASE(x) and LOWERCASE(x) must also 283 * be in the range. If not, then this range is ambiguous, and the function 284 * returns false. For example, [@-_] spans [a-z] but not [A-Z]. Beware that 285 * [a-z] will be labeled ambiguous because it does not include [A-Z]. 286 * 287 * @param c1 the lower end of the range 288 * @param c2 the upper end of the range 289 * @return true if [c1-c2] is not ambiguous for a caseless scanner. 290 */ 291 bool 292 range_covers_case(int c1, int c2) 293 { 294 int i, o; 295 296 for (i = c1; i <= c2; i++) { 297 if (has_case(i)) { 298 o = reverse_case(i); 299 if (o < c1 || c2 < o) 300 return false; 301 } 302 } 303 return true; 304 } 305 306 /** Reverse the case of a character, if possible. 307 * @return c if case-reversal does not apply. 308 */ 309 int 310 reverse_case(int c) 311 { 312 return isupper(c) ? tolower(c) : (islower(c) ? toupper(c) : c); 313 } 314 315 /** Return true if c is uppercase or lowercase. */ 316 bool 317 has_case(int c) 318 { 319 return (isupper(c) || islower(c)) ? true : false; 320 } 321