1 /* ccl - routines for character classes */ 2 3 /* Copyright (c) 1990 The Regents of the University of California. */ 4 /* All rights reserved. */ 5 6 /* This code is derived from software contributed to Berkeley by */ 7 /* Vern Paxson. */ 8 9 /* The United States Government has rights in this work pursuant */ 10 /* to contract no. DE-AC03-76SF00098 between the United States */ 11 /* Department of Energy and the University of California. */ 12 13 /* This file is part of flex. */ 14 15 /* Redistribution and use in source and binary forms, with or without */ 16 /* modification, are permitted provided that the following conditions */ 17 /* are met: */ 18 19 /* 1. Redistributions of source code must retain the above copyright */ 20 /* notice, this list of conditions and the following disclaimer. */ 21 /* 2. Redistributions in binary form must reproduce the above copyright */ 22 /* notice, this list of conditions and the following disclaimer in the */ 23 /* documentation and/or other materials provided with the distribution. */ 24 25 /* Neither the name of the University nor the names of its contributors */ 26 /* may be used to endorse or promote products derived from this software */ 27 /* without specific prior written permission. */ 28 29 /* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */ 30 /* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */ 31 /* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */ 32 /* PURPOSE. */ 33 #include "flexdef.h" 34 __RCSID("$NetBSD: ccl.c,v 1.2 2016/01/09 17:38:57 christos Exp $"); 35 36 37 /* return true if the chr is in the ccl. Takes negation into account. */ 38 static bool 39 ccl_contains (const int cclp, const int ch) 40 { 41 int ind, len, i; 42 43 len = ccllen[cclp]; 44 ind = cclmap[cclp]; 45 46 for (i = 0; i < len; ++i) 47 if (ccltbl[ind + i] == ch) 48 return !cclng[cclp]; 49 50 return cclng[cclp]; 51 } 52 53 54 /* ccladd - add a single character to a ccl */ 55 56 void ccladd (cclp, ch) 57 int cclp; 58 int ch; 59 { 60 int ind, len, newpos, i; 61 62 check_char (ch); 63 64 len = ccllen[cclp]; 65 ind = cclmap[cclp]; 66 67 /* check to see if the character is already in the ccl */ 68 69 for (i = 0; i < len; ++i) 70 if (ccltbl[ind + i] == ch) 71 return; 72 73 /* mark newlines */ 74 if (ch == nlch) 75 ccl_has_nl[cclp] = true; 76 77 newpos = ind + len; 78 79 if (newpos >= current_max_ccl_tbl_size) { 80 current_max_ccl_tbl_size += MAX_CCL_TBL_SIZE_INCREMENT; 81 82 ++num_reallocs; 83 84 ccltbl = reallocate_Character_array (ccltbl, 85 current_max_ccl_tbl_size); 86 } 87 88 ccllen[cclp] = len + 1; 89 ccltbl[newpos] = ch; 90 } 91 92 /* dump_cclp - same thing as list_character_set, but for cclps. */ 93 94 static void dump_cclp (FILE* file, int cclp) 95 { 96 int i; 97 98 putc ('[', file); 99 100 for (i = 0; i < csize; ++i) { 101 if (ccl_contains(cclp, i)){ 102 int start_char = i; 103 104 putc (' ', file); 105 106 fputs (readable_form (i), file); 107 108 while (++i < csize && ccl_contains(cclp,i)) ; 109 110 if (i - 1 > start_char) 111 /* this was a run */ 112 fprintf (file, "-%s", 113 readable_form (i - 1)); 114 115 putc (' ', file); 116 } 117 } 118 119 putc (']', file); 120 } 121 122 123 124 /* ccl_set_diff - create a new ccl as the set difference of the two given ccls. */ 125 int 126 ccl_set_diff (int a, int b) 127 { 128 int d, ch; 129 130 /* create new class */ 131 d = cclinit(); 132 133 /* In order to handle negation, we spin through all possible chars, 134 * addding each char in a that is not in b. 135 * (This could be O(n^2), but n is small and bounded.) 136 */ 137 for ( ch = 0; ch < csize; ++ch ) 138 if (ccl_contains (a, ch) && !ccl_contains(b, ch)) 139 ccladd (d, ch); 140 141 /* debug */ 142 if (0){ 143 fprintf(stderr, "ccl_set_diff ("); 144 fprintf(stderr, "\n "); 145 dump_cclp (stderr, a); 146 fprintf(stderr, "\n "); 147 dump_cclp (stderr, b); 148 fprintf(stderr, "\n "); 149 dump_cclp (stderr, d); 150 fprintf(stderr, "\n)\n"); 151 } 152 return d; 153 } 154 155 /* ccl_set_union - create a new ccl as the set union of the two given ccls. */ 156 int 157 ccl_set_union (int a, int b) 158 { 159 int d, i; 160 161 /* create new class */ 162 d = cclinit(); 163 164 /* Add all of a */ 165 for (i = 0; i < ccllen[a]; ++i) 166 ccladd (d, ccltbl[cclmap[a] + i]); 167 168 /* Add all of b */ 169 for (i = 0; i < ccllen[b]; ++i) 170 ccladd (d, ccltbl[cclmap[b] + i]); 171 172 /* debug */ 173 if (0){ 174 fprintf(stderr, "ccl_set_union (%d + %d = %d", a, b, d); 175 fprintf(stderr, "\n "); 176 dump_cclp (stderr, a); 177 fprintf(stderr, "\n "); 178 dump_cclp (stderr, b); 179 fprintf(stderr, "\n "); 180 dump_cclp (stderr, d); 181 fprintf(stderr, "\n)\n"); 182 } 183 return d; 184 } 185 186 187 /* cclinit - return an empty ccl */ 188 189 int cclinit () 190 { 191 if (++lastccl >= current_maxccls) { 192 current_maxccls += MAX_CCLS_INCREMENT; 193 194 ++num_reallocs; 195 196 cclmap = 197 reallocate_integer_array (cclmap, current_maxccls); 198 ccllen = 199 reallocate_integer_array (ccllen, current_maxccls); 200 cclng = reallocate_integer_array (cclng, current_maxccls); 201 ccl_has_nl = 202 reallocate_bool_array (ccl_has_nl, 203 current_maxccls); 204 } 205 206 if (lastccl == 1) 207 /* we're making the first ccl */ 208 cclmap[lastccl] = 0; 209 210 else 211 /* The new pointer is just past the end of the last ccl. 212 * Since the cclmap points to the \first/ character of a 213 * ccl, adding the length of the ccl to the cclmap pointer 214 * will produce a cursor to the first free space. 215 */ 216 cclmap[lastccl] = 217 cclmap[lastccl - 1] + ccllen[lastccl - 1]; 218 219 ccllen[lastccl] = 0; 220 cclng[lastccl] = 0; /* ccl's start out life un-negated */ 221 ccl_has_nl[lastccl] = false; 222 223 return lastccl; 224 } 225 226 227 /* cclnegate - negate the given ccl */ 228 229 void cclnegate (cclp) 230 int cclp; 231 { 232 cclng[cclp] = 1; 233 ccl_has_nl[cclp] = !ccl_has_nl[cclp]; 234 } 235 236 237 /* list_character_set - list the members of a set of characters in CCL form 238 * 239 * Writes to the given file a character-class representation of those 240 * characters present in the given CCL. A character is present if it 241 * has a non-zero value in the cset array. 242 */ 243 244 void list_character_set (file, cset) 245 FILE *file; 246 int cset[]; 247 { 248 int i; 249 250 putc ('[', file); 251 252 for (i = 0; i < csize; ++i) { 253 if (cset[i]) { 254 int start_char = i; 255 256 putc (' ', file); 257 258 fputs (readable_form (i), file); 259 260 while (++i < csize && cset[i]) ; 261 262 if (i - 1 > start_char) 263 /* this was a run */ 264 fprintf (file, "-%s", 265 readable_form (i - 1)); 266 267 putc (' ', file); 268 } 269 } 270 271 putc (']', file); 272 } 273 274 /** Determines if the range [c1-c2] is unambiguous in a case-insensitive 275 * scanner. Specifically, if a lowercase or uppercase character, x, is in the 276 * range [c1-c2], then we require that UPPERCASE(x) and LOWERCASE(x) must also 277 * be in the range. If not, then this range is ambiguous, and the function 278 * returns false. For example, [@-_] spans [a-z] but not [A-Z]. Beware that 279 * [a-z] will be labeled ambiguous because it does not include [A-Z]. 280 * 281 * @param c1 the lower end of the range 282 * @param c2 the upper end of the range 283 * @return true if [c1-c2] is not ambiguous for a caseless scanner. 284 */ 285 bool range_covers_case (int c1, int c2) 286 { 287 int i, o; 288 289 for (i = c1; i <= c2; i++) { 290 if (has_case (i)) { 291 o = reverse_case (i); 292 if (o < c1 || c2 < o) 293 return false; 294 } 295 } 296 return true; 297 } 298 299 /** Reverse the case of a character, if possible. 300 * @return c if case-reversal does not apply. 301 */ 302 int reverse_case (int c) 303 { 304 return isupper (c) ? tolower (c) : (islower (c) ? toupper (c) : c); 305 } 306 307 /** Return true if c is uppercase or lowercase. */ 308 bool has_case (int c) 309 { 310 return (isupper (c) || islower (c)) ? true : false; 311 } 312