1 /* $OpenBSD: ccl.c,v 1.10 2024/11/09 18:03:44 op Exp $ */ 2 3 /* ccl - routines for character classes */ 4 5 /* Copyright (c) 1990 The Regents of the University of California. */ 6 /* All rights reserved. */ 7 8 /* This code is derived from software contributed to Berkeley by */ 9 /* Vern Paxson. */ 10 11 /* The United States Government has rights in this work pursuant */ 12 /* to contract no. DE-AC03-76SF00098 between the United States */ 13 /* Department of Energy and the University of California. */ 14 15 /* This file is part of flex. */ 16 17 /* Redistribution and use in source and binary forms, with or without */ 18 /* modification, are permitted provided that the following conditions */ 19 /* are met: */ 20 21 /* 1. Redistributions of source code must retain the above copyright */ 22 /* notice, this list of conditions and the following disclaimer. */ 23 /* 2. Redistributions in binary form must reproduce the above copyright */ 24 /* notice, this list of conditions and the following disclaimer in the */ 25 /* documentation and/or other materials provided with the distribution. */ 26 27 /* Neither the name of the University nor the names of its contributors */ 28 /* may be used to endorse or promote products derived from this software */ 29 /* without specific prior written permission. */ 30 31 /* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */ 32 /* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */ 33 /* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */ 34 /* PURPOSE. */ 35 36 #include "flexdef.h" 37 38 /* return true if the chr is in the ccl. Takes negation into account. */ 39 static bool 40 ccl_contains(const int cclp, const int ch) 41 { 42 int ind, len, i; 43 44 len = ccllen[cclp]; 45 ind = cclmap[cclp]; 46 47 for (i = 0; i < len; ++i) 48 if (ccltbl[ind + i] == ch) 49 return !cclng[cclp]; 50 51 return cclng[cclp]; 52 } 53 54 55 /* ccladd - add a single character to a ccl */ 56 57 void 58 ccladd(int cclp, int ch) 59 { 60 int ind, len, newpos, i; 61 62 check_char(ch); 63 64 len = ccllen[cclp]; 65 ind = cclmap[cclp]; 66 67 /* check to see if the character is already in the ccl */ 68 69 for (i = 0; i < len; ++i) 70 if (ccltbl[ind + i] == ch) 71 return; 72 73 /* mark newlines */ 74 if (ch == nlch) 75 ccl_has_nl[cclp] = true; 76 77 newpos = ind + len; 78 79 if (newpos >= current_max_ccl_tbl_size) { 80 current_max_ccl_tbl_size += MAX_CCL_TBL_SIZE_INCREMENT; 81 82 ++num_reallocs; 83 84 ccltbl = reallocate_Character_array(ccltbl, 85 current_max_ccl_tbl_size); 86 } 87 ccllen[cclp] = len + 1; 88 ccltbl[newpos] = ch; 89 } 90 91 /* dump_cclp - same thing as list_character_set, but for cclps. */ 92 93 static void 94 dump_cclp(FILE * file, int cclp) 95 { 96 int i; 97 98 putc('[', file); 99 100 for (i = 0; i < csize; ++i) { 101 if (ccl_contains(cclp, i)) { 102 int start_char = i; 103 104 putc(' ', file); 105 106 fputs(readable_form(i), file); 107 108 while (++i < csize && ccl_contains(cclp, i)); 109 110 if (i - 1 > start_char) 111 /* this was a run */ 112 fprintf(file, "-%s", 113 readable_form(i - 1)); 114 115 putc(' ', file); 116 } 117 } 118 119 putc(']', file); 120 } 121 122 123 124 /* ccl_set_diff - create a new ccl as the set difference of the two given ccls. */ 125 int 126 ccl_set_diff(int a, int b) 127 { 128 int d, ch; 129 130 /* create new class */ 131 d = cclinit(); 132 133 /* 134 * In order to handle negation, we spin through all possible chars, 135 * adding each char in a that is not in b. (This could be O(n^2), 136 * but n is small and bounded.) 137 */ 138 for (ch = 0; ch < csize; ++ch) 139 if (ccl_contains(a, ch) && !ccl_contains(b, ch)) 140 ccladd(d, ch); 141 142 /* debug */ 143 if (0) { 144 fprintf(stderr, "ccl_set_diff ("); 145 fprintf(stderr, "\n "); 146 dump_cclp(stderr, a); 147 fprintf(stderr, "\n "); 148 dump_cclp(stderr, b); 149 fprintf(stderr, "\n "); 150 dump_cclp(stderr, d); 151 fprintf(stderr, "\n)\n"); 152 } 153 return d; 154 } 155 156 /* ccl_set_union - create a new ccl as the set union of the two given ccls. */ 157 int 158 ccl_set_union(int a, int b) 159 { 160 int d, i; 161 162 /* create new class */ 163 d = cclinit(); 164 165 /* Add all of a */ 166 for (i = 0; i < ccllen[a]; ++i) 167 ccladd(d, ccltbl[cclmap[a] + i]); 168 169 /* Add all of b */ 170 for (i = 0; i < ccllen[b]; ++i) 171 ccladd(d, ccltbl[cclmap[b] + i]); 172 173 /* debug */ 174 if (0) { 175 fprintf(stderr, "ccl_set_union (%d + %d = %d", a, b, d); 176 fprintf(stderr, "\n "); 177 dump_cclp(stderr, a); 178 fprintf(stderr, "\n "); 179 dump_cclp(stderr, b); 180 fprintf(stderr, "\n "); 181 dump_cclp(stderr, d); 182 fprintf(stderr, "\n)\n"); 183 } 184 return d; 185 } 186 187 188 /* cclinit - return an empty ccl */ 189 190 int 191 cclinit(void) 192 { 193 if (++lastccl >= current_maxccls) { 194 current_maxccls += MAX_CCLS_INCREMENT; 195 196 ++num_reallocs; 197 198 cclmap = 199 reallocate_integer_array(cclmap, current_maxccls); 200 ccllen = 201 reallocate_integer_array(ccllen, current_maxccls); 202 cclng = reallocate_integer_array(cclng, current_maxccls); 203 ccl_has_nl = 204 reallocate_bool_array(ccl_has_nl, 205 current_maxccls); 206 } 207 if (lastccl == 1) 208 /* we're making the first ccl */ 209 cclmap[lastccl] = 0; 210 211 else 212 /* 213 * The new pointer is just past the end of the last ccl. 214 * Since the cclmap points to the \first/ character of a ccl, 215 * adding the length of the ccl to the cclmap pointer will 216 * produce a cursor to the first free space. 217 */ 218 cclmap[lastccl] = 219 cclmap[lastccl - 1] + ccllen[lastccl - 1]; 220 221 ccllen[lastccl] = 0; 222 cclng[lastccl] = 0; /* ccl's start out life un-negated */ 223 ccl_has_nl[lastccl] = false; 224 225 return lastccl; 226 } 227 228 229 /* cclnegate - negate the given ccl */ 230 231 void 232 cclnegate(int cclp) 233 { 234 cclng[cclp] = 1; 235 ccl_has_nl[cclp] = !ccl_has_nl[cclp]; 236 } 237 238 239 /* list_character_set - list the members of a set of characters in CCL form 240 * 241 * Writes to the given file a character-class representation of those 242 * characters present in the given CCL. A character is present if it 243 * has a non-zero value in the cset array. 244 */ 245 246 void 247 list_character_set(FILE *file, int cset[]) 248 { 249 int i; 250 251 putc('[', file); 252 253 for (i = 0; i < csize; ++i) { 254 if (cset[i]) { 255 int start_char = i; 256 257 putc(' ', file); 258 259 fputs(readable_form(i), file); 260 261 while (++i < csize && cset[i]); 262 263 if (i - 1 > start_char) 264 /* this was a run */ 265 fprintf(file, "-%s", 266 readable_form(i - 1)); 267 268 putc(' ', file); 269 } 270 } 271 272 putc(']', file); 273 } 274 275 /** Determines if the range [c1-c2] is unambiguous in a case-insensitive 276 * scanner. Specifically, if a lowercase or uppercase character, x, is in the 277 * range [c1-c2], then we require that UPPERCASE(x) and LOWERCASE(x) must also 278 * be in the range. If not, then this range is ambiguous, and the function 279 * returns false. For example, [@-_] spans [a-z] but not [A-Z]. Beware that 280 * [a-z] will be labeled ambiguous because it does not include [A-Z]. 281 * 282 * @param c1 the lower end of the range 283 * @param c2 the upper end of the range 284 * @return true if [c1-c2] is not ambiguous for a caseless scanner. 285 */ 286 bool 287 range_covers_case(int c1, int c2) 288 { 289 int i, o; 290 291 for (i = c1; i <= c2; i++) { 292 if (has_case(i)) { 293 o = reverse_case(i); 294 if (o < c1 || c2 < o) 295 return false; 296 } 297 } 298 return true; 299 } 300 301 /** Reverse the case of a character, if possible. 302 * @return c if case-reversal does not apply. 303 */ 304 int 305 reverse_case(int c) 306 { 307 return isupper(c) ? tolower(c) : (islower(c) ? toupper(c) : c); 308 } 309 310 /** Return true if c is uppercase or lowercase. */ 311 bool 312 has_case(int c) 313 { 314 return (isupper(c) || islower(c)) ? true : false; 315 } 316