1 /* $NetBSD: token.cpp,v 1.1.1.1 2016/01/13 18:41:49 christos Exp $ */ 2 3 // -*- C++ -*- 4 /* Copyright (C) 1989, 1990, 1991, 1992, 2001 Free Software Foundation, Inc. 5 Written by James Clark (jjc@jclark.com) 6 7 This file is part of groff. 8 9 groff is free software; you can redistribute it and/or modify it under 10 the terms of the GNU General Public License as published by the Free 11 Software Foundation; either version 2, or (at your option) any later 12 version. 13 14 groff is distributed in the hope that it will be useful, but WITHOUT ANY 15 WARRANTY; without even the implied warranty of MERCHANTABILITY or 16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 17 for more details. 18 19 You should have received a copy of the GNU General Public License along 20 with groff; see the file COPYING. If not, write to the Free Software 21 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */ 22 23 #include "refer.h" 24 #include "token.h" 25 26 #define TOKEN_TABLE_SIZE 1009 27 // I believe in Icelandic thorn sorts after z. 28 #define THORN_SORT_KEY "{" 29 30 struct token_table_entry { 31 const char *tok; 32 token_info ti; 33 token_table_entry(); 34 }; 35 36 token_table_entry token_table[TOKEN_TABLE_SIZE]; 37 int ntokens = 0; 38 39 static void skip_name(const char **ptr, const char *end) 40 { 41 if (*ptr < end) { 42 switch (*(*ptr)++) { 43 case '(': 44 if (*ptr < end) { 45 *ptr += 1; 46 if (*ptr < end) 47 *ptr += 1; 48 } 49 break; 50 case '[': 51 while (*ptr < end) 52 if (*(*ptr)++ == ']') 53 break; 54 break; 55 } 56 } 57 } 58 59 int get_token(const char **ptr, const char *end) 60 { 61 if (*ptr >= end) 62 return 0; 63 char c = *(*ptr)++; 64 if (c == '\\' && *ptr < end) { 65 switch (**ptr) { 66 default: 67 *ptr += 1; 68 break; 69 case '(': 70 case '[': 71 skip_name(ptr, end); 72 break; 73 case '*': 74 case 'f': 75 *ptr += 1; 76 skip_name(ptr, end); 77 break; 78 } 79 } 80 return 1; 81 } 82 83 token_info::token_info() 84 : type(TOKEN_OTHER), sort_key(0), other_case(0) 85 { 86 } 87 88 void token_info::set(token_type t, const char *sk, const char *oc) 89 { 90 assert(oc == 0 || t == TOKEN_UPPER || t == TOKEN_LOWER); 91 type = t; 92 sort_key = sk; 93 other_case = oc; 94 } 95 96 void token_info::sortify(const char *start, const char *end, string &result) 97 const 98 { 99 if (sort_key) 100 result += sort_key; 101 else if (type == TOKEN_UPPER || type == TOKEN_LOWER) { 102 for (; start < end; start++) 103 if (csalpha(*start)) 104 result += cmlower(*start); 105 } 106 } 107 108 int token_info::sortify_non_empty(const char *start, const char *end) const 109 { 110 if (sort_key) 111 return *sort_key != '\0'; 112 if (type != TOKEN_UPPER && type != TOKEN_LOWER) 113 return 0; 114 for (; start < end; start++) 115 if (csalpha(*start)) 116 return 1; 117 return 0; 118 } 119 120 121 void token_info::lower_case(const char *start, const char *end, 122 string &result) const 123 { 124 if (type != TOKEN_UPPER) { 125 while (start < end) 126 result += *start++; 127 } 128 else if (other_case) 129 result += other_case; 130 else { 131 while (start < end) 132 result += cmlower(*start++); 133 } 134 } 135 136 void token_info::upper_case(const char *start, const char *end, 137 string &result) const 138 { 139 if (type != TOKEN_LOWER) { 140 while (start < end) 141 result += *start++; 142 } 143 else if (other_case) 144 result += other_case; 145 else { 146 while (start < end) 147 result += cmupper(*start++); 148 } 149 } 150 151 token_table_entry::token_table_entry() 152 : tok(0) 153 { 154 } 155 156 static void store_token(const char *tok, token_type typ, 157 const char *sk = 0, const char *oc = 0) 158 { 159 unsigned n = hash_string(tok, strlen(tok)) % TOKEN_TABLE_SIZE; 160 for (;;) { 161 if (token_table[n].tok == 0) { 162 if (++ntokens == TOKEN_TABLE_SIZE) 163 assert(0); 164 token_table[n].tok = tok; 165 break; 166 } 167 if (strcmp(tok, token_table[n].tok) == 0) 168 break; 169 if (n == 0) 170 n = TOKEN_TABLE_SIZE - 1; 171 else 172 --n; 173 } 174 token_table[n].ti.set(typ, sk, oc); 175 } 176 177 178 token_info default_token_info; 179 180 const token_info *lookup_token(const char *start, const char *end) 181 { 182 unsigned n = hash_string(start, end - start) % TOKEN_TABLE_SIZE; 183 for (;;) { 184 if (token_table[n].tok == 0) 185 break; 186 if (strlen(token_table[n].tok) == size_t(end - start) 187 && memcmp(token_table[n].tok, start, end - start) == 0) 188 return &(token_table[n].ti); 189 if (n == 0) 190 n = TOKEN_TABLE_SIZE - 1; 191 else 192 --n; 193 } 194 return &default_token_info; 195 } 196 197 static void init_ascii() 198 { 199 const char *p; 200 for (p = "abcdefghijklmnopqrstuvwxyz"; *p; p++) { 201 char buf[2]; 202 buf[0] = *p; 203 buf[1] = '\0'; 204 store_token(strsave(buf), TOKEN_LOWER); 205 buf[0] = cmupper(buf[0]); 206 store_token(strsave(buf), TOKEN_UPPER); 207 } 208 for (p = "0123456789"; *p; p++) { 209 char buf[2]; 210 buf[0] = *p; 211 buf[1] = '\0'; 212 const char *s = strsave(buf); 213 store_token(s, TOKEN_OTHER, s); 214 } 215 for (p = ".,:;?!"; *p; p++) { 216 char buf[2]; 217 buf[0] = *p; 218 buf[1] = '\0'; 219 store_token(strsave(buf), TOKEN_PUNCT); 220 } 221 store_token("-", TOKEN_HYPHEN); 222 } 223 224 static void store_letter(const char *lower, const char *upper, 225 const char *sort_key = 0) 226 { 227 store_token(lower, TOKEN_LOWER, sort_key, upper); 228 store_token(upper, TOKEN_UPPER, sort_key, lower); 229 } 230 231 static void init_letter(unsigned char uc_code, unsigned char lc_code, 232 const char *sort_key) 233 { 234 char lbuf[2]; 235 lbuf[0] = lc_code; 236 lbuf[1] = 0; 237 char ubuf[2]; 238 ubuf[0] = uc_code; 239 ubuf[1] = 0; 240 store_letter(strsave(lbuf), strsave(ubuf), sort_key); 241 } 242 243 static void init_latin1() 244 { 245 init_letter(0xc0, 0xe0, "a"); 246 init_letter(0xc1, 0xe1, "a"); 247 init_letter(0xc2, 0xe2, "a"); 248 init_letter(0xc3, 0xe3, "a"); 249 init_letter(0xc4, 0xe4, "a"); 250 init_letter(0xc5, 0xe5, "a"); 251 init_letter(0xc6, 0xe6, "ae"); 252 init_letter(0xc7, 0xe7, "c"); 253 init_letter(0xc8, 0xe8, "e"); 254 init_letter(0xc9, 0xe9, "e"); 255 init_letter(0xca, 0xea, "e"); 256 init_letter(0xcb, 0xeb, "e"); 257 init_letter(0xcc, 0xec, "i"); 258 init_letter(0xcd, 0xed, "i"); 259 init_letter(0xce, 0xee, "i"); 260 init_letter(0xcf, 0xef, "i"); 261 262 init_letter(0xd0, 0xf0, "d"); 263 init_letter(0xd1, 0xf1, "n"); 264 init_letter(0xd2, 0xf2, "o"); 265 init_letter(0xd3, 0xf3, "o"); 266 init_letter(0xd4, 0xf4, "o"); 267 init_letter(0xd5, 0xf5, "o"); 268 init_letter(0xd6, 0xf6, "o"); 269 init_letter(0xd8, 0xf8, "o"); 270 init_letter(0xd9, 0xf9, "u"); 271 init_letter(0xda, 0xfa, "u"); 272 init_letter(0xdb, 0xfb, "u"); 273 init_letter(0xdc, 0xfc, "u"); 274 init_letter(0xdd, 0xfd, "y"); 275 init_letter(0xde, 0xfe, THORN_SORT_KEY); 276 277 store_token("\337", TOKEN_LOWER, "ss", "SS"); 278 store_token("\377", TOKEN_LOWER, "y", "Y"); 279 } 280 281 static void init_two_char_letter(char l1, char l2, char u1, char u2, 282 const char *sk = 0) 283 { 284 char buf[6]; 285 buf[0] = '\\'; 286 buf[1] = '('; 287 buf[2] = l1; 288 buf[3] = l2; 289 buf[4] = '\0'; 290 const char *p = strsave(buf); 291 buf[2] = u1; 292 buf[3] = u2; 293 store_letter(p, strsave(buf), sk); 294 buf[1] = '['; 295 buf[4] = ']'; 296 buf[5] = '\0'; 297 p = strsave(buf); 298 buf[2] = l1; 299 buf[3] = l2; 300 store_letter(strsave(buf), p, sk); 301 302 } 303 304 static void init_special_chars() 305 { 306 const char *p; 307 for (p = "':^`~"; *p; p++) 308 for (const char *q = "aeiouy"; *q; q++) { 309 // Use a variable to work around bug in gcc 2.0 310 char c = cmupper(*q); 311 init_two_char_letter(*p, *q, *p, c); 312 } 313 for (p = "/l/o~n,coeaeij"; *p; p += 2) { 314 // Use variables to work around bug in gcc 2.0 315 char c0 = cmupper(p[0]); 316 char c1 = cmupper(p[1]); 317 init_two_char_letter(p[0], p[1], c0, c1); 318 } 319 init_two_char_letter('v', 's', 'v', 'S', "s"); 320 init_two_char_letter('v', 'z', 'v', 'Z', "z"); 321 init_two_char_letter('o', 'a', 'o', 'A', "a"); 322 init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY); 323 init_two_char_letter('-', 'd', '-', 'D'); 324 325 store_token("\\(ss", TOKEN_LOWER, 0, "SS"); 326 store_token("\\[ss]", TOKEN_LOWER, 0, "SS"); 327 328 store_token("\\(Sd", TOKEN_LOWER, "d", "\\(-D"); 329 store_token("\\[Sd]", TOKEN_LOWER, "d", "\\[-D]"); 330 store_token("\\(hy", TOKEN_HYPHEN); 331 store_token("\\[hy]", TOKEN_HYPHEN); 332 store_token("\\(en", TOKEN_RANGE_SEP); 333 store_token("\\[en]", TOKEN_RANGE_SEP); 334 } 335 336 static void init_strings() 337 { 338 char buf[6]; 339 buf[0] = '\\'; 340 buf[1] = '*'; 341 for (const char *p = "'`^^,:~v_o./;"; *p; p++) { 342 buf[2] = *p; 343 buf[3] = '\0'; 344 store_token(strsave(buf), TOKEN_ACCENT); 345 buf[2] = '['; 346 buf[3] = *p; 347 buf[4] = ']'; 348 buf[5] = '\0'; 349 store_token(strsave(buf), TOKEN_ACCENT); 350 } 351 352 // -ms special letters 353 store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY); 354 store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY); 355 store_letter("\\*(d-", "\\*(D-"); 356 store_letter("\\*[d-]", "\\*[D-]"); 357 store_letter("\\*(ae", "\\*(Ae", "ae"); 358 store_letter("\\*[ae]", "\\*[Ae]", "ae"); 359 store_letter("\\*(oe", "\\*(Oe", "oe"); 360 store_letter("\\*[oe]", "\\*[Oe]", "oe"); 361 362 store_token("\\*3", TOKEN_LOWER, "y", "Y"); 363 store_token("\\*8", TOKEN_LOWER, "ss", "SS"); 364 store_token("\\*q", TOKEN_LOWER, "o", "O"); 365 } 366 367 struct token_initer { 368 token_initer(); 369 }; 370 371 static token_initer the_token_initer; 372 373 token_initer::token_initer() 374 { 375 init_ascii(); 376 init_latin1(); 377 init_special_chars(); 378 init_strings(); 379 default_token_info.set(TOKEN_OTHER); 380 } 381