xref: /netbsd-src/external/gpl2/groff/dist/src/preproc/refer/token.cpp (revision 89a07cf815a29524268025a1139fac4c5190f765)
1 /*	$NetBSD: token.cpp,v 1.1.1.1 2016/01/13 18:41:49 christos Exp $	*/
2 
3 // -*- C++ -*-
4 /* Copyright (C) 1989, 1990, 1991, 1992, 2001 Free Software Foundation, Inc.
5      Written by James Clark (jjc@jclark.com)
6 
7 This file is part of groff.
8 
9 groff is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 2, or (at your option) any later
12 version.
13 
14 groff is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
17 for more details.
18 
19 You should have received a copy of the GNU General Public License along
20 with groff; see the file COPYING.  If not, write to the Free Software
21 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
22 
23 #include "refer.h"
24 #include "token.h"
25 
26 #define TOKEN_TABLE_SIZE 1009
27 // I believe in Icelandic thorn sorts after z.
28 #define THORN_SORT_KEY "{"
29 
30 struct token_table_entry {
31   const char *tok;
32   token_info ti;
33   token_table_entry();
34 };
35 
36 token_table_entry token_table[TOKEN_TABLE_SIZE];
37 int ntokens = 0;
38 
// Advance *ptr past a name, never moving it beyond end.  The character
// found at *ptr selects how much is skipped:
//   '('   that character plus at most two more;
//   '['   everything up to and including the first ']' (or up to end
//         when there is no ']');
//   else  just that one character.
// Nothing is done when *ptr is already at end.
static void skip_name(const char **ptr, const char *end)
{
  const char *cur = *ptr;
  if (cur >= end)
    return;
  const char lead = *cur++;
  if (lead == '(') {
    // Take up to two further characters, stopping early at end.
    for (int taken = 0; taken < 2 && cur < end; taken++)
      cur++;
  }
  else if (lead == '[') {
    // Consume through the closing bracket, inclusive.
    while (cur < end && *cur++ != ']')
      ;
  }
  *ptr = cur;
}
58 
get_token(const char ** ptr,const char * end)59 int get_token(const char **ptr, const char *end)
60 {
61   if (*ptr >= end)
62     return 0;
63   char c = *(*ptr)++;
64   if (c == '\\' && *ptr < end) {
65     switch (**ptr) {
66     default:
67       *ptr += 1;
68       break;
69     case '(':
70     case '[':
71       skip_name(ptr, end);
72       break;
73     case '*':
74     case 'f':
75       *ptr += 1;
76       skip_name(ptr, end);
77       break;
78     }
79   }
80   return 1;
81 }
82 
token_info()83 token_info::token_info()
84 : type(TOKEN_OTHER), sort_key(0), other_case(0)
85 {
86 }
87 
set(token_type t,const char * sk,const char * oc)88 void token_info::set(token_type t, const char *sk, const char *oc)
89 {
90   assert(oc == 0 || t == TOKEN_UPPER || t == TOKEN_LOWER);
91   type = t;
92   sort_key = sk;
93   other_case = oc;
94 }
95 
sortify(const char * start,const char * end,string & result) const96 void token_info::sortify(const char *start, const char *end, string &result)
97      const
98 {
99   if (sort_key)
100     result += sort_key;
101   else if (type == TOKEN_UPPER || type == TOKEN_LOWER) {
102     for (; start < end; start++)
103       if (csalpha(*start))
104 	result += cmlower(*start);
105   }
106 }
107 
sortify_non_empty(const char * start,const char * end) const108 int token_info::sortify_non_empty(const char *start, const char *end) const
109 {
110   if (sort_key)
111     return *sort_key != '\0';
112   if (type != TOKEN_UPPER && type != TOKEN_LOWER)
113     return 0;
114   for (; start < end; start++)
115     if (csalpha(*start))
116       return 1;
117   return 0;
118 }
119 
120 
lower_case(const char * start,const char * end,string & result) const121 void token_info::lower_case(const char *start, const char *end,
122 			    string &result) const
123 {
124   if (type != TOKEN_UPPER) {
125     while (start < end)
126       result += *start++;
127   }
128   else if (other_case)
129     result += other_case;
130   else {
131     while (start < end)
132       result += cmlower(*start++);
133   }
134 }
135 
upper_case(const char * start,const char * end,string & result) const136 void token_info::upper_case(const char *start, const char *end,
137 			    string &result) const
138 {
139   if (type != TOKEN_LOWER) {
140     while (start < end)
141       result += *start++;
142   }
143   else if (other_case)
144     result += other_case;
145   else {
146     while (start < end)
147       result += cmupper(*start++);
148   }
149 }
150 
token_table_entry()151 token_table_entry::token_table_entry()
152 : tok(0)
153 {
154 }
155 
store_token(const char * tok,token_type typ,const char * sk=0,const char * oc=0)156 static void store_token(const char *tok, token_type typ,
157 			const char *sk = 0, const char *oc = 0)
158 {
159   unsigned n = hash_string(tok, strlen(tok)) % TOKEN_TABLE_SIZE;
160   for (;;) {
161     if (token_table[n].tok == 0) {
162       if (++ntokens == TOKEN_TABLE_SIZE)
163 	assert(0);
164       token_table[n].tok = tok;
165       break;
166     }
167     if (strcmp(tok, token_table[n].tok) == 0)
168       break;
169     if (n == 0)
170       n = TOKEN_TABLE_SIZE - 1;
171     else
172       --n;
173   }
174   token_table[n].ti.set(typ, sk, oc);
175 }
176 
177 
178 token_info default_token_info;
179 
lookup_token(const char * start,const char * end)180 const token_info *lookup_token(const char *start, const char *end)
181 {
182   unsigned n = hash_string(start, end - start) % TOKEN_TABLE_SIZE;
183   for (;;) {
184     if (token_table[n].tok == 0)
185       break;
186     if (strlen(token_table[n].tok) == size_t(end - start)
187 	&& memcmp(token_table[n].tok, start, end - start) == 0)
188       return &(token_table[n].ti);
189     if (n == 0)
190       n = TOKEN_TABLE_SIZE - 1;
191     else
192       --n;
193   }
194   return &default_token_info;
195 }
196 
init_ascii()197 static void init_ascii()
198 {
199   const char *p;
200   for (p = "abcdefghijklmnopqrstuvwxyz"; *p; p++) {
201     char buf[2];
202     buf[0] = *p;
203     buf[1] = '\0';
204     store_token(strsave(buf), TOKEN_LOWER);
205     buf[0] = cmupper(buf[0]);
206     store_token(strsave(buf), TOKEN_UPPER);
207   }
208   for (p = "0123456789"; *p; p++) {
209     char buf[2];
210     buf[0] = *p;
211     buf[1] = '\0';
212     const char *s = strsave(buf);
213     store_token(s, TOKEN_OTHER, s);
214   }
215   for (p = ".,:;?!"; *p; p++) {
216     char buf[2];
217     buf[0] = *p;
218     buf[1] = '\0';
219     store_token(strsave(buf), TOKEN_PUNCT);
220   }
221   store_token("-", TOKEN_HYPHEN);
222 }
223 
store_letter(const char * lower,const char * upper,const char * sort_key=0)224 static void store_letter(const char *lower, const char *upper,
225 		  const char *sort_key = 0)
226 {
227   store_token(lower, TOKEN_LOWER, sort_key, upper);
228   store_token(upper, TOKEN_UPPER, sort_key, lower);
229 }
230 
init_letter(unsigned char uc_code,unsigned char lc_code,const char * sort_key)231 static void init_letter(unsigned char uc_code, unsigned char lc_code,
232 		 const char *sort_key)
233 {
234   char lbuf[2];
235   lbuf[0] = lc_code;
236   lbuf[1] = 0;
237   char ubuf[2];
238   ubuf[0] = uc_code;
239   ubuf[1] = 0;
240   store_letter(strsave(lbuf), strsave(ubuf), sort_key);
241 }
242 
init_latin1()243 static void init_latin1()
244 {
245   init_letter(0xc0, 0xe0, "a");
246   init_letter(0xc1, 0xe1, "a");
247   init_letter(0xc2, 0xe2, "a");
248   init_letter(0xc3, 0xe3, "a");
249   init_letter(0xc4, 0xe4, "a");
250   init_letter(0xc5, 0xe5, "a");
251   init_letter(0xc6, 0xe6, "ae");
252   init_letter(0xc7, 0xe7, "c");
253   init_letter(0xc8, 0xe8, "e");
254   init_letter(0xc9, 0xe9, "e");
255   init_letter(0xca, 0xea, "e");
256   init_letter(0xcb, 0xeb, "e");
257   init_letter(0xcc, 0xec, "i");
258   init_letter(0xcd, 0xed, "i");
259   init_letter(0xce, 0xee, "i");
260   init_letter(0xcf, 0xef, "i");
261 
262   init_letter(0xd0, 0xf0, "d");
263   init_letter(0xd1, 0xf1, "n");
264   init_letter(0xd2, 0xf2, "o");
265   init_letter(0xd3, 0xf3, "o");
266   init_letter(0xd4, 0xf4, "o");
267   init_letter(0xd5, 0xf5, "o");
268   init_letter(0xd6, 0xf6, "o");
269   init_letter(0xd8, 0xf8, "o");
270   init_letter(0xd9, 0xf9, "u");
271   init_letter(0xda, 0xfa, "u");
272   init_letter(0xdb, 0xfb, "u");
273   init_letter(0xdc, 0xfc, "u");
274   init_letter(0xdd, 0xfd, "y");
275   init_letter(0xde, 0xfe, THORN_SORT_KEY);
276 
277   store_token("\337", TOKEN_LOWER, "ss", "SS");
278   store_token("\377", TOKEN_LOWER, "y", "Y");
279 }
280 
init_two_char_letter(char l1,char l2,char u1,char u2,const char * sk=0)281 static void init_two_char_letter(char l1, char l2, char u1, char u2,
282 				 const char *sk = 0)
283 {
284   char buf[6];
285   buf[0] = '\\';
286   buf[1] = '(';
287   buf[2] = l1;
288   buf[3] = l2;
289   buf[4] = '\0';
290   const char *p = strsave(buf);
291   buf[2] = u1;
292   buf[3] = u2;
293   store_letter(p, strsave(buf), sk);
294   buf[1] = '[';
295   buf[4] = ']';
296   buf[5] = '\0';
297   p = strsave(buf);
298   buf[2] = l1;
299   buf[3] = l2;
300   store_letter(strsave(buf), p, sk);
301 
302 }
303 
init_special_chars()304 static void init_special_chars()
305 {
306   const char *p;
307   for (p = "':^`~"; *p; p++)
308     for (const char *q = "aeiouy"; *q; q++) {
309       // Use a variable to work around bug in gcc 2.0
310       char c = cmupper(*q);
311       init_two_char_letter(*p, *q, *p, c);
312     }
313   for (p = "/l/o~n,coeaeij"; *p; p += 2) {
314     // Use variables to work around bug in gcc 2.0
315     char c0 = cmupper(p[0]);
316     char c1 = cmupper(p[1]);
317     init_two_char_letter(p[0], p[1], c0, c1);
318   }
319   init_two_char_letter('v', 's', 'v', 'S', "s");
320   init_two_char_letter('v', 'z', 'v', 'Z', "z");
321   init_two_char_letter('o', 'a', 'o', 'A', "a");
322   init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY);
323   init_two_char_letter('-', 'd', '-', 'D');
324 
325   store_token("\\(ss", TOKEN_LOWER, 0, "SS");
326   store_token("\\[ss]", TOKEN_LOWER, 0, "SS");
327 
328   store_token("\\(Sd", TOKEN_LOWER, "d", "\\(-D");
329   store_token("\\[Sd]", TOKEN_LOWER, "d", "\\[-D]");
330   store_token("\\(hy", TOKEN_HYPHEN);
331   store_token("\\[hy]", TOKEN_HYPHEN);
332   store_token("\\(en", TOKEN_RANGE_SEP);
333   store_token("\\[en]", TOKEN_RANGE_SEP);
334 }
335 
init_strings()336 static void init_strings()
337 {
338   char buf[6];
339   buf[0] = '\\';
340   buf[1] = '*';
341   for (const char *p = "'`^^,:~v_o./;"; *p; p++) {
342     buf[2] = *p;
343     buf[3] = '\0';
344     store_token(strsave(buf), TOKEN_ACCENT);
345     buf[2] = '[';
346     buf[3] = *p;
347     buf[4] = ']';
348     buf[5] = '\0';
349     store_token(strsave(buf), TOKEN_ACCENT);
350   }
351 
352   // -ms special letters
353   store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY);
354   store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY);
355   store_letter("\\*(d-", "\\*(D-");
356   store_letter("\\*[d-]", "\\*[D-]");
357   store_letter("\\*(ae", "\\*(Ae", "ae");
358   store_letter("\\*[ae]", "\\*[Ae]", "ae");
359   store_letter("\\*(oe", "\\*(Oe", "oe");
360   store_letter("\\*[oe]", "\\*[Oe]", "oe");
361 
362   store_token("\\*3", TOKEN_LOWER, "y", "Y");
363   store_token("\\*8", TOKEN_LOWER, "ss", "SS");
364   store_token("\\*q", TOKEN_LOWER, "o", "O");
365 }
366 
367 struct token_initer {
368   token_initer();
369 };
370 
371 static token_initer the_token_initer;
372 
token_initer()373 token_initer::token_initer()
374 {
375   init_ascii();
376   init_latin1();
377   init_special_chars();
378   init_strings();
379   default_token_info.set(TOKEN_OTHER);
380 }
381