xref: /dflybsd-src/contrib/less/charset.c (revision 3187ec284fac076edebba0a9f1e5268c0ef8e50d)
1  /*
2   * Copyright (C) 1984-2024  Mark Nudelman
3   *
4   * You may distribute under the terms of either the GNU General Public
5   * License or the Less License, as specified in the README file.
6   *
7   * For more information, see the README file.
8   */
9  
10  
11  /*
12   * Functions to define the character set
13   * and do things specific to the character set.
14   */
15  
16  #include "less.h"
17  #if HAVE_LOCALE
18  #include <locale.h>
19  #include <ctype.h>
20  #include <langinfo.h>
21  #endif
22  
23  #include "charset.h"
24  #include "xbuf.h"
25  
26  #if MSDOS_COMPILER==WIN32C
27  #define WIN32_LEAN_AND_MEAN
28  #include <windows.h>
29  #endif
30  
/*
 * Defined elsewhere; compared against BS_CONTROL by
 * is_composing_char() and is_ubin_char() in this file.
 */
extern int bs_mode;

/*
 * Nonzero once a charset entry whose p_flag points here has been
 * selected by icharset() (the "utf-8" entry in charsets[]).
 */
public int utf_mode = 0;
34  
/*
 * Predefined character sets,
 * selected by the LESSCHARSET environment variable.
 *
 * name   - charset name matched (case-sensitively, via strcmp) by icharset().
 * p_flag - if non-NULL, icharset() stores a nonzero value through this
 *          pointer when the charset is selected.
 * desc   - character class description string in the format parsed
 *          by ichardef() ('.', 'b', 'c' letters with optional decimal
 *          repeat counts).
 * The table is terminated by an entry whose name is NULL.
 */
struct charset {
	char *name;
	int *p_flag;
	char *desc;
} charsets[] = {
		{ "ascii",              NULL,       "8bcccbcc18b95.b" },
		{ "utf-8",              &utf_mode,  "8bcccbcc18b95.b126.bb" },
		{ "iso8859",            NULL,       "8bcccbcc18b95.33b." },
		{ "latin3",             NULL,       "8bcccbcc18b95.33b5.b8.b15.b4.b12.b18.b12.b." },
		{ "arabic",             NULL,       "8bcccbcc18b95.33b.3b.7b2.13b.3b.b26.5b19.b" },
		{ "greek",              NULL,       "8bcccbcc18b95.33b4.2b4.b3.b35.b44.b" },
		{ "greek2005",          NULL,       "8bcccbcc18b95.33b14.b35.b44.b" },
		{ "hebrew",             NULL,       "8bcccbcc18b95.33b.b29.32b28.2b2.b" },
		{ "koi8-r",             NULL,       "8bcccbcc18b95.b." },
		{ "KOI8-T",             NULL,       "8bcccbcc18b95.b8.b6.b8.b.b.5b7.3b4.b4.b3.b.b.3b." },
		{ "georgianps",         NULL,       "8bcccbcc18b95.3b11.4b12.2b." },
		{ "tcvn",               NULL,       "b..b...bcccbccbbb7.8b95.b48.5b." },
		{ "TIS-620",            NULL,       "8bcccbcc18b95.b.4b.11b7.8b." },
		{ "next",               NULL,       "8bcccbcc18b95.bb125.bb" },
		{ "dos",                NULL,       "8bcccbcc12bc5b95.b." },
		{ "windows-1251",       NULL,       "8bcccbcc12bc5b95.b24.b." },
		{ "windows-1252",       NULL,       "8bcccbcc12bc5b95.b.b11.b.2b12.b." },
		{ "windows-1255",       NULL,       "8bcccbcc12bc5b95.b.b8.b.5b9.b.4b." },
		{ "ebcdic",             NULL,       "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
		{ "IBM-1047",           NULL,       "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
		{ NULL, NULL, NULL }
};
66  
/*
 * Support "locale charmap"/nl_langinfo(CODESET) values, as well as others.
 *
 * name  - alias as it may be given by the user or the locale.
 * oname - the name of the charsets[] entry the alias stands for.
 * icharset() replaces a name that exactly matches an alias by its oname
 * before searching charsets[].  The table ends with a NULL name.
 */
struct cs_alias {
	char *name;
	char *oname;
} cs_aliases[] = {
	{ "UTF-8",              "utf-8" },
	{ "utf8",               "utf-8" },
	{ "UTF8",               "utf-8" },
	{ "ANSI_X3.4-1968",     "ascii" },
	{ "US-ASCII",           "ascii" },
	{ "latin1",             "iso8859" },
	{ "ISO-8859-1",         "iso8859" },
	{ "latin9",             "iso8859" },
	{ "ISO-8859-15",        "iso8859" },
	{ "latin2",             "iso8859" },
	{ "ISO-8859-2",         "iso8859" },
	{ "ISO-8859-3",         "latin3" },
	{ "latin4",             "iso8859" },
	{ "ISO-8859-4",         "iso8859" },
	{ "cyrillic",           "iso8859" },
	{ "ISO-8859-5",         "iso8859" },
	{ "ISO-8859-6",         "arabic" },
	{ "ISO-8859-7",         "greek" },
	{ "IBM9005",            "greek2005" },
	{ "ISO-8859-8",         "hebrew" },
	{ "latin5",             "iso8859" },
	{ "ISO-8859-9",         "iso8859" },
	{ "latin6",             "iso8859" },
	{ "ISO-8859-10",        "iso8859" },
	{ "latin7",             "iso8859" },
	{ "ISO-8859-13",        "iso8859" },
	{ "latin8",             "iso8859" },
	{ "ISO-8859-14",        "iso8859" },
	{ "latin10",            "iso8859" },
	{ "ISO-8859-16",        "iso8859" },
	{ "IBM437",             "dos" },
	{ "EBCDIC-US",          "ebcdic" },
	{ "IBM1047",            "IBM-1047" },
	{ "KOI8-R",             "koi8-r" },
	{ "KOI8-U",             "koi8-r" },
	{ "GEORGIAN-PS",        "georgianps" },
	{ "TCVN5712-1",         "tcvn" },
	{ "NEXTSTEP",           "next" },
	{ "windows",            "windows-1252" }, /* backward compatibility */
	{ "CP1251",             "windows-1251" },
	{ "CP1252",             "windows-1252" },
	{ "CP1255",             "windows-1255" },
	{ NULL, NULL }
};
118  
/* Bit flags stored per byte value in chardef[]. */
#define IS_BINARY_CHAR  01
#define IS_CONTROL_CHAR 02

/* Classification flags for each of the 256 single-byte values. */
static char chardef[256];
/* Format strings selected by setfmt() from LESSBINFMT and LESSUTFBINFMT. */
static constant char *binfmt = NULL;
static constant char *utfbinfmt = NULL;
/* Attribute value; overwritten by setfmt() when a format starts with "*x". */
public int binattr = AT_STANDOUT|AT_COLOR_BIN;

/* Buffers of struct wchar_range filled by ichardef_utf() from LESSUTFCHARDEF. */
static struct xbuffer user_wide_array;
static struct xbuffer user_ubin_array;
static struct xbuffer user_compose_array;
static struct xbuffer user_prt_array;
/* Table views over the buffers above, set by wchar_range_table_set(). */
static struct wchar_range_table user_wide_table;
static struct wchar_range_table user_ubin_table;
static struct wchar_range_table user_compose_table;
static struct wchar_range_table user_prt_table;
135  
136  /*
137   * Set a wchar_range_table to the table in an xbuffer.
138   */
139  static void wchar_range_table_set(struct wchar_range_table *tbl, struct xbuffer *arr)
140  {
141  	tbl->table = (struct wchar_range *) arr->data;
142  	tbl->count = (unsigned int) (arr->end / sizeof(struct wchar_range));
143  }
144  
145  /*
146   * Skip over a "U" or "U+" prefix before a hex codepoint.
147   */
148  static constant char * skip_uprefix(constant char *s)
149  {
150  	if (*s == 'U' || *s == 'u')
151  		if (*++s == '+') ++s;
152  	return s;
153  }
154  
155  /*
156   * Parse a dash-separated range of hex values.
157   */
158  static void wchar_range_get(constant char **ss, struct wchar_range *range)
159  {
160  	constant char *s = skip_uprefix(*ss);
161  	range->first = lstrtoulc(s, &s, 16);
162  	if (s[0] == '-')
163  	{
164  		s = skip_uprefix(&s[1]);
165  		range->last = lstrtoulc(s, &s, 16);
166  	} else
167  	{
168  		range->last = range->first;
169  	}
170  	*ss = s;
171  }
172  
173  /*
174   * Parse the LESSUTFCHARDEF variable.
175   */
176  static void ichardef_utf(constant char *s)
177  {
178  	xbuf_init(&user_wide_array);
179  	xbuf_init(&user_ubin_array);
180  	xbuf_init(&user_compose_array);
181  	xbuf_init(&user_prt_array);
182  
183  	if (s != NULL)
184  	{
185  		while (s[0] != '\0')
186  		{
187  			struct wchar_range range;
188  			wchar_range_get(&s, &range);
189  			if (range.last == 0)
190  			{
191  				error("invalid hex number(s) in LESSUTFCHARDEF", NULL_PARG);
192  				quit(QUIT_ERROR);
193  			}
194  			if (*s++ != ':')
195  			{
196  				error("missing colon in LESSUTFCHARDEF", NULL_PARG);
197  				quit(QUIT_ERROR);
198  			}
199  			switch (*s++)
200  			{
201  			case 'b':
202  				xbuf_add_data(&user_ubin_array, (unsigned char *) &range, sizeof(range));
203  				break;
204  			case 'c':
205  				xbuf_add_data(&user_compose_array, (unsigned char *) &range, sizeof(range));
206  				break;
207  			case 'w':
208  				xbuf_add_data(&user_wide_array, (unsigned char *) &range, sizeof(range));
209  				xbuf_add_data(&user_prt_array, (unsigned char *) &range, sizeof(range));
210  				break;
211  			case 'p': case '.':
212  				xbuf_add_data(&user_prt_array, (unsigned char *) &range, sizeof(range));
213  				break;
214  			case '\0':
215  				s--;
216  				break;
217  			default:
218  				/* Ignore unknown character attribute. */
219  				break;
220  			}
221  			if (s[0] == ',') ++s;
222  		}
223  	}
224  	wchar_range_table_set(&user_wide_table, &user_wide_array);
225  	wchar_range_table_set(&user_ubin_table, &user_ubin_array);
226  	wchar_range_table_set(&user_compose_table, &user_compose_array);
227  	wchar_range_table_set(&user_prt_table, &user_prt_array);
228  }
229  
230  /*
231   * Define a charset, given a description string.
232   * The string consists of 256 letters,
233   * one for each character in the charset.
234   * If the string is shorter than 256 letters, missing letters
235   * are taken to be identical to the last one.
236   * A decimal number followed by a letter is taken to be a
237   * repetition of the letter.
238   *
239   * Each letter is one of:
240   *      . normal character
241   *      b binary character
242   *      c control character
243   */
244  static void ichardef(constant char *s)
245  {
246  	char *cp;
247  	int n;
248  	char v;
249  
250  	n = 0;
251  	v = 0;
252  	cp = chardef;
253  	while (*s != '\0')
254  	{
255  		switch (*s++)
256  		{
257  		case '.':
258  			v = 0;
259  			break;
260  		case 'c':
261  			v = IS_CONTROL_CHAR;
262  			break;
263  		case 'b':
264  			v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
265  			break;
266  
267  		case '0': case '1': case '2': case '3': case '4':
268  		case '5': case '6': case '7': case '8': case '9':
269  			if (ckd_mul(&n, n, 10) || ckd_add(&n, n, s[-1] - '0'))
270  				goto invalid_chardef;
271  			continue;
272  
273  		default:
274  		invalid_chardef:
275  			error("invalid chardef", NULL_PARG);
276  			quit(QUIT_ERROR);
277  			/*NOTREACHED*/
278  		}
279  
280  		do
281  		{
282  			if (cp >= chardef + sizeof(chardef))
283  			{
284  				error("chardef longer than 256", NULL_PARG);
285  				quit(QUIT_ERROR);
286  				/*NOTREACHED*/
287  			}
288  			*cp++ = v;
289  		} while (--n > 0);
290  		n = 0;
291  	}
292  
293  	while (cp < chardef + sizeof(chardef))
294  		*cp++ = v;
295  }
296  
297  /*
298   * Define a charset, given a charset name.
299   * The valid charset names are listed in the "charsets" array.
300   */
301  static int icharset(constant char *name, int no_error)
302  {
303  	struct charset *p;
304  	struct cs_alias *a;
305  
306  	if (name == NULL || *name == '\0')
307  		return (0);
308  
309  	/* First see if the name is an alias. */
310  	for (a = cs_aliases;  a->name != NULL;  a++)
311  	{
312  		if (strcmp(name, a->name) == 0)
313  		{
314  			name = a->oname;
315  			break;
316  		}
317  	}
318  
319  	for (p = charsets;  p->name != NULL;  p++)
320  	{
321  		if (strcmp(name, p->name) == 0)
322  		{
323  			ichardef(p->desc);
324  			if (p->p_flag != NULL)
325  			{
326  #if MSDOS_COMPILER==WIN32C
327  				*(p->p_flag) = 1 + (GetConsoleOutputCP() != CP_UTF8);
328  #else
329  				*(p->p_flag) = 1;
330  #endif
331  			}
332  			return (1);
333  		}
334  	}
335  
336  	if (!no_error) {
337  		error("invalid charset name", NULL_PARG);
338  		quit(QUIT_ERROR);
339  	}
340  	return (0);
341  }
342  
#if HAVE_LOCALE
/*
 * Fill chardef[] by asking the C library's ctype functions
 * (which follow the current locale) about every byte value:
 * printable -> normal, control -> control, anything else -> binary.
 */
static void ilocale(void)
{
	int c = 0;

	while (c < (int) sizeof(chardef))
	{
		char flags;

		if (isprint(c))
			flags = 0;
		else if (iscntrl(c))
			flags = IS_CONTROL_CHAR;
		else
			flags = IS_BINARY_CHAR|IS_CONTROL_CHAR;
		chardef[c] = flags;
		c++;
	}
}
#endif
362  
363  /*
364   * Define the printing format for control (or binary utf) chars.
365   */
366  public void setfmt(constant char *s, constant char **fmtvarptr, int *attrptr, constant char *default_fmt, lbool for_printf)
367  {
368  	if (s == NULL || *s == '\0')
369  		s = default_fmt;
370  	else if (for_printf &&
371  	    ((*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) ||
372  	     (*s != '*' && strchr(s, 'n'))))
373  		/* %n is evil */
374  		s = default_fmt;
375  
376  	/*
377  	 * Select the attributes if it starts with "*".
378  	 */
379  	if (*s == '*' && s[1] != '\0')
380  	{
381  		switch (s[1])
382  		{
383  		case 'd':  *attrptr = AT_BOLD;      break;
384  		case 'k':  *attrptr = AT_BLINK;     break;
385  		case 's':  *attrptr = AT_STANDOUT;  break;
386  		case 'u':  *attrptr = AT_UNDERLINE; break;
387  		default:   *attrptr = AT_NORMAL;    break;
388  		}
389  		s += 2;
390  	}
391  	*fmtvarptr = s;
392  }
393  
/*
 * Choose the character set definition.
 * LESSUTFCHARDEF is always parsed first.  Then these sources are tried
 * in order, stopping at the first one that succeeds:
 *   1. LESSCHARSET: a charset name (an unknown name is a fatal error).
 *   2. LESSCHARDEF: a description string handed to ichardef().
 *   3. nl_langinfo(CODESET), if HAVE_LOCALE and CODESET are defined.
 *   4. If HAVE_STRSTR: "utf-8" when the first of LC_ALL, LC_CTYPE, LANG
 *      that is set contains "UTF-8", "utf-8", "UTF8" or "utf8".
 *   5. If HAVE_LOCALE: the ctype functions, via ilocale().
 *      Otherwise a built-in charset: "dos" for MSDOS_COMPILER builds
 *      other than WIN32C, "utf-8" for everything else.
 */
static void set_charset(void)
{
	constant char *s;

	ichardef_utf(lgetenv("LESSUTFCHARDEF"));

	/*
	 * See if environment variable LESSCHARSET is defined.
	 */
	s = lgetenv("LESSCHARSET");
	if (icharset(s, 0))
		return;

	/*
	 * LESSCHARSET is not defined: try LESSCHARDEF.
	 */
	s = lgetenv("LESSCHARDEF");
	if (!isnullenv(s))
	{
		ichardef(s);
		return;
	}

#if HAVE_LOCALE
#ifdef CODESET
	/*
	 * Try using the codeset name as the charset name.
	 * An unrecognized codeset is not an error; fall through instead.
	 */
	s = nl_langinfo(CODESET);
	if (icharset(s, 1))
		return;
#endif
#endif

#if HAVE_STRSTR
	/*
	 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
	 * Only the first of the three that is set is examined.
	 */
	if ((s = lgetenv("LC_ALL")) != NULL ||
	    (s = lgetenv("LC_CTYPE")) != NULL ||
	    (s = lgetenv("LANG")) != NULL)
	{
		if (   strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL
		    || strstr(s, "UTF8")  != NULL || strstr(s, "utf8")  != NULL)
			if (icharset("utf-8", 1))
				return;
	}
#endif

#if HAVE_LOCALE
	/*
	 * Get character definitions from locale functions,
	 * rather than from predefined charset entry.
	 */
	ilocale();
#else
#if MSDOS_COMPILER
#if MSDOS_COMPILER==WIN32C
	(void) icharset("utf-8", 1);
#else
	(void) icharset("dos", 1);
#endif
#else
	(void) icharset("utf-8", 1);
#endif
#endif
}
464  
465  /*
466   * Initialize charset data structures.
467   */
468  public void init_charset(void)
469  {
470  	constant char *s;
471  
472  #if HAVE_LOCALE
473  	setlocale(LC_ALL, "");
474  #endif
475  
476  	set_charset();
477  
478  	s = lgetenv("LESSBINFMT");
479  	setfmt(s, &binfmt, &binattr, "*s<%02X>", TRUE);
480  
481  	s = lgetenv("LESSUTFBINFMT");
482  	setfmt(s, &utfbinfmt, &binattr, "<U+%04lX>", TRUE);
483  }
484  
485  /*
486   * Is a given character a "binary" character?
487   */
488  public lbool binary_char(LWCHAR c)
489  {
490  	if (utf_mode)
491  		return (is_ubin_char(c));
492  	if (c >= sizeof(chardef))
493  		return TRUE;
494  	return ((chardef[c] & IS_BINARY_CHAR) != 0);
495  }
496  
497  /*
498   * Is a given character a "control" character?
499   */
500  public lbool control_char(LWCHAR c)
501  {
502  	if (c >= sizeof(chardef))
503  		return TRUE;
504  	return (chardef[c] & IS_CONTROL_CHAR);
505  }
506  
507  /*
508   * Return the printable form of a character.
509   * For example, in the "ascii" charset '\3' is printed as "^C".
510   */
511  public constant char * prchar(LWCHAR c)
512  {
513  	/* {{ Fixed buffer size means LESSBINFMT etc can be truncated. }} */
514  	static char buf[MAX_PRCHAR_LEN+1];
515  
516  	c &= 0377; /*{{type-issue}}*/
517  	if ((c < 128 || !utf_mode) && !control_char(c))
518  		SNPRINTF1(buf, sizeof(buf), "%c", (int) c);
519  	else if (c == ESC)
520  		strcpy(buf, "ESC");
521  #if IS_EBCDIC_HOST
522  	else if (!binary_char(c) && c < 64)
523  		SNPRINTF1(buf, sizeof(buf), "^%c",
524  		/*
525  		 * This array roughly inverts CONTROL() #defined in less.h,
526  		 * and should be kept in sync with CONTROL() and IBM-1047.
527  		 */
528  		"@ABC.I.?...KLMNO"
529  		"PQRS.JH.XY.."
530  		"\\]^_"
531  		"......W[.....EFG"
532  		"..V....D....TU.Z"[c]);
533  #else
534  	else if (c < 128 && !control_char(c ^ 0100))
535  		SNPRINTF1(buf, sizeof(buf), "^%c", (int) (c ^ 0100));
536  #endif
537  	else
538  		SNPRINTF1(buf, sizeof(buf), binfmt, c);
539  	return (buf);
540  }
541  
542  /*
543   * Return the printable form of a UTF-8 character.
544   */
545  public constant char * prutfchar(LWCHAR ch)
546  {
547  	static char buf[MAX_PRCHAR_LEN+1];
548  
549  	if (ch == ESC)
550  		strcpy(buf, "ESC");
551  	else if (ch < 128 && control_char(ch))
552  	{
553  		if (!control_char(ch ^ 0100))
554  			SNPRINTF1(buf, sizeof(buf), "^%c", ((char) ch) ^ 0100);
555  		else
556  			SNPRINTF1(buf, sizeof(buf), binfmt, (char) ch);
557  	} else if (is_ubin_char(ch))
558  	{
559  		SNPRINTF1(buf, sizeof(buf), utfbinfmt, ch);
560  	} else
561  	{
562  		char *p = buf;
563  		if (ch >= 0x80000000)
564  			ch = 0xFFFD; /* REPLACEMENT CHARACTER */
565  		put_wchar(&p, ch);
566  		*p = '\0';
567  	}
568  	return (buf);
569  }
570  
571  /*
572   * Get the length of a UTF-8 character in bytes.
573   */
574  public int utf_len(char ch)
575  {
576  	if ((ch & 0x80) == 0)
577  		return 1;
578  	if ((ch & 0xE0) == 0xC0)
579  		return 2;
580  	if ((ch & 0xF0) == 0xE0)
581  		return 3;
582  	if ((ch & 0xF8) == 0xF0)
583  		return 4;
584  #if 0
585  	if ((ch & 0xFC) == 0xF8)
586  		return 5;
587  	if ((ch & 0xFE) == 0xFC)
588  		return 6;
589  #endif
590  	/* Invalid UTF-8 encoding. */
591  	return 1;
592  }
593  
594  /*
595   * Does the parameter point to the lead byte of a well-formed UTF-8 character?
596   */
597  public lbool is_utf8_well_formed(constant char *ss, int slen)
598  {
599  	int i;
600  	int len;
601  	unsigned char s0 = (unsigned char) ss[0];
602  
603  	if (IS_UTF8_INVALID(s0))
604  		return (FALSE);
605  
606  	len = utf_len(ss[0]);
607  	if (len > slen)
608  		return (FALSE);
609  	if (len == 1)
610  		return (TRUE);
611  	if (len == 2)
612  	{
613  		if (s0 < 0xC2)
614  			return (FALSE);
615  	} else
616  	{
617  		unsigned char mask = (unsigned char) (~((1 << (8-len)) - 1));
618  		if (s0 == mask && (ss[1] & mask) == 0x80)
619  			return (FALSE);
620  	}
621  
622  	for (i = 1;  i < len;  i++)
623  		if (!IS_UTF8_TRAIL(ss[i]))
624  			return (FALSE);
625  	return (TRUE);
626  }
627  
628  /*
629   * Skip bytes until a UTF-8 lead byte (11xxxxxx) or ASCII byte (0xxxxxxx) is found.
630   */
631  public void utf_skip_to_lead(constant char **pp, constant char *limit)
632  {
633  	do {
634  		++(*pp);
635  	} while (*pp < limit && !IS_UTF8_LEAD((*pp)[0] & 0377) && !IS_ASCII_OCTET((*pp)[0]));
636  }
637  
638  
639  /*
640   * Get the value of a UTF-8 character.
641   */
642  public LWCHAR get_wchar(constant char *sp)
643  {
644  	constant unsigned char *p = (constant unsigned char *) sp;
645  	switch (utf_len(sp[0]))
646  	{
647  	case 1:
648  	default:
649  		/* 0xxxxxxx */
650  		return (LWCHAR)
651  			(p[0] & 0xFF);
652  	case 2:
653  		/* 110xxxxx 10xxxxxx */
654  		return (LWCHAR) (
655  			((p[0] & 0x1F) << 6) |
656  			(p[1] & 0x3F));
657  	case 3:
658  		/* 1110xxxx 10xxxxxx 10xxxxxx */
659  		return (LWCHAR) (
660  			((p[0] & 0x0F) << 12) |
661  			((p[1] & 0x3F) << 6) |
662  			(p[2] & 0x3F));
663  	case 4:
664  		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
665  		return (LWCHAR) (
666  			((p[0] & 0x07) << 18) |
667  			((p[1] & 0x3F) << 12) |
668  			((p[2] & 0x3F) << 6) |
669  			(p[3] & 0x3F));
670  #if 0
671  	case 5:
672  		/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
673  		return (LWCHAR) (
674  			((p[0] & 0x03) << 24) |
675  			((p[1] & 0x3F) << 18) |
676  			((p[2] & 0x3F) << 12) |
677  			((p[3] & 0x3F) << 6) |
678  			(p[4] & 0x3F));
679  	case 6:
680  		/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
681  		return (LWCHAR) (
682  			((p[0] & 0x01) << 30) |
683  			((p[1] & 0x3F) << 24) |
684  			((p[2] & 0x3F) << 18) |
685  			((p[3] & 0x3F) << 12) |
686  			((p[4] & 0x3F) << 6) |
687  			(p[5] & 0x3F));
688  #endif
689  	}
690  }
691  
692  /*
693   * Store a character into a UTF-8 string.
694   */
695  public void put_wchar(mutable char **pp, LWCHAR ch)
696  {
697  	if (!utf_mode || ch < 0x80)
698  	{
699  		/* 0xxxxxxx */
700  		*(*pp)++ = (char) ch;
701  	} else if (ch < 0x800)
702  	{
703  		/* 110xxxxx 10xxxxxx */
704  		*(*pp)++ = (char) (0xC0 | ((ch >> 6) & 0x1F));
705  		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
706  	} else if (ch < 0x10000)
707  	{
708  		/* 1110xxxx 10xxxxxx 10xxxxxx */
709  		*(*pp)++ = (char) (0xE0 | ((ch >> 12) & 0x0F));
710  		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
711  		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
712  	} else if (ch < 0x200000)
713  	{
714  		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
715  		*(*pp)++ = (char) (0xF0 | ((ch >> 18) & 0x07));
716  		*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
717  		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
718  		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
719  #if 0
720  	} else if (ch < 0x4000000)
721  	{
722  		/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
723  		*(*pp)++ = (char) (0xF0 | ((ch >> 24) & 0x03));
724  		*(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F));
725  		*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
726  		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
727  		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
728  	} else
729  	{
730  		/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
731  		*(*pp)++ = (char) (0xF0 | ((ch >> 30) & 0x01));
732  		*(*pp)++ = (char) (0x80 | ((ch >> 24) & 0x3F));
733  		*(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F));
734  		*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
735  		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
736  		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
737  #endif
738  	}
739  }
740  
741  /*
742   * Step forward or backward one character in a string.
743   */
744  public LWCHAR step_charc(constant char **pp, signed int dir, constant char *limit)
745  {
746  	LWCHAR ch;
747  	int len;
748  	constant char *p = *pp;
749  
750  	if (!utf_mode)
751  	{
752  		/* It's easy if chars are one byte. */
753  		if (dir > 0)
754  			ch = (LWCHAR) (unsigned char) ((p < limit) ? *p++ : 0);
755  		else
756  			ch = (LWCHAR) (unsigned char) ((p > limit) ? *--p : 0);
757  	} else if (dir > 0)
758  	{
759  		len = utf_len(*p);
760  		if (p + len > limit)
761  		{
762  			ch = 0;
763  			p = (char *) limit;
764  		} else
765  		{
766  			ch = get_wchar(p);
767  			p += len;
768  		}
769  	} else
770  	{
771  		while (p > limit && IS_UTF8_TRAIL(p[-1]))
772  			p--;
773  		if (p > limit)
774  			ch = get_wchar(--p);
775  		else
776  			ch = 0;
777  	}
778  	*pp = p;
779  	return ch;
780  }
781  
782  public LWCHAR step_char(char **pp, signed int dir, constant char *limit)
783  {
784  	constant char *p = (constant char *) *pp;
785  	LWCHAR ch = step_charc(&p, dir, limit);
786  	*pp = (char *) p;
787  	return ch;
788  }
789  
/*
 * Unicode characters data
 * Actual data is in the generated *.uni files.
 */

/*
 * DECLARE_RANGE_TABLE_START(name) opens the initializer of a static
 * array called name_array; the #included .uni file supplies the elements.
 * DECLARE_RANGE_TABLE_END(name) closes that initializer and defines
 * name_table, a wchar_range_table referring to the array and its
 * element count.
 */
#define DECLARE_RANGE_TABLE_START(name) \
	static struct wchar_range name##_array[] = {
#define DECLARE_RANGE_TABLE_END(name) \
	}; struct wchar_range_table name##_table = { name##_array, countof(name##_array) };

/* compose_table: consulted by is_composing_char(). */
DECLARE_RANGE_TABLE_START(compose)
#include "compose.uni"
DECLARE_RANGE_TABLE_END(compose)

/* ubin_table: consulted by is_ubin_char(). */
DECLARE_RANGE_TABLE_START(ubin)
#include "ubin.uni"
DECLARE_RANGE_TABLE_END(ubin)

/* wide_table: consulted by is_wide_char(). */
DECLARE_RANGE_TABLE_START(wide)
#include "wide.uni"
DECLARE_RANGE_TABLE_END(wide)

/*
 * fmt_table: counted as composing by is_composing_char() unless
 * bs_mode is BS_CONTROL, in which case is_ubin_char() counts it as binary.
 */
DECLARE_RANGE_TABLE_START(fmt)
#include "fmt.uni"
DECLARE_RANGE_TABLE_END(fmt)
815  
/*
 * comb_table is special pairs, not ranges.
 * For each entry, is_combining_char() matches its first argument
 * against .first and its second argument against .last.
 */
static struct wchar_range comb_table[] = {
	{0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627},
};
820  
821  
822  static lbool is_in_table(LWCHAR ch, struct wchar_range_table *table)
823  {
824  	unsigned int hi;
825  	unsigned int lo;
826  
827  	/* Binary search in the table. */
828  	if (table->table == NULL || table->count == 0 || ch < table->table[0].first)
829  		return FALSE;
830  	lo = 0;
831  	hi = table->count - 1;
832  	while (lo <= hi)
833  	{
834  		unsigned int mid = (lo + hi) / 2;
835  		if (ch > table->table[mid].last)
836  			lo = mid + 1;
837  		else if (ch < table->table[mid].first)
838  			hi = mid - 1;
839  		else
840  			return TRUE;
841  	}
842  	return FALSE;
843  }
844  
845  /*
846   * Is a character a UTF-8 composing character?
847   * If a composing character follows any char, the two combine into one glyph.
848   */
849  public lbool is_composing_char(LWCHAR ch)
850  {
851  	if (is_in_table(ch, &user_prt_table)) return FALSE;
852  	return is_in_table(ch, &user_compose_table) ||
853  	       is_in_table(ch, &compose_table) ||
854  	       (bs_mode != BS_CONTROL && is_in_table(ch, &fmt_table));
855  }
856  
857  /*
858   * Should this UTF-8 character be treated as binary?
859   */
860  public lbool is_ubin_char(LWCHAR ch)
861  {
862  	if (is_in_table(ch, &user_prt_table)) return FALSE;
863  	return is_in_table(ch, &user_ubin_table) ||
864  	       is_in_table(ch, &ubin_table) ||
865  	       (bs_mode == BS_CONTROL && is_in_table(ch, &fmt_table));
866  }
867  
868  /*
869   * Is this a double width UTF-8 character?
870   */
871  public lbool is_wide_char(LWCHAR ch)
872  {
873  	return is_in_table(ch, &user_wide_table) ||
874  	       is_in_table(ch, &wide_table);
875  }
876  
877  /*
878   * Is a character a UTF-8 combining character?
879   * A combining char acts like an ordinary char, but if it follows
880   * a specific char (not any char), the two combine into one glyph.
881   */
882  public lbool is_combining_char(LWCHAR ch1, LWCHAR ch2)
883  {
884  	/* The table is small; use linear search. */
885  	int i;
886  	for (i = 0;  i < countof(comb_table);  i++)
887  	{
888  		if (ch1 == comb_table[i].first &&
889  		    ch2 == comb_table[i].last)
890  			return TRUE;
891  	}
892  	return FALSE;
893  }
894  
895