/*
 * Copyright (C) 1984-2024  Mark Nudelman
 *
 * You may distribute under the terms of either the GNU General Public
 * License or the Less License, as specified in the README file.
 *
 * For more information, see the README file.
 */
91133e27eSPeter Avalos
101133e27eSPeter Avalos
/*
 * Functions to define the character set
 * and do things specific to the character set.
 */
151133e27eSPeter Avalos
161133e27eSPeter Avalos #include "less.h"
171133e27eSPeter Avalos #if HAVE_LOCALE
181133e27eSPeter Avalos #include <locale.h>
191133e27eSPeter Avalos #include <ctype.h>
201133e27eSPeter Avalos #include <langinfo.h>
211133e27eSPeter Avalos #endif
221133e27eSPeter Avalos
231133e27eSPeter Avalos #include "charset.h"
24320d7c8aSAaron LI #include "xbuf.h"
251133e27eSPeter Avalos
2602d62a0fSDaniel Fojt #if MSDOS_COMPILER==WIN32C
2702d62a0fSDaniel Fojt #define WIN32_LEAN_AND_MEAN
2802d62a0fSDaniel Fojt #include <windows.h>
2902d62a0fSDaniel Fojt #endif
3002d62a0fSDaniel Fojt
3102d62a0fSDaniel Fojt extern int bs_mode;
3202d62a0fSDaniel Fojt
331133e27eSPeter Avalos public int utf_mode = 0;
341133e27eSPeter Avalos
351133e27eSPeter Avalos /*
361133e27eSPeter Avalos * Predefined character sets,
371133e27eSPeter Avalos * selected by the LESSCHARSET environment variable.
381133e27eSPeter Avalos */
391133e27eSPeter Avalos struct charset {
401133e27eSPeter Avalos char *name;
411133e27eSPeter Avalos int *p_flag;
421133e27eSPeter Avalos char *desc;
431133e27eSPeter Avalos } charsets[] = {
441133e27eSPeter Avalos { "ascii", NULL, "8bcccbcc18b95.b" },
451133e27eSPeter Avalos { "utf-8", &utf_mode, "8bcccbcc18b95.b126.bb" },
461133e27eSPeter Avalos { "iso8859", NULL, "8bcccbcc18b95.33b." },
471133e27eSPeter Avalos { "latin3", NULL, "8bcccbcc18b95.33b5.b8.b15.b4.b12.b18.b12.b." },
481133e27eSPeter Avalos { "arabic", NULL, "8bcccbcc18b95.33b.3b.7b2.13b.3b.b26.5b19.b" },
491133e27eSPeter Avalos { "greek", NULL, "8bcccbcc18b95.33b4.2b4.b3.b35.b44.b" },
501133e27eSPeter Avalos { "greek2005", NULL, "8bcccbcc18b95.33b14.b35.b44.b" },
511133e27eSPeter Avalos { "hebrew", NULL, "8bcccbcc18b95.33b.b29.32b28.2b2.b" },
521133e27eSPeter Avalos { "koi8-r", NULL, "8bcccbcc18b95.b." },
531133e27eSPeter Avalos { "KOI8-T", NULL, "8bcccbcc18b95.b8.b6.b8.b.b.5b7.3b4.b4.b3.b.b.3b." },
541133e27eSPeter Avalos { "georgianps", NULL, "8bcccbcc18b95.3b11.4b12.2b." },
551133e27eSPeter Avalos { "tcvn", NULL, "b..b...bcccbccbbb7.8b95.b48.5b." },
561133e27eSPeter Avalos { "TIS-620", NULL, "8bcccbcc18b95.b.4b.11b7.8b." },
571133e27eSPeter Avalos { "next", NULL, "8bcccbcc18b95.bb125.bb" },
581133e27eSPeter Avalos { "dos", NULL, "8bcccbcc12bc5b95.b." },
591133e27eSPeter Avalos { "windows-1251", NULL, "8bcccbcc12bc5b95.b24.b." },
601133e27eSPeter Avalos { "windows-1252", NULL, "8bcccbcc12bc5b95.b.b11.b.2b12.b." },
611133e27eSPeter Avalos { "windows-1255", NULL, "8bcccbcc12bc5b95.b.b8.b.5b9.b.4b." },
621133e27eSPeter Avalos { "ebcdic", NULL, "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
631133e27eSPeter Avalos { "IBM-1047", NULL, "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
641133e27eSPeter Avalos { NULL, NULL, NULL }
651133e27eSPeter Avalos };
661133e27eSPeter Avalos
/*
 * Support "locale charmap"/nl_langinfo(CODESET) values, as well as others.
 *
 * Each entry maps an alternative spelling (name) to the name of an
 * entry in charsets[] (oname).  icharset() compares names
 * case-sensitively, which is why several spellings of the same charset
 * are listed.  The table is terminated by an entry whose name is NULL.
 */
struct cs_alias {
	char *name;
	char *oname;
} cs_aliases[] = {
	{ "UTF-8",          "utf-8" },
	{ "utf8",           "utf-8" },
	{ "UTF8",           "utf-8" },
	{ "ANSI_X3.4-1968", "ascii" },
	{ "US-ASCII",       "ascii" },
	{ "latin1",         "iso8859" },
	{ "ISO-8859-1",     "iso8859" },
	{ "latin9",         "iso8859" },
	{ "ISO-8859-15",    "iso8859" },
	{ "latin2",         "iso8859" },
	{ "ISO-8859-2",     "iso8859" },
	{ "ISO-8859-3",     "latin3" },
	{ "latin4",         "iso8859" },
	{ "ISO-8859-4",     "iso8859" },
	{ "cyrillic",       "iso8859" },
	{ "ISO-8859-5",     "iso8859" },
	{ "ISO-8859-6",     "arabic" },
	{ "ISO-8859-7",     "greek" },
	{ "IBM9005",        "greek2005" },
	{ "ISO-8859-8",     "hebrew" },
	{ "latin5",         "iso8859" },
	{ "ISO-8859-9",     "iso8859" },
	{ "latin6",         "iso8859" },
	{ "ISO-8859-10",    "iso8859" },
	{ "latin7",         "iso8859" },
	{ "ISO-8859-13",    "iso8859" },
	{ "latin8",         "iso8859" },
	{ "ISO-8859-14",    "iso8859" },
	{ "latin10",        "iso8859" },
	{ "ISO-8859-16",    "iso8859" },
	{ "IBM437",         "dos" },
	{ "EBCDIC-US",      "ebcdic" },
	{ "IBM1047",        "IBM-1047" },
	{ "KOI8-R",         "koi8-r" },
	{ "KOI8-U",         "koi8-r" },
	{ "GEORGIAN-PS",    "georgianps" },
	{ "TCVN5712-1",     "tcvn" },
	{ "NEXTSTEP",       "next" },
	{ "windows",        "windows-1252" }, /* backward compatibility */
	{ "CP1251",         "windows-1251" },
	{ "CP1252",         "windows-1252" },
	{ "CP1255",         "windows-1255" },
	{ NULL, NULL }
};
1181133e27eSPeter Avalos
1191133e27eSPeter Avalos #define IS_BINARY_CHAR 01
1201133e27eSPeter Avalos #define IS_CONTROL_CHAR 02
1211133e27eSPeter Avalos
1221133e27eSPeter Avalos static char chardef[256];
123*e433da38SAaron LI static constant char *binfmt = NULL;
124*e433da38SAaron LI static constant char *utfbinfmt = NULL;
1250c7ad07eSAntonio Huete Jimenez public int binattr = AT_STANDOUT|AT_COLOR_BIN;
1261133e27eSPeter Avalos
127320d7c8aSAaron LI static struct xbuffer user_wide_array;
128320d7c8aSAaron LI static struct xbuffer user_ubin_array;
129320d7c8aSAaron LI static struct xbuffer user_compose_array;
130320d7c8aSAaron LI static struct xbuffer user_prt_array;
131320d7c8aSAaron LI static struct wchar_range_table user_wide_table;
132320d7c8aSAaron LI static struct wchar_range_table user_ubin_table;
133320d7c8aSAaron LI static struct wchar_range_table user_compose_table;
134320d7c8aSAaron LI static struct wchar_range_table user_prt_table;
135320d7c8aSAaron LI
136320d7c8aSAaron LI /*
137320d7c8aSAaron LI * Set a wchar_range_table to the table in an xbuffer.
138320d7c8aSAaron LI */
wchar_range_table_set(struct wchar_range_table * tbl,struct xbuffer * arr)139320d7c8aSAaron LI static void wchar_range_table_set(struct wchar_range_table *tbl, struct xbuffer *arr)
140320d7c8aSAaron LI {
141320d7c8aSAaron LI tbl->table = (struct wchar_range *) arr->data;
142*e433da38SAaron LI tbl->count = (unsigned int) (arr->end / sizeof(struct wchar_range));
143320d7c8aSAaron LI }
144320d7c8aSAaron LI
145320d7c8aSAaron LI /*
146320d7c8aSAaron LI * Skip over a "U" or "U+" prefix before a hex codepoint.
147320d7c8aSAaron LI */
skip_uprefix(constant char * s)148*e433da38SAaron LI static constant char * skip_uprefix(constant char *s)
149320d7c8aSAaron LI {
150320d7c8aSAaron LI if (*s == 'U' || *s == 'u')
151320d7c8aSAaron LI if (*++s == '+') ++s;
152320d7c8aSAaron LI return s;
153320d7c8aSAaron LI }
154320d7c8aSAaron LI
155320d7c8aSAaron LI /*
156320d7c8aSAaron LI * Parse a dash-separated range of hex values.
157320d7c8aSAaron LI */
wchar_range_get(constant char ** ss,struct wchar_range * range)158*e433da38SAaron LI static void wchar_range_get(constant char **ss, struct wchar_range *range)
159320d7c8aSAaron LI {
160*e433da38SAaron LI constant char *s = skip_uprefix(*ss);
161*e433da38SAaron LI range->first = lstrtoulc(s, &s, 16);
162320d7c8aSAaron LI if (s[0] == '-')
163320d7c8aSAaron LI {
164320d7c8aSAaron LI s = skip_uprefix(&s[1]);
165*e433da38SAaron LI range->last = lstrtoulc(s, &s, 16);
166320d7c8aSAaron LI } else
167320d7c8aSAaron LI {
168320d7c8aSAaron LI range->last = range->first;
169320d7c8aSAaron LI }
170320d7c8aSAaron LI *ss = s;
171320d7c8aSAaron LI }
172320d7c8aSAaron LI
173320d7c8aSAaron LI /*
174320d7c8aSAaron LI * Parse the LESSUTFCHARDEF variable.
175320d7c8aSAaron LI */
ichardef_utf(constant char * s)176*e433da38SAaron LI static void ichardef_utf(constant char *s)
177320d7c8aSAaron LI {
178320d7c8aSAaron LI xbuf_init(&user_wide_array);
179320d7c8aSAaron LI xbuf_init(&user_ubin_array);
180320d7c8aSAaron LI xbuf_init(&user_compose_array);
181320d7c8aSAaron LI xbuf_init(&user_prt_array);
182320d7c8aSAaron LI
183320d7c8aSAaron LI if (s != NULL)
184320d7c8aSAaron LI {
185320d7c8aSAaron LI while (s[0] != '\0')
186320d7c8aSAaron LI {
187320d7c8aSAaron LI struct wchar_range range;
188320d7c8aSAaron LI wchar_range_get(&s, &range);
189320d7c8aSAaron LI if (range.last == 0)
190320d7c8aSAaron LI {
191320d7c8aSAaron LI error("invalid hex number(s) in LESSUTFCHARDEF", NULL_PARG);
192320d7c8aSAaron LI quit(QUIT_ERROR);
193320d7c8aSAaron LI }
194320d7c8aSAaron LI if (*s++ != ':')
195320d7c8aSAaron LI {
196320d7c8aSAaron LI error("missing colon in LESSUTFCHARDEF", NULL_PARG);
197320d7c8aSAaron LI quit(QUIT_ERROR);
198320d7c8aSAaron LI }
199320d7c8aSAaron LI switch (*s++)
200320d7c8aSAaron LI {
201320d7c8aSAaron LI case 'b':
202320d7c8aSAaron LI xbuf_add_data(&user_ubin_array, (unsigned char *) &range, sizeof(range));
203320d7c8aSAaron LI break;
204320d7c8aSAaron LI case 'c':
205320d7c8aSAaron LI xbuf_add_data(&user_compose_array, (unsigned char *) &range, sizeof(range));
206320d7c8aSAaron LI break;
207320d7c8aSAaron LI case 'w':
208320d7c8aSAaron LI xbuf_add_data(&user_wide_array, (unsigned char *) &range, sizeof(range));
209320d7c8aSAaron LI xbuf_add_data(&user_prt_array, (unsigned char *) &range, sizeof(range));
210320d7c8aSAaron LI break;
211320d7c8aSAaron LI case 'p': case '.':
212320d7c8aSAaron LI xbuf_add_data(&user_prt_array, (unsigned char *) &range, sizeof(range));
213320d7c8aSAaron LI break;
214320d7c8aSAaron LI case '\0':
215320d7c8aSAaron LI s--;
216320d7c8aSAaron LI break;
217320d7c8aSAaron LI default:
218320d7c8aSAaron LI /* Ignore unknown character attribute. */
219320d7c8aSAaron LI break;
220320d7c8aSAaron LI }
221320d7c8aSAaron LI if (s[0] == ',') ++s;
222320d7c8aSAaron LI }
223320d7c8aSAaron LI }
224320d7c8aSAaron LI wchar_range_table_set(&user_wide_table, &user_wide_array);
225320d7c8aSAaron LI wchar_range_table_set(&user_ubin_table, &user_ubin_array);
226320d7c8aSAaron LI wchar_range_table_set(&user_compose_table, &user_compose_array);
227320d7c8aSAaron LI wchar_range_table_set(&user_prt_table, &user_prt_array);
228320d7c8aSAaron LI }
2291133e27eSPeter Avalos
2301133e27eSPeter Avalos /*
2311133e27eSPeter Avalos * Define a charset, given a description string.
2321133e27eSPeter Avalos * The string consists of 256 letters,
2331133e27eSPeter Avalos * one for each character in the charset.
2341133e27eSPeter Avalos * If the string is shorter than 256 letters, missing letters
2351133e27eSPeter Avalos * are taken to be identical to the last one.
2361133e27eSPeter Avalos * A decimal number followed by a letter is taken to be a
2371133e27eSPeter Avalos * repetition of the letter.
2381133e27eSPeter Avalos *
2391133e27eSPeter Avalos * Each letter is one of:
2401133e27eSPeter Avalos * . normal character
2411133e27eSPeter Avalos * b binary character
2421133e27eSPeter Avalos * c control character
2431133e27eSPeter Avalos */
ichardef(constant char * s)244*e433da38SAaron LI static void ichardef(constant char *s)
2451133e27eSPeter Avalos {
24602d62a0fSDaniel Fojt char *cp;
24702d62a0fSDaniel Fojt int n;
24802d62a0fSDaniel Fojt char v;
2491133e27eSPeter Avalos
2501133e27eSPeter Avalos n = 0;
2511133e27eSPeter Avalos v = 0;
2521133e27eSPeter Avalos cp = chardef;
2531133e27eSPeter Avalos while (*s != '\0')
2541133e27eSPeter Avalos {
2551133e27eSPeter Avalos switch (*s++)
2561133e27eSPeter Avalos {
2571133e27eSPeter Avalos case '.':
2581133e27eSPeter Avalos v = 0;
2591133e27eSPeter Avalos break;
2601133e27eSPeter Avalos case 'c':
2611133e27eSPeter Avalos v = IS_CONTROL_CHAR;
2621133e27eSPeter Avalos break;
2631133e27eSPeter Avalos case 'b':
2641133e27eSPeter Avalos v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
2651133e27eSPeter Avalos break;
2661133e27eSPeter Avalos
2671133e27eSPeter Avalos case '0': case '1': case '2': case '3': case '4':
2681133e27eSPeter Avalos case '5': case '6': case '7': case '8': case '9':
269320d7c8aSAaron LI if (ckd_mul(&n, n, 10) || ckd_add(&n, n, s[-1] - '0'))
270320d7c8aSAaron LI goto invalid_chardef;
2711133e27eSPeter Avalos continue;
2721133e27eSPeter Avalos
2731133e27eSPeter Avalos default:
274320d7c8aSAaron LI invalid_chardef:
2751133e27eSPeter Avalos error("invalid chardef", NULL_PARG);
2761133e27eSPeter Avalos quit(QUIT_ERROR);
2771133e27eSPeter Avalos /*NOTREACHED*/
2781133e27eSPeter Avalos }
2791133e27eSPeter Avalos
2801133e27eSPeter Avalos do
2811133e27eSPeter Avalos {
2821133e27eSPeter Avalos if (cp >= chardef + sizeof(chardef))
2831133e27eSPeter Avalos {
2841133e27eSPeter Avalos error("chardef longer than 256", NULL_PARG);
2851133e27eSPeter Avalos quit(QUIT_ERROR);
2861133e27eSPeter Avalos /*NOTREACHED*/
2871133e27eSPeter Avalos }
2881133e27eSPeter Avalos *cp++ = v;
2891133e27eSPeter Avalos } while (--n > 0);
2901133e27eSPeter Avalos n = 0;
2911133e27eSPeter Avalos }
2921133e27eSPeter Avalos
2931133e27eSPeter Avalos while (cp < chardef + sizeof(chardef))
2941133e27eSPeter Avalos *cp++ = v;
2951133e27eSPeter Avalos }
2961133e27eSPeter Avalos
2971133e27eSPeter Avalos /*
2981133e27eSPeter Avalos * Define a charset, given a charset name.
2991133e27eSPeter Avalos * The valid charset names are listed in the "charsets" array.
3001133e27eSPeter Avalos */
icharset(constant char * name,int no_error)301*e433da38SAaron LI static int icharset(constant char *name, int no_error)
3021133e27eSPeter Avalos {
30302d62a0fSDaniel Fojt struct charset *p;
30402d62a0fSDaniel Fojt struct cs_alias *a;
3051133e27eSPeter Avalos
3061133e27eSPeter Avalos if (name == NULL || *name == '\0')
3071133e27eSPeter Avalos return (0);
3081133e27eSPeter Avalos
3091133e27eSPeter Avalos /* First see if the name is an alias. */
3101133e27eSPeter Avalos for (a = cs_aliases; a->name != NULL; a++)
3111133e27eSPeter Avalos {
3121133e27eSPeter Avalos if (strcmp(name, a->name) == 0)
3131133e27eSPeter Avalos {
3141133e27eSPeter Avalos name = a->oname;
3151133e27eSPeter Avalos break;
3161133e27eSPeter Avalos }
3171133e27eSPeter Avalos }
3181133e27eSPeter Avalos
3191133e27eSPeter Avalos for (p = charsets; p->name != NULL; p++)
3201133e27eSPeter Avalos {
3211133e27eSPeter Avalos if (strcmp(name, p->name) == 0)
3221133e27eSPeter Avalos {
3231133e27eSPeter Avalos ichardef(p->desc);
3241133e27eSPeter Avalos if (p->p_flag != NULL)
32502d62a0fSDaniel Fojt {
32602d62a0fSDaniel Fojt #if MSDOS_COMPILER==WIN32C
32702d62a0fSDaniel Fojt *(p->p_flag) = 1 + (GetConsoleOutputCP() != CP_UTF8);
32802d62a0fSDaniel Fojt #else
3291133e27eSPeter Avalos *(p->p_flag) = 1;
33002d62a0fSDaniel Fojt #endif
33102d62a0fSDaniel Fojt }
3321133e27eSPeter Avalos return (1);
3331133e27eSPeter Avalos }
3341133e27eSPeter Avalos }
3351133e27eSPeter Avalos
3361133e27eSPeter Avalos if (!no_error) {
3371133e27eSPeter Avalos error("invalid charset name", NULL_PARG);
3381133e27eSPeter Avalos quit(QUIT_ERROR);
3391133e27eSPeter Avalos }
3401133e27eSPeter Avalos return (0);
3411133e27eSPeter Avalos }
3421133e27eSPeter Avalos
#if HAVE_LOCALE
/*
 * Define a charset from the current locale: classify each byte value
 * 0..255 with the <ctype.h> functions.  Printable bytes are normal,
 * other control bytes are control, and everything else is binary.
 */
static void ilocale(void)
{
	int c;

	for (c = 0; c < (int) sizeof(chardef); c++)
	{
		if (isprint(c))
			chardef[c] = 0;
		else if (iscntrl(c))
			chardef[c] = IS_CONTROL_CHAR;
		else
			chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR;
	}
}
#endif
3621133e27eSPeter Avalos
3631133e27eSPeter Avalos /*
3641133e27eSPeter Avalos * Define the printing format for control (or binary utf) chars.
3651133e27eSPeter Avalos */
setfmt(constant char * s,constant char ** fmtvarptr,int * attrptr,constant char * default_fmt,lbool for_printf)366*e433da38SAaron LI public void setfmt(constant char *s, constant char **fmtvarptr, int *attrptr, constant char *default_fmt, lbool for_printf)
3671133e27eSPeter Avalos {
368320d7c8aSAaron LI if (s == NULL || *s == '\0')
369320d7c8aSAaron LI s = default_fmt;
370320d7c8aSAaron LI else if (for_printf &&
371320d7c8aSAaron LI ((*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) ||
372320d7c8aSAaron LI (*s != '*' && strchr(s, 'n'))))
3731133e27eSPeter Avalos /* %n is evil */
3741133e27eSPeter Avalos s = default_fmt;
3751133e27eSPeter Avalos
3761133e27eSPeter Avalos /*
3771133e27eSPeter Avalos * Select the attributes if it starts with "*".
3781133e27eSPeter Avalos */
37902d62a0fSDaniel Fojt if (*s == '*' && s[1] != '\0')
3801133e27eSPeter Avalos {
3811133e27eSPeter Avalos switch (s[1])
3821133e27eSPeter Avalos {
38302d62a0fSDaniel Fojt case 'd': *attrptr = AT_BOLD; break;
38402d62a0fSDaniel Fojt case 'k': *attrptr = AT_BLINK; break;
38502d62a0fSDaniel Fojt case 's': *attrptr = AT_STANDOUT; break;
38602d62a0fSDaniel Fojt case 'u': *attrptr = AT_UNDERLINE; break;
38702d62a0fSDaniel Fojt default: *attrptr = AT_NORMAL; break;
3881133e27eSPeter Avalos }
3891133e27eSPeter Avalos s += 2;
3901133e27eSPeter Avalos }
3911133e27eSPeter Avalos *fmtvarptr = s;
3921133e27eSPeter Avalos }
3931133e27eSPeter Avalos
3941133e27eSPeter Avalos /*
3951133e27eSPeter Avalos *
3961133e27eSPeter Avalos */
set_charset(void)397320d7c8aSAaron LI static void set_charset(void)
3981133e27eSPeter Avalos {
399*e433da38SAaron LI constant char *s;
400320d7c8aSAaron LI
401320d7c8aSAaron LI ichardef_utf(lgetenv("LESSUTFCHARDEF"));
402320d7c8aSAaron LI
4031133e27eSPeter Avalos /*
4041133e27eSPeter Avalos * See if environment variable LESSCHARSET is defined.
4051133e27eSPeter Avalos */
4061133e27eSPeter Avalos s = lgetenv("LESSCHARSET");
4071133e27eSPeter Avalos if (icharset(s, 0))
4081133e27eSPeter Avalos return;
4091133e27eSPeter Avalos
4101133e27eSPeter Avalos /*
4111133e27eSPeter Avalos * LESSCHARSET is not defined: try LESSCHARDEF.
4121133e27eSPeter Avalos */
4131133e27eSPeter Avalos s = lgetenv("LESSCHARDEF");
41402d62a0fSDaniel Fojt if (!isnullenv(s))
4151133e27eSPeter Avalos {
4161133e27eSPeter Avalos ichardef(s);
4171133e27eSPeter Avalos return;
4181133e27eSPeter Avalos }
4191133e27eSPeter Avalos
4201133e27eSPeter Avalos #if HAVE_LOCALE
4211133e27eSPeter Avalos #ifdef CODESET
4221133e27eSPeter Avalos /*
4231133e27eSPeter Avalos * Try using the codeset name as the charset name.
4241133e27eSPeter Avalos */
4251133e27eSPeter Avalos s = nl_langinfo(CODESET);
4261133e27eSPeter Avalos if (icharset(s, 1))
4271133e27eSPeter Avalos return;
4281133e27eSPeter Avalos #endif
4291133e27eSPeter Avalos #endif
4301133e27eSPeter Avalos
4311133e27eSPeter Avalos #if HAVE_STRSTR
4321133e27eSPeter Avalos /*
4331133e27eSPeter Avalos * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
4341133e27eSPeter Avalos */
4351133e27eSPeter Avalos if ((s = lgetenv("LC_ALL")) != NULL ||
4361133e27eSPeter Avalos (s = lgetenv("LC_CTYPE")) != NULL ||
4371133e27eSPeter Avalos (s = lgetenv("LANG")) != NULL)
4381133e27eSPeter Avalos {
4391133e27eSPeter Avalos if ( strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL
4401133e27eSPeter Avalos || strstr(s, "UTF8") != NULL || strstr(s, "utf8") != NULL)
4411133e27eSPeter Avalos if (icharset("utf-8", 1))
4421133e27eSPeter Avalos return;
4431133e27eSPeter Avalos }
4441133e27eSPeter Avalos #endif
4451133e27eSPeter Avalos
4461133e27eSPeter Avalos #if HAVE_LOCALE
4471133e27eSPeter Avalos /*
4481133e27eSPeter Avalos * Get character definitions from locale functions,
4491133e27eSPeter Avalos * rather than from predefined charset entry.
4501133e27eSPeter Avalos */
4511133e27eSPeter Avalos ilocale();
45202d62a0fSDaniel Fojt #else
4531133e27eSPeter Avalos #if MSDOS_COMPILER
454*e433da38SAaron LI #if MSDOS_COMPILER==WIN32C
455*e433da38SAaron LI (void) icharset("utf-8", 1);
4561133e27eSPeter Avalos #else
457*e433da38SAaron LI (void) icharset("dos", 1);
458*e433da38SAaron LI #endif
459*e433da38SAaron LI #else
460*e433da38SAaron LI (void) icharset("utf-8", 1);
4611133e27eSPeter Avalos #endif
4621133e27eSPeter Avalos #endif
4631133e27eSPeter Avalos }
4641133e27eSPeter Avalos
4651133e27eSPeter Avalos /*
4661133e27eSPeter Avalos * Initialize charset data structures.
4671133e27eSPeter Avalos */
init_charset(void)468320d7c8aSAaron LI public void init_charset(void)
4691133e27eSPeter Avalos {
470*e433da38SAaron LI constant char *s;
4711133e27eSPeter Avalos
4721133e27eSPeter Avalos #if HAVE_LOCALE
4731133e27eSPeter Avalos setlocale(LC_ALL, "");
4741133e27eSPeter Avalos #endif
4751133e27eSPeter Avalos
4761133e27eSPeter Avalos set_charset();
4771133e27eSPeter Avalos
4781133e27eSPeter Avalos s = lgetenv("LESSBINFMT");
479320d7c8aSAaron LI setfmt(s, &binfmt, &binattr, "*s<%02X>", TRUE);
4801133e27eSPeter Avalos
4811133e27eSPeter Avalos s = lgetenv("LESSUTFBINFMT");
482320d7c8aSAaron LI setfmt(s, &utfbinfmt, &binattr, "<U+%04lX>", TRUE);
4831133e27eSPeter Avalos }
4841133e27eSPeter Avalos
4851133e27eSPeter Avalos /*
4861133e27eSPeter Avalos * Is a given character a "binary" character?
4871133e27eSPeter Avalos */
binary_char(LWCHAR c)488*e433da38SAaron LI public lbool binary_char(LWCHAR c)
4891133e27eSPeter Avalos {
4908be36e5bSPeter Avalos if (utf_mode)
4918be36e5bSPeter Avalos return (is_ubin_char(c));
492*e433da38SAaron LI if (c >= sizeof(chardef))
493*e433da38SAaron LI return TRUE;
494*e433da38SAaron LI return ((chardef[c] & IS_BINARY_CHAR) != 0);
4951133e27eSPeter Avalos }
4961133e27eSPeter Avalos
4971133e27eSPeter Avalos /*
4981133e27eSPeter Avalos * Is a given character a "control" character?
4991133e27eSPeter Avalos */
control_char(LWCHAR c)500*e433da38SAaron LI public lbool control_char(LWCHAR c)
5011133e27eSPeter Avalos {
502*e433da38SAaron LI if (c >= sizeof(chardef))
503*e433da38SAaron LI return TRUE;
5041133e27eSPeter Avalos return (chardef[c] & IS_CONTROL_CHAR);
5051133e27eSPeter Avalos }
5061133e27eSPeter Avalos
5071133e27eSPeter Avalos /*
5081133e27eSPeter Avalos * Return the printable form of a character.
5091133e27eSPeter Avalos * For example, in the "ascii" charset '\3' is printed as "^C".
5101133e27eSPeter Avalos */
prchar(LWCHAR c)511*e433da38SAaron LI public constant char * prchar(LWCHAR c)
5121133e27eSPeter Avalos {
513*e433da38SAaron LI /* {{ Fixed buffer size means LESSBINFMT etc can be truncated. }} */
5140c7ad07eSAntonio Huete Jimenez static char buf[MAX_PRCHAR_LEN+1];
5151133e27eSPeter Avalos
516*e433da38SAaron LI c &= 0377; /*{{type-issue}}*/
5171133e27eSPeter Avalos if ((c < 128 || !utf_mode) && !control_char(c))
518a9adbba3SJan Lentfer SNPRINTF1(buf, sizeof(buf), "%c", (int) c);
5191133e27eSPeter Avalos else if (c == ESC)
5201133e27eSPeter Avalos strcpy(buf, "ESC");
5211133e27eSPeter Avalos #if IS_EBCDIC_HOST
5221133e27eSPeter Avalos else if (!binary_char(c) && c < 64)
5231133e27eSPeter Avalos SNPRINTF1(buf, sizeof(buf), "^%c",
5241133e27eSPeter Avalos /*
5251133e27eSPeter Avalos * This array roughly inverts CONTROL() #defined in less.h,
5261133e27eSPeter Avalos * and should be kept in sync with CONTROL() and IBM-1047.
5271133e27eSPeter Avalos */
5281133e27eSPeter Avalos "@ABC.I.?...KLMNO"
5291133e27eSPeter Avalos "PQRS.JH.XY.."
5301133e27eSPeter Avalos "\\]^_"
5311133e27eSPeter Avalos "......W[.....EFG"
5321133e27eSPeter Avalos "..V....D....TU.Z"[c]);
5331133e27eSPeter Avalos #else
5341133e27eSPeter Avalos else if (c < 128 && !control_char(c ^ 0100))
535a9adbba3SJan Lentfer SNPRINTF1(buf, sizeof(buf), "^%c", (int) (c ^ 0100));
5361133e27eSPeter Avalos #endif
5371133e27eSPeter Avalos else
5381133e27eSPeter Avalos SNPRINTF1(buf, sizeof(buf), binfmt, c);
5391133e27eSPeter Avalos return (buf);
5401133e27eSPeter Avalos }
5411133e27eSPeter Avalos
5421133e27eSPeter Avalos /*
5431133e27eSPeter Avalos * Return the printable form of a UTF-8 character.
5441133e27eSPeter Avalos */
prutfchar(LWCHAR ch)545*e433da38SAaron LI public constant char * prutfchar(LWCHAR ch)
5461133e27eSPeter Avalos {
5470c7ad07eSAntonio Huete Jimenez static char buf[MAX_PRCHAR_LEN+1];
5481133e27eSPeter Avalos
5491133e27eSPeter Avalos if (ch == ESC)
5501133e27eSPeter Avalos strcpy(buf, "ESC");
5511133e27eSPeter Avalos else if (ch < 128 && control_char(ch))
5521133e27eSPeter Avalos {
5531133e27eSPeter Avalos if (!control_char(ch ^ 0100))
5541133e27eSPeter Avalos SNPRINTF1(buf, sizeof(buf), "^%c", ((char) ch) ^ 0100);
5551133e27eSPeter Avalos else
5561133e27eSPeter Avalos SNPRINTF1(buf, sizeof(buf), binfmt, (char) ch);
5571133e27eSPeter Avalos } else if (is_ubin_char(ch))
558fa0be7c5SJohn Marino {
5591133e27eSPeter Avalos SNPRINTF1(buf, sizeof(buf), utfbinfmt, ch);
5601133e27eSPeter Avalos } else
5611133e27eSPeter Avalos {
562fa0be7c5SJohn Marino char *p = buf;
563fa0be7c5SJohn Marino if (ch >= 0x80000000)
564fa0be7c5SJohn Marino ch = 0xFFFD; /* REPLACEMENT CHARACTER */
565fa0be7c5SJohn Marino put_wchar(&p, ch);
566fa0be7c5SJohn Marino *p = '\0';
5671133e27eSPeter Avalos }
5681133e27eSPeter Avalos return (buf);
5691133e27eSPeter Avalos }
5701133e27eSPeter Avalos
5711133e27eSPeter Avalos /*
5721133e27eSPeter Avalos * Get the length of a UTF-8 character in bytes.
5731133e27eSPeter Avalos */
utf_len(char ch)574*e433da38SAaron LI public int utf_len(char ch)
5751133e27eSPeter Avalos {
5761133e27eSPeter Avalos if ((ch & 0x80) == 0)
5771133e27eSPeter Avalos return 1;
5781133e27eSPeter Avalos if ((ch & 0xE0) == 0xC0)
5791133e27eSPeter Avalos return 2;
5801133e27eSPeter Avalos if ((ch & 0xF0) == 0xE0)
5811133e27eSPeter Avalos return 3;
5821133e27eSPeter Avalos if ((ch & 0xF8) == 0xF0)
5831133e27eSPeter Avalos return 4;
584*e433da38SAaron LI #if 0
5851133e27eSPeter Avalos if ((ch & 0xFC) == 0xF8)
5861133e27eSPeter Avalos return 5;
5871133e27eSPeter Avalos if ((ch & 0xFE) == 0xFC)
5881133e27eSPeter Avalos return 6;
589*e433da38SAaron LI #endif
5901133e27eSPeter Avalos /* Invalid UTF-8 encoding. */
5911133e27eSPeter Avalos return 1;
5921133e27eSPeter Avalos }
5931133e27eSPeter Avalos
5941133e27eSPeter Avalos /*
595fa0be7c5SJohn Marino * Does the parameter point to the lead byte of a well-formed UTF-8 character?
5961133e27eSPeter Avalos */
is_utf8_well_formed(constant char * ss,int slen)597*e433da38SAaron LI public lbool is_utf8_well_formed(constant char *ss, int slen)
5981133e27eSPeter Avalos {
5991133e27eSPeter Avalos int i;
6001133e27eSPeter Avalos int len;
601*e433da38SAaron LI unsigned char s0 = (unsigned char) ss[0];
6021133e27eSPeter Avalos
603*e433da38SAaron LI if (IS_UTF8_INVALID(s0))
604*e433da38SAaron LI return (FALSE);
6051133e27eSPeter Avalos
606*e433da38SAaron LI len = utf_len(ss[0]);
6079b760066SJohn Marino if (len > slen)
608*e433da38SAaron LI return (FALSE);
6091133e27eSPeter Avalos if (len == 1)
610*e433da38SAaron LI return (TRUE);
6111133e27eSPeter Avalos if (len == 2)
6121133e27eSPeter Avalos {
613*e433da38SAaron LI if (s0 < 0xC2)
614*e433da38SAaron LI return (FALSE);
6151133e27eSPeter Avalos } else
6161133e27eSPeter Avalos {
617*e433da38SAaron LI unsigned char mask = (unsigned char) (~((1 << (8-len)) - 1));
618*e433da38SAaron LI if (s0 == mask && (ss[1] & mask) == 0x80)
619*e433da38SAaron LI return (FALSE);
6201133e27eSPeter Avalos }
6211133e27eSPeter Avalos
6221133e27eSPeter Avalos for (i = 1; i < len; i++)
623*e433da38SAaron LI if (!IS_UTF8_TRAIL(ss[i]))
624*e433da38SAaron LI return (FALSE);
625*e433da38SAaron LI return (TRUE);
6261133e27eSPeter Avalos }
6271133e27eSPeter Avalos
6281133e27eSPeter Avalos /*
62902d62a0fSDaniel Fojt * Skip bytes until a UTF-8 lead byte (11xxxxxx) or ASCII byte (0xxxxxxx) is found.
630fa0be7c5SJohn Marino */
utf_skip_to_lead(constant char ** pp,constant char * limit)631*e433da38SAaron LI public void utf_skip_to_lead(constant char **pp, constant char *limit)
632fa0be7c5SJohn Marino {
633fa0be7c5SJohn Marino do {
63402d62a0fSDaniel Fojt ++(*pp);
63502d62a0fSDaniel Fojt } while (*pp < limit && !IS_UTF8_LEAD((*pp)[0] & 0377) && !IS_ASCII_OCTET((*pp)[0]));
636fa0be7c5SJohn Marino }
63702d62a0fSDaniel Fojt
638fa0be7c5SJohn Marino
639fa0be7c5SJohn Marino /*
6401133e27eSPeter Avalos * Get the value of a UTF-8 character.
6411133e27eSPeter Avalos */
get_wchar(constant char * sp)642*e433da38SAaron LI public LWCHAR get_wchar(constant char *sp)
6431133e27eSPeter Avalos {
644*e433da38SAaron LI constant unsigned char *p = (constant unsigned char *) sp;
645*e433da38SAaron LI switch (utf_len(sp[0]))
6461133e27eSPeter Avalos {
6471133e27eSPeter Avalos case 1:
6481133e27eSPeter Avalos default:
6491133e27eSPeter Avalos /* 0xxxxxxx */
6501133e27eSPeter Avalos return (LWCHAR)
6511133e27eSPeter Avalos (p[0] & 0xFF);
6521133e27eSPeter Avalos case 2:
6531133e27eSPeter Avalos /* 110xxxxx 10xxxxxx */
6541133e27eSPeter Avalos return (LWCHAR) (
6551133e27eSPeter Avalos ((p[0] & 0x1F) << 6) |
6561133e27eSPeter Avalos (p[1] & 0x3F));
6571133e27eSPeter Avalos case 3:
6581133e27eSPeter Avalos /* 1110xxxx 10xxxxxx 10xxxxxx */
6591133e27eSPeter Avalos return (LWCHAR) (
6601133e27eSPeter Avalos ((p[0] & 0x0F) << 12) |
6611133e27eSPeter Avalos ((p[1] & 0x3F) << 6) |
6621133e27eSPeter Avalos (p[2] & 0x3F));
6631133e27eSPeter Avalos case 4:
6641133e27eSPeter Avalos /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
6651133e27eSPeter Avalos return (LWCHAR) (
6661133e27eSPeter Avalos ((p[0] & 0x07) << 18) |
6671133e27eSPeter Avalos ((p[1] & 0x3F) << 12) |
6681133e27eSPeter Avalos ((p[2] & 0x3F) << 6) |
6691133e27eSPeter Avalos (p[3] & 0x3F));
670*e433da38SAaron LI #if 0
6711133e27eSPeter Avalos case 5:
6721133e27eSPeter Avalos /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
6731133e27eSPeter Avalos return (LWCHAR) (
6741133e27eSPeter Avalos ((p[0] & 0x03) << 24) |
6751133e27eSPeter Avalos ((p[1] & 0x3F) << 18) |
6761133e27eSPeter Avalos ((p[2] & 0x3F) << 12) |
6771133e27eSPeter Avalos ((p[3] & 0x3F) << 6) |
6781133e27eSPeter Avalos (p[4] & 0x3F));
6791133e27eSPeter Avalos case 6:
6801133e27eSPeter Avalos /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
6811133e27eSPeter Avalos return (LWCHAR) (
6821133e27eSPeter Avalos ((p[0] & 0x01) << 30) |
6831133e27eSPeter Avalos ((p[1] & 0x3F) << 24) |
6841133e27eSPeter Avalos ((p[2] & 0x3F) << 18) |
6851133e27eSPeter Avalos ((p[3] & 0x3F) << 12) |
6861133e27eSPeter Avalos ((p[4] & 0x3F) << 6) |
6871133e27eSPeter Avalos (p[5] & 0x3F));
688*e433da38SAaron LI #endif
6891133e27eSPeter Avalos }
6901133e27eSPeter Avalos }
6911133e27eSPeter Avalos
6921133e27eSPeter Avalos /*
6931133e27eSPeter Avalos * Store a character into a UTF-8 string.
6941133e27eSPeter Avalos */
put_wchar(mutable char ** pp,LWCHAR ch)695*e433da38SAaron LI public void put_wchar(mutable char **pp, LWCHAR ch)
6961133e27eSPeter Avalos {
6971133e27eSPeter Avalos if (!utf_mode || ch < 0x80)
6981133e27eSPeter Avalos {
6991133e27eSPeter Avalos /* 0xxxxxxx */
7001133e27eSPeter Avalos *(*pp)++ = (char) ch;
7011133e27eSPeter Avalos } else if (ch < 0x800)
7021133e27eSPeter Avalos {
7031133e27eSPeter Avalos /* 110xxxxx 10xxxxxx */
7041133e27eSPeter Avalos *(*pp)++ = (char) (0xC0 | ((ch >> 6) & 0x1F));
7051133e27eSPeter Avalos *(*pp)++ = (char) (0x80 | (ch & 0x3F));
7061133e27eSPeter Avalos } else if (ch < 0x10000)
7071133e27eSPeter Avalos {
7081133e27eSPeter Avalos /* 1110xxxx 10xxxxxx 10xxxxxx */
7091133e27eSPeter Avalos *(*pp)++ = (char) (0xE0 | ((ch >> 12) & 0x0F));
7101133e27eSPeter Avalos *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
7111133e27eSPeter Avalos *(*pp)++ = (char) (0x80 | (ch & 0x3F));
7121133e27eSPeter Avalos } else if (ch < 0x200000)
7131133e27eSPeter Avalos {
7141133e27eSPeter Avalos /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
7151133e27eSPeter Avalos *(*pp)++ = (char) (0xF0 | ((ch >> 18) & 0x07));
7161133e27eSPeter Avalos *(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
7171133e27eSPeter Avalos *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
7181133e27eSPeter Avalos *(*pp)++ = (char) (0x80 | (ch & 0x3F));
719*e433da38SAaron LI #if 0
7201133e27eSPeter Avalos } else if (ch < 0x4000000)
7211133e27eSPeter Avalos {
7221133e27eSPeter Avalos /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
7231133e27eSPeter Avalos *(*pp)++ = (char) (0xF0 | ((ch >> 24) & 0x03));
7241133e27eSPeter Avalos *(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F));
7251133e27eSPeter Avalos *(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
7261133e27eSPeter Avalos *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
7271133e27eSPeter Avalos *(*pp)++ = (char) (0x80 | (ch & 0x3F));
7281133e27eSPeter Avalos } else
7291133e27eSPeter Avalos {
7301133e27eSPeter Avalos /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
7311133e27eSPeter Avalos *(*pp)++ = (char) (0xF0 | ((ch >> 30) & 0x01));
7321133e27eSPeter Avalos *(*pp)++ = (char) (0x80 | ((ch >> 24) & 0x3F));
7331133e27eSPeter Avalos *(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F));
7341133e27eSPeter Avalos *(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
7351133e27eSPeter Avalos *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
7361133e27eSPeter Avalos *(*pp)++ = (char) (0x80 | (ch & 0x3F));
737*e433da38SAaron LI #endif
7381133e27eSPeter Avalos }
7391133e27eSPeter Avalos }
7401133e27eSPeter Avalos
7411133e27eSPeter Avalos /*
7421133e27eSPeter Avalos * Step forward or backward one character in a string.
7431133e27eSPeter Avalos */
step_charc(constant char ** pp,signed int dir,constant char * limit)744*e433da38SAaron LI public LWCHAR step_charc(constant char **pp, signed int dir, constant char *limit)
7451133e27eSPeter Avalos {
7461133e27eSPeter Avalos LWCHAR ch;
7471133e27eSPeter Avalos int len;
748*e433da38SAaron LI constant char *p = *pp;
7491133e27eSPeter Avalos
7501133e27eSPeter Avalos if (!utf_mode)
7511133e27eSPeter Avalos {
7521133e27eSPeter Avalos /* It's easy if chars are one byte. */
7531133e27eSPeter Avalos if (dir > 0)
75402d62a0fSDaniel Fojt ch = (LWCHAR) (unsigned char) ((p < limit) ? *p++ : 0);
7551133e27eSPeter Avalos else
75602d62a0fSDaniel Fojt ch = (LWCHAR) (unsigned char) ((p > limit) ? *--p : 0);
7571133e27eSPeter Avalos } else if (dir > 0)
7581133e27eSPeter Avalos {
7591133e27eSPeter Avalos len = utf_len(*p);
7601133e27eSPeter Avalos if (p + len > limit)
7611133e27eSPeter Avalos {
7621133e27eSPeter Avalos ch = 0;
76302d62a0fSDaniel Fojt p = (char *) limit;
7641133e27eSPeter Avalos } else
7651133e27eSPeter Avalos {
7661133e27eSPeter Avalos ch = get_wchar(p);
7671133e27eSPeter Avalos p += len;
7681133e27eSPeter Avalos }
7691133e27eSPeter Avalos } else
7701133e27eSPeter Avalos {
7711133e27eSPeter Avalos while (p > limit && IS_UTF8_TRAIL(p[-1]))
7721133e27eSPeter Avalos p--;
7731133e27eSPeter Avalos if (p > limit)
7741133e27eSPeter Avalos ch = get_wchar(--p);
7751133e27eSPeter Avalos else
7761133e27eSPeter Avalos ch = 0;
7771133e27eSPeter Avalos }
7781133e27eSPeter Avalos *pp = p;
7791133e27eSPeter Avalos return ch;
7801133e27eSPeter Avalos }
7811133e27eSPeter Avalos
step_char(char ** pp,signed int dir,constant char * limit)782*e433da38SAaron LI public LWCHAR step_char(char **pp, signed int dir, constant char *limit)
783*e433da38SAaron LI {
784*e433da38SAaron LI constant char *p = (constant char *) *pp;
785*e433da38SAaron LI LWCHAR ch = step_charc(&p, dir, limit);
786*e433da38SAaron LI *pp = (char *) p;
787*e433da38SAaron LI return ch;
788*e433da38SAaron LI }
789*e433da38SAaron LI
7901133e27eSPeter Avalos /*
7911133e27eSPeter Avalos * Unicode characters data
792fa0be7c5SJohn Marino * Actual data is in the generated *.uni files.
7931133e27eSPeter Avalos */
7941133e27eSPeter Avalos
795fa0be7c5SJohn Marino #define DECLARE_RANGE_TABLE_START(name) \
796fa0be7c5SJohn Marino static struct wchar_range name##_array[] = {
797fa0be7c5SJohn Marino #define DECLARE_RANGE_TABLE_END(name) \
798*e433da38SAaron LI }; struct wchar_range_table name##_table = { name##_array, countof(name##_array) };
7991133e27eSPeter Avalos
800fa0be7c5SJohn Marino DECLARE_RANGE_TABLE_START(compose)
801fa0be7c5SJohn Marino #include "compose.uni"
802fa0be7c5SJohn Marino DECLARE_RANGE_TABLE_END(compose)
803fa0be7c5SJohn Marino
804fa0be7c5SJohn Marino DECLARE_RANGE_TABLE_START(ubin)
805fa0be7c5SJohn Marino #include "ubin.uni"
806fa0be7c5SJohn Marino DECLARE_RANGE_TABLE_END(ubin)
807fa0be7c5SJohn Marino
808fa0be7c5SJohn Marino DECLARE_RANGE_TABLE_START(wide)
809fa0be7c5SJohn Marino #include "wide.uni"
810fa0be7c5SJohn Marino DECLARE_RANGE_TABLE_END(wide)
811fa0be7c5SJohn Marino
81202d62a0fSDaniel Fojt DECLARE_RANGE_TABLE_START(fmt)
81302d62a0fSDaniel Fojt #include "fmt.uni"
81402d62a0fSDaniel Fojt DECLARE_RANGE_TABLE_END(fmt)
81502d62a0fSDaniel Fojt
816fa0be7c5SJohn Marino /* comb_table is special pairs, not ranges. */
8171133e27eSPeter Avalos static struct wchar_range comb_table[] = {
8181133e27eSPeter Avalos {0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627},
8191133e27eSPeter Avalos };
8201133e27eSPeter Avalos
8211133e27eSPeter Avalos
is_in_table(LWCHAR ch,struct wchar_range_table * table)822*e433da38SAaron LI static lbool is_in_table(LWCHAR ch, struct wchar_range_table *table)
8231133e27eSPeter Avalos {
824*e433da38SAaron LI unsigned int hi;
825*e433da38SAaron LI unsigned int lo;
8261133e27eSPeter Avalos
8271133e27eSPeter Avalos /* Binary search in the table. */
828320d7c8aSAaron LI if (table->table == NULL || table->count == 0 || ch < table->table[0].first)
829*e433da38SAaron LI return FALSE;
8301133e27eSPeter Avalos lo = 0;
831fa0be7c5SJohn Marino hi = table->count - 1;
8321133e27eSPeter Avalos while (lo <= hi)
8331133e27eSPeter Avalos {
834*e433da38SAaron LI unsigned int mid = (lo + hi) / 2;
835fa0be7c5SJohn Marino if (ch > table->table[mid].last)
8361133e27eSPeter Avalos lo = mid + 1;
837fa0be7c5SJohn Marino else if (ch < table->table[mid].first)
8381133e27eSPeter Avalos hi = mid - 1;
8391133e27eSPeter Avalos else
840*e433da38SAaron LI return TRUE;
8411133e27eSPeter Avalos }
842*e433da38SAaron LI return FALSE;
8431133e27eSPeter Avalos }
8441133e27eSPeter Avalos
8451133e27eSPeter Avalos /*
8461133e27eSPeter Avalos * Is a character a UTF-8 composing character?
8471133e27eSPeter Avalos * If a composing character follows any char, the two combine into one glyph.
8481133e27eSPeter Avalos */
is_composing_char(LWCHAR ch)849*e433da38SAaron LI public lbool is_composing_char(LWCHAR ch)
8501133e27eSPeter Avalos {
851*e433da38SAaron LI if (is_in_table(ch, &user_prt_table)) return FALSE;
852320d7c8aSAaron LI return is_in_table(ch, &user_compose_table) ||
853320d7c8aSAaron LI is_in_table(ch, &compose_table) ||
85402d62a0fSDaniel Fojt (bs_mode != BS_CONTROL && is_in_table(ch, &fmt_table));
8551133e27eSPeter Avalos }
8561133e27eSPeter Avalos
8571133e27eSPeter Avalos /*
8581133e27eSPeter Avalos * Should this UTF-8 character be treated as binary?
8591133e27eSPeter Avalos */
is_ubin_char(LWCHAR ch)860*e433da38SAaron LI public lbool is_ubin_char(LWCHAR ch)
8611133e27eSPeter Avalos {
862*e433da38SAaron LI if (is_in_table(ch, &user_prt_table)) return FALSE;
863320d7c8aSAaron LI return is_in_table(ch, &user_ubin_table) ||
864320d7c8aSAaron LI is_in_table(ch, &ubin_table) ||
86502d62a0fSDaniel Fojt (bs_mode == BS_CONTROL && is_in_table(ch, &fmt_table));
8661133e27eSPeter Avalos }
8671133e27eSPeter Avalos
8681133e27eSPeter Avalos /*
8691133e27eSPeter Avalos * Is this a double width UTF-8 character?
8701133e27eSPeter Avalos */
is_wide_char(LWCHAR ch)871*e433da38SAaron LI public lbool is_wide_char(LWCHAR ch)
8721133e27eSPeter Avalos {
873320d7c8aSAaron LI return is_in_table(ch, &user_wide_table) ||
874320d7c8aSAaron LI is_in_table(ch, &wide_table);
8751133e27eSPeter Avalos }
8761133e27eSPeter Avalos
8771133e27eSPeter Avalos /*
8781133e27eSPeter Avalos * Is a character a UTF-8 combining character?
8791133e27eSPeter Avalos * A combining char acts like an ordinary char, but if it follows
8801133e27eSPeter Avalos * a specific char (not any char), the two combine into one glyph.
8811133e27eSPeter Avalos */
is_combining_char(LWCHAR ch1,LWCHAR ch2)882*e433da38SAaron LI public lbool is_combining_char(LWCHAR ch1, LWCHAR ch2)
8831133e27eSPeter Avalos {
8841133e27eSPeter Avalos /* The table is small; use linear search. */
8851133e27eSPeter Avalos int i;
886*e433da38SAaron LI for (i = 0; i < countof(comb_table); i++)
8871133e27eSPeter Avalos {
8881133e27eSPeter Avalos if (ch1 == comb_table[i].first &&
8891133e27eSPeter Avalos ch2 == comb_table[i].last)
890*e433da38SAaron LI return TRUE;
8911133e27eSPeter Avalos }
892*e433da38SAaron LI return FALSE;
8931133e27eSPeter Avalos }
8941133e27eSPeter Avalos
895