xref: /dflybsd-src/contrib/grep/lib/localeinfo.c (revision 91b9ed38d3db6a8a8ac5b66da1d43e6e331e259a)
/* locale information

   Copyright 2016-2020 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
   02110-1301, USA.  */
19*09d4459fSDaniel Fojt 
/* Written by Paul Eggert.  */
21*09d4459fSDaniel Fojt 
22*09d4459fSDaniel Fojt #include <config.h>
23*09d4459fSDaniel Fojt 
24*09d4459fSDaniel Fojt #include <localeinfo.h>
25*09d4459fSDaniel Fojt 
26*09d4459fSDaniel Fojt #include <verify.h>
27*09d4459fSDaniel Fojt 
28*09d4459fSDaniel Fojt #include <limits.h>
29*09d4459fSDaniel Fojt #include <locale.h>
30*09d4459fSDaniel Fojt #include <stdlib.h>
31*09d4459fSDaniel Fojt #include <string.h>
32*09d4459fSDaniel Fojt #include <wctype.h>
33*09d4459fSDaniel Fojt 
34*09d4459fSDaniel Fojt /* The sbclen implementation relies on this.  */
35*09d4459fSDaniel Fojt verify (MB_LEN_MAX <= SCHAR_MAX);
36*09d4459fSDaniel Fojt 
/* Report whether the current locale's multibyte encoding is UTF-8.
   The probe decodes the two-byte sequence C4 80 from a fresh
   conversion state; the locale counts as UTF-8 only when mbrtowc
   consumes both bytes and produces the wide character 0x100.  */

static bool
is_using_utf8 (void)
{
  static char const probe[] = "\xc4\x80";
  mbstate_t state = {0};
  wchar_t decoded;

  if (mbrtowc (&decoded, probe, 2, &state) != 2)
    return false;

  return decoded == 0x100;
}
46*09d4459fSDaniel Fojt 
/* Decide whether the current locale behaves enough like the C locale
   to be treated as "simple": single-byte, with bytes sorted in
   collating-sequence order and no multi-character collating elements.
   MULTIBYTE says whether the locale is multibyte; a multibyte locale
   is never simple.  */

static bool
using_simple_locale (bool multibyte)
{
  /* Nonzero if the execution character set matches the C locale's
     expected code points.  The check is not exhaustive, but only
     ASCII and EBCDIC are in common use, and it accepts the former
     while rejecting the latter.  */
  enum { ascii_like_charset =
    ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
     && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
     && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
     && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
     && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
     && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
     && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
     && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
     && '}' == 125 && '~' == 126)
  };

  if (multibyte)
    return false;
  if (!ascii_like_charset)
    return false;

  /* Heuristic: every one-byte string must collate strictly before
     the one-byte string holding the next byte value.  When strcoll
     agrees with byte order for all adjacent pairs the locale should
     be simple.  This holds for all known practical locales; it could
     be fooled only by an artificial locale whose collating order
     matches byte order yet still defines multi-character collating
     elements.  */
  for (int b = 0; b < UCHAR_MAX; b++)
    {
      char lo[] = {b, 0};
      char hi[] = {b + 1, 0};

      if (strcoll (lo, hi) >= 0)
        return false;
    }

  return true;
}
85*09d4459fSDaniel Fojt 
86*09d4459fSDaniel Fojt /* Initialize *LOCALEINFO from the current locale.  */
87*09d4459fSDaniel Fojt 
88*09d4459fSDaniel Fojt void
init_localeinfo(struct localeinfo * localeinfo)89*09d4459fSDaniel Fojt init_localeinfo (struct localeinfo *localeinfo)
90*09d4459fSDaniel Fojt {
91*09d4459fSDaniel Fojt   localeinfo->multibyte = MB_CUR_MAX > 1;
92*09d4459fSDaniel Fojt   localeinfo->simple = using_simple_locale (localeinfo->multibyte);
93*09d4459fSDaniel Fojt   localeinfo->using_utf8 = is_using_utf8 ();
94*09d4459fSDaniel Fojt 
95*09d4459fSDaniel Fojt   for (int i = CHAR_MIN; i <= CHAR_MAX; i++)
96*09d4459fSDaniel Fojt     {
97*09d4459fSDaniel Fojt       char c = i;
98*09d4459fSDaniel Fojt       unsigned char uc = i;
99*09d4459fSDaniel Fojt       mbstate_t s = {0};
100*09d4459fSDaniel Fojt       wchar_t wc;
101*09d4459fSDaniel Fojt       size_t len = mbrtowc (&wc, &c, 1, &s);
102*09d4459fSDaniel Fojt       localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len;
103*09d4459fSDaniel Fojt       localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
104*09d4459fSDaniel Fojt     }
105*09d4459fSDaniel Fojt }
106*09d4459fSDaniel Fojt 
/* Wide character values C for which some useful locale has both
   C != towupper (C) and C != towlower (towupper (C)).
   Example: 0x00B5 (U+00B5 MICRO SIGN) belongs here because
   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU),
   whereas towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).
   Entries are listed in increasing order.  */
static short const lonesome_lower[] =
  {
    0x00B5,
    0x0131, 0x017F,
    0x01C5, 0x01C8, 0x01CB, 0x01F2,
    0x0345,
    0x03C2,
    0x03D0, 0x03D1, 0x03D5, 0x03D6,
    0x03F0, 0x03F1,

    /* U+03F2 GREEK LUNATE SIGMA SYMBOL has no specific uppercase
       counterpart in locales that predate Unicode 4.0.0 (April 2003).  */
    0x03F2,

    0x03F5,
    0x1E9B,
    0x1FBE,
  };
123*09d4459fSDaniel Fojt 
124*09d4459fSDaniel Fojt /* Verify that the worst case fits.  This is 1 for towupper, 1 for
125*09d4459fSDaniel Fojt    towlower, and 1 for each entry in LONESOME_LOWER.  */
126*09d4459fSDaniel Fojt verify (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
127*09d4459fSDaniel Fojt         <= CASE_FOLDED_BUFSIZE);
128*09d4459fSDaniel Fojt 
129*09d4459fSDaniel Fojt /* Find the characters equal to C after case-folding, other than C
130*09d4459fSDaniel Fojt    itself, and store them into FOLDED.  Return the number of characters
131*09d4459fSDaniel Fojt    stored; this is zero if C is WEOF.  */
132*09d4459fSDaniel Fojt 
133*09d4459fSDaniel Fojt int
case_folded_counterparts(wint_t c,wchar_t folded[CASE_FOLDED_BUFSIZE])134*09d4459fSDaniel Fojt case_folded_counterparts (wint_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
135*09d4459fSDaniel Fojt {
136*09d4459fSDaniel Fojt   int i;
137*09d4459fSDaniel Fojt   int n = 0;
138*09d4459fSDaniel Fojt   wint_t uc = towupper (c);
139*09d4459fSDaniel Fojt   wint_t lc = towlower (uc);
140*09d4459fSDaniel Fojt   if (uc != c)
141*09d4459fSDaniel Fojt     folded[n++] = uc;
142*09d4459fSDaniel Fojt   if (lc != uc && lc != c && towupper (lc) == uc)
143*09d4459fSDaniel Fojt     folded[n++] = lc;
144*09d4459fSDaniel Fojt   for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
145*09d4459fSDaniel Fojt     {
146*09d4459fSDaniel Fojt       wint_t li = lonesome_lower[i];
147*09d4459fSDaniel Fojt       if (li != lc && li != uc && li != c && towupper (li) == uc)
148*09d4459fSDaniel Fojt         folded[n++] = li;
149*09d4459fSDaniel Fojt     }
150*09d4459fSDaniel Fojt   return n;
151*09d4459fSDaniel Fojt }
152