localcharset.c - OpenGrok cross reference for /dflybsd-src/contrib/grep/lib/localcharset.c

Lines Matching +full:ascii +full:. +full:r
1 /* Determine a canonical name for the current locale's character encoding.
3    Copyright (C) 2000-2006, 2008-2020 Free Software Foundation, Inc.
8    any later version.
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
16    with this program; if not, see <https://www.gnu.org/licenses/>.  */
18 /* Written by Bruno Haible <bruno@clisp.org>.  */
20 #include <config.h>
22 /* Specification.  */
23 #include "localcharset.h"
25 #include <stddef.h>
26 #include <stdio.h>
27 #include <string.h>
28 #include <stdlib.h>
31 # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
36 # include <locale.h>
40 /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
48 #  include <langinfo.h>
51 #   include <locale.h>
56 #  include <windows.h>
60 # include <windows.h>
61   /* For the use of setlocale() below, the Gnulib override in setlocale.c is
62      not needed; see the platform lists in setlocale_null.m4.  */
67 # include <os2.h>
72 # include <xlocale.h>
79    to GNU canonical encoding name.  */
83    GNU canonical names directly.  */
92 /* Table of platform-dependent mappings, sorted in ascending order.  */
98     { "C",          "ASCII" },
113   /*{ "KOI8-R",     "KOI8-R" },*/
116     { "US-ASCII",   "ASCII" },
123     { "646",        "ASCII" },
138   /*{ "KOI8-R",     "KOI8-R" },*/
149     { "646",        "ASCII" },
157     { "US-ASCII",   "ASCII" }
165          LC_CTYPE file.
167          the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
169            "... all code that calls BSD system routines should ensure
171             encoding. All BSD system functions expect their string
172             parameters to be in UTF-8 encoding and nothing else."
176             UTF-8. In a canonical UTF-8 Unicode string, all decomposable
177             characters are decomposed ..."
180          them to decomposed UTF-8 before accessing the file system.
181        - The Apple Terminal application displays UTF-8 by default.
183          - xterm uses ISO-8859-1 by default.
184          - TextEdit uses MacRoman by default.
186        minimize the use of decomposed Unicode. Unfortunately, through the
188        space nevertheless.
189        Then there are also the locales with encodings other than US-ASCII
190        and UTF-8. These locales can be occasionally useful to users (e.g.
192        file names are in US-ASCII.
213     { "KOI8-R",     "KOI8-R" },
275     { "koi8r",     "KOI8-R" },
323     { "646",         "ASCII" },
341     { "TIS620.2533", "TIS-620" },
347     { "koi8-r",      "KOI8-R" }
351     { "646", "ASCII" }
356     { "CP20127", "ASCII" },
357     { "CP20866", "KOI8-R" },
383        <http://altsan.org/os2/toolkits/uls/index.html#codepages>.
385        <https://github.com/bitwiseworks/libc/blob/master/src/emx/src/lib/locale/__convcp.c>.  */
405     { "CP878",         "KOI8-R" },
463     { "IBM-367",       "ASCII" },
488     { "IBM-878",       "KOI8-R" },
547        section 10.7 "Handling Different Character Sets".  */
565     /* Just a dummy entry, to avoid a C syntax error.  */
575    encoding name.  */
583 /* Table of platform-dependent mappings, sorted in ascending order.  */
587     { "cs_CZ.ISO_8859-2",  "ISO-8859-2" },
588     { "da_DK.DIS_8859-15", "ISO-8859-15" },
589     { "da_DK.ISO_8859-1",  "ISO-8859-1" },
590     { "de_AT.DIS_8859-15", "ISO-8859-15" },
591     { "de_AT.ISO_8859-1",  "ISO-8859-1" },
592     { "de_CH.DIS_8859-15", "ISO-8859-15" },
593     { "de_CH.ISO_8859-1",  "ISO-8859-1" },
594     { "de_DE.DIS_8859-15", "ISO-8859-15" },
595     { "de_DE.ISO_8859-1",  "ISO-8859-1" },
596     { "en_AU.DIS_8859-15", "ISO-8859-15" },
597     { "en_AU.ISO_8859-1",  "ISO-8859-1" },
598     { "en_CA.DIS_8859-15", "ISO-8859-15" },
599     { "en_CA.ISO_8859-1",  "ISO-8859-1" },
600     { "en_GB.DIS_8859-15", "ISO-8859-15" },
601     { "en_GB.ISO_8859-1",  "ISO-8859-1" },
602     { "en_US.DIS_8859-15", "ISO-8859-15" },
603     { "en_US.ISO_8859-1",  "ISO-8859-1" },
604     { "es_ES.DIS_8859-15", "ISO-8859-15" },
605     { "es_ES.ISO_8859-1",  "ISO-8859-1" },
606     { "fi_FI.DIS_8859-15", "ISO-8859-15" },
607     { "fi_FI.ISO_8859-1",  "ISO-8859-1" },
608     { "fr_BE.DIS_8859-15", "ISO-8859-15" },
609     { "fr_BE.ISO_8859-1",  "ISO-8859-1" },
610     { "fr_CA.DIS_8859-15", "ISO-8859-15" },
611     { "fr_CA.ISO_8859-1",  "ISO-8859-1" },
612     { "fr_CH.DIS_8859-15", "ISO-8859-15" },
613     { "fr_CH.ISO_8859-1",  "ISO-8859-1" },
614     { "fr_FR.DIS_8859-15", "ISO-8859-15" },
615     { "fr_FR.ISO_8859-1",  "ISO-8859-1" },
616     { "hr_HR.ISO_8859-2",  "ISO-8859-2" },
617     { "hu_HU.ISO_8859-2",  "ISO-8859-2" },
618     { "is_IS.DIS_8859-15", "ISO-8859-15" },
619     { "is_IS.ISO_8859-1",  "ISO-8859-1" },
620     { "it_CH.DIS_8859-15", "ISO-8859-15" },
621     { "it_CH.ISO_8859-1",  "ISO-8859-1" },
622     { "it_IT.DIS_8859-15", "ISO-8859-15" },
623     { "it_IT.ISO_8859-1",  "ISO-8859-1" },
624     { "ja_JP.EUC",         "EUC-JP" },
625     { "ja_JP.SJIS",        "SHIFT_JIS" },
626     { "ja_JP.Shift_JIS",   "SHIFT_JIS" },
627     { "ko_KR.EUC",         "EUC-KR" },
628     { "la_LN.ASCII",       "ASCII" },
629     { "la_LN.DIS_8859-15", "ISO-8859-15" },
630     { "la_LN.ISO_8859-1",  "ISO-8859-1" },
631     { "la_LN.ISO_8859-2",  "ISO-8859-2" },
632     { "la_LN.ISO_8859-4",  "ISO-8859-4" },
633     { "lt_LN.ASCII",       "ASCII" },
634     { "lt_LN.DIS_8859-15", "ISO-8859-15" },
635     { "lt_LN.ISO_8859-1",  "ISO-8859-1" },
636     { "lt_LN.ISO_8859-2",  "ISO-8859-2" },
637     { "lt_LT.ISO_8859-4",  "ISO-8859-4" },
638     { "nl_BE.DIS_8859-15", "ISO-8859-15" },
639     { "nl_BE.ISO_8859-1",  "ISO-8859-1" },
640     { "nl_NL.DIS_8859-15", "ISO-8859-15" },
641     { "nl_NL.ISO_8859-1",  "ISO-8859-1" },
642     { "no_NO.DIS_8859-15", "ISO-8859-15" },
643     { "no_NO.ISO_8859-1",  "ISO-8859-1" },
644     { "pl_PL.ISO_8859-2",  "ISO-8859-2" },
645     { "pt_PT.DIS_8859-15", "ISO-8859-15" },
646     { "pt_PT.ISO_8859-1",  "ISO-8859-1" },
647     { "ru_RU.CP866",       "CP866" },
648     { "ru_RU.ISO_8859-5",  "ISO-8859-5" },
649     { "ru_RU.KOI8-R",      "KOI8-R" },
650     { "ru_SU.CP866",       "CP866" },
651     { "ru_SU.ISO_8859-5",  "ISO-8859-5" },
652     { "ru_SU.KOI8-R",      "KOI8-R" },
653     { "sl_SI.ISO_8859-2",  "ISO-8859-2" },
654     { "sv_SE.DIS_8859-15", "ISO-8859-15" },
655     { "sv_SE.ISO_8859-1",  "ISO-8859-1" },
656     { "uk_UA.KOI8-U",      "KOI8-U" },
657     { "zh_CN.EUC",         "GB2312" },
658     { "zh_TW.BIG5",        "BIG5" },
659     { "zh_TW.Big5",        "BIG5" }
663     /* The encodings given here may not all be correct.
667        Juan Manuel Guerrero <juan.guerrero@gmx.de>
668        and <bug-gnulib@gnu.org>.  */
669     { "C",     "ASCII" },
810     /* Just a dummy entry, to avoid a C syntax error.  */
819    into one of the canonical names listed below.
820    The result must not be freed; it is statically allocated.  The result
823    is changed; threads in multithreaded programs should not do this.
825    name.  */
835   /* This function must be multithread-safe.  To achieve this without using  in locale_charset()
837      buffer.  Filling it through, for example, strcpy + strcat would not be  in locale_charset()
839      currently accessing it.  If necessary, the contents is first assembled in  in locale_charset()
840      a stack-allocated buffer.  */  in locale_charset()
846   /* Most systems support nl_langinfo (CODESET) nowadays.  */  in locale_charset()
850   /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always  in locale_charset()
851      returns "US-ASCII".  Return the suffix of the locale name from the  in locale_charset()
852      environment variables (if present) or the codepage as a number.  */  in locale_charset()
853   if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)  in locale_charset()
868              it.  */  in locale_charset()
869           const char *dot = strchr (locale, '.');  in locale_charset()
876               /* Look for the possible @... trailer and remove it, if any.  */  in locale_charset()
882                   /* This way of filling resultbuf is multithread-safe.  */  in locale_charset()
891          number: GetACP().  This encoding is used by Cygwin, unless the user  in locale_charset()
893          people do).  in locale_charset()
896          GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does  in locale_charset()
897          this conversion transparently (see winsup/cygwin/fhandler_console.cc),  in locale_charset()
898          converting to GetConsoleOutputCP().  This leads to correct results,  in locale_charset()
900          in use.  */  in locale_charset()
912     /* The canonical name cannot be determined.  */  in locale_charset()
922      'setlocale' call specified.  So we use it as a last resort, in  in locale_charset()
924      codepage.  */  in locale_charset()
926   char *pdot = strrchr (current_locale, '.');  in locale_charset()
933          number: GetACP().  in locale_charset()
936          GetConsoleOutputCP() encoding if it is using a TrueType font.  in locale_charset()
938          encoding is the best bet.  */  in locale_charset()
941   /* For a locale name such as "French_France.65001", in Windows 10,  in locale_charset()
942      setlocale now returns "French_France.utf8" instead.  */  in locale_charset()
961      with standard language environment variables.  */  in locale_charset()
971       /* If the locale name contains an encoding after the dot, return it.  */  in locale_charset()
972       const char *dot = strchr (locale, '.');  in locale_charset()
979           /* Look for the possible @... trailer and remove it, if any.  */  in locale_charset()
985               /* This way of filling resultbuf is multithread-safe.  */  in locale_charset()
992       /* For the POSIX locale, don't use the system's codepage.  */  in locale_charset()
999       /* OS/2 has a function returning the locale's codepage as a number.  */  in locale_charset()
1014 #  error "Add code for other platforms here."  in locale_charset()
1018   /* Resolve alias.  */  in locale_charset()
1021     /* On some platforms, UTF-8 locales are the most frequently used ones.  in locale_charset()
1023        testing for this case first.  */  in locale_charset()
1033         /* The table is sorted.  Perform a binary search.  */  in locale_charset()
1039                for i < lo, strcmp (table[i].alias, codeset) < 0,  in locale_charset()
1040                for i >= hi, strcmp (table[i].alias, codeset) > 0.  */  in locale_charset()
1042             int cmp = strcmp (table[mid].alias, codeset);  in locale_charset()
1050                      strcmp (table[i].alias, codeset) == 0.  */  in locale_charset()
1051                 codeset = table[mid].canonical;  in locale_charset()
1061         /* Did not find it in the table.  */  in locale_charset()
1062         /* On Mac OS X, all modern locales use the UTF-8 encoding.  in locale_charset()
1063            BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */  in locale_charset()
1067         /* Don't return an empty string.  GNU libc and GNU libiconv interpret  in locale_charset()
1069            thus GNU libiconv would call this function a second time.  */  in locale_charset()
1071           codeset = "ASCII";  in locale_charset()
1078   /* On old systems which lack it, use setlocale or getenv.  */  in locale_charset()
1081   /* But most old systems don't have a complete set of locales.  Some  in locale_charset()
1082      (like DJGPP) have only the C locale.  Therefore we don't use setlocale  in locale_charset()
1084      user has set.  */  in locale_charset()
1101   /* Map locale name to canonical encoding name.  */  in locale_charset()
1107     /* The table is sorted.  Perform a binary search.  */  in locale_charset()
1113            for i < lo, strcmp (table[i].locale, locale) < 0,  in locale_charset()
1114            for i >= hi, strcmp (table[i].locale, locale) > 0.  */  in locale_charset()
1116         int cmp = strcmp (table[mid].locale, locale);  in locale_charset()
1124                  strcmp (table[i].locale, locale) == 0.  */  in locale_charset()
1125             codeset = table[mid].canonical;  in locale_charset()
1134         /* Did not find it in the table.  */  in locale_charset()
1135         /* On Mac OS X, all modern locales use the UTF-8 encoding.  in locale_charset()
1136            BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */  in locale_charset()
1140         /* The canonical name cannot be determined.  */  in locale_charset()
1141         /* Don't return an empty string.  GNU libc and GNU libiconv interpret  in locale_charset()
1143            thus GNU libiconv would call this function a second time.  */  in locale_charset()
1144         codeset = "ASCII";  in locale_charset()
1153      (the default codeset) does not work when MB_CUR_MAX is 1.  */  in locale_charset()
1155     codeset = "ASCII";  in locale_charset()