1 /* $NetBSD: localcharset.c,v 1.1.1.1 2016/01/14 00:11:28 christos Exp $ */ 2 3 /* Determine a canonical name for the current locale's character encoding. 4 5 Copyright (C) 2000-2003 Free Software Foundation, Inc. 6 7 This program is free software; you can redistribute it and/or modify it 8 under the terms of the GNU Library General Public License as published 9 by the Free Software Foundation; either version 2, or (at your option) 10 any later version. 11 12 This program is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 Library General Public License for more details. 16 17 You should have received a copy of the GNU Library General Public 18 License along with this program; if not, write to the Free Software 19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 20 USA. */ 21 22 /* Written by Bruno Haible <bruno@clisp.org>. */ 23 24 #ifdef HAVE_CONFIG_H 25 # include <config.h> 26 #endif 27 28 /* Specification. */ 29 #include "localcharset.h" 30 31 #if HAVE_STDDEF_H 32 # include <stddef.h> 33 #endif 34 35 #include <stdio.h> 36 #if HAVE_STRING_H 37 # include <string.h> 38 #else 39 # include <strings.h> 40 #endif 41 #if HAVE_STDLIB_H 42 # include <stdlib.h> 43 #endif 44 45 #if defined _WIN32 || defined __WIN32__ 46 # undef WIN32 /* avoid warning on mingw32 */ 47 # define WIN32 48 #endif 49 50 #if defined __EMX__ 51 /* Assume EMX program runs on OS/2, even if compiled under DOS. */ 52 # define OS2 53 #endif 54 55 #if !defined WIN32 56 # if HAVE_LANGINFO_CODESET 57 # include <langinfo.h> 58 # else 59 # if HAVE_SETLOCALE 60 # include <locale.h> 61 # endif 62 # endif 63 #elif defined WIN32 64 # define WIN32_LEAN_AND_MEAN 65 # include <windows.h> 66 #endif 67 #if defined OS2 68 # define INCL_DOS 69 # include <os2.h> 70 #endif 71 72 #if ENABLE_RELOCATABLE 73 # include "relocatable.h" 74 #else 75 # define relocate(pathname) (pathname) 76 #endif 77 78 #if defined _WIN32 || defined __WIN32__ || defined __EMX__ || defined __DJGPP__ 79 /* Win32, OS/2, DOS */ 80 # define ISSLASH(C) ((C) == '/' || (C) == '\\') 81 #endif 82 83 #ifndef DIRECTORY_SEPARATOR 84 # define DIRECTORY_SEPARATOR '/' 85 #endif 86 87 #ifndef ISSLASH 88 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR) 89 #endif 90 91 #if HAVE_DECL_GETC_UNLOCKED 92 # undef getc 93 # define getc getc_unlocked 94 #endif 95 96 /* The following static variable is declared 'volatile' to avoid a 97 possible multithread problem in the function get_charset_aliases. If we 98 are running in a threaded environment, and if two threads initialize 99 'charset_aliases' simultaneously, both will produce the same value, 100 and everything will be ok if the two assignments to 'charset_aliases' 101 are atomic. But I don't know what will happen if the two assignments mix. */ 102 #if __STDC__ != 1 103 # define volatile /* empty */ 104 #endif 105 /* Pointer to the contents of the charset.alias file, if it has already been 106 read, else NULL. Its format is: 107 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */ 108 static const char * volatile charset_aliases; 109 110 /* Return a pointer to the contents of the charset.alias file. */ 111 static const char * 112 get_charset_aliases () 113 { 114 const char *cp; 115 116 cp = charset_aliases; 117 if (cp == NULL) 118 { 119 #if !(defined VMS || defined WIN32) 120 FILE *fp; 121 const char *dir = relocate (LIBDIR); 122 const char *base = "charset.alias"; 123 char *file_name; 124 125 /* Concatenate dir and base into freshly allocated file_name. */ 126 { 127 size_t dir_len = strlen (dir); 128 size_t base_len = strlen (base); 129 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1])); 130 file_name = (char *) malloc (dir_len + add_slash + base_len + 1); 131 if (file_name != NULL) 132 { 133 memcpy (file_name, dir, dir_len); 134 if (add_slash) 135 file_name[dir_len] = DIRECTORY_SEPARATOR; 136 memcpy (file_name + dir_len + add_slash, base, base_len + 1); 137 } 138 } 139 140 if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL) 141 /* Out of memory or file not found, treat it as empty. */ 142 cp = ""; 143 else 144 { 145 /* Parse the file's contents. */ 146 int c; 147 char buf1[50+1]; 148 char buf2[50+1]; 149 char *res_ptr = NULL; 150 size_t res_size = 0; 151 size_t l1, l2; 152 153 for (;;) 154 { 155 c = getc (fp); 156 if (c == EOF) 157 break; 158 if (c == '\n' || c == ' ' || c == '\t') 159 continue; 160 if (c == '#') 161 { 162 /* Skip comment, to end of line. */ 163 do 164 c = getc (fp); 165 while (!(c == EOF || c == '\n')); 166 if (c == EOF) 167 break; 168 continue; 169 } 170 ungetc (c, fp); 171 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2) 172 break; 173 l1 = strlen (buf1); 174 l2 = strlen (buf2); 175 if (res_size == 0) 176 { 177 res_size = l1 + 1 + l2 + 1; 178 res_ptr = (char *) malloc (res_size + 1); 179 } 180 else 181 { 182 res_size += l1 + 1 + l2 + 1; 183 res_ptr = (char *) realloc (res_ptr, res_size + 1); 184 } 185 if (res_ptr == NULL) 186 { 187 /* Out of memory. */ 188 res_size = 0; 189 break; 190 } 191 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1); 192 strcpy (res_ptr + res_size - (l2 + 1), buf2); 193 } 194 fclose (fp); 195 if (res_size == 0) 196 cp = ""; 197 else 198 { 199 *(res_ptr + res_size) = '\0'; 200 cp = res_ptr; 201 } 202 } 203 204 if (file_name != NULL) 205 free (file_name); 206 207 #else 208 209 # if defined VMS 210 /* To avoid the troubles of an extra file charset.alias_vms in the 211 sources of many GNU packages, simply inline the aliases here. */ 212 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation 213 "Compaq C Run-Time Library Reference Manual for OpenVMS systems" 214 section 10.7 "Handling Different Character Sets". */ 215 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 216 "ISO8859-2" "\0" "ISO-8859-2" "\0" 217 "ISO8859-5" "\0" "ISO-8859-5" "\0" 218 "ISO8859-7" "\0" "ISO-8859-7" "\0" 219 "ISO8859-8" "\0" "ISO-8859-8" "\0" 220 "ISO8859-9" "\0" "ISO-8859-9" "\0" 221 /* Japanese */ 222 "eucJP" "\0" "EUC-JP" "\0" 223 "SJIS" "\0" "SHIFT_JIS" "\0" 224 "DECKANJI" "\0" "DEC-KANJI" "\0" 225 "SDECKANJI" "\0" "EUC-JP" "\0" 226 /* Chinese */ 227 "eucTW" "\0" "EUC-TW" "\0" 228 "DECHANYU" "\0" "DEC-HANYU" "\0" 229 "DECHANZI" "\0" "GB2312" "\0" 230 /* Korean */ 231 "DECKOREAN" "\0" "EUC-KR" "\0"; 232 # endif 233 234 # if defined WIN32 235 /* To avoid the troubles of installing a separate file in the same 236 directory as the DLL and of retrieving the DLL's directory at 237 runtime, simply inline the aliases here. */ 238 239 cp = "CP936" "\0" "GBK" "\0" 240 "CP1361" "\0" "JOHAB" "\0" 241 "CP20127" "\0" "ASCII" "\0" 242 "CP20866" "\0" "KOI8-R" "\0" 243 "CP21866" "\0" "KOI8-RU" "\0" 244 "CP28591" "\0" "ISO-8859-1" "\0" 245 "CP28592" "\0" "ISO-8859-2" "\0" 246 "CP28593" "\0" "ISO-8859-3" "\0" 247 "CP28594" "\0" "ISO-8859-4" "\0" 248 "CP28595" "\0" "ISO-8859-5" "\0" 249 "CP28596" "\0" "ISO-8859-6" "\0" 250 "CP28597" "\0" "ISO-8859-7" "\0" 251 "CP28598" "\0" "ISO-8859-8" "\0" 252 "CP28599" "\0" "ISO-8859-9" "\0" 253 "CP28605" "\0" "ISO-8859-15" "\0"; 254 # endif 255 #endif 256 257 charset_aliases = cp; 258 } 259 260 return cp; 261 } 262 263 /* Determine the current locale's character encoding, and canonicalize it 264 into one of the canonical names listed in config.charset. 265 The result must not be freed; it is statically allocated. 266 If the canonical name cannot be determined, the result is a non-canonical 267 name. */ 268 269 #ifdef STATIC 270 STATIC 271 #endif 272 const char * 273 locale_charset () 274 { 275 const char *codeset; 276 const char *aliases; 277 278 #if !(defined WIN32 || defined OS2) 279 280 # if HAVE_LANGINFO_CODESET 281 282 /* Most systems support nl_langinfo (CODESET) nowadays. */ 283 codeset = nl_langinfo (CODESET); 284 285 # else 286 287 /* On old systems which lack it, use setlocale or getenv. */ 288 const char *locale = NULL; 289 290 /* But most old systems don't have a complete set of locales. Some 291 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't 292 use setlocale here; it would return "C" when it doesn't support the 293 locale name the user has set. */ 294 # if HAVE_SETLOCALE && 0 295 locale = setlocale (LC_CTYPE, NULL); 296 # endif 297 if (locale == NULL || locale[0] == '\0') 298 { 299 locale = getenv ("LC_ALL"); 300 if (locale == NULL || locale[0] == '\0') 301 { 302 locale = getenv ("LC_CTYPE"); 303 if (locale == NULL || locale[0] == '\0') 304 locale = getenv ("LANG"); 305 } 306 } 307 308 /* On some old systems, one used to set locale = "iso8859_1". On others, 309 you set it to "language_COUNTRY.charset". In any case, we resolve it 310 through the charset.alias file. */ 311 codeset = locale; 312 313 # endif 314 315 #elif defined WIN32 316 317 static char buf[2 + 10 + 1]; 318 319 /* Woe32 has a function returning the locale's codepage as a number. */ 320 sprintf (buf, "CP%u", GetACP ()); 321 codeset = buf; 322 323 #elif defined OS2 324 325 const char *locale; 326 static char buf[2 + 10 + 1]; 327 ULONG cp[3]; 328 ULONG cplen; 329 330 /* Allow user to override the codeset, as set in the operating system, 331 with standard language environment variables. */ 332 locale = getenv ("LC_ALL"); 333 if (locale == NULL || locale[0] == '\0') 334 { 335 locale = getenv ("LC_CTYPE"); 336 if (locale == NULL || locale[0] == '\0') 337 locale = getenv ("LANG"); 338 } 339 if (locale != NULL && locale[0] != '\0') 340 { 341 /* If the locale name contains an encoding after the dot, return it. */ 342 const char *dot = strchr (locale, '.'); 343 344 if (dot != NULL) 345 { 346 const char *modifier; 347 348 dot++; 349 /* Look for the possible @... trailer and remove it, if any. */ 350 modifier = strchr (dot, '@'); 351 if (modifier == NULL) 352 return dot; 353 if (modifier - dot < sizeof (buf)) 354 { 355 memcpy (buf, dot, modifier - dot); 356 buf [modifier - dot] = '\0'; 357 return buf; 358 } 359 } 360 361 /* Resolve through the charset.alias file. */ 362 codeset = locale; 363 } 364 else 365 { 366 /* OS/2 has a function returning the locale's codepage as a number. */ 367 if (DosQueryCp (sizeof (cp), cp, &cplen)) 368 codeset = ""; 369 else 370 { 371 sprintf (buf, "CP%u", cp[0]); 372 codeset = buf; 373 } 374 } 375 376 #endif 377 378 if (codeset == NULL) 379 /* The canonical name cannot be determined. */ 380 codeset = ""; 381 382 /* Resolve alias. */ 383 for (aliases = get_charset_aliases (); 384 *aliases != '\0'; 385 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) 386 if (strcmp (codeset, aliases) == 0 387 || (aliases[0] == '*' && aliases[1] == '\0')) 388 { 389 codeset = aliases + strlen (aliases) + 1; 390 break; 391 } 392 393 /* Don't return an empty string. GNU libc and GNU libiconv interpret 394 the empty string as denoting "the locale's character encoding", 395 thus GNU libiconv would call this function a second time. */ 396 if (codeset[0] == '\0') 397 codeset = "ASCII"; 398 399 return codeset; 400 } 401