1 /* -*- buffer-read-only: t -*- vi: set ro: */ 2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */ 3 /* Determine a canonical name for the current locale's character encoding. 4 5 Copyright (C) 2000-2006, 2008-2010 Free Software Foundation, Inc. 6 7 This program is free software; you can redistribute it and/or modify 8 it under the terms of the GNU General Public License as published by 9 the Free Software Foundation; either version 3, or (at your option) 10 any later version. 11 12 This program is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program; if not, write to the Free Software Foundation, 19 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 20 21 /* Written by Bruno Haible <bruno@clisp.org>. */ 22 23 #include <config.h> 24 25 /* Specification. */ 26 #include "localcharset.h" 27 28 #include <fcntl.h> 29 #include <stddef.h> 30 #include <stdio.h> 31 #include <string.h> 32 #include <stdlib.h> 33 34 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET 35 # define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */ 36 #endif 37 38 #if defined _WIN32 || defined __WIN32__ 39 # define WIN32_NATIVE 40 #endif 41 42 #if defined __EMX__ 43 /* Assume EMX program runs on OS/2, even if compiled under DOS. */ 44 # ifndef OS2 45 # define OS2 46 # endif 47 #endif 48 49 #if !defined WIN32_NATIVE 50 # include <unistd.h> 51 # if HAVE_LANGINFO_CODESET 52 # include <langinfo.h> 53 # else 54 # if 0 /* see comment below */ 55 # include <locale.h> 56 # endif 57 # endif 58 # ifdef __CYGWIN__ 59 # define WIN32_LEAN_AND_MEAN 60 # include <windows.h> 61 # endif 62 #elif defined WIN32_NATIVE 63 # define WIN32_LEAN_AND_MEAN 64 # include <windows.h> 65 #endif 66 #if defined OS2 67 # define INCL_DOS 68 # include <os2.h> 69 #endif 70 71 #if ENABLE_RELOCATABLE 72 # include "relocatable.h" 73 #else 74 # define relocate(pathname) (pathname) 75 #endif 76 77 /* Get LIBDIR. */ 78 #ifndef LIBDIR 79 # include "configmake.h" 80 #endif 81 82 /* Define O_NOFOLLOW to 0 on platforms where it does not exist. */ 83 #ifndef O_NOFOLLOW 84 # define O_NOFOLLOW 0 85 #endif 86 87 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__ 88 /* Win32, Cygwin, OS/2, DOS */ 89 # define ISSLASH(C) ((C) == '/' || (C) == '\\') 90 #endif 91 92 #ifndef DIRECTORY_SEPARATOR 93 # define DIRECTORY_SEPARATOR '/' 94 #endif 95 96 #ifndef ISSLASH 97 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR) 98 #endif 99 100 #if HAVE_DECL_GETC_UNLOCKED 101 # undef getc 102 # define getc getc_unlocked 103 #endif 104 105 /* The following static variable is declared 'volatile' to avoid a 106 possible multithread problem in the function get_charset_aliases. If we 107 are running in a threaded environment, and if two threads initialize 108 'charset_aliases' simultaneously, both will produce the same value, 109 and everything will be ok if the two assignments to 'charset_aliases' 110 are atomic. But I don't know what will happen if the two assignments mix. */ 111 #if __STDC__ != 1 112 # define volatile /* empty */ 113 #endif 114 /* Pointer to the contents of the charset.alias file, if it has already been 115 read, else NULL. Its format is: 116 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */ 117 static const char * volatile charset_aliases; 118 119 /* Return a pointer to the contents of the charset.alias file. */ 120 static const char * 121 get_charset_aliases (void) 122 { 123 const char *cp; 124 125 cp = charset_aliases; 126 if (cp == NULL) 127 { 128 #if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__) 129 const char *dir; 130 const char *base = "charset.alias"; 131 char *file_name; 132 133 /* Make it possible to override the charset.alias location. This is 134 necessary for running the testsuite before "make install". */ 135 dir = getenv ("CHARSETALIASDIR"); 136 if (dir == NULL || dir[0] == '\0') 137 dir = relocate (LIBDIR); 138 139 /* Concatenate dir and base into freshly allocated file_name. */ 140 { 141 size_t dir_len = strlen (dir); 142 size_t base_len = strlen (base); 143 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1])); 144 file_name = (char *) malloc (dir_len + add_slash + base_len + 1); 145 if (file_name != NULL) 146 { 147 memcpy (file_name, dir, dir_len); 148 if (add_slash) 149 file_name[dir_len] = DIRECTORY_SEPARATOR; 150 memcpy (file_name + dir_len + add_slash, base, base_len + 1); 151 } 152 } 153 154 if (file_name == NULL) 155 /* Out of memory. Treat the file as empty. */ 156 cp = ""; 157 else 158 { 159 int fd; 160 161 /* Open the file. Reject symbolic links on platforms that support 162 O_NOFOLLOW. This is a security feature. Without it, an attacker 163 could retrieve parts of the contents (namely, the tail of the 164 first line that starts with "* ") of an arbitrary file by placing 165 a symbolic link to that file under the name "charset.alias" in 166 some writable directory and defining the environment variable 167 CHARSETALIASDIR to point to that directory. */ 168 fd = open (file_name, 169 O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0)); 170 if (fd < 0) 171 /* File not found. Treat it as empty. */ 172 cp = ""; 173 else 174 { 175 FILE *fp; 176 177 fp = fdopen (fd, "r"); 178 if (fp == NULL) 179 { 180 /* Out of memory. Treat the file as empty. */ 181 close (fd); 182 cp = ""; 183 } 184 else 185 { 186 /* Parse the file's contents. */ 187 char *res_ptr = NULL; 188 size_t res_size = 0; 189 190 for (;;) 191 { 192 int c; 193 char buf1[50+1]; 194 char buf2[50+1]; 195 size_t l1, l2; 196 char *old_res_ptr; 197 198 c = getc (fp); 199 if (c == EOF) 200 break; 201 if (c == '\n' || c == ' ' || c == '\t') 202 continue; 203 if (c == '#') 204 { 205 /* Skip comment, to end of line. */ 206 do 207 c = getc (fp); 208 while (!(c == EOF || c == '\n')); 209 if (c == EOF) 210 break; 211 continue; 212 } 213 ungetc (c, fp); 214 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2) 215 break; 216 l1 = strlen (buf1); 217 l2 = strlen (buf2); 218 old_res_ptr = res_ptr; 219 if (res_size == 0) 220 { 221 res_size = l1 + 1 + l2 + 1; 222 res_ptr = (char *) malloc (res_size + 1); 223 } 224 else 225 { 226 res_size += l1 + 1 + l2 + 1; 227 res_ptr = (char *) realloc (res_ptr, res_size + 1); 228 } 229 if (res_ptr == NULL) 230 { 231 /* Out of memory. */ 232 res_size = 0; 233 if (old_res_ptr != NULL) 234 free (old_res_ptr); 235 break; 236 } 237 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1); 238 strcpy (res_ptr + res_size - (l2 + 1), buf2); 239 } 240 fclose (fp); 241 if (res_size == 0) 242 cp = ""; 243 else 244 { 245 *(res_ptr + res_size) = '\0'; 246 cp = res_ptr; 247 } 248 } 249 } 250 251 free (file_name); 252 } 253 254 #else 255 256 # if defined DARWIN7 257 /* To avoid the trouble of installing a file that is shared by many 258 GNU packages -- many packaging systems have problems with this --, 259 simply inline the aliases here. */ 260 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 261 "ISO8859-2" "\0" "ISO-8859-2" "\0" 262 "ISO8859-4" "\0" "ISO-8859-4" "\0" 263 "ISO8859-5" "\0" "ISO-8859-5" "\0" 264 "ISO8859-7" "\0" "ISO-8859-7" "\0" 265 "ISO8859-9" "\0" "ISO-8859-9" "\0" 266 "ISO8859-13" "\0" "ISO-8859-13" "\0" 267 "ISO8859-15" "\0" "ISO-8859-15" "\0" 268 "KOI8-R" "\0" "KOI8-R" "\0" 269 "KOI8-U" "\0" "KOI8-U" "\0" 270 "CP866" "\0" "CP866" "\0" 271 "CP949" "\0" "CP949" "\0" 272 "CP1131" "\0" "CP1131" "\0" 273 "CP1251" "\0" "CP1251" "\0" 274 "eucCN" "\0" "GB2312" "\0" 275 "GB2312" "\0" "GB2312" "\0" 276 "eucJP" "\0" "EUC-JP" "\0" 277 "eucKR" "\0" "EUC-KR" "\0" 278 "Big5" "\0" "BIG5" "\0" 279 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0" 280 "GBK" "\0" "GBK" "\0" 281 "GB18030" "\0" "GB18030" "\0" 282 "SJIS" "\0" "SHIFT_JIS" "\0" 283 "ARMSCII-8" "\0" "ARMSCII-8" "\0" 284 "PT154" "\0" "PT154" "\0" 285 /*"ISCII-DEV" "\0" "?" "\0"*/ 286 "*" "\0" "UTF-8" "\0"; 287 # endif 288 289 # if defined VMS 290 /* To avoid the troubles of an extra file charset.alias_vms in the 291 sources of many GNU packages, simply inline the aliases here. */ 292 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation 293 "Compaq C Run-Time Library Reference Manual for OpenVMS systems" 294 section 10.7 "Handling Different Character Sets". */ 295 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 296 "ISO8859-2" "\0" "ISO-8859-2" "\0" 297 "ISO8859-5" "\0" "ISO-8859-5" "\0" 298 "ISO8859-7" "\0" "ISO-8859-7" "\0" 299 "ISO8859-8" "\0" "ISO-8859-8" "\0" 300 "ISO8859-9" "\0" "ISO-8859-9" "\0" 301 /* Japanese */ 302 "eucJP" "\0" "EUC-JP" "\0" 303 "SJIS" "\0" "SHIFT_JIS" "\0" 304 "DECKANJI" "\0" "DEC-KANJI" "\0" 305 "SDECKANJI" "\0" "EUC-JP" "\0" 306 /* Chinese */ 307 "eucTW" "\0" "EUC-TW" "\0" 308 "DECHANYU" "\0" "DEC-HANYU" "\0" 309 "DECHANZI" "\0" "GB2312" "\0" 310 /* Korean */ 311 "DECKOREAN" "\0" "EUC-KR" "\0"; 312 # endif 313 314 # if defined WIN32_NATIVE || defined __CYGWIN__ 315 /* To avoid the troubles of installing a separate file in the same 316 directory as the DLL and of retrieving the DLL's directory at 317 runtime, simply inline the aliases here. */ 318 319 cp = "CP936" "\0" "GBK" "\0" 320 "CP1361" "\0" "JOHAB" "\0" 321 "CP20127" "\0" "ASCII" "\0" 322 "CP20866" "\0" "KOI8-R" "\0" 323 "CP20936" "\0" "GB2312" "\0" 324 "CP21866" "\0" "KOI8-RU" "\0" 325 "CP28591" "\0" "ISO-8859-1" "\0" 326 "CP28592" "\0" "ISO-8859-2" "\0" 327 "CP28593" "\0" "ISO-8859-3" "\0" 328 "CP28594" "\0" "ISO-8859-4" "\0" 329 "CP28595" "\0" "ISO-8859-5" "\0" 330 "CP28596" "\0" "ISO-8859-6" "\0" 331 "CP28597" "\0" "ISO-8859-7" "\0" 332 "CP28598" "\0" "ISO-8859-8" "\0" 333 "CP28599" "\0" "ISO-8859-9" "\0" 334 "CP28605" "\0" "ISO-8859-15" "\0" 335 "CP38598" "\0" "ISO-8859-8" "\0" 336 "CP51932" "\0" "EUC-JP" "\0" 337 "CP51936" "\0" "GB2312" "\0" 338 "CP51949" "\0" "EUC-KR" "\0" 339 "CP51950" "\0" "EUC-TW" "\0" 340 "CP54936" "\0" "GB18030" "\0" 341 "CP65001" "\0" "UTF-8" "\0"; 342 # endif 343 #endif 344 345 charset_aliases = cp; 346 } 347 348 return cp; 349 } 350 351 /* Determine the current locale's character encoding, and canonicalize it 352 into one of the canonical names listed in config.charset. 353 The result must not be freed; it is statically allocated. 354 If the canonical name cannot be determined, the result is a non-canonical 355 name. */ 356 357 #ifdef STATIC 358 STATIC 359 #endif 360 const char * 361 locale_charset (void) 362 { 363 const char *codeset; 364 const char *aliases; 365 366 #if !(defined WIN32_NATIVE || defined OS2) 367 368 # if HAVE_LANGINFO_CODESET 369 370 /* Most systems support nl_langinfo (CODESET) nowadays. */ 371 codeset = nl_langinfo (CODESET); 372 373 # ifdef __CYGWIN__ 374 /* Cygwin 1.5.x does not have locales. nl_langinfo (CODESET) always 375 returns "US-ASCII". As long as this is not fixed, return the suffix 376 of the locale name from the environment variables (if present) or 377 the codepage as a number. */ 378 if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0) 379 { 380 const char *locale; 381 static char buf[2 + 10 + 1]; 382 383 locale = getenv ("LC_ALL"); 384 if (locale == NULL || locale[0] == '\0') 385 { 386 locale = getenv ("LC_CTYPE"); 387 if (locale == NULL || locale[0] == '\0') 388 locale = getenv ("LANG"); 389 } 390 if (locale != NULL && locale[0] != '\0') 391 { 392 /* If the locale name contains an encoding after the dot, return 393 it. */ 394 const char *dot = strchr (locale, '.'); 395 396 if (dot != NULL) 397 { 398 const char *modifier; 399 400 dot++; 401 /* Look for the possible @... trailer and remove it, if any. */ 402 modifier = strchr (dot, '@'); 403 if (modifier == NULL) 404 return dot; 405 if (modifier - dot < sizeof (buf)) 406 { 407 memcpy (buf, dot, modifier - dot); 408 buf [modifier - dot] = '\0'; 409 return buf; 410 } 411 } 412 } 413 414 /* Woe32 has a function returning the locale's codepage as a number: 415 GetACP(). This encoding is used by Cygwin, unless the user has set 416 the environment variable CYGWIN=codepage:oem (which very few people 417 do). 418 Output directed to console windows needs to be converted (to 419 GetOEMCP() if the console is using a raster font, or to 420 GetConsoleOutputCP() if it is using a TrueType font). Cygwin does 421 this conversion transparently (see winsup/cygwin/fhandler_console.cc), 422 converting to GetConsoleOutputCP(). This leads to correct results, 423 except when SetConsoleOutputCP has been called and a raster font is 424 in use. */ 425 sprintf (buf, "CP%u", GetACP ()); 426 codeset = buf; 427 } 428 # endif 429 430 # else 431 432 /* On old systems which lack it, use setlocale or getenv. */ 433 const char *locale = NULL; 434 435 /* But most old systems don't have a complete set of locales. Some 436 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't 437 use setlocale here; it would return "C" when it doesn't support the 438 locale name the user has set. */ 439 # if 0 440 locale = setlocale (LC_CTYPE, NULL); 441 # endif 442 if (locale == NULL || locale[0] == '\0') 443 { 444 locale = getenv ("LC_ALL"); 445 if (locale == NULL || locale[0] == '\0') 446 { 447 locale = getenv ("LC_CTYPE"); 448 if (locale == NULL || locale[0] == '\0') 449 locale = getenv ("LANG"); 450 } 451 } 452 453 /* On some old systems, one used to set locale = "iso8859_1". On others, 454 you set it to "language_COUNTRY.charset". In any case, we resolve it 455 through the charset.alias file. */ 456 codeset = locale; 457 458 # endif 459 460 #elif defined WIN32_NATIVE 461 462 static char buf[2 + 10 + 1]; 463 464 /* Woe32 has a function returning the locale's codepage as a number: 465 GetACP(). 466 When the output goes to a console window, it needs to be provided in 467 GetOEMCP() encoding if the console is using a raster font, or in 468 GetConsoleOutputCP() encoding if it is using a TrueType font. 469 But in GUI programs and for output sent to files and pipes, GetACP() 470 encoding is the best bet. */ 471 sprintf (buf, "CP%u", GetACP ()); 472 codeset = buf; 473 474 #elif defined OS2 475 476 const char *locale; 477 static char buf[2 + 10 + 1]; 478 ULONG cp[3]; 479 ULONG cplen; 480 481 /* Allow user to override the codeset, as set in the operating system, 482 with standard language environment variables. */ 483 locale = getenv ("LC_ALL"); 484 if (locale == NULL || locale[0] == '\0') 485 { 486 locale = getenv ("LC_CTYPE"); 487 if (locale == NULL || locale[0] == '\0') 488 locale = getenv ("LANG"); 489 } 490 if (locale != NULL && locale[0] != '\0') 491 { 492 /* If the locale name contains an encoding after the dot, return it. */ 493 const char *dot = strchr (locale, '.'); 494 495 if (dot != NULL) 496 { 497 const char *modifier; 498 499 dot++; 500 /* Look for the possible @... trailer and remove it, if any. */ 501 modifier = strchr (dot, '@'); 502 if (modifier == NULL) 503 return dot; 504 if (modifier - dot < sizeof (buf)) 505 { 506 memcpy (buf, dot, modifier - dot); 507 buf [modifier - dot] = '\0'; 508 return buf; 509 } 510 } 511 512 /* Resolve through the charset.alias file. */ 513 codeset = locale; 514 } 515 else 516 { 517 /* OS/2 has a function returning the locale's codepage as a number. */ 518 if (DosQueryCp (sizeof (cp), cp, &cplen)) 519 codeset = ""; 520 else 521 { 522 sprintf (buf, "CP%u", cp[0]); 523 codeset = buf; 524 } 525 } 526 527 #endif 528 529 if (codeset == NULL) 530 /* The canonical name cannot be determined. */ 531 codeset = ""; 532 533 /* Resolve alias. */ 534 for (aliases = get_charset_aliases (); 535 *aliases != '\0'; 536 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) 537 if (strcmp (codeset, aliases) == 0 538 || (aliases[0] == '*' && aliases[1] == '\0')) 539 { 540 codeset = aliases + strlen (aliases) + 1; 541 break; 542 } 543 544 /* Don't return an empty string. GNU libc and GNU libiconv interpret 545 the empty string as denoting "the locale's character encoding", 546 thus GNU libiconv would call this function a second time. */ 547 if (codeset[0] == '\0') 548 codeset = "ASCII"; 549 550 return codeset; 551 } 552