1 /* locale.c 2 * 3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others 5 * 6 * You may distribute under the terms of either the GNU General Public 7 * License or the Artistic License, as specified in the README file. 8 * 9 */ 10 11 /* 12 * A Elbereth Gilthoniel, 13 * silivren penna míriel 14 * o menel aglar elenath! 15 * Na-chaered palan-díriel 16 * o galadhremmin ennorath, 17 * Fanuilos, le linnathon 18 * nef aear, si nef aearon! 19 * 20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"] 21 */ 22 23 /* utility functions for handling locale-specific stuff like what 24 * character represents the decimal point. 25 * 26 * All C programs have an underlying locale. Perl code generally doesn't pay 27 * any attention to it except within the scope of a 'use locale'. For most 28 * categories, it accomplishes this by just using different operations if it is 29 * in such scope than if not. However, various libc functions called by Perl 30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that 31 * are used to toggle between the current locale and the C locale depending on 32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is 33 * switched to the C locale for outputting the message unless within the scope 34 * of 'use locale'. 35 * 36 * There is more than the typical amount of variation between platforms with 37 * regard to locale handling. At the end of these introductory comments, are 38 * listed various relevent Configuration options, including some that can be 39 * used to pretend to some extent that this is being developed on a different 40 * platform than it actually is. This allows you to make changes and catch 41 * some errors without having access to those other platforms. 42 * 43 * This code now has multi-thread-safe locale handling on systems that support 44 * that. This is completely transparent to most XS code. On earlier systems, 45 * it would be possible to emulate thread-safe locales, but this likely would 46 * involve a lot of locale switching, and would require XS code changes. 47 * Macros could be written so that the code wouldn't have to know which type of 48 * system is being used. 49 * 50 * Table-driven code is used for simplicity and clarity, as many operations 51 * differ only in which category is being worked on. However the system 52 * categories need not be small contiguous integers, so do not lend themselves 53 * to table lookup. Instead we have created our own equivalent values which 54 * are all small contiguous non-negative integers, and translation functions 55 * between the two sets. For category 'LC_foo', the name of our index is 56 * LC_foo_INDEX_. Various parallel tables, indexed by these, are used for the 57 * translation. The tables are generated at compile-time based on platform 58 * characteristics and Configure options. They hide from the code many of the 59 * vagaries of the different locale implementations out there. 60 * 61 * On unthreaded perls, most operations expand out to just the basic 62 * setlocale() calls. That sort of is true on threaded perls on modern Windows 63 * systems where the same API, after set up, is used for thread-safe locale 64 * handling. (But there are complications on Windows due to internal character 65 * set issues.) On other systems, there is a completely different API, 66 * specified in POSIX 2008, to do thread-safe locales. On these systems, our 67 * bool_setlocale_2008_i() function is used to hide the different API from the 68 * outside. This makes it completely transparent to most XS code. 69 * 70 * A huge complicating factor is that the LC_NUMERIC category is normally held 71 * in the C locale, except during those relatively rare times when it needs to 72 * be in the underlying locale. There is a bunch of code to accomplish this, 73 * and to allow easy switches from one state to the other. 74 * 75 * In addition, the setlocale equivalents have versions for the return context, 76 * 'void' and 'bool', besides the full return value. This can present 77 * opportunities for avoiding work. We don't have to necessarily create a safe 78 * copy to return if no return is desired. 79 * 80 * There are 3.5 major implementations here; which one chosen depends on what 81 * the platform has available, and Configuration options. 82 * 83 * 1) Raw posix_setlocale(). This implementation is basically the libc 84 * setlocale(), with possibly minor tweaks. This is used for startup, and 85 * always for unthreaded perls, and when the API for safe locale threading 86 * is identical to the unsafe API (Windows, currently). 87 * 88 * This implementation is composed of two layers: 89 * a) posix_setlocale() implements the libc setlocale(). In most cases, 90 * it is just an alias for the libc version. But Windows doesn't 91 * fully conform to the POSIX standard, and this is a layer on top of 92 * libc to bring it more into conformance. And in Configurations 93 * where perl is to ignore some locale categories that the libc 94 * setlocale() knows about, there is a layer to cope with that. 95 * b) stdized_setlocale() is a layer above a) that fixes some vagaries in 96 * the return value of the libc setlocale(). On most platforms this 97 * layer is empty; in order to be activated, it requires perl to be 98 * Configured with a parameter indicating the platform's defect. The 99 * current ones are listed at the definition of the macro. 100 * 101 * 2) An implementation that adds a minimal layer above implementation 1), 102 * making that implementation uninterruptible and returning a 103 * per-thread/per-category value. 104 * 105 * 3a and 3b) An implementation of POSIX 2008 thread-safe locale handling, 106 * hiding from the programmer the completely different API for this. 107 * This automatically makes almost all code thread-safe without need for 108 * changes. This implementation is chosen on threaded perls when the 109 * platform properly supports the POSIX 2008 functions, and when there is no 110 * manual override to the contrary passed to Configure. 111 * 112 * 3a) is when the platform has a documented reliable querylocale() function 113 * or equivalent that is selected to be used. 114 * 3b) is when we have to emulate that functionality. 115 * 116 * Unfortunately, it seems that some platforms that claim to support these 117 * are buggy, in one way or another. There are workarounds encoded here, 118 * where feasible, for platforms where the bugs are amenable to that 119 * (glibc, for example). But other platforms instead don't use this 120 * implementation. 121 * 122 * z/OS (os390) is an outlier. Locales really don't work under threads when 123 * either the radix character isn't a dot, or attempts are made to change 124 * locales after the first thread is created. The reason is that IBM has made 125 * it thread-safe by refusing to change locales (returning failure if 126 * attempted) any time after an application has called pthread_create() to 127 * create another thread. The expectation is that an application will set up 128 * its locale information before the first fork, and be stable thereafter. But 129 * perl toggles LC_NUMERIC if the locale's radix character isn't a dot, as do 130 * the other toggles, which are less common. 131 * 132 * Associated with each implementation are three sets of macros that translate 133 * a consistent API into what that implementation needs. Each set consists of 134 * three macros with the suffixes: 135 * _c Means the argument is a locale category number known at compile time. 136 * An example would be LC_TIME. This token is a compile-time constant 137 * and can be passed to a '_c' macro. 138 * _r Means the argument is a locale category number whose value might not be 139 * known until runtime 140 * _i Means the argument is our internal index of a locale category 141 * 142 * The three sets are: ('_X' means one of '_c', '_r', '_i') 143 * 1) bool_setlocale_X() 144 * This calls the appropriate setlocale()-equivalent for the 145 * implementation, with the category and new locale. The input locale is 146 * not necessarily valid, so the return is true or false depending on 147 * whether or not the setlocale() succeeded. This is not used for 148 * querying the locale, so the input locale must not be NULL. 149 * 150 * This macro is suitable for toggling the locale back and forth during an 151 * operation. For example, the names of days and months under LC_TIME are 152 * strings that are also subject to LC_CTYPE. If the locales of these two 153 * categories differ, mojibake can result on many platforms. The code 154 * here will toggle LC_CTYPE into the locale of LC_TIME temporarily to 155 * avoid this. 156 * 157 * Several categories require extra work when their locale is changed. 158 * LC_CTYPE, for example, requires the calculation of the table of which 159 * characters fold to which others under /i pattern matching or fc(), as 160 * folding is not a concept in POSIX. This table isn't needed when the 161 * LC_CTYPE locale gets toggled during an operation, and will be toggled 162 * back before return to the caller. To save work that would be 163 * discarded, the bool_setlocale_X() implementations don't do this extra 164 * work. Instead, there is a separate function for just this purpose to 165 * be done before control is transferred back to the external caller. All 166 * categories that have such requirements have such a function. The 167 * update_functions[] array contains pointers to them (or NULL for 168 * categories which don't need a function). 169 * 170 * Care must be taken to remember to call the separate function before 171 * returning to an external caller, and to not use things it updates 172 * before its call. An alternative approach would be to have 173 * bool_setlocale_X() always call the update, which would return 174 * immediately if a flag wasn't set indicating it was time to actually 175 * perform it. 176 * 177 * 2) void_setlocale_X() 178 * This is like bool_setlocale_X(), but it is used only when it is 179 * expected that the call must succeed, or something is seriously wrong. 180 * A panic is issued if it fails. The caller uses this form when it just 181 * wants to assume things worked. 182 * 183 * 3) querylocale_X() 184 * This returns a string that specifies the current locale for the given 185 * category given by the input argument. The string is safe from other 186 * threads zapping it, and the caller need not worry about freeing it, but 187 * it may be mortalized, so must be copied if you need to preserve it 188 * across calls, or long term. This returns the actual current locale, 189 * not the nominal. These differ, for example, when LC_NUMERIC is 190 * supposed to be a locale whose decimal radix character is a comma. As 191 * mentioned above, Perl actually keeps this category set to C in such 192 * circumstances so that XS code can just assume a dot radix character. 193 * querylocale_X() returns the locale that libc has stored at this moment, 194 * so most of the time will return a locale whose radix character is a 195 * dot. The macro query_nominal_locale_i() can be used to get the nominal 196 * locale that an external caller would expect, for all categories except 197 * LC_ALL. For that, you can use the function 198 * S_calculate_LC_ALL_string(). Or S_native_querylocale_i() will operate 199 * on any category. 200 * 201 * The underlying C API that this implements uses category numbers, hence the 202 * code is structured to use '_r' at the API level to convert to indexes, which 203 * are then used internally with the '_i' forms. 204 * 205 * The splitting apart into setting vs querying means that the return value of 206 * the bool macros is not subject to potential clashes with other threads, 207 * eliminating any need for the calling code to worry about that and get it 208 * wrong. Whereas, you do have to think about thread interactions when using a 209 * query. 210 * 211 * Additionally, for the implementations where there aren't any complications, 212 * a setlocale_i() is defined that is like plain setlocale(), returning the new 213 * locale. Thus it combines a bool_setlocale_X() with a querylocale_X(). It 214 * is used only for performance on implementations that allow it, such as 215 * non-threaded perls. 216 * 217 * There are also a few other macros herein that use this naming convention to 218 * describe their category parameter. 219 * 220 * Relevant Configure options 221 * 222 * -Accflags=-DNO_LOCALE 223 * This compiles perl to always use the C locale, ignoring any 224 * attempts to change it. This could be useful on platforms with a 225 * crippled locale implementation. 226 * 227 * -Accflags=-DNO_THREAD_SAFE_LOCALE 228 * Even if thread-safe operations are available on this platform and 229 * would otherwise be used (because this is a perl with multiplicity), 230 * perl is compiled to not use them. This could be useful on 231 * platforms where the libc is buggy. 232 * 233 * -Accflags=-DNO_POSIX_2008_LOCALE 234 * Even if the libc locale operations specified by the Posix 2008 235 * Standard are available on this platform and would otherwise be used 236 * (because this is a threaded perl), perl is compiled to not use 237 * them. This could be useful on platforms where the libc is buggy. 238 * This is like NO_THREAD_SAFE_LOCALE, but has no effect on platforms 239 * that don't have these functions. 240 * 241 * -Accflags=-DUSE_POSIX_2008_LOCALE 242 * Normally, setlocale() is used for locale operations on perls 243 * compiled without threads. This option causes the locale operations 244 * defined by the Posix 2008 Standard to always be used instead. This 245 * could be useful on platforms where the libc setlocale() is buggy. 246 * 247 * -Accflags=-DNO_THREAD_SAFE_QUERYLOCALE 248 * This applies only to platforms that have a querylocale() libc 249 * function. perl assumes that that function is thread-safe, unless 250 * overridden by this, typically in a hints file. When overridden, 251 * querylocale() is called only while the locale mutex is locked, and 252 * the result is copied to a per-thread place before unlocking. 253 * 254 * -Accflags=-DNO_USE_NL_LOCALE_NAME 255 * glibc has an undocumented equivalent function to querylocale(), 256 * which our experience indicates is reliable. But you can forbid its 257 * use by specifying this Configure option (with no effect on systems 258 * lacking it). When this is function is enabled, it removes the need 259 * for perl to keep its own records, hence is more efficient and 260 * guaranteed to be accurate. 261 * 262 * -Accflags=-DNO_LOCALE_CTYPE 263 * -Accflags=-DNO_LOCALE_NUMERIC 264 * etc. 265 * 266 * If the named category(ies) does(do) not exist on this platform, 267 * these have no effect. Otherwise they cause perl to be compiled to 268 * always keep the named category(ies) in the C locale. 269 * 270 * -Accflags=-DHAS_BROKEN_SETLOCALE_QUERY_LC_ALL 271 * This would be set in a hints file to tell perl that doing a libc 272 * setlocale(LC_ALL, NULL) 273 * can give erroneous results, and perl will compensate to get the 274 * correct results. This is known to be a problem in earlier AIX 275 * versions 276 * 277 * -Accflags=-DHAS_LF_IN_SETLOCALE_RETURN 278 * This would be set in a hints file to tell perl that a libc 279 * setlocale() can return results containing \n characters that need 280 * to be stripped off. khw believes there aren't any such platforms 281 * still in existence. 282 * 283 * -Accflags=-DLIBC_HANDLES_MISMATCHED_CTYPE 284 * Consider the name of a month in some language, Chinese for example. 285 * If LC_TIME has been set to a Chinese locale, strftime() can be used 286 * to generate the Chinese month name for any given date, by using the 287 * %B format. But also suppose that LC_CTYPE is set to, say, "C". 288 * The return from strftime() on many platforms will be mojibake given 289 * that no Chinese month name is composed of just ASCII characters. 290 * Perl handles this for you by automatically toggling LC_CTYPE to 291 * whatever LC_TIME is during the execution of strftime(), and 292 * afterwards restoring it to its prior value. But the strftime() 293 * (and similar functions) in some libc implementations already do 294 * this toggle, meaning perl's action is redundant. You can tell perl 295 * that a libc does this by setting this Configure option, and it will 296 * skip its syncing LC_CTYPE and whatever the other locale is. 297 * Currently, perl ignores this Configuration option and syncs anyway 298 * for LC_COLLATE-related operations, due to perl's internal needs. 299 * 300 * -Accflags=USE_FAKE_LC_ALL_POSITIONAL_NOTATION 301 * This is used when developing Perl on a platform that uses 302 * 'name=value;' notation to represent LC_ALL when not all categories 303 * are the same. When so compiled, much of the code gets compiled 304 * and exercised that applies to platforms that instead use positional 305 * notation. This allows for finding many bugs in that portion of the 306 * implementation, without having to access such a platform. 307 * 308 * -Accflags=-DWIN32_USE_FAKE_OLD_MINGW_LOCALES 309 * This is used when developing Perl on a non-Windows platform to 310 * compile and exercise much of the locale-related code that instead 311 * applies to MingW platforms that don't use the more modern UCRT 312 * library. This allows for finding many bugs in that portion of the 313 * implementation, without having to access such a platform. 314 */ 315 316 /* If the environment says to, we can output debugging information during 317 * initialization. This is done before option parsing, and before any thread 318 * creation, so can be a file-level static. (Must come before #including 319 * perl.h) */ 320 #include "config.h" 321 322 /* Returns the Unix errno portion; ignoring any others. This is a macro here 323 * instead of putting it into perl.h, because unclear to khw what should be 324 * done generally. */ 325 #define GET_ERRNO saved_errno 326 327 #ifdef DEBUGGING 328 static int debug_initialization = 0; 329 # define DEBUG_INITIALIZATION_set(v) (debug_initialization = v) 330 # define DEBUG_LOCALE_INITIALIZATION_ debug_initialization 331 332 # ifdef HAS_EXTENDED_OS_ERRNO 333 /* Output the non-zero errno and/or the non-zero extended errno */ 334 # define DEBUG_ERRNO \ 335 dSAVE_ERRNO; dTHX; \ 336 int extended = get_extended_os_errno(); \ 337 const char * errno_string; \ 338 if (GET_ERRNO == 0) { /* Skip output if both errno types are 0 */ \ 339 if (LIKELY(extended == 0)) errno_string = ""; \ 340 else errno_string = Perl_form(aTHX_ "; $^E=%d", extended); \ 341 } \ 342 else if (LIKELY(extended == GET_ERRNO)) \ 343 errno_string = Perl_form(aTHX_ "; $!=%d", GET_ERRNO); \ 344 else errno_string = Perl_form(aTHX_ "; $!=%d, $^E=%d", \ 345 GET_ERRNO, extended); 346 # else 347 /* Output the errno, if non-zero */ 348 # define DEBUG_ERRNO \ 349 dSAVE_ERRNO; \ 350 const char * errno_string = ""; \ 351 if (GET_ERRNO != 0) { \ 352 dTHX; \ 353 errno_string = Perl_form(aTHX_ "; $!=%d", GET_ERRNO); \ 354 } 355 # endif 356 357 /* Automatically include the caller's file, and line number in debugging output; 358 * and the errno (and/or extended errno) if non-zero. On threaded perls add 359 * the aTHX too. */ 360 # if defined(MULTIPLICITY) && ! defined(NO_LOCALE_THREADS) 361 # define DEBUG_PRE_STMTS \ 362 DEBUG_ERRNO; \ 363 PerlIO_printf(Perl_debug_log, "\n%s: %" LINE_Tf ": 0x%p%s: ", \ 364 __FILE__, (line_t)__LINE__, aTHX_ \ 365 errno_string); 366 # else 367 # define DEBUG_PRE_STMTS \ 368 DEBUG_ERRNO; \ 369 PerlIO_printf(Perl_debug_log, "\n%s: %" LINE_Tf "%s: ", \ 370 __FILE__, (line_t)__LINE__, \ 371 errno_string); 372 # endif 373 # define DEBUG_POST_STMTS RESTORE_ERRNO; 374 #else 375 # define debug_initialization 0 376 # define DEBUG_INITIALIZATION_set(v) 377 # define DEBUG_PRE_STMTS 378 # define DEBUG_POST_STMTS 379 #endif 380 381 #include "EXTERN.h" 382 #define PERL_IN_LOCALE_C 383 #include "perl.h" 384 385 /* Some platforms require LC_CTYPE to be congruent with the category we are 386 * looking for. XXX This still presumes that we have to match COLLATE and 387 * CTYPE even on platforms that apparently handle this. */ 388 #if defined(USE_LOCALE_CTYPE) && ! defined(LIBC_HANDLES_MISMATCHED_CTYPE) 389 # define WE_MUST_DEAL_WITH_MISMATCHED_CTYPE /* no longer used; kept for 390 possible future use */ 391 # define start_DEALING_WITH_MISMATCHED_CTYPE(locale) \ 392 const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale) 393 # define end_DEALING_WITH_MISMATCHED_CTYPE(locale) \ 394 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); 395 #else 396 # define start_DEALING_WITH_MISMATCHED_CTYPE(locale) 397 # define end_DEALING_WITH_MISMATCHED_CTYPE(locale) 398 #endif 399 400 #ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES 401 402 /* Use -Accflags=-DWIN32_USE_FAKE_OLD_MINGW_LOCALES on a POSIX or *nix box 403 * to get a semblance of pretending the locale handling is that of a MingW 404 * that doesn't use UCRT (hence 'OLD' in the name). This exercizes code 405 * paths that are not compiled on non-Windows boxes, and allows for ASAN 406 * and PERL_MEMLOG. This is thus a way to see if locale.c on Windows is 407 * likely going to compile, without having to use a real Win32 box. And 408 * running the test suite will verify to a large extent our logic and memory 409 * allocation handling for such boxes. Of course the underlying calls are 410 * to the POSIX libc, so any differences in implementation between those and 411 * the Windows versions will not be caught by this. */ 412 413 # define WIN32 414 # undef P_CS_PRECEDES 415 # undef CURRENCY_SYMBOL 416 # define CP_UTF8 -1 417 # undef _configthreadlocale 418 # define _configthreadlocale(arg) NOOP 419 420 # define MultiByteToWideChar(cp, flags, byte_string, m1, wstring, req_size) \ 421 (PERL_UNUSED_ARG(cp), \ 422 mbsrtowcs(wstring, &(byte_string), req_size, NULL) + 1) 423 # define WideCharToMultiByte(cp, flags, wstring, m1, byte_string, \ 424 req_size, default_char, found_default_char) \ 425 (PERL_UNUSED_ARG(cp), \ 426 wcsrtombs(byte_string, &(wstring), req_size, NULL) + 1) 427 428 # ifdef USE_LOCALE 429 430 static const wchar_t * wsetlocale_buf = NULL; 431 static Size_t wsetlocale_buf_size = 0; 432 433 # ifdef MULTIPLICITY 434 435 static PerlInterpreter * wsetlocale_buf_aTHX = NULL; 436 437 # endif 438 439 STATIC 440 const wchar_t * 441 S_wsetlocale(const int category, const wchar_t * wlocale) 442 { 443 /* Windows uses a setlocale that takes a wchar_t* locale. Other boxes 444 * don't have this, so this Windows replacement converts the wchar_t input 445 * to plain 'char*', calls plain setlocale(), and converts the result back 446 * to 'wchar_t*' */ 447 448 const char * byte_locale = NULL; 449 if (wlocale) { 450 byte_locale = Win_wstring_to_byte_string(CP_UTF8, wlocale); 451 } 452 453 const char * byte_result = setlocale(category, byte_locale); 454 Safefree(byte_locale); 455 if (byte_result == NULL) { 456 return NULL; 457 } 458 459 const wchar_t * wresult = Win_byte_string_to_wstring(CP_UTF8, byte_result); 460 461 if (! wresult) { 462 return NULL; 463 } 464 465 /* Emulate a global static memory return from wsetlocale(). This currently 466 * leaks at process end; would require changing LOCALE_TERM to fix that */ 467 Size_t string_size = wcslen(wresult) + 1; 468 469 if (wsetlocale_buf_size == 0) { 470 Newx(wsetlocale_buf, string_size, wchar_t); 471 wsetlocale_buf_size = string_size; 472 473 # ifdef MULTIPLICITY 474 475 dTHX; 476 wsetlocale_buf_aTHX = aTHX; 477 478 # endif 479 480 } 481 else if (string_size > wsetlocale_buf_size) { 482 Renew(wsetlocale_buf, string_size, wchar_t); 483 wsetlocale_buf_size = string_size; 484 } 485 486 Copy(wresult, wsetlocale_buf, string_size, wchar_t); 487 Safefree(wresult); 488 489 return wsetlocale_buf; 490 } 491 492 # define _wsetlocale(category, wlocale) S_wsetlocale(category, wlocale) 493 # endif 494 #endif /* WIN32_USE_FAKE_OLD_MINGW_LOCALES */ 495 496 /* 'for' loop headers to hide the necessary casts */ 497 #define for_category_indexes_between(i, m, n) \ 498 for (locale_category_index i = (locale_category_index) (m); \ 499 i <= (locale_category_index) (n); \ 500 i = (locale_category_index) ((int) i + 1)) 501 #define for_all_individual_category_indexes(i) \ 502 for_category_indexes_between(i, 0, LC_ALL_INDEX_ - 1) 503 #define for_all_but_0th_individual_category_indexes(i) \ 504 for_category_indexes_between(i, 1, LC_ALL_INDEX_ - 1) 505 #define for_all_category_indexes(i) \ 506 for_category_indexes_between(i, 0, LC_ALL_INDEX_) 507 508 #ifdef USE_LOCALE 509 # if defined(USE_FAKE_LC_ALL_POSITIONAL_NOTATION) && defined(LC_ALL) 510 511 /* This simulates an underlying positional notation for LC_ALL when compiled on 512 * a system that uses name=value notation. Use this to develop on Linux and 513 * make a quick check that things have some chance of working on a positional 514 * box. Enable by adding to the Congfigure parameters: 515 * -Accflags=USE_FAKE_LC_ALL_POSITIONAL_NOTATION 516 * 517 * NOTE it redefines setlocale() and usequerylocale() 518 * */ 519 520 STATIC const char * 521 S_positional_name_value_xlation(const char * locale, bool direction) 522 { /* direction == 1 is from name=value to positional 523 direction == 0 is from positional to name=value */ 524 assert(locale); 525 526 dTHX; 527 const char * individ_locales[LC_ALL_INDEX_] = { NULL }; 528 529 /* This parses either notation */ 530 switch (parse_LC_ALL_string(locale, 531 (const char **) &individ_locales, 532 no_override, /* Handled by other code */ 533 false, /* Return only [0] if suffices */ 534 false, /* Don't panic on error */ 535 __LINE__)) 536 { 537 default: /* Some compilers don't realize that below is the complete 538 list of the available enum values */ 539 case invalid: 540 return NULL; 541 542 case no_array: 543 return locale; 544 case only_element_0: 545 SAVEFREEPV(individ_locales[0]); 546 return individ_locales[0]; 547 case full_array: 548 { 549 calc_LC_ALL_format format = (direction) 550 ? EXTERNAL_FORMAT_FOR_SET 551 : INTERNAL_FORMAT; 552 const char * retval = calculate_LC_ALL_string(individ_locales, 553 format, 554 WANT_TEMP_PV, 555 __LINE__); 556 557 for_all_individual_category_indexes(i) { 558 Safefree(individ_locales[i]); 559 } 560 561 return retval; 562 } 563 } 564 } 565 566 STATIC const char * 567 S_positional_setlocale(int cat, const char * locale) 568 { 569 if (cat != LC_ALL) return setlocale(cat, locale); 570 571 if (locale && strNE(locale, "")) { 572 locale = S_positional_name_value_xlation(locale, 0); 573 if (! locale) return NULL; 574 } 575 576 locale = setlocale(cat, locale); 577 if (locale == NULL) return NULL; 578 return S_positional_name_value_xlation(locale, 1); 579 } 580 581 # undef setlocale 582 # define setlocale(a,b) S_positional_setlocale(a,b) 583 # ifdef USE_POSIX_2008_LOCALE 584 585 STATIC locale_t 586 S_positional_newlocale(int mask, const char * locale, locale_t base) 587 { 588 assert(locale); 589 590 if (mask != LC_ALL_MASK) return newlocale(mask, locale, base); 591 592 if (strNE(locale, "")) locale = S_positional_name_value_xlation(locale, 0); 593 if (locale == NULL) return NULL; 594 return newlocale(LC_ALL_MASK, locale, base); 595 } 596 597 # undef newlocale 598 # define newlocale(a,b,c) S_positional_newlocale(a,b,c) 599 # endif 600 # endif 601 #endif /* End of fake positional notation */ 602 603 #include "reentr.h" 604 605 #ifdef I_WCHAR 606 # include <wchar.h> 607 #endif 608 #ifdef I_WCTYPE 609 # include <wctype.h> 610 #endif 611 612 /* The main errno that gets used is this one, on platforms that support it */ 613 #ifdef EINVAL 614 # define SET_EINVAL SETERRNO(EINVAL, LIB_INVARG) 615 #else 616 # define SET_EINVAL 617 #endif 618 619 /* This is a starting guess as to when this is true. It definititely isn't 620 * true on *BSD where positional LC_ALL notation is used. Likely this will end 621 * up being defined in hints files. */ 622 #ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS 623 # define NEWLOCALE_HANDLES_DISPARATE_LC_ALL 624 #endif 625 626 /* But regardless, we have to look at individual categories if some are 627 * ignored. */ 628 #ifdef HAS_IGNORED_LOCALE_CATEGORIES_ 629 # undef NEWLOCALE_HANDLES_DISPARATE_LC_ALL 630 #endif 631 #ifdef USE_LOCALE 632 633 /* Not all categories need be set to the same locale. This macro determines if 634 * 'name' which represents LC_ALL is uniform or disparate. There are two 635 * situations: 1) the platform uses unordered name=value pairs; 2) the platform 636 * uses ordered positional values, with a separator string between them */ 637 # ifdef PERL_LC_ALL_SEPARATOR /* positional */ 638 # define is_disparate_LC_ALL(name) cBOOL(instr(name, PERL_LC_ALL_SEPARATOR)) 639 # else /* name=value */ 640 641 /* In the, hopefully never occurring, event that the platform doesn't use 642 * either mechanism for disparate LC_ALL's, assume the name=value pairs 643 * form, rather than taking the extreme step of refusing to compile. Many 644 * programs won't have disparate locales, so will generally work */ 645 # define PERL_LC_ALL_SEPARATOR ";" 646 # define is_disparate_LC_ALL(name) cBOOL( strchr(name, ';') \ 647 && strchr(name, '=')) 648 # endif 649 650 /* It is possible to compile perl to always keep any individual category in the 651 * C locale. This would be done where the implementation on a platform is 652 * flawed or incomplete. At the time of this writing, for example, OpenBSD has 653 * not implemented LC_COLLATE beyond the C locale. The 'category_available[]' 654 * table is a bool that says whether a category is changeable, or must be kept 655 * in C. This macro substitutes C for the locale appropriately, expanding to 656 * nothing on the more typical case where all possible categories present on 657 * the platform are handled. */ 658 # if defined(HAS_IGNORED_LOCALE_CATEGORIES_) \ 659 || defined(HAS_MISSING_LANGINFO_ITEM_) 660 # define need_to_override_category(i) (! category_available[i]) 661 # define override_ignored_category(i, new_locale) \ 662 ((need_to_override_category(i)) ? "C" : (new_locale)) 663 # else 664 # define need_to_override_category(i) 0 665 # define override_ignored_category(i, new_locale) (new_locale) 666 # endif 667 668 PERL_STATIC_INLINE const char * 669 S_mortalized_pv_copy(pTHX_ const char * const pv) 670 { 671 PERL_ARGS_ASSERT_MORTALIZED_PV_COPY; 672 673 /* Copies the input pv, and arranges for it to be freed at an unspecified 674 * later time. */ 675 676 if (pv == NULL) { 677 return NULL; 678 } 679 680 const char * copy = savepv(pv); 681 SAVEFREEPV(copy); 682 return copy; 683 } 684 685 #endif 686 687 /* Default values come from the C locale */ 688 #define C_codeset "ANSI_X3.4-1968" /* Only in some Configurations, and usually 689 a single instance, so is a #define */ 690 static const char C_decimal_point[] = "."; 691 692 #if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO) 693 # define HAS_SOME_LANGINFO 694 #endif 695 696 #if (defined(USE_LOCALE_NUMERIC) && ! defined(TS_W32_BROKEN_LOCALECONV)) \ 697 || ! ( defined(USE_LOCALE_NUMERIC) \ 698 && (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV))) 699 static const char C_thousands_sep[] = ""; 700 #endif 701 702 /* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the 703 * return of setlocale(), then this is extremely likely to be the C or POSIX 704 * locale. However, the output of setlocale() is documented to be opaque, but 705 * the odds are extremely small that it would return these two strings for some 706 * other locale. Note that VMS includes many non-ASCII characters in these two 707 * locales as controls and punctuation (below are hex bytes): 708 * cntrl: 84-97 9B-9F 709 * punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD 710 * Oddly, none there are listed as alphas, though some represent alphabetics 711 * https://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */ 712 #define isNAME_C_OR_POSIX(name) \ 713 ( (name) != NULL \ 714 && (( *(name) == 'C' && (*(name + 1)) == '\0') \ 715 || strEQ((name), "POSIX"))) 716 717 /* If this interface to nl_langinfo() isn't defined by embed.fnc, it means it 718 * isn't available on this platform, so instead emulate it */ 719 #ifndef langinfo_sv_i 720 # define langinfo_sv_i(i, c, l, s, u) \ 721 (PERL_UNUSED_VAR(c), emulate_langinfo(i, l, s, u)) 722 #endif 723 724 /* In either case, create a version that takes things like 'LC_NUMERIC' as a 725 * parameter */ 726 #define langinfo_sv_c(item, category, locale, sv, utf8ness) \ 727 langinfo_sv_i(item, category##_INDEX_, locale, sv, utf8ness) 728 729 /* The normal method for interfacing with nl_langinfo() in this file is to use 730 * a scratch buffer (whose existence is hidden from the caller by these 731 * macros). */ 732 #define langinfo_i(item, index, locale, utf8ness) \ 733 langinfo_sv_i(item, index, locale, PL_scratch_langinfo, utf8ness) 734 735 #define langinfo_c(item, category, locale, utf8ness) \ 736 langinfo_i(item, category##_INDEX_, locale, utf8ness) 737 738 #ifndef USE_LOCALE /* A no-op unless locales are enabled */ 739 # define toggle_locale_i(index, locale) \ 740 ((const char *) (PERL_UNUSED_VAR(locale), NULL)) 741 # define restore_toggled_locale_i(index, locale) PERL_UNUSED_VAR(locale) 742 #else 743 # define toggle_locale_i(index, locale) \ 744 S_toggle_locale_i(aTHX_ index, locale, __LINE__) 745 # define restore_toggled_locale_i(index, locale) \ 746 S_restore_toggled_locale_i(aTHX_ index, locale, __LINE__) 747 #endif 748 749 # define toggle_locale_c(cat, locale) toggle_locale_i(cat##_INDEX_, locale) 750 # define restore_toggled_locale_c(cat, locale) \ 751 restore_toggled_locale_i(cat##_INDEX_, locale) 752 #ifdef USE_LOCALE 753 # ifdef DEBUGGING 754 # define setlocale_debug_string_i(index, locale, result) \ 755 my_setlocale_debug_string_i(index, locale, result, __LINE__) 756 # define setlocale_debug_string_c(category, locale, result) \ 757 setlocale_debug_string_i(category##_INDEX_, locale, result) 758 # define setlocale_debug_string_r(category, locale, result) \ 759 setlocale_debug_string_i(get_category_index(category), \ 760 locale, result) 761 # endif 762 763 /* On systems without LC_ALL, pretending it exists anyway simplifies things. 764 * Choose a value for it that is very unlikely to clash with any actual 765 * category */ 766 # define FAKE_LC_ALL PERL_INT_MIN 767 768 /* Below are parallel arrays for locale information indexed by our mapping of 769 * category numbers into small non-negative indexes. locale_table.h contains 770 * an entry like this for each individual category used on this system: 771 * PERL_LOCALE_TABLE_ENTRY(CTYPE, S_new_ctype) 772 * 773 * Each array redefines PERL_LOCALE_TABLE_ENTRY to generate the information 774 * needed for that array, and #includes locale_table.h to get the valid 775 * categories. 776 * 777 * An entry for the conglomerate category LC_ALL is added here, immediately 778 * following the individual categories. (The treatment for it varies, so can't 779 * be in locale_table.h.) 780 * 781 * Following this, each array ends with an entry for illegal categories. All 782 * category numbers unknown to perl get mapped to this entry. This is likely 783 * to be a parameter error from the calling program; but it could be that this 784 * platform has a category we don't know about, in which case it needs to be 785 * added, using the paradigm of one of the existing categories. */ 786 787 /* The first array is the locale categories perl uses on this system, used to 788 * map our index back to the system's category number. */ 789 STATIC const int categories[] = { 790 791 # undef PERL_LOCALE_TABLE_ENTRY 792 # define PERL_LOCALE_TABLE_ENTRY(name, call_back) LC_ ## name, 793 # include "locale_table.h" 794 795 # ifdef LC_ALL 796 LC_ALL, 797 # else 798 FAKE_LC_ALL, 799 # endif 800 801 (FAKE_LC_ALL + 1) /* Entry for unknown category; this number is unlikely 802 to clash with a real category */ 803 }; 804 805 # define GET_NAME_AS_STRING(token) # token 806 # define GET_LC_NAME_AS_STRING(token) GET_NAME_AS_STRING(LC_ ## token) 807 808 /* The second array is the category names. */ 809 STATIC const char * const category_names[] = { 810 811 # undef PERL_LOCALE_TABLE_ENTRY 812 # define PERL_LOCALE_TABLE_ENTRY(name, call_back) GET_LC_NAME_AS_STRING(name), 813 # include "locale_table.h" 814 815 # ifdef LC_ALL 816 # define LC_ALL_STRING "LC_ALL" 817 # else 818 # define LC_ALL_STRING "If you see this, it is a bug in perl;" \ 819 " please report it via perlbug" 820 # endif 821 822 LC_ALL_STRING, 823 824 # define LC_UNKNOWN_STRING "Locale category unknown to Perl; if you see" \ 825 " this, it is a bug in perl; please report it" \ 826 " via perlbug" 827 LC_UNKNOWN_STRING 828 }; 829 830 STATIC const Size_t category_name_lengths[] = { 831 832 # undef PERL_LOCALE_TABLE_ENTRY 833 # define PERL_LOCALE_TABLE_ENTRY(name, call_back) \ 834 STRLENs(GET_LC_NAME_AS_STRING(name)), 835 # include "locale_table.h" 836 837 STRLENs(LC_ALL_STRING), 838 STRLENs(LC_UNKNOWN_STRING) 839 }; 840 841 /* Each entry includes space for the '=' and ';' */ 842 # undef PERL_LOCALE_TABLE_ENTRY 843 # define PERL_LOCALE_TABLE_ENTRY(name, call_back) \ 844 + STRLENs(GET_LC_NAME_AS_STRING(name)) + 2 845 846 STATIC const Size_t lc_all_boiler_plate_length = 1 /* space for trailing NUL */ 847 # include "locale_table.h" 848 ; 849 850 /* A few categories require additional setup when they are changed. This table 851 * points to the functions that do that setup */ 852 STATIC void (*update_functions[]) (pTHX_ const char *, bool force) = { 853 854 # undef PERL_LOCALE_TABLE_ENTRY 855 # define PERL_LOCALE_TABLE_ENTRY(name, call_back) call_back, 856 # include "locale_table.h" 857 858 S_new_LC_ALL, 859 NULL, /* No update for unknown category */ 860 }; 861 862 # if defined(HAS_IGNORED_LOCALE_CATEGORIES_) \ 863 || defined(HAS_MISSING_LANGINFO_ITEM_) 864 865 /* Indicates if each category on this platform is available to use not in 866 * the C locale */ 867 STATIC const bool category_available[] = { 868 869 # undef PERL_LOCALE_TABLE_ENTRY 870 # define PERL_LOCALE_TABLE_ENTRY(name, call_back) LC_ ## name ## _AVAIL_, 871 # include "locale_table.h" 872 873 # ifdef LC_ALL 874 true, 875 # else 876 false, 877 # endif 878 879 false /* LC_UNKNOWN_AVAIL_ */ 880 }; 881 882 # endif 883 # if defined(USE_POSIX_2008_LOCALE) 884 885 STATIC const int category_masks[] = { 886 887 # undef PERL_LOCALE_TABLE_ENTRY 888 # define PERL_LOCALE_TABLE_ENTRY(name, call_back) LC_ ## name ## _MASK, 889 # include "locale_table.h" 890 891 LC_ALL_MASK, /* Will rightly refuse to compile unless this is defined */ 892 0 /* Empty mask for unknown category */ 893 }; 894 895 # endif 896 # if ! defined(PERL_LC_ALL_USES_NAME_VALUE_PAIRS) 897 898 /* On platforms that use positional notation for expressing LC_ALL, this maps 899 * the position of each category to our corresponding internal index for it. 900 * This is initialized at run time if needed. LC_ALL_INDEX_ is not legal for 901 * an individual locale, hence marks the elements here as not actually 902 * initialized. */ 903 STATIC 904 unsigned int 905 map_LC_ALL_position_to_index[LC_ALL_INDEX_] = { LC_ALL_INDEX_ }; 906 907 # endif 908 #endif 909 #if defined(USE_LOCALE) || defined(DEBUGGING) 910 911 STATIC const char * 912 S_get_displayable_string(pTHX_ 913 const char * const s, 914 const char * const e, 915 const bool is_utf8) 916 { 917 PERL_ARGS_ASSERT_GET_DISPLAYABLE_STRING; 918 919 if (e <= s) { 920 return ""; 921 } 922 923 const char * t = s; 924 bool prev_was_printable = TRUE; 925 bool first_time = TRUE; 926 char * ret; 927 928 /* Worst case scenario: All are non-printable so have a blank between each. 929 * If UTF-8, all are the largest possible code point; otherwise all are a 930 * single byte. '(2 + 1)' is from each byte takes 2 characters to 931 * display, and a blank (or NUL for the final one) after it */ 932 const Size_t size = (e - s) * (2 + 1) * ((is_utf8) ? UVSIZE : 1); 933 Newxz(ret, size, char); 934 SAVEFREEPV(ret); 935 936 while (t < e) { 937 UV cp = (is_utf8) 938 ? utf8_to_uvchr_buf((U8 *) t, e, NULL) 939 : * (U8 *) t; 940 if (isPRINT(cp)) { 941 if (! prev_was_printable) { 942 my_strlcat(ret, " ", size); 943 } 944 945 /* Escape these to avoid any ambiguity */ 946 if (cp == ' ' || cp == '\\') { 947 my_strlcat(ret, "\\", size); 948 } 949 my_strlcat(ret, Perl_form(aTHX_ "%c", (U8) cp), size); 950 prev_was_printable = TRUE; 951 } 952 else { 953 if (! first_time) { 954 my_strlcat(ret, " ", size); 955 } 956 my_strlcat(ret, Perl_form(aTHX_ "%02" UVXf, cp), size); 957 prev_was_printable = FALSE; 958 } 959 t += (is_utf8) ? UTF8SKIP(t) : 1; 960 first_time = FALSE; 961 } 962 963 return ret; 964 } 965 966 #endif 967 #ifdef USE_LOCALE 968 969 # define get_category_index(cat) get_category_index_helper(cat, NULL, __LINE__) 970 971 STATIC locale_category_index 972 S_get_category_index_helper(pTHX_ const int category, bool * succeeded, 973 const line_t caller_line) 974 { 975 PERL_ARGS_ASSERT_GET_CATEGORY_INDEX_HELPER; 976 977 /* Given a category, return the equivalent internal index we generally use 978 * instead, warn or panic if not found. */ 979 980 locale_category_index i; 981 982 # undef PERL_LOCALE_TABLE_ENTRY 983 # define PERL_LOCALE_TABLE_ENTRY(name, call_back) \ 984 case LC_ ## name: i = LC_ ## name ## _INDEX_; break; 985 986 switch (category) { 987 988 # include "locale_table.h" 989 # ifdef LC_ALL 990 case LC_ALL: i = LC_ALL_INDEX_; break; 991 # endif 992 993 default: goto unknown_locale; 994 } 995 996 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 997 "index of category %d (%s) is %d;" 998 " called from %" LINE_Tf "\n", 999 category, category_names[i], i, caller_line)); 1000 1001 if (succeeded) { 1002 *succeeded = true; 1003 } 1004 1005 return i; 1006 1007 unknown_locale: 1008 1009 if (succeeded) { 1010 *succeeded = false; 1011 return LC_ALL_INDEX_; /* Arbitrary */ 1012 } 1013 1014 locale_panic_via_(Perl_form(aTHX_ "Unknown locale category %d", category), 1015 __FILE__, caller_line); 1016 NOT_REACHED; /* NOTREACHED */ 1017 } 1018 1019 #endif /* ifdef USE_LOCALE */ 1020 1021 void 1022 Perl_force_locale_unlock(pTHX) 1023 { 1024 /* Remove any locale mutex, in preperation for an inglorious termination, 1025 * typically a panic */ 1026 1027 #if defined(USE_LOCALE_THREADS) 1028 1029 /* If recursively locked, clear all at once */ 1030 if (PL_locale_mutex_depth > 1) { 1031 PL_locale_mutex_depth = 1; 1032 } 1033 1034 if (PL_locale_mutex_depth > 0) { 1035 LOCALE_UNLOCK_; 1036 } 1037 1038 #endif 1039 1040 } 1041 1042 #ifdef USE_POSIX_2008_LOCALE 1043 1044 STATIC locale_t 1045 S_use_curlocale_scratch(pTHX) 1046 { 1047 /* This function is used to hide from the caller the case where the current 1048 * locale_t object in POSIX 2008 is the global one, which is illegal in 1049 * many of the P2008 API calls. This checks for that and, if necessary 1050 * creates a proper P2008 object. Any prior object is deleted, as is any 1051 * remaining object during global destruction. */ 1052 1053 locale_t cur = uselocale((locale_t) 0); 1054 1055 if (cur != LC_GLOBAL_LOCALE) { 1056 return cur; 1057 } 1058 1059 if (PL_scratch_locale_obj) { 1060 freelocale(PL_scratch_locale_obj); 1061 } 1062 1063 PL_scratch_locale_obj = duplocale(LC_GLOBAL_LOCALE); 1064 return PL_scratch_locale_obj; 1065 } 1066 1067 #endif 1068 1069 void 1070 Perl_locale_panic(const char * msg, 1071 const line_t immediate_caller_line, 1072 const char * const higher_caller_file, 1073 const line_t higher_caller_line) 1074 { 1075 PERL_ARGS_ASSERT_LOCALE_PANIC; 1076 dTHX; 1077 dSAVE_ERRNO; 1078 1079 force_locale_unlock(); 1080 1081 #ifdef USE_C_BACKTRACE 1082 dump_c_backtrace(Perl_debug_log, 20, 1); 1083 #endif 1084 1085 const char * called_by = ""; 1086 if ( strNE(__FILE__, higher_caller_file) 1087 || immediate_caller_line != higher_caller_line) 1088 { 1089 called_by = Perl_form(aTHX_ "\nCalled by %s: %" LINE_Tf "\n", 1090 higher_caller_file, higher_caller_line); 1091 } 1092 1093 RESTORE_ERRNO; 1094 1095 const char * errno_text; 1096 1097 #ifdef HAS_EXTENDED_OS_ERRNO 1098 1099 const int extended_errnum = get_extended_os_errno(); 1100 if (errno != extended_errnum) { 1101 errno_text = Perl_form(aTHX_ "; errno=%d, $^E=%d", 1102 errno, extended_errnum); 1103 } 1104 else 1105 1106 #endif 1107 1108 { 1109 errno_text = Perl_form(aTHX_ "; errno=%d", errno); 1110 } 1111 1112 /* diag_listed_as: panic: %s */ 1113 Perl_croak(aTHX_ "%s: %" LINE_Tf ": panic: %s%s%s\n", 1114 __FILE__, immediate_caller_line, 1115 msg, errno_text, called_by); 1116 } 1117 1118 /* Macros to report and croak on an unexpected failure to set the locale. The 1119 * via version has more stack trace information */ 1120 #define setlocale_failure_panic_i(i, cur, fail, line, higher_line) \ 1121 setlocale_failure_panic_via_i(i, cur, fail, __LINE__, line, \ 1122 __FILE__, higher_line) 1123 1124 #define setlocale_failure_panic_c(cat, cur, fail, line, higher_line) \ 1125 setlocale_failure_panic_i(cat##_INDEX_, cur, fail, line, higher_line) 1126 1127 #if defined(USE_LOCALE) 1128 1129 /* Expands to the code to 1130 * result = savepvn(s, len) 1131 * if the category whose internal index is 'i' doesn't need to be kept in the C 1132 * locale on this system, or if 'action is 'no_override'. Otherwise it expands 1133 * to 1134 * result = savepv("C") 1135 * unless 'action' isn't 'check_that_overridden', in which case if the string 1136 * 's' isn't already "C" it panics */ 1137 # ifndef HAS_IGNORED_LOCALE_CATEGORIES_ 1138 # define OVERRIDE_AND_SAVEPV(s, len, result, i, action) \ 1139 result = savepvn(s, len) 1140 # else 1141 # define OVERRIDE_AND_SAVEPV(s, len, result, i, action) \ 1142 STMT_START { \ 1143 if (LIKELY( ! need_to_override_category(i) \ 1144 || action == no_override)) { \ 1145 result = savepvn(s, len); \ 1146 } \ 1147 else { \ 1148 const char * temp = savepvn(s, len); \ 1149 result = savepv(override_ignored_category(i, temp)); \ 1150 if (action == check_that_overridden && strNE(result, temp)) { \ 1151 locale_panic_(Perl_form(aTHX_ \ 1152 "%s expected to be '%s', instead is '%s'", \ 1153 category_names[i], result, temp)); \ 1154 } \ 1155 Safefree(temp); \ 1156 } \ 1157 } STMT_END 1158 # endif 1159 1160 STATIC parse_LC_ALL_string_return 1161 S_parse_LC_ALL_string(pTHX_ const char * string, 1162 const char ** output, 1163 const parse_LC_ALL_STRING_action override, 1164 bool always_use_full_array, 1165 const bool panic_on_error, 1166 const line_t caller_line) 1167 { 1168 /* This function parses the value of the input 'string' which is expected 1169 * to be the representation of an LC_ALL locale, and splits the result into 1170 * the values for the individual component categories, returning those in 1171 * the 'output' array. Each array value will be a savepv() copy that is 1172 * the responsibility of the caller to make sure gets freed 1173 * 1174 * The locale for each category is independent of the other categories. 1175 * Often, they are all the same, but certainly not always. Perl, in fact, 1176 * usually keeps LC_NUMERIC in the C locale, regardless of the underlying 1177 * locale. LC_ALL has to be able to represent the case of when not all 1178 * categories have the same locale. Platforms have differing ways of 1179 * representing this. Internally, this file uses the 'name=value;' 1180 * representation found on some platforms, so this function always looks 1181 * for and parses that. Other platforms use a positional notation. On 1182 * those platforms, this function also parses that form. It examines the 1183 * input to see which form is being parsed. 1184 * 1185 * Often, all categories will have the same locale. This is special cased 1186 * if 'always_use_full_array' is false on input: 1187 * 1) If the input 'string' is a single value, this function doesn't 1188 * store anything into 'output', and returns 'no_array' 1189 * 2) Some platforms will return multiple occurrences of the same 1190 * value rather than coalescing them down to a single one. HP-UX 1191 * is such a one. This function will do that collapsing for you, 1192 * returning 'only_element_0' and saving the single value in 1193 * output[0], which the caller will need to arrange to be freed. 1194 * The rest of output[] is undefined, and does not need to be 1195 * freed. 1196 * 1197 * Otherwise, the input 'string' may not be valid. This function looks 1198 * mainly for syntactic errors, and if found, returns 'invalid'. 'output' 1199 * will not be filled in in that case, but the input state of it isn't 1200 * necessarily preserved. Turning on -DL debugging will give details as to 1201 * the error. If 'panic_on_error' is 'true', the function panics instead 1202 * of returning on error, with a message giving the details. 1203 * 1204 * Otherwise, output[] will be filled with the individual locale names for 1205 * all categories on the system, 'full_array' will be returned, and the 1206 * caller needs to arrange for each to be freed. This means that either at 1207 * least one category differed from the others, or 'always_use_full_array' was 1208 * true on input. 1209 * 1210 * perl may be configured to ignore changes to a category's locale to 1211 * non-C. The parameter 'override' tells this function what to do when 1212 * encountering such an illegal combination: 1213 * 1214 * no_override indicates to take no special action 1215 * override_if_ignored, indicates to return 'C' instead of what the 1216 * input string actually says. 1217 * check_that_overridden indicates to panic if the string says the 1218 * category is not 'C'. This is used when 1219 * non-C is very unexpected behavior. 1220 * */ 1221 1222 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 1223 "Entering parse_LC_ALL_string; called from %" \ 1224 LINE_Tf "\nnew='%s'\n", caller_line, string)); 1225 1226 # ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS 1227 1228 const char separator[] = ";"; 1229 const Size_t separator_len = 1; 1230 const bool single_component = (strchr(string, ';') == NULL); 1231 1232 # else 1233 1234 /* It's possible (but quite unlikely) that the separator string is an '=' 1235 * or a ';'. Requiring both to be present for using the 'name=value;' form 1236 * properly handles those possibilities */ 1237 const bool name_value = strchr(string, '=') && strchr(string, ';'); 1238 const char * separator; 1239 Size_t separator_len; 1240 bool single_component; 1241 if (name_value) { 1242 separator = ";"; 1243 separator_len = 1; 1244 single_component = false; /* Since has both [;=], must be multi */ 1245 } 1246 else { 1247 separator = PERL_LC_ALL_SEPARATOR; 1248 separator_len = STRLENs(PERL_LC_ALL_SEPARATOR); 1249 single_component = instr(string, separator) == NULL; 1250 } 1251 1252 Size_t component_number = 0; /* Position in the parsing loop below */ 1253 1254 # endif 1255 # ifndef HAS_IGNORED_LOCALE_CATEGORIES_ 1256 PERL_UNUSED_ARG(override); 1257 # else 1258 1259 /* Any ignored categories are to be set to "C", so if this single-component 1260 * LC_ALL isn't to C, it has both "C" and non-C, so isn't really a single 1261 * component. All the non-ignored categories are set to the input 1262 * component, but the ignored ones are overridden to be C. 1263 * 1264 * This incidentally handles the case where the string is "". The return 1265 * will be C for each ignored category and "" for the others. Then the 1266 * caller can individually set each category, and get the right answer. */ 1267 if (single_component && ! isNAME_C_OR_POSIX(string)) { 1268 for_all_individual_category_indexes(i) { 1269 OVERRIDE_AND_SAVEPV(string, strlen(string), output[i], i, override); 1270 } 1271 1272 return full_array; 1273 } 1274 1275 # endif 1276 1277 if (single_component) { 1278 if (! always_use_full_array) { 1279 return no_array; 1280 } 1281 1282 for_all_individual_category_indexes(i) { 1283 output[i] = savepv(string); 1284 } 1285 1286 return full_array; 1287 } 1288 1289 /* Here the input is multiple components. Parse through them. (It is 1290 * possible that these components are all the same, so we check, and if so, 1291 * return just the 0th component (unless 'always_use_full_array' is true) 1292 * 1293 * This enum notes the possible errors findable in parsing */ 1294 enum { 1295 incomplete, 1296 no_equals, 1297 unknown_category, 1298 contains_LC_ALL_element 1299 } error; 1300 1301 /* Keep track of the categories we have encountered so far */ 1302 bool seen[LC_ALL_INDEX_] = { false }; 1303 1304 Size_t index; /* Our internal index for the current category */ 1305 const char * s = string; 1306 const char * e = s + strlen(string); 1307 const char * category_end = NULL; 1308 const char * saved_first = NULL; 1309 1310 /* Parse the input locale string */ 1311 while (s < e) { 1312 1313 /* 'separator' has been set up to delimit the components */ 1314 const char * next_sep = instr(s, separator); 1315 if (! next_sep) { /* At the end of the input */ 1316 next_sep = e; 1317 } 1318 1319 # ifndef PERL_LC_ALL_USES_NAME_VALUE_PAIRS 1320 1321 if (! name_value) { 1322 /* Get the index of the category in this position */ 1323 index = map_LC_ALL_position_to_index[component_number++]; 1324 } 1325 else 1326 1327 # endif 1328 1329 { /* Get the category part when each component is the 1330 * 'category=locale' form */ 1331 1332 category_end = strchr(s, '='); 1333 1334 /* The '=' terminates the category name. If no '=', is improper 1335 * form */ 1336 if (! category_end) { 1337 error = no_equals; 1338 goto failure; 1339 } 1340 1341 /* Find our internal index of the category name; uses a linear 1342 * search. (XXX This could be avoided by various means, but the 1343 * maximum likely search is 6 items, and khw doesn't think the 1344 * added complexity would save very much at all.) */ 1345 const unsigned int name_len = (unsigned int) (category_end - s); 1346 for (index = 0; index < C_ARRAY_LENGTH(category_names); index++) { 1347 if ( name_len == category_name_lengths[index] 1348 && memEQ(s, category_names[index], name_len)) 1349 { 1350 goto found_category; 1351 } 1352 } 1353 1354 /* Here, the category is not in our list. */ 1355 error = unknown_category; 1356 goto failure; 1357 1358 found_category: /* The system knows about this category. */ 1359 1360 if (index == LC_ALL_INDEX_) { 1361 error = contains_LC_ALL_element; 1362 goto failure; 1363 } 1364 1365 /* The locale name starts just beyond the '=' */ 1366 s = category_end + 1; 1367 1368 /* Linux (and maybe others) doesn't treat a duplicate category in 1369 * the string as an error. Instead it uses the final occurrence as 1370 * the intended value. So if this is a duplicate, free the former 1371 * value before setting the new one */ 1372 if (seen[index]) { 1373 Safefree(output[index]); 1374 } 1375 else { 1376 seen[index] = true; 1377 } 1378 } 1379 1380 /* Here, 'index' contains our internal index number for the current 1381 * category, and 's' points to the beginning of the locale name for 1382 * that category. */ 1383 OVERRIDE_AND_SAVEPV(s, next_sep - s, output[index], index, override); 1384 1385 if (! always_use_full_array) { 1386 if (! saved_first) { 1387 saved_first = output[index]; 1388 } 1389 else { 1390 if (strNE(saved_first, output[index])) { 1391 always_use_full_array = true; 1392 } 1393 } 1394 } 1395 1396 /* Next time start from the new position */ 1397 s = next_sep + separator_len; 1398 } 1399 1400 /* Finished looping through all the categories 1401 * 1402 * Check if the input was incomplete. */ 1403 1404 # ifndef PERL_LC_ALL_USES_NAME_VALUE_PAIRS 1405 1406 if (! name_value) { /* Positional notation */ 1407 if (component_number != LC_ALL_INDEX_) { 1408 error = incomplete; 1409 goto failure; 1410 } 1411 } 1412 else 1413 1414 # endif 1415 1416 { /* Here is the name=value notation */ 1417 for_all_individual_category_indexes(i) { 1418 if (! seen[i]) { 1419 error = incomplete; 1420 goto failure; 1421 } 1422 } 1423 } 1424 1425 /* In the loop above, we changed 'always_use_full_array' to true iff not all 1426 * categories have the same locale. Hence, if it is still 'false', all of 1427 * them are the same. */ 1428 if (always_use_full_array) { 1429 return full_array; 1430 } 1431 1432 /* Free the dangling ones */ 1433 for_all_but_0th_individual_category_indexes(i) { 1434 Safefree(output[i]); 1435 output[i] = NULL; 1436 } 1437 1438 return only_element_0; 1439 1440 failure: 1441 1442 /* Don't leave memory dangling that we allocated before the failure */ 1443 for_all_individual_category_indexes(i) { 1444 if (seen[i]) { 1445 Safefree(output[i]); 1446 output[i] = NULL; 1447 } 1448 } 1449 1450 const char * msg; 1451 const char * display_start = s; 1452 const char * display_end = e; 1453 1454 switch (error) { 1455 case incomplete: 1456 msg = "doesn't list every locale category"; 1457 display_start = string; 1458 break; 1459 case no_equals: 1460 msg = "needs an '=' to split name=value"; 1461 break; 1462 case unknown_category: 1463 msg = "is an unknown category"; 1464 display_end = (category_end && category_end > display_start) 1465 ? category_end 1466 : e; 1467 break; 1468 case contains_LC_ALL_element: 1469 msg = "has LC_ALL, which is illegal here"; 1470 break; 1471 } 1472 1473 msg = Perl_form(aTHX_ "'%.*s' %s\n", 1474 (int) (display_end - display_start), 1475 display_start, msg); 1476 1477 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s", msg)); 1478 1479 if (panic_on_error) { 1480 locale_panic_via_(msg, __FILE__, caller_line); 1481 } 1482 1483 return invalid; 1484 } 1485 1486 # undef OVERRIDE_AND_SAVEPV 1487 #endif 1488 1489 /*========================================================================== 1490 * Here starts the code that gives a uniform interface to its callers, hiding 1491 * the differences between platforms. 1492 * 1493 * base_posix_setlocale_() presents a consistent POSIX-compliant interface to 1494 * setlocale(). Windows requres a customized base-level setlocale(). This 1495 * layer should only be used by the next level up: the plain posix_setlocale 1496 * layer. Any necessary mutex locking needs to be done at a higher level. The 1497 * return may be overwritten by the next call to this function */ 1498 #ifdef WIN32 1499 # define base_posix_setlocale_(cat, locale) win32_setlocale(cat, locale) 1500 #else 1501 # define base_posix_setlocale_(cat, locale) \ 1502 ((const char *) setlocale(cat, locale)) 1503 #endif 1504 1505 /*========================================================================== 1506 * Here is the main posix layer. It is the same as the base one unless the 1507 * system is lacking LC_ALL, or there are categories that we ignore, but that 1508 * the system libc knows about */ 1509 1510 #if ! defined(USE_LOCALE) \ 1511 || (defined(LC_ALL) && ! defined(HAS_IGNORED_LOCALE_CATEGORIES_)) 1512 # define posix_setlocale(cat, locale) base_posix_setlocale_(cat, locale) 1513 #else 1514 # define posix_setlocale(cat, locale) \ 1515 S_posix_setlocale_with_complications(aTHX_ cat, locale, __LINE__) 1516 1517 STATIC const char * 1518 S_posix_setlocale_with_complications(pTHX_ const int cat, 1519 const char * new_locale, 1520 const line_t caller_line) 1521 { 1522 /* This implements the posix layer above the base posix layer. 1523 * It is needed to reconcile our internal records that reflect only a 1524 * proper subset of the categories known by the system. */ 1525 1526 /* Querying the current locale returns the real value */ 1527 if (new_locale == NULL) { 1528 new_locale = base_posix_setlocale_(cat, NULL); 1529 assert(new_locale); 1530 return new_locale; 1531 } 1532 1533 const char * locale_on_entry = NULL; 1534 1535 /* If setting from the environment, actually do the set to get the system's 1536 * idea of what that means; we may have to override later. */ 1537 if (strEQ(new_locale, "")) { 1538 locale_on_entry = base_posix_setlocale_(cat, NULL); 1539 assert(locale_on_entry); 1540 new_locale = base_posix_setlocale_(cat, ""); 1541 if (! new_locale) { 1542 SET_EINVAL; 1543 return NULL; 1544 } 1545 } 1546 1547 # ifdef LC_ALL 1548 1549 const char * new_locales[LC_ALL_INDEX_] = { NULL }; 1550 1551 if (cat == LC_ALL) { 1552 switch (parse_LC_ALL_string(new_locale, 1553 (const char **) &new_locales, 1554 override_if_ignored, /* Override any 1555 ignored 1556 categories */ 1557 false, /* Return only [0] if suffices */ 1558 false, /* Don't panic on error */ 1559 caller_line)) 1560 { 1561 case invalid: 1562 SET_EINVAL; 1563 return NULL; 1564 1565 case no_array: 1566 break; 1567 1568 case only_element_0: 1569 new_locale = new_locales[0]; 1570 SAVEFREEPV(new_locale); 1571 break; 1572 1573 case full_array: 1574 1575 /* Turn the array into a string that the libc setlocale() should 1576 * understand. (Another option would be to loop, setting the 1577 * individual locales, and then return base(cat, NULL) */ 1578 new_locale = calculate_LC_ALL_string(new_locales, 1579 EXTERNAL_FORMAT_FOR_SET, 1580 WANT_TEMP_PV, 1581 caller_line); 1582 1583 for_all_individual_category_indexes(i) { 1584 Safefree(new_locales[i]); 1585 } 1586 1587 /* And call the libc setlocale. We could avoid this call if 1588 * locale_on_entry is set and eq the new_locale. But that would be 1589 * only for the relatively rare case of the desired locale being 1590 * "", and the time spent in doing the string compare might be more 1591 * than that of just setting it unconditionally */ 1592 new_locale = base_posix_setlocale_(cat, new_locale); 1593 if (! new_locale) { 1594 goto failure; 1595 } 1596 1597 return new_locale; 1598 } 1599 } 1600 1601 # endif 1602 1603 /* Here, 'new_locale' is a single value, not an aggregation. Just set it. 1604 * */ 1605 new_locale = 1606 base_posix_setlocale_(cat, 1607 override_ignored_category( 1608 get_category_index(cat), new_locale)); 1609 if (! new_locale) { 1610 goto failure; 1611 } 1612 1613 return new_locale; 1614 1615 failure: 1616 1617 /* 'locale_on_entry' being set indicates there has likely been a change in 1618 * locale which needs to be restored */ 1619 if (locale_on_entry) { 1620 if (! base_posix_setlocale_(cat, locale_on_entry)) { 1621 setlocale_failure_panic_i(get_category_index(cat), 1622 NULL, locale_on_entry, 1623 __LINE__, caller_line); 1624 } 1625 } 1626 1627 SET_EINVAL; 1628 return NULL; 1629 } 1630 1631 #endif 1632 1633 /* End of posix layer 1634 *========================================================================== 1635 * 1636 * The next layer up is to catch vagaries and bugs in the libc setlocale return 1637 * value. The return is not guaranteed to be stable. 1638 * 1639 * Any necessary mutex locking needs to be done at a higher level. 1640 * 1641 * On most platforms this layer is empty, expanding to just the layer 1642 * below. To enable it, call Configure with either or both: 1643 * -Accflags=-DHAS_LF_IN_SETLOCALE_RETURN 1644 * to indicate that extraneous \n characters can be returned 1645 * by setlocale() 1646 * -Accflags=-DHAS_BROKEN_SETLOCALE_QUERY_LC_ALL 1647 * to indicate that setlocale(LC_ALL, NULL) cannot be relied 1648 * on 1649 */ 1650 1651 #define STDIZED_SETLOCALE_LOCK POSIX_SETLOCALE_LOCK 1652 #define STDIZED_SETLOCALE_UNLOCK POSIX_SETLOCALE_UNLOCK 1653 #if ! defined(USE_LOCALE) \ 1654 || ! ( defined(HAS_LF_IN_SETLOCALE_RETURN) \ 1655 || defined(HAS_BROKEN_SETLOCALE_QUERY_LC_ALL)) 1656 # define stdized_setlocale(cat, locale) posix_setlocale(cat, locale) 1657 # define stdize_locale(cat, locale) (locale) 1658 #else 1659 # define stdized_setlocale(cat, locale) \ 1660 S_stdize_locale(aTHX_ cat, posix_setlocale(cat, locale), __LINE__) 1661 1662 STATIC const char * 1663 S_stdize_locale(pTHX_ const int category, 1664 const char *input_locale, 1665 const line_t caller_line) 1666 { 1667 /* The return value of setlocale() is opaque, but is required to be usable 1668 * as input to a future setlocale() to create the same state. 1669 * Unfortunately not all systems are compliant. This function brings those 1670 * outliers into conformance. It is based on what problems have arisen in 1671 * the field. 1672 * 1673 * This has similar constraints as the posix layer. You need to lock 1674 * around it until its return is safely copied or no longer needed. (The 1675 * return may point to a global static buffer or may be mortalized.) 1676 * 1677 * The current things this corrects are: 1678 * 1) A new-line. This function chops any \n characters 1679 * 2) A broken 'setlocale(LC_ALL, foo)' This constructs a proper returned 1680 * string from the constituent categories 1681 * 1682 * If no changes were made, the input is returned as-is */ 1683 1684 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 1685 "Entering stdize_locale(%d, '%s');" 1686 " called from %" LINE_Tf "\n", 1687 category, input_locale, caller_line)); 1688 1689 if (input_locale == NULL) { 1690 SET_EINVAL; 1691 return NULL; 1692 } 1693 1694 char * retval = (char *) input_locale; 1695 1696 # if defined(LC_ALL) && defined(HAS_BROKEN_SETLOCALE_QUERY_LC_ALL) 1697 1698 /* If setlocale(LC_ALL, NULL) is broken, compute what the system 1699 * actually thinks it should be from its individual components */ 1700 if (category == LC_ALL) { 1701 retval = (char *) calculate_LC_ALL_string( 1702 NULL, /* query each individ locale */ 1703 EXTERNAL_FORMAT_FOR_SET, 1704 WANT_TEMP_PV, 1705 caller_line); 1706 } 1707 1708 # endif 1709 # ifdef HAS_NL_IN_SETLOCALE_RETURN 1710 1711 char * first_bad = NULL; 1712 1713 # ifndef LC_ALL 1714 1715 PERL_UNUSED_ARG(category); 1716 PERL_UNUSED_ARG(caller_line); 1717 1718 # define INPUT_LOCALE retval 1719 # define MARK_CHANGED 1720 # else 1721 1722 char * individ_locales[LC_ALL_INDEX_] = { NULL }; 1723 bool made_changes = false; 1724 Size_t upper; 1725 if (category != LC_ALL) { 1726 individ_locales[0] = retval; 1727 upper = 0; 1728 } 1729 else { 1730 1731 /* And parse the locale string, splitting into its individual 1732 * components. */ 1733 switch (parse_LC_ALL_string(retval, 1734 (const char **) &individ_locales, 1735 check_that_overridden, /* ignored 1736 categories should 1737 already have been 1738 overridden */ 1739 false, /* Return only [0] if suffices */ 1740 false, /* Don't panic on error */ 1741 caller_line)) 1742 { 1743 case invalid: 1744 SET_EINVAL; 1745 return NULL; 1746 1747 case full_array: /* Loop below through all the component categories. 1748 */ 1749 upper = LC_ALL_INDEX_ - 1; 1750 break; 1751 1752 case no_array: 1753 /* All categories here are set to the same locale, and the parse 1754 * didn't fill in any of 'individ_locales'. Set the 0th element to 1755 * that locale. */ 1756 individ_locales[0] = retval; 1757 /* FALLTHROUGH */ 1758 1759 case only_element_0: /* Element 0 is the only element we need to look 1760 at */ 1761 upper = 0; 1762 break; 1763 } 1764 } 1765 1766 for (unsigned int i = 0; i <= upper; i++) 1767 1768 # define INPUT_LOCALE individ_locales[i] 1769 # define MARK_CHANGED made_changes = true; 1770 # endif /* Has LC_ALL */ 1771 1772 { 1773 first_bad = (char *) strchr(INPUT_LOCALE, '\n'); 1774 1775 /* Most likely, there isn't a problem with the input */ 1776 if (UNLIKELY(first_bad)) { 1777 1778 /* This element will need to be adjusted. Create a modifiable 1779 * copy. */ 1780 MARK_CHANGED 1781 retval = savepv(INPUT_LOCALE); 1782 SAVEFREEPV(retval); 1783 1784 /* Translate the found position into terms of the copy */ 1785 first_bad = retval + (first_bad - INPUT_LOCALE); 1786 1787 /* Get rid of the \n and what follows. (Originally, only a 1788 * trailing \n was stripped. Unsure what to do if not trailing) */ 1789 *((char *) first_bad) = '\0'; 1790 } /* End of needs adjusting */ 1791 } /* End of looking for problems */ 1792 1793 # ifdef LC_ALL 1794 1795 /* If we had multiple elements, extra work is required */ 1796 if (upper != 0) { 1797 1798 /* If no changes were made to the input, 'retval' already contains it 1799 * */ 1800 if (made_changes) { 1801 1802 /* But if did make changes, need to calculate the new value */ 1803 retval = (char *) calculate_LC_ALL_string( 1804 (const char **) &individ_locales, 1805 EXTERNAL_FORMAT_FOR_SET, 1806 WANT_TEMP_PV, 1807 caller_line); 1808 } 1809 1810 /* And free the no-longer needed memory */ 1811 for (unsigned int i = 0; i <= upper; i++) { 1812 Safefree(individ_locales[i]); 1813 } 1814 } 1815 1816 # endif 1817 # undef INPUT_LOCALE 1818 # undef MARK_CHANGED 1819 # endif /* HAS_NL_IN_SETLOCALE_RETURN */ 1820 1821 return (const char *) retval; 1822 } 1823 1824 #endif /* USE_LOCALE */ 1825 1826 /* End of stdize_locale layer 1827 * 1828 * ========================================================================== 1829 * 1830 * The next many lines form several implementations of a layer above the 1831 * close-to-the-metal 'posix' and 'stdized' macros. They are used to present a 1832 * uniform API to the rest of the code in this file in spite of the disparate 1833 * underlying implementations. Which implementation gets compiled depends on 1834 * the platform capabilities (and some user choice) as determined by Configure. 1835 * 1836 * As more fully described in the introductory comments in this file, the 1837 * API of each implementation consists of three sets of macros. Each set has 1838 * three variants with suffixes '_c', '_r', and '_i'. In the list below '_X' 1839 * is to be replaced by any of these suffixes. 1840 * 1841 * 1) bool_setlocale_X attempts to set the given category's locale to the 1842 * given value, returning if it worked or not. 1843 * 2) void_setlocale_X is like the corresponding bool_setlocale, but used when 1844 * success is the only sane outcome, so failure causes it 1845 * to panic. 1846 * 3) querylocale_X to see what the given category's locale is 1847 * 1848 * 4) setlocale_i() is defined only in those implementations where the bool 1849 * and query forms are essentially the same, and can be 1850 * combined to save CPU time. 1851 * 1852 * Each implementation is fundamentally defined by just two macros: a 1853 * bool_setlocale_X() and a querylocale_X(). The other macros are all 1854 * derivable from these. Each fundamental macro is either a '_i' suffix one or 1855 * an '_r' suffix one, depending on what is the most efficient in getting to an 1856 * input form that the underlying libc functions want. The derived macro 1857 * definitions are deferred in this file to after the code for all the 1858 * implementations. This makes each implementation shorter and clearer, and 1859 * removes duplication. 1860 * 1861 * Each implementation below is separated by ==== lines, and includes bool, 1862 * void, and query macros. The query macros are first, followed by any 1863 * functions needed to implement them. Then come the bool, again followed by 1864 * any implementing functions Then are the void macros; next is setlocale_i if 1865 * present on this implementation. Finally are any helper functions. The sets 1866 * in each implementation are separated by ---- lines. 1867 * 1868 * The returned strings from all the querylocale...() forms in all 1869 * implementations are thread-safe, and the caller should not free them, 1870 * but each may be a mortalized copy. If you need something stable across 1871 * calls, you need to savepv() the result yourself. 1872 * 1873 *===========================================================================*/ 1874 1875 #if (! defined(USE_LOCALE_THREADS) && ! defined(USE_POSIX_2008_LOCALE)) \ 1876 || ( defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE)) 1877 1878 /* For non-threaded perls, the implementation just expands to the base-level 1879 * functions (except if we are Configured to nonetheless use the POSIX 2008 1880 * interface) This implementation is also used on threaded perls where 1881 * threading is invisible to us. Currently this is only on later Windows 1882 * versions. */ 1883 1884 # define querylocale_r(cat) mortalized_pv_copy(stdized_setlocale(cat, NULL)) 1885 # define bool_setlocale_r(cat, locale) cBOOL(posix_setlocale(cat, locale)) 1886 1887 /*---------------------------------------------------------------------------*/ 1888 1889 /* setlocale_i is only defined for Configurations where the libc setlocale() 1890 * doesn't need any tweaking. It allows for some shortcuts */ 1891 # ifndef USE_LOCALE_THREADS 1892 # define setlocale_i(i, locale) stdized_setlocale(categories[i], locale) 1893 1894 # elif defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE) 1895 1896 /* On Windows, we don't know at compile time if we are in thread-safe mode or 1897 * not. If we are, we can just return the result of the layer below us. If we 1898 * are in unsafe mode, we need to first copy that result to a safe place while 1899 * in a critical section */ 1900 1901 # define setlocale_i(i, locale) S_setlocale_i(aTHX_ categories[i], locale) 1902 1903 STATIC const char * 1904 S_setlocale_i(pTHX_ const int category, const char * locale) 1905 { 1906 if (LIKELY(_configthreadlocale(0) == _ENABLE_PER_THREAD_LOCALE)) { 1907 return stdized_setlocale(category, locale); 1908 } 1909 1910 gwLOCALE_LOCK; 1911 const char * retval = save_to_buffer(stdized_setlocale(category, locale), 1912 &PL_setlocale_buf, 1913 &PL_setlocale_bufsize); 1914 gwLOCALE_UNLOCK; 1915 1916 return retval; 1917 } 1918 1919 # endif 1920 1921 /*===========================================================================*/ 1922 #elif defined(USE_LOCALE_THREADS) \ 1923 && ! defined(USE_THREAD_SAFE_LOCALE) 1924 1925 /* Here, there are threads, and there is no support for thread-safe 1926 * operation. This is a dangerous situation, which perl is documented as 1927 * not supporting, but it arises in practice. We can do a modicum of 1928 * automatic mitigation by making sure there is a per-thread return from 1929 * setlocale(), and that a mutex protects it from races */ 1930 1931 # define querylocale_r(cat) \ 1932 mortalized_pv_copy(less_dicey_setlocale_r(cat, NULL)) 1933 1934 STATIC const char * 1935 S_less_dicey_setlocale_r(pTHX_ const int category, const char * locale) 1936 { 1937 const char * retval; 1938 1939 PERL_ARGS_ASSERT_LESS_DICEY_SETLOCALE_R; 1940 1941 STDIZED_SETLOCALE_LOCK; 1942 1943 retval = save_to_buffer(stdized_setlocale(category, locale), 1944 &PL_less_dicey_locale_buf, 1945 &PL_less_dicey_locale_bufsize); 1946 1947 STDIZED_SETLOCALE_UNLOCK; 1948 1949 return retval; 1950 } 1951 1952 /*---------------------------------------------------------------------------*/ 1953 1954 # define bool_setlocale_r(cat, locale) \ 1955 less_dicey_bool_setlocale_r(cat, locale) 1956 1957 STATIC bool 1958 S_less_dicey_bool_setlocale_r(pTHX_ const int cat, const char * locale) 1959 { 1960 bool retval; 1961 1962 PERL_ARGS_ASSERT_LESS_DICEY_BOOL_SETLOCALE_R; 1963 1964 /* Unlikely, but potentially possible that another thread could zap the 1965 * buffer from true to false or vice-versa, so need to lock here */ 1966 POSIX_SETLOCALE_LOCK; 1967 retval = cBOOL(posix_setlocale(cat, locale)); 1968 POSIX_SETLOCALE_UNLOCK; 1969 1970 return retval; 1971 } 1972 1973 /*---------------------------------------------------------------------------*/ 1974 1975 /* setlocale_i is only defined for Configurations where the libc setlocale() 1976 * suffices for both querying and setting the locale. It allows for some 1977 * shortcuts */ 1978 # define setlocale_i(i, locale) less_dicey_setlocale_r(categories[i], locale) 1979 1980 /* The code in this file may change the locale briefly during certain 1981 * operations. This should be a critical section when that could interfere 1982 * with other instances executing at the same time. */ 1983 # define TOGGLE_LOCK(i) POSIX_SETLOCALE_LOCK 1984 # define TOGGLE_UNLOCK(i) POSIX_SETLOCALE_UNLOCK 1985 1986 /*===========================================================================*/ 1987 1988 #elif defined(USE_POSIX_2008_LOCALE) 1989 # ifndef LC_ALL 1990 # error This code assumes that LC_ALL is available on a system modern enough to have POSIX 2008 1991 # endif 1992 1993 /* Here, there is a completely different API to get thread-safe locales. We 1994 * emulate the setlocale() API with our own function(s). setlocale categories, 1995 * like LC_NUMERIC, are not valid here for the POSIX 2008 API. Instead, there 1996 * are equivalents, like LC_NUMERIC_MASK, which we use instead, which we find 1997 * by table lookup. */ 1998 1999 # if defined(__GLIBC__) && defined(USE_LOCALE_MESSAGES) 2000 /* https://sourceware.org/bugzilla/show_bug.cgi?id=24936 */ 2001 # define HAS_GLIBC_LC_MESSAGES_BUG 2002 # include <libintl.h> 2003 # endif 2004 2005 # define querylocale_i(i) querylocale_2008_i(i, __LINE__) 2006 2007 /* We need to define this derivative macro here, as it is needed in 2008 * the implementing function (for recursive calls). It also gets defined 2009 * where all the other derivative macros are defined, and the compiler 2010 * will complain if the definition gets out of sync */ 2011 # define querylocale_c(cat) querylocale_i(cat##_INDEX_) 2012 2013 STATIC const char * 2014 S_querylocale_2008_i(pTHX_ const locale_category_index index, 2015 const line_t caller_line) 2016 { 2017 PERL_ARGS_ASSERT_QUERYLOCALE_2008_I; 2018 2019 /* This function returns the name of the locale category given by the input 2020 * 'index' into our parallel tables of them. 2021 * 2022 * POSIX 2008, for some sick reason, chose not to provide a method to find 2023 * the category name of a locale, disregarding a basic linguistic tenet 2024 * that for any object, people will create a name for it. (The next 2025 * version of the POSIX standard is proposed to fix this.) Some vendors 2026 * have created a querylocale() function to do this in the meantime. On 2027 * systems without querylocale(), we have to keep track of what the locale 2028 * has been set to, so that we can return its name so as to emulate 2029 * setlocale(). There are potential problems with this: 2030 * 2031 * 1) We don't know what calling newlocale() with the locale argument "" 2032 * actually does. It gets its values from the program's environment. 2033 * find_locale_from_environment() is used to work around this. But it 2034 * isn't fool-proof. See the comments for that function for details. 2035 * 2) It's possible for C code in some library to change the locale 2036 * without us knowing it, and thus our records become wrong; 2037 * querylocale() would catch this. But as of September 2017, there 2038 * are no occurrences in CPAN of uselocale(). Some libraries do use 2039 * setlocale(), but that changes the global locale, and threads using 2040 * per-thread locales will just ignore those changes. 2041 * 3) Many systems have multiple names for the same locale. Generally, 2042 * there is an underlying base name, with aliases that evaluate to it. 2043 * On some systems, if you set the locale to an alias, and then 2044 * retrieve the name, you get the alias as expected; but on others you 2045 * get the base name, not the alias you used. And sometimes the 2046 * charade is incomplete. See 2047 * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=269375. 2048 * 2049 * The code is structured so that the returned locale name when the 2050 * locale is changed is whatever the result of querylocale() on the 2051 * new locale is. This effectively gives the result the system 2052 * expects. Without querylocale, the name returned is always the 2053 * input name. Theoretically this could cause problems, but khw knows 2054 * of none so far, but mentions it here in case you are trying to 2055 * debug something. (This could be worked around by messing with the 2056 * global locale temporarily, using setlocale() to get the base name; 2057 * but that could cause a race. The comments for 2058 * find_locale_from_environment() give details on the potential race.) 2059 */ 2060 2061 const locale_t cur_obj = uselocale((locale_t) 0); 2062 const char * retval; 2063 2064 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "querylocale_2008_i(%s) on %p;" 2065 " called from %" LINE_Tf "\n", 2066 category_names[index], cur_obj, 2067 caller_line)); 2068 2069 if (UNLIKELY(cur_obj == LC_GLOBAL_LOCALE)) { 2070 2071 /* Even on platforms that have querylocale(), it is unclear if they 2072 * work in the global locale, and we have the means to get the correct 2073 * answer anyway. khw is unsure this situation even comes up these 2074 * days, hence the branch prediction */ 2075 POSIX_SETLOCALE_LOCK; 2076 retval = mortalized_pv_copy(posix_setlocale(categories[index], NULL)); 2077 POSIX_SETLOCALE_UNLOCK; 2078 } 2079 2080 /* Here we have handled the case of the current locale being the global 2081 * one. Below is the 'else' case of that. There are two different 2082 * implementations, depending on USE_PL_CURLOCALES */ 2083 2084 # ifdef USE_PL_CURLOCALES 2085 2086 else { 2087 2088 /* PL_curlocales[] is kept up-to-date for all categories except LC_ALL, 2089 * which may have been invalidated by setting it to NULL, and if so, 2090 * should now be calculated. (The called function updates that 2091 * element.) */ 2092 if (index == LC_ALL_INDEX_ && PL_curlocales[LC_ALL_INDEX_] == NULL) { 2093 calculate_LC_ALL_string((const char **) &PL_curlocales, 2094 INTERNAL_FORMAT, 2095 WANT_VOID, 2096 caller_line); 2097 } 2098 2099 if (cur_obj == PL_C_locale_obj) { 2100 2101 /* If the current locale object is the C object, then the answer is 2102 * "C" or POSIX, regardless of the category. Handling this 2103 * reasonably likely case specially shortcuts extra effort, and 2104 * hides some bugs from us in OS's that alias other locales to C, 2105 * but do so incompletely. If our records say it is POSIX, use 2106 * that; otherwise use C. See 2107 * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=269375 */ 2108 retval = (strEQ(PL_curlocales[index], "POSIX")) 2109 ? "POSIX" 2110 : "C"; 2111 } 2112 else { 2113 retval = mortalized_pv_copy(PL_curlocales[index]); 2114 } 2115 } 2116 2117 # else 2118 2119 /* Below is the implementation of the 'else' clause which handles the case 2120 * of the current locale not being the global one on platforms where 2121 * USE_PL_CURLOCALES is NOT in effect. That means the system must have 2122 * some form of querylocale. But these have varying characteristics, so 2123 * first create some #defines to make the actual 'else' clause uniform. 2124 * 2125 * First, glibc has a function that implements querylocale(), but is called 2126 * something else, and takes the category number; the others take the mask. 2127 * */ 2128 # if defined(USE_QUERYLOCALE) && ( defined(_NL_LOCALE_NAME) \ 2129 && defined(HAS_NL_LANGINFO_L)) 2130 # define my_querylocale(index, cur_obj) \ 2131 nl_langinfo_l(_NL_LOCALE_NAME(categories[index]), cur_obj) 2132 2133 /* Experience so far shows it is thread-safe, as well as glibc's 2134 * nl_langinfo_l(), so unless overridden, mark it so */ 2135 # ifdef NO_THREAD_SAFE_QUERYLOCALE 2136 # undef HAS_THREAD_SAFE_QUERYLOCALE 2137 # else 2138 # define HAS_THREAD_SAFE_QUERYLOCALE 2139 # endif 2140 # else /* below, ! glibc */ 2141 2142 /* Otherwise, use the system's querylocale(). */ 2143 # define my_querylocale(index, cur_obj) \ 2144 querylocale(category_masks[index], cur_obj) 2145 2146 /* There is no standard for this function, and khw has never seen 2147 * anything beyond minimal vendor documentation, lacking important 2148 * details. Experience has shown that some implementations have race 2149 * condiions, and their returns may not be thread safe. It would be 2150 * unreliable to test for complete thread safety in Configure. What we 2151 * do instead is to assume that it is thread-safe, unless overriden by, 2152 * say, a hints file specifying 2153 * -Accflags='-DNO_THREAD_SAFE_QUERYLOCALE */ 2154 # ifdef NO_THREAD_SAFE_QUERYLOCALE 2155 # undef HAS_THREAD_SAFE_QUERYLOCALE 2156 # else 2157 # define HAS_THREAD_SAFE_QUERYLOCALE 2158 # endif 2159 # endif 2160 2161 /* Here, we have set up enough information to know if this querylocale() 2162 * is thread-safe, or needs to use a mutex */ 2163 # ifdef HAS_THREAD_SAFE_QUERYLOCALE 2164 # define QUERYLOCALE_LOCK 2165 # define QUERYLOCALE_UNLOCK 2166 # else 2167 # define QUERYLOCALE_LOCK gwLOCALE_LOCK 2168 # define QUERYLOCALE_UNLOCK gwLOCALE_UNLOCK 2169 # endif 2170 2171 /* Finally, everything is ready, so here is the 'else' clause to implement 2172 * the case of the current locale not being the global one on systems that 2173 * have some form of querylocale(). (POSIX will presumably eventually 2174 * publish their next version in their pipeline, which will define a 2175 * precisely specified querylocale equivalent, and there can be a new 2176 * #ifdef to use it without having to guess at its characteristics) */ 2177 2178 else { 2179 /* We don't keep records when there is querylocale(), so as to avoid the 2180 * pitfalls mentioned at the beginning of this function. 2181 * 2182 * That means LC_ALL has to be calculated from all its constituent 2183 * categories each time, since the querylocale() forms on many (if not 2184 * all) platforms only work on individual categories */ 2185 if (index == LC_ALL_INDEX_) { 2186 retval = calculate_LC_ALL_string(NULL, INTERNAL_FORMAT, 2187 WANT_TEMP_PV, 2188 caller_line); 2189 } 2190 else { 2191 2192 QUERYLOCALE_LOCK; 2193 retval = my_querylocale(index, cur_obj); 2194 2195 /* querylocale() may conflate the C locale with something that 2196 * isn't exactly the same. See for example 2197 * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=269375 2198 * We know that if the locale object is the C one, we 2199 * are in the C locale, which may go by the name POSIX, as both, by 2200 * definition, are equivalent. But we consider any other name 2201 * spurious, so override with "C". As in the PL_CURLOCALES case 2202 * above, this hides those glitches, for the most part, from the 2203 * rest of our code. (The code is ordered this way so that if the 2204 * system distinugishes "C" from "POSIX", we do too.) */ 2205 if (cur_obj == PL_C_locale_obj && ! isNAME_C_OR_POSIX(retval)) { 2206 QUERYLOCALE_UNLOCK; 2207 retval = "C"; 2208 } 2209 else { 2210 retval = savepv(retval); 2211 QUERYLOCALE_UNLOCK; 2212 SAVEFREEPV(retval); 2213 } 2214 } 2215 } 2216 2217 # undef QUERYLOCALE_LOCK 2218 # undef QUERYLOCALE_UNLOCK 2219 # endif 2220 2221 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 2222 "querylocale_2008_i(%s) returning '%s'\n", 2223 category_names[index], retval)); 2224 assert(strNE(retval, "")); 2225 return retval; 2226 } 2227 2228 /*---------------------------------------------------------------------------*/ 2229 2230 # define bool_setlocale_i(i, locale) \ 2231 bool_setlocale_2008_i(i, locale, __LINE__) 2232 2233 /* If this doesn't exist on this platform, make it a no-op (to save #ifdefs) */ 2234 # ifndef update_PL_curlocales_i 2235 # define update_PL_curlocales_i(index, new_locale, caller_line) 2236 # endif 2237 2238 STATIC bool 2239 S_bool_setlocale_2008_i(pTHX_ 2240 2241 /* Our internal index of the 'category' setlocale is called with */ 2242 const locale_category_index index, 2243 const char * new_locale, /* The locale to set the category to */ 2244 const line_t caller_line /* Called from this line number */ 2245 ) 2246 { 2247 PERL_ARGS_ASSERT_BOOL_SETLOCALE_2008_I; 2248 2249 /* This function effectively performs a setlocale() on just the current 2250 * thread; thus it is thread-safe. It does this by using the POSIX 2008 2251 * locale functions to emulate the behavior of setlocale(). Similar to 2252 * regular setlocale(), the return from this function points to memory that 2253 * can be overwritten by other system calls, so needs to be copied 2254 * immediately if you need to retain it. The difference here is that 2255 * system calls besides another setlocale() can overwrite it. 2256 * 2257 * By doing this, most locale-sensitive functions become thread-safe. The 2258 * exceptions are mostly those that return a pointer to static memory. 2259 */ 2260 2261 int mask = category_masks[index]; 2262 const locale_t entry_obj = uselocale((locale_t) 0); 2263 const char * locale_on_entry = querylocale_i(index); 2264 2265 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 2266 "bool_setlocale_2008_i: input=%d (%s), mask=0x%x," 2267 " new locale=\"%s\", current locale=\"%s\"," 2268 " index=%d, entry object=%p;" 2269 " called from %" LINE_Tf "\n", 2270 categories[index], category_names[index], mask, 2271 ((new_locale == NULL) ? "(nil)" : new_locale), 2272 locale_on_entry, index, entry_obj, caller_line)); 2273 2274 /* Here, trying to change the locale, but it is a no-op if the new boss is 2275 * the same as the old boss. Except this routine is called when converting 2276 * from the global locale, so in that case we will create a per-thread 2277 * locale below (with the current values). It also seemed that newlocale() 2278 * could free up the basis locale memory if we called it with the new and 2279 * old being the same, but khw now thinks that this was due to some other 2280 * bug, since fixed, as there are other places where newlocale() gets 2281 * similarly called without problems. */ 2282 if ( entry_obj != LC_GLOBAL_LOCALE 2283 && locale_on_entry 2284 && strEQ(new_locale, locale_on_entry)) 2285 { 2286 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 2287 "bool_setlocale_2008_i: no-op to change to" 2288 " what it already was\n")); 2289 return true; 2290 } 2291 2292 # ifndef USE_QUERYLOCALE 2293 2294 /* Without a querylocale() mechanism, we have to figure out ourselves what 2295 * happens with setting a locale to "" */ 2296 2297 if (strEQ(new_locale, "")) { 2298 new_locale = find_locale_from_environment(index); 2299 if (! new_locale) { 2300 SET_EINVAL; 2301 return false; 2302 } 2303 } 2304 2305 # endif 2306 # ifdef NEWLOCALE_HANDLES_DISPARATE_LC_ALL 2307 2308 const bool need_loop = false; 2309 2310 # else 2311 2312 bool need_loop = false; 2313 const char * new_locales[LC_ALL_INDEX_] = { NULL }; 2314 2315 /* If we're going to have to parse the LC_ALL string, might as well do it 2316 * now before we have made changes that we would have to back out of if the 2317 * parse fails */ 2318 if (index == LC_ALL_INDEX_) { 2319 switch (parse_LC_ALL_string(new_locale, 2320 (const char **) &new_locales, 2321 override_if_ignored, 2322 false, /* Return only [0] if suffices */ 2323 false, /* Don't panic on error */ 2324 caller_line)) 2325 { 2326 case invalid: 2327 SET_EINVAL; 2328 return false; 2329 2330 case no_array: 2331 need_loop = false; 2332 break; 2333 2334 case only_element_0: 2335 SAVEFREEPV(new_locales[0]); 2336 new_locale = new_locales[0]; 2337 need_loop = false; 2338 break; 2339 2340 case full_array: 2341 need_loop = true; 2342 break; 2343 } 2344 } 2345 2346 # endif 2347 # ifdef HAS_GLIBC_LC_MESSAGES_BUG 2348 2349 /* For this bug, if the LC_MESSAGES locale changes, we have to do an 2350 * expensive workaround. Save the current value so we can later determine 2351 * if it changed. */ 2352 const char * old_messages_locale = NULL; 2353 if ( (index == LC_MESSAGES_INDEX_ || index == LC_ALL_INDEX_) 2354 && LIKELY(PL_phase != PERL_PHASE_CONSTRUCT)) 2355 { 2356 old_messages_locale = querylocale_c(LC_MESSAGES); 2357 } 2358 2359 # endif 2360 2361 assert(PL_C_locale_obj); 2362 2363 /* Now ready to switch to the input 'new_locale' */ 2364 2365 /* Switching locales generally entails freeing the current one's space (at 2366 * the C library's discretion), hence we can't be using that locale at the 2367 * time of the switch (this wasn't obvious to khw from the man pages). So 2368 * switch to a known locale object that we don't otherwise mess with. */ 2369 if (! uselocale(PL_C_locale_obj)) { 2370 2371 /* Not being able to change to the C locale is severe; don't keep 2372 * going. */ 2373 setlocale_failure_panic_i(index, locale_on_entry, "C", 2374 __LINE__, caller_line); 2375 NOT_REACHED; /* NOTREACHED */ 2376 } 2377 2378 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 2379 "bool_setlocale_2008_i: now using C" 2380 " object=%p\n", PL_C_locale_obj)); 2381 2382 /* These two objects are special: 2383 * LC_GLOBAL_LOCALE because it is undefined behavior to call 2384 * newlocale() with it as a parameter. 2385 * PL_C_locale_obj because newlocale() generally destroys its locale 2386 * object parameter when it succeeds; and we don't 2387 * want that happening to this immutable object. 2388 * Copies will be made for them to use instead if we get so far as to call 2389 * newlocale(). */ 2390 bool entry_obj_is_special = ( entry_obj == LC_GLOBAL_LOCALE 2391 || entry_obj == PL_C_locale_obj); 2392 locale_t new_obj; 2393 2394 /* PL_C_locale_obj is LC_ALL set to the C locale. If this call is to 2395 * switch to LC_ALL => C, simply use that object. But in fact, we already 2396 * have switched to it just above, in preparation for the general case. 2397 * Since we're already there, no need to do further switching. */ 2398 if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(new_locale)) { 2399 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 2400 "bool_setlocale_2008_i: will stay in C" 2401 " object\n")); 2402 new_obj = PL_C_locale_obj; 2403 2404 /* 'entry_obj' is now dangling, of no further use to anyone (unless it 2405 * is one of the special ones). Free it to avoid a leak */ 2406 if (! entry_obj_is_special) { 2407 freelocale(entry_obj); 2408 } 2409 2410 update_PL_curlocales_i(index, new_locale, caller_line); 2411 } 2412 else { /* Here is the general case, not to LC_ALL => C */ 2413 2414 /* The newlocale() call(s) below take a basis object to build upon to 2415 * create the changed locale, trashing it iff successful. 2416 * 2417 * For the objects that are not to be modified by this function, we 2418 * create a duplicate that gets trashed instead. 2419 * 2420 * Also if we will have to loop doing multiple newlocale()s, there is a 2421 * chance we will succeed for the first few, and then fail, having to 2422 * back out. We need to duplicate 'entry_obj' in this case as well, so 2423 * it remains valid as something to back out to. */ 2424 locale_t basis_obj = entry_obj; 2425 2426 if (entry_obj_is_special || need_loop) { 2427 basis_obj = duplocale(basis_obj); 2428 if (! basis_obj) { 2429 locale_panic_via_("duplocale failed", __FILE__, caller_line); 2430 NOT_REACHED; /* NOTREACHED */ 2431 } 2432 2433 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 2434 "bool_setlocale_2008_i created %p by" 2435 " duping the input\n", basis_obj)); 2436 } 2437 2438 # define DEBUG_NEW_OBJECT_CREATED(category, locale, new, old, caller_line) \ 2439 DEBUG_Lv(PerlIO_printf(Perl_debug_log, \ 2440 "bool_setlocale_2008_i(%s, %s): created %p" \ 2441 " while freeing %p; called from %" LINE_Tf \ 2442 " via %" LINE_Tf "\n", \ 2443 category, locale, new, old, \ 2444 caller_line, (line_t)__LINE__)) 2445 # define DEBUG_NEW_OBJECT_FAILED(category, locale, basis_obj) \ 2446 DEBUG_L(PerlIO_printf(Perl_debug_log, \ 2447 "bool_setlocale_2008_i: creating new object" \ 2448 " for (%s '%s') from %p failed; called from %" \ 2449 LINE_Tf " via %" LINE_Tf "\n", \ 2450 category, locale, basis_obj, \ 2451 caller_line, (line_t)__LINE__)); 2452 2453 /* Ready to create a new locale by modification of the existing one. 2454 * 2455 * NOTE: This code may incorrectly show up as a leak under the address 2456 * sanitizer. We do not free this object under normal teardown, however 2457 * you can set PERL_DESTRUCT_LEVEL=2 to cause it to be freed. 2458 */ 2459 2460 # ifdef NEWLOCALE_HANDLES_DISPARATE_LC_ALL 2461 2462 /* Some platforms have a newlocale() that can handle disparate LC_ALL 2463 * input, so on these a single call to newlocale() always works */ 2464 # else 2465 2466 /* If a single call to newlocale() will do */ 2467 if (! need_loop) 2468 2469 # endif 2470 2471 { 2472 new_obj = newlocale(mask, 2473 override_ignored_category(index, new_locale), 2474 basis_obj); 2475 if (! new_obj) { 2476 DEBUG_NEW_OBJECT_FAILED(category_names[index], new_locale, 2477 basis_obj); 2478 2479 /* Since the call failed, it didn't trash 'basis_obj', which is 2480 * a dup for these objects, and hence would leak if we don't 2481 * free it. XXX However, something is seriously wrong if we 2482 * can't switch to C or the global locale, so maybe should 2483 * panic instead */ 2484 if (entry_obj_is_special) { 2485 freelocale(basis_obj); 2486 } 2487 2488 goto must_restore_state; 2489 } 2490 2491 DEBUG_NEW_OBJECT_CREATED(category_names[index], new_locale, 2492 new_obj, basis_obj, caller_line); 2493 2494 update_PL_curlocales_i(index, new_locale, caller_line); 2495 } 2496 2497 # ifndef NEWLOCALE_HANDLES_DISPARATE_LC_ALL 2498 2499 else { /* Need multiple newlocale() calls */ 2500 2501 /* Loop through the individual categories, setting the locale of 2502 * each to the corresponding name previously populated into 2503 * newlocales[]. Each iteration builds on the previous one, adding 2504 * its category to what's already been calculated, and taking as a 2505 * basis for what's been calculated 'basis_obj', which is updated 2506 * each iteration to be the result of the previous one. Upon 2507 * success, newlocale() trashes the 'basis_obj' parameter to it. 2508 * If any iteration fails, we immediately give up, restore the 2509 * locale to what it was at the time this function was called 2510 * (saved in 'entry_obj'), and return failure. */ 2511 2512 /* Loop, using the previous iteration's result as the basis for the 2513 * next one. (The first time we effectively use the locale in 2514 * force upon entry to this function.) */ 2515 for_all_individual_category_indexes(i) { 2516 new_obj = newlocale(category_masks[i], 2517 new_locales[i], 2518 basis_obj); 2519 if (new_obj) { 2520 DEBUG_NEW_OBJECT_CREATED(category_names[i], 2521 new_locales[i], 2522 new_obj, basis_obj, 2523 caller_line); 2524 basis_obj = new_obj; 2525 continue; 2526 } 2527 2528 /* Failed. Likely this is because the proposed new locale 2529 * isn't valid on this system. */ 2530 2531 DEBUG_NEW_OBJECT_FAILED(category_names[i], 2532 new_locales[i], 2533 basis_obj); 2534 2535 /* newlocale() didn't trash this, since the function call 2536 * failed */ 2537 freelocale(basis_obj); 2538 2539 for_all_individual_category_indexes(j) { 2540 Safefree(new_locales[j]); 2541 } 2542 2543 goto must_restore_state; 2544 } 2545 2546 /* Success for all categories. */ 2547 for_all_individual_category_indexes(i) { 2548 update_PL_curlocales_i(i, new_locales[i], caller_line); 2549 Safefree(new_locales[i]); 2550 } 2551 2552 /* We dup'd entry_obj in case we had to fall back to it. The 2553 * newlocale() above destroyed the dup when it first succeeded, but 2554 * entry_obj itself is left dangling, so free it */ 2555 if (! entry_obj_is_special) { 2556 freelocale(entry_obj); 2557 } 2558 } 2559 2560 # endif /* End of newlocale can't handle disparate LC_ALL input */ 2561 2562 } 2563 2564 # undef DEBUG_NEW_OBJECT_CREATED 2565 # undef DEBUG_NEW_OBJECT_FAILED 2566 2567 /* Here, successfully created an object representing the desired locale; 2568 * now switch into it */ 2569 if (! uselocale(new_obj)) { 2570 freelocale(new_obj); 2571 locale_panic_(Perl_form(aTHX_ "(called from %" LINE_Tf "):" 2572 " bool_setlocale_2008_i: switching" 2573 " into new locale failed", 2574 caller_line)); 2575 } 2576 2577 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 2578 "bool_setlocale_2008_i: now using %p\n", new_obj)); 2579 2580 # ifdef MULTIPLICITY /* Unlikely, but POSIX 2008 functions could be 2581 Configured to be used on unthreaded perls, in which 2582 case this object doesn't exist */ 2583 2584 if (DEBUG_Lv_TEST) { 2585 if (PL_cur_locale_obj != new_obj) { 2586 PerlIO_printf(Perl_debug_log, 2587 "bool_setlocale_2008_i: PL_cur_locale_obj" 2588 " was %p, now is %p\n", 2589 PL_cur_locale_obj, new_obj); 2590 } 2591 } 2592 2593 /* Update the current object */ 2594 PL_cur_locale_obj = new_obj; 2595 2596 # endif 2597 # ifdef HAS_GLIBC_LC_MESSAGES_BUG 2598 2599 /* Invalidate the glibc cache of loaded translations if the locale has 2600 * changed, see [perl #134264] and 2601 * https://sourceware.org/bugzilla/show_bug.cgi?id=24936 */ 2602 if (old_messages_locale) { 2603 if (strNE(old_messages_locale, querylocale_c(LC_MESSAGES))) { 2604 textdomain(textdomain(NULL)); 2605 } 2606 } 2607 2608 # endif 2609 2610 return true; 2611 2612 must_restore_state: 2613 2614 /* We earlier switched to the LC_ALL => C locale in anticipation of it 2615 * succeeding, Now have to switch back to the state upon entry. */ 2616 if (! uselocale(entry_obj)) { 2617 setlocale_failure_panic_i(index, "switching back to", 2618 locale_on_entry, __LINE__, caller_line); 2619 } 2620 2621 return false; 2622 } 2623 2624 /*===========================================================================*/ 2625 2626 #else 2627 # error Unexpected Configuration 2628 #endif /* End of the various implementations of the setlocale and 2629 querylocale macros used in the remainder of this program */ 2630 2631 /*===========================================================================*/ 2632 2633 /* Each implementation above is based on two fundamental macros #defined above: 2634 * 1) either a querylocale_r or a querylocale_i 2635 * 2) either a bool_setlocale_r or a bool_setlocale_i 2636 * 2637 * (Which one of each got #defined is based on which is most efficient in 2638 * interacting with the underlying libc functions called.) 2639 * 2640 * To complete the implementation, macros for the missing two suffixes must be 2641 * #defined, as well as all the void_setlocale_X() forms. These all can be 2642 * mechanically derived from the fundamental ones. */ 2643 2644 #ifdef querylocale_r 2645 # define querylocale_c(cat) querylocale_r(cat) 2646 # define querylocale_i(i) querylocale_r(categories[i]) 2647 #elif defined(querylocale_i) 2648 # define querylocale_c(cat) querylocale_i(cat##_INDEX_) 2649 # define querylocale_r(cat) querylocale_i(get_category_index(cat)) 2650 #else 2651 # error No querylocale() form defined 2652 #endif 2653 2654 #ifdef bool_setlocale_r 2655 # define bool_setlocale_i(i, l) bool_setlocale_r(categories[i], l) 2656 # define bool_setlocale_c(cat, l) bool_setlocale_r(cat, l) 2657 2658 # define void_setlocale_r_with_caller(cat, locale, file, line) \ 2659 STMT_START { \ 2660 if (! bool_setlocale_r(cat, locale)) \ 2661 setlocale_failure_panic_via_i(get_category_index(cat), \ 2662 NULL, locale, __LINE__, 0, \ 2663 file, line); \ 2664 } STMT_END 2665 2666 # define void_setlocale_c_with_caller(cat, locale, file, line) \ 2667 void_setlocale_r_with_caller(cat, locale, file, line) 2668 2669 # define void_setlocale_i_with_caller(i, locale, file, line) \ 2670 void_setlocale_r_with_caller(categories[i], locale, file, line) 2671 2672 # define void_setlocale_r(cat, locale) \ 2673 void_setlocale_r_with_caller(cat, locale, __FILE__, __LINE__) 2674 # define void_setlocale_c(cat, locale) \ 2675 void_setlocale_r(cat, locale) 2676 # define void_setlocale_i(i, locale) \ 2677 void_setlocale_r(categories[i], locale) 2678 2679 #elif defined(bool_setlocale_i) 2680 # define bool_setlocale_c(cat, loc) bool_setlocale_i(cat##_INDEX_, loc) 2681 # define bool_setlocale_r(c, loc) bool_setlocale_i(get_category_index(c), l) 2682 2683 # define void_setlocale_i_with_caller(i, locale, file, line) \ 2684 STMT_START { \ 2685 if (! bool_setlocale_i(i, locale)) \ 2686 setlocale_failure_panic_via_i(i, NULL, locale, __LINE__, 0, \ 2687 file, line); \ 2688 } STMT_END 2689 2690 # define void_setlocale_r_with_caller(cat, locale, file, line) \ 2691 void_setlocale_i_with_caller(get_category_index(cat), locale, \ 2692 file, line) 2693 2694 # define void_setlocale_c_with_caller(cat, locale, file, line) \ 2695 void_setlocale_i_with_caller(cat##_INDEX_, locale, file, line) 2696 2697 # define void_setlocale_i(i, locale) \ 2698 void_setlocale_i_with_caller(i, locale, __FILE__, __LINE__) 2699 # define void_setlocale_c(cat, locale) \ 2700 void_setlocale_i(cat##_INDEX_, locale) 2701 # define void_setlocale_r(cat, locale) \ 2702 void_setlocale_i(get_category_index(cat), locale) 2703 2704 #else 2705 # error No bool_setlocale() form defined 2706 #endif 2707 2708 /*===========================================================================*/ 2709 2710 /* Most of the cases in this file just toggle the locale briefly; but there are 2711 * a few instances where a longer toggled interval, over multiple operations, 2712 * is desirable, since toggling and untoggling have a cost. But on platforms 2713 * where toggling must be done in a critical section, it is even more desirable 2714 * to minimize the length of time in an uninterruptable state. 2715 * 2716 * The macros below try to balance these competing interests. When the 2717 * toggling is to be brief, simply use the plain "toggle_locale" macros. But 2718 * in addition, in the places where an over-arching toggle would be nice, add 2719 * calls to the macros below that have the "_locking" suffix. These are no-ops 2720 * except on systems where the toggling doesn't force a critical section. But 2721 * otherwise these toggle to the over-arching locale. When the individual 2722 * toggles are executed, they will check and find that the locale is already in 2723 * the right state, and return without doing anything. */ 2724 #if TOGGLING_LOCKS 2725 # define toggle_locale_c_unless_locking(cat, locale) NULL 2726 # define toggle_locale_c_if_locking( cat, locale) \ 2727 toggle_locale_i(cat##_INDEX_, locale) 2728 2729 # define restore_toggled_locale_c_unless_locking(cat, locale) \ 2730 PERL_UNUSED_ARG(locale) 2731 # define restore_toggled_locale_c_if_locking( cat, locale) \ 2732 restore_toggled_locale_i( cat##_INDEX_, locale) 2733 #else 2734 # define toggle_locale_c_unless_locking(cat, locale) \ 2735 toggle_locale_i(cat##_INDEX_, locale) 2736 # define toggle_locale_c_if_locking( cat, locale) NULL 2737 2738 # define restore_toggled_locale_c_unless_locking(cat, locale) \ 2739 restore_toggled_locale_i(cat##_INDEX_, locale) 2740 # define restore_toggled_locale_c_if_locking( cat, locale) \ 2741 PERL_UNUSED_ARG(locale) 2742 #endif 2743 2744 /* query_nominal_locale_i() is used when the caller needs the locale that an 2745 * external caller would be expecting, and not what we're secretly using 2746 * behind the scenes. It deliberately doesn't handle LC_ALL; use 2747 * calculate_LC_ALL_string() for that. */ 2748 #ifdef USE_LOCALE_NUMERIC 2749 # define query_nominal_locale_i(i) \ 2750 (__ASSERT_(i != LC_ALL_INDEX_) \ 2751 ((i == LC_NUMERIC_INDEX_) ? PL_numeric_name : querylocale_i(i))) 2752 #elif defined(USE_LOCALE) 2753 # define query_nominal_locale_i(i) \ 2754 (__ASSERT_(i != LC_ALL_INDEX_) querylocale_i(i)) 2755 #else 2756 # define query_nominal_locale_i(i) "C" 2757 #endif 2758 2759 #ifdef USE_PL_CURLOCALES 2760 2761 STATIC void 2762 S_update_PL_curlocales_i(pTHX_ 2763 const locale_category_index index, 2764 const char * new_locale, 2765 const line_t caller_line) 2766 { 2767 /* Update PL_curlocales[], which is parallel to the other ones indexed by 2768 * our mapping of libc category number to our internal equivalents. */ 2769 2770 PERL_ARGS_ASSERT_UPDATE_PL_CURLOCALES_I; 2771 2772 if (index == LC_ALL_INDEX_) { 2773 2774 /* For LC_ALL, we change all individual categories to correspond, 2775 * including the LC_ALL element */ 2776 for (unsigned int i = 0; i <= LC_ALL_INDEX_; i++) { 2777 Safefree(PL_curlocales[i]); 2778 PL_curlocales[i] = NULL; 2779 } 2780 2781 switch (parse_LC_ALL_string(new_locale, 2782 (const char **) &PL_curlocales, 2783 check_that_overridden, /* things should 2784 have already 2785 been overridden 2786 */ 2787 true, /* Always fill array */ 2788 true, /* Panic if fails, as to get here 2789 it earlier had to have succeeded 2790 */ 2791 caller_line)) 2792 { 2793 case invalid: 2794 case no_array: 2795 case only_element_0: 2796 locale_panic_via_("Unexpected return from parse_LC_ALL_string", 2797 __FILE__, caller_line); 2798 2799 case full_array: 2800 /* parse_LC_ALL_string() has already filled PL_curlocales properly, 2801 * except for the LC_ALL element, which should be set to 2802 * 'new_locale'. */ 2803 PL_curlocales[LC_ALL_INDEX_] = savepv(new_locale); 2804 } 2805 } 2806 else { /* Not LC_ALL */ 2807 2808 /* Update the single category's record */ 2809 Safefree(PL_curlocales[index]); 2810 PL_curlocales[index] = savepv(new_locale); 2811 2812 /* Invalidate LC_ALL */ 2813 Safefree(PL_curlocales[LC_ALL_INDEX_]); 2814 PL_curlocales[LC_ALL_INDEX_] = NULL; 2815 } 2816 } 2817 2818 # endif /* Need PL_curlocales[] */ 2819 2820 /*===========================================================================*/ 2821 2822 #if defined(USE_LOCALE) 2823 2824 /* This paradigm is needed in several places in the function below. We have to 2825 * substitute the nominal locale for LC_NUMERIC when returning a value for 2826 * external consumption */ 2827 # ifndef USE_LOCALE_NUMERIC 2828 # define ENTRY(i, array, format) array[i] 2829 # else 2830 # define ENTRY(i, array, format) \ 2831 (UNLIKELY( format == EXTERNAL_FORMAT_FOR_QUERY \ 2832 && i == LC_NUMERIC_INDEX_) \ 2833 ? PL_numeric_name \ 2834 : array[i]) 2835 # endif 2836 2837 STATIC 2838 const char * 2839 S_calculate_LC_ALL_string(pTHX_ const char ** category_locales_list, 2840 const calc_LC_ALL_format format, 2841 const calc_LC_ALL_return returning, 2842 const line_t caller_line) 2843 { 2844 PERL_ARGS_ASSERT_CALCULATE_LC_ALL_STRING; 2845 2846 /* NOTE: On Configurations that have PL_curlocales[], this function has the 2847 * side effect of updating the LC_ALL_INDEX_ element with its result. 2848 * 2849 * This function calculates a string that defines the locale(s) LC_ALL is 2850 * set to, in either: 2851 * 1) Our internal format if 'format' is set to INTERNAL_FORMAT. 2852 * 2) The external format returned by Perl_setlocale() if 'format' is set 2853 * to EXTERNAL_FORMAT_FOR_QUERY or EXTERNAL_FORMAT_FOR_SET. 2854 * 2855 * These two are distinguished by: 2856 * a) EXTERNAL_FORMAT_FOR_SET returns the actual locale currently in 2857 * effect. 2858 * b) EXTERNAL_FORMAT_FOR_QUERY returns the nominal locale. 2859 * Currently this can differ only from the actual locale in the 2860 * LC_NUMERIC category when it is set to a locale whose radix is 2861 * not a dot. (The actual locale is kept as a dot to accommodate 2862 * the large corpus of XS code that expects it to be that; 2863 * switched to a non-dot temporarily during certain operations 2864 * that require the actual radix.) 2865 * 2866 * In both 1) and 2), LC_ALL's values are passed to this function by 2867 * 'category_locales_list' which is either: 2868 * 1) a pointer to an array of strings with up-to-date values of all the 2869 * individual categories; or 2870 * 2) NULL, to indicate to use querylocale_i() to get each individual 2871 * value. 2872 * 2873 * The caller sets 'returning' to 2874 * WANT_TEMP_PV the function returns the calculated string 2875 * as a mortalized temporary, so the caller 2876 * doesn't have to worry about it being 2877 * per-thread, nor needs to arrange for its 2878 * clean-up. 2879 * WANT_PL_setlocale_buf the function stores the calculated string 2880 * into the per-thread buffer PL_setlocale_buf 2881 * and returns a pointer to that. The buffer 2882 * is cleaned up automatically in process 2883 * destruction. This return method avoids 2884 * extra copies in some circumstances. 2885 * WANT_VOID NULL is returned. This is used when the 2886 * function is being called only for its side 2887 * effect of updating 2888 * PL_curlocales[LC_ALL_INDEX_] 2889 * 2890 * querylocale(), on systems that have it, doesn't tend to work for LC_ALL. 2891 * So we have to construct the answer ourselves based on the passed in 2892 * data. 2893 * 2894 * If all individual categories are the same locale, we can just set LC_ALL 2895 * to that locale. But if not, we have to create an aggregation of all the 2896 * categories on the system. Platforms differ as to the syntax they use 2897 * for these non-uniform locales for LC_ALL. Some, like glibc and Windows, 2898 * use an unordered series of name=value pairs, like 2899 * LC_NUMERIC=C;LC_TIME=en_US.UTF-8;... 2900 * to specify LC_ALL; others, like *BSD, use a positional notation with a 2901 * delimitter, typically a single '/' character: 2902 * C/en_UK.UTF-8/... 2903 * 2904 * When the external format is desired, this function returns whatever the 2905 * system expects. The internal format is always name=value pairs. 2906 * 2907 * For systems that have categories we don't know about, the algorithm 2908 * below won't know about those missing categories, leading to potential 2909 * bugs for code that looks at them. If there is an environment variable 2910 * that sets that category, we won't know to look for it, and so our use of 2911 * LANG or "C" improperly overrides it. On the other hand, if we don't do 2912 * what is done here, and there is no environment variable, the category's 2913 * locale should be set to LANG or "C". So there is no good solution. khw 2914 * thinks the best is to make sure we have a complete list of possible 2915 * categories, adding new ones as they show up on obscure platforms. 2916 */ 2917 2918 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 2919 "Entering calculate_LC_ALL_string(%s);" 2920 " called from %" LINE_Tf "\n", 2921 ((format == EXTERNAL_FORMAT_FOR_QUERY) 2922 ? "EXTERNAL_FORMAT_FOR_QUERY" 2923 : ((format == EXTERNAL_FORMAT_FOR_SET) 2924 ? "EXTERNAL_FORMAT_FOR_SET" 2925 : "INTERNAL_FORMAT")), 2926 caller_line)); 2927 2928 bool input_list_was_NULL = (category_locales_list == NULL); 2929 2930 /* If there was no input category list, construct a temporary one 2931 * ourselves. */ 2932 const char * my_category_locales_list[LC_ALL_INDEX_]; 2933 const char ** locales_list = category_locales_list; 2934 if (locales_list == NULL) { 2935 locales_list = my_category_locales_list; 2936 2937 if (format == EXTERNAL_FORMAT_FOR_QUERY) { 2938 for_all_individual_category_indexes(i) { 2939 locales_list[i] = query_nominal_locale_i(i); 2940 } 2941 } 2942 else { 2943 for_all_individual_category_indexes(i) { 2944 locales_list[i] = querylocale_i(i); 2945 } 2946 } 2947 } 2948 2949 /* While we are calculating LC_ALL, we see if every category's locale is 2950 * the same as every other's or not. */ 2951 # ifndef HAS_IGNORED_LOCALE_CATEGORIES_ 2952 2953 /* When we pay attention to all categories, we assume they are all the same 2954 * until proven different */ 2955 bool disparate = false; 2956 2957 # else 2958 2959 /* But if there are ignored categories, those will be set to "C", so try an 2960 * arbitrary category, and if it isn't C, we know immediately that the 2961 * locales are disparate. (The #if conditionals are to handle the case 2962 * where LC_NUMERIC_INDEX_ is 0. We don't want to use LC_NUMERIC to 2963 * compare, as that may be different between external and internal forms.) 2964 * */ 2965 # if ! defined(USE_LOCALE_NUMERIC) 2966 2967 bool disparate = ! isNAME_C_OR_POSIX(locales_list[0]); 2968 2969 # elif LC_NUMERIC_INDEX_ != 0 2970 2971 bool disparate = ! isNAME_C_OR_POSIX(locales_list[0]); 2972 2973 # else 2974 2975 /* Would need revision to handle the very unlikely case where only a single 2976 * category, LC_NUMERIC, is defined */ 2977 assert(LOCALE_CATEGORIES_COUNT_ > 0); 2978 2979 bool disparate = ! isNAME_C_OR_POSIX(locales_list[1]); 2980 2981 # endif 2982 # endif 2983 2984 /* Calculate the needed size for the string listing the individual locales. 2985 * Initialize with values known at compile time. */ 2986 Size_t total_len; 2987 const char *separator; 2988 2989 # ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS /* Positional formatted LC_ALL */ 2990 PERL_UNUSED_ARG(format); 2991 # else 2992 2993 if (format != INTERNAL_FORMAT) { 2994 2995 /* Here, we will be using positional notation. it includes n-1 2996 * separators */ 2997 total_len = ( LOCALE_CATEGORIES_COUNT_ - 1) 2998 * STRLENs(PERL_LC_ALL_SEPARATOR) 2999 + 1; /* And a trailing NUL */ 3000 separator = PERL_LC_ALL_SEPARATOR; 3001 } 3002 else 3003 3004 # endif 3005 3006 { 3007 /* name=value output is always used in internal format, and when 3008 * positional isn't available on the platform. */ 3009 total_len = lc_all_boiler_plate_length; 3010 separator = ";"; 3011 } 3012 3013 /* The total length then is just the sum of the above boiler-plate plus the 3014 * total strlen()s of the locale name of each individual category. */ 3015 for_all_individual_category_indexes(i) { 3016 const char * entry = ENTRY(i, locales_list, format); 3017 3018 total_len += strlen(entry); 3019 if (! disparate && strNE(entry, locales_list[0])) { 3020 disparate = true; 3021 } 3022 } 3023 3024 bool free_if_void_return = false; 3025 const char * retval; 3026 3027 /* If all categories have the same locale, we already know the answer */ 3028 if (! disparate) { 3029 if (returning == WANT_PL_setlocale_buf) { 3030 save_to_buffer(locales_list[0], 3031 &PL_setlocale_buf, 3032 &PL_setlocale_bufsize); 3033 retval = PL_setlocale_buf; 3034 } 3035 else { 3036 3037 retval = locales_list[0]; 3038 3039 /* If a temporary is wanted for the return, and we had to create 3040 * the input list ourselves, we created it into such a temporary, 3041 * so no further work is needed; but otherwise, make a mortal copy 3042 * of this passed-in list element */ 3043 if (returning == WANT_TEMP_PV && ! input_list_was_NULL) { 3044 retval = savepv(retval); 3045 SAVEFREEPV(retval); 3046 } 3047 3048 /* In all cases here, there's nothing we create that needs to be 3049 * freed, so leave 'free_if_void_return' set to the default 3050 * 'false'. */ 3051 } 3052 } 3053 else { /* Here, not all categories have the same locale */ 3054 3055 char * constructed; 3056 3057 /* If returning to PL_setlocale_buf, set up to write directly to it, 3058 * being sure it is resized to be large enough */ 3059 if (returning == WANT_PL_setlocale_buf) { 3060 set_save_buffer_min_size(total_len, 3061 &PL_setlocale_buf, 3062 &PL_setlocale_bufsize); 3063 constructed = PL_setlocale_buf; 3064 } 3065 else { /* Otherwise we need new memory to hold the calculated value. */ 3066 3067 Newx(constructed, total_len, char); 3068 3069 /* If returning the new memory, it must be set up to be freed 3070 * later; otherwise at the end of this function */ 3071 if (returning == WANT_TEMP_PV) { 3072 SAVEFREEPV(constructed); 3073 } 3074 else { 3075 free_if_void_return = true; 3076 } 3077 } 3078 3079 constructed[0] = '\0'; 3080 3081 /* Loop through all the categories */ 3082 for_all_individual_category_indexes(j) { 3083 3084 /* Add a separator, except before the first one */ 3085 if (j != 0) { 3086 my_strlcat(constructed, separator, total_len); 3087 } 3088 3089 const char * entry; 3090 Size_t needed_len; 3091 unsigned int i = j; 3092 3093 # ifndef PERL_LC_ALL_USES_NAME_VALUE_PAIRS 3094 3095 if (UNLIKELY(format != INTERNAL_FORMAT)) { 3096 3097 /* In positional notation 'j' means the position, and we have 3098 * to convert to the index 'i' */ 3099 i = map_LC_ALL_position_to_index[j]; 3100 3101 entry = ENTRY(i, locales_list, format); 3102 needed_len = my_strlcat(constructed, entry, total_len); 3103 } 3104 else 3105 3106 # endif 3107 { 3108 /* Below, we are to use name=value notation, either because 3109 * that's what the platform uses, or because this is the 3110 * internal format, which uses that notation regardless of the 3111 * external form */ 3112 3113 entry = ENTRY(i, locales_list, format); 3114 3115 /* "name=locale;" */ 3116 my_strlcat(constructed, category_names[i], total_len); 3117 my_strlcat(constructed, "=", total_len); 3118 needed_len = my_strlcat(constructed, entry, total_len); 3119 } 3120 3121 if (LIKELY(needed_len <= total_len)) { 3122 continue; 3123 } 3124 3125 /* If would have overflowed, panic */ 3126 locale_panic_via_(Perl_form(aTHX_ 3127 "Internal length calculation wrong.\n" 3128 "\"%s\" was not entirely added to" 3129 " \"%.*s\"; needed=%zu, had=%zu", 3130 entry, (int) total_len, 3131 constructed, 3132 needed_len, total_len), 3133 __FILE__, 3134 caller_line); 3135 } /* End of loop through the categories */ 3136 3137 retval = constructed; 3138 } /* End of the categories' locales are displarate */ 3139 3140 # if defined(USE_PL_CURLOCALES) && defined(LC_ALL) 3141 3142 if (format == INTERNAL_FORMAT) { 3143 3144 /* PL_curlocales[LC_ALL_INDEX_] is updated as a side-effect of this 3145 * function for internal format. */ 3146 Safefree(PL_curlocales[LC_ALL_INDEX_]); 3147 PL_curlocales[LC_ALL_INDEX_] = savepv(retval); 3148 } 3149 3150 # endif 3151 3152 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 3153 "calculate_LC_ALL_string calculated '%s'\n", 3154 retval)); 3155 3156 if (returning == WANT_VOID) { 3157 if (free_if_void_return) { 3158 Safefree(retval); 3159 } 3160 3161 return NULL; 3162 } 3163 3164 return retval; 3165 } 3166 3167 # if defined(WIN32) || ( defined(USE_POSIX_2008_LOCALE) \ 3168 && ! defined(USE_QUERYLOCALE)) 3169 3170 STATIC const char * 3171 S_find_locale_from_environment(pTHX_ const locale_category_index index) 3172 { 3173 /* NB: This function may actually change the locale on Windows. It 3174 * currently is designed to be called only from setting the locale on 3175 * Windows, and POSIX 2008 3176 * 3177 * This function returns the locale specified by the program's environment 3178 * for the category specified by our internal index number 'index'. It 3179 * therefore simulates: 3180 * setlocale(cat, "") 3181 * but, except for some cases in Windows, doesn't actually change the 3182 * locale; merely returns it. 3183 * 3184 * The return need not be freed by the caller. This 3185 * promise relies on PerlEnv_getenv() returning a mortalized copy to us. 3186 * 3187 * The simulation is needed only on certain platforms; otherwise, libc is 3188 * called with "" to get the actual value(s). The simulation is needed 3189 * for: 3190 * 3191 * 1) On Windows systems, the concept of the POSIX ordering of 3192 * environment variables is missing. To increase portability of 3193 * programs across platforms, the POSIX ordering is emulated on 3194 * Windows. 3195 * 3196 * 2) On POSIX 2008 systems without querylocale(), it is problematic 3197 * getting the results of the POSIX 2008 equivalent of 3198 * 3199 * setlocale(category, "") 3200 * 3201 * To ensure that we know exactly what those values are, we do the 3202 * setting ourselves, using the documented algorithm specified by the 3203 * POSIX standard (assuming the platform follows the Standard) rather 3204 * than use "" as the locale. This will lead to results that differ 3205 * from native behavior if the native behavior differs from the 3206 * Standard's documented value, but khw believes it is better to know 3207 * what's going on, even if different from native, than to just guess. 3208 * 3209 * glibc systems differ from this standard in having a LANGUAGE 3210 * environment variable used for just LC_MESSAGES. This function does 3211 * NOT handle that. 3212 * 3213 * Another option for the POSIX 2008 case would be, in a critical 3214 * section, to save the global locale's current value, and do a 3215 * straight setlocale(LC_ALL, ""). That would return our desired 3216 * values, destroying the global locale's, which we would then 3217 * restore. But that could cause races with any other thread that is 3218 * using the global locale and isn't using the mutex. And, the only 3219 * reason someone would have done that is because they are calling a 3220 * library function, like in gtk, that calls setlocale(), and which 3221 * can't be changed to use the mutex. That wouldn't be a problem if 3222 * this were to be done before any threads had switched, say during 3223 * perl construction time. But this code would still be needed for 3224 * the general case. 3225 * 3226 * The Windows and POSIX 2008 differ in that the ultimate fallback is "C" 3227 * in POSIX, and is the system default locale in Windows. To get that 3228 * system default value, we actually have to call setlocale() on Windows. 3229 */ 3230 3231 const char * const lc_all = PerlEnv_getenv("LC_ALL"); 3232 const char * locale_names[LC_ALL_INDEX_] = { NULL }; 3233 3234 /* Use any "LC_ALL" environment variable, as it overrides everything else. 3235 * */ 3236 if (lc_all && strNE(lc_all, "")) { 3237 return lc_all; 3238 } 3239 3240 /* Here, no usable LC_ALL environment variable. We have to handle each 3241 * category separately. If all categories are desired, we loop through 3242 * them all. If only an individual category is desired, to avoid 3243 * duplicating logic, we use the same loop, but set up the limits so it is 3244 * only executed once, for that particular category. */ 3245 locale_category_index lower, upper, offset; 3246 if (index == LC_ALL_INDEX_) { 3247 lower = (locale_category_index) 0; 3248 upper = (locale_category_index) ((int) LC_ALL_INDEX_ - 1); 3249 offset = (locale_category_index) 0; 3250 } 3251 else { 3252 lower = index; 3253 upper = index; 3254 3255 /* 'offset' is used so that the result of the single loop iteration is 3256 * stored into output[0] */ 3257 offset = lower; 3258 } 3259 3260 /* When no LC_ALL environment variable, LANG is used as a default, but 3261 * overridden for individual categories that have corresponding environment 3262 * variables. If no LANG exists, the default is "C" on POSIX 2008, or the 3263 * system default for the category on Windows. */ 3264 const char * env_lang = NULL; 3265 3266 /* For each desired category, use any corresponding environment variable; 3267 * or the default if none such exists. */ 3268 bool is_disparate = false; /* Assume is uniform until proven otherwise */ 3269 for_category_indexes_between(i, lower, upper) { 3270 const char * const env_override = PerlEnv_getenv(category_names[i]); 3271 locale_category_index j = (locale_category_index) (i - offset); 3272 3273 if (env_override && strNE(env_override, "")) { 3274 locale_names[j] = env_override; 3275 } 3276 else { /* Here, no corresponding environment variable, see if LANG 3277 exists and is usable. Done this way to avoid fetching LANG 3278 unless it is actually needed */ 3279 if (env_lang == NULL) { 3280 env_lang = PerlEnv_getenv("LANG"); 3281 3282 /* If not usable, set it to a non-NULL illegal value so won't 3283 * try to use it below */ 3284 if (env_lang == NULL || strEQ(env_lang, "")) { 3285 env_lang = (const char *) 1; 3286 } 3287 } 3288 3289 /* If a usable LANG exists, use it. */ 3290 if (env_lang != NULL && env_lang != (const char *) 1) { 3291 locale_names[j] = env_lang; 3292 } 3293 else { 3294 3295 # ifdef WIN32 3296 /* If no LANG, use the system default on Windows. */ 3297 locale_names[j] = wrap_wsetlocale(categories[i], ".ACP"); 3298 if (locale_names[j]) { 3299 SAVEFREEPV(locale_names[j]); 3300 } 3301 else 3302 # endif 3303 { /* If nothing was found or worked, use C */ 3304 locale_names[j] = "C"; 3305 } 3306 } 3307 } 3308 3309 if (j > 0 && ! is_disparate && strNE(locale_names[0], locale_names[j])) 3310 { 3311 is_disparate = true; 3312 } 3313 3314 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 3315 "find_locale_from_environment i=%u, j=%u, name=%s," 3316 " locale=%s, locale of 0th category=%s, disparate=%d\n", 3317 i, j, category_names[i], 3318 locale_names[j], locale_names[0], is_disparate)); 3319 } 3320 3321 if (! is_disparate) { 3322 return locale_names[0]; 3323 } 3324 3325 return calculate_LC_ALL_string(locale_names, INTERNAL_FORMAT, 3326 WANT_TEMP_PV, 3327 __LINE__); 3328 } 3329 3330 # endif 3331 # if defined(DEBUGGING) || defined(USE_PERL_SWITCH_LOCALE_CONTEXT) 3332 3333 STATIC const char * 3334 S_get_LC_ALL_display(pTHX) 3335 { 3336 return calculate_LC_ALL_string(NULL, INTERNAL_FORMAT, 3337 WANT_TEMP_PV, 3338 __LINE__); 3339 } 3340 3341 # endif 3342 3343 STATIC void 3344 S_setlocale_failure_panic_via_i(pTHX_ 3345 const locale_category_index cat_index, 3346 const char * current, 3347 const char * failed, 3348 const line_t proxy_caller_line, 3349 const line_t immediate_caller_line, 3350 const char * const higher_caller_file, 3351 const line_t higher_caller_line) 3352 { 3353 PERL_ARGS_ASSERT_SETLOCALE_FAILURE_PANIC_VIA_I; 3354 3355 /* Called to panic when a setlocale form unexpectedly failed for the 3356 * category determined by 'cat_index', and the locale that was in effect 3357 * (and likely still is) is 'current'. 'current' may be NULL, which causes 3358 * this function to query what it is. 3359 * 3360 * The extra caller information is used for when a function acts as a 3361 * stand-in for another function, which a typical reader would more likely 3362 * think would be the caller 3363 * 3364 * If a line number is 0, its stack (sort-of) frame is omitted; same if 3365 * it's the same line number as the next higher caller. */ 3366 3367 const int cat = categories[cat_index]; 3368 const char * name = category_names[cat_index]; 3369 3370 dSAVE_ERRNO; 3371 3372 if (current == NULL) { 3373 current = querylocale_i(cat_index); 3374 } 3375 3376 const char * proxy_text = ""; 3377 if (proxy_caller_line != 0 && proxy_caller_line != immediate_caller_line) 3378 { 3379 proxy_text = Perl_form(aTHX_ "\nCalled via %s: %" LINE_Tf, 3380 __FILE__, proxy_caller_line); 3381 } 3382 if ( strNE(__FILE__, higher_caller_file) 3383 || ( immediate_caller_line != 0 3384 && immediate_caller_line != higher_caller_line)) 3385 { 3386 proxy_text = Perl_form(aTHX_ "%s\nCalled via %s: %" LINE_Tf, 3387 proxy_text, __FILE__, 3388 immediate_caller_line); 3389 } 3390 3391 /* 'false' in the get_displayable_string() calls makes it not think the 3392 * locale is UTF-8, so just dumps bytes. Actually figuring it out can be 3393 * too complicated for a panic situation. */ 3394 const char * msg = Perl_form(aTHX_ 3395 "Can't change locale for %s (%d) from '%s' to '%s'" 3396 " %s", 3397 name, cat, 3398 get_displayable_string(current, 3399 current + strlen(current), 3400 false), 3401 get_displayable_string(failed, 3402 failed + strlen(failed), 3403 false), 3404 proxy_text); 3405 RESTORE_ERRNO; 3406 3407 Perl_locale_panic(msg, __LINE__, higher_caller_file, higher_caller_line); 3408 NOT_REACHED; /* NOTREACHED */ 3409 } 3410 3411 # ifdef USE_LOCALE_NUMERIC 3412 3413 STATIC void 3414 S_new_numeric(pTHX_ const char *newnum, bool force) 3415 { 3416 PERL_ARGS_ASSERT_NEW_NUMERIC; 3417 3418 /* Called after each libc setlocale() or uselocale() call affecting 3419 * LC_NUMERIC, to tell core Perl this and that 'newnum' is the name of the 3420 * new locale, and we are switched into it. It installs this locale as the 3421 * current underlying default, and then switches to the C locale, if 3422 * necessary, so that the code that has traditionally expected the radix 3423 * character to be a dot may continue to do so. 3424 * 3425 * The default locale and the C locale can be toggled between by use of the 3426 * set_numeric_underlying() and set_numeric_standard() functions, which 3427 * should probably not be called directly, but only via macros like 3428 * SET_NUMERIC_STANDARD() in perl.h. 3429 * 3430 * The toggling is necessary mainly so that a non-dot radix decimal point 3431 * character can be input and output, while allowing internal calculations 3432 * to use a dot. 3433 * 3434 * This sets several interpreter-level variables: 3435 * PL_numeric_name The underlying locale's name: a copy of 'newnum' 3436 * PL_numeric_underlying A boolean indicating if the toggled state is 3437 * such that the current locale is the program's 3438 * underlying locale 3439 * PL_numeric_standard An int indicating if the toggled state is such 3440 * that the current locale is the C locale or 3441 * indistinguishable from the C locale. If non-zero, it 3442 * is in C; if > 1, it means it may not be toggled away 3443 * from C. 3444 * PL_numeric_underlying_is_standard A bool kept by this function 3445 * indicating that the underlying locale and the standard 3446 * C locale are indistinguishable for the purposes of 3447 * LC_NUMERIC. This happens when both of the above two 3448 * variables are true at the same time. (Toggling is a 3449 * no-op under these circumstances.) This variable is 3450 * used to avoid having to recalculate. 3451 * PL_numeric_radix_sv Contains the string that code should use for the 3452 * decimal point. It is set to either a dot or the 3453 * program's underlying locale's radix character string, 3454 * depending on the situation. 3455 * PL_underlying_radix_sv Contains the program's underlying locale's 3456 * radix character string. This is copied into 3457 * PL_numeric_radix_sv when the situation warrants. It 3458 * exists to avoid having to recalculate it when toggling. 3459 */ 3460 3461 DEBUG_L( PerlIO_printf(Perl_debug_log, 3462 "Called new_numeric with %s, PL_numeric_name=%s\n", 3463 newnum, PL_numeric_name)); 3464 3465 /* We keep records comparing the characteristics of the LC_NUMERIC catetory 3466 * of the current locale vs the standard C locale. If the new locale that 3467 * has just been changed to is the same as the one our records are for, 3468 * they are still valid, and we don't have to recalculate them. 'force' is 3469 * true if the caller suspects that the records are out-of-date, so do go 3470 * ahead and recalculate them. (This can happen when an external library 3471 * has had control and now perl is reestablishing control; we have to 3472 * assume that that library changed the locale in unknown ways.) 3473 * 3474 * Even if our records are valid, the new locale will likely have been 3475 * switched to before this function gets called, and we must toggle into 3476 * one indistinguishable from the C locale with regards to LC_NUMERIC 3477 * handling, so that all the libc functions that are affected by LC_NUMERIC 3478 * will work as expected. This can be skipped if we already know that the 3479 * locale is indistinguishable from the C locale. */ 3480 if (! force && strEQ(PL_numeric_name, newnum)) { 3481 if (! PL_numeric_underlying_is_standard) { 3482 set_numeric_standard(__FILE__, __LINE__); 3483 } 3484 3485 return; 3486 } 3487 3488 Safefree(PL_numeric_name); 3489 PL_numeric_name = savepv(newnum); 3490 3491 /* Handle the trivial case. Since this is called at process 3492 * initialization, be aware that this bit can't rely on much being 3493 * available. */ 3494 if (isNAME_C_OR_POSIX(PL_numeric_name)) { 3495 PL_numeric_standard = TRUE; 3496 PL_numeric_underlying_is_standard = TRUE; 3497 PL_numeric_underlying = TRUE; 3498 sv_setpv(PL_numeric_radix_sv, C_decimal_point); 3499 SvUTF8_off(PL_numeric_radix_sv); 3500 sv_setpv(PL_underlying_radix_sv, C_decimal_point); 3501 SvUTF8_off(PL_underlying_radix_sv); 3502 return; 3503 } 3504 3505 /* We are in the underlying locale until changed at the end of this 3506 * function */ 3507 PL_numeric_underlying = TRUE; 3508 3509 /* Passing a non-NULL causes the function call just below to 3510 automatically set the UTF-8 flag on PL_underlying_radix_sv */ 3511 utf8ness_t dummy; 3512 3513 /* Find and save this locale's radix character. */ 3514 langinfo_sv_c(RADIXCHAR, LC_NUMERIC, PL_numeric_name, 3515 PL_underlying_radix_sv, &dummy); 3516 DEBUG_L(PerlIO_printf(Perl_debug_log, 3517 "Locale radix is '%s', ?UTF-8=%d\n", 3518 SvPVX(PL_underlying_radix_sv), 3519 cBOOL(SvUTF8(PL_underlying_radix_sv)))); 3520 3521 /* This locale is indistinguishable from C (for numeric purposes) if both 3522 * the radix character and the thousands separator are the same as C's. 3523 * Start with the radix. */ 3524 PL_numeric_underlying_is_standard = strEQ(C_decimal_point, 3525 SvPVX(PL_underlying_radix_sv)); 3526 3527 # ifndef TS_W32_BROKEN_LOCALECONV 3528 3529 /* If the radix isn't the same as C's, we know it is distinguishable from 3530 * C; otherwise check the thousands separator too. Only if both are the 3531 * same as C's is the locale indistinguishable from C. 3532 * 3533 * But on earlier Windows versions, there is a potential race. This code 3534 * knows that localeconv() (elsewhere in this file) will be used to extract 3535 * the needed value, and localeconv() was buggy for quite a while, and that 3536 * code in this file hence uses a workaround. And that workaround may have 3537 * an (unlikely) race. Gathering the radix uses a different workaround on 3538 * Windows that doesn't involve a race. It might be possible to do the 3539 * same for this (patches welcome). 3540 * 3541 * Until then khw doesn't think it's worth even the small risk of a race to 3542 * get this value, which doesn't appear to be used in any of the Microsoft 3543 * library routines anyway. */ 3544 3545 if (PL_numeric_underlying_is_standard) { 3546 PL_numeric_underlying_is_standard = strEQ(C_thousands_sep, 3547 langinfo_c(THOUSEP, 3548 LC_NUMERIC, 3549 PL_numeric_name, 3550 NULL)); 3551 } 3552 3553 # endif 3554 3555 PL_numeric_standard = PL_numeric_underlying_is_standard; 3556 3557 /* Keep LC_NUMERIC so that it has the C locale radix and thousands 3558 * separator. This is for XS modules, so they don't have to worry about 3559 * the radix being a non-dot. (Core operations that need the underlying 3560 * locale change to it temporarily). */ 3561 if (! PL_numeric_standard) { 3562 set_numeric_standard(__FILE__, __LINE__); 3563 } 3564 } 3565 3566 # endif 3567 3568 void 3569 Perl_set_numeric_standard(pTHX_ const char * const file, const line_t line) 3570 { 3571 PERL_ARGS_ASSERT_SET_NUMERIC_STANDARD; 3572 PERL_UNUSED_ARG(file); /* Some Configurations ignore these */ 3573 PERL_UNUSED_ARG(line); 3574 3575 # ifdef USE_LOCALE_NUMERIC 3576 3577 /* Unconditionally toggle the LC_NUMERIC locale to the C locale 3578 * 3579 * Most code should use the macro SET_NUMERIC_STANDARD() in perl.h 3580 * instead of calling this directly. The macro avoids calling this routine 3581 * if toggling isn't necessary according to our records (which could be 3582 * wrong if some XS code has changed the locale behind our back) */ 3583 3584 DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to" 3585 " standard C; called from %s: %" 3586 LINE_Tf "\n", file, line)); 3587 3588 void_setlocale_c_with_caller(LC_NUMERIC, "C", file, line); 3589 PL_numeric_standard = TRUE; 3590 sv_setpv(PL_numeric_radix_sv, C_decimal_point); 3591 SvUTF8_off(PL_numeric_radix_sv); 3592 3593 PL_numeric_underlying = PL_numeric_underlying_is_standard; 3594 3595 # endif /* USE_LOCALE_NUMERIC */ 3596 3597 } 3598 3599 void 3600 Perl_set_numeric_underlying(pTHX_ const char * const file, const line_t line) 3601 { 3602 PERL_ARGS_ASSERT_SET_NUMERIC_UNDERLYING; 3603 PERL_UNUSED_ARG(file); /* Some Configurations ignore these */ 3604 PERL_UNUSED_ARG(line); 3605 3606 # ifdef USE_LOCALE_NUMERIC 3607 3608 /* Unconditionally toggle the LC_NUMERIC locale to the current underlying 3609 * default. 3610 * 3611 * Most code should use the macro SET_NUMERIC_UNDERLYING() in perl.h 3612 * instead of calling this directly. The macro avoids calling this routine 3613 * if toggling isn't necessary according to our records (which could be 3614 * wrong if some XS code has changed the locale behind our back) */ 3615 3616 DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to %s;" 3617 " called from %s: %" LINE_Tf "\n", 3618 PL_numeric_name, file, line)); 3619 /* Maybe not in init? assert(PL_locale_mutex_depth > 0);*/ 3620 3621 void_setlocale_c_with_caller(LC_NUMERIC, PL_numeric_name, file, line); 3622 PL_numeric_underlying = TRUE; 3623 sv_setsv_nomg(PL_numeric_radix_sv, PL_underlying_radix_sv); 3624 3625 PL_numeric_standard = PL_numeric_underlying_is_standard; 3626 3627 # endif /* USE_LOCALE_NUMERIC */ 3628 3629 } 3630 3631 # ifdef USE_LOCALE_CTYPE 3632 3633 STATIC void 3634 S_new_ctype(pTHX_ const char *newctype, bool force) 3635 { 3636 PERL_ARGS_ASSERT_NEW_CTYPE; 3637 PERL_UNUSED_ARG(force); 3638 3639 /* Called after each libc setlocale() call affecting LC_CTYPE, to tell 3640 * core Perl this and that 'newctype' is the name of the new locale. 3641 * 3642 * This function sets up the folding arrays for all 256 bytes, assuming 3643 * that tofold() is tolc() since fold case is not a concept in POSIX, 3644 */ 3645 3646 DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering new_ctype(%s)\n", 3647 newctype)); 3648 3649 /* No change means no-op */ 3650 if (strEQ(PL_ctype_name, newctype)) { 3651 return; 3652 } 3653 3654 /* We will replace any bad locale warning with 3655 * 1) nothing if the new one is ok; or 3656 * 2) a new warning for the bad new locale */ 3657 if (PL_warn_locale) { 3658 SvREFCNT_dec_NN(PL_warn_locale); 3659 PL_warn_locale = NULL; 3660 } 3661 3662 /* Clear cache */ 3663 Safefree(PL_ctype_name); 3664 PL_ctype_name = ""; 3665 3666 PL_in_utf8_turkic_locale = FALSE; 3667 3668 /* For the C locale, just use the standard folds, and we know there are no 3669 * glitches possible, so return early. Since this is called at process 3670 * initialization, be aware that this bit can't rely on much being 3671 * available. */ 3672 if (isNAME_C_OR_POSIX(newctype)) { 3673 Copy(PL_fold, PL_fold_locale, 256, U8); 3674 PL_ctype_name = savepv(newctype); 3675 PL_in_utf8_CTYPE_locale = FALSE; 3676 return; 3677 } 3678 3679 /* The cache being cleared signals the called function to compute a new 3680 * value */ 3681 PL_in_utf8_CTYPE_locale = is_locale_utf8(newctype); 3682 3683 PL_ctype_name = savepv(newctype); 3684 bool maybe_utf8_turkic = FALSE; 3685 3686 /* Don't check for problems if we are suppressing the warnings */ 3687 bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST); 3688 3689 if (PL_in_utf8_CTYPE_locale) { 3690 3691 /* A UTF-8 locale gets standard rules. But note that code still has to 3692 * handle this specially because of the three problematic code points 3693 * */ 3694 Copy(PL_fold_latin1, PL_fold_locale, 256, U8); 3695 3696 /* UTF-8 locales can have special handling for 'I' and 'i' if they are 3697 * Turkic. Make sure these two are the only anomalies. (We don't 3698 * require towupper and towlower because they aren't in C89.) */ 3699 3700 # if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER) 3701 3702 if (towupper('i') == 0x130 && towlower('I') == 0x131) 3703 3704 # else 3705 3706 if (toU8_UPPER_LC('i') == 'i' && toU8_LOWER_LC('I') == 'I') 3707 3708 # endif 3709 3710 { 3711 /* This is how we determine it really is Turkic */ 3712 check_for_problems = TRUE; 3713 maybe_utf8_turkic = TRUE; 3714 } 3715 } 3716 else { /* Not a canned locale we know the values for. Compute them */ 3717 3718 # ifdef DEBUGGING 3719 3720 bool has_non_ascii_fold = FALSE; 3721 bool found_unexpected = FALSE; 3722 3723 /* Under -DLv, see if there are any folds outside the ASCII range. 3724 * This factoid is used below */ 3725 if (DEBUG_Lv_TEST) { 3726 for (unsigned i = 128; i < 256; i++) { 3727 int j = LATIN1_TO_NATIVE(i); 3728 if (toU8_LOWER_LC(j) != j || toU8_UPPER_LC(j) != j) { 3729 has_non_ascii_fold = TRUE; 3730 break; 3731 } 3732 } 3733 } 3734 3735 # endif 3736 3737 for (unsigned i = 0; i < 256; i++) { 3738 if (isU8_UPPER_LC(i)) 3739 PL_fold_locale[i] = (U8) toU8_LOWER_LC(i); 3740 else if (isU8_LOWER_LC(i)) 3741 PL_fold_locale[i] = (U8) toU8_UPPER_LC(i); 3742 else 3743 PL_fold_locale[i] = (U8) i; 3744 3745 # ifdef DEBUGGING 3746 3747 /* Most locales these days are supersets of ASCII. When debugging, 3748 * it is helpful to know what the exceptions to that are in this 3749 * locale */ 3750 if (DEBUG_L_TEST) { 3751 bool unexpected = FALSE; 3752 3753 if (isUPPER_L1(i)) { 3754 if (isUPPER_A(i)) { 3755 if (PL_fold_locale[i] != toLOWER_A(i)) { 3756 unexpected = TRUE; 3757 } 3758 } 3759 else if (has_non_ascii_fold) { 3760 if (PL_fold_locale[i] != toLOWER_L1(i)) { 3761 unexpected = TRUE; 3762 } 3763 } 3764 else if (PL_fold_locale[i] != i) { 3765 unexpected = TRUE; 3766 } 3767 } 3768 else if ( isLOWER_L1(i) 3769 && i != LATIN_SMALL_LETTER_SHARP_S 3770 && i != MICRO_SIGN) 3771 { 3772 if (isLOWER_A(i)) { 3773 if (PL_fold_locale[i] != toUPPER_A(i)) { 3774 unexpected = TRUE; 3775 } 3776 } 3777 else if (has_non_ascii_fold) { 3778 if (PL_fold_locale[i] != toUPPER_LATIN1_MOD(i)) { 3779 unexpected = TRUE; 3780 } 3781 } 3782 else if (PL_fold_locale[i] != i) { 3783 unexpected = TRUE; 3784 } 3785 } 3786 else if (PL_fold_locale[i] != i) { 3787 unexpected = TRUE; 3788 } 3789 3790 if (unexpected) { 3791 found_unexpected = TRUE; 3792 DEBUG_L(PerlIO_printf(Perl_debug_log, 3793 "For %s, fold of %02x is %02x\n", 3794 newctype, i, PL_fold_locale[i])); 3795 } 3796 } 3797 } 3798 3799 if (found_unexpected) { 3800 DEBUG_L(PerlIO_printf(Perl_debug_log, 3801 "All bytes not mentioned above either fold to" 3802 " themselves or are the expected ASCII or" 3803 " Latin1 ones\n")); 3804 } 3805 else { 3806 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 3807 "No nonstandard folds were found\n")); 3808 # endif 3809 3810 } 3811 } 3812 3813 # ifdef MB_CUR_MAX 3814 3815 /* We only handle single-byte locales (outside of UTF-8 ones); so if this 3816 * locale requires more than one byte, there are going to be BIG problems. 3817 * */ 3818 3819 const int mb_cur_max = MB_CUR_MAX; 3820 3821 if (mb_cur_max > 1 && ! PL_in_utf8_CTYPE_locale 3822 3823 /* Some platforms return MB_CUR_MAX > 1 for even the "C" locale. 3824 * Just assume that the implementation for them (plus for POSIX) is 3825 * correct and the > 1 value is spurious. (Since these are 3826 * specially handled to never be considered UTF-8 locales, as long 3827 * as this is the only problem, everything should work fine */ 3828 && ! isNAME_C_OR_POSIX(newctype)) 3829 { 3830 DEBUG_L(PerlIO_printf(Perl_debug_log, 3831 "Unsupported, MB_CUR_MAX=%d\n", mb_cur_max)); 3832 3833 if (! IN_LC(LC_CTYPE) || ckWARN_d(WARN_LOCALE)) { 3834 char * msg = Perl_form(aTHX_ 3835 "Locale '%s' is unsupported, and may hang" 3836 " or crash the interpreter", 3837 newctype); 3838 if (IN_LC(LC_CTYPE)) { 3839 Perl_warner(aTHX_ packWARN(WARN_LOCALE), "%s", msg); 3840 } 3841 else { 3842 PL_warn_locale = newSV(0); 3843 sv_setpvn(PL_warn_locale, msg, strlen(msg)); 3844 } 3845 } 3846 } 3847 3848 # endif 3849 3850 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "check_for_problems=%d\n", 3851 check_for_problems)); 3852 3853 /* We don't populate the other lists if a UTF-8 locale, but do check that 3854 * everything works as expected, unless checking turned off */ 3855 if (check_for_problems) { 3856 /* Assume enough space for every character being bad. 4 spaces each 3857 * for the 94 printable characters that are output like "'x' "; and 5 3858 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating 3859 * NUL */ 3860 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' }; 3861 unsigned int bad_count = 0; /* Count of bad characters */ 3862 3863 for (unsigned i = 0; i < 256; i++) { 3864 3865 /* If checking for locale problems, see if the native ASCII-range 3866 * printables plus \n and \t are in their expected categories in 3867 * the new locale. If not, this could mean big trouble, upending 3868 * Perl's and most programs' assumptions, like having a 3869 * metacharacter with special meaning become a \w. Fortunately, 3870 * it's very rare to find locales that aren't supersets of ASCII 3871 * nowadays. It isn't a problem for most controls to be changed 3872 * into something else; we check only \n and \t, though perhaps \r 3873 * could be an issue as well. */ 3874 if (isGRAPH_A(i) || isBLANK_A(i) || i == '\n') { 3875 bool is_bad = FALSE; 3876 char name[4] = { '\0' }; 3877 3878 /* Convert the name into a string */ 3879 if (isGRAPH_A(i)) { 3880 name[0] = i; 3881 name[1] = '\0'; 3882 } 3883 else if (i == '\n') { 3884 my_strlcpy(name, "\\n", sizeof(name)); 3885 } 3886 else if (i == '\t') { 3887 my_strlcpy(name, "\\t", sizeof(name)); 3888 } 3889 else { 3890 assert(i == ' '); 3891 my_strlcpy(name, "' '", sizeof(name)); 3892 } 3893 3894 /* Check each possibe class */ 3895 if (UNLIKELY(cBOOL(isU8_ALPHANUMERIC_LC(i)) != 3896 cBOOL(isALPHANUMERIC_A(i)))) 3897 { 3898 is_bad = TRUE; 3899 DEBUG_L(PerlIO_printf(Perl_debug_log, 3900 "isalnum('%s') unexpectedly is %x\n", 3901 name, cBOOL(isU8_ALPHANUMERIC_LC(i)))); 3902 } 3903 if (UNLIKELY(cBOOL(isU8_ALPHA_LC(i)) != cBOOL(isALPHA_A(i)))) { 3904 is_bad = TRUE; 3905 DEBUG_L(PerlIO_printf(Perl_debug_log, 3906 "isalpha('%s') unexpectedly is %x\n", 3907 name, cBOOL(isU8_ALPHA_LC(i)))); 3908 } 3909 if (UNLIKELY(cBOOL(isU8_DIGIT_LC(i)) != cBOOL(isDIGIT_A(i)))) { 3910 is_bad = TRUE; 3911 DEBUG_L(PerlIO_printf(Perl_debug_log, 3912 "isdigit('%s') unexpectedly is %x\n", 3913 name, cBOOL(isU8_DIGIT_LC(i)))); 3914 } 3915 if (UNLIKELY(cBOOL(isU8_GRAPH_LC(i)) != cBOOL(isGRAPH_A(i)))) { 3916 is_bad = TRUE; 3917 DEBUG_L(PerlIO_printf(Perl_debug_log, 3918 "isgraph('%s') unexpectedly is %x\n", 3919 name, cBOOL(isU8_GRAPH_LC(i)))); 3920 } 3921 if (UNLIKELY(cBOOL(isU8_LOWER_LC(i)) != cBOOL(isLOWER_A(i)))) { 3922 is_bad = TRUE; 3923 DEBUG_L(PerlIO_printf(Perl_debug_log, 3924 "islower('%s') unexpectedly is %x\n", 3925 name, cBOOL(isU8_LOWER_LC(i)))); 3926 } 3927 if (UNLIKELY(cBOOL(isU8_PRINT_LC(i)) != cBOOL(isPRINT_A(i)))) { 3928 is_bad = TRUE; 3929 DEBUG_L(PerlIO_printf(Perl_debug_log, 3930 "isprint('%s') unexpectedly is %x\n", 3931 name, cBOOL(isU8_PRINT_LC(i)))); 3932 } 3933 if (UNLIKELY(cBOOL(isU8_PUNCT_LC(i)) != cBOOL(isPUNCT_A(i)))) { 3934 is_bad = TRUE; 3935 DEBUG_L(PerlIO_printf(Perl_debug_log, 3936 "ispunct('%s') unexpectedly is %x\n", 3937 name, cBOOL(isU8_PUNCT_LC(i)))); 3938 } 3939 if (UNLIKELY(cBOOL(isU8_SPACE_LC(i)) != cBOOL(isSPACE_A(i)))) { 3940 is_bad = TRUE; 3941 DEBUG_L(PerlIO_printf(Perl_debug_log, 3942 "isspace('%s') unexpectedly is %x\n", 3943 name, cBOOL(isU8_SPACE_LC(i)))); 3944 } 3945 if (UNLIKELY(cBOOL(isU8_UPPER_LC(i)) != cBOOL(isUPPER_A(i)))) { 3946 is_bad = TRUE; 3947 DEBUG_L(PerlIO_printf(Perl_debug_log, 3948 "isupper('%s') unexpectedly is %x\n", 3949 name, cBOOL(isU8_UPPER_LC(i)))); 3950 } 3951 if (UNLIKELY(cBOOL(isU8_XDIGIT_LC(i))!= cBOOL(isXDIGIT_A(i)))) { 3952 is_bad = TRUE; 3953 DEBUG_L(PerlIO_printf(Perl_debug_log, 3954 "isxdigit('%s') unexpectedly is %x\n", 3955 name, cBOOL(isU8_XDIGIT_LC(i)))); 3956 } 3957 if (UNLIKELY(toU8_LOWER_LC(i) != (int) toLOWER_A(i))) { 3958 is_bad = TRUE; 3959 DEBUG_L(PerlIO_printf(Perl_debug_log, 3960 "tolower('%s')=0x%x instead of the expected 0x%x\n", 3961 name, toU8_LOWER_LC(i), (int) toLOWER_A(i))); 3962 } 3963 if (UNLIKELY(toU8_UPPER_LC(i) != (int) toUPPER_A(i))) { 3964 is_bad = TRUE; 3965 DEBUG_L(PerlIO_printf(Perl_debug_log, 3966 "toupper('%s')=0x%x instead of the expected 0x%x\n", 3967 name, toU8_UPPER_LC(i), (int) toUPPER_A(i))); 3968 } 3969 if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i)))) { 3970 is_bad = TRUE; 3971 DEBUG_L(PerlIO_printf(Perl_debug_log, 3972 "'\\n' (=%02X) is not a control\n", (int) i)); 3973 } 3974 3975 /* Add to the list; Separate multiple entries with a blank */ 3976 if (is_bad) { 3977 if (bad_count) { 3978 my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list)); 3979 } 3980 my_strlcat(bad_chars_list, name, sizeof(bad_chars_list)); 3981 bad_count++; 3982 } 3983 } 3984 } 3985 3986 if (bad_count == 2 && maybe_utf8_turkic) { 3987 bad_count = 0; 3988 *bad_chars_list = '\0'; 3989 3990 /* The casts are because otherwise some compilers warn: 3991 gcc.gnu.org/bugzilla/show_bug.cgi?id=99950 3992 gcc.gnu.org/bugzilla/show_bug.cgi?id=94182 3993 */ 3994 PL_fold_locale[ (U8) 'I' ] = 'I'; 3995 PL_fold_locale[ (U8) 'i' ] = 'i'; 3996 PL_in_utf8_turkic_locale = TRUE; 3997 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s is turkic\n", newctype)); 3998 } 3999 4000 /* If we found problems and we want them output, do so */ 4001 if ( (UNLIKELY(bad_count)) 4002 && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST))) 4003 { 4004 /* WARNING. If you change the wording of these; be sure to update 4005 * t/loc_tools.pl correspondingly */ 4006 4007 if (PL_warn_locale) { 4008 sv_catpvs(PL_warn_locale, "\n"); 4009 } 4010 else { 4011 PL_warn_locale = newSVpvs(""); 4012 } 4013 4014 if (PL_in_utf8_CTYPE_locale) { 4015 Perl_sv_catpvf(aTHX_ PL_warn_locale, 4016 "Locale '%s' contains (at least) the following characters" 4017 " which have\nunexpected meanings: %s\nThe Perl program" 4018 " will use the expected meanings", 4019 newctype, bad_chars_list); 4020 } 4021 else { 4022 Perl_sv_catpvf(aTHX_ PL_warn_locale, 4023 "\nThe following characters (and maybe" 4024 " others) may not have the same meaning as" 4025 " the Perl program expects: %s\n", 4026 bad_chars_list 4027 ); 4028 } 4029 4030 # if defined(HAS_SOME_LANGINFO) || defined(WIN32) 4031 4032 Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s", 4033 langinfo_c(CODESET, LC_CTYPE, newctype, NULL)); 4034 4035 # endif 4036 4037 Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n"); 4038 4039 /* If we are actually in the scope of the locale or are debugging, 4040 * output the message now. If not in that scope, we save the 4041 * message to be output at the first operation using this locale, 4042 * if that actually happens. Most programs don't use locales, so 4043 * they are immune to bad ones. */ 4044 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) { 4045 4046 /* The '0' below suppresses a bogus gcc compiler warning */ 4047 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 4048 0); 4049 if (IN_LC(LC_CTYPE)) { 4050 SvREFCNT_dec_NN(PL_warn_locale); 4051 PL_warn_locale = NULL; 4052 } 4053 } 4054 } 4055 } 4056 } 4057 4058 void 4059 Perl_warn_problematic_locale() 4060 { 4061 dTHX; 4062 4063 /* Core-only function that outputs the message in PL_warn_locale, 4064 * and then NULLS it. Should be called only through the macro 4065 * CHECK_AND_WARN_PROBLEMATIC_LOCALE_ */ 4066 4067 if (PL_warn_locale) { 4068 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), 4069 SvPVX(PL_warn_locale), 4070 0 /* dummy to avoid compiler warning */ ); 4071 SvREFCNT_dec_NN(PL_warn_locale); 4072 PL_warn_locale = NULL; 4073 } 4074 } 4075 4076 # endif /* USE_LOCALE_CTYPE */ 4077 4078 STATIC void 4079 S_new_LC_ALL(pTHX_ const char *lc_all, bool force) 4080 { 4081 PERL_ARGS_ASSERT_NEW_LC_ALL; 4082 4083 /* new_LC_ALL() updates all the things we care about. Note that this is 4084 * called just after a change, so uses the actual underlying locale just 4085 * set, and not the nominal one (should they differ, as they may in 4086 * LC_NUMERIC). */ 4087 4088 const char * individ_locales[LC_ALL_INDEX_] = { NULL }; 4089 4090 switch (parse_LC_ALL_string(lc_all, 4091 individ_locales, 4092 override_if_ignored, /* Override any ignored 4093 categories */ 4094 true, /* Always fill array */ 4095 true, /* Panic if fails, as to get here it 4096 earlier had to have succeeded */ 4097 __LINE__)) 4098 { 4099 case invalid: 4100 case no_array: 4101 case only_element_0: 4102 locale_panic_("Unexpected return from parse_LC_ALL_string"); 4103 4104 case full_array: 4105 break; 4106 } 4107 4108 for_all_individual_category_indexes(i) { 4109 if (update_functions[i]) { 4110 const char * this_locale = individ_locales[i]; 4111 update_functions[i](aTHX_ this_locale, force); 4112 } 4113 4114 Safefree(individ_locales[i]); 4115 } 4116 } 4117 4118 # ifdef USE_LOCALE_COLLATE 4119 4120 STATIC void 4121 S_new_collate(pTHX_ const char *newcoll, bool force) 4122 { 4123 PERL_ARGS_ASSERT_NEW_COLLATE; 4124 PERL_UNUSED_ARG(force); 4125 4126 /* Called after each libc setlocale() call affecting LC_COLLATE, to tell 4127 * core Perl this and that 'newcoll' is the name of the new locale. 4128 * 4129 * The design of locale collation is that every locale change is given an 4130 * index 'PL_collation_ix'. The first time a string participates in an 4131 * operation that requires collation while locale collation is active, it 4132 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That 4133 * magic includes the collation index, and the transformation of the string 4134 * by strxfrm(), q.v. That transformation is used when doing comparisons, 4135 * instead of the string itself. If a string changes, the magic is 4136 * cleared. The next time the locale changes, the index is incremented, 4137 * and so we know during a comparison that the transformation is not 4138 * necessarily still valid, and so is recomputed. Note that if the locale 4139 * changes enough times, the index could wrap, and it is possible that a 4140 * transformation would improperly be considered valid, leading to an 4141 * unlikely bug. The value is declared to the widest possible type on this 4142 * platform. */ 4143 4144 /* Return if the locale isn't changing */ 4145 if (strEQ(PL_collation_name, newcoll)) { 4146 return; 4147 } 4148 4149 Safefree(PL_collation_name); 4150 PL_collation_name = savepv(newcoll); 4151 ++PL_collation_ix; 4152 4153 /* Set the new one up if trivial. Since this is called at process 4154 * initialization, be aware that this bit can't rely on much being 4155 * available. */ 4156 PL_collation_standard = isNAME_C_OR_POSIX(newcoll); 4157 if (PL_collation_standard) { 4158 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 4159 "Setting PL_collation name='%s'\n", 4160 PL_collation_name)); 4161 PL_collxfrm_base = 0; 4162 PL_collxfrm_mult = 2; 4163 PL_in_utf8_COLLATE_locale = FALSE; 4164 PL_strxfrm_NUL_replacement = '\0'; 4165 PL_strxfrm_max_cp = 0; 4166 return; 4167 } 4168 4169 /* Flag that the remainder of the set up is being deferred until first 4170 * need. */ 4171 PL_collxfrm_mult = 0; 4172 PL_collxfrm_base = 0; 4173 4174 } 4175 4176 # endif /* USE_LOCALE_COLLATE */ 4177 4178 # ifdef WIN32 4179 4180 STATIC wchar_t * 4181 S_Win_byte_string_to_wstring(const UINT code_page, const char * byte_string) 4182 { 4183 /* Caller must arrange to free the returned string */ 4184 4185 int req_size = MultiByteToWideChar(code_page, 0, byte_string, -1, NULL, 0); 4186 if (! req_size) { 4187 SET_EINVAL; 4188 return NULL; 4189 } 4190 4191 wchar_t *wstring; 4192 Newx(wstring, req_size, wchar_t); 4193 4194 if (! MultiByteToWideChar(code_page, 0, byte_string, -1, wstring, req_size)) 4195 { 4196 Safefree(wstring); 4197 SET_EINVAL; 4198 return NULL; 4199 } 4200 4201 return wstring; 4202 } 4203 4204 # define Win_utf8_string_to_wstring(s) \ 4205 Win_byte_string_to_wstring(CP_UTF8, (s)) 4206 4207 STATIC char * 4208 S_Win_wstring_to_byte_string(const UINT code_page, const wchar_t * wstring) 4209 { 4210 /* Caller must arrange to free the returned string */ 4211 4212 int req_size = 4213 WideCharToMultiByte(code_page, 0, wstring, -1, NULL, 0, NULL, NULL); 4214 4215 char *byte_string; 4216 Newx(byte_string, req_size, char); 4217 4218 if (! WideCharToMultiByte(code_page, 0, wstring, -1, byte_string, 4219 req_size, NULL, NULL)) 4220 { 4221 Safefree(byte_string); 4222 SET_EINVAL; 4223 return NULL; 4224 } 4225 4226 return byte_string; 4227 } 4228 4229 # define Win_wstring_to_utf8_string(ws) \ 4230 Win_wstring_to_byte_string(CP_UTF8, (ws)) 4231 4232 STATIC const char * 4233 S_wrap_wsetlocale(pTHX_ const int category, const char *locale) 4234 { 4235 PERL_ARGS_ASSERT_WRAP_WSETLOCALE; 4236 4237 /* Calls _wsetlocale(), converting the parameters/return to/from 4238 * Perl-expected forms as if plain setlocale() were being called instead. 4239 * 4240 * Caller must arrange for the returned PV to be freed. 4241 */ 4242 4243 const wchar_t * wlocale = NULL; 4244 4245 if (locale) { 4246 wlocale = Win_utf8_string_to_wstring(locale); 4247 if (! wlocale) { 4248 return NULL; 4249 } 4250 } 4251 4252 WSETLOCALE_LOCK; 4253 const wchar_t * wresult = _wsetlocale(category, wlocale); 4254 4255 if (! wresult) { 4256 WSETLOCALE_UNLOCK; 4257 Safefree(wlocale); 4258 return NULL; 4259 } 4260 4261 const char * result = Win_wstring_to_utf8_string(wresult); 4262 WSETLOCALE_UNLOCK; 4263 4264 Safefree(wlocale); 4265 return result; 4266 } 4267 4268 STATIC const char * 4269 S_win32_setlocale(pTHX_ int category, const char* locale) 4270 { 4271 /* This, for Windows, emulates POSIX setlocale() behavior. There is no 4272 * difference between the two unless the input locale is "", which normally 4273 * means on Windows to get the machine default, which is set via the 4274 * computer's "Regional and Language Options" (or its current equivalent). 4275 * In POSIX, it instead means to find the locale from the user's 4276 * environment. This routine changes the Windows behavior to try the POSIX 4277 * behavior first. Further details are in the called function 4278 * find_locale_from_environment(). 4279 */ 4280 4281 if (locale != NULL && strEQ(locale, "")) { 4282 /* Note this function may change the locale, but that's ok because we 4283 * are about to change it anyway */ 4284 locale = find_locale_from_environment(get_category_index(category)); 4285 if (locale == NULL) { 4286 SET_EINVAL; 4287 return NULL; 4288 } 4289 } 4290 4291 const char * result = wrap_wsetlocale(category, locale); 4292 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n", 4293 setlocale_debug_string_r(category, locale, result))); 4294 4295 if (! result) { 4296 SET_EINVAL; 4297 return NULL; 4298 } 4299 4300 save_to_buffer(result, &PL_setlocale_buf, &PL_setlocale_bufsize); 4301 4302 # ifndef USE_PL_CUR_LC_ALL 4303 4304 Safefree(result); 4305 4306 # else 4307 4308 /* Here, we need to keep track of LC_ALL, so store the new value. but if 4309 * the input locale is NULL, we were just querying, so the original value 4310 * hasn't changed */ 4311 if (locale == NULL) { 4312 Safefree(result); 4313 } 4314 else { 4315 4316 /* If we set LC_ALL directly above, we already know its new value; but 4317 * if we changed just an individual category, find the new LC_ALL */ 4318 if (category != LC_ALL) { 4319 Safefree(result); 4320 result = wrap_wsetlocale(LC_ALL, NULL); 4321 } 4322 4323 Safefree(PL_cur_LC_ALL); 4324 PL_cur_LC_ALL = result; 4325 } 4326 4327 DEBUG_L(PerlIO_printf(Perl_debug_log, "new PL_cur_LC_ALL=%s\n", 4328 PL_cur_LC_ALL)); 4329 # endif 4330 4331 return PL_setlocale_buf; 4332 } 4333 4334 # endif 4335 4336 STATIC const char * 4337 S_native_querylocale_i(pTHX_ const locale_category_index cat_index) 4338 { 4339 /* Determine the current locale and return it in the form the platform's 4340 * native locale handling understands. This is different only from our 4341 * internal form for the LC_ALL category, as platforms differ in how they 4342 * represent that. 4343 * 4344 * This is only called from Perl_setlocale(). As such it returns in 4345 * PL_setlocale_buf */ 4346 4347 # ifdef USE_LOCALE_NUMERIC 4348 4349 /* We have the LC_NUMERIC name saved, because we are normally switched into 4350 * the C locale (or equivalent) for it. */ 4351 if (cat_index == LC_NUMERIC_INDEX_) { 4352 4353 /* We don't have to copy this return value, as it is a per-thread 4354 * variable, and won't change until a future setlocale */ 4355 return PL_numeric_name; 4356 } 4357 4358 # endif 4359 # ifdef LC_ALL 4360 4361 if (cat_index != LC_ALL_INDEX_) 4362 4363 # endif 4364 4365 { 4366 /* Here, not LC_ALL, and not LC_NUMERIC: the actual and native values 4367 * match */ 4368 4369 # ifdef setlocale_i /* Can shortcut if this is defined */ 4370 4371 return setlocale_i(cat_index, NULL); 4372 4373 # else 4374 4375 return save_to_buffer(querylocale_i(cat_index), 4376 &PL_setlocale_buf, &PL_setlocale_bufsize); 4377 # endif 4378 4379 } 4380 4381 /* Below, querying LC_ALL */ 4382 4383 # ifdef LC_ALL 4384 # ifdef USE_PL_CURLOCALES 4385 # define LC_ALL_ARG PL_curlocales 4386 # else 4387 # define LC_ALL_ARG NULL /* Causes calculate_LC_ALL_string() to find the 4388 locale using a querylocale function */ 4389 # endif 4390 4391 return calculate_LC_ALL_string(LC_ALL_ARG, EXTERNAL_FORMAT_FOR_QUERY, 4392 WANT_PL_setlocale_buf, 4393 __LINE__); 4394 # undef LC_ALL_ARG 4395 # endif /* has LC_ALL */ 4396 4397 } 4398 4399 #endif /* USE_LOCALE */ 4400 4401 /* 4402 =for apidoc Perl_setlocale 4403 4404 This is an (almost) drop-in replacement for the system L<C<setlocale(3)>>, 4405 taking the same parameters, and returning the same information, except that it 4406 returns the correct underlying C<LC_NUMERIC> locale. Regular C<setlocale> will 4407 instead return C<C> if the underlying locale has a non-dot decimal point 4408 character, or a non-empty thousands separator for displaying floating point 4409 numbers. This is because perl keeps that locale category such that it has a 4410 dot and empty separator, changing the locale briefly during the operations 4411 where the underlying one is required. C<Perl_setlocale> knows about this, and 4412 compensates; regular C<setlocale> doesn't. 4413 4414 Another reason it isn't completely a drop-in replacement is that it is 4415 declared to return S<C<const char *>>, whereas the system setlocale omits the 4416 C<const> (presumably because its API was specified long ago, and can't be 4417 updated; it is illegal to change the information C<setlocale> returns; doing 4418 so leads to segfaults.) 4419 4420 Finally, C<Perl_setlocale> works under all circumstances, whereas plain 4421 C<setlocale> can be completely ineffective on some platforms under some 4422 configurations. 4423 4424 Changing the locale is not a good idea when more than one thread is running, 4425 except on systems where the predefined variable C<${^SAFE_LOCALES}> is 4426 non-zero. This is because on such systems the locale is global to the whole 4427 process and not local to just the thread calling the function. So changing it 4428 in one thread instantaneously changes it in all. On some such systems, the 4429 system C<setlocale()> is ineffective, returning the wrong information, and 4430 failing to actually change the locale. z/OS refuses to try to change the 4431 locale once a second thread is created. C<Perl_setlocale>, should give you 4432 accurate results of what actually happened on these problematic platforms, 4433 returning NULL if the system forbade the locale change. 4434 4435 The return points to a per-thread static buffer, which is overwritten the next 4436 time C<Perl_setlocale> is called from the same thread. 4437 4438 =cut 4439 4440 */ 4441 4442 const char * 4443 Perl_setlocale(const int category, const char * locale) 4444 { 4445 /* This wraps POSIX::setlocale() */ 4446 4447 #ifndef USE_LOCALE 4448 4449 PERL_UNUSED_ARG(category); 4450 PERL_UNUSED_ARG(locale); 4451 4452 return "C"; 4453 4454 #else 4455 4456 dTHX; 4457 4458 DEBUG_L(PerlIO_printf(Perl_debug_log, 4459 "Entering Perl_setlocale(%d, \"%s\")\n", 4460 category, locale)); 4461 4462 bool valid_category; 4463 locale_category_index cat_index = get_category_index_helper(category, 4464 &valid_category, 4465 __LINE__); 4466 if (! valid_category) { 4467 if (ckWARN(WARN_LOCALE)) { 4468 const char * conditional_warn_text; 4469 if (locale == NULL) { 4470 conditional_warn_text = ""; 4471 locale = ""; 4472 } 4473 else { 4474 conditional_warn_text = "; can't set it to "; 4475 } 4476 4477 /* diag_listed_as: Unknown locale category %d; can't set it to %s */ 4478 Perl_warner(aTHX_ 4479 packWARN(WARN_LOCALE), 4480 "Unknown locale category %d%s%s", 4481 category, conditional_warn_text, locale); 4482 } 4483 4484 SET_EINVAL; 4485 return NULL; 4486 } 4487 4488 # ifdef setlocale_i 4489 4490 /* setlocale_i() gets defined only on Configurations that use setlocale() 4491 * in a simple manner that adequately handles all cases. If this category 4492 * doesn't have any perl complications, just do that. */ 4493 if (! update_functions[cat_index]) { 4494 return setlocale_i(cat_index, locale); 4495 } 4496 4497 # endif 4498 4499 /* Get current locale */ 4500 const char * current_locale = native_querylocale_i(cat_index); 4501 4502 /* A NULL locale means only query what the current one is. */ 4503 if (locale == NULL) { 4504 return current_locale; 4505 } 4506 4507 if (strEQ(current_locale, locale)) { 4508 DEBUG_L(PerlIO_printf(Perl_debug_log, 4509 "Already in requested locale: no action taken\n")); 4510 return current_locale; 4511 } 4512 4513 /* Here, an actual change is being requested. Do it */ 4514 if (! bool_setlocale_i(cat_index, locale)) { 4515 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n", 4516 setlocale_debug_string_i(cat_index, locale, "NULL"))); 4517 return NULL; 4518 } 4519 4520 /* At this point, the locale has been changed based on the requested value, 4521 * and the querylocale_i() will return the actual new value that the system 4522 * has for the category. That may not be the same as the input, as libc 4523 * may have returned a synonymous locale name instead of the input one; or, 4524 * if there are locale categories that we are compiled to ignore, any 4525 * attempt to change them away from "C" is overruled */ 4526 current_locale = querylocale_i(cat_index); 4527 4528 /* But certain categories need further work. For example we may need to 4529 * calculate new folding or collation rules. And for LC_NUMERIC, we have 4530 * to switch into a locale that has a dot radix. */ 4531 if (update_functions[cat_index]) { 4532 update_functions[cat_index](aTHX_ current_locale, 4533 /* No need to force recalculation, as 4534 * aren't coming from a situation 4535 * where Perl hasn't been controlling 4536 * the locale, so has accurate 4537 * records. */ 4538 false); 4539 } 4540 4541 /* Make sure the result is in a stable buffer for the caller's use, and is 4542 * in the expected format */ 4543 current_locale = native_querylocale_i(cat_index); 4544 4545 DEBUG_L(PerlIO_printf(Perl_debug_log, "returning '%s'\n", current_locale)); 4546 4547 return current_locale; 4548 4549 #endif 4550 4551 } 4552 4553 #ifdef USE_LOCALE 4554 # ifdef DEBUGGING 4555 4556 STATIC char * 4557 S_my_setlocale_debug_string_i(pTHX_ 4558 const locale_category_index cat_index, 4559 const char* locale, /* Optional locale name */ 4560 4561 /* return value from setlocale() when attempting 4562 * to set 'category' to 'locale' */ 4563 const char* retval, 4564 4565 const line_t line) 4566 { 4567 /* Returns a pointer to a NUL-terminated string in static storage with 4568 * added text about the info passed in. This is not thread safe and will 4569 * be overwritten by the next call, so this should be used just to 4570 * formulate a string to immediately print or savepv() on. */ 4571 4572 const char * locale_quote; 4573 const char * retval_quote; 4574 4575 if (locale == NULL) { 4576 locale_quote = ""; 4577 locale = "NULL"; 4578 } 4579 else { 4580 locale_quote = "\""; 4581 } 4582 4583 if (retval == NULL) { 4584 retval_quote = ""; 4585 retval = "NULL"; 4586 } 4587 else { 4588 retval_quote = "\""; 4589 } 4590 4591 # ifdef MULTIPLICITY 4592 # define THREAD_FORMAT "%p:" 4593 # define THREAD_ARGUMENT aTHX_ 4594 # else 4595 # define THREAD_FORMAT 4596 # define THREAD_ARGUMENT 4597 # endif 4598 4599 return Perl_form(aTHX_ 4600 "%s:%" LINE_Tf ": " THREAD_FORMAT 4601 " setlocale(%s[%d], %s%s%s) returned %s%s%s\n", 4602 4603 __FILE__, line, THREAD_ARGUMENT 4604 category_names[cat_index], categories[cat_index], 4605 locale_quote, locale, locale_quote, 4606 retval_quote, retval, retval_quote); 4607 } 4608 4609 # endif 4610 4611 /* If this implementation hasn't defined these macros, they aren't needed */ 4612 # ifndef TOGGLE_LOCK 4613 # define TOGGLE_LOCK(i) 4614 # define TOGGLE_UNLOCK(i) 4615 # endif 4616 4617 STATIC const char * 4618 S_toggle_locale_i(pTHX_ const locale_category_index cat_index, 4619 const char * new_locale, 4620 const line_t caller_line) 4621 { 4622 PERL_ARGS_ASSERT_TOGGLE_LOCALE_I; 4623 4624 /* Changes the locale for the category specified by 'index' to 'new_locale, 4625 * if they aren't already the same. EVERY CALL to this function MUST HAVE 4626 * a corresponding call to restore_toggled_locale_i() 4627 * 4628 * Returns a copy of the name of the original locale for 'cat_index' 4629 * so can be switched back to with the companion function 4630 * restore_toggled_locale_i(), (NULL if no restoral is necessary.) */ 4631 4632 /* Find the original locale of the category we may need to change, so that 4633 * it can be restored to later */ 4634 const char * locale_to_restore_to = querylocale_i(cat_index); 4635 4636 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 4637 "Entering toggle_locale_i: index=%d(%s)," \ 4638 " wanted=%s, actual=%s; called from %" LINE_Tf \ 4639 "\n", cat_index, category_names[cat_index], 4640 new_locale, locale_to_restore_to ? locale_to_restore_to : "(null)", 4641 caller_line)); 4642 4643 if (! locale_to_restore_to) { 4644 locale_panic_via_(Perl_form(aTHX_ 4645 "Could not find current %s locale", 4646 category_names[cat_index]), 4647 __FILE__, caller_line); 4648 } 4649 4650 /* Begin a critical section on platforms that need it. We do this even if 4651 * we don't have to change here, so as to prevent other instances from 4652 * changing the locale out from under us. */ 4653 TOGGLE_LOCK(cat_index); 4654 4655 /* If the locales are the same, there's nothing to do */ 4656 if (strEQ(locale_to_restore_to, new_locale)) { 4657 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n", 4658 category_names[cat_index], 4659 new_locale)); 4660 return NULL; 4661 } 4662 4663 /* Finally, change the locale to the new one */ 4664 void_setlocale_i_with_caller(cat_index, new_locale, __FILE__, caller_line); 4665 4666 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 4667 "%s locale switched to %s\n", 4668 category_names[cat_index], new_locale)); 4669 4670 return locale_to_restore_to; 4671 4672 # ifndef DEBUGGING 4673 PERL_UNUSED_ARG(caller_line); 4674 # endif 4675 4676 } 4677 4678 STATIC void 4679 S_restore_toggled_locale_i(pTHX_ const locale_category_index cat_index, 4680 const char * restore_locale, 4681 const line_t caller_line) 4682 { 4683 /* Restores the locale for LC_category corresponding to cat_index to 4684 * 'restore_locale' (which is a copy that will be freed by this function), 4685 * or do nothing if the latter parameter is NULL */ 4686 4687 PERL_ARGS_ASSERT_RESTORE_TOGGLED_LOCALE_I; 4688 4689 if (restore_locale == NULL) { 4690 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 4691 "restore_toggled_locale_i: No need to" \ 4692 " restore %s; called from %" LINE_Tf "\n", \ 4693 category_names[cat_index], caller_line)); 4694 TOGGLE_UNLOCK(cat_index); 4695 return; 4696 } 4697 4698 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 4699 "restore_toggled_locale_i: restoring locale for" \ 4700 " %s to %s; called from %" LINE_Tf "\n", \ 4701 category_names[cat_index], restore_locale, 4702 caller_line)); 4703 4704 void_setlocale_i_with_caller(cat_index, restore_locale, 4705 __FILE__, caller_line); 4706 TOGGLE_UNLOCK(cat_index); 4707 4708 # ifndef DEBUGGING 4709 PERL_UNUSED_ARG(caller_line); 4710 # endif 4711 4712 } 4713 4714 #endif 4715 #if defined(USE_LOCALE) || defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV) 4716 4717 STATIC utf8ness_t 4718 S_get_locale_string_utf8ness_i(pTHX_ const char * string, 4719 const locale_utf8ness_t known_utf8, 4720 const char * locale, 4721 const locale_category_index cat_index) 4722 { 4723 PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I; 4724 4725 # ifndef USE_LOCALE 4726 4727 return UTF8NESS_NO; 4728 PERL_UNUSED_ARG(string); 4729 PERL_UNUSED_ARG(known_utf8); 4730 PERL_UNUSED_ARG(locale); 4731 PERL_UNUSED_ARG(cat_index); 4732 4733 # else 4734 4735 /* Return to indicate if 'string' in the locale given by the input 4736 * arguments should be considered UTF-8 or not. 4737 * 4738 * If the input 'locale' is not NULL, use that for the locale; otherwise 4739 * use the current locale for the category specified by 'cat_index'. 4740 */ 4741 4742 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 4743 "Entering get_locale_string_utf8ness_i; locale=%s," 4744 " index=%u(%s), string=%s, known_utf8=%d\n", 4745 locale, cat_index, category_names[cat_index], 4746 ((string) 4747 ? _byte_dump_string((U8 *) string, 4748 strlen(string), 4749 0) 4750 : "nil"), 4751 known_utf8)); 4752 if (string == NULL) { 4753 return UTF8NESS_IMMATERIAL; 4754 } 4755 4756 if (IN_BYTES) { /* respect 'use bytes' */ 4757 return UTF8NESS_NO; 4758 } 4759 4760 Size_t len = strlen(string); 4761 4762 /* UTF8ness is immaterial if the representation doesn't vary */ 4763 const U8 * first_variant = NULL; 4764 if (is_utf8_invariant_string_loc((U8 *) string, len, &first_variant)) { 4765 return UTF8NESS_IMMATERIAL; 4766 } 4767 4768 /* Can't be UTF-8 if invalid */ 4769 if (! is_strict_utf8_string((U8 *) first_variant, 4770 len - ((char *) first_variant - string))) 4771 { 4772 return UTF8NESS_NO; 4773 } 4774 4775 /* Here and below, we know the string is legal UTF-8, containing at least 4776 * one character requiring a sequence of two or more bytes. It is quite 4777 * likely to be UTF-8. But it pays to be paranoid and do further checking. 4778 * 4779 * If we already know the UTF-8ness of the locale, then we immediately know 4780 * what the string is */ 4781 if (UNLIKELY(known_utf8 != LOCALE_UTF8NESS_UNKNOWN)) { 4782 return (known_utf8 == LOCALE_IS_UTF8) ? UTF8NESS_YES : UTF8NESS_NO; 4783 } 4784 4785 if (locale == NULL) { 4786 locale = querylocale_i(cat_index); 4787 } 4788 4789 /* If the locale is UTF-8, the string is UTF-8; otherwise it was 4790 * coincidental that the string is legal UTF-8 4791 * 4792 * However, if the perl is compiled to not pay attention to the category 4793 * being passed in, you might think that that locale is essentially always 4794 * the C locale, so it would make sense to say it isn't UTF-8. But to get 4795 * here, the string has to contain characters unknown in the C locale. And 4796 * in fact, Windows boxes are compiled without LC_MESSAGES, as their 4797 * message catalog isn't really a part of the locale system. But those 4798 * messages really could be UTF-8, and given that the odds are rather small 4799 * of something not being UTF-8 but being syntactically valid UTF-8, khw 4800 * has decided to call such strings as UTF-8. */ 4801 return (is_locale_utf8(locale)) ? UTF8NESS_YES : UTF8NESS_NO; 4802 4803 # endif 4804 4805 } 4806 4807 STATIC bool 4808 S_is_locale_utf8(pTHX_ const char * locale) 4809 { 4810 PERL_ARGS_ASSERT_IS_LOCALE_UTF8; 4811 4812 /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise. */ 4813 4814 # if ! defined(USE_LOCALE) \ 4815 || ! defined(USE_LOCALE_CTYPE) \ 4816 || defined(EBCDIC) /* There aren't any real UTF-8 locales at this time */ 4817 4818 PERL_UNUSED_ARG(locale); 4819 4820 return FALSE; 4821 4822 /* Definitively, can't be UTF-8 */ 4823 # define HAS_DEFINITIVE_UTF8NESS_DETERMINATION 4824 # else 4825 4826 /* If the input happens to be the same locale as we are currently setup 4827 * for, the answer has already been cached. */ 4828 if (strEQ(locale, PL_ctype_name)) { 4829 return PL_in_utf8_CTYPE_locale; 4830 } 4831 4832 if (isNAME_C_OR_POSIX(locale)) { 4833 return false; 4834 } 4835 4836 # if ! defined(HAS_SOME_LANGINFO) && ! defined(WIN32) 4837 4838 /* On non-Windows without nl_langinfo(), we have to do some digging to get 4839 * the answer. First, toggle to the desired locale so can query its state 4840 * */ 4841 const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale); 4842 4843 # define TEARDOWN_FOR_IS_LOCALE_UTF8 \ 4844 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale) 4845 4846 # ifdef MB_CUR_MAX 4847 4848 /* If there are fewer bytes available in this locale than are required 4849 * to represent the largest legal UTF-8 code point, this isn't a UTF-8 4850 * locale. */ 4851 const int mb_cur_max = MB_CUR_MAX; 4852 if (mb_cur_max < (int) UNISKIP(PERL_UNICODE_MAX)) { 4853 TEARDOWN_FOR_IS_LOCALE_UTF8; 4854 return false; 4855 } 4856 4857 # endif 4858 # if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC) 4859 4860 /* With these functions, we can definitively determine a locale's 4861 * UTF-8ness */ 4862 # define HAS_DEFINITIVE_UTF8NESS_DETERMINATION 4863 4864 /* If libc mbtowc() evaluates the bytes that form the REPLACEMENT CHARACTER 4865 * as that Unicode code point, this has to be a UTF-8 locale; otherwise it 4866 * can't be */ 4867 wchar_t wc = 0; 4868 (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0);/* Reset shift state */ 4869 int mbtowc_ret = Perl_mbtowc_(aTHX_ &wc, 4870 STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8)); 4871 TEARDOWN_FOR_IS_LOCALE_UTF8; 4872 return ( mbtowc_ret == STRLENs(REPLACEMENT_CHARACTER_UTF8) 4873 && wc == UNICODE_REPLACEMENT); 4874 4875 # else 4876 4877 /* If the above two C99 functions aren't working, you could try some 4878 * different methods. It seems likely that the obvious choices, 4879 * wctomb() and wcrtomb(), wouldn't be working either. But you could 4880 * choose one of the dozen-ish Unicode titlecase triples and verify 4881 * that towupper/towlower work as expected. 4882 * 4883 * But, our emulation of nl_langinfo() works quite well, so avoid the 4884 * extra code until forced to by some weird non-conforming platform. */ 4885 # define USE_LANGINFO_FOR_UTF8NESS 4886 # undef HAS_DEFINITIVE_UTF8NESS_DETERMINATION 4887 # endif 4888 # else 4889 4890 /* On Windows or on platforms with nl_langinfo(), there is a direct way to 4891 * get the locale's codeset, which will be some form of 'UTF-8' for a 4892 * UTF-8 locale. langinfo_c() handles this, and we will call that 4893 * below */ 4894 # define HAS_DEFINITIVE_UTF8NESS_DETERMINATION 4895 # define USE_LANGINFO_FOR_UTF8NESS 4896 # define TEARDOWN_FOR_IS_LOCALE_UTF8 4897 # endif /* USE_LANGINFO_FOR_UTF8NESS */ 4898 4899 /* If the above compiled into code, it found the locale's UTF-8ness, 4900 * nothing more to do; if it didn't get compiled, 4901 * USE_LANGINFO_FOR_UTF8NESS is defined. There are two possible reasons: 4902 * 1) it is the preferred method because it knows directly for sure 4903 * what the codeset is because the platform has libc functions that 4904 * return this; or 4905 * 2) the functions the above code section would compile to use don't 4906 * exist or are unreliable on this platform; we are less sure of the 4907 * langinfo_c() result, though it is very unlikely to be wrong 4908 * about if it is UTF-8 or not */ 4909 # ifdef USE_LANGINFO_FOR_UTF8NESS 4910 4911 const char * codeset = langinfo_c(CODESET, LC_CTYPE, locale, NULL); 4912 bool retval = is_codeset_name_UTF8(codeset); 4913 4914 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 4915 "found codeset=%s, is_utf8=%d\n", codeset, retval)); 4916 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "is_locale_utf8(%s) returning %d\n", 4917 locale, retval)); 4918 TEARDOWN_FOR_IS_LOCALE_UTF8; 4919 return retval; 4920 4921 # endif 4922 # endif /* End of the #else clause, for the non-trivial case */ 4923 4924 } 4925 4926 #endif 4927 4928 #ifdef USE_LOCALE 4929 # ifdef USE_LOCALE_CTYPE 4930 4931 STATIC bool 4932 S_is_codeset_name_UTF8(const char * name) 4933 { 4934 /* Return a boolean as to if the passed-in name indicates it is a UTF-8 4935 * code set. Several variants are possible */ 4936 const Size_t len = strlen(name); 4937 4938 PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8; 4939 4940 # ifdef WIN32 4941 4942 /* https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers */ 4943 if (memENDs(name, len, "65001")) { 4944 return TRUE; 4945 } 4946 4947 # endif 4948 /* 'UTF8' or 'UTF-8' */ 4949 return ( inRANGE(len, 4, 5) 4950 && name[len-1] == '8' 4951 && ( memBEGINs(name, len, "UTF") 4952 || memBEGINs(name, len, "utf")) 4953 && (len == 4 || name[3] == '-')); 4954 } 4955 4956 # endif 4957 # ifdef WIN32 4958 4959 bool 4960 Perl_get_win32_message_utf8ness(pTHX_ const char * string) 4961 { 4962 /* This is because Windows doesn't have LC_MESSAGES. */ 4963 4964 # ifdef USE_LOCALE_CTYPE 4965 4966 /* We don't know the locale utf8ness here, and not even the locale itself. 4967 * Since Windows uses a different mechanism to specify message language 4968 * output than the locale system, it is going to be problematic deciding 4969 * if we are to store it as UTF-8 or not. By specifying LOCALE_IS_UTF8, we 4970 * are telling the called function to return true iff the string has 4971 * non-ASCII characters in it that are all syntactically UTF-8. We are 4972 * thus relying on the fact that a string that is syntactically valid UTF-8 4973 * is likely to be UTF-8. Should this ever cause problems, this function 4974 * could be replaced by something more Windows-specific */ 4975 return get_locale_string_utf8ness_i(string, LOCALE_IS_UTF8, 4976 NULL, LC_CTYPE_INDEX_); 4977 # else 4978 4979 PERL_UNUSED_ARG(string); 4980 return false; 4981 4982 # endif 4983 4984 } 4985 4986 # endif 4987 4988 STATIC void 4989 S_set_save_buffer_min_size(pTHX_ Size_t min_len, 4990 char **buf, 4991 Size_t * buf_cursize) 4992 { 4993 /* Make sure the buffer pointed to by *buf is at least as large 'min_len'; 4994 * *buf_cursize is the size of 'buf' upon entry; it will be updated to the 4995 * new size on exit. 'buf_cursize' being NULL is to be used when this is a 4996 * single use buffer, which will shortly be freed by the caller. */ 4997 4998 if (buf_cursize == NULL) { 4999 Newx(*buf, min_len, char); 5000 } 5001 else if (*buf_cursize == 0) { 5002 Newx(*buf, min_len, char); 5003 *buf_cursize = min_len; 5004 } 5005 else if (min_len > *buf_cursize) { 5006 Renew(*buf, min_len, char); 5007 *buf_cursize = min_len; 5008 } 5009 } 5010 5011 STATIC const char * 5012 S_save_to_buffer(pTHX_ const char * string, char **buf, Size_t *buf_size) 5013 { 5014 PERL_ARGS_ASSERT_SAVE_TO_BUFFER; 5015 5016 /* Copy the NUL-terminated 'string' to a buffer whose address before this 5017 * call began at *buf, and whose available length before this call was 5018 * *buf_size. 5019 * 5020 * If the length of 'string' is greater than the space available, the 5021 * buffer is grown accordingly, which may mean that it gets relocated. 5022 * *buf and *buf_size will be updated to reflect this. 5023 * 5024 * Regardless, the function returns a pointer to where 'string' is now 5025 * stored. 5026 * 5027 * 'string' may be NULL, which means no action gets taken, and NULL is 5028 * returned. 5029 * 5030 * 'buf_size' being NULL is to be used when this is a single use buffer, 5031 * which will shortly be freed by the caller. 5032 * 5033 * If *buf or 'buf_size' are NULL or *buf_size is 0, the buffer is assumed 5034 * empty, and memory is malloc'd. 5035 */ 5036 5037 if (! string) { 5038 return NULL; 5039 } 5040 5041 /* No-op to copy over oneself */ 5042 if (string == *buf) { 5043 return string; 5044 } 5045 5046 Size_t string_size = strlen(string) + 1; 5047 set_save_buffer_min_size(string_size, buf, buf_size); 5048 5049 # ifdef DEBUGGING 5050 5051 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 5052 "Copying '%s' to %p\n", 5053 ((is_strict_utf8_string((U8 *) string, 0)) 5054 ? string 5055 :_byte_dump_string((U8 *) string, strlen(string), 0)), 5056 *buf)); 5057 5058 # ifdef USE_LOCALE_CTYPE 5059 5060 /* Catch glitches. Usually this is because LC_CTYPE needs to be the same 5061 * locale as whatever is being worked on */ 5062 if (UNLIKELY(instr(string, REPLACEMENT_CHARACTER_UTF8))) { 5063 locale_panic_(Perl_form(aTHX_ 5064 "Unexpected REPLACEMENT_CHARACTER in '%s'\n%s", 5065 string, get_LC_ALL_display())); 5066 } 5067 5068 # endif 5069 # endif 5070 5071 Copy(string, *buf, string_size, char); 5072 return *buf; 5073 } 5074 5075 #endif 5076 5077 int 5078 Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len) 5079 { 5080 5081 #if ! defined(HAS_MBRTOWC) && ! defined(HAS_MBTOWC) 5082 5083 PERL_UNUSED_ARG(pwc); 5084 PERL_UNUSED_ARG(s); 5085 PERL_UNUSED_ARG(len); 5086 return -1; 5087 5088 #else /* Below we have some form of mbtowc() */ 5089 # if defined(HAS_MBRTOWC) \ 5090 && (defined(MULTIPLICITY) || ! defined(HAS_MBTOWC)) 5091 # define USE_MBRTOWC 5092 # else 5093 # undef USE_MBRTOWC 5094 # endif 5095 5096 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 5097 int retval = -1; 5098 5099 if (s == NULL) { /* Initialize the shift state to all zeros in 5100 PL_mbrtowc_ps. */ 5101 5102 # if defined(USE_MBRTOWC) 5103 5104 memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps)); 5105 return 0; 5106 5107 # else 5108 5109 SETERRNO(0, 0); 5110 MBTOWC_LOCK_; 5111 retval = mbtowc(NULL, NULL, 0); 5112 MBTOWC_UNLOCK_; 5113 return retval; 5114 5115 # endif 5116 5117 } 5118 5119 # if defined(USE_MBRTOWC) 5120 5121 SETERRNO(0, 0); 5122 MBRTOWC_LOCK_; 5123 retval = (SSize_t) mbrtowc((wchar_t *) pwc, s, len, &PL_mbrtowc_ps); 5124 MBRTOWC_UNLOCK_; 5125 5126 # else 5127 5128 /* Locking prevents races, but locales can be switched out without locking, 5129 * so this isn't a cure all */ 5130 SETERRNO(0, 0); 5131 MBTOWC_LOCK_; 5132 retval = mbtowc((wchar_t *) pwc, s, len); 5133 MBTOWC_UNLOCK_; 5134 5135 # endif 5136 5137 return retval; 5138 5139 #endif 5140 5141 } 5142 5143 /* 5144 =for apidoc Perl_localeconv 5145 5146 This is a thread-safe version of the libc L<localeconv(3)>. It is the same as 5147 L<POSIX::localeconv|POSIX/localeconv> (returning a hash of the C<localeconv()> 5148 fields), but directly callable from XS code. The hash is mortalized, so must 5149 be dealt with immediately. 5150 5151 =cut 5152 */ 5153 5154 /* All Wndows versions we support, except possibly MingW, have general 5155 * thread-safety, and even localeconv() is thread safe, returning into a 5156 * per-thread buffer. MingW when built with a modern MS C runtime (UCRT as of 5157 * this writing), also has those things. 5158 * 5159 * FreeBSD's localeconv() when used with uselocale() is supposed to be 5160 * thread-safe (as is their localeconv_l()), but we currently don't use 5161 * thread-safe locales there because of bugs. There may be other thread-safe 5162 * localeconv() implementations, especially on *BSD derivatives, but khw knows 5163 * of none, and hasn't really investigated, in part because of the past 5164 * unreliability of vendor thread-safety claims */ 5165 #if defined(WIN32) && (defined(_MSC_VER) || (defined(_UCRT))) 5166 # define LOCALECONV_IS_THREAD_SAFE 5167 #endif 5168 5169 /* When multiple threads can be going at once, we need a critical section 5170 * around doing the localeconv() and saving its return, unless localeconv() is 5171 * thread-safe, and we are using it in a thread-safe manner, which we are only 5172 * doing if safe threads are available and we don't have a broken localeconv() 5173 * */ 5174 #if defined(USE_THREADS) \ 5175 && ( ! defined(LOCALECONV_IS_THREAD_SAFE) \ 5176 || ! defined(USE_THREAD_SAFE_LOCALE) \ 5177 || defined(TS_W32_BROKEN_LOCALECONV)) 5178 # define LOCALECONV_NEEDS_CRITICAL_SECTION 5179 #endif 5180 5181 HV * 5182 Perl_localeconv(pTHX) 5183 { 5184 return (HV *) sv_2mortal((SV *) my_localeconv(0)); 5185 } 5186 5187 HV * 5188 S_my_localeconv(pTHX_ const int item) 5189 { 5190 PERL_ARGS_ASSERT_MY_LOCALECONV; 5191 5192 /* This returns a mortalized hash containing all or certain elements 5193 * returned by localeconv(). */ 5194 HV * hv = newHV(); /* The returned hash, initially empty */ 5195 5196 /* The function is used by Perl_localeconv() and POSIX::localeconv(), or 5197 * internally from this file, and is thread-safe. 5198 * 5199 * localeconv() returns items from two different locale categories, 5200 * LC_MONETARY and LC_NUMERIC. Various data structures in this function 5201 * are arrays with two elements, one for each category, and these indexes 5202 * indicate which array element applies to which category */ 5203 #define NUMERIC_OFFSET 0 5204 #define MONETARY_OFFSET 1 5205 5206 /* Some operations apply to one or the other category, or both. A mask 5207 * is used to specify all the possibilities. This macro converts from the 5208 * category offset to its bit position in the mask. */ 5209 #define OFFSET_TO_BIT(i) (1 << (i)) 5210 5211 /* There are two use cases for this function: 5212 * 1) Called as Perl_localeconv(), or from POSIX::locale_conv(). This 5213 * returns the lconv structure copied to a hash, based on the current 5214 * underlying locales for LC_NUMERIC and LC_MONETARY. An input item==0 5215 * signifies this case, or on many platforms it is the only use case 5216 * compiled. 5217 * 2) Certain items that nl_langinfo() provides are also derivable from 5218 * the return of localeconv(). Windows notably doesn't have 5219 * nl_langinfo(), so on that, and actually any platform lacking it, 5220 * my_localeconv() is used also to emulate it for those particular 5221 * items. The code to do this is compiled only on such platforms. 5222 * Rather than going to the expense of creating a full hash when only 5223 * one item is needed, the returned hash has just the desired item in 5224 * it. 5225 * 5226 * To access all the localeconv() struct lconv fields, there is a data 5227 * structure that contains every commonly documented field in it. (Maybe 5228 * some minority platforms have extra fields. Those could be added here 5229 * without harm; they would just be ignored on platforms lacking them.) 5230 * 5231 * Our structure is compiled to make looping through the fields easier by 5232 * pointing each name to its value's offset within lconv, e.g., 5233 { "thousands_sep", STRUCT_OFFSET(struct lconv, thousands_sep) } 5234 */ 5235 #define LCONV_ENTRY(name) {STRINGIFY(name), STRUCT_OFFSET(struct lconv, name)} 5236 5237 /* These synonyms are just for clarity, and to make it easier in case 5238 * something needs to change in the future */ 5239 #define LCONV_NUMERIC_ENTRY(name) LCONV_ENTRY(name) 5240 #define LCONV_MONETARY_ENTRY(name) LCONV_ENTRY(name) 5241 5242 /* There are just a few fields for NUMERIC strings */ 5243 const lconv_offset_t lconv_numeric_strings[] = { 5244 #ifndef NO_LOCALECONV_GROUPING 5245 LCONV_NUMERIC_ENTRY(grouping), 5246 # endif 5247 LCONV_NUMERIC_ENTRY(thousands_sep), 5248 # define THOUSANDS_SEP_LITERAL "thousands_sep" 5249 LCONV_NUMERIC_ENTRY(decimal_point), 5250 # define DECIMAL_POINT_LITERAL "decimal_point" 5251 {NULL, 0} 5252 }; 5253 5254 /* When used to implement nl_langinfo(), we save time by only populating 5255 * the hash with the field(s) needed. Thus we would need a data structure 5256 * of just: 5257 * LCONV_NUMERIC_ENTRY(decimal_point), 5258 * {NULL, 0} 5259 * 5260 * By placing the decimal_point field last in the full structure, we can 5261 * use just the tail for this bit of it, saving space. This macro yields 5262 * the address of the sub structure. */ 5263 #define DECIMAL_POINT_ADDRESS \ 5264 &lconv_numeric_strings[(C_ARRAY_LENGTH(lconv_numeric_strings) - 2)] 5265 5266 /* And the MONETARY string fields */ 5267 const lconv_offset_t lconv_monetary_strings[] = { 5268 LCONV_MONETARY_ENTRY(int_curr_symbol), 5269 LCONV_MONETARY_ENTRY(mon_decimal_point), 5270 #ifndef NO_LOCALECONV_MON_THOUSANDS_SEP 5271 LCONV_MONETARY_ENTRY(mon_thousands_sep), 5272 #endif 5273 #ifndef NO_LOCALECONV_MON_GROUPING 5274 LCONV_MONETARY_ENTRY(mon_grouping), 5275 #endif 5276 LCONV_MONETARY_ENTRY(positive_sign), 5277 LCONV_MONETARY_ENTRY(negative_sign), 5278 LCONV_MONETARY_ENTRY(currency_symbol), 5279 #define CURRENCY_SYMBOL_LITERAL "currency_symbol" 5280 {NULL, 0} 5281 }; 5282 5283 /* Like above, this field being last can be used as a sub structure */ 5284 #define CURRENCY_SYMBOL_ADDRESS \ 5285 &lconv_monetary_strings[(C_ARRAY_LENGTH(lconv_monetary_strings) - 2)] 5286 5287 /* Finally there are integer fields, all are for monetary purposes */ 5288 const lconv_offset_t lconv_integers[] = { 5289 LCONV_ENTRY(int_frac_digits), 5290 LCONV_ENTRY(frac_digits), 5291 LCONV_ENTRY(p_sep_by_space), 5292 LCONV_ENTRY(n_cs_precedes), 5293 LCONV_ENTRY(n_sep_by_space), 5294 LCONV_ENTRY(p_sign_posn), 5295 LCONV_ENTRY(n_sign_posn), 5296 #ifdef HAS_LC_MONETARY_2008 5297 LCONV_ENTRY(int_p_cs_precedes), 5298 LCONV_ENTRY(int_p_sep_by_space), 5299 LCONV_ENTRY(int_n_cs_precedes), 5300 LCONV_ENTRY(int_n_sep_by_space), 5301 LCONV_ENTRY(int_p_sign_posn), 5302 LCONV_ENTRY(int_n_sign_posn), 5303 #endif 5304 # define P_CS_PRECEDES_LITERAL "p_cs_precedes" 5305 LCONV_ENTRY(p_cs_precedes), 5306 {NULL, 0} 5307 }; 5308 5309 /* Like above, this field being last can be used as a sub structure */ 5310 #define P_CS_PRECEDES_ADDRESS \ 5311 &lconv_integers[(C_ARRAY_LENGTH(lconv_integers) - 2)] 5312 5313 /* The actual populating of the hash is done by two sub functions that get 5314 * passed an array of length two containing the data structure they are 5315 * supposed to use to get the key names to fill the hash with. One element 5316 * is always for the NUMERIC strings (or NULL if none to use), and the 5317 * other element similarly for the MONETARY ones. */ 5318 const lconv_offset_t * strings[2] = { lconv_numeric_strings, 5319 lconv_monetary_strings 5320 }; 5321 5322 /* The LC_MONETARY category also has some integer-valued fields, whose 5323 * information is kept in a separate parallel array to 'strings' */ 5324 const lconv_offset_t * integers[2] = { 5325 NULL, 5326 lconv_integers 5327 }; 5328 5329 #if ! defined(HAS_LOCALECONV) \ 5330 || (! defined(USE_LOCALE_NUMERIC) && ! defined(USE_LOCALE_MONETARY)) 5331 5332 /* If both NUMERIC and MONETARY must be the "C" locale, simply populate the 5333 * hash using the function that works on just that locale. */ 5334 populate_hash_from_C_localeconv(hv, 5335 "C", 5336 ( OFFSET_TO_BIT(NUMERIC_OFFSET) 5337 | OFFSET_TO_BIT(MONETARY_OFFSET)), 5338 strings, integers); 5339 5340 /* We shouldn't get to here for the case of an individual item, as 5341 * preprocessor directives elsewhere in this file should have filled in the 5342 * correct values at a higher level */ 5343 assert(item == 0); 5344 PERL_UNUSED_ARG(item); 5345 5346 return hv; 5347 5348 #else 5349 5350 /* From here to the end of this function, at least one of NUMERIC or 5351 * MONETARY can be non-C */ 5352 5353 /* This is a mask, with one bit to tell the populate functions to populate 5354 * the NUMERIC items; another bit for the MONETARY ones. This way they can 5355 * choose which (or both) to populate from */ 5356 U32 index_bits = 0; 5357 5358 /* Some platforms, for correct non-mojibake results, require LC_CTYPE's 5359 * locale to match LC_NUMERIC's for the numeric fields, and LC_MONETARY's 5360 * for the monetary ones. What happens if LC_NUMERIC and LC_MONETARY 5361 * aren't compatible? Wrong results. To avoid that, we call localeconv() 5362 * twice, once for each locale, setting LC_CTYPE to match the category. 5363 * But if the locales of both categories are the same, there is no need for 5364 * a second call. Assume this is the case unless overridden below */ 5365 bool requires_2nd_localeconv = false; 5366 5367 /* The actual hash populating is done by one of the two populate functions. 5368 * Which one is appropriate for either the MONETARY_OFFSET or the 5369 * NUMERIC_OFFSET is calculated and then stored in this table */ 5370 void (*populate[2]) (pTHX_ 5371 HV * , 5372 const char *, 5373 const U32, 5374 const lconv_offset_t **, 5375 const lconv_offset_t **); 5376 5377 /* This gives the locale to use for the corresponding OFFSET, like the 5378 * 'populate' array above */ 5379 const char * locales[2]; 5380 5381 # ifdef HAS_SOME_LANGINFO 5382 5383 /* If the only use-case for this is the full localeconv(), the 'item' 5384 * parameter is ignored. */ 5385 PERL_UNUSED_ARG(item); 5386 5387 # else /* This only gets compiled for the use-case of using localeconv() 5388 to emulate nl_langinfo() when missing from the platform. */ 5389 5390 # ifdef USE_LOCALE_NUMERIC 5391 5392 /* We need this substructure to only return this field for the THOUSEP 5393 * item. The other items also need substructures, but they were handled 5394 * above by placing the substructure's item at the end of the full one, so 5395 * the data structure could do double duty. However, both this and 5396 * RADIXCHAR would need to be in the final position of the same full 5397 * structure; an impossibility. So make this into a separate structure */ 5398 const lconv_offset_t thousands_sep_string[] = { 5399 LCONV_NUMERIC_ENTRY(thousands_sep), 5400 {NULL, 0} 5401 }; 5402 5403 # endif 5404 5405 /* End of all the initialization of data structures. Now for actual code. 5406 * 5407 * Without nl_langinfo(), the call to my_localeconv() could be for all of 5408 * the localeconv() items or for just one of the following 3 items to 5409 * emulate nl_langinfo(). 5410 * 5411 * This is compiled only when using perl_langinfo.h, which we control, and 5412 * it has been constructed so that no item is numbered 0. 5413 * 5414 * For each individual item, either return the known value if the current 5415 * locale is "C", or set up the appropriate parameters for the call below 5416 * to the populate function */ 5417 if (item != 0) { 5418 const char *locale; 5419 5420 switch (item) { 5421 default: 5422 locale_panic_(Perl_form(aTHX_ 5423 "Unexpected item passed to my_localeconv: %d", item)); 5424 break; 5425 5426 # ifdef USE_LOCALE_NUMERIC 5427 5428 case RADIXCHAR: 5429 if (isNAME_C_OR_POSIX(PL_numeric_name)) { 5430 (void) hv_stores(hv, DECIMAL_POINT_LITERAL, newSVpvs(".")); 5431 return hv; 5432 } 5433 5434 strings[NUMERIC_OFFSET] = DECIMAL_POINT_ADDRESS; 5435 goto numeric_common; 5436 5437 case THOUSEP: 5438 if (isNAME_C_OR_POSIX(PL_numeric_name)) { 5439 (void) hv_stores(hv, THOUSANDS_SEP_LITERAL, newSVpvs("")); 5440 return hv; 5441 } 5442 5443 strings[NUMERIC_OFFSET] = thousands_sep_string; 5444 5445 numeric_common: 5446 index_bits = OFFSET_TO_BIT(NUMERIC_OFFSET); 5447 locale = PL_numeric_name; 5448 break; 5449 5450 # endif 5451 # ifdef USE_LOCALE_MONETARY 5452 5453 case CRNCYSTR: /* This item needs the values for both the currency 5454 symbol, and another one used to construct the 5455 nl_langino()-compatible return. */ 5456 5457 locale = querylocale_c(LC_MONETARY); 5458 if (isNAME_C_OR_POSIX(locale)) { 5459 (void) hv_stores(hv, CURRENCY_SYMBOL_LITERAL, newSVpvs("")); 5460 (void) hv_stores(hv, P_CS_PRECEDES_LITERAL, newSViv(-1)); 5461 return hv; 5462 } 5463 5464 strings[MONETARY_OFFSET] = CURRENCY_SYMBOL_ADDRESS; 5465 integers[MONETARY_OFFSET] = P_CS_PRECEDES_ADDRESS; 5466 5467 index_bits = OFFSET_TO_BIT(MONETARY_OFFSET); 5468 break; 5469 5470 # endif 5471 5472 } /* End of switch() */ 5473 5474 /* There's only one item, so only one of each of these will get used, 5475 * but cheap to initialize both */ 5476 populate[MONETARY_OFFSET] = 5477 populate[NUMERIC_OFFSET] = S_populate_hash_from_localeconv; 5478 locales[MONETARY_OFFSET] = locales[NUMERIC_OFFSET] = locale; 5479 } 5480 else /* End of for just one item to emulate nl_langinfo() */ 5481 5482 # endif 5483 5484 { 5485 /* Here, the call is for all of localeconv(). It has a bunch of 5486 * items. The first function call always gets the MONETARY values */ 5487 index_bits = OFFSET_TO_BIT(MONETARY_OFFSET); 5488 5489 # ifdef USE_LOCALE_MONETARY 5490 5491 locales[MONETARY_OFFSET] = querylocale_c(LC_MONETARY); 5492 populate[MONETARY_OFFSET] = 5493 (isNAME_C_OR_POSIX(locales[MONETARY_OFFSET])) 5494 ? S_populate_hash_from_C_localeconv 5495 : S_populate_hash_from_localeconv; 5496 5497 # else 5498 5499 locales[MONETARY_OFFSET] = "C"; 5500 populate[MONETARY_OFFSET] = S_populate_hash_from_C_localeconv; 5501 5502 # endif 5503 # ifdef USE_LOCALE_NUMERIC 5504 5505 /* And if the locales for the two categories are the same, we can also 5506 * do the NUMERIC values in the same call */ 5507 if (strEQ(PL_numeric_name, locales[MONETARY_OFFSET])) { 5508 index_bits |= OFFSET_TO_BIT(NUMERIC_OFFSET); 5509 locales[NUMERIC_OFFSET] = locales[MONETARY_OFFSET]; 5510 populate[NUMERIC_OFFSET] = populate[MONETARY_OFFSET]; 5511 } 5512 else { 5513 requires_2nd_localeconv = true; 5514 locales[NUMERIC_OFFSET] = PL_numeric_name; 5515 populate[NUMERIC_OFFSET] = (isNAME_C_OR_POSIX(PL_numeric_name)) 5516 ? S_populate_hash_from_C_localeconv 5517 : S_populate_hash_from_localeconv; 5518 } 5519 5520 # else 5521 5522 /* When LC_NUMERIC is confined to "C", the two locales are the same 5523 iff LC_MONETARY in this case is also "C". We set up the function 5524 for that case above, so fastest to test just its address */ 5525 locales[NUMERIC_OFFSET] = "C"; 5526 if (populate[MONETARY_OFFSET] == S_populate_hash_from_C_localeconv) { 5527 index_bits |= OFFSET_TO_BIT(NUMERIC_OFFSET); 5528 populate[NUMERIC_OFFSET] = populate[MONETARY_OFFSET]; 5529 } 5530 else { 5531 requires_2nd_localeconv = true; 5532 populate[NUMERIC_OFFSET] = S_populate_hash_from_C_localeconv; 5533 } 5534 5535 # endif 5536 5537 } /* End of call is for localeconv() */ 5538 5539 /* Call the proper populate function (which may call localeconv()) and copy 5540 * its results into the hash. All the parameters have been initialized 5541 * above */ 5542 (*populate[MONETARY_OFFSET])(aTHX_ 5543 hv, locales[MONETARY_OFFSET], 5544 index_bits, strings, integers); 5545 5546 # ifndef HAS_SOME_LANGINFO /* Could be using this function to emulate 5547 nl_langinfo() */ 5548 5549 /* We are done when called with an individual item. There are no integer 5550 * items to adjust, and it's best for the caller to determine if this 5551 * string item is UTF-8 or not. This is because the locale's UTF-8ness is 5552 * calculated below, and in some Configurations, that can lead to a 5553 * recursive call to here, which could recurse infinitely. */ 5554 if (item != 0) { 5555 return hv; 5556 } 5557 5558 # endif 5559 5560 /* The above call may have done all the hash fields, but not always, as 5561 * already explained. If we need a second call it is always for the 5562 * NUMERIC fields */ 5563 if (requires_2nd_localeconv) { 5564 (*populate[NUMERIC_OFFSET])(aTHX_ 5565 hv, 5566 locales[NUMERIC_OFFSET], 5567 OFFSET_TO_BIT(NUMERIC_OFFSET), 5568 strings, integers); 5569 } 5570 5571 /* Here, the hash has been completely populated. */ 5572 5573 # ifdef LOCALECONV_NEEDS_CRITICAL_SECTION 5574 5575 /* When the hash was populated during a critical section, the determination 5576 * of whether or not a string element should be marked as UTF-8 was 5577 * deferred, so as to minimize the amount of time in the critical section. 5578 * But now we have the hash specific to this thread, and can do the 5579 * adjusting without worrying about delaying other threads. */ 5580 for (unsigned int i = 0; i < 2; i++) { /* Try both types of strings */ 5581 5582 /* The return from this function is already adjusted */ 5583 if (populate[i] == S_populate_hash_from_C_localeconv) { 5584 continue; 5585 } 5586 5587 /* Examine each string */ 5588 for (const lconv_offset_t *strp = strings[i]; strp->name; strp++) { 5589 const char * name = strp->name; 5590 5591 /* 'value' will contain the string that may need to be marked as 5592 * UTF-8 */ 5593 SV ** value = hv_fetch(hv, name, strlen(name), true); 5594 if (value == NULL) { 5595 continue; 5596 } 5597 5598 /* Determine if the string should be marked as UTF-8. */ 5599 if (UTF8NESS_YES == (get_locale_string_utf8ness_i(SvPVX(*value), 5600 LOCALE_UTF8NESS_UNKNOWN, 5601 locales[i], 5602 LC_ALL_INDEX_ /* OOB */))) 5603 { 5604 SvUTF8_on(*value); 5605 } 5606 } 5607 } 5608 5609 # endif 5610 5611 return hv; 5612 5613 #endif /* End of must have one or both USE_MONETARY, USE_NUMERIC */ 5614 5615 } 5616 5617 STATIC void 5618 S_populate_hash_from_C_localeconv(pTHX_ HV * hv, 5619 const char * locale, /* Unused */ 5620 5621 /* bit mask of which categories to 5622 * populate */ 5623 const U32 which_mask, 5624 5625 /* The string type values to return; 5626 * one element for numeric; the other 5627 * for monetary */ 5628 const lconv_offset_t * strings[2], 5629 5630 /* And the integer fields */ 5631 const lconv_offset_t * integers[2]) 5632 { 5633 PERL_ARGS_ASSERT_POPULATE_HASH_FROM_C_LOCALECONV; 5634 PERL_UNUSED_ARG(locale); 5635 assert(isNAME_C_OR_POSIX(locale)); 5636 5637 /* Fill hv with the values that localeconv() is supposed to return for 5638 * the C locale */ 5639 5640 U32 working_mask = which_mask; 5641 while (working_mask) { 5642 5643 /* Get the bit position of the next lowest set bit. That is the 5644 * index into the 'strings' array of the category we use in this loop 5645 * iteration. Turn the bit off so we don't work on this category 5646 * again in this function call. */ 5647 const PERL_UINT_FAST8_T i = lsbit_pos(working_mask); 5648 working_mask &= ~ (1 << i); 5649 5650 /* This category's string fields */ 5651 const lconv_offset_t * category_strings = strings[i]; 5652 5653 #ifndef HAS_SOME_LANGINFO /* This doesn't work properly if called on a single 5654 item, which could only happen when there isn't 5655 nl_langinfo on the platform */ 5656 assert(category_strings[1].name != NULL); 5657 #endif 5658 5659 /* All string fields are empty except for one NUMERIC one. That one 5660 * has been initialized to be the final one in the NUMERIC strings, so 5661 * stop the loop early in that case. Otherwise, we would store an 5662 * empty string to the hash, and immediately overwrite it with the 5663 * correct value */ 5664 const unsigned int stop_early = (i == NUMERIC_OFFSET) ? 1 : 0; 5665 5666 /* A NULL element terminates the list */ 5667 while ((category_strings + stop_early)->name) { 5668 (void) hv_store(hv, 5669 category_strings->name, 5670 strlen(category_strings->name), 5671 newSVpvs(""), 5672 0); 5673 5674 category_strings++; 5675 } 5676 5677 /* And fill in the NUMERIC exception */ 5678 if (i == NUMERIC_OFFSET) { 5679 (void) hv_stores(hv, "decimal_point", newSVpvs(".")); 5680 category_strings++; 5681 } 5682 5683 /* Add any int fields. In the C locale, all are -1 */ 5684 if (integers[i]) { 5685 const lconv_offset_t * current = integers[i]; 5686 while (current->name) { 5687 (void) hv_store(hv, 5688 current->name, strlen(current->name), 5689 newSViv(-1), 5690 0); 5691 current++; 5692 } 5693 } 5694 } 5695 } 5696 5697 #if defined(HAS_LOCALECONV) && ( defined(USE_LOCALE_NUMERIC) \ 5698 || defined(USE_LOCALE_MONETARY)) 5699 5700 STATIC void 5701 S_populate_hash_from_localeconv(pTHX_ HV * hv, 5702 5703 /* Switch to this locale to run 5704 * localeconv() from */ 5705 const char * locale, 5706 5707 /* bit mask of which categories to 5708 * populate */ 5709 const U32 which_mask, 5710 5711 /* The string type values to return; one 5712 * element for numeric; the other for 5713 * monetary */ 5714 const lconv_offset_t * strings[2], 5715 5716 /* And similarly the integer fields */ 5717 const lconv_offset_t * integers[2]) 5718 { 5719 PERL_ARGS_ASSERT_POPULATE_HASH_FROM_LOCALECONV; 5720 5721 /* Run localeconv() and copy some or all of its results to the input 'hv' 5722 * hash. Most localeconv() implementations return the values in a global 5723 * static buffer, so for them, the operation must be performed in a 5724 * critical section, ending only after the copy is completed. There are so 5725 * many locks because localeconv() deals with two categories, and returns 5726 * in a single global static buffer. Some locks might be no-ops on this 5727 * platform, but not others. We need to lock if any one isn't a no-op. */ 5728 5729 /* If the call could be for either or both of the two categories, we need 5730 * to test which one; but if the Configuration is such that we will never 5731 * be called with one of them, the code for that one will be #ifdef'd out 5732 * below, leaving code for just the other category. That code will always 5733 * want to be executed, no conditional required. Create a macro that 5734 * replaces the condition with an always-true value so the compiler will 5735 * omit the conditional */ 5736 # if defined(USE_LOCALE_NUMERIC) && defined(USE_LOCALE_MONETARY) 5737 # define CALL_IS_FOR(x) (which_mask & OFFSET_TO_BIT(x ## _OFFSET)) 5738 # else 5739 # define CALL_IS_FOR(x) 1 5740 # endif 5741 5742 /* This function is unfortunately full of #ifdefs. It consists of three 5743 * sections: 5744 * 1) Setup: 5745 * a) On platforms where it matters, toggle LC_CTYPE to the same 5746 * locale that LC_NUMERIC and LC_MONETARY will be toggled to 5747 * b) On calls that process LC_NUMERIC, toggle to the desired locale 5748 * c) On calls that process LC_MONETARY, toggle to the desired 5749 * locale 5750 * d) Do any necessary mutex locking not (automatically) done by 5751 * the toggling 5752 * e) Work around some Windows-only issues and bugs 5753 * 2) Do the localeconv(), copying the results. 5754 * 3) Teardown, which is the inverse of setup. 5755 * 5756 * The setup and teardown are highly variable due to the variance in the 5757 * possible Configurations. What is done here to make it slightly more 5758 * understandable is each setup section creates the details of its 5759 * corresponding teardown section, and macroizes them. So that the 5760 * finished teardown product is just a linear series of macros. You can 5761 * thus easily see the logic there. */ 5762 5763 /* Setup any LC_CTYPE handling */ 5764 start_DEALING_WITH_MISMATCHED_CTYPE(locale); 5765 # define CTYPE_TEARDOWN end_DEALING_WITH_MISMATCHED_CTYPE(locale) 5766 5767 /* Setup any LC_NUMERIC handling */ 5768 # ifndef USE_LOCALE_NUMERIC 5769 # define NUMERIC_TEARDOWN 5770 # else 5771 5772 /* We need to toggle the NUMERIC locale to the desired one if we are 5773 * getting NUMERIC strings */ 5774 const char * orig_NUMERIC_locale = NULL; 5775 if (CALL_IS_FOR(NUMERIC)) { 5776 5777 # ifdef WIN32 5778 5779 /* There is a bug in Windows in which setting LC_CTYPE after the others 5780 * doesn't actually take effect for localeconv(). See commit 5781 * 418efacd1950763f74ed3cc22f8cf9206661b892 for details. Thus we have 5782 * to make sure that the locale we want is set after LC_CTYPE. We 5783 * unconditionally toggle away from and back to the current locale 5784 * prior to calling localeconv(). */ 5785 orig_NUMERIC_locale = toggle_locale_c(LC_NUMERIC, "C"); 5786 (void) toggle_locale_c(LC_NUMERIC, locale); 5787 5788 # define NUMERIC_TEARDOWN \ 5789 STMT_START { \ 5790 if (CALL_IS_FOR(NUMERIC)) { \ 5791 restore_toggled_locale_c(LC_NUMERIC, "C"); \ 5792 restore_toggled_locale_c(LC_NUMERIC, orig_NUMERIC_locale); \ 5793 } \ 5794 } STMT_END 5795 5796 # else 5797 5798 /* No need for the extra toggle when not on Windows */ 5799 orig_NUMERIC_locale = toggle_locale_c(LC_NUMERIC, locale); 5800 5801 # define NUMERIC_TEARDOWN \ 5802 STMT_START { \ 5803 if (CALL_IS_FOR(NUMERIC)) { \ 5804 restore_toggled_locale_c(LC_NUMERIC, orig_NUMERIC_locale); \ 5805 } \ 5806 } STMT_END 5807 # endif 5808 5809 } 5810 5811 # endif /* End of LC_NUMERIC setup */ 5812 5813 /* Setup any LC_MONETARY handling, using the same logic as for 5814 * USE_LOCALE_NUMERIC just above */ 5815 # ifndef USE_LOCALE_MONETARY 5816 # define MONETARY_TEARDOWN 5817 # else 5818 5819 /* Same logic as LC_NUMERIC, and same Windows bug */ 5820 const char * orig_MONETARY_locale = NULL; 5821 if (CALL_IS_FOR(MONETARY)) { 5822 5823 # ifdef WIN32 5824 5825 orig_MONETARY_locale = toggle_locale_c(LC_MONETARY, "C"); 5826 (void) toggle_locale_c(LC_MONETARY, locale); 5827 5828 # define MONETARY_TEARDOWN \ 5829 STMT_START { \ 5830 if (CALL_IS_FOR(MONETARY)) { \ 5831 restore_toggled_locale_c(LC_MONETARY, "C"); \ 5832 restore_toggled_locale_c(LC_MONETARY, orig_MONETARY_locale);\ 5833 } \ 5834 } STMT_END 5835 5836 # else 5837 5838 /* No need for the extra toggle when not on Windows */ 5839 orig_MONETARY_locale = toggle_locale_c(LC_MONETARY, locale); 5840 5841 # define MONETARY_TEARDOWN \ 5842 STMT_START { \ 5843 if (CALL_IS_FOR(MONETARY)) { \ 5844 restore_toggled_locale_c(LC_MONETARY, orig_MONETARY_locale);\ 5845 } \ 5846 } STMT_END 5847 5848 # endif 5849 5850 } 5851 5852 # endif /* End of LC_MONETARY setup */ 5853 5854 /* Here, have toggled to the correct locale. 5855 * 5856 * We don't need to worry about locking at all if localeconv() is 5857 * thread-safe, regardless of if using threads or not. */ 5858 # ifdef LOCALECONV_IS_THREAD_SAFE 5859 # define LOCALECONV_UNLOCK 5860 # else 5861 5862 /* Otherwise, the gwLOCALE_LOCK macro expands to whatever locking is 5863 * needed (none if there is only a single perl instance) */ 5864 gwLOCALE_LOCK; 5865 5866 # define LOCALECONV_UNLOCK gwLOCALE_UNLOCK 5867 # endif 5868 # if ! defined(TS_W32_BROKEN_LOCALECONV) || ! defined(USE_THREAD_SAFE_LOCALE) 5869 # define WIN32_TEARDOWN 5870 # else 5871 5872 /* This is a workaround for another bug in Windows. localeconv() was 5873 * broken with thread-safe locales prior to VS 15. It looks at the global 5874 * locale instead of the thread one. As a work-around, we toggle to the 5875 * global locale; populate the return; then toggle back. We have to use 5876 * LC_ALL instead of the individual categories because of yet another bug 5877 * in Windows. And this all has to be done in a critical section. 5878 * 5879 * This introduces a potential race with any other thread that has also 5880 * converted to use the global locale, and doesn't protect its locale calls 5881 * with mutexes. khw can't think of any reason for a thread to do so on 5882 * Windows, as the locale API is the same regardless of thread-safety, 5883 * except if the code is ported from working on another platform where 5884 * there might be some reason to do this. But this is typically due to 5885 * some alien-to-Perl library that thinks it owns locale setting. Such a 5886 * library isn't likely to exist on Windows, so such an application is 5887 * unlikely to be run on Windows 5888 */ 5889 bool restore_per_thread = FALSE; 5890 5891 /* Save the per-thread locale state */ 5892 const char * save_thread = querylocale_c(LC_ALL); 5893 5894 /* Change to the global locale, and note if we already were there */ 5895 int config_return = _configthreadlocale(_DISABLE_PER_THREAD_LOCALE); 5896 if (config_return != _DISABLE_PER_THREAD_LOCALE) { 5897 if (config_return == -1) { 5898 locale_panic_("_configthreadlocale returned an error"); 5899 } 5900 5901 restore_per_thread = TRUE; 5902 } 5903 5904 /* Save the state of the global locale; then convert to our desired 5905 * state. */ 5906 const char * save_global = querylocale_c(LC_ALL); 5907 void_setlocale_c(LC_ALL, save_thread); 5908 5909 # define WIN32_TEARDOWN \ 5910 STMT_START { \ 5911 /* Restore the global locale's prior state */ \ 5912 void_setlocale_c(LC_ALL, save_global); \ 5913 \ 5914 /* And back to per-thread locales */ \ 5915 if (restore_per_thread) { \ 5916 if (_configthreadlocale(_ENABLE_PER_THREAD_LOCALE) == -1) { \ 5917 locale_panic_("_configthreadlocale returned an error"); \ 5918 } \ 5919 } \ 5920 \ 5921 /* Restore the per-thread locale state */ \ 5922 void_setlocale_c(LC_ALL, save_thread); \ 5923 } STMT_END 5924 # endif /* TS_W32_BROKEN_LOCALECONV */ 5925 5926 5927 /* Finally, everything is locked and loaded; do the actual call to 5928 * localeconv() */ 5929 const char *lcbuf_as_string = (const char *) localeconv(); 5930 5931 /* Copy its results for each desired category as determined by 5932 * 'which_mask' */ 5933 U32 working_mask = which_mask; 5934 while (working_mask) { 5935 5936 /* Get the bit position of the next lowest set bit. That is the 5937 * index into the 'strings' array of the category we use in this loop 5938 * iteration. Turn the bit off so we don't work on this category 5939 * again in this function call. */ 5940 const PERL_UINT_FAST8_T i = lsbit_pos32(working_mask); 5941 working_mask &= ~ (1 << i); 5942 5943 /* Point to the string field list for the given category ... */ 5944 const lconv_offset_t * category_strings = strings[i]; 5945 5946 /* The string fields returned by localeconv() are stored as SVs in the 5947 * hash. Their utf8ness needs to be calculated at some point, and the 5948 * SV flagged accordingly. It is easier to do that now as we go 5949 * through them, but strongly countering this is the need to minimize 5950 * the length of time spent in a critical section with other threads 5951 * locked out. Therefore, when this is being executed in a critical 5952 * section, the strings are stored as-is, and the utf8ness calculation 5953 * is done by our caller, outside the critical section, in an extra 5954 * pass through the hash. But when this code is not being executed in 5955 * a critical section, that extra pass would be extra work, so the 5956 * calculation is done here. We have #defined a symbol that indicates 5957 * whether or not this is being done in a critical section. But there 5958 * is a complication. When this is being called with just a single 5959 * string to populate the hash with, there may be extra adjustments 5960 * needed, and the ultimate caller is expecting to do all adjustments, 5961 * so the adjustment is deferred in this case even if there is no 5962 * critical section. (This case is indicated by element [1] being a 5963 * NULL marker, hence having only one real element.) */ 5964 # ifndef LOCALECONV_NEEDS_CRITICAL_SECTION 5965 const bool calculate_utf8ness_here = category_strings[1].name; 5966 # endif 5967 bool utf8ness = false; 5968 5969 /* For each string field */ 5970 while (category_strings->name) { 5971 5972 /* We have set things up so that we know where in the returned 5973 * structure, when viewed as a string, the corresponding value is. 5974 * */ 5975 char *value = *((char **)( lcbuf_as_string 5976 + category_strings->offset)); 5977 if (value) { /* Copy to the hash */ 5978 5979 # ifndef LOCALECONV_NEEDS_CRITICAL_SECTION 5980 5981 if (calculate_utf8ness_here) { 5982 utf8ness = 5983 ( UTF8NESS_YES 5984 == get_locale_string_utf8ness_i(value, 5985 LOCALE_UTF8NESS_UNKNOWN, 5986 locale, 5987 LC_ALL_INDEX_ /* OOB */)); 5988 } 5989 # endif 5990 (void) hv_store(hv, 5991 category_strings->name, 5992 strlen(category_strings->name), 5993 newSVpvn_utf8(value, strlen(value), utf8ness), 5994 0); 5995 } 5996 5997 category_strings++; 5998 } 5999 6000 /* Add any int fields to the HV*. */ 6001 if (integers[i]) { 6002 const lconv_offset_t * current = integers[i]; 6003 while (current->name) { 6004 int value = *((const char *)( lcbuf_as_string 6005 + current->offset)); 6006 if (value == CHAR_MAX) { /* Change CHAR_MAX to -1 */ 6007 value = -1; 6008 } 6009 6010 (void) hv_store(hv, 6011 current->name, strlen(current->name), 6012 newSViv(value), 6013 0); 6014 current++; 6015 } 6016 } 6017 } /* End of loop through the fields */ 6018 6019 /* Done with copying to the hash. Can unwind the critical section locks */ 6020 6021 /* Back out of what we set up */ 6022 WIN32_TEARDOWN; 6023 LOCALECONV_UNLOCK; 6024 MONETARY_TEARDOWN; 6025 NUMERIC_TEARDOWN; 6026 CTYPE_TEARDOWN; 6027 } 6028 6029 #endif /* defined(USE_LOCALE_NUMERIC) || defined(USE_LOCALE_MONETARY) */ 6030 6031 /* 6032 6033 =for apidoc Perl_langinfo 6034 =for apidoc_item Perl_langinfo8 6035 6036 C<Perl_langinfo> is an (almost) drop-in replacement for the system 6037 C<L<nl_langinfo(3)>>, taking the same C<item> parameter values, and returning 6038 the same information. But it is more thread-safe than regular 6039 C<nl_langinfo()>, and hides the quirks of Perl's locale handling from your 6040 code, and can be used on systems that lack a native C<nl_langinfo>. 6041 6042 However, you should instead use either the improved version of this, 6043 L</Perl_langinfo8>, or even better, L</sv_langinfo>. The latter returns an SV, 6044 handling all the possible non-standard returns of C<nl_langinfo()>, including 6045 the UTF8ness of any returned string. 6046 6047 C<Perl_langinfo8> is identical to C<Perl_langinfo> except for an additional 6048 parameter, a pointer to a variable declared as L</C<utf8ness_t>>, into which it 6049 returns to you how you should treat the returned string with regards to it 6050 being encoded in UTF-8 or not. 6051 6052 These two functions share private per-thread memory that will be changed the 6053 next time either one of them is called with any input, but not before. 6054 6055 Concerning the differences between these and plain C<nl_langinfo()>: 6056 6057 =over 6058 6059 =item a. 6060 6061 C<Perl_langinfo8> has an extra parameter, described above. Besides this, the 6062 other reason they aren't quite a drop-in replacement is actually an advantage. 6063 The C<const>ness of the return allows the compiler to catch attempts to write 6064 into the returned buffer, which is illegal and could cause run-time crashes. 6065 6066 =item b. 6067 6068 They deliver the correct results for the C<RADIXCHAR> and C<THOUSEP> items, 6069 without you having to write extra code. The reason for the extra code would be 6070 because these are from the C<LC_NUMERIC> locale category, which is normally 6071 kept set by Perl so that the radix is a dot, and the separator is the empty 6072 string, no matter what the underlying locale is supposed to be, and so to get 6073 the expected results, you have to temporarily toggle into the underlying 6074 locale, and later toggle back. (You could use plain C<nl_langinfo> and 6075 C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this but then you wouldn't get 6076 the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C 6077 (or equivalent) locale would break a lot of CPAN, which is expecting the radix 6078 (decimal point) character to be a dot.) 6079 6080 =item c. 6081 6082 The system function they replace can have its static return buffer trashed, 6083 not only by a subsequent call to that function, but by a C<freelocale>, 6084 C<setlocale>, or other locale change. The returned buffer of these functions 6085 is not changed until the next call to one or the other, so the buffer is never 6086 in a trashed state. 6087 6088 =item d. 6089 6090 The return buffer is per-thread, so it also is never overwritten by a call to 6091 these functions from another thread; unlike the function it replaces. 6092 6093 =item e. 6094 6095 But most importantly, they work on systems that don't have C<nl_langinfo>, such 6096 as Windows, hence making your code more portable. Of the fifty-some possible 6097 items specified by the POSIX 2008 standard, 6098 L<https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>, 6099 only one is completely unimplemented, though on non-Windows platforms, another 6100 significant one is not fully implemented). They use various techniques to 6101 recover the other items, including calling C<L<localeconv(3)>>, and 6102 C<L<strftime(3)>>, both of which are specified in C89, so should be always be 6103 available. Later C<strftime()> versions have additional capabilities. 6104 If an item is not available on your system, this returns either the value 6105 associated with the C locale, or simply C<"">, whichever is more appropriate. 6106 6107 It is important to note that, when called with an item that is recovered by 6108 using C<localeconv>, the buffer from any previous explicit call to 6109 C<L<localeconv(3)>> will be overwritten. But you shouldn't be using 6110 C<localeconv> anyway because it is is very much not thread-safe, and suffers 6111 from the same problems outlined in item 'b.' above for the fields it returns 6112 that are controlled by the LC_NUMERIC locale category. Instead, avoid all of 6113 those problems by calling L</Perl_localeconv>, which is thread-safe; or by 6114 using the methods given in L<perlcall> to call 6115 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is also thread-safe. 6116 6117 =back 6118 6119 The details for those items which may deviate from what this emulation returns 6120 and what a native C<nl_langinfo()> would return are specified in 6121 L<I18N::Langinfo>. 6122 6123 =for apidoc sv_langinfo 6124 6125 This is the preferred interface for accessing the data that L<nl_langinfo(3)> 6126 provides (or Perl's emulation of it on platforms lacking it), returning an SV. 6127 Unlike, the earlier-defined interfaces to this (L</Perl_langinfo> and 6128 L</Perl_langinfo8>), which return strings, the UTF8ness of the result is 6129 automatically handled for you. And like them, it is thread-safe and 6130 automatically handles getting the proper values for the C<RADIXCHAR> and 6131 C<THOUSEP> items (that calling the plain libc C<nl_langinfo()> could give the 6132 wrong results for). Like them, this also doesn't play well with the libc 6133 C<localeconv()>; use L<C<POSIX::localeconv()>|POSIX/localeconv> instead. 6134 6135 There are a few deviations from what a native C<nl_langinfo()> would return and 6136 what this returns on platforms that don't implement that function. These are 6137 detailed in L<I18N::Langinfo>. 6138 6139 =cut 6140 6141 */ 6142 6143 /* external_call_langinfo() is an interface to callers from outside this file to 6144 * langinfo_sv_i(), calculating a necessary value for it. If those functions 6145 * aren't defined, the fallback function is emulate_langinfo(), which doesn't 6146 * use that value (as everything in this situation takes place in the "C" 6147 * locale), and so we define this macro to transparently hide the absence of 6148 * the missing functions */ 6149 #ifndef external_call_langinfo 6150 # define external_call_langinfo(item, sv, utf8p) \ 6151 emulate_langinfo(item, "C", sv, utf8p) 6152 #endif 6153 6154 SV * 6155 Perl_sv_langinfo(pTHX_ const nl_item item) { 6156 utf8ness_t dummy; /* Having this tells the layers below that we want the 6157 UTF-8 flag in 'sv' to be set properly. */ 6158 6159 SV * sv = newSV_type(SVt_PV); 6160 (void) external_call_langinfo(item, sv, &dummy); 6161 6162 return sv; 6163 } 6164 6165 const char * 6166 Perl_langinfo(const nl_item item) 6167 { 6168 dTHX; 6169 6170 (void) external_call_langinfo(item, PL_langinfo_sv, NULL); 6171 return SvPV_nolen(PL_langinfo_sv); 6172 } 6173 6174 const char * 6175 Perl_langinfo8(const nl_item item, utf8ness_t * utf8ness) 6176 { 6177 PERL_ARGS_ASSERT_PERL_LANGINFO8; 6178 dTHX; 6179 6180 (void) external_call_langinfo(item, PL_langinfo_sv, utf8ness); 6181 return SvPV_nolen(PL_langinfo_sv); 6182 } 6183 6184 #ifdef USE_LOCALE 6185 6186 const char * 6187 S_external_call_langinfo(pTHX_ const nl_item item, 6188 SV * sv, 6189 utf8ness_t * utf8ness) 6190 { 6191 PERL_ARGS_ASSERT_EXTERNAL_CALL_LANGINFO; 6192 6193 /* Find the locale category that controls the input 'item', and call 6194 * langinfo_sv_i() including that value. 6195 * 6196 * If we are not paying attention to that category, instead call 6197 * emulate_langinfo(), which knows how to handle this situation. */ 6198 locale_category_index cat_index = LC_ALL_INDEX_; /* Out-of-bounds */ 6199 6200 switch (item) { 6201 case CODESET: 6202 6203 # ifdef USE_LOCALE_CTYPE 6204 cat_index = LC_CTYPE_INDEX_; 6205 # endif 6206 break; 6207 6208 6209 case YESEXPR: case YESSTR: case NOEXPR: case NOSTR: 6210 6211 # ifdef USE_LOCALE_MESSAGES 6212 cat_index = LC_MESSAGES_INDEX_; 6213 # endif 6214 break; 6215 6216 6217 case CRNCYSTR: 6218 6219 # ifdef USE_LOCALE_MONETARY 6220 cat_index = LC_MONETARY_INDEX_; 6221 # endif 6222 break; 6223 6224 6225 case RADIXCHAR: case THOUSEP: 6226 6227 # ifdef USE_LOCALE_NUMERIC 6228 cat_index = LC_NUMERIC_INDEX_; 6229 # endif 6230 break; 6231 6232 6233 case _NL_ADDRESS_POSTAL_FMT: 6234 case _NL_ADDRESS_COUNTRY_NAME: 6235 case _NL_ADDRESS_COUNTRY_POST: 6236 case _NL_ADDRESS_COUNTRY_AB2: 6237 case _NL_ADDRESS_COUNTRY_AB3: 6238 case _NL_ADDRESS_COUNTRY_CAR: 6239 case _NL_ADDRESS_COUNTRY_NUM: 6240 case _NL_ADDRESS_COUNTRY_ISBN: 6241 case _NL_ADDRESS_LANG_NAME: 6242 case _NL_ADDRESS_LANG_AB: 6243 case _NL_ADDRESS_LANG_TERM: 6244 case _NL_ADDRESS_LANG_LIB: 6245 # ifdef USE_LOCALE_ADDRESS 6246 cat_index = LC_ADDRESS_INDEX_; 6247 # endif 6248 break; 6249 6250 6251 case _NL_IDENTIFICATION_TITLE: 6252 case _NL_IDENTIFICATION_SOURCE: 6253 case _NL_IDENTIFICATION_ADDRESS: 6254 case _NL_IDENTIFICATION_CONTACT: 6255 case _NL_IDENTIFICATION_EMAIL: 6256 case _NL_IDENTIFICATION_TEL: 6257 case _NL_IDENTIFICATION_FAX: 6258 case _NL_IDENTIFICATION_LANGUAGE: 6259 case _NL_IDENTIFICATION_TERRITORY: 6260 case _NL_IDENTIFICATION_AUDIENCE: 6261 case _NL_IDENTIFICATION_APPLICATION: 6262 case _NL_IDENTIFICATION_ABBREVIATION: 6263 case _NL_IDENTIFICATION_REVISION: 6264 case _NL_IDENTIFICATION_DATE: 6265 case _NL_IDENTIFICATION_CATEGORY: 6266 # ifdef USE_LOCALE_IDENTIFICATION 6267 cat_index = LC_IDENTIFICATION_INDEX_; 6268 # endif 6269 break; 6270 6271 6272 case _NL_MEASUREMENT_MEASUREMENT: 6273 # ifdef USE_LOCALE_MEASUREMENT 6274 cat_index = LC_MEASUREMENT_INDEX_; 6275 # endif 6276 break; 6277 6278 6279 case _NL_NAME_NAME_FMT: 6280 case _NL_NAME_NAME_GEN: 6281 case _NL_NAME_NAME_MR: 6282 case _NL_NAME_NAME_MRS: 6283 case _NL_NAME_NAME_MISS: 6284 case _NL_NAME_NAME_MS: 6285 # ifdef USE_LOCALE_NAME 6286 cat_index = LC_NAME_INDEX_; 6287 # endif 6288 break; 6289 6290 6291 case _NL_PAPER_HEIGHT: 6292 case _NL_PAPER_WIDTH: 6293 # ifdef USE_LOCALE_PAPER 6294 cat_index = LC_PAPER_INDEX_; 6295 # endif 6296 break; 6297 6298 6299 case _NL_TELEPHONE_TEL_INT_FMT: 6300 case _NL_TELEPHONE_TEL_DOM_FMT: 6301 case _NL_TELEPHONE_INT_SELECT: 6302 case _NL_TELEPHONE_INT_PREFIX: 6303 # ifdef USE_LOCALE_TELEPHONE 6304 cat_index = LC_TELEPHONE_INDEX_; 6305 # endif 6306 break; 6307 6308 6309 default: /* The other possible items are all in LC_TIME. */ 6310 # ifdef USE_LOCALE_TIME 6311 cat_index = LC_TIME_INDEX_; 6312 # endif 6313 break; 6314 6315 } /* End of switch on item */ 6316 6317 # if defined(HAS_MISSING_LANGINFO_ITEM_) 6318 6319 /* If the above didn't find the category's index, it has to be because the 6320 * item is unknown to us (and the callee will handle that), or the category 6321 * is confined to the "C" locale on this platform, which the callee also 6322 * handles. (LC_MESSAGES is not required by the C Standard (the others 6323 * above are), so we have to emulate it on platforms lacking it (such as 6324 * Windows).) */ 6325 if (cat_index == LC_ALL_INDEX_) { 6326 return emulate_langinfo(item, "C", sv, utf8ness); 6327 } 6328 6329 # endif 6330 6331 /* And get the value for this 'item', whose category has now been 6332 * calculated. We need to find the current corresponding locale, and pass 6333 * that as well. */ 6334 return langinfo_sv_i(item, cat_index, 6335 query_nominal_locale_i(cat_index), 6336 sv, utf8ness); 6337 } 6338 6339 #endif 6340 #if defined(USE_LOCALE) && defined(HAS_NL_LANGINFO) 6341 6342 STATIC const char * 6343 S_langinfo_sv_i(pTHX_ 6344 const nl_item item, /* The item to look up */ 6345 6346 /* The locale category that controls it */ 6347 locale_category_index cat_index, 6348 6349 /* The locale to look up 'item' in. */ 6350 const char * locale, 6351 6352 /* The SV to store the result in; see below */ 6353 SV * sv, 6354 6355 /* If not NULL, the location to store the UTF8-ness of 'item's 6356 * value, as documented */ 6357 utf8ness_t * utf8ness) 6358 { 6359 PERL_ARGS_ASSERT_LANGINFO_SV_I; 6360 assert(cat_index < LC_ALL_INDEX_); 6361 6362 /* This function is the interface to nl_langinfo(), returning a thread-safe 6363 * result, valid until its next call that uses the same 'sv'. Similarly, 6364 * the S_emulate_langinfo() function below does the same, when 6365 * nl_langinfo() isn't available for the desired locale, or is completely 6366 * absent from the system. It is hopefully invisible to an outside caller 6367 * as to which one of the two actually ends up processing the request. 6368 * This comment block hence generally describes the two functions as a 6369 * unit. 6370 * 6371 * The two functions both return values (using 'return' statements) and 6372 * potentially change the contents of the passed in SV 'sv'. However, in 6373 * any given call, only one of the return types is reliable. 6374 * 6375 * When the passed in SV is 'PL_scratch_langinfo', the functions make sure 6376 * that the 'return' statements return the correct value, but whatever 6377 * value is in 'PL_scratch_langinfo' should be considered garbage. When it 6378 * is any other SV, that SV will get the correct result, and the value 6379 * returned by a 'return' statement should be considered garbage. 6380 * 6381 * The reason for this is twofold: 6382 * 6383 * 1) These functions serve two masters. For most purposes when called 6384 * from within this file, the desired value is used immediately, and 6385 * then no longer required. For these, the 'return' statement values 6386 * are most convenient. 6387 * 6388 * But when the call is initiated from an external XS source, like 6389 * I18N::Langinfo, the value needs to be able to be stable for a longer 6390 * time and likely returned to Perl space. An SV return is most 6391 * convenient for these 6392 * 6393 * Further, some Configurations use these functions reentrantly. For 6394 * those, an SV must be passed. 6395 * 6396 * 2) In S_emulate_langinfo(), most langinfo items are easy or even 6397 * trivial to get. These are amenable to being returned by 'return' 6398 * statements. But others are more complex, and use the infrastructure 6399 * provided by perl's SV functions to help out. 6400 * 6401 * So for some items, it is most convenient to 'return' a simple value; for 6402 * others an SV is most convenient. And some callers want a simple value; 6403 * others want or need an SV. It would be wasteful to have an SV, convert 6404 * it to a simple value, discarding the SV, then create a new SV. 6405 * 6406 * The solution adopted here is to always pass an SV, and have a reserved 6407 * one, PL_scratch_langinfo, indicate that a 'return' is desired. That SV 6408 * is then used as scratch for the items that it is most convenient to use 6409 * an SV in calculating. Besides these two functions and initialization, 6410 * the only mention of PL_scratch_langinfo is in the expansion of a single 6411 * macro that is used by the code in this file that desires a non-SV return 6412 * value. 6413 * 6414 * A wart of this interface is that to get the UTF-8 flag of the passed-in 6415 * SV set, you have to also pass a non-null 'utf8ness' parameter. This is 6416 * entirely to prevent the extra expense of calculating UTF-8ness when the 6417 * caller is plain Perl_langinfo(), which doesn't care about this. If that 6418 * seems too kludgy, other mechanisms could be devised. But be aware that 6419 * the SV interface has to have a way to not calculate UTF-8ness, or else 6420 * the reentrant uses could infinitely recurse */ 6421 6422 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 6423 "Entering langinfo_sv_i item=%jd, using locale %s\n", 6424 (PERL_INTMAX_T) item, locale)); 6425 6426 # ifdef HAS_MISSING_LANGINFO_ITEM_ 6427 6428 if (! category_available[cat_index]) { 6429 return emulate_langinfo(item, locale, sv, utf8ness); 6430 } 6431 6432 # endif 6433 6434 /* One might be tempted to avoid any toggling by instead using 6435 * nl_langinfo_l() on platforms that have it. This would entail creating a 6436 * locale object with newlocale() and freeing it afterwards. But doing so 6437 * runs significantly slower than just doing the toggle ourselves. 6438 * lib/locale_threads.t was slowed down by 25% on Ubuntu 22.04 */ 6439 6440 start_DEALING_WITH_MISMATCHED_CTYPE(locale); 6441 6442 const char * orig_switched_locale = toggle_locale_i(cat_index, locale); 6443 6444 /* nl_langinfo() is supposedly thread-safe except for its return value. The 6445 * POSIX 2017 Standard states: 6446 * 6447 * "The pointer returned by nl_langinfo() might be invalidated or the string 6448 * content might be overwritten by a subsequent call to nl_langinfo() in any 6449 * thread or to nl_langinfo_l() in the same thread or the initial thread, by 6450 * subsequent calls to setlocale() with a category corresponding to the 6451 * category of item (see <langinfo.h>) or the category LC_ALL, or by 6452 * subsequent calls to uselocale() which change the category corresponding 6453 * to the category of item." 6454 * 6455 * The implications of this are: 6456 * a) Threaded: nl_langinfo()'s return must be saved in a critical section 6457 * to avoid having another thread's call to it destroying the 6458 * result. That means that the whole call to nl_langinfo() 6459 * plus the save must be done in a critical section. 6460 * b) Unthreaded: No critical section is needed (accomplished by having the 6461 * locks below be no-ops in this case). But any subsequent 6462 * setlocale() or uselocale() could still destroy it. 6463 * Note that before returning, this function restores any 6464 * toggled locale categories. These could easily end up 6465 * calling uselocale() or setlocale(), destroying our 6466 * result. (And in some Configurations, this file currently 6467 * calls nl_langinfo_l() to determine if a uselocale() is 6468 * needed.) So, a copy of the result is made in this case as 6469 * well. 6470 */ 6471 const char * retval = NULL; 6472 utf8ness_t is_utf8 = UTF8NESS_UNKNOWN; 6473 6474 /* Do a bit of extra work so avoid 6475 * switch() { default: ... } 6476 * where the only case in it is the default: */ 6477 # if defined(USE_LOCALE_PAPER) \ 6478 || defined(USE_LOCALE_MEASUREMENT) \ 6479 || defined(USE_LOCALE_ADDRESS) 6480 # define IS_SWITCH 1 6481 # define MAYBE_SWITCH(n) switch(n) 6482 # else 6483 # define IS_SWITCH 0 6484 # define MAYBE_SWITCH(n) 6485 # endif 6486 6487 GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough); 6488 6489 MAYBE_SWITCH(item) { 6490 6491 # if defined(USE_LOCALE_MEASUREMENT) 6492 6493 case _NL_MEASUREMENT_MEASUREMENT: 6494 { 6495 /* An ugly API; only the first byte of the returned char* address means 6496 * anything */ 6497 gwLOCALE_LOCK; 6498 char char_value = nl_langinfo(item)[0]; 6499 gwLOCALE_UNLOCK; 6500 6501 sv_setuv(sv, char_value); 6502 } 6503 6504 goto non_string_common; 6505 6506 # endif 6507 # if defined(USE_LOCALE_ADDRESS) || defined(USE_LOCALE_PAPER) 6508 # if defined(USE_LOCALE_ADDRESS) 6509 6510 case _NL_ADDRESS_COUNTRY_NUM: 6511 6512 /* Some glibc's return random values for this item and locale; 6513 * workaround by special casing it. */ 6514 if (isNAME_C_OR_POSIX(locale)) { 6515 sv_setuv(sv, 0); 6516 goto non_string_common; 6517 } 6518 6519 /* FALLTHROUGH */ 6520 6521 # endif 6522 # if defined(USE_LOCALE_PAPER) 6523 6524 case _NL_PAPER_HEIGHT: case _NL_PAPER_WIDTH: 6525 6526 # endif 6527 6528 { /* A slightly less ugly API; the int portion of the returned char* 6529 * address is an integer. */ 6530 gwLOCALE_LOCK; 6531 int int_value = (int) PTR2UV(nl_langinfo(item)); 6532 gwLOCALE_UNLOCK; 6533 6534 sv_setuv(sv, int_value); 6535 } 6536 6537 # endif 6538 # if IS_SWITCH 6539 # if defined(USE_LOCALE_MEASUREMENT) 6540 6541 non_string_common: 6542 6543 # endif 6544 6545 /* In all cases that get here, the char* instead delivers a numeric 6546 * value, so its UTF-8ness is meaningless */ 6547 is_utf8 = UTF8NESS_IMMATERIAL; 6548 6549 if (sv == PL_scratch_langinfo) { 6550 retval = SvPV_nomg_const_nolen(sv); 6551 } 6552 6553 break; 6554 6555 default: 6556 6557 # endif 6558 6559 /* The rest of the possibilities deliver a true char* pointer to a 6560 * string (or sequence of strings in the case of ALT_DIGITS) */ 6561 gwLOCALE_LOCK; 6562 6563 retval = nl_langinfo(item); 6564 Size_t total_len = strlen(retval); 6565 6566 /* Initialized only to silence some dumber compilers warning that 6567 * might be uninitialized */ 6568 char separator = ';'; 6569 6570 if (UNLIKELY(item == ALT_DIGITS) && total_len > 0) { 6571 6572 /* The return from nl_langinfo(ALT_DIGITS) is specified by the 6573 * 2017 POSIX Standard as a string consisting of "semicolon- 6574 * separated symbols. The first is the alternative symbol 6575 * corresponding to zero, the second is the symbol corresponding to 6576 * one, and so on. Up to 100 alternative symbols may be 6577 * specified". Infuriatingly, Linux does not follow this, and uses 6578 * the least C-language-friendly separator possible, the NUL. In 6579 * case other platforms also violate the standard, the code below 6580 * looks for NUL and any graphic \W character as a potential 6581 * separator. */ 6582 const char * sep_pos = strchr(retval, ';'); 6583 if (! sep_pos) { 6584 sep_pos = strpbrk(retval, " !\"#$%&'()*+,-./:<=>?@[\\]^_`{|}~"); 6585 } 6586 if (sep_pos) { 6587 separator = *sep_pos; 6588 } 6589 else if (strpbrk(retval, "123456789")) { 6590 6591 /* Alternate digits, with the possible exception of 0, 6592 * shouldn't be standard digits, so if we get any back, return 6593 * that there aren't alternate digits. 0 is an exception 6594 * because there may be locales that do not have a zero, such 6595 * as Roman numerals. It could therefore be that alt-0 is 0, 6596 * but alt-1 better be some multi-byte Unicode character(s) 6597 * like U+2160, ROMAN NUMERAL ONE. This clause is necessary 6598 * because the total length of the ASCII digits won't trigger 6599 * the conditional in the next clause that protects against 6600 * non-Standard libc returns, such as in Alpine platforms, but 6601 * multi-byte returns will trigger it */ 6602 retval = ""; 6603 total_len = 0; 6604 } 6605 else if (UNLIKELY(total_len > 6606 2 * UVCHR_SKIP(PERL_UNICODE_MAX) * 4)) 6607 { /* But as a check against the possibility that the separator is 6608 * some other character, look at the length of the returned 6609 * string. If the separator is a NUL, the length will be just 6610 * for the first NUL-terminated segment; if it is some other 6611 * character, there is only a single segment with all returned 6612 * alternate digits, which will be quite a bit longer than just 6613 * the first one. Many locales will always have a leading zero 6614 * to represent 0-9 (hence the 2* in the conditional above). 6615 * The conditional uses the worst case value of the most number 6616 * of byte possible for a Unicode character, and it is possible 6617 * that it requires several characters to represent a single 6618 * value; hence the final multiplier. This length represents a 6619 * conservative upper limit of the number of bytes for the 6620 * alternative representation of 00, but if the string 6621 * represents even only the first 10 alternative digits, it 6622 * will be much longer than that. So to reach here, the 6623 * separator must be some other byte. */ 6624 locale_panic_(Perl_form(aTHX_ 6625 "Can't find separator in ALT_DIGITS" 6626 " representation '%s' for locale '%s'", 6627 _byte_dump_string((U8 *) retval, 6628 total_len, 0), 6629 locale)); 6630 } 6631 else { 6632 separator = '\0'; 6633 6634 /* Must be using NUL to separate the digits. There are up to 6635 * 100 of them. Find the length of the entire sequence. 6636 * 6637 * The only way it could work if fewer is if it ends in two 6638 * NULs. khw has seen cases where there is no 2nd NUL on a 100 6639 * digit return. */ 6640 const char * s = retval + total_len + 1; 6641 6642 for (unsigned int i = 1; i <= 99; i++) { 6643 Size_t len = strlen(s) + 1; 6644 total_len += len; 6645 6646 if (len == 1) { /* Only a NUL */ 6647 break; 6648 } 6649 6650 s += len; 6651 } 6652 } 6653 } 6654 6655 sv_setpvn(sv, retval, total_len); 6656 6657 gwLOCALE_UNLOCK; 6658 6659 /* Convert the ALT_DIGITS separator to a semi-colon if not already */ 6660 if (UNLIKELY(item == ALT_DIGITS) && total_len > 0 && separator != ';') { 6661 6662 /* Operate directly on the string in the SV */ 6663 char * digit_string = SvPVX(sv); 6664 char * s = digit_string; 6665 char * e = s + total_len; 6666 6667 do { 6668 char * this_end = (char *) memchr(s, separator, total_len); 6669 if (! this_end || this_end >= e) { 6670 break; 6671 } 6672 6673 *this_end = ';'; 6674 s = this_end; 6675 } while (1); 6676 } 6677 6678 SvUTF8_off(sv); 6679 retval = SvPV_nomg_const_nolen(sv); 6680 } 6681 6682 GCC_DIAG_RESTORE_STMT; 6683 6684 restore_toggled_locale_i(cat_index, orig_switched_locale); 6685 end_DEALING_WITH_MISMATCHED_CTYPE(locale); 6686 6687 if (utf8ness) { 6688 if (LIKELY(is_utf8 == UTF8NESS_UNKNOWN)) { /* default: case above */ 6689 is_utf8 = get_locale_string_utf8ness_i(retval, 6690 LOCALE_UTF8NESS_UNKNOWN, 6691 locale, cat_index); 6692 } 6693 6694 *utf8ness = is_utf8; 6695 6696 if (*utf8ness == UTF8NESS_YES) { 6697 SvUTF8_on(sv); 6698 } 6699 } 6700 6701 return retval; 6702 } 6703 6704 # undef IS_SWITCH 6705 # undef MAYBE_SWITCH 6706 #endif 6707 #ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION 6708 6709 /* Forward declaration of function that we don't put into embed.fnc so as to 6710 * make its removal easier, as there may not be any extant platforms that need 6711 * it; and the function is located after emulate_langinfo() because it's easier 6712 * to understand when placed in the context of that code */ 6713 STATIC bool 6714 S_maybe_override_codeset(pTHX_ const char * codeset, 6715 const char * locale, 6716 const char ** new_codeset); 6717 #endif 6718 #if ! defined(HAS_NL_LANGINFO) || defined(HAS_MISSING_LANGINFO_ITEM_) 6719 6720 STATIC const char * 6721 S_emulate_langinfo(pTHX_ const PERL_INTMAX_T item, 6722 const char * locale, 6723 SV * sv, 6724 utf8ness_t * utf8ness) 6725 { 6726 PERL_ARGS_ASSERT_EMULATE_LANGINFO; 6727 PERL_UNUSED_ARG(locale); /* Too complicated to specify which 6728 Configurations use this vs which don't */ 6729 6730 /* This emulates nl_langinfo() on platforms: 6731 * 1) where it doesn't exist; or 6732 * 2) where it does exist, but there are categories that it shouldn't be 6733 * called on because they don't exist on the platform or we are 6734 * supposed to always stay in the C locale for them. This function 6735 * has hard-coded in the results for those for the C locale. 6736 * 6737 * This function returns a thread-safe result, valid until its next call 6738 * that uses the same 'sv'. Similarly, the S_langinfo_sv_i() function 6739 * above does the same when nl_langinfo() is available. Its comments 6740 * include a general description of the interface for both it and this 6741 * function. That function should be the one called by code outside this 6742 * little group. If it can't handle the request, it gets handed off to 6743 * this function. 6744 * 6745 * The major platform lacking nl_langinfo() is Windows. It does have 6746 * GetLocaleInfoEx() that could be used to get most of the items, but it 6747 * (and other similar Windows API functions) use what MS calls "locale 6748 * names", whereas the C functions use what MS calls "locale strings". The 6749 * locale string "English_United_States.1252" is equivalent to the locale 6750 * name "en_US". There are tables inside Windows that translate between 6751 * the two forms, but they are not exposed. Also calling setlocale(), then 6752 * calling GetThreadLocale() doesn't work, as the former doesn't change the 6753 * latter's return. Therefore we are stuck using the mechanisms below. */ 6754 6755 /* Almost all the items will have ASCII return values. Set that here, and 6756 * override if necessary */ 6757 utf8ness_t is_utf8 = UTF8NESS_IMMATERIAL; 6758 const char * retval = NULL; 6759 6760 /* This function returns its result either by returning the calculated 6761 * value 'retval' if the 'sv' argument is PL_scratch_langinfo; or for any 6762 * other value of 'sv', it places the result into that 'sv'. For some 6763 * paths through the code, it is more convenient, in the moment, to use one 6764 * or the other to hold the calculated result. And, the calculation could 6765 * end up with the value in both places. At the end, if the caller 6766 * wants the convenient result, we are done; but if it wants the opposite 6767 * type of value, it must be converted. These macros are used to tell the 6768 * code at the end where the value got placed. */ 6769 # define RETVAL_IN_retval -1 6770 # define RETVAL_IN_BOTH 0 6771 # define RETVAL_IN_sv 1 6772 # define isRETVAL_IN_sv(type) ((type) >= RETVAL_IN_BOTH) 6773 # define isRETVAL_IN_retval(type) ((type) <= RETVAL_IN_BOTH) 6774 6775 /* Most calculations place the result in 'retval', so initialize to that, 6776 * and override if necessary */ 6777 int retval_type = RETVAL_IN_retval; 6778 6779 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 6780 "Entering emulate_langinfo item=%jd, using locale %s\n", 6781 item, locale)); 6782 6783 # if defined(HAS_LOCALECONV) \ 6784 && ! defined(HAS_SOME_LANGINFO) \ 6785 && (defined(USE_LOCALE_NUMERIC) || defined(USE_LOCALE_MONETARY)) 6786 6787 locale_category_index cat_index; 6788 const char * localeconv_key; 6789 I32 localeconv_klen; 6790 6791 # endif 6792 6793 GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough); 6794 6795 switch (item) { 6796 6797 # if ! defined(HAS_SOME_LANGINFO) || ! LC_MESSAGES_AVAIL_ 6798 6799 /* The following items have no way khw could figure out how to get except 6800 * via nl_langinfo() */ 6801 case YESEXPR: retval = "^[+1yY]"; break; 6802 case YESSTR: retval = "yes"; break; 6803 case NOEXPR: retval = "^[-0nN]"; break; 6804 case NOSTR: retval = "no"; break; 6805 6806 # endif 6807 # if ! defined(HAS_SOME_LANGINFO) || ! LC_MONETARY_AVAIL_ 6808 # if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV) 6809 # define NEED_USE_LOCALECONV 6810 6811 case CRNCYSTR: 6812 cat_index = LC_MONETARY_INDEX_; 6813 localeconv_key = CURRENCY_SYMBOL_LITERAL; 6814 localeconv_klen = STRLENs(CURRENCY_SYMBOL_LITERAL); 6815 goto use_localeconv; 6816 6817 # else 6818 6819 case CRNCYSTR: 6820 6821 /* The locale's currency symbol may be empty. But if not, the return 6822 * from nl_langinfo() prefixes it with a character that indicates where 6823 * in the monetary value the symbol is to be placed 6824 * a) before, like $9.99 6825 * b) middle, rare, but would like be 9$99 6826 * c) after, like 9.99USD 6827 * 6828 * The POSIX Standard permits an implementation to choose whether or 6829 * not to omit the prefix character if the symbol is empty (the 6830 * placement position is meaningless if there is nothing to place). 6831 * glibc has chosen to always prefix an empty symbol by a minus (which 6832 * is the prefix for 'before' positioning). FreeBSD has chosen to 6833 * return an empty string for an empty symbol. Perl has always 6834 * emulated the glibc way (probably with little thought). */ 6835 retval = "-"; 6836 break; 6837 6838 # endif 6839 # endif 6840 # if ! defined(HAS_SOME_LANGINFO) || ! LC_NUMERIC_AVAIL_ 6841 # if defined(USE_LOCALE_NUMERIC) && defined(HAS_LOCALECONV) 6842 # define NEED_USE_LOCALECONV 6843 6844 case THOUSEP: 6845 cat_index = LC_NUMERIC_INDEX_; 6846 localeconv_key = THOUSANDS_SEP_LITERAL; 6847 localeconv_klen = STRLENs(THOUSANDS_SEP_LITERAL); 6848 goto use_localeconv; 6849 6850 # else 6851 6852 case THOUSEP: 6853 retval = C_thousands_sep; 6854 break; 6855 6856 # endif 6857 6858 case RADIXCHAR: 6859 6860 # if defined(USE_LOCALE_NUMERIC) && defined(HAS_STRTOD) 6861 6862 { 6863 /* khw knows of only three possible radix characters used in the world. 6864 * By far the two most common are comma and dot. We can use strtod() 6865 * to quickly check for those without without much fuss. If it is 6866 * something other than those two, the code drops down and lets 6867 * localeconv() find it. 6868 * 6869 * We don't have to toggle LC_CTYPE here because all locales Perl 6870 * supports are compatible with ASCII, which the two possibilities are. 6871 * */ 6872 const char * orig_switched_locale = toggle_locale_c(LC_NUMERIC, locale); 6873 6874 /* Comma tried first in case strtod() always accepts dot regardless of 6875 * the locale */ 6876 if (strtod("1,5", NULL) > 1.4) { 6877 retval = ","; 6878 } 6879 else if (strtod("1.5", NULL) > 1.4) { 6880 retval = "."; 6881 } 6882 else { 6883 retval = NULL; 6884 } 6885 6886 restore_toggled_locale_c(LC_NUMERIC, orig_switched_locale); 6887 6888 if (retval) { 6889 break; 6890 } 6891 } 6892 6893 # endif /* Trying strtod() */ 6894 6895 /* If gets to here, the strtod() method wasn't compiled, or it failed; 6896 * drop down. 6897 * 6898 * (snprintf() used to be used instead of strtod(), but it was removed 6899 * as being somewhat more clumsy, and maybe non-conforming on some 6900 * platforms. But before resorting to localeconv(), the code that was 6901 * removed by the strtod commit could be inserted here. This seems 6902 * unlikely to be wanted unless some really broken localeconv() shows 6903 * up) */ 6904 6905 # if ! defined(USE_LOCALE_NUMERIC) || ! defined(HAS_LOCALECONV) 6906 6907 retval = C_decimal_point; 6908 break; 6909 6910 # else 6911 # define NEED_USE_LOCALECONV 6912 6913 cat_index = LC_NUMERIC_INDEX_; 6914 localeconv_key = DECIMAL_POINT_LITERAL; 6915 localeconv_klen = STRLENs(DECIMAL_POINT_LITERAL); 6916 6917 # endif 6918 # endif 6919 # ifdef NEED_USE_LOCALECONV 6920 6921 /* These items are available from localeconv(). */ 6922 6923 /* case RADIXCHAR: // May drop down to here in some configurations 6924 case THOUSEP: // Jumps to here 6925 case CRNCYSTR: // Jumps to here */ 6926 use_localeconv: 6927 { 6928 6929 /* The hash gets populated with just the field(s) related to 'item'. */ 6930 HV * result_hv = my_localeconv(item); 6931 SV* string = hv_delete(result_hv, localeconv_key, localeconv_klen, 0); 6932 6933 # ifdef USE_LOCALE_MONETARY 6934 6935 if (item == CRNCYSTR) { 6936 6937 /* CRNCYSTR localeconv() returns a slightly different value 6938 * than the nl_langinfo() API calls for, so have to modify this one 6939 * to conform. We need another value from localeconv() to know 6940 * what to change it to. my_localeconv() has populated the hash 6941 * with exactly both fields. */ 6942 SV* precedes = hv_deletes(result_hv, P_CS_PRECEDES_LITERAL, 0); 6943 if (! precedes) { 6944 locale_panic_("my_localeconv() unexpectedly didn't return" 6945 " a value for " P_CS_PRECEDES_LITERAL); 6946 } 6947 6948 /* The modification is to prefix the localeconv() return with a 6949 * single byte, calculated as follows: */ 6950 const char * prefix = (LIKELY(SvIV(precedes) != -1)) 6951 ? ((precedes != 0) ? "-" : "+") 6952 : "."; 6953 /* (khw couldn't find any documentation that the dot is signalled 6954 * by CHAR_MAX (which we modify to -1), but cygwin uses it thusly, 6955 * and it makes sense given that CHAR_MAX indicates the value isn't 6956 * used, so it neither precedes nor succeeds) */ 6957 6958 /* Perform the modification */ 6959 sv_insert(string, 0, 0, prefix, 1); 6960 } 6961 6962 # endif 6963 6964 /* Here, 'string' contains the value we want to return, and the 6965 * hv_delete() has left it mortalized so its PV may be reused instead of 6966 * copied */ 6967 sv_setsv_nomg(sv, string); 6968 retval_type = RETVAL_IN_sv; 6969 6970 if (utf8ness) { 6971 is_utf8 = get_locale_string_utf8ness_i(SvPVX(sv), 6972 LOCALE_UTF8NESS_UNKNOWN, 6973 locale, 6974 cat_index); 6975 } 6976 6977 SvREFCNT_dec_NN(result_hv); 6978 break; 6979 } 6980 6981 # endif /* Using localeconv() for something or other */ 6982 # undef NEED_USE_LOCALECONV 6983 # if ! defined(HAS_SOME_LANGINFO) || ! LC_CTYPE_AVAIL_ 6984 # ifndef USE_LOCALE_CTYPE 6985 6986 case CODESET: 6987 retval = C_codeset; 6988 break; 6989 6990 # else 6991 6992 case CODESET: 6993 6994 /* The trivial case */ 6995 if (isNAME_C_OR_POSIX(locale)) { 6996 retval = C_codeset; 6997 break; 6998 } 6999 7000 /* If this happens to match our cached value */ 7001 if (PL_in_utf8_CTYPE_locale && strEQ(locale, PL_ctype_name)) { 7002 retval = "UTF-8"; 7003 break; 7004 } 7005 7006 # ifdef WIN32 7007 # ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES 7008 # define CODE_PAGE_FORMAT "%s" 7009 # define CODE_PAGE_FUNCTION nl_langinfo(CODESET) 7010 # else 7011 # define CODE_PAGE_FORMAT "%d" 7012 7013 /* This Windows function retrieves the code page. It is subject to 7014 * change, but is documented, and has been stable for many releases */ 7015 # define CODE_PAGE_FUNCTION ___lc_codepage_func() 7016 # endif 7017 7018 const char * orig_CTYPE_locale; 7019 orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale); 7020 Perl_sv_setpvf(aTHX_ sv, CODE_PAGE_FORMAT, CODE_PAGE_FUNCTION); 7021 retval_type = RETVAL_IN_sv; 7022 7023 /* We just assume the codeset is ASCII; no need to check for it being 7024 * UTF-8 */ 7025 SvUTF8_off(sv); 7026 7027 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); 7028 7029 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "locale='%s' cp=%s\n", 7030 locale, SvPVX(sv))); 7031 break; 7032 7033 # else /* Below is ! Win32 */ 7034 7035 /* The codeset is important, but khw did not figure out a way for it to 7036 * be retrieved on non-Windows boxes without nl_langinfo(). But even 7037 * if we can't get it directly, we can usually determine if it is a 7038 * UTF-8 locale or not. If it is UTF-8, we (correctly) use that for 7039 * the code set. */ 7040 7041 # ifdef HAS_DEFINITIVE_UTF8NESS_DETERMINATION 7042 7043 if (is_locale_utf8(locale)) { 7044 retval = "UTF-8"; 7045 break; 7046 } 7047 7048 # endif 7049 7050 /* Here, the code set has not been found. The only other option khw 7051 * could think of is to see if the codeset is part of the locale name. 7052 * This is very less than ideal; often there is no code set in the 7053 * name; and at other times they even lie. 7054 * 7055 * But there is an XPG standard syntax, which many locales follow: 7056 * 7057 * language[_territory[.codeset]][@modifier] 7058 * 7059 * So we take the part between the dot and any '@' */ 7060 const char * name; 7061 name = strchr(locale, '.'); 7062 if (! name) { 7063 retval = ""; /* Alas, no dot */ 7064 } 7065 else { 7066 7067 /* Don't include the dot */ 7068 name++; 7069 7070 /* The code set name is considered to be everything between the dot 7071 * and any '@', so stop before any '@' */ 7072 const char * modifier = strchr(name, '@'); 7073 if (modifier) { 7074 sv_setpvn(sv, name, modifier - name); 7075 } 7076 else { 7077 sv_setpv(sv, name); 7078 } 7079 SvUTF8_off(sv); 7080 7081 retval_type = RETVAL_IN_sv; 7082 } 7083 7084 # ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION 7085 7086 /* Here, 'retval' contains any codeset name derived from the locale 7087 * name. That derived name may be empty or not necessarily indicative 7088 * of the real codeset. But we can often determine if it should be 7089 * UTF-8, regardless of what the name is. On most platforms, that 7090 * determination is definitive, and was already done. But for this 7091 * code to be compiled, this platform is not one of them. However, 7092 * there are typically tools available to make a very good guess, and 7093 * knowing the derived codeset name improves the quality of that guess. 7094 * The following function overrides the derived codeset name when it 7095 * guesses that it actually should be UTF-8. It could be inlined here, 7096 * but was moved out of this switch() so as to make the switch() 7097 * control flow easier to follow */ 7098 if (isRETVAL_IN_sv(retval_type)) { 7099 retval = SvPVX_const(sv); 7100 retval_type = RETVAL_IN_BOTH; 7101 } 7102 7103 if (S_maybe_override_codeset(aTHX_ retval, locale, &retval)) { 7104 retval_type = RETVAL_IN_retval; 7105 } 7106 7107 # endif 7108 7109 break; 7110 7111 # endif /* ! WIN32 */ 7112 # endif /* USE_LOCALE_CTYPE */ 7113 # endif 7114 7115 /* The _NL_foo items are mostly empty; the rest are copied from Ubuntu C 7116 * locale values. khw fairly arbitrarily decided which of its non-empty 7117 * values to copy and which to change to empty. All the numeric ones needed 7118 * some value */ 7119 7120 # if ! defined(HAS_SOME_LANGINFO) || ! LC_ADDRESS_AVAIL_ 7121 7122 case _NL_ADDRESS_POSTAL_FMT: 7123 case _NL_ADDRESS_COUNTRY_NAME: 7124 case _NL_ADDRESS_COUNTRY_POST: 7125 case _NL_ADDRESS_COUNTRY_AB2: 7126 case _NL_ADDRESS_COUNTRY_AB3: 7127 case _NL_ADDRESS_COUNTRY_CAR: 7128 case _NL_ADDRESS_COUNTRY_ISBN: 7129 case _NL_ADDRESS_LANG_NAME: 7130 case _NL_ADDRESS_LANG_AB: 7131 case _NL_ADDRESS_LANG_TERM: 7132 case _NL_ADDRESS_LANG_LIB: 7133 retval = ""; 7134 break; 7135 7136 case _NL_ADDRESS_COUNTRY_NUM: 7137 sv_setuv(sv, 0); 7138 retval_type = RETVAL_IN_sv; 7139 break; 7140 7141 # endif 7142 # if ! defined(HAS_SOME_LANGINFO) || ! LC_IDENTIFICATION_AVAIL_ 7143 7144 case _NL_IDENTIFICATION_ADDRESS: 7145 case _NL_IDENTIFICATION_CONTACT: 7146 case _NL_IDENTIFICATION_EMAIL: 7147 case _NL_IDENTIFICATION_TEL: 7148 case _NL_IDENTIFICATION_FAX: 7149 case _NL_IDENTIFICATION_LANGUAGE: 7150 case _NL_IDENTIFICATION_AUDIENCE: 7151 case _NL_IDENTIFICATION_APPLICATION: 7152 case _NL_IDENTIFICATION_ABBREVIATION: 7153 retval = ""; 7154 break; 7155 7156 case _NL_IDENTIFICATION_DATE: retval = "1997-12-20"; break; 7157 case _NL_IDENTIFICATION_REVISION: retval = "1.0"; break; 7158 case _NL_IDENTIFICATION_CATEGORY: retval = "i18n:1999"; break; 7159 case _NL_IDENTIFICATION_TERRITORY:retval = "ISO"; break; 7160 7161 case _NL_IDENTIFICATION_TITLE: 7162 retval = "ISO/IEC 14652 i18n FDCC-set"; 7163 break; 7164 7165 case _NL_IDENTIFICATION_SOURCE: 7166 retval = "ISO/IEC JTC1/SC22/WG20 - internationalization"; 7167 break; 7168 7169 # endif 7170 # if ! defined(HAS_SOME_LANGINFO) || ! LC_MEASUREMENT_AVAIL_ 7171 7172 case _NL_MEASUREMENT_MEASUREMENT: 7173 sv_setuv(sv, 1); 7174 retval_type = RETVAL_IN_sv; 7175 break; 7176 7177 # endif 7178 # if ! defined(HAS_SOME_LANGINFO) || ! LC_NAME_AVAIL_ 7179 7180 case _NL_NAME_NAME_FMT: 7181 case _NL_NAME_NAME_GEN: 7182 case _NL_NAME_NAME_MR: 7183 case _NL_NAME_NAME_MRS: 7184 case _NL_NAME_NAME_MISS: 7185 case _NL_NAME_NAME_MS: 7186 retval = ""; 7187 break; 7188 7189 # endif 7190 # if ! defined(HAS_SOME_LANGINFO) || ! LC_PAPER_AVAIL_ 7191 7192 case _NL_PAPER_HEIGHT: 7193 sv_setuv(sv, 297); 7194 retval_type = RETVAL_IN_sv; 7195 break; 7196 7197 case _NL_PAPER_WIDTH: 7198 sv_setuv(sv, 210); 7199 retval_type = RETVAL_IN_sv; 7200 break; 7201 7202 # endif 7203 # if ! defined(HAS_SOME_LANGINFO) || ! LC_TELEPHONE_AVAIL_ 7204 7205 case _NL_TELEPHONE_INT_SELECT: 7206 case _NL_TELEPHONE_INT_PREFIX: 7207 case _NL_TELEPHONE_TEL_DOM_FMT: 7208 retval = ""; 7209 break; 7210 7211 case _NL_TELEPHONE_TEL_INT_FMT: 7212 retval = "+%c %a %l"; 7213 break; 7214 7215 # endif 7216 7217 /* When we have to emulate TIME-related items, this bit of code is compiled 7218 * to have the default: case be a nested switch() which distinguishes 7219 * between legal inputs and unknown ones. This bit does initialization and 7220 * then at the end calls switch(). But when we aren't emulating TIME, by 7221 * the time we get to here all legal inputs have been handled above, and it 7222 * is cleaner to not have a nested switch(). So this bit of code is skipped 7223 * and the other-wise nested default: case is compiled as part of the outer 7224 * (and actually only) switch() */ 7225 # if ! defined(HAS_SOME_LANGINFO) || ! LC_TIME_AVAIL_ 7226 7227 default: /* Anything else that is legal is LC_TIME-related */ 7228 { 7229 7230 const char * format = NULL; 7231 retval = NULL; 7232 7233 # ifdef HAS_STRFTIME 7234 7235 bool return_format = FALSE; 7236 7237 /* Without strftime(), default compiled-in values are returned. 7238 * Otherwise, we generally compute a date as explained below. 7239 * Initialize default values for that computation */ 7240 int mon = 0; 7241 int mday = 1; 7242 int hour = 6; 7243 7244 # endif 7245 7246 /* Nested switch for LC_TIME items, plus the default: case is for 7247 * unknown items */ 7248 switch (item) { 7249 7250 # endif /* ! defined(HAS_SOME_LANGINFO) || ! LC_TIME_AVAIL_ */ 7251 7252 default: 7253 7254 /* On systems with langinfo.h, 'item' is an enum. If we don't 7255 * handle one of those, the code needs to change to be able to do 7256 * so. But otherwise, the parameter can be any int, and so could 7257 * be a garbage value and all we can do is to return that it is 7258 * invalid. */; 7259 # if defined(I_LANGINFO) 7260 7261 Perl_croak_nocontext("panic: Unexpected nl_langinfo() item %jd", 7262 item); 7263 7264 # else 7265 assert(item < 0); /* Make sure using perl_langinfo.h */ 7266 SET_EINVAL; 7267 retval = ""; 7268 break; 7269 # endif 7270 7271 /* Back to the nested switch() */ 7272 # if ! defined(HAS_SOME_LANGINFO) || ! LC_TIME_AVAIL_ 7273 7274 /* The case: statments in this switch are all for LC_TIME related 7275 * values. There are four types of values returned. One type is 7276 * "Give me the name in this locale of the 3rd month of the year" 7277 * (March in an English locale). The second main type is "Give me 7278 * the best format string understood by strftime(), like '%c', for 7279 * formatting the date and time in this locale." The other two 7280 * types are for ERA and ALT_DIGITS, and are explained at the case 7281 * statements for them. 7282 * 7283 * For the first type, suppose we want to find the name of the 3rd 7284 * month of the year. We pass a date/time to strftime() that is 7285 * known to evaluate to sometime in March, along with a format that 7286 * tells strftime() to return the month's name. We then return 7287 * that to our caller. Similarly for the names of the days of the 7288 * week, like "Tuesday". There are also abbreviated versions for 7289 * each of these. 7290 * 7291 * To implement the second type (returning to the caller a string 7292 * containing a format suitable for passing to strftime() ) we 7293 * guess a format, pass that to strftime, and examine its return to 7294 * see if that format is known on this platform. If so, we return 7295 * that guess. Otherwise we return the empty string "". There are 7296 * no second guesses, as there don't seem to be alternatives 7297 * lurking out there. For some formats that are supposed to be 7298 * known to all strftime()s since C89, we just assume that they are 7299 * valid, not bothering to check. The guesses may not be the best 7300 * available for this locale on this platform, but should be good 7301 * enough, so that a native speaker would find them understandable. 7302 * */ 7303 7304 /* Unimplemented by perl; for use with strftime() %E modifier */ 7305 case ERA: retval = ""; break; 7306 7307 # if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME) 7308 7309 case AM_STR: retval = "AM"; break; 7310 case PM_STR: retval = "PM"; break; 7311 # else 7312 case PM_STR: hour = 18; 7313 case AM_STR: 7314 format = "%p"; 7315 break; 7316 # endif 7317 # if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME) 7318 7319 case ABDAY_1: retval = "Sun"; break; 7320 case ABDAY_2: retval = "Mon"; break; 7321 case ABDAY_3: retval = "Tue"; break; 7322 case ABDAY_4: retval = "Wed"; break; 7323 case ABDAY_5: retval = "Thu"; break; 7324 case ABDAY_6: retval = "Fri"; break; 7325 case ABDAY_7: retval = "Sat"; break; 7326 # else 7327 case ABDAY_7: mday++; 7328 case ABDAY_6: mday++; 7329 case ABDAY_5: mday++; 7330 case ABDAY_4: mday++; 7331 case ABDAY_3: mday++; 7332 case ABDAY_2: mday++; 7333 case ABDAY_1: 7334 format = "%a"; 7335 break; 7336 # endif 7337 # if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME) 7338 7339 case DAY_1: retval = "Sunday"; break; 7340 case DAY_2: retval = "Monday"; break; 7341 case DAY_3: retval = "Tuesday"; break; 7342 case DAY_4: retval = "Wednesday"; break; 7343 case DAY_5: retval = "Thursday"; break; 7344 case DAY_6: retval = "Friday"; break; 7345 case DAY_7: retval = "Saturday"; break; 7346 # else 7347 case DAY_7: mday++; 7348 case DAY_6: mday++; 7349 case DAY_5: mday++; 7350 case DAY_4: mday++; 7351 case DAY_3: mday++; 7352 case DAY_2: mday++; 7353 case DAY_1: 7354 format = "%A"; 7355 break; 7356 # endif 7357 # if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME) 7358 case ABMON_1: retval = "Jan"; break; 7359 case ABMON_2: retval = "Feb"; break; 7360 case ABMON_3: retval = "Mar"; break; 7361 case ABMON_4: retval = "Apr"; break; 7362 case ABMON_5: retval = "May"; break; 7363 case ABMON_6: retval = "Jun"; break; 7364 case ABMON_7: retval = "Jul"; break; 7365 case ABMON_8: retval = "Aug"; break; 7366 case ABMON_9: retval = "Sep"; break; 7367 case ABMON_10: retval = "Oct"; break; 7368 case ABMON_11: retval = "Nov"; break; 7369 case ABMON_12: retval = "Dec"; break; 7370 # else 7371 case ABMON_12: mon++; 7372 case ABMON_11: mon++; 7373 case ABMON_10: mon++; 7374 case ABMON_9: mon++; 7375 case ABMON_8: mon++; 7376 case ABMON_7: mon++; 7377 case ABMON_6: mon++; 7378 case ABMON_5: mon++; 7379 case ABMON_4: mon++; 7380 case ABMON_3: mon++; 7381 case ABMON_2: mon++; 7382 case ABMON_1: 7383 format = "%b"; 7384 break; 7385 # endif 7386 # if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME) 7387 7388 case MON_1: retval = "January"; break; 7389 case MON_2: retval = "February"; break; 7390 case MON_3: retval = "March"; break; 7391 case MON_4: retval = "April"; break; 7392 case MON_5: retval = "May"; break; 7393 case MON_6: retval = "June"; break; 7394 case MON_7: retval = "July"; break; 7395 case MON_8: retval = "August"; break; 7396 case MON_9: retval = "September";break; 7397 case MON_10: retval = "October"; break; 7398 case MON_11: retval = "November"; break; 7399 case MON_12: retval = "December"; break; 7400 # else 7401 case MON_12: mon++; 7402 case MON_11: mon++; 7403 case MON_10: mon++; 7404 case MON_9: mon++; 7405 case MON_8: mon++; 7406 case MON_7: mon++; 7407 case MON_6: mon++; 7408 case MON_5: mon++; 7409 case MON_4: mon++; 7410 case MON_3: mon++; 7411 case MON_2: mon++; 7412 case MON_1: 7413 format = "%B"; 7414 break; 7415 # endif 7416 # ifndef HAS_STRFTIME 7417 7418 /* If no strftime() on this system, no format will be recognized, so 7419 * return empty */ 7420 case D_FMT: case T_FMT: case D_T_FMT: 7421 case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: 7422 case T_FMT_AMPM: 7423 retval = ""; 7424 break; 7425 # else 7426 /* These strftime formats are defined by C89, so we assume that 7427 * strftime supports them, and so are returned unconditionally; they 7428 * may not be what the locale actually says, but should give good 7429 * enough results for someone using them as formats (as opposed to 7430 * trying to parse them to figure out what the locale says). The 7431 * other format items are actually tested to verify they work on the 7432 * platform */ 7433 case D_FMT: retval = "%x"; break; 7434 case T_FMT: retval = "%X"; break; 7435 case D_T_FMT: retval = "%c"; break; 7436 7437 /* This format isn't in C89; test that it actually works on the 7438 * platform */ 7439 case T_FMT_AMPM: 7440 format = "%r"; 7441 return_format = TRUE; 7442 break; 7443 7444 # if defined(WIN32) || ! defined(USE_LOCALE_TIME) 7445 7446 /* strftime() on Windows doesn't have the POSIX (beyond C89) 7447 * extensions that would allow it to recover these, so use the plain 7448 * non-ERA formats. Also, when LC_TIME is constrained to the C 7449 * locale, the %E modifier is useless, so don't return it. */ 7450 case ERA_D_FMT: retval = "%x"; break; 7451 case ERA_T_FMT: retval = "%X"; break; 7452 case ERA_D_T_FMT: retval = "%c"; break; 7453 # else 7454 case ERA_D_FMT: 7455 format = "%Ex"; 7456 return_format = TRUE; /* Test that this works on the platform */ 7457 break; 7458 7459 case ERA_T_FMT: 7460 format = "%EX"; 7461 return_format = TRUE; 7462 break; 7463 7464 case ERA_D_T_FMT: 7465 format = "%Ec"; 7466 return_format = TRUE; 7467 break; 7468 # endif 7469 # endif 7470 # if defined(WIN32) || ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME) 7471 7472 case ALT_DIGITS: retval = ""; break; 7473 # else 7474 # define CAN_BE_ALT_DIGITS 7475 7476 case ALT_DIGITS: 7477 format = "%Ow"; /* Find the alternate digit for 0 */ 7478 break; 7479 # endif 7480 7481 } /* End of inner switch() */ 7482 7483 /* The inner switch() above has set 'retval' iff that is the final 7484 * answer */ 7485 if (retval) { 7486 break; 7487 } 7488 7489 /* And it hasn't set 'format' iff it can't figure out a good value on 7490 * this platform. */ 7491 if (! format) { 7492 retval = ""; 7493 break; 7494 } 7495 7496 # ifdef HAS_STRFTIME 7497 7498 /* Here we have figured out what to call strftime() with */ 7499 7500 struct tm mytm; 7501 const char * orig_TIME_locale 7502 = toggle_locale_c_unless_locking(LC_TIME, locale); 7503 7504 /* The year was deliberately chosen so that January 1 is on the 7505 * first day of the week. Since we're only getting one thing at a 7506 * time, it all works */ 7507 ints_to_tm(&mytm, locale, 30, 30, hour, mday, mon, 2011, 0, 0, 0); 7508 bool succeeded; 7509 if (utf8ness) { 7510 succeeded = strftime8(format, 7511 sv, 7512 locale, 7513 &mytm, 7514 7515 /* All possible formats specified above are 7516 * entirely ASCII */ 7517 UTF8NESS_IMMATERIAL, 7518 7519 &is_utf8, 7520 false /* not calling from sv_strftime */ 7521 ); 7522 } 7523 else { 7524 succeeded = strftime_tm(format, sv, locale, &mytm); 7525 } 7526 7527 restore_toggled_locale_c_unless_locking(LC_TIME, orig_TIME_locale); 7528 7529 if (UNLIKELY(! succeeded)) { 7530 retval = ""; 7531 break; 7532 } 7533 7534 # ifdef CAN_BE_ALT_DIGITS 7535 7536 if (LIKELY(item != ALT_DIGITS)) 7537 7538 # endif 7539 7540 { 7541 7542 /* If to return what strftime() returns, are done */ 7543 if (! return_format) { 7544 retval_type = RETVAL_IN_sv; 7545 break; 7546 } 7547 7548 /* Here are to return the format, not the value. This is used when 7549 * we are testing if the format we expect to return is legal on 7550 * this platform. We have passed the format, say "%r, to 7551 * strftime(), and now have in 'sv' what strftime processed it 7552 * to be. But the caller doesnt't want that; it wants the actual 7553 * %r, if it is understood on this platform, and "" if it isn't. 7554 * Some strftime()s return "" for an unknown format. (None of the 7555 * formats exposed by langinfo can have "" be a legal result.) 7556 * Other strftime()s return the format unchanged if not understood. 7557 * So if we pass "%r" to strftime(), and it's illegal, we will get 7558 * back either "" or "%r", and we return "" to our caller. If the 7559 * strftime() return is anything else, we conclude that "%r" is 7560 * understood by the platform, and return "%r". */ 7561 if (strEQ(SvPVX(sv), format)) { 7562 retval = ""; 7563 } 7564 else { 7565 retval = format; 7566 } 7567 7568 /* A format is always in ASCII */ 7569 is_utf8 = UTF8NESS_IMMATERIAL; 7570 7571 break; 7572 } 7573 7574 # ifdef CAN_BE_ALT_DIGITS 7575 7576 /* Here, the item is 'ALT_DIGITS' and 'sv' contains the zeroth 7577 * alternate digit. If empty, return that there aren't alternate 7578 * digits */ 7579 Size_t alt0_len = SvCUR(sv); 7580 if (alt0_len == 0) { 7581 retval_type = RETVAL_IN_sv; 7582 break; 7583 } 7584 7585 /* ALT_DIGITS requires special handling because it requires up to 100 7586 * values. Below we generate those by using the %O modifier to 7587 * strftime() formats. 7588 * 7589 * We already have the alternate digit for zero in 'sv', generated 7590 * using the %Ow format, which was used because it seems least likely 7591 * to have a leading zero. But some locales return the equivalent of 7592 * 00 anyway. If the first half of 'sv' is identical to the second 7593 * half, assume that is the case, and use just the first half */ 7594 if ((alt0_len & 1) == 0) { 7595 Size_t half_alt0_len = alt0_len / 2; 7596 if (strnEQ(SvPVX(sv), SvPVX(sv) + half_alt0_len, half_alt0_len)) { 7597 alt0_len = half_alt0_len; 7598 SvCUR_set(sv, alt0_len); 7599 } 7600 } 7601 7602 sv_catpvn_nomg (sv, ";", 1); 7603 7604 /* Many of the remaining digits have representations that include at 7605 * least two 0-sized strings */ 7606 SV* alt_dig_sv = newSV(2 * alt0_len); 7607 7608 /* Various %O formats can be used to derive the alternate digits. Only 7609 * %Oy can go up to the full 100 values. If it doesn't work, we try 7610 * various fallbacks in decreasing order of how many values they can 7611 * deliver. maxes[] tells the highest value that the format applies 7612 * to; offsets[] compensates for 0-based vs 1-based indices; and vars[] 7613 * holds what field in the 'struct tm' to applies to the corresponding 7614 * format */ 7615 int year, min, sec; 7616 const char * fmts[] = {"%Oy", "%OM", "%OS", "%Od", "%OH", "%Om", "%Ow" }; 7617 const Size_t maxes[] = { 99, 59, 59, 31, 23, 11, 6 }; 7618 const int offsets[] = { 0, 0, 0, 1, 0, 1, 0 }; 7619 int * vars[] = {&year, &min, &sec, &mday, &hour, &mon, &mday }; 7620 Size_t j = 0; /* Current index into the above tables */ 7621 7622 orig_TIME_locale = toggle_locale_c_unless_locking(LC_TIME, locale); 7623 7624 for (unsigned int i = 1; i <= 99; i++) { 7625 struct tm mytm; 7626 7627 redo: 7628 if (j >= C_ARRAY_LENGTH(fmts)) { 7629 break; /* Exhausted formats early; can't continue */ 7630 } 7631 7632 if (i > maxes[j]) { 7633 j++; /* Exhausted this format; try next one */ 7634 goto redo; 7635 } 7636 7637 year = (strchr(fmts[j], 'y')) ? 1900 : 2011; 7638 hour = 0; 7639 min = 0; 7640 sec = 0; 7641 mday = 1; 7642 mon = 0; 7643 7644 /* Change the variable corresponding to this format to the 7645 * current time being run in 'i' */ 7646 *(vars[j]) += i - offsets[j]; 7647 7648 /* Do the strftime. Once we have determined the UTF8ness (if 7649 * we want it), assume the rest will be the same, and use 7650 * strftime_tm(), which doesn't recalculate UTF8ness */ 7651 ints_to_tm(&mytm, locale, sec, min, hour, mday, mon, year, 0, 0, 0); 7652 if (utf8ness && is_utf8 != UTF8NESS_NO && is_utf8 != UTF8NESS_YES) { 7653 succeeded = strftime8(fmts[j], 7654 alt_dig_sv, 7655 locale, 7656 &mytm, 7657 UTF8NESS_IMMATERIAL, 7658 &is_utf8, 7659 false /* not calling from sv_strftime */ 7660 ); 7661 } 7662 else { 7663 succeeded = strftime_tm(fmts[j], alt_dig_sv, locale, &mytm); 7664 } 7665 7666 /* If didn't recognize this format, try the next */ 7667 if (UNLIKELY(! succeeded)) { 7668 j++; 7669 goto redo; 7670 } 7671 7672 const char * current = SvPVX(alt_dig_sv); 7673 7674 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 7675 "i=%d, format=%s, alt='%s'\n", 7676 i, fmts[j], current)); 7677 7678 7679 /* If it returned regular digits, give up on this format, to try 7680 * the next candidate one */ 7681 if (strpbrk(current, "0123456789")) { 7682 j++; 7683 goto redo; 7684 } 7685 7686 /* If there is a leading alternate zero, skip past it, to get the 7687 * second one in the string. The first 'alt0_len' bytes in 'sv' 7688 * will be the alternate-zero representation */ 7689 if (strnEQ(current, SvPVX(sv), alt0_len)) { 7690 current += alt0_len; 7691 } 7692 7693 /* Append this number to the ongoing list, including the separator. 7694 * */ 7695 sv_catpv_nomg (sv, current); 7696 sv_catpvn_nomg (sv, ";", 1); 7697 } /* End of loop generating ALT_DIGIT strings */ 7698 7699 /* Above we accepted 0 for alt-0 in case the locale doesn't have a 7700 * zero, but we rejected any other ASCII digits. Now that we have 7701 * processed everything, if that 0 is the only thing we found, it was a 7702 * false positive, and the locale doesn't have alternate digits */ 7703 if (SvCUR(sv) == alt0_len + 1) { 7704 SvCUR_set(sv, 0); 7705 } 7706 7707 SvREFCNT_dec_NN(alt_dig_sv); 7708 7709 restore_toggled_locale_c_unless_locking(LC_TIME, orig_TIME_locale); 7710 7711 retval_type = RETVAL_IN_sv; 7712 break; 7713 7714 # endif /* End of CAN_BE_ALT_DIGITS */ 7715 # endif /* End of HAS_STRFTIME */ 7716 7717 } /* End of braced group for outer switch 'default:' case */ 7718 7719 # endif 7720 7721 } /* Giant switch() of nl_langinfo() items */ 7722 7723 GCC_DIAG_RESTORE_STMT; 7724 7725 if (sv != PL_scratch_langinfo) { /* Caller wants return in 'sv' */ 7726 if (! isRETVAL_IN_sv(retval_type)) { 7727 sv_setpv(sv, retval); 7728 SvUTF8_off(sv); 7729 } 7730 7731 if (utf8ness) { 7732 *utf8ness = is_utf8; 7733 if (is_utf8 == UTF8NESS_YES) { 7734 SvUTF8_on(sv); 7735 } 7736 } 7737 7738 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 7739 "Leaving emulate_langinfo item=%jd, using locale %s\n", 7740 item, locale)); 7741 7742 /* The caller shouldn't also be wanting a 'retval'; make sure segfaults 7743 * if they call this wrong */ 7744 return NULL; 7745 } 7746 7747 /* Here, wants a 'retval' return. Extract that if not already there. */ 7748 if (! isRETVAL_IN_retval(retval_type)) { 7749 retval = SvPV_nolen(sv); 7750 } 7751 7752 /* Here, 'retval' started as a simple value, or has been converted into 7753 * being simple */ 7754 if (utf8ness) { 7755 *utf8ness = is_utf8; 7756 } 7757 7758 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 7759 "Leaving emulate_langinfo item=%jd, using locale %s\n", 7760 item, locale)); 7761 return retval; 7762 7763 # undef RETVAL_IN_retval 7764 # undef RETVAL_IN_BOTH 7765 # undef RETVAL_IN_sv 7766 # undef isRETVAL_IN_sv 7767 # undef isRETVAL_IN_retval 7768 7769 } 7770 7771 #endif /* Needs emulate_langinfo() */ 7772 #ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION 7773 7774 STATIC bool 7775 S_maybe_override_codeset(pTHX_ const char * codeset, 7776 const char * locale, 7777 const char ** new_codeset) 7778 { 7779 # define NAME_INDICATES_UTF8 0x1 7780 # define MB_CUR_MAX_SUGGESTS_UTF8 0x2 7781 7782 /* Override 'codeset' with UTF-8 if this routine guesses that it should be. 7783 * Conversely (but rarely), "UTF-8" in the locale name might be wrong. We 7784 * return "" as the code set name if we find that to be the case. */ 7785 7786 unsigned int lean_towards_being_utf8 = 0; 7787 if (is_codeset_name_UTF8(codeset)) { 7788 lean_towards_being_utf8 |= NAME_INDICATES_UTF8; 7789 } 7790 7791 const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale); 7792 7793 /* For this portion of the file to compile, some C99 functions aren't 7794 * available to us, even though we now require C99. So, something must be 7795 * wrong with them. The code here should be good enough to work around 7796 * this issue, but should the need arise, comments in S_is_locale_utf8() 7797 * list some alternative C99 functions that could be tried. 7798 * 7799 * But MB_CUR_MAX is a C89 construct that helps a lot, is simple for a 7800 * vendor to implement, and our experience with it is that it works well on 7801 * a variety of platforms. We have found that it returns a too-large 7802 * number on some platforms for the C locale, but for no others. That 7803 * locale was already ruled out in the code that called this function. (If 7804 * MB_CUR_MAX returned too small a number, that would break a lot of 7805 * things, and likely would be quickly corrected by the vendor.) khw has 7806 * some confidence that it doesn't return >1 when 1 is meant, as that would 7807 * trigger a Perl warning, and we've had no reports of invalid occurrences 7808 * of such. */ 7809 # ifdef MB_CUR_MAX 7810 7811 /* If there are fewer bytes available in this locale than are required to 7812 * represent the largest legal UTF-8 code point, this definitely isn't a 7813 * UTF-8 locale, even if the locale name says it is. */ 7814 const int mb_cur_max = MB_CUR_MAX; 7815 if (mb_cur_max < (int) UNISKIP(PERL_UNICODE_MAX)) { 7816 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); 7817 7818 if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) { 7819 *new_codeset = ""; /* The name is wrong; override */ 7820 return true; 7821 } 7822 7823 return false; 7824 } 7825 7826 /* But if the locale could be UTF-8, and also the name corroborates this, 7827 * assume it is so */ 7828 if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) { 7829 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); 7830 return false; 7831 } 7832 7833 restore_toggled_locale_c_if_locking(LC_CTYPE, orig_CTYPE_locale); 7834 7835 /* Here, the name doesn't indicate UTF-8, but MB_CUR_MAX indicates it could 7836 * be. khw knows of only two other locales in the world, EUC-TW and GB 7837 * 18030, that legitimately require this many bytes (4). So, if the name 7838 * is one of those, MB_CUR_MAX has corroborated that. */ 7839 bool name_implies_non_utf8 = false; 7840 if (foldEQ(codeset, "GB", 2)) { 7841 const char * s = codeset + 2; 7842 if (*s == '-' || *s == '_') { 7843 s++; 7844 } 7845 7846 if strEQ(s, "18030") { 7847 name_implies_non_utf8 = true; 7848 } 7849 } 7850 else if (foldEQ(codeset, "EUC", 3)) { 7851 const char * s = codeset + 3; 7852 if (*s == '-' || *s == '_') { 7853 s++; 7854 } 7855 7856 if (foldEQ(s, "TW", 2)) { 7857 name_implies_non_utf8 = true; 7858 } 7859 } 7860 7861 /* Otherwise, the locale is likely UTF-8 */ 7862 if (! name_implies_non_utf8) { 7863 lean_towards_being_utf8 |= MB_CUR_MAX_SUGGESTS_UTF8; 7864 } 7865 7866 /* (In both those two other multibyte locales, the single byte characters 7867 * are the same as ASCII. No multi-byte character in EUC-TW is legal UTF-8 7868 * (since the first byte of each is a continuation). GB 18030 has no three 7869 * byte sequences, and none of the four byte ones is legal UTF-8 (as the 7870 * second byte for these is a non-continuation). But every legal UTF-8 two 7871 * byte sequence is also legal in GB 18030, though none have the same 7872 * meaning, and no Han code point expressed in UTF-8 is two byte. So the 7873 * further tests below which look for native expressions of currency and 7874 * time will not return two byte sequences, hence they will reliably rule 7875 * out such a locale as being UTF-8, even if the code set name checked 7876 * above isn't correct.) */ 7877 7878 # endif /* has MB_CUR_MAX */ 7879 7880 /* Here, MB_CUR_MAX is not available, or was inconclusive. What we do is 7881 * to look at various strings associated with the locale: 7882 * 1) If any are illegal UTF-8, the locale can't be UTF-8. 7883 * 2) If all are legal UTF-8, and some non-ASCII characters are present, 7884 * it is likely to be UTF-8, because of the strictness of UTF-8 7885 * syntax. So assume it is UTF-8 7886 * 3) If all are ASCII and the locale name and/or MB_CUR_MAX indicate 7887 * UTF-8, assume the locale is UTF-8. 7888 * 4) Otherwise, assume the locale isn't UTF-8 7889 * 7890 * To save cycles, if the locale name indicates it is a UTF-8 locale, we 7891 * stop looking at the first instance with legal non-ASCII UTF-8. It is 7892 * very unlikely this combination is coincidental. */ 7893 7894 utf8ness_t strings_utf8ness = UTF8NESS_UNKNOWN; 7895 7896 /* List of strings to look at */ 7897 const int trials[] = { 7898 7899 # if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV) 7900 7901 /* The first string tried is the locale currency name. Often that will 7902 * be in the native script. 7903 * 7904 * But this is usable only if localeconv() is available, as that's the 7905 * way we find out the currency symbol. */ 7906 7907 CRNCYSTR, 7908 7909 # endif 7910 # ifdef USE_LOCALE_TIME 7911 7912 /* We can also try various strings associated with LC_TIME, like the names 7913 * of months or days of the week */ 7914 7915 DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7, 7916 MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8, 7917 MON_9, MON_10, MON_11, MON_12, 7918 ALT_DIGITS, AM_STR, PM_STR, 7919 ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6, ABDAY_7, 7920 ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6, 7921 ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12 7922 7923 # endif 7924 7925 }; 7926 7927 # ifdef USE_LOCALE_TIME 7928 7929 /* The code in the recursive call below can handle switching the locales, 7930 * but by doing it now here, that code will check and discover that there 7931 * is no need to switch then restore, avoiding those each loop iteration. 7932 * 7933 * But don't do this if toggling actually creates a critical section, so as 7934 * to minimize the amount of time spent in each critical section. */ 7935 const char * orig_TIME_locale = 7936 toggle_locale_c_unless_locking(LC_TIME, locale); 7937 7938 # endif 7939 7940 /* The trials array may consist of strings from two different locale 7941 * categories. The call to langinfo_i() below needs to pass the proper 7942 * category for each string. There is a max of 1 trial for LC_MONETARY; 7943 * the rest are LC_TIME. So the array is arranged so the LC_MONETARY item 7944 * (if any) is first, and all subsequent iterations will use LC_TIME. 7945 * These #ifdefs set up the values for all possible combinations. */ 7946 # if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV) 7947 7948 locale_category_index cat_index = LC_MONETARY_INDEX_; 7949 7950 # ifdef USE_LOCALE_TIME 7951 7952 const locale_category_index follow_on_cat_index = LC_TIME_INDEX_; 7953 assert(trials[1] == DAY_1); /* Make sure only a single non-time entry */ 7954 7955 # else 7956 7957 /* Effectively out-of-bounds, as there is only the monetary entry */ 7958 const locale_category_index follow_on_cat_index = LC_ALL_INDEX_; 7959 7960 # endif 7961 # elif defined(USE_LOCALE_TIME) 7962 7963 locale_category_index cat_index = LC_TIME_INDEX_; 7964 const locale_category_index follow_on_cat_index = LC_TIME_INDEX_; 7965 7966 # else 7967 7968 /* Effectively out-of-bounds, as here there are no trial entries at all. 7969 * This allows this code to compile, but there are no strings to test, and 7970 * so the answer will always be non-UTF-8. */ 7971 locale_category_index cat_index = LC_ALL_INDEX_; 7972 const locale_category_index follow_on_cat_index = LC_ALL_INDEX_; 7973 7974 # endif 7975 7976 /* We will need to use the reentrant interface. */ 7977 SV * sv = newSVpvs(""); 7978 7979 /* Everything set up; look through all the strings */ 7980 for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(trials); i++) { 7981 7982 /* To prevent infinite recursive calls, we don't ask for the UTF-8ness 7983 * of the string. Instead we examine the result below */ 7984 langinfo_sv_i(trials[i], cat_index, locale, sv, NULL); 7985 7986 cat_index = follow_on_cat_index; 7987 7988 const char * result = SvPVX(sv); 7989 const Size_t len = strlen(result); 7990 const U8 * first_variant; 7991 7992 /* If the string is identical whether or not it is encoded as UTF-8, it 7993 * isn't helpful in determining UTF8ness. */ 7994 if (is_utf8_invariant_string_loc((U8 *) result, len, &first_variant)) 7995 { 7996 continue; 7997 } 7998 7999 /* Here, has non-ASCII. If not legal UTF-8, isn't a UTF-8 locale */ 8000 if (! is_strict_utf8_string(first_variant, 8001 len - (first_variant - (U8 *) result))) 8002 { 8003 strings_utf8ness = UTF8NESS_NO; 8004 break; 8005 } 8006 8007 /* Here, is a legal non-ASCII UTF-8 string; tentatively set the return 8008 * to YES; possibly overridden by later iterations */ 8009 strings_utf8ness = UTF8NESS_YES; 8010 8011 /* But if this corroborates our expectation, quit now */ 8012 if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) { 8013 break; 8014 } 8015 } 8016 8017 # ifdef USE_LOCALE_TIME 8018 8019 restore_toggled_locale_c_unless_locking(LC_TIME, orig_TIME_locale); 8020 8021 # endif 8022 8023 restore_toggled_locale_c_unless_locking(LC_CTYPE, orig_CTYPE_locale); 8024 8025 if (strings_utf8ness == UTF8NESS_NO) { 8026 return false; /* No override */ 8027 } 8028 8029 /* Here all tested strings are legal UTF-8. 8030 * 8031 * Above we set UTF8NESS_YES if any string wasn't ASCII. But even if they 8032 * are all ascii, and the locale name indicates it is a UTF-8 locale, 8033 * assume the locale is UTF-8. */ 8034 if (lean_towards_being_utf8) { 8035 strings_utf8ness = UTF8NESS_YES; 8036 } 8037 8038 if (strings_utf8ness == UTF8NESS_YES) { 8039 *new_codeset = "UTF-8"; 8040 return true; 8041 } 8042 8043 /* Here, nothing examined indicates that the codeset is or isn't UTF-8. 8044 * But what is it? The other locale categories are not likely to be of 8045 * further help: 8046 * 8047 * LC_NUMERIC Only a few locales in the world have a non-ASCII radix or 8048 * group separator. 8049 * LC_CTYPE This code wouldn't be compiled if mbtowc() existed and was 8050 * reliable. This is unlikely in C99. There are other 8051 * functions that could be used instead, but are they going to 8052 * exist, and be able to distinguish between UTF-8 and 8859-1? 8053 * Deal with this only if it becomes necessary. 8054 * LC_MESSAGES The strings returned from strerror() would seem likely 8055 * candidates, but experience has shown that many systems 8056 * don't actually have translations installed for them. They 8057 * are instead always in English, so everything in them is 8058 * ASCII, which is of no help to us. A Configure probe could 8059 * possibly be written to see if this platform has non-ASCII 8060 * error messages. But again, wait until it turns out to be 8061 * an actual problem. 8062 * 8063 * Things like YESSTR, NOSTR, might not be in ASCII, but need 8064 * nl_langinfo() to access, which we don't have. 8065 */ 8066 8067 /* Otherwise, assume the locale isn't UTF-8. This can be wrong if we don't 8068 * have MB_CUR_MAX, and the locale is English without UTF-8 in its name, 8069 * and with a dollar currency symbol. */ 8070 return false; /* No override */ 8071 } 8072 8073 # endif /* ! HAS_DEFINITIVE_UTF8NESS_DETERMINATION */ 8074 8075 /* 8076 =for apidoc_section $time 8077 =for apidoc sv_strftime_tm 8078 =for apidoc_item my_strftime 8079 8080 These implement the libc strftime(). 8081 8082 On failure, they return NULL, and set C<errno> to C<EINVAL>. 8083 8084 C<sv_strftime_tm> is preferred, as it transparently handles the UTF-8ness of 8085 the current locale, the input C<fmt>, and the returned result. Only if the 8086 current C<LC_TIME> locale is a UTF-8 one (and S<C<use bytes>> is not in effect) 8087 will the result be marked as UTF-8. 8088 8089 C<sv_strftime_tm> takes a pointer to a filled-in S<C<struct tm>> parameter. It 8090 ignores the values of the C<wday> and C<yday> fields in it. The other fields 8091 give enough information to accurately calculate these values, and are used for 8092 that purpose. 8093 8094 The caller assumes ownership of the returned SV with a reference count of 1. 8095 8096 C<my_strftime> is kept for backwards compatibility. Knowing if its result 8097 should be considered UTF-8 or not requires significant extra logic. 8098 8099 The return value is a pointer to the formatted result (which MUST be arranged 8100 to be FREED BY THE CALLER). This allows this function to increase the buffer 8101 size as needed, so that the caller doesn't have to worry about that, unlike 8102 libc C<strftime()>. 8103 8104 The C<wday>, C<yday>, and C<isdst> parameters are ignored by C<my_strftime>. 8105 Daylight savings time is never considered to exist, and the values returned for 8106 the other two fields (if C<fmt> even calls for them) are calculated from the 8107 other parameters, without need for referencing these. 8108 8109 Note that both functions are always executed in the underlying 8110 C<LC_TIME> locale of the program, giving results based on that locale. 8111 8112 =cut 8113 */ 8114 8115 char * 8116 Perl_my_strftime(pTHX_ const char *fmt, int sec, int min, int hour, 8117 int mday, int mon, int year, int wday, int yday, 8118 int isdst) 8119 { /* Documented above */ 8120 PERL_ARGS_ASSERT_MY_STRFTIME; 8121 8122 #ifdef USE_LOCALE_TIME 8123 const char * locale = querylocale_c(LC_TIME); 8124 #else 8125 const char * locale = "C"; 8126 #endif 8127 8128 struct tm mytm; 8129 ints_to_tm(&mytm, locale, sec, min, hour, mday, mon, year, wday, yday, 8130 isdst); 8131 if (! strftime_tm(fmt, PL_scratch_langinfo, locale, &mytm)) { 8132 return NULL; 8133 } 8134 8135 return savepv(SvPVX(PL_scratch_langinfo)); 8136 } 8137 8138 SV * 8139 Perl_sv_strftime_ints(pTHX_ SV * fmt, int sec, int min, int hour, 8140 int mday, int mon, int year, int wday, 8141 int yday, int isdst) 8142 { /* Documented above */ 8143 PERL_ARGS_ASSERT_SV_STRFTIME_INTS; 8144 8145 #ifdef USE_LOCALE_TIME 8146 const char * locale = querylocale_c(LC_TIME); 8147 #else 8148 const char * locale = "C"; 8149 #endif 8150 8151 struct tm mytm; 8152 ints_to_tm(&mytm, locale, sec, min, hour, mday, mon, year, wday, yday, 8153 isdst); 8154 return sv_strftime_common(fmt, locale, &mytm); 8155 } 8156 8157 SV * 8158 Perl_sv_strftime_tm(pTHX_ SV * fmt, const struct tm * mytm) 8159 { /* Documented above */ 8160 PERL_ARGS_ASSERT_SV_STRFTIME_TM; 8161 8162 #ifdef USE_LOCALE_TIME 8163 8164 return sv_strftime_common(fmt, querylocale_c(LC_TIME), mytm); 8165 8166 #else 8167 8168 return sv_strftime_common(fmt, "C", mytm); 8169 8170 #endif 8171 8172 } 8173 8174 SV * 8175 S_sv_strftime_common(pTHX_ SV * fmt, 8176 const char * locale, 8177 const struct tm * mytm) 8178 { /* Documented above */ 8179 PERL_ARGS_ASSERT_SV_STRFTIME_COMMON; 8180 8181 STRLEN fmt_cur; 8182 const char *fmt_str = SvPV_const(fmt, fmt_cur); 8183 8184 utf8ness_t fmt_utf8ness = (SvUTF8(fmt) && LIKELY(! IN_BYTES)) 8185 ? UTF8NESS_YES 8186 : UTF8NESS_UNKNOWN; 8187 8188 utf8ness_t result_utf8ness; 8189 8190 /* Use a fairly generous guess as to how big the buffer needs to be, so as 8191 * to get almost all the typical returns to fit without the called function 8192 * having to realloc; this is a somewhat educated guess, but feel free to 8193 * tweak it. */ 8194 SV* sv = newSV(MAX(fmt_cur * 2, 64)); 8195 if (! strftime8(fmt_str, 8196 sv, 8197 locale, 8198 mytm, 8199 fmt_utf8ness, 8200 &result_utf8ness, 8201 true /* calling from sv_strftime */ )) 8202 { 8203 return NULL; 8204 } 8205 8206 8207 if (result_utf8ness == UTF8NESS_YES) { 8208 SvUTF8_on(sv); 8209 } 8210 8211 return sv; 8212 } 8213 8214 STATIC void 8215 S_ints_to_tm(pTHX_ struct tm * mytm, 8216 const char * locale, 8217 int sec, int min, int hour, int mday, int mon, int year, 8218 int wday, int yday, int isdst) 8219 { 8220 /* Create a struct tm structure from the input time-related integer 8221 * variables for 'locale' */ 8222 8223 /* Override with the passed-in values */ 8224 Zero(mytm, 1, struct tm); 8225 mytm->tm_sec = sec; 8226 mytm->tm_min = min; 8227 mytm->tm_hour = hour; 8228 mytm->tm_mday = mday; 8229 mytm->tm_mon = mon; 8230 mytm->tm_year = year; 8231 mytm->tm_wday = wday; 8232 mytm->tm_yday = yday; 8233 mytm->tm_isdst = isdst; 8234 8235 /* Long-standing behavior is to ignore the effects of locale (in 8236 * particular, daylight savings time) on the input, so we use mini_mktime. 8237 * See GH #22062. */ 8238 mini_mktime(mytm); 8239 8240 /* But some of those effect are deemed desirable, so use libc to get the 8241 * values for tm_gmtoff and tm_zone on platforms that have them [perl 8242 * #18238] */ 8243 #if defined(HAS_MKTIME) \ 8244 && (defined(HAS_TM_TM_GMTOFF) || defined(HAS_TM_TM_ZONE)) 8245 8246 const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale); 8247 struct tm mytm2 = *mytm; 8248 MKTIME_LOCK; 8249 mktime(&mytm2); 8250 MKTIME_UNLOCK; 8251 restore_toggled_locale_c(LC_TIME, orig_TIME_locale); 8252 8253 # ifdef HAS_TM_TM_GMTOFF 8254 mytm->tm_gmtoff = mytm2.tm_gmtoff; 8255 # endif 8256 # ifdef HAS_TM_TM_ZONE 8257 mytm->tm_zone = mytm2.tm_zone; 8258 # endif 8259 #endif 8260 8261 return; 8262 } 8263 8264 STATIC bool 8265 S_strftime_tm(pTHX_ const char *fmt, 8266 SV * sv, 8267 const char *locale, 8268 const struct tm *mytm) 8269 { 8270 PERL_ARGS_ASSERT_STRFTIME_TM; 8271 8272 /* Execute strftime() based on the input struct tm, and the current LC_TIME 8273 * locale. 8274 * 8275 * Returns 'true' if succeeded, with the PV pointer in 'sv' filled with the 8276 * result, and all other C<OK> bits disabled, and not marked as UTF-8. 8277 * Determining the UTF-8ness must be done at a higher level. 8278 * 8279 * 'false' is returned if if fails; the state of 'sv' is unspecified. */ 8280 8281 /* An empty format yields an empty result */ 8282 const Size_t fmtlen = strlen(fmt); 8283 if (fmtlen == 0) { 8284 sv_setpvs(sv, ""); 8285 SvUTF8_off(sv); 8286 return true; 8287 } 8288 8289 bool succeeded = false; 8290 8291 #ifndef HAS_STRFTIME 8292 Perl_croak(aTHX_ "panic: no strftime"); 8293 #endif 8294 8295 start_DEALING_WITH_MISMATCHED_CTYPE(locale); 8296 8297 #if defined(USE_LOCALE_TIME) 8298 8299 const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale); 8300 8301 # define LC_TIME_TEARDOWN \ 8302 restore_toggled_locale_c(LC_TIME, orig_TIME_locale) 8303 #else 8304 PERL_UNUSED_ARG(locale); 8305 # define LC_TIME_TEARDOWN 8306 #endif 8307 8308 /* Assume the caller has furnished a reasonable sized guess, but guard 8309 * against one that won't work */ 8310 Size_t bufsize = MAX(2, SvLEN(sv)); 8311 SvUPGRADE(sv, SVt_PV); 8312 SvPOK_only(sv); 8313 8314 do { 8315 char * buf = SvGROW(sv, bufsize); 8316 8317 /* allowing user-supplied (rather than literal) formats is normally 8318 * frowned upon as a potential security risk; but this is part of the 8319 * API so we have to allow it (and the available formats have a much 8320 * lower chance of doing something bad than the ones for printf etc. */ 8321 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); 8322 8323 #ifdef WIN32 /* Windows will tell you if the input is invalid */ 8324 8325 /* Needed because the LOCK might (or might not) save/restore errno */ 8326 bool strftime_failed = false; 8327 8328 STRFTIME_LOCK; 8329 dSAVE_ERRNO; 8330 errno = 0; 8331 8332 Size_t len = strftime(buf, bufsize, fmt, mytm); 8333 if (errno == EINVAL) { 8334 strftime_failed = true; 8335 } 8336 8337 RESTORE_ERRNO; 8338 STRFTIME_UNLOCK; 8339 8340 if (strftime_failed) { 8341 goto strftime_failed; 8342 } 8343 8344 #else 8345 STRFTIME_LOCK; 8346 Size_t len = strftime(buf, bufsize, fmt, mytm); 8347 STRFTIME_UNLOCK; 8348 #endif 8349 8350 GCC_DIAG_RESTORE_STMT; 8351 8352 /* A non-zero return indicates success. But to make sure we're not 8353 * dealing with some rogue strftime that returns how much space it 8354 * needs instead of 0 when there isn't enough, check that the return 8355 * indicates we have at least one byte of spare space (which will be 8356 * used for the terminating NUL). */ 8357 if (inRANGE(len, 1, bufsize - 1)) { 8358 succeeded = true; 8359 SvCUR_set(sv, len); 8360 goto strftime_return; 8361 } 8362 8363 /* There are several possible reasons for a 0 return code for a 8364 * non-empty format, and they are not trivial to tease apart. This 8365 * issue is a known bug in the strftime() API. What we do to cope is 8366 * to assume that the reason is not enough space in the buffer, so 8367 * increase it and try again. */ 8368 bufsize *= 2; 8369 8370 /* But don't just keep increasing the size indefinitely. Stop when it 8371 * becomes obvious that the reason for failure is something besides not 8372 * enough space. The most likely largest expanding format is %c. On 8373 * khw's Linux box, the maximum result of this is 67 characters, in the 8374 * km_KH locale. If a new script comes along that uses 4 UTF-8 bytes 8375 * per character, and with a similar expansion factor, that would be a 8376 * 268:2 byte ratio, or a bit more than 128:1 = 2**7:1. Some strftime 8377 * implementations allow you to say %1000c to pad to 1000 bytes. This 8378 * shows that it is impossible to implement this without a heuristic 8379 * (which can fail). But it indicates we need to be generous in the 8380 * upper limit before failing. The previous heuristic used was too 8381 * stingy. Since the size doubles per iteration, it doesn't take many 8382 * to reach the limit */ 8383 } while (bufsize < ((1 << 11) + 1) * fmtlen); 8384 8385 /* Here, strftime() returned 0, and it likely wasn't for lack of space. 8386 * There are two possible reasons: 8387 * 8388 * First is that the result is legitimately 0 length. This can happen 8389 * when the format is precisely "%p". That is the only documented format 8390 * that can have an empty result. */ 8391 if (strEQ(fmt, "%p")) { 8392 sv_setpvs(sv, ""); 8393 SvUTF8_off(sv); 8394 succeeded = true; 8395 goto strftime_return; 8396 } 8397 8398 /* The other reason is that the format string is malformed. Probably it is 8399 * that the string is syntactically invalid for the locale. On some 8400 * platforms an invalid conversion specifier '%?' (for all illegal '?') is 8401 * treated as a literal, but others may fail when '?' is illegal */ 8402 8403 #ifdef WIN32 8404 strftime_failed: 8405 #endif 8406 8407 SET_EINVAL; 8408 succeeded = false; 8409 8410 strftime_return: 8411 8412 LC_TIME_TEARDOWN; 8413 end_DEALING_WITH_MISMATCHED_CTYPE(locale); 8414 8415 return succeeded; 8416 } 8417 8418 STATIC bool 8419 S_strftime8(pTHX_ const char * fmt, 8420 SV * sv, 8421 const char * locale, 8422 const struct tm * mytm, 8423 const utf8ness_t fmt_utf8ness, 8424 utf8ness_t * result_utf8ness, 8425 const bool called_externally) 8426 { 8427 PERL_ARGS_ASSERT_STRFTIME8; 8428 8429 /* Wrap strftime_tm, taking into account the input and output UTF-8ness */ 8430 8431 #ifdef USE_LOCALE_TIME 8432 # define INDEX_TO_USE LC_TIME_INDEX_ 8433 8434 locale_utf8ness_t locale_utf8ness = LOCALE_UTF8NESS_UNKNOWN; 8435 8436 #else 8437 # define INDEX_TO_USE LC_ALL_INDEX_ /* Effectively out of bounds */ 8438 8439 locale_utf8ness_t locale_utf8ness = LOCALE_NOT_UTF8; 8440 8441 #endif 8442 8443 switch (fmt_utf8ness) { 8444 case UTF8NESS_IMMATERIAL: 8445 break; 8446 8447 case UTF8NESS_NO: /* Known not to be UTF-8; must not be UTF-8 locale */ 8448 if (is_locale_utf8(locale)) { 8449 SET_EINVAL; 8450 return false; 8451 } 8452 8453 locale_utf8ness = LOCALE_NOT_UTF8; 8454 break; 8455 8456 case UTF8NESS_YES: /* Known to be UTF-8; must be UTF-8 locale if can't 8457 downgrade. */ 8458 if (! is_locale_utf8(locale)) { 8459 locale_utf8ness = LOCALE_NOT_UTF8; 8460 8461 bool is_utf8 = true; 8462 Size_t fmt_len = strlen(fmt); 8463 fmt = (char *) bytes_from_utf8((U8 *) fmt, &fmt_len, &is_utf8); 8464 if (is_utf8) { 8465 SET_EINVAL; 8466 return false; 8467 } 8468 8469 SAVEFREEPV(fmt); 8470 } 8471 else { 8472 locale_utf8ness = LOCALE_IS_UTF8; 8473 } 8474 8475 break; 8476 8477 case UTF8NESS_UNKNOWN: 8478 if (! is_locale_utf8(locale)) { 8479 locale_utf8ness = LOCALE_NOT_UTF8; 8480 } 8481 else { 8482 locale_utf8ness = LOCALE_IS_UTF8; 8483 if (called_externally) { 8484 8485 /* All internal calls from this file use ASCII-only formats; 8486 * but otherwise the format could be anything, so make sure to 8487 * upgrade it to UTF-8 for a UTF-8 locale. Otherwise the 8488 * locale would find any UTF-8 variant characters to be 8489 * malformed */ 8490 Size_t fmt_len = strlen(fmt); 8491 fmt = (char *) bytes_to_utf8((U8 *) fmt, &fmt_len); 8492 SAVEFREEPV(fmt); 8493 } 8494 } 8495 8496 break; 8497 } 8498 8499 if (! strftime_tm(fmt, sv, locale, mytm)) { 8500 return false; 8501 } 8502 8503 *result_utf8ness = get_locale_string_utf8ness_i(SvPVX(sv), 8504 locale_utf8ness, 8505 locale, 8506 INDEX_TO_USE); 8507 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 8508 "fmt=%s, retval=%s; utf8ness=%d", 8509 fmt, 8510 ((is_strict_utf8_string((U8 *) SvPVX(sv), 0)) 8511 ? SvPVX(sv) 8512 :_byte_dump_string((U8 *) SvPVX(sv), SvCUR(sv) ,0)), 8513 *result_utf8ness)); 8514 return true; 8515 8516 #undef INDEX_TO_USE 8517 8518 } 8519 8520 #ifdef USE_LOCALE 8521 8522 STATIC void 8523 S_give_perl_locale_control(pTHX_ 8524 # ifdef LC_ALL 8525 const char * lc_all_string, 8526 # else 8527 const char ** locales, 8528 # endif 8529 const line_t caller_line) 8530 { 8531 PERL_UNUSED_ARG(caller_line); 8532 8533 /* This is called when the program is in the global locale and are 8534 * switching to per-thread (if available). And it is called at 8535 * initialization time to do the same. 8536 */ 8537 8538 # if defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE) 8539 8540 /* On Windows, convert to per-thread behavior. This isn't necessary in 8541 * POSIX 2008, as the conversion gets done automatically in the 8542 * void_setlocale_i() calls below. */ 8543 if (_configthreadlocale(_ENABLE_PER_THREAD_LOCALE) == -1) { 8544 locale_panic_("_configthreadlocale returned an error"); 8545 } 8546 8547 # endif 8548 # if ! defined(USE_THREAD_SAFE_LOCALE) \ 8549 && ! defined(USE_POSIX_2008_LOCALE) 8550 # if defined(LC_ALL) 8551 PERL_UNUSED_ARG(lc_all_string); 8552 # else 8553 PERL_UNUSED_ARG(locales); 8554 # endif 8555 # else 8556 8557 /* This platform has per-thread locale handling. Do the conversion. */ 8558 8559 # if defined(LC_ALL) 8560 8561 void_setlocale_c_with_caller(LC_ALL, lc_all_string, __FILE__, caller_line); 8562 8563 # else 8564 8565 for_all_individual_category_indexes(i) { 8566 void_setlocale_i_with_caller(i, locales[i], __FILE__, caller_line); 8567 } 8568 8569 # endif 8570 # endif 8571 8572 /* Finally, update our remaining records. 'true' => force recalculation. 8573 * This is needed because we don't know what's happened while Perl hasn't 8574 * had control, so we need to figure out the current state */ 8575 8576 # if defined(LC_ALL) 8577 8578 new_LC_ALL(lc_all_string, true); 8579 8580 # else 8581 8582 new_LC_ALL(calculate_LC_ALL_string(locales, 8583 INTERNAL_FORMAT, 8584 WANT_TEMP_PV, 8585 caller_line), 8586 true); 8587 # endif 8588 8589 } 8590 8591 STATIC void 8592 S_output_check_environment_warning(pTHX_ const char * const language, 8593 const char * const lc_all, 8594 const char * const lang) 8595 { 8596 PerlIO_printf(Perl_error_log, 8597 "perl: warning: Please check that your locale settings:\n"); 8598 8599 # ifdef __GLIBC__ 8600 8601 PerlIO_printf(Perl_error_log, "\tLANGUAGE = %c%s%c,\n", 8602 language ? '"' : '(', 8603 language ? language : "unset", 8604 language ? '"' : ')'); 8605 # else 8606 PERL_UNUSED_ARG(language); 8607 # endif 8608 8609 PerlIO_printf(Perl_error_log, "\tLC_ALL = %c%s%c,\n", 8610 lc_all ? '"' : '(', 8611 lc_all ? lc_all : "unset", 8612 lc_all ? '"' : ')'); 8613 8614 for_all_individual_category_indexes(i) { 8615 const char * value = PerlEnv_getenv(category_names[i]); 8616 PerlIO_printf(Perl_error_log, 8617 "\t%s = %c%s%c,\n", 8618 category_names[i], 8619 value ? '"' : '(', 8620 value ? value : "unset", 8621 value ? '"' : ')'); 8622 } 8623 8624 PerlIO_printf(Perl_error_log, "\tLANG = %c%s%c\n", 8625 lang ? '"' : '(', 8626 lang ? lang : "unset", 8627 lang ? '"' : ')'); 8628 PerlIO_printf(Perl_error_log, 8629 " are supported and installed on your system.\n"); 8630 } 8631 8632 #endif 8633 8634 /* A helper macro for the next function. Needed because would be called in two 8635 * places. Knows about the internal workings of the function */ 8636 #define GET_DESCRIPTION(trial, name) \ 8637 ((isNAME_C_OR_POSIX(name)) \ 8638 ? "the standard locale" \ 8639 : ((trial == (system_default_trial) \ 8640 ? "the system default locale" \ 8641 : "a fallback locale"))) 8642 8643 /* 8644 * Initialize locale awareness. 8645 */ 8646 int 8647 Perl_init_i18nl10n(pTHX_ int printwarn) 8648 { 8649 /* printwarn is: 8650 * 0 if not to output warning when setup locale is bad 8651 * 1 if to output warning based on value of PERL_BADLANG 8652 * >1 if to output regardless of PERL_BADLANG 8653 * 8654 * returns 8655 * 1 = set ok or not applicable, 8656 * 0 = fallback to a locale of lower priority 8657 * -1 = fallback to all locales failed, not even to the C locale 8658 * 8659 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is 8660 * set, debugging information is output. 8661 * 8662 * This routine effectively does the following in most cases: 8663 * 8664 * basic initialization; 8665 * asserts that the compiled tables are consistent; 8666 * initialize data structures; 8667 * make sure we are in the global locale; 8668 * setlocale(LC_ALL, ""); 8669 * switch to per-thread locale if applicable; 8670 * 8671 * The "" causes the locale to be set to what the environment variables at 8672 * the time say it should be. 8673 * 8674 * To handle possible failures, the setlocale is expanded to be like: 8675 * 8676 * trial_locale = pre-first-trial; 8677 * while (has_another_trial()) { 8678 * trial_locale = next_trial(); 8679 * if setlocale(LC_ALL, trial_locale) { 8680 * ok = true; 8681 * break; 8682 * } 8683 * 8684 * had_failure = true; 8685 * warn(); 8686 * } 8687 * 8688 * if (had_failure) { 8689 * warn_even_more(); 8690 * if (! ok) warn_still_more(); 8691 * } 8692 * 8693 * The first trial is either: 8694 * "" to examine the environment variables for the locale 8695 * NULL to use the values already set for the locale by the program 8696 * embedding this perl instantiation. 8697 * 8698 * Something is wrong if this trial fails, but there is a sequence of 8699 * fallbacks to try should that happen. They are given in the enum below. 8700 8701 * If there is no LC_ALL defined on the system, the setlocale() above is 8702 * replaced by a loop setting each individual category separately. 8703 * 8704 * In a non-embeded environment, this code is executed exactly once. It 8705 * sets up the global locale environment. At the end, if some sort of 8706 * thread-safety is in effect, it will turn thread 0 into using that, with 8707 * the same locale as the global initially. thread 0 can then change its 8708 * locale at will without affecting the global one. 8709 * 8710 * At destruction time, thread 0 will revert to the global locale as the 8711 * other threads die. 8712 * 8713 * Care must be taken in an embedded environment. This code will be 8714 * executed for each instantiation. Since it changes the global locale, it 8715 * could clash with another running instantiation that isn't using 8716 * per-thread locales. perlembed suggests having the controlling program 8717 * set each instantiation's locale and set PERL_SKIP_LOCALE_INIT so this 8718 * code uses that without actually changing anything. Then the onus is on 8719 * the controlling program to prevent any races. The code below does 8720 * enough locking so as to prevent system calls from overwriting data 8721 * before it is safely copied here, but that isn't a general solution. 8722 */ 8723 8724 if (PL_langinfo_sv == NULL) { 8725 PL_langinfo_sv = newSVpvs(""); 8726 } 8727 if (PL_scratch_langinfo == NULL) { 8728 PL_scratch_langinfo = newSVpvs(""); 8729 } 8730 8731 #ifndef USE_LOCALE 8732 8733 PERL_UNUSED_ARG(printwarn); 8734 const int ok = 1; 8735 8736 #else /* USE_LOCALE to near the end of the routine */ 8737 8738 int ok = 0; 8739 8740 # ifdef __GLIBC__ 8741 8742 const char * const language = PerlEnv_getenv("LANGUAGE"); 8743 8744 # else 8745 const char * const language = NULL; /* Unused placeholder */ 8746 # endif 8747 8748 /* A later getenv() could zap this, so only use here */ 8749 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG"); 8750 8751 const bool locwarn = (printwarn > 1 8752 || ( printwarn 8753 && ( ! bad_lang_use_once 8754 || ( 8755 /* disallow with "" or "0" */ 8756 *bad_lang_use_once 8757 && strNE("0", bad_lang_use_once))))); 8758 8759 # ifndef DEBUGGING 8760 # define DEBUG_LOCALE_INIT(a,b,c) 8761 # else 8762 8763 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT"))); 8764 8765 # define DEBUG_LOCALE_INIT(cat_index, locale, result) \ 8766 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n", \ 8767 setlocale_debug_string_i(cat_index, locale, result))); 8768 8769 # ifdef LC_ALL 8770 assert(categories[LC_ALL_INDEX_] == LC_ALL); 8771 assert(strEQ(category_names[LC_ALL_INDEX_], "LC_ALL")); 8772 # ifdef USE_POSIX_2008_LOCALE 8773 assert(category_masks[LC_ALL_INDEX_] == LC_ALL_MASK); 8774 # endif 8775 # endif 8776 8777 for_all_individual_category_indexes(i) { 8778 assert(category_name_lengths[i] == strlen(category_names[i])); 8779 } 8780 8781 # endif /* DEBUGGING */ 8782 8783 /* Initialize the per-thread mbrFOO() state variables. See POSIX.xs for 8784 * why these particular incantations are used. */ 8785 # ifdef HAS_MBRLEN 8786 memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps)); 8787 # endif 8788 # ifdef HAS_MBRTOWC 8789 memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps)); 8790 # endif 8791 # ifdef HAS_WCTOMBR 8792 wcrtomb(NULL, L'\0', &PL_wcrtomb_ps); 8793 # endif 8794 # ifdef USE_PL_CURLOCALES 8795 8796 for (unsigned int i = 0; i <= LC_ALL_INDEX_; i++) { 8797 PL_curlocales[i] = savepv("C"); 8798 } 8799 8800 # endif 8801 # ifdef USE_PL_CUR_LC_ALL 8802 8803 PL_cur_LC_ALL = savepv("C"); 8804 8805 # endif 8806 # if ! defined(PERL_LC_ALL_USES_NAME_VALUE_PAIRS) && defined(LC_ALL) 8807 8808 LOCALE_LOCK; 8809 8810 /* If we haven't done so already, translate the LC_ALL positions of 8811 * categories into our internal indices. */ 8812 if (map_LC_ALL_position_to_index[0] == LC_ALL_INDEX_) { 8813 8814 # ifdef PERL_LC_ALL_CATEGORY_POSITIONS_INIT 8815 /* Use this array, initialized by a config.h constant */ 8816 int lc_all_category_positions[] = PERL_LC_ALL_CATEGORY_POSITIONS_INIT; 8817 STATIC_ASSERT_STMT( C_ARRAY_LENGTH(lc_all_category_positions) 8818 == LC_ALL_INDEX_); 8819 8820 for (unsigned int i = 0; 8821 i < C_ARRAY_LENGTH(lc_all_category_positions); 8822 i++) 8823 { 8824 map_LC_ALL_position_to_index[i] = 8825 get_category_index(lc_all_category_positions[i]); 8826 } 8827 # else 8828 /* It is possible for both PERL_LC_ALL_USES_NAME_VALUE_PAIRS and 8829 * PERL_LC_ALL_CATEGORY_POSITIONS_INIT not to be defined, e.g. on 8830 * systems with only a C locale during ./Configure. Assume that this 8831 * can only happen as part of some sort of bootstrapping so allow 8832 * compilation to succeed by ignoring correctness. 8833 */ 8834 for (unsigned int i = 0; 8835 i < C_ARRAY_LENGTH(map_LC_ALL_position_to_index); 8836 i++) 8837 { 8838 map_LC_ALL_position_to_index[i] = 0; 8839 } 8840 # endif 8841 8842 } 8843 8844 LOCALE_UNLOCK; 8845 8846 # endif 8847 # ifdef USE_POSIX_2008_LOCALE 8848 8849 /* This is a global, so be sure to keep another instance from zapping it */ 8850 LOCALE_LOCK; 8851 if (PL_C_locale_obj) { 8852 LOCALE_UNLOCK; 8853 } 8854 else { 8855 PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0); 8856 if (! PL_C_locale_obj) { 8857 LOCALE_UNLOCK; 8858 locale_panic_("Cannot create POSIX 2008 C locale object"); 8859 } 8860 LOCALE_UNLOCK; 8861 8862 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "created C object %p\n", 8863 PL_C_locale_obj)); 8864 } 8865 8866 /* Switch to using the POSIX 2008 interface now. This would happen below 8867 * anyway, but deferring it can lead to leaks of memory that would also get 8868 * malloc'd in the interim. We arbitrarily switch to the C locale, 8869 * overridden below */ 8870 if (! uselocale(PL_C_locale_obj)) { 8871 locale_panic_(Perl_form(aTHX_ 8872 "Can't uselocale(%p), LC_ALL supposed to" 8873 " be 'C'", 8874 PL_C_locale_obj)); 8875 } 8876 8877 # ifdef MULTIPLICITY 8878 8879 PL_cur_locale_obj = PL_C_locale_obj; 8880 8881 # endif 8882 # endif 8883 8884 /* Now initialize some data structures. This is entirely so that 8885 * later-executed code doesn't have to concern itself with things not being 8886 * initialized. Arbitrarily use the C locale (which we know has to exist 8887 * on the system). */ 8888 8889 # ifdef USE_LOCALE_NUMERIC 8890 8891 PL_numeric_radix_sv = newSV(1); 8892 PL_underlying_radix_sv = newSV(1); 8893 Newxz(PL_numeric_name, 1, char); /* Single NUL character */ 8894 8895 # endif 8896 # ifdef USE_LOCALE_COLLATE 8897 8898 Newxz(PL_collation_name, 1, char); 8899 8900 # endif 8901 # ifdef USE_LOCALE_CTYPE 8902 8903 Newxz(PL_ctype_name, 1, char); 8904 8905 # endif 8906 8907 new_LC_ALL("C", true /* Don't shortcut */); 8908 8909 /*===========================================================================*/ 8910 8911 /* Now ready to override the initialization with the values that the user 8912 * wants. This is done in the global locale as explained in the 8913 * introductory comments to this function */ 8914 switch_to_global_locale(); 8915 8916 const char * const lc_all = PerlEnv_getenv("LC_ALL"); 8917 const char * const lang = PerlEnv_getenv("LANG"); 8918 8919 /* We try each locale in the enum, in order, until we get one that works, 8920 * or exhaust the list. Normally the loop is executed just once. 8921 * 8922 * Each enum value is +1 from the previous */ 8923 typedef enum { 8924 dummy_trial = -1, 8925 environment_trial = 0, /* "" or NULL; code below assumes value 8926 0 is the first real trial */ 8927 LC_ALL_trial, /* ENV{LC_ALL} */ 8928 LANG_trial, /* ENV{LANG} */ 8929 system_default_trial, /* Windows .ACP */ 8930 C_trial, /* C locale */ 8931 beyond_final_trial, 8932 } trials; 8933 8934 trials trial; 8935 unsigned int already_checked = 0; 8936 const char * checked[C_trial]; 8937 8938 # ifdef LC_ALL 8939 const char * lc_all_string; 8940 # else 8941 const char * curlocales[LC_ALL_INDEX_]; 8942 # endif 8943 8944 /* Loop through the initial setting and all the possible fallbacks, 8945 * breaking out of the loop on success */ 8946 trial = dummy_trial; 8947 while (trial != beyond_final_trial) { 8948 8949 /* Each time through compute the next trial to use based on the one in 8950 * the previous iteration and switch to the new one. This enforces the 8951 * order in which the fallbacks are applied */ 8952 next_trial: 8953 trial = (trials) ((int) trial + 1); /* Casts are needed for g++ */ 8954 8955 const char * locale = NULL; 8956 8957 /* Set up the parameters for this trial */ 8958 switch (trial) { 8959 case dummy_trial: 8960 locale_panic_("Unexpectedly got 'dummy_trial"); 8961 break; 8962 8963 case environment_trial: 8964 /* This is either "" to get the values from the environment, or 8965 * NULL if the calling program has initialized the values already. 8966 * */ 8967 locale = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT")) 8968 ? NULL 8969 : ""; 8970 break; 8971 8972 case LC_ALL_trial: 8973 if (! lc_all || strEQ(lc_all, "")) { 8974 continue; /* No-op */ 8975 } 8976 8977 locale = lc_all; 8978 break; 8979 8980 case LANG_trial: 8981 if (! lang || strEQ(lang, "")) { 8982 continue; /* No-op */ 8983 } 8984 8985 locale = lang; 8986 break; 8987 8988 case system_default_trial: 8989 8990 # if ! defined(WIN32) || ! defined(LC_ALL) 8991 8992 continue; /* No-op */ 8993 8994 # else 8995 /* For Windows, we also try the system default locale before "C". 8996 * (If there exists a Windows without LC_ALL we skip this because 8997 * it gets too complicated. For those, "C" is the next fallback 8998 * possibility). */ 8999 locale = ".ACP"; 9000 # endif 9001 break; 9002 9003 case C_trial: 9004 locale = "C"; 9005 break; 9006 9007 case beyond_final_trial: 9008 continue; /* No-op, causes loop to exit */ 9009 } 9010 9011 /* If the locale is a substantive name, don't try the same locale 9012 * twice. */ 9013 if (locale && strNE(locale, "")) { 9014 for (unsigned int i = 0; i < already_checked; i++) { 9015 if (strEQ(checked[i], locale)) { 9016 goto next_trial; 9017 } 9018 } 9019 9020 /* And, for future iterations, indicate we've tried this locale */ 9021 assert(already_checked < C_ARRAY_LENGTH(checked)); 9022 checked[already_checked] = savepv(locale); 9023 SAVEFREEPV(checked[already_checked]); 9024 already_checked++; 9025 } 9026 9027 # ifdef LC_ALL 9028 9029 STDIZED_SETLOCALE_LOCK; 9030 lc_all_string = savepv(stdized_setlocale(LC_ALL, locale)); 9031 STDIZED_SETLOCALE_UNLOCK; 9032 9033 DEBUG_LOCALE_INIT(LC_ALL_INDEX_, locale, lc_all_string); 9034 9035 if (LIKELY(lc_all_string)) { /* Succeeded */ 9036 ok = 1; 9037 break; 9038 } 9039 9040 if (trial == 0 && locwarn) { 9041 PerlIO_printf(Perl_error_log, 9042 "perl: warning: Setting locale failed.\n"); 9043 output_check_environment_warning(language, lc_all, lang); 9044 } 9045 9046 # else /* Below is ! LC_ALL */ 9047 9048 bool setlocale_failure = FALSE; /* This trial hasn't failed so far */ 9049 bool dowarn = trial == 0 && locwarn; 9050 9051 for_all_individual_category_indexes(j) { 9052 STDIZED_SETLOCALE_LOCK; 9053 curlocales[j] = savepv(stdized_setlocale(categories[j], locale)); 9054 STDIZED_SETLOCALE_UNLOCK; 9055 9056 DEBUG_LOCALE_INIT(j, locale, curlocales[j]); 9057 9058 if (UNLIKELY(! curlocales[j])) { 9059 setlocale_failure = TRUE; 9060 9061 /* If are going to warn below, continue to loop so all failures 9062 * are included in the message */ 9063 if (! dowarn) { 9064 break; 9065 } 9066 } 9067 } 9068 9069 if (LIKELY(! setlocale_failure)) { /* All succeeded */ 9070 ok = 1; 9071 break; /* Exit trial_locales loop */ 9072 } 9073 9074 /* Here, this trial failed */ 9075 9076 if (dowarn) { 9077 PerlIO_printf(Perl_error_log, 9078 "perl: warning: Setting locale failed for the categories:\n"); 9079 9080 for_all_individual_category_indexes(j) { 9081 if (! curlocales[j]) { 9082 PerlIO_printf(Perl_error_log, "\t%s\n", category_names[j]); 9083 } 9084 } 9085 9086 output_check_environment_warning(language, lc_all, lang); 9087 } /* end of warning on first failure */ 9088 9089 # endif /* LC_ALL */ 9090 9091 } /* end of looping through the trial locales */ 9092 9093 /* If we had to do more than the first trial, it means that one failed, and 9094 * we may need to output a warning, and, if none worked, do more */ 9095 if (UNLIKELY(trial != 0)) { 9096 if (locwarn) { 9097 const char * description = "a fallback locale"; 9098 const char * name = NULL;; 9099 9100 /* If we didn't find a good fallback, list all we tried */ 9101 if (! ok && already_checked > 0) { 9102 PerlIO_printf(Perl_error_log, "perl: warning: Failed to fall" 9103 " back to "); 9104 if (already_checked > 1) { /* more than one was tried */ 9105 PerlIO_printf(Perl_error_log, "any of:\n"); 9106 } 9107 9108 while (already_checked > 0) { 9109 name = checked[--already_checked]; 9110 description = GET_DESCRIPTION(trial, name); 9111 PerlIO_printf(Perl_error_log, "%s (\"%s\")\n", 9112 description, name); 9113 } 9114 } 9115 9116 if (ok) { 9117 9118 /* Here, a fallback worked. So we have saved its name, and the 9119 * trial that succeeded is still valid */ 9120 # ifdef LC_ALL 9121 const char * individ_locales[LC_ALL_INDEX_] = { NULL }; 9122 9123 /* Even though we know the valid string for LC_ALL that worked, 9124 * translate it into our internal format, which is the 9125 * name=value pairs notation. This is easier for a human to 9126 * decipher than the positional notation. Some platforms 9127 * can return "C C C C C C" for LC_ALL. This code also 9128 * standardizes that result into plain "C". */ 9129 switch (parse_LC_ALL_string(lc_all_string, 9130 (const char **) &individ_locales, 9131 no_override, 9132 false, /* Return only [0] if 9133 suffices */ 9134 false, /* Don't panic on error */ 9135 __LINE__)) 9136 { 9137 case invalid: 9138 9139 /* Here, the parse failed, which shouldn't happen, but if 9140 * it does, we have an easy fallback that allows us to keep 9141 * going. */ 9142 name = lc_all_string; 9143 break; 9144 9145 case no_array: /* The original is a single locale */ 9146 name = lc_all_string; 9147 break; 9148 9149 case only_element_0: /* element[0] is a single locale valid 9150 for all categories */ 9151 SAVEFREEPV(individ_locales[0]); 9152 name = individ_locales[0]; 9153 break; 9154 9155 case full_array: 9156 name = calculate_LC_ALL_string(individ_locales, 9157 INTERNAL_FORMAT, 9158 WANT_TEMP_PV, 9159 __LINE__); 9160 for_all_individual_category_indexes(j) { 9161 Safefree(individ_locales[j]); 9162 } 9163 } 9164 # else 9165 name = calculate_LC_ALL_string(curlocales, 9166 INTERNAL_FORMAT, 9167 WANT_TEMP_PV, 9168 __LINE__); 9169 # endif 9170 description = GET_DESCRIPTION(trial, name); 9171 } 9172 else { 9173 9174 /* Nothing seems to be working, yet we want to continue 9175 * executing. It may well be that locales are mostly 9176 * irrelevant to this particular program, and there must be 9177 * some locale underlying the program. Figure it out as best 9178 * we can, by querying the system's current locale */ 9179 9180 # ifdef LC_ALL 9181 9182 STDIZED_SETLOCALE_LOCK; 9183 name = stdized_setlocale(LC_ALL, NULL); 9184 STDIZED_SETLOCALE_UNLOCK; 9185 9186 if (UNLIKELY(! name)) { 9187 name = "locale name not determinable"; 9188 } 9189 9190 # else /* Below is ! LC_ALL */ 9191 9192 const char * system_locales[LC_ALL_INDEX_] = { NULL }; 9193 9194 for_all_individual_category_indexes(j) { 9195 STDIZED_SETLOCALE_LOCK; 9196 system_locales[j] = savepv(stdized_setlocale(categories[j], 9197 NULL)); 9198 STDIZED_SETLOCALE_UNLOCK; 9199 9200 if (UNLIKELY(! system_locales[j])) { 9201 system_locales[j] = "not determinable"; 9202 } 9203 } 9204 9205 /* We use the name=value form for the string, as that is more 9206 * human readable than the positional notation */ 9207 name = calculate_LC_ALL_string(system_locales, 9208 INTERNAL_FORMAT, 9209 WANT_TEMP_PV, 9210 __LINE__); 9211 description = "what the system says"; 9212 9213 for_all_individual_category_indexes(j) { 9214 Safefree(system_locales[j]); 9215 } 9216 # endif 9217 } 9218 9219 PerlIO_printf(Perl_error_log, 9220 "perl: warning: Falling back to %s (\"%s\").\n", 9221 description, name); 9222 9223 /* Here, ok being true indicates that the first attempt failed, but 9224 * a fallback succeeded; false => nothing working. Translate to 9225 * API return values. */ 9226 ok = (ok) ? 0 : -1; 9227 } 9228 } 9229 9230 # ifdef LC_ALL 9231 9232 give_perl_locale_control(lc_all_string, __LINE__); 9233 Safefree(lc_all_string); 9234 9235 # else 9236 9237 give_perl_locale_control((const char **) &curlocales, __LINE__); 9238 9239 for_all_individual_category_indexes(j) { 9240 Safefree(curlocales[j]); 9241 } 9242 9243 # endif 9244 # if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE) 9245 9246 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE 9247 * locale is UTF-8. give_perl_locale_control() just above has already 9248 * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If 9249 * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE}) 9250 * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on 9251 * STDIN, STDOUT, STDERR, _and_ the default open discipline. */ 9252 PL_utf8locale = PL_in_utf8_CTYPE_locale; 9253 9254 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO. 9255 This is an alternative to using the -C command line switch 9256 (the -C if present will override this). */ 9257 { 9258 const char *p = PerlEnv_getenv("PERL_UNICODE"); 9259 PL_unicode = p ? parse_unicode_opts(&p) : 0; 9260 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG) 9261 PL_utf8cache = -1; 9262 } 9263 9264 # endif 9265 # if defined(USE_POSIX_2008_LOCALE) && defined(MULTIPLICITY) 9266 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 9267 "finished Perl_init_i18nl10n; actual obj=%p," 9268 " expected obj=%p, initial=%s\n", 9269 uselocale(0), PL_cur_locale_obj, 9270 get_LC_ALL_display())); 9271 # endif 9272 9273 /* So won't continue to output stuff */ 9274 DEBUG_INITIALIZATION_set(FALSE); 9275 9276 #endif /* USE_LOCALE */ 9277 9278 return ok; 9279 } 9280 9281 #undef GET_DESCRIPTION 9282 #ifdef USE_LOCALE_COLLATE 9283 9284 STATIC void 9285 S_compute_collxfrm_coefficients(pTHX) 9286 { 9287 9288 /* A locale collation definition includes primary, secondary, tertiary, 9289 * etc. weights for each character. To sort, the primary weights are used, 9290 * and only if they compare equal, then the secondary weights are used, and 9291 * only if they compare equal, then the tertiary, etc. 9292 * 9293 * strxfrm() works by taking the input string, say ABC, and creating an 9294 * output transformed string consisting of first the primary weights, 9295 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the tertiary, 9296 * etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters may not have 9297 * weights at every level. In our example, let's say B doesn't have a 9298 * tertiary weight, and A doesn't have a secondary weight. The constructed 9299 * string is then going to be 9300 * A¹B¹C¹ B²C² A³C³ .... 9301 * This has the desired effect that strcmp() will look at the secondary or 9302 * tertiary weights only if the strings compare equal at all higher 9303 * priority weights. The spaces shown here, like in 9304 * "A¹B¹C¹ A²B²C² " 9305 * are not just for readability. In the general case, these must actually 9306 * be bytes, which we will call here 'separator weights'; and they must be 9307 * smaller than any other weight value, but since these are C strings, only 9308 * the terminating one can be a NUL (some implementations may include a 9309 * non-NUL separator weight just before the NUL). Implementations tend to 9310 * reserve 01 for the separator weights. They are needed so that a shorter 9311 * string's secondary weights won't be misconstrued as primary weights of a 9312 * longer string, etc. By making them smaller than any other weight, the 9313 * shorter string will sort first. (Actually, if all secondary weights are 9314 * smaller than all primary ones, there is no need for a separator weight 9315 * between those two levels, etc.) 9316 * 9317 * The length of the transformed string is roughly a linear function of the 9318 * input string. It's not exactly linear because some characters don't 9319 * have weights at all levels. When we call strxfrm() we have to allocate 9320 * some memory to hold the transformed string. The calculations below try 9321 * to find coefficients 'm' and 'b' for this locale so that m*x + b equals 9322 * how much space we need, given the size of the input string in 'x'. If 9323 * we calculate too small, we increase the size as needed, and call 9324 * strxfrm() again, but it is better to get it right the first time to 9325 * avoid wasted expensive string transformations. 9326 * 9327 * We use the string below to find how long the transformation of it is. 9328 * Almost all locales are supersets of ASCII, or at least the ASCII 9329 * letters. We use all of them, half upper half lower, because if we used 9330 * fewer, we might hit just the ones that are outliers in a particular 9331 * locale. Most of the strings being collated will contain a preponderance 9332 * of letters, and even if they are above-ASCII, they are likely to have 9333 * the same number of weight levels as the ASCII ones. It turns out that 9334 * digits tend to have fewer levels, and some punctuation has more, but 9335 * those are relatively sparse in text, and khw believes this gives a 9336 * reasonable result, but it could be changed if experience so dictates. */ 9337 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz"; 9338 char * x_longer; /* Transformed 'longer' */ 9339 Size_t x_len_longer; /* Length of 'x_longer' */ 9340 9341 char * x_shorter; /* We also transform a substring of 'longer' */ 9342 Size_t x_len_shorter; 9343 9344 PL_in_utf8_COLLATE_locale = (PL_collation_standard) 9345 ? 0 9346 : is_locale_utf8(PL_collation_name); 9347 PL_strxfrm_NUL_replacement = '\0'; 9348 PL_strxfrm_max_cp = 0; 9349 9350 /* mem_collxfrm_() is used get the transformation (though here we are 9351 * interested only in its length). It is used because it has the 9352 * intelligence to handle all cases, but to work, it needs some values of 9353 * 'm' and 'b' to get it started. For the purposes of this calculation we 9354 * use a very conservative estimate of 'm' and 'b'. This assumes a weight 9355 * can be multiple bytes, enough to hold any UV on the platform, and there 9356 * are 5 levels, 4 weight bytes, and a trailing NUL. */ 9357 PL_collxfrm_base = 5; 9358 PL_collxfrm_mult = 5 * sizeof(UV); 9359 9360 /* Find out how long the transformation really is */ 9361 x_longer = mem_collxfrm_(longer, 9362 sizeof(longer) - 1, 9363 &x_len_longer, 9364 9365 /* We avoid converting to UTF-8 in the called 9366 * function by telling it the string is in UTF-8 9367 * if the locale is a UTF-8 one. Since the string 9368 * passed here is invariant under UTF-8, we can 9369 * claim it's UTF-8 even if it isn't. */ 9370 PL_in_utf8_COLLATE_locale); 9371 Safefree(x_longer); 9372 9373 /* Find out how long the transformation of a substring of 'longer' is. 9374 * Together the lengths of these transformations are sufficient to 9375 * calculate 'm' and 'b'. The substring is all of 'longer' except the 9376 * first character. This minimizes the chances of being swayed by outliers 9377 * */ 9378 x_shorter = mem_collxfrm_(longer + 1, 9379 sizeof(longer) - 2, 9380 &x_len_shorter, 9381 PL_in_utf8_COLLATE_locale); 9382 Safefree(x_shorter); 9383 9384 /* If the results are nonsensical for this simple test, the whole locale 9385 * definition is suspect. Mark it so that locale collation is not active 9386 * at all for it. XXX Should we warn? */ 9387 if ( x_len_shorter == 0 9388 || x_len_longer == 0 9389 || x_len_shorter >= x_len_longer) 9390 { 9391 PL_collxfrm_mult = 0; 9392 PL_collxfrm_base = 1; 9393 DEBUG_L(PerlIO_printf(Perl_debug_log, 9394 "Disabling locale collation for LC_COLLATE='%s';" 9395 " length for shorter sample=%zu; longer=%zu\n", 9396 PL_collation_name, x_len_shorter, x_len_longer)); 9397 } 9398 else { 9399 SSize_t base; /* Temporary */ 9400 9401 /* We have both: m * strlen(longer) + b = x_len_longer 9402 * m * strlen(shorter) + b = x_len_shorter; 9403 * subtracting yields: 9404 * m * (strlen(longer) - strlen(shorter)) 9405 * = x_len_longer - x_len_shorter 9406 * But we have set things up so that 'shorter' is 1 byte smaller than 9407 * 'longer'. Hence: 9408 * m = x_len_longer - x_len_shorter 9409 * 9410 * But if something went wrong, make sure the multiplier is at least 1. 9411 */ 9412 if (x_len_longer > x_len_shorter) { 9413 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter; 9414 } 9415 else { 9416 PL_collxfrm_mult = 1; 9417 } 9418 9419 /* mx + b = len 9420 * so: b = len - mx 9421 * but in case something has gone wrong, make sure it is non-negative 9422 * */ 9423 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1); 9424 if (base < 0) { 9425 base = 0; 9426 } 9427 9428 /* Add 1 for the trailing NUL */ 9429 PL_collxfrm_base = base + 1; 9430 } 9431 9432 DEBUG_L(PerlIO_printf(Perl_debug_log, 9433 "?UTF-8 locale=%d; x_len_shorter=%zu, " 9434 "x_len_longer=%zu," 9435 " collate multipler=%zu, collate base=%zu\n", 9436 PL_in_utf8_COLLATE_locale, 9437 x_len_shorter, x_len_longer, 9438 PL_collxfrm_mult, PL_collxfrm_base)); 9439 } 9440 9441 char * 9442 Perl_mem_collxfrm_(pTHX_ const char *input_string, 9443 STRLEN len, /* Length of 'input_string' */ 9444 STRLEN *xlen, /* Set to length of returned string 9445 (not including the collation index 9446 prefix) */ 9447 bool utf8 /* Is the input in UTF-8? */ 9448 ) 9449 { 9450 /* mem_collxfrm_() is like strxfrm() but with two important differences. 9451 * First, it handles embedded NULs. Second, it allocates a bit more memory 9452 * than needed for the transformed data itself. The real transformed data 9453 * begins at offset COLLXFRM_HDR_LEN. *xlen is set to the length of that, 9454 * and doesn't include the collation index size. 9455 * 9456 * It is the caller's responsibility to eventually free the memory returned 9457 * by this function. 9458 * 9459 * Please see sv_collxfrm() to see how this is used. */ 9460 9461 # define COLLXFRM_HDR_LEN sizeof(PL_collation_ix) 9462 9463 char * s = (char *) input_string; 9464 STRLEN s_strlen = strlen(input_string); 9465 char *xbuf = NULL; 9466 STRLEN xAlloc; /* xalloc is a reserved word in VC */ 9467 STRLEN length_in_chars; 9468 bool first_time = TRUE; /* Cleared after first loop iteration */ 9469 9470 # ifdef USE_LOCALE_CTYPE 9471 const char * orig_CTYPE_locale = NULL; 9472 # endif 9473 9474 # if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L 9475 locale_t constructed_locale = (locale_t) 0; 9476 # endif 9477 9478 PERL_ARGS_ASSERT_MEM_COLLXFRM_; 9479 9480 /* Must be NUL-terminated */ 9481 assert(*(input_string + len) == '\0'); 9482 9483 if (PL_collxfrm_mult == 0) { /* unknown or bad */ 9484 if (PL_collxfrm_base != 0) { /* bad collation => skip */ 9485 DEBUG_L(PerlIO_printf(Perl_debug_log, 9486 "mem_collxfrm_: locale's collation is defective\n")); 9487 goto bad; 9488 } 9489 9490 /* (mult, base) == (0,0) means we need to calculate mult and base 9491 * before proceeding */ 9492 S_compute_collxfrm_coefficients(aTHX); 9493 } 9494 9495 /* Replace any embedded NULs with the control that sorts before any others. 9496 * This will give as good as possible results on strings that don't 9497 * otherwise contain that character, but otherwise there may be 9498 * less-than-perfect results with that character and NUL. This is 9499 * unavoidable unless we replace strxfrm with our own implementation. */ 9500 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded 9501 NUL */ 9502 char * e = s + len; 9503 char * sans_nuls; 9504 STRLEN sans_nuls_len; 9505 int try_non_controls; 9506 char this_replacement_char[] = "?\0"; /* Room for a two-byte string, 9507 making sure 2nd byte is NUL. 9508 */ 9509 STRLEN this_replacement_len; 9510 9511 /* If we don't know what non-NUL control character sorts lowest for 9512 * this locale, find it */ 9513 if (PL_strxfrm_NUL_replacement == '\0') { 9514 int j; 9515 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also 9516 includes the collation index 9517 prefixed. */ 9518 9519 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n")); 9520 9521 /* Unlikely, but it may be that no control will work to replace 9522 * NUL, in which case we instead look for any character. Controls 9523 * are preferred because collation order is, in general, context 9524 * sensitive, with adjoining characters affecting the order, and 9525 * controls are less likely to have such interactions, allowing the 9526 * NUL-replacement to stand on its own. (Another way to look at it 9527 * is to imagine what would happen if the NUL were replaced by a 9528 * combining character; it wouldn't work out all that well.) */ 9529 for (try_non_controls = 0; 9530 try_non_controls < 2; 9531 try_non_controls++) 9532 { 9533 9534 # ifdef USE_LOCALE_CTYPE 9535 9536 /* In this case we use isCNTRL_LC() below, which relies on 9537 * LC_CTYPE, so that must be switched to correspond with the 9538 * LC_COLLATE locale */ 9539 const bool need_to_toggle = ( ! try_non_controls 9540 && ! PL_in_utf8_COLLATE_locale); 9541 if (need_to_toggle) { 9542 orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, 9543 PL_collation_name); 9544 } 9545 # endif 9546 /* Look through all legal code points (NUL isn't) */ 9547 for (j = 1; j < 256; j++) { 9548 char * x; /* j's xfrm plus collation index */ 9549 STRLEN x_len; /* length of 'x' */ 9550 STRLEN trial_len = 1; 9551 char cur_source[] = { '\0', '\0' }; 9552 9553 /* Skip non-controls the first time through the loop. The 9554 * controls in a UTF-8 locale are the L1 ones */ 9555 if (! try_non_controls && (PL_in_utf8_COLLATE_locale) 9556 ? ! isCNTRL_L1(j) 9557 : ! isCNTRL_LC(j)) 9558 { 9559 continue; 9560 } 9561 9562 /* Create a 1-char string of the current code point */ 9563 cur_source[0] = (char) j; 9564 9565 /* Then transform it */ 9566 x = mem_collxfrm_(cur_source, trial_len, &x_len, 9567 0 /* The string is not in UTF-8 */); 9568 9569 /* Ignore any character that didn't successfully transform. 9570 * */ 9571 if (! x) { 9572 continue; 9573 } 9574 9575 /* If this character's transformation is lower than 9576 * the current lowest, this one becomes the lowest */ 9577 if ( cur_min_x == NULL 9578 || strLT(x + COLLXFRM_HDR_LEN, 9579 cur_min_x + COLLXFRM_HDR_LEN)) 9580 { 9581 PL_strxfrm_NUL_replacement = j; 9582 Safefree(cur_min_x); 9583 cur_min_x = x; 9584 } 9585 else { 9586 Safefree(x); 9587 } 9588 } /* end of loop through all 255 characters */ 9589 9590 # ifdef USE_LOCALE_CTYPE 9591 9592 if (need_to_toggle) { 9593 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); 9594 } 9595 # endif 9596 9597 /* Stop looking if found */ 9598 if (cur_min_x) { 9599 break; 9600 } 9601 9602 /* Unlikely, but possible, if there aren't any controls that 9603 * work in the locale, repeat the loop, looking for any 9604 * character that works */ 9605 DEBUG_L(PerlIO_printf(Perl_debug_log, 9606 "mem_collxfrm_: No control worked. Trying non-controls\n")); 9607 } /* End of loop to try first the controls, then any char */ 9608 9609 if (! cur_min_x) { 9610 DEBUG_L(PerlIO_printf(Perl_debug_log, 9611 "mem_collxfrm_: Couldn't find any character to replace" 9612 " embedded NULs in locale %s with", PL_collation_name)); 9613 goto bad; 9614 } 9615 9616 DEBUG_L(PerlIO_printf(Perl_debug_log, 9617 "mem_collxfrm_: Replacing embedded NULs in locale %s with " 9618 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement)); 9619 9620 Safefree(cur_min_x); 9621 } /* End of determining the character that is to replace NULs */ 9622 9623 /* If the replacement is variant under UTF-8, it must match the 9624 * UTF8-ness of the original */ 9625 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) { 9626 this_replacement_char[0] = 9627 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement); 9628 this_replacement_char[1] = 9629 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement); 9630 this_replacement_len = 2; 9631 } 9632 else { 9633 this_replacement_char[0] = PL_strxfrm_NUL_replacement; 9634 /* this_replacement_char[1] = '\0' was done at initialization */ 9635 this_replacement_len = 1; 9636 } 9637 9638 /* The worst case length for the replaced string would be if every 9639 * character in it is NUL. Multiply that by the length of each 9640 * replacement, and allow for a trailing NUL */ 9641 sans_nuls_len = (len * this_replacement_len) + 1; 9642 Newx(sans_nuls, sans_nuls_len, char); 9643 *sans_nuls = '\0'; 9644 9645 /* Replace each NUL with the lowest collating control. Loop until have 9646 * exhausted all the NULs */ 9647 while (s + s_strlen < e) { 9648 my_strlcat(sans_nuls, s, sans_nuls_len); 9649 9650 /* Do the actual replacement */ 9651 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len); 9652 9653 /* Move past the input NUL */ 9654 s += s_strlen + 1; 9655 s_strlen = strlen(s); 9656 } 9657 9658 /* And add anything that trails the final NUL */ 9659 my_strlcat(sans_nuls, s, sans_nuls_len); 9660 9661 /* Switch so below we transform this modified string */ 9662 s = sans_nuls; 9663 len = strlen(s); 9664 } /* End of replacing NULs */ 9665 9666 /* Make sure the UTF8ness of the string and locale match */ 9667 if (utf8 != PL_in_utf8_COLLATE_locale) { 9668 /* XXX convert above Unicode to 10FFFF? */ 9669 const char * const t = s; /* Temporary so we can later find where the 9670 input was */ 9671 9672 /* Here they don't match. Change the string's to be what the locale is 9673 * expecting */ 9674 9675 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */ 9676 s = (char *) bytes_to_utf8((const U8 *) s, &len); 9677 utf8 = TRUE; 9678 } 9679 else { /* locale is not UTF-8; but input is; downgrade the input */ 9680 9681 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8); 9682 9683 /* If the downgrade was successful we are done, but if the input 9684 * contains things that require UTF-8 to represent, have to do 9685 * damage control ... */ 9686 if (UNLIKELY(utf8)) { 9687 9688 /* What we do is construct a non-UTF-8 string with 9689 * 1) the characters representable by a single byte converted 9690 * to be so (if necessary); 9691 * 2) and the rest converted to collate the same as the 9692 * highest collating representable character. That makes 9693 * them collate at the end. This is similar to how we 9694 * handle embedded NULs, but we use the highest collating 9695 * code point instead of the smallest. Like the NUL case, 9696 * this isn't perfect, but is the best we can reasonably 9697 * do. Every above-255 code point will sort the same as 9698 * the highest-sorting 0-255 code point. If that code 9699 * point can combine in a sequence with some other code 9700 * points for weight calculations, us changing something to 9701 * be it can adversely affect the results. But in most 9702 * cases, it should work reasonably. And note that this is 9703 * really an illegal situation: using code points above 255 9704 * on a locale where only 0-255 are valid. If two strings 9705 * sort entirely equal, then the sort order for the 9706 * above-255 code points will be in code point order. */ 9707 9708 utf8 = FALSE; 9709 9710 /* If we haven't calculated the code point with the maximum 9711 * collating order for this locale, do so now */ 9712 if (! PL_strxfrm_max_cp) { 9713 int j; 9714 9715 /* The current transformed string that collates the 9716 * highest (except it also includes the prefixed collation 9717 * index. */ 9718 char * cur_max_x = NULL; 9719 9720 /* Look through all legal code points (NUL isn't) */ 9721 for (j = 1; j < 256; j++) { 9722 char * x; 9723 STRLEN x_len; 9724 char cur_source[] = { '\0', '\0' }; 9725 9726 /* Create a 1-char string of the current code point */ 9727 cur_source[0] = (char) j; 9728 9729 /* Then transform it */ 9730 x = mem_collxfrm_(cur_source, 1, &x_len, FALSE); 9731 9732 /* If something went wrong (which it shouldn't), just 9733 * ignore this code point */ 9734 if (! x) { 9735 continue; 9736 } 9737 9738 /* If this character's transformation is higher than 9739 * the current highest, this one becomes the highest */ 9740 if ( cur_max_x == NULL 9741 || strGT(x + COLLXFRM_HDR_LEN, 9742 cur_max_x + COLLXFRM_HDR_LEN)) 9743 { 9744 PL_strxfrm_max_cp = j; 9745 Safefree(cur_max_x); 9746 cur_max_x = x; 9747 } 9748 else { 9749 Safefree(x); 9750 } 9751 } 9752 9753 if (! cur_max_x) { 9754 DEBUG_L(PerlIO_printf(Perl_debug_log, 9755 "mem_collxfrm_: Couldn't find any character to" 9756 " replace above-Latin1 chars in locale %s with", 9757 PL_collation_name)); 9758 goto bad; 9759 } 9760 9761 DEBUG_L(PerlIO_printf(Perl_debug_log, 9762 "mem_collxfrm_: highest 1-byte collating character" 9763 " in locale %s is 0x%02X\n", 9764 PL_collation_name, 9765 PL_strxfrm_max_cp)); 9766 9767 Safefree(cur_max_x); 9768 } 9769 9770 /* Here we know which legal code point collates the highest. 9771 * We are ready to construct the non-UTF-8 string. The length 9772 * will be at least 1 byte smaller than the input string 9773 * (because we changed at least one 2-byte character into a 9774 * single byte), but that is eaten up by the trailing NUL */ 9775 Newx(s, len, char); 9776 9777 { 9778 STRLEN i; 9779 STRLEN d= 0; 9780 char * e = (char *) t + len; 9781 9782 for (i = 0; i < len; i+= UTF8SKIP(t + i)) { 9783 U8 cur_char = t[i]; 9784 if (UTF8_IS_INVARIANT(cur_char)) { 9785 s[d++] = cur_char; 9786 } 9787 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) { 9788 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]); 9789 } 9790 else { /* Replace illegal cp with highest collating 9791 one */ 9792 s[d++] = PL_strxfrm_max_cp; 9793 } 9794 } 9795 s[d++] = '\0'; 9796 Renew(s, d, char); /* Free up unused space */ 9797 } 9798 } 9799 } 9800 9801 /* Here, we have constructed a modified version of the input. It could 9802 * be that we already had a modified copy before we did this version. 9803 * If so, that copy is no longer needed */ 9804 if (t != input_string) { 9805 Safefree(t); 9806 } 9807 } 9808 9809 length_in_chars = (utf8) 9810 ? utf8_length((U8 *) s, (U8 *) s + len) 9811 : len; 9812 9813 /* The first element in the output is the collation id, used by 9814 * sv_collxfrm(); then comes the space for the transformed string. The 9815 * equation should give us a good estimate as to how much is needed */ 9816 xAlloc = COLLXFRM_HDR_LEN 9817 + PL_collxfrm_base 9818 + (PL_collxfrm_mult * length_in_chars); 9819 Newx(xbuf, xAlloc, char); 9820 if (UNLIKELY(! xbuf)) { 9821 DEBUG_L(PerlIO_printf(Perl_debug_log, 9822 "mem_collxfrm_: Couldn't malloc %zu bytes\n", xAlloc)); 9823 goto bad; 9824 } 9825 9826 /* Store the collation id */ 9827 *(PERL_UINTMAX_T *)xbuf = PL_collation_ix; 9828 9829 # if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L 9830 # ifdef USE_LOCALE_CTYPE 9831 9832 constructed_locale = newlocale(LC_CTYPE_MASK, PL_collation_name, 9833 duplocale(use_curlocale_scratch())); 9834 # else 9835 9836 constructed_locale = duplocale(use_curlocale_scratch()); 9837 9838 # endif 9839 # define my_strxfrm(dest, src, n) strxfrm_l(dest, src, n, \ 9840 constructed_locale) 9841 # define CLEANUP_STRXFRM \ 9842 STMT_START { \ 9843 if (constructed_locale != (locale_t) 0) \ 9844 freelocale(constructed_locale); \ 9845 } STMT_END 9846 # else 9847 # define my_strxfrm(dest, src, n) strxfrm(dest, src, n) 9848 # ifdef USE_LOCALE_CTYPE 9849 9850 orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name); 9851 9852 # define CLEANUP_STRXFRM \ 9853 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale) 9854 # else 9855 # define CLEANUP_STRXFRM NOOP 9856 # endif 9857 # endif 9858 9859 /* Then the transformation of the input. We loop until successful, or we 9860 * give up */ 9861 for (;;) { 9862 9863 errno = 0; 9864 *xlen = my_strxfrm(xbuf + COLLXFRM_HDR_LEN, 9865 s, 9866 xAlloc - COLLXFRM_HDR_LEN); 9867 9868 9869 /* If the transformed string occupies less space than we told strxfrm() 9870 * was available, it means it transformed the whole string. */ 9871 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) { 9872 9873 /* But there still could have been a problem */ 9874 if (errno != 0) { 9875 DEBUG_L(PerlIO_printf(Perl_debug_log, 9876 "strxfrm failed for LC_COLLATE=%s; errno=%d, input=%s\n", 9877 PL_collation_name, errno, 9878 _byte_dump_string((U8 *) s, len, 0))); 9879 goto bad; 9880 } 9881 9882 /* Here, the transformation was successful. Some systems include a 9883 * trailing NUL in the returned length. Ignore it, using a loop in 9884 * case multiple trailing NULs are returned. */ 9885 while ( (*xlen) > 0 9886 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0') 9887 { 9888 (*xlen)--; 9889 } 9890 9891 /* If the first try didn't get it, it means our prediction was low. 9892 * Modify the coefficients so that we predict a larger value in any 9893 * future transformations */ 9894 if (! first_time) { 9895 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */ 9896 STRLEN computed_guess = PL_collxfrm_base 9897 + (PL_collxfrm_mult * length_in_chars); 9898 9899 /* On zero-length input, just keep current slope instead of 9900 * dividing by 0 */ 9901 const STRLEN new_m = (length_in_chars != 0) 9902 ? needed / length_in_chars 9903 : PL_collxfrm_mult; 9904 9905 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 9906 "initial size of %zu bytes for a length " 9907 "%zu string was insufficient, %zu needed\n", 9908 computed_guess, length_in_chars, needed)); 9909 9910 /* If slope increased, use it, but discard this result for 9911 * length 1 strings, as we can't be sure that it's a real slope 9912 * change */ 9913 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) { 9914 9915 # ifdef DEBUGGING 9916 9917 STRLEN old_m = PL_collxfrm_mult; 9918 STRLEN old_b = PL_collxfrm_base; 9919 9920 # endif 9921 9922 PL_collxfrm_mult = new_m; 9923 PL_collxfrm_base = 1; /* +1 For trailing NUL */ 9924 computed_guess = PL_collxfrm_base 9925 + (PL_collxfrm_mult * length_in_chars); 9926 if (computed_guess < needed) { 9927 PL_collxfrm_base += needed - computed_guess; 9928 } 9929 9930 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 9931 "slope is now %zu; was %zu, base " 9932 "is now %zu; was %zu\n", 9933 PL_collxfrm_mult, old_m, 9934 PL_collxfrm_base, old_b)); 9935 } 9936 else { /* Slope didn't change, but 'b' did */ 9937 const STRLEN new_b = needed 9938 - computed_guess 9939 + PL_collxfrm_base; 9940 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 9941 "base is now %zu; was %zu\n", new_b, PL_collxfrm_base)); 9942 PL_collxfrm_base = new_b; 9943 } 9944 } 9945 9946 break; 9947 } 9948 9949 if (UNLIKELY(*xlen >= PERL_INT_MAX)) { 9950 DEBUG_L(PerlIO_printf(Perl_debug_log, 9951 "mem_collxfrm_: Needed %zu bytes, max permissible is %u\n", 9952 *xlen, PERL_INT_MAX)); 9953 goto bad; 9954 } 9955 9956 /* A well-behaved strxfrm() returns exactly how much space it needs 9957 * (usually not including the trailing NUL) when it fails due to not 9958 * enough space being provided. Assume that this is the case unless 9959 * it's been proven otherwise */ 9960 if (LIKELY(PL_strxfrm_is_behaved) && first_time) { 9961 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1; 9962 } 9963 else { /* Here, either: 9964 * 1) The strxfrm() has previously shown bad behavior; or 9965 * 2) It isn't the first time through the loop, which means 9966 * that the strxfrm() is now showing bad behavior, because 9967 * we gave it what it said was needed in the previous 9968 * iteration, and it came back saying it needed still more. 9969 * (Many versions of cygwin fit this. When the buffer size 9970 * isn't sufficient, they return the input size instead of 9971 * how much is needed.) 9972 * Increase the buffer size by a fixed percentage and try again. 9973 * */ 9974 xAlloc += (xAlloc / 4) + 1; 9975 PL_strxfrm_is_behaved = FALSE; 9976 9977 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 9978 "mem_collxfrm_ required more space than previously" 9979 " calculated for locale %s, trying again with new" 9980 " guess=%zu+%zu\n", 9981 PL_collation_name, COLLXFRM_HDR_LEN, 9982 xAlloc - COLLXFRM_HDR_LEN)); 9983 } 9984 9985 Renew(xbuf, xAlloc, char); 9986 if (UNLIKELY(! xbuf)) { 9987 DEBUG_L(PerlIO_printf(Perl_debug_log, 9988 "mem_collxfrm_: Couldn't realloc %zu bytes\n", xAlloc)); 9989 goto bad; 9990 } 9991 9992 first_time = FALSE; 9993 } 9994 9995 CLEANUP_STRXFRM; 9996 9997 DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8)); 9998 9999 /* Free up unneeded space; retain enough for trailing NUL */ 10000 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char); 10001 10002 if (s != input_string) { 10003 Safefree(s); 10004 } 10005 10006 return xbuf; 10007 10008 bad: 10009 10010 CLEANUP_STRXFRM; 10011 DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8)); 10012 10013 Safefree(xbuf); 10014 if (s != input_string) { 10015 Safefree(s); 10016 } 10017 *xlen = 0; 10018 10019 return NULL; 10020 } 10021 10022 # ifdef DEBUGGING 10023 10024 STATIC void 10025 S_print_collxfrm_input_and_return(pTHX_ 10026 const char * s, 10027 const char * e, 10028 const char * xbuf, 10029 const STRLEN xlen, 10030 const bool is_utf8) 10031 { 10032 10033 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN; 10034 10035 PerlIO_printf(Perl_debug_log, 10036 "mem_collxfrm_[ix %" UVuf "] for locale '%s':\n" 10037 " input=%s\n return=%s\n return len=%zu\n", 10038 (UV) PL_collation_ix, PL_collation_name, 10039 get_displayable_string(s, e, is_utf8), 10040 ((xbuf == NULL) 10041 ? "(null)" 10042 : ((xlen == 0) 10043 ? "(empty)" 10044 : _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN, 10045 xlen, 0))), 10046 xlen); 10047 } 10048 10049 # endif /* DEBUGGING */ 10050 10051 SV * 10052 Perl_strxfrm(pTHX_ SV * src) 10053 { 10054 PERL_ARGS_ASSERT_STRXFRM; 10055 10056 /* For use by POSIX::strxfrm(). If they differ, toggle LC_CTYPE to 10057 * LC_COLLATE to avoid potential mojibake. 10058 * 10059 * If we can't calculate a collation, 'src' is instead returned, so that 10060 * future comparisons will be by code point order */ 10061 10062 # ifdef USE_LOCALE_CTYPE 10063 10064 const char * orig_ctype = toggle_locale_c(LC_CTYPE, 10065 querylocale_c(LC_COLLATE)); 10066 # endif 10067 10068 SV * dst = src; 10069 STRLEN dstlen; 10070 STRLEN srclen; 10071 const char *p = SvPV_const(src, srclen); 10072 const U32 utf8_flag = SvUTF8(src); 10073 char *d = mem_collxfrm_(p, srclen, &dstlen, cBOOL(utf8_flag)); 10074 10075 assert(utf8_flag == 0 || utf8_flag == SVf_UTF8); 10076 10077 if (d != NULL) { 10078 assert(dstlen > 0); 10079 dst =newSVpvn_flags(d + COLLXFRM_HDR_LEN, 10080 dstlen, SVs_TEMP|utf8_flag); 10081 Safefree(d); 10082 } 10083 10084 # ifdef USE_LOCALE_CTYPE 10085 10086 restore_toggled_locale_c(LC_CTYPE, orig_ctype); 10087 10088 # endif 10089 10090 return dst; 10091 } 10092 10093 #endif /* USE_LOCALE_COLLATE */ 10094 10095 /* my_strerror() returns a mortalized copy of the text of the error message 10096 * associated with 'errnum'. 10097 * 10098 * If not called from within the scope of 'use locale', it uses the text from 10099 * the C locale. If Perl is compiled to not pay attention to LC_CTYPE nor 10100 * LC_MESSAGES, it uses whatever strerror() returns. Otherwise the text is 10101 * derived from the locale, LC_MESSAGES if we have that; LC_CTYPE if not. 10102 * 10103 * It returns in *utf8ness the result's UTF-8ness 10104 * 10105 * The function just calls strerror(), but temporarily switches locales, if 10106 * needed. Many platforms require LC_CTYPE and LC_MESSAGES to be in the same 10107 * CODESET in order for the return from strerror() to not contain '?' symbols, 10108 * or worse, mojibaked. It's cheaper to just use the stricter criteria of 10109 * being in the same locale. So the code below uses a common locale for both 10110 * categories. Again, that is C if not within 'use locale' scope; or the 10111 * LC_MESSAGES locale if in scope and we have that category; and LC_CTYPE if we 10112 * don't have LC_MESSAGES; and whatever strerror returns if we don't have 10113 * either category. 10114 * 10115 * There are two sets of implementations. The first below is if we have 10116 * strerror_l(). This is the simpler. We just use the already-built C locale 10117 * object if not in locale scope, or build up a custom one otherwise. 10118 * 10119 * When strerror_l() is not available, we may have to swap locales temporarily 10120 * to bring the two categories into sync with each other, and possibly to the C 10121 * locale. 10122 * 10123 * Because the prepropessing directives to conditionally compile this function 10124 * would greatly obscure the logic of the various implementations, the whole 10125 * function is repeated for each configuration, with some common macros. */ 10126 10127 /* Used to shorten the definitions of the following implementations of 10128 * my_strerror() */ 10129 #define DEBUG_STRERROR_ENTER(errnum, in_locale) \ 10130 DEBUG_Lv(PerlIO_printf(Perl_debug_log, \ 10131 "my_strerror called with errnum %d;" \ 10132 " Within locale scope=%d\n", \ 10133 errnum, in_locale)) 10134 10135 #define DEBUG_STRERROR_RETURN(errstr, utf8ness) \ 10136 DEBUG_Lv(PerlIO_printf(Perl_debug_log, \ 10137 "Strerror returned; saving a copy: '%s';" \ 10138 " utf8ness=%d\n", \ 10139 get_displayable_string(errstr, \ 10140 errstr + strlen(errstr), \ 10141 *utf8ness), \ 10142 (int) *utf8ness)) 10143 10144 /* On platforms that have precisely one of these categories (Windows 10145 * qualifies), these yield the correct one */ 10146 #if defined(USE_LOCALE_CTYPE) 10147 # define WHICH_LC_INDEX LC_CTYPE_INDEX_ 10148 #elif defined(USE_LOCALE_MESSAGES) 10149 # define WHICH_LC_INDEX LC_MESSAGES_INDEX_ 10150 #endif 10151 10152 /*===========================================================================*/ 10153 /* First set of implementations, when have strerror_l() */ 10154 10155 #if defined(USE_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L) 10156 10157 # if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES) 10158 10159 /* Here, neither category is defined: use the C locale */ 10160 const char * 10161 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness) 10162 { 10163 PERL_ARGS_ASSERT_MY_STRERROR; 10164 10165 DEBUG_STRERROR_ENTER(errnum, 0); 10166 10167 const char *errstr = savepv(strerror_l(errnum, PL_C_locale_obj)); 10168 *utf8ness = UTF8NESS_IMMATERIAL; 10169 10170 DEBUG_STRERROR_RETURN(errstr, utf8ness); 10171 10172 SAVEFREEPV(errstr); 10173 return errstr; 10174 } 10175 10176 # elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES) 10177 10178 /*--------------------------------------------------------------------------*/ 10179 10180 /* Here one or the other of CTYPE or MESSAGES is defined, but not both. If we 10181 * are not within 'use locale' scope of the only one defined, we use the C 10182 * locale; otherwise use the current locale object */ 10183 10184 const char * 10185 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness) 10186 { 10187 PERL_ARGS_ASSERT_MY_STRERROR; 10188 10189 DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX])); 10190 10191 /* Use C if not within locale scope; Otherwise, use current locale */ 10192 const locale_t which_obj = (IN_LC(categories[WHICH_LC_INDEX])) 10193 ? PL_C_locale_obj 10194 : use_curlocale_scratch(); 10195 10196 const char *errstr = savepv(strerror_l(errnum, which_obj)); 10197 *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN, 10198 NULL, WHICH_LC_INDEX); 10199 DEBUG_STRERROR_RETURN(errstr, utf8ness); 10200 10201 SAVEFREEPV(errstr); 10202 return errstr; 10203 } 10204 10205 /*--------------------------------------------------------------------------*/ 10206 # else /* Are using both categories. Place them in the same CODESET, 10207 * either C or the LC_MESSAGES locale */ 10208 10209 const char * 10210 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness) 10211 { 10212 PERL_ARGS_ASSERT_MY_STRERROR; 10213 10214 DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES)); 10215 10216 const char *errstr; 10217 if (! IN_LC(LC_MESSAGES)) { /* Use C if not within locale scope */ 10218 errstr = savepv(strerror_l(errnum, PL_C_locale_obj)); 10219 *utf8ness = UTF8NESS_IMMATERIAL; 10220 } 10221 else { /* Otherwise, use the LC_MESSAGES locale, making sure LC_CTYPE 10222 matches */ 10223 locale_t cur = duplocale(use_curlocale_scratch()); 10224 10225 const char * locale = querylocale_c(LC_MESSAGES); 10226 cur = newlocale(LC_CTYPE_MASK, locale, cur); 10227 errstr = savepv(strerror_l(errnum, cur)); 10228 *utf8ness = get_locale_string_utf8ness_i(errstr, 10229 LOCALE_UTF8NESS_UNKNOWN, 10230 locale, 10231 LC_MESSAGES_INDEX_); 10232 freelocale(cur); 10233 } 10234 10235 DEBUG_STRERROR_RETURN(errstr, utf8ness); 10236 10237 SAVEFREEPV(errstr); 10238 return errstr; 10239 } 10240 # endif /* Above is using strerror_l */ 10241 /*===========================================================================*/ 10242 #else /* Below is not using strerror_l */ 10243 # if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES) 10244 10245 /* If not using using either of the categories, return plain, unadorned 10246 * strerror */ 10247 10248 const char * 10249 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness) 10250 { 10251 PERL_ARGS_ASSERT_MY_STRERROR; 10252 10253 DEBUG_STRERROR_ENTER(errnum, 0); 10254 10255 const char *errstr = savepv(Strerror(errnum)); 10256 *utf8ness = UTF8NESS_IMMATERIAL; 10257 10258 DEBUG_STRERROR_RETURN(errstr, utf8ness); 10259 10260 SAVEFREEPV(errstr); 10261 return errstr; 10262 } 10263 10264 /*--------------------------------------------------------------------------*/ 10265 # elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES) 10266 10267 /* Here one or the other of CTYPE or MESSAGES is defined, but not both. If we 10268 * are not within 'use locale' scope of the only one defined, we use the C 10269 * locale; otherwise use the current locale */ 10270 10271 const char * 10272 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness) 10273 { 10274 PERL_ARGS_ASSERT_MY_STRERROR; 10275 10276 DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX])); 10277 10278 const char *errstr; 10279 if (IN_LC(categories[WHICH_LC_INDEX])) { 10280 errstr = savepv(Strerror(errnum)); 10281 *utf8ness = get_locale_string_utf8ness_i(errstr, 10282 LOCALE_UTF8NESS_UNKNOWN, 10283 NULL, WHICH_LC_INDEX); 10284 } 10285 else { 10286 10287 LOCALE_LOCK; 10288 10289 const char * orig_locale = toggle_locale_i(WHICH_LC_INDEX, "C"); 10290 10291 errstr = savepv(Strerror(errnum)); 10292 10293 restore_toggled_locale_i(WHICH_LC_INDEX, orig_locale); 10294 10295 LOCALE_UNLOCK; 10296 10297 *utf8ness = UTF8NESS_IMMATERIAL; 10298 } 10299 10300 DEBUG_STRERROR_RETURN(errstr, utf8ness); 10301 10302 SAVEFREEPV(errstr); 10303 return errstr; 10304 } 10305 10306 /*--------------------------------------------------------------------------*/ 10307 # else 10308 10309 /* Below, have both LC_CTYPE and LC_MESSAGES. Place them in the same CODESET, 10310 * either C or the LC_MESSAGES locale */ 10311 10312 const char * 10313 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness) 10314 { 10315 PERL_ARGS_ASSERT_MY_STRERROR; 10316 10317 DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES)); 10318 10319 const char * desired_locale = (IN_LC(LC_MESSAGES)) 10320 ? querylocale_c(LC_MESSAGES) 10321 : "C"; 10322 /* XXX Can fail on z/OS */ 10323 10324 LOCALE_LOCK; 10325 10326 const char* orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, 10327 desired_locale); 10328 const char* orig_MESSAGES_locale = toggle_locale_c(LC_MESSAGES, 10329 desired_locale); 10330 const char *errstr = savepv(Strerror(errnum)); 10331 10332 restore_toggled_locale_c(LC_MESSAGES, orig_MESSAGES_locale); 10333 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); 10334 10335 LOCALE_UNLOCK; 10336 10337 *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN, 10338 desired_locale, 10339 LC_MESSAGES_INDEX_); 10340 DEBUG_STRERROR_RETURN(errstr, utf8ness); 10341 10342 SAVEFREEPV(errstr); 10343 return errstr; 10344 } 10345 10346 /*--------------------------------------------------------------------------*/ 10347 # endif /* end of not using strerror_l() */ 10348 #endif /* end of all the my_strerror() implementations */ 10349 10350 bool 10351 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category) 10352 { 10353 /* Internal function which returns if we are in the scope of a pragma that 10354 * enables the locale category 'category'. 'compiling' should indicate if 10355 * this is during the compilation phase (TRUE) or not (FALSE). */ 10356 10357 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop; 10358 10359 SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0); 10360 if (! these_categories || these_categories == &PL_sv_placeholder) { 10361 return FALSE; 10362 } 10363 10364 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get 10365 * a valid unsigned */ 10366 assert(category >= -1); 10367 return cBOOL(SvUV(these_categories) & (1U << (category + 1))); 10368 } 10369 10370 /* 10371 10372 =for apidoc_section $locale 10373 =for apidoc switch_to_global_locale 10374 10375 This function copies the locale state of the calling thread into the program's 10376 global locale, and converts the thread to use that global locale. 10377 10378 It is intended so that Perl can safely be used with C libraries that access the 10379 global locale and which can't be converted to not access it. Effectively, this 10380 means libraries that call C<L<setlocale(3)>> on non-Windows systems. (For 10381 portability, it is a good idea to use it on Windows as well.) 10382 10383 A downside of using it is that it disables the services that Perl provides to 10384 hide locale gotchas from your code. The service you most likely will miss 10385 regards the radix character (decimal point) in floating point numbers. Code 10386 executed after this function is called can no longer just assume that this 10387 character is correct for the current circumstances. 10388 10389 To return to Perl control, and restart the gotcha prevention services, call 10390 C<L</sync_locale>>. Behavior is undefined for any pure Perl code that executes 10391 while the switch is in effect. 10392 10393 The global locale and the per-thread locales are independent. As long as just 10394 one thread converts to the global locale, everything works smoothly. But if 10395 more than one does, they can easily interfere with each other, and races are 10396 likely. On Windows systems prior to Visual Studio 15 (at which point Microsoft 10397 fixed a bug), races can occur (even if only one thread has been converted to 10398 the global locale), but only if you use the following operations: 10399 10400 =over 10401 10402 =item L<POSIX::localeconv|POSIX/localeconv> 10403 10404 =item L<I18N::Langinfo>, items C<CRNCYSTR> and C<THOUSEP> 10405 10406 =item L<perlapi/sv_langinfo>, items C<CRNCYSTR> and C<THOUSEP> 10407 10408 =back 10409 10410 The first item is not fixable (except by upgrading to a later Visual Studio 10411 release), but it would be possible to work around the latter two items by 10412 having Perl change its algorithm for calculating these to use Windows API 10413 functions (likely C<GetNumberFormat> and C<GetCurrencyFormat>); patches 10414 welcome. 10415 10416 XS code should never call plain C<setlocale>, but should instead be converted 10417 to either call L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in 10418 for the system C<setlocale>) or use the methods given in L<perlcall> to call 10419 L<C<POSIX::setlocale>|POSIX/setlocale>. Either one will transparently properly 10420 handle all cases of single- vs multi-thread, POSIX 2008-supported or not. 10421 10422 =cut 10423 */ 10424 10425 #if defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE) 10426 # define CHANGE_SYSTEM_LOCALE_TO_GLOBAL \ 10427 STMT_START { \ 10428 if (_configthreadlocale(_DISABLE_PER_THREAD_LOCALE) == -1) { \ 10429 locale_panic_("_configthreadlocale returned an error"); \ 10430 } \ 10431 } STMT_END 10432 #elif defined(USE_POSIX_2008_LOCALE) 10433 # define CHANGE_SYSTEM_LOCALE_TO_GLOBAL \ 10434 STMT_START { \ 10435 locale_t old_locale = uselocale(LC_GLOBAL_LOCALE); \ 10436 if (! old_locale) { \ 10437 locale_panic_("Could not change to global locale"); \ 10438 } \ 10439 \ 10440 /* Free the per-thread memory */ \ 10441 if ( old_locale != LC_GLOBAL_LOCALE \ 10442 && old_locale != PL_C_locale_obj) \ 10443 { \ 10444 freelocale(old_locale); \ 10445 } \ 10446 } STMT_END 10447 #else 10448 # define CHANGE_SYSTEM_LOCALE_TO_GLOBAL 10449 #endif 10450 10451 void 10452 Perl_switch_to_global_locale(pTHX) 10453 { 10454 10455 #ifdef USE_LOCALE 10456 10457 DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering switch_to_global; %s\n", 10458 get_LC_ALL_display())); 10459 10460 /* In these cases, we use the system state to determine if we are in the 10461 * global locale or not. */ 10462 # ifdef USE_POSIX_2008_LOCALE 10463 10464 const bool perl_controls = (LC_GLOBAL_LOCALE != uselocale((locale_t) 0)); 10465 10466 # elif defined(USE_THREAD_SAFE_LOCALE) && defined(WIN32) 10467 10468 int config_return = _configthreadlocale(0); 10469 if (config_return == -1) { 10470 locale_panic_("_configthreadlocale returned an error"); 10471 } 10472 const bool perl_controls = (config_return == _ENABLE_PER_THREAD_LOCALE); 10473 10474 # else 10475 10476 const bool perl_controls = false; 10477 10478 # endif 10479 10480 /* No-op if already in global */ 10481 if (! perl_controls) { 10482 return; 10483 } 10484 10485 # ifdef LC_ALL 10486 10487 const char * thread_locale = calculate_LC_ALL_string(NULL, 10488 EXTERNAL_FORMAT_FOR_SET, 10489 WANT_TEMP_PV, 10490 __LINE__); 10491 CHANGE_SYSTEM_LOCALE_TO_GLOBAL; 10492 posix_setlocale(LC_ALL, thread_locale); 10493 10494 # else /* Must be USE_POSIX_2008_LOCALE) */ 10495 10496 const char * cur_thread_locales[LC_ALL_INDEX_]; 10497 10498 /* Save each category's current per-thread state */ 10499 for_all_individual_category_indexes(i) { 10500 cur_thread_locales[i] = querylocale_i(i); 10501 } 10502 10503 CHANGE_SYSTEM_LOCALE_TO_GLOBAL; 10504 10505 /* Set the global to what was our per-thread state */ 10506 POSIX_SETLOCALE_LOCK; 10507 for_all_individual_category_indexes(i) { 10508 posix_setlocale(categories[i], cur_thread_locales[i]); 10509 } 10510 POSIX_SETLOCALE_UNLOCK; 10511 10512 # endif 10513 # ifdef USE_LOCALE_NUMERIC 10514 10515 /* Switch to the underlying C numeric locale; the application is on its 10516 * own. */ 10517 POSIX_SETLOCALE_LOCK; 10518 posix_setlocale(LC_NUMERIC, PL_numeric_name); 10519 POSIX_SETLOCALE_UNLOCK; 10520 10521 # endif 10522 #endif 10523 10524 } 10525 10526 /* 10527 10528 =for apidoc sync_locale 10529 10530 This function copies the state of the program global locale into the calling 10531 thread, and converts that thread to using per-thread locales, if it wasn't 10532 already, and the platform supports them. The LC_NUMERIC locale is toggled into 10533 the standard state (using the C locale's conventions), if not within the 10534 lexical scope of S<C<use locale>>. 10535 10536 Perl will now consider itself to have control of the locale. 10537 10538 Since unthreaded perls have only a global locale, this function is a no-op 10539 without threads. 10540 10541 This function is intended for use with C libraries that do locale manipulation. 10542 It allows Perl to accommodate the use of them. Call this function before 10543 transferring back to Perl space so that it knows what state the C code has left 10544 things in. 10545 10546 XS code should not manipulate the locale on its own. Instead, 10547 L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or 10548 change the locale (though changing the locale is antisocial and dangerous on 10549 multi-threaded systems that don't have multi-thread safe locale operations. 10550 (See L<perllocale/Multi-threaded operation>). 10551 10552 Using the libc L<C<setlocale(3)>> function should be avoided. Nevertheless, 10553 certain non-Perl libraries called from XS, do call it, and their behavior may 10554 not be able to be changed. This function, along with 10555 C<L</switch_to_global_locale>>, can be used to get seamless behavior in these 10556 circumstances, as long as only one thread is involved. 10557 10558 If the library has an option to turn off its locale manipulation, doing that is 10559 preferable to using this mechanism. C<Gtk> is such a library. 10560 10561 The return value is a boolean: TRUE if the global locale at the time of call 10562 was in effect for the caller; and FALSE if a per-thread locale was in effect. 10563 10564 =cut 10565 */ 10566 10567 bool 10568 Perl_sync_locale(pTHX) 10569 { 10570 10571 #ifndef USE_LOCALE 10572 10573 return TRUE; 10574 10575 #else 10576 10577 bool was_in_global = TRUE; 10578 10579 # ifdef USE_THREAD_SAFE_LOCALE 10580 # if defined(WIN32) 10581 10582 int config_return = _configthreadlocale(_DISABLE_PER_THREAD_LOCALE); 10583 if (config_return == -1) { 10584 locale_panic_("_configthreadlocale returned an error"); 10585 } 10586 was_in_global = (config_return == _DISABLE_PER_THREAD_LOCALE); 10587 10588 # elif defined(USE_POSIX_2008_LOCALE) 10589 10590 was_in_global = (LC_GLOBAL_LOCALE == uselocale(LC_GLOBAL_LOCALE)); 10591 10592 # else 10593 # error Unexpected Configuration 10594 # endif 10595 # endif /* USE_THREAD_SAFE_LOCALE */ 10596 10597 /* Here, we are in the global locale. Get and save the values for each 10598 * category, and convert the current thread to use them */ 10599 10600 # ifdef LC_ALL 10601 10602 STDIZED_SETLOCALE_LOCK; 10603 const char * lc_all_string = savepv(stdized_setlocale(LC_ALL, NULL)); 10604 STDIZED_SETLOCALE_UNLOCK; 10605 10606 give_perl_locale_control(lc_all_string, __LINE__); 10607 Safefree(lc_all_string); 10608 10609 # else 10610 10611 const char * current_globals[LC_ALL_INDEX_]; 10612 for_all_individual_category_indexes(i) { 10613 STDIZED_SETLOCALE_LOCK; 10614 current_globals[i] = savepv(stdized_setlocale(categories[i], NULL)); 10615 STDIZED_SETLOCALE_UNLOCK; 10616 } 10617 10618 give_perl_locale_control((const char **) ¤t_globals, __LINE__); 10619 10620 for_all_individual_category_indexes(i) { 10621 Safefree(current_globals[i]); 10622 } 10623 10624 # endif 10625 10626 return was_in_global; 10627 10628 #endif 10629 10630 } 10631 10632 #ifdef USE_PERL_SWITCH_LOCALE_CONTEXT 10633 10634 void 10635 Perl_switch_locale_context(pTHX) 10636 { 10637 /* libc keeps per-thread locale status information in some configurations. 10638 * So, we can't just switch out aTHX to switch to a new thread. libc has 10639 * to follow along. This routine does that based on per-interpreter 10640 * variables we keep just for this purpose. 10641 * 10642 * There are two implementations where this is an issue. For the other 10643 * implementations, it doesn't matter because libc is using global values 10644 * that all threads know about. 10645 * 10646 * The two implementations are where libc keeps thread-specific information 10647 * on its own. These are 10648 * 10649 * POSIX 2008: The current locale is kept by libc as an object. We save 10650 * a copy of that in the per-thread PL_cur_locale_obj, and so 10651 * this routine uses that copy to tell the thread it should be 10652 * operating with that object 10653 * Windows thread-safe locales: A given thread in Windows can be being run 10654 * with per-thread locales, or not. When the thread context 10655 * changes, libc doesn't automatically know if the thread is 10656 * using per-thread locales, nor does it know what the new 10657 * thread's locale is. We keep that information in the 10658 * per-thread variables: 10659 * PL_controls_locale indicates if this thread is using 10660 * per-thread locales or not 10661 * PL_cur_LC_ALL indicates what the locale should be 10662 * if it is a per-thread locale. 10663 */ 10664 10665 if (UNLIKELY( PL_veto_switch_non_tTHX_context 10666 || PL_phase == PERL_PHASE_CONSTRUCT)) 10667 { 10668 return; 10669 } 10670 10671 # ifdef USE_POSIX_2008_LOCALE 10672 10673 if (! uselocale(PL_cur_locale_obj)) { 10674 locale_panic_(Perl_form(aTHX_ 10675 "Can't uselocale(%p), LC_ALL supposed to" 10676 " be '%s'", 10677 PL_cur_locale_obj, get_LC_ALL_display())); 10678 } 10679 10680 # elif defined(WIN32) 10681 10682 if (! bool_setlocale_c(LC_ALL, PL_cur_LC_ALL)) { 10683 locale_panic_(Perl_form(aTHX_ "Can't setlocale(%s)", PL_cur_LC_ALL)); 10684 } 10685 10686 # endif 10687 10688 } 10689 10690 #endif 10691 #ifdef USE_THREADS 10692 10693 void 10694 Perl_thread_locale_init(pTHX) 10695 { 10696 10697 # ifdef USE_THREAD_SAFE_LOCALE 10698 # ifdef USE_POSIX_2008_LOCALE 10699 10700 /* Called from a thread on startup. 10701 * 10702 * The operations here have to be done from within the calling thread, as 10703 * they affect libc's knowledge of the thread; libc has no knowledge of 10704 * aTHX */ 10705 10706 DEBUG_L(PerlIO_printf(Perl_debug_log, 10707 "new thread, initial locale is %s;" 10708 " calling setlocale(LC_ALL, \"C\")\n", 10709 get_LC_ALL_display())); 10710 10711 if (! uselocale(PL_C_locale_obj)) { 10712 10713 /* Not being able to change to the C locale is severe; don't keep 10714 * going. */ 10715 locale_panic_(Perl_form(aTHX_ 10716 "Can't uselocale(%p), 'C'", PL_C_locale_obj)); 10717 NOT_REACHED; /* NOTREACHED */ 10718 } 10719 10720 PL_cur_locale_obj = PL_C_locale_obj; 10721 10722 # elif defined(WIN32) 10723 10724 /* On Windows, make sure new thread has per-thread locales enabled */ 10725 if (_configthreadlocale(_ENABLE_PER_THREAD_LOCALE) == -1) { 10726 locale_panic_("_configthreadlocale returned an error"); 10727 } 10728 void_setlocale_c(LC_ALL, "C"); 10729 10730 # endif 10731 # endif 10732 10733 } 10734 10735 void 10736 Perl_thread_locale_term(pTHX) 10737 { 10738 /* Called from a thread as it gets ready to terminate. 10739 * 10740 * The operations here have to be done from within the calling thread, as 10741 * they affect libc's knowledge of the thread; libc has no knowledge of 10742 * aTHX */ 10743 10744 # if defined(USE_POSIX_2008_LOCALE) 10745 10746 /* Switch to the global locale, so can free up the per-thread object */ 10747 locale_t actual_obj = uselocale(LC_GLOBAL_LOCALE); 10748 if (actual_obj != LC_GLOBAL_LOCALE && actual_obj != PL_C_locale_obj) { 10749 freelocale(actual_obj); 10750 } 10751 10752 /* Prevent leaks even if something has gone wrong */ 10753 locale_t expected_obj = PL_cur_locale_obj; 10754 if (UNLIKELY( expected_obj != actual_obj 10755 && expected_obj != LC_GLOBAL_LOCALE 10756 && expected_obj != PL_C_locale_obj)) 10757 { 10758 freelocale(expected_obj); 10759 } 10760 10761 PL_cur_locale_obj = LC_GLOBAL_LOCALE; 10762 10763 # endif 10764 # ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES 10765 10766 /* When faking the mingw implementation, we coerce this function into doing 10767 * something completely different from its intent -- namely to free up our 10768 * static buffer to avoid a leak. This function gets called for each 10769 * thread that is terminating, so will give us a chance to free the buffer 10770 * from the appropriate pool. On unthreaded systems, it gets called by the 10771 * mutex termination code. */ 10772 10773 if (aTHX != wsetlocale_buf_aTHX) { 10774 return; 10775 } 10776 10777 if (wsetlocale_buf_size > 0) { 10778 Safefree(wsetlocale_buf); 10779 wsetlocale_buf_size = 0; 10780 } 10781 10782 # endif 10783 10784 } 10785 10786 #endif 10787 10788 /* 10789 * ex: set ts=8 sts=4 sw=4 et: 10790 */ 10791