195b7b453SJohn Marino /* Localization of proper names. 2*09d4459fSDaniel Fojt Copyright (C) 2006-2020 Free Software Foundation, Inc. 395b7b453SJohn Marino Written by Bruno Haible <bruno@clisp.org>, 2006. 495b7b453SJohn Marino 595b7b453SJohn Marino This program is free software: you can redistribute it and/or modify 695b7b453SJohn Marino it under the terms of the GNU General Public License as published by 795b7b453SJohn Marino the Free Software Foundation; either version 3 of the License, or 895b7b453SJohn Marino (at your option) any later version. 995b7b453SJohn Marino 1095b7b453SJohn Marino This program is distributed in the hope that it will be useful, 1195b7b453SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of 1295b7b453SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 1395b7b453SJohn Marino GNU General Public License for more details. 1495b7b453SJohn Marino 1595b7b453SJohn Marino You should have received a copy of the GNU General Public License 16*09d4459fSDaniel Fojt along with this program. If not, see <https://www.gnu.org/licenses/>. */ 1795b7b453SJohn Marino 18cf28ed85SJohn Marino /* Without this pragma, gcc 4.7.0 20111124 mistakenly suggests that 19cf28ed85SJohn Marino the proper_name function might be candidate for attribute 'const' */ 20cf28ed85SJohn Marino #if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) || 4 < __GNUC__ 21cf28ed85SJohn Marino # pragma GCC diagnostic ignored "-Wsuggest-attribute=const" 22cf28ed85SJohn Marino #endif 23cf28ed85SJohn Marino 2495b7b453SJohn Marino #include <config.h> 2595b7b453SJohn Marino 2695b7b453SJohn Marino /* Specification. */ 2795b7b453SJohn Marino #include "propername.h" 2895b7b453SJohn Marino 2995b7b453SJohn Marino #include <ctype.h> 3095b7b453SJohn Marino #include <stdbool.h> 3195b7b453SJohn Marino #include <stdio.h> 3295b7b453SJohn Marino #include <stdlib.h> 3395b7b453SJohn Marino #include <string.h> 3495b7b453SJohn Marino #if HAVE_ICONV 3595b7b453SJohn Marino # include <iconv.h> 3695b7b453SJohn Marino #endif 3795b7b453SJohn Marino 3895b7b453SJohn Marino #include "trim.h" 3995b7b453SJohn Marino #include "mbchar.h" 4095b7b453SJohn Marino #include "mbuiter.h" 4195b7b453SJohn Marino #include "localcharset.h" 4295b7b453SJohn Marino #include "c-strcase.h" 4395b7b453SJohn Marino #include "xstriconv.h" 4495b7b453SJohn Marino #include "xalloc.h" 4595b7b453SJohn Marino #include "gettext.h" 4695b7b453SJohn Marino 4795b7b453SJohn Marino 4895b7b453SJohn Marino /* Tests whether STRING contains trim (SUB), starting and ending at word 4995b7b453SJohn Marino boundaries. 5095b7b453SJohn Marino Here, instead of implementing Unicode Standard Annex #29 for determining 5195b7b453SJohn Marino word boundaries, we assume that trim (SUB) starts and ends with words and 5295b7b453SJohn Marino only test whether the part before it ends with a non-word and the part 5395b7b453SJohn Marino after it starts with a non-word. */ 5495b7b453SJohn Marino static bool 5595b7b453SJohn Marino mbsstr_trimmed_wordbounded (const char *string, const char *sub) 5695b7b453SJohn Marino { 5795b7b453SJohn Marino char *tsub = trim (sub); 5895b7b453SJohn Marino bool found = false; 5995b7b453SJohn Marino 6095b7b453SJohn Marino for (; *string != '\0';) 6195b7b453SJohn Marino { 6295b7b453SJohn Marino const char *tsub_in_string = mbsstr (string, tsub); 6395b7b453SJohn Marino if (tsub_in_string == NULL) 6495b7b453SJohn Marino break; 6595b7b453SJohn Marino else 6695b7b453SJohn Marino { 6795b7b453SJohn Marino if (MB_CUR_MAX > 1) 6895b7b453SJohn Marino { 6995b7b453SJohn Marino mbui_iterator_t string_iter; 7095b7b453SJohn Marino bool word_boundary_before; 7195b7b453SJohn Marino bool word_boundary_after; 7295b7b453SJohn Marino 7395b7b453SJohn Marino mbui_init (string_iter, string); 7495b7b453SJohn Marino word_boundary_before = true; 7595b7b453SJohn Marino if (mbui_cur_ptr (string_iter) < tsub_in_string) 7695b7b453SJohn Marino { 7795b7b453SJohn Marino mbchar_t last_char_before_tsub; 7895b7b453SJohn Marino do 7995b7b453SJohn Marino { 8095b7b453SJohn Marino if (!mbui_avail (string_iter)) 8195b7b453SJohn Marino abort (); 8295b7b453SJohn Marino last_char_before_tsub = mbui_cur (string_iter); 8395b7b453SJohn Marino mbui_advance (string_iter); 8495b7b453SJohn Marino } 8595b7b453SJohn Marino while (mbui_cur_ptr (string_iter) < tsub_in_string); 8695b7b453SJohn Marino if (mb_isalnum (last_char_before_tsub)) 8795b7b453SJohn Marino word_boundary_before = false; 8895b7b453SJohn Marino } 8995b7b453SJohn Marino 9095b7b453SJohn Marino mbui_init (string_iter, tsub_in_string); 9195b7b453SJohn Marino { 9295b7b453SJohn Marino mbui_iterator_t tsub_iter; 9395b7b453SJohn Marino 9495b7b453SJohn Marino for (mbui_init (tsub_iter, tsub); 9595b7b453SJohn Marino mbui_avail (tsub_iter); 9695b7b453SJohn Marino mbui_advance (tsub_iter)) 9795b7b453SJohn Marino { 9895b7b453SJohn Marino if (!mbui_avail (string_iter)) 9995b7b453SJohn Marino abort (); 10095b7b453SJohn Marino mbui_advance (string_iter); 10195b7b453SJohn Marino } 10295b7b453SJohn Marino } 10395b7b453SJohn Marino word_boundary_after = true; 10495b7b453SJohn Marino if (mbui_avail (string_iter)) 10595b7b453SJohn Marino { 10695b7b453SJohn Marino mbchar_t first_char_after_tsub = mbui_cur (string_iter); 10795b7b453SJohn Marino if (mb_isalnum (first_char_after_tsub)) 10895b7b453SJohn Marino word_boundary_after = false; 10995b7b453SJohn Marino } 11095b7b453SJohn Marino 11195b7b453SJohn Marino if (word_boundary_before && word_boundary_after) 11295b7b453SJohn Marino { 11395b7b453SJohn Marino found = true; 11495b7b453SJohn Marino break; 11595b7b453SJohn Marino } 11695b7b453SJohn Marino 11795b7b453SJohn Marino mbui_init (string_iter, tsub_in_string); 11895b7b453SJohn Marino if (!mbui_avail (string_iter)) 11995b7b453SJohn Marino break; 12095b7b453SJohn Marino string = tsub_in_string + mb_len (mbui_cur (string_iter)); 12195b7b453SJohn Marino } 12295b7b453SJohn Marino else 12395b7b453SJohn Marino { 12495b7b453SJohn Marino bool word_boundary_before; 12595b7b453SJohn Marino const char *p; 12695b7b453SJohn Marino bool word_boundary_after; 12795b7b453SJohn Marino 12895b7b453SJohn Marino word_boundary_before = true; 12995b7b453SJohn Marino if (string < tsub_in_string) 13095b7b453SJohn Marino if (isalnum ((unsigned char) tsub_in_string[-1])) 13195b7b453SJohn Marino word_boundary_before = false; 13295b7b453SJohn Marino 13395b7b453SJohn Marino p = tsub_in_string + strlen (tsub); 13495b7b453SJohn Marino word_boundary_after = true; 13595b7b453SJohn Marino if (*p != '\0') 13695b7b453SJohn Marino if (isalnum ((unsigned char) *p)) 13795b7b453SJohn Marino word_boundary_after = false; 13895b7b453SJohn Marino 13995b7b453SJohn Marino if (word_boundary_before && word_boundary_after) 14095b7b453SJohn Marino { 14195b7b453SJohn Marino found = true; 14295b7b453SJohn Marino break; 14395b7b453SJohn Marino } 14495b7b453SJohn Marino 14595b7b453SJohn Marino if (*tsub_in_string == '\0') 14695b7b453SJohn Marino break; 14795b7b453SJohn Marino string = tsub_in_string + 1; 14895b7b453SJohn Marino } 14995b7b453SJohn Marino } 15095b7b453SJohn Marino } 15195b7b453SJohn Marino free (tsub); 15295b7b453SJohn Marino return found; 15395b7b453SJohn Marino } 15495b7b453SJohn Marino 15595b7b453SJohn Marino /* Return the localization of NAME. NAME is written in ASCII. */ 15695b7b453SJohn Marino 15795b7b453SJohn Marino const char * 15895b7b453SJohn Marino proper_name (const char *name) 15995b7b453SJohn Marino { 16095b7b453SJohn Marino /* See whether there is a translation. */ 16195b7b453SJohn Marino const char *translation = gettext (name); 16295b7b453SJohn Marino 16395b7b453SJohn Marino if (translation != name) 16495b7b453SJohn Marino { 16595b7b453SJohn Marino /* See whether the translation contains the original name. */ 16695b7b453SJohn Marino if (mbsstr_trimmed_wordbounded (translation, name)) 16795b7b453SJohn Marino return translation; 16895b7b453SJohn Marino else 16995b7b453SJohn Marino { 17095b7b453SJohn Marino /* Return "TRANSLATION (NAME)". */ 17195b7b453SJohn Marino char *result = 17295b7b453SJohn Marino XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char); 17395b7b453SJohn Marino 17495b7b453SJohn Marino sprintf (result, "%s (%s)", translation, name); 17595b7b453SJohn Marino return result; 17695b7b453SJohn Marino } 17795b7b453SJohn Marino } 17895b7b453SJohn Marino else 17995b7b453SJohn Marino return name; 18095b7b453SJohn Marino } 18195b7b453SJohn Marino 18295b7b453SJohn Marino /* Return the localization of a name whose original writing is not ASCII. 18395b7b453SJohn Marino NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal 18495b7b453SJohn Marino escape sequences. NAME_ASCII is a fallback written only with ASCII 18595b7b453SJohn Marino characters. */ 18695b7b453SJohn Marino 18795b7b453SJohn Marino const char * 18895b7b453SJohn Marino proper_name_utf8 (const char *name_ascii, const char *name_utf8) 18995b7b453SJohn Marino { 19095b7b453SJohn Marino /* See whether there is a translation. */ 19195b7b453SJohn Marino const char *translation = gettext (name_ascii); 19295b7b453SJohn Marino 19395b7b453SJohn Marino /* Try to convert NAME_UTF8 to the locale encoding. */ 19495b7b453SJohn Marino const char *locale_code = locale_charset (); 19595b7b453SJohn Marino char *alloc_name_converted = NULL; 19695b7b453SJohn Marino char *alloc_name_converted_translit = NULL; 19795b7b453SJohn Marino const char *name_converted = NULL; 19895b7b453SJohn Marino const char *name_converted_translit = NULL; 19995b7b453SJohn Marino const char *name; 20095b7b453SJohn Marino 20195b7b453SJohn Marino if (c_strcasecmp (locale_code, "UTF-8") != 0) 20295b7b453SJohn Marino { 20395b7b453SJohn Marino #if HAVE_ICONV 20495b7b453SJohn Marino name_converted = alloc_name_converted = 20595b7b453SJohn Marino xstr_iconv (name_utf8, "UTF-8", locale_code); 20695b7b453SJohn Marino 207200fbe8dSJohn Marino # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \ 208200fbe8dSJohn Marino && !defined __UCLIBC__) \ 20995b7b453SJohn Marino || _LIBICONV_VERSION >= 0x0105 21095b7b453SJohn Marino { 21195b7b453SJohn Marino char *converted_translit; 21295b7b453SJohn Marino 21395b7b453SJohn Marino size_t len = strlen (locale_code); 21495b7b453SJohn Marino char *locale_code_translit = XNMALLOC (len + 10 + 1, char); 21595b7b453SJohn Marino memcpy (locale_code_translit, locale_code, len); 21695b7b453SJohn Marino memcpy (locale_code_translit + len, "//TRANSLIT", 10 + 1); 21795b7b453SJohn Marino 21895b7b453SJohn Marino converted_translit = 21995b7b453SJohn Marino xstr_iconv (name_utf8, "UTF-8", locale_code_translit); 22095b7b453SJohn Marino 22195b7b453SJohn Marino free (locale_code_translit); 22295b7b453SJohn Marino 22395b7b453SJohn Marino if (converted_translit != NULL) 22495b7b453SJohn Marino { 22595b7b453SJohn Marino # if !_LIBICONV_VERSION 22695b7b453SJohn Marino /* Don't use the transliteration if it added question marks. 22795b7b453SJohn Marino glibc's transliteration falls back to question marks; libiconv's 22895b7b453SJohn Marino transliteration does not. 22995b7b453SJohn Marino mbschr is equivalent to strchr in this case. */ 23095b7b453SJohn Marino if (strchr (converted_translit, '?') != NULL) 23195b7b453SJohn Marino free (converted_translit); 23295b7b453SJohn Marino else 23395b7b453SJohn Marino # endif 23495b7b453SJohn Marino name_converted_translit = alloc_name_converted_translit = 23595b7b453SJohn Marino converted_translit; 23695b7b453SJohn Marino } 23795b7b453SJohn Marino } 23895b7b453SJohn Marino # endif 23995b7b453SJohn Marino #endif 24095b7b453SJohn Marino } 24195b7b453SJohn Marino else 24295b7b453SJohn Marino { 24395b7b453SJohn Marino name_converted = name_utf8; 24495b7b453SJohn Marino name_converted_translit = name_utf8; 24595b7b453SJohn Marino } 24695b7b453SJohn Marino 24795b7b453SJohn Marino /* The name in locale encoding. */ 24895b7b453SJohn Marino name = (name_converted != NULL ? name_converted : 24995b7b453SJohn Marino name_converted_translit != NULL ? name_converted_translit : 25095b7b453SJohn Marino name_ascii); 25195b7b453SJohn Marino 25295b7b453SJohn Marino /* See whether we have a translation. Some translators have not understood 25395b7b453SJohn Marino that they should use the UTF-8 form of the name, if possible. So if the 25495b7b453SJohn Marino translator provided a no-op translation, we ignore it. */ 25595b7b453SJohn Marino if (strcmp (translation, name_ascii) != 0) 25695b7b453SJohn Marino { 25795b7b453SJohn Marino /* See whether the translation contains the original name. */ 25895b7b453SJohn Marino if (mbsstr_trimmed_wordbounded (translation, name_ascii) 25995b7b453SJohn Marino || (name_converted != NULL 26095b7b453SJohn Marino && mbsstr_trimmed_wordbounded (translation, name_converted)) 26195b7b453SJohn Marino || (name_converted_translit != NULL 26295b7b453SJohn Marino && mbsstr_trimmed_wordbounded (translation, name_converted_translit))) 26395b7b453SJohn Marino { 26495b7b453SJohn Marino if (alloc_name_converted != NULL) 26595b7b453SJohn Marino free (alloc_name_converted); 26695b7b453SJohn Marino if (alloc_name_converted_translit != NULL) 26795b7b453SJohn Marino free (alloc_name_converted_translit); 26895b7b453SJohn Marino return translation; 26995b7b453SJohn Marino } 27095b7b453SJohn Marino else 27195b7b453SJohn Marino { 27295b7b453SJohn Marino /* Return "TRANSLATION (NAME)". */ 27395b7b453SJohn Marino char *result = 27495b7b453SJohn Marino XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char); 27595b7b453SJohn Marino 27695b7b453SJohn Marino sprintf (result, "%s (%s)", translation, name); 27795b7b453SJohn Marino 27895b7b453SJohn Marino if (alloc_name_converted != NULL) 27995b7b453SJohn Marino free (alloc_name_converted); 28095b7b453SJohn Marino if (alloc_name_converted_translit != NULL) 28195b7b453SJohn Marino free (alloc_name_converted_translit); 28295b7b453SJohn Marino return result; 28395b7b453SJohn Marino } 28495b7b453SJohn Marino } 28595b7b453SJohn Marino else 28695b7b453SJohn Marino { 28795b7b453SJohn Marino if (alloc_name_converted != NULL && alloc_name_converted != name) 28895b7b453SJohn Marino free (alloc_name_converted); 28995b7b453SJohn Marino if (alloc_name_converted_translit != NULL 29095b7b453SJohn Marino && alloc_name_converted_translit != name) 29195b7b453SJohn Marino free (alloc_name_converted_translit); 29295b7b453SJohn Marino return name; 29395b7b453SJohn Marino } 29495b7b453SJohn Marino } 29595b7b453SJohn Marino 29695b7b453SJohn Marino #ifdef TEST1 29795b7b453SJohn Marino # include <locale.h> 29895b7b453SJohn Marino int 29995b7b453SJohn Marino main (int argc, char *argv[]) 30095b7b453SJohn Marino { 30195b7b453SJohn Marino setlocale (LC_ALL, ""); 30295b7b453SJohn Marino if (mbsstr_trimmed_wordbounded (argv[1], argv[2])) 30395b7b453SJohn Marino printf("found\n"); 30495b7b453SJohn Marino return 0; 30595b7b453SJohn Marino } 30695b7b453SJohn Marino #endif 30795b7b453SJohn Marino 30895b7b453SJohn Marino #ifdef TEST2 30995b7b453SJohn Marino # include <locale.h> 31095b7b453SJohn Marino # include <stdio.h> 31195b7b453SJohn Marino int 31295b7b453SJohn Marino main (int argc, char *argv[]) 31395b7b453SJohn Marino { 31495b7b453SJohn Marino setlocale (LC_ALL, ""); 31595b7b453SJohn Marino printf ("%s\n", proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard")); 31695b7b453SJohn Marino return 0; 31795b7b453SJohn Marino } 31895b7b453SJohn Marino #endif 319