195b7b453SJohn Marino /* Localization of proper names.
2*09d4459fSDaniel Fojt Copyright (C) 2006-2020 Free Software Foundation, Inc.
395b7b453SJohn Marino Written by Bruno Haible <bruno@clisp.org>, 2006.
495b7b453SJohn Marino
595b7b453SJohn Marino This program is free software: you can redistribute it and/or modify
695b7b453SJohn Marino it under the terms of the GNU General Public License as published by
795b7b453SJohn Marino the Free Software Foundation; either version 3 of the License, or
895b7b453SJohn Marino (at your option) any later version.
995b7b453SJohn Marino
1095b7b453SJohn Marino This program is distributed in the hope that it will be useful,
1195b7b453SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of
1295b7b453SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1395b7b453SJohn Marino GNU General Public License for more details.
1495b7b453SJohn Marino
1595b7b453SJohn Marino You should have received a copy of the GNU General Public License
16*09d4459fSDaniel Fojt along with this program. If not, see <https://www.gnu.org/licenses/>. */
1795b7b453SJohn Marino
18cf28ed85SJohn Marino /* Without this pragma, gcc 4.7.0 20111124 mistakenly suggests that
19cf28ed85SJohn Marino the proper_name function might be candidate for attribute 'const' */
20cf28ed85SJohn Marino #if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) || 4 < __GNUC__
21cf28ed85SJohn Marino # pragma GCC diagnostic ignored "-Wsuggest-attribute=const"
22cf28ed85SJohn Marino #endif
23cf28ed85SJohn Marino
2495b7b453SJohn Marino #include <config.h>
2595b7b453SJohn Marino
2695b7b453SJohn Marino /* Specification. */
2795b7b453SJohn Marino #include "propername.h"
2895b7b453SJohn Marino
2995b7b453SJohn Marino #include <ctype.h>
3095b7b453SJohn Marino #include <stdbool.h>
3195b7b453SJohn Marino #include <stdio.h>
3295b7b453SJohn Marino #include <stdlib.h>
3395b7b453SJohn Marino #include <string.h>
3495b7b453SJohn Marino #if HAVE_ICONV
3595b7b453SJohn Marino # include <iconv.h>
3695b7b453SJohn Marino #endif
3795b7b453SJohn Marino
3895b7b453SJohn Marino #include "trim.h"
3995b7b453SJohn Marino #include "mbchar.h"
4095b7b453SJohn Marino #include "mbuiter.h"
4195b7b453SJohn Marino #include "localcharset.h"
4295b7b453SJohn Marino #include "c-strcase.h"
4395b7b453SJohn Marino #include "xstriconv.h"
4495b7b453SJohn Marino #include "xalloc.h"
4595b7b453SJohn Marino #include "gettext.h"
4695b7b453SJohn Marino
4795b7b453SJohn Marino
4895b7b453SJohn Marino /* Tests whether STRING contains trim (SUB), starting and ending at word
4995b7b453SJohn Marino boundaries.
5095b7b453SJohn Marino Here, instead of implementing Unicode Standard Annex #29 for determining
5195b7b453SJohn Marino word boundaries, we assume that trim (SUB) starts and ends with words and
5295b7b453SJohn Marino only test whether the part before it ends with a non-word and the part
5395b7b453SJohn Marino after it starts with a non-word. */
5495b7b453SJohn Marino static bool
mbsstr_trimmed_wordbounded(const char * string,const char * sub)5595b7b453SJohn Marino mbsstr_trimmed_wordbounded (const char *string, const char *sub)
5695b7b453SJohn Marino {
5795b7b453SJohn Marino char *tsub = trim (sub);
5895b7b453SJohn Marino bool found = false;
5995b7b453SJohn Marino
6095b7b453SJohn Marino for (; *string != '\0';)
6195b7b453SJohn Marino {
6295b7b453SJohn Marino const char *tsub_in_string = mbsstr (string, tsub);
6395b7b453SJohn Marino if (tsub_in_string == NULL)
6495b7b453SJohn Marino break;
6595b7b453SJohn Marino else
6695b7b453SJohn Marino {
6795b7b453SJohn Marino if (MB_CUR_MAX > 1)
6895b7b453SJohn Marino {
6995b7b453SJohn Marino mbui_iterator_t string_iter;
7095b7b453SJohn Marino bool word_boundary_before;
7195b7b453SJohn Marino bool word_boundary_after;
7295b7b453SJohn Marino
7395b7b453SJohn Marino mbui_init (string_iter, string);
7495b7b453SJohn Marino word_boundary_before = true;
7595b7b453SJohn Marino if (mbui_cur_ptr (string_iter) < tsub_in_string)
7695b7b453SJohn Marino {
7795b7b453SJohn Marino mbchar_t last_char_before_tsub;
7895b7b453SJohn Marino do
7995b7b453SJohn Marino {
8095b7b453SJohn Marino if (!mbui_avail (string_iter))
8195b7b453SJohn Marino abort ();
8295b7b453SJohn Marino last_char_before_tsub = mbui_cur (string_iter);
8395b7b453SJohn Marino mbui_advance (string_iter);
8495b7b453SJohn Marino }
8595b7b453SJohn Marino while (mbui_cur_ptr (string_iter) < tsub_in_string);
8695b7b453SJohn Marino if (mb_isalnum (last_char_before_tsub))
8795b7b453SJohn Marino word_boundary_before = false;
8895b7b453SJohn Marino }
8995b7b453SJohn Marino
9095b7b453SJohn Marino mbui_init (string_iter, tsub_in_string);
9195b7b453SJohn Marino {
9295b7b453SJohn Marino mbui_iterator_t tsub_iter;
9395b7b453SJohn Marino
9495b7b453SJohn Marino for (mbui_init (tsub_iter, tsub);
9595b7b453SJohn Marino mbui_avail (tsub_iter);
9695b7b453SJohn Marino mbui_advance (tsub_iter))
9795b7b453SJohn Marino {
9895b7b453SJohn Marino if (!mbui_avail (string_iter))
9995b7b453SJohn Marino abort ();
10095b7b453SJohn Marino mbui_advance (string_iter);
10195b7b453SJohn Marino }
10295b7b453SJohn Marino }
10395b7b453SJohn Marino word_boundary_after = true;
10495b7b453SJohn Marino if (mbui_avail (string_iter))
10595b7b453SJohn Marino {
10695b7b453SJohn Marino mbchar_t first_char_after_tsub = mbui_cur (string_iter);
10795b7b453SJohn Marino if (mb_isalnum (first_char_after_tsub))
10895b7b453SJohn Marino word_boundary_after = false;
10995b7b453SJohn Marino }
11095b7b453SJohn Marino
11195b7b453SJohn Marino if (word_boundary_before && word_boundary_after)
11295b7b453SJohn Marino {
11395b7b453SJohn Marino found = true;
11495b7b453SJohn Marino break;
11595b7b453SJohn Marino }
11695b7b453SJohn Marino
11795b7b453SJohn Marino mbui_init (string_iter, tsub_in_string);
11895b7b453SJohn Marino if (!mbui_avail (string_iter))
11995b7b453SJohn Marino break;
12095b7b453SJohn Marino string = tsub_in_string + mb_len (mbui_cur (string_iter));
12195b7b453SJohn Marino }
12295b7b453SJohn Marino else
12395b7b453SJohn Marino {
12495b7b453SJohn Marino bool word_boundary_before;
12595b7b453SJohn Marino const char *p;
12695b7b453SJohn Marino bool word_boundary_after;
12795b7b453SJohn Marino
12895b7b453SJohn Marino word_boundary_before = true;
12995b7b453SJohn Marino if (string < tsub_in_string)
13095b7b453SJohn Marino if (isalnum ((unsigned char) tsub_in_string[-1]))
13195b7b453SJohn Marino word_boundary_before = false;
13295b7b453SJohn Marino
13395b7b453SJohn Marino p = tsub_in_string + strlen (tsub);
13495b7b453SJohn Marino word_boundary_after = true;
13595b7b453SJohn Marino if (*p != '\0')
13695b7b453SJohn Marino if (isalnum ((unsigned char) *p))
13795b7b453SJohn Marino word_boundary_after = false;
13895b7b453SJohn Marino
13995b7b453SJohn Marino if (word_boundary_before && word_boundary_after)
14095b7b453SJohn Marino {
14195b7b453SJohn Marino found = true;
14295b7b453SJohn Marino break;
14395b7b453SJohn Marino }
14495b7b453SJohn Marino
14595b7b453SJohn Marino if (*tsub_in_string == '\0')
14695b7b453SJohn Marino break;
14795b7b453SJohn Marino string = tsub_in_string + 1;
14895b7b453SJohn Marino }
14995b7b453SJohn Marino }
15095b7b453SJohn Marino }
15195b7b453SJohn Marino free (tsub);
15295b7b453SJohn Marino return found;
15395b7b453SJohn Marino }
15495b7b453SJohn Marino
15595b7b453SJohn Marino /* Return the localization of NAME. NAME is written in ASCII. */
15695b7b453SJohn Marino
15795b7b453SJohn Marino const char *
proper_name(const char * name)15895b7b453SJohn Marino proper_name (const char *name)
15995b7b453SJohn Marino {
16095b7b453SJohn Marino /* See whether there is a translation. */
16195b7b453SJohn Marino const char *translation = gettext (name);
16295b7b453SJohn Marino
16395b7b453SJohn Marino if (translation != name)
16495b7b453SJohn Marino {
16595b7b453SJohn Marino /* See whether the translation contains the original name. */
16695b7b453SJohn Marino if (mbsstr_trimmed_wordbounded (translation, name))
16795b7b453SJohn Marino return translation;
16895b7b453SJohn Marino else
16995b7b453SJohn Marino {
17095b7b453SJohn Marino /* Return "TRANSLATION (NAME)". */
17195b7b453SJohn Marino char *result =
17295b7b453SJohn Marino XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
17395b7b453SJohn Marino
17495b7b453SJohn Marino sprintf (result, "%s (%s)", translation, name);
17595b7b453SJohn Marino return result;
17695b7b453SJohn Marino }
17795b7b453SJohn Marino }
17895b7b453SJohn Marino else
17995b7b453SJohn Marino return name;
18095b7b453SJohn Marino }
18195b7b453SJohn Marino
18295b7b453SJohn Marino /* Return the localization of a name whose original writing is not ASCII.
18395b7b453SJohn Marino NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal
18495b7b453SJohn Marino escape sequences. NAME_ASCII is a fallback written only with ASCII
18595b7b453SJohn Marino characters. */
18695b7b453SJohn Marino
18795b7b453SJohn Marino const char *
proper_name_utf8(const char * name_ascii,const char * name_utf8)18895b7b453SJohn Marino proper_name_utf8 (const char *name_ascii, const char *name_utf8)
18995b7b453SJohn Marino {
19095b7b453SJohn Marino /* See whether there is a translation. */
19195b7b453SJohn Marino const char *translation = gettext (name_ascii);
19295b7b453SJohn Marino
19395b7b453SJohn Marino /* Try to convert NAME_UTF8 to the locale encoding. */
19495b7b453SJohn Marino const char *locale_code = locale_charset ();
19595b7b453SJohn Marino char *alloc_name_converted = NULL;
19695b7b453SJohn Marino char *alloc_name_converted_translit = NULL;
19795b7b453SJohn Marino const char *name_converted = NULL;
19895b7b453SJohn Marino const char *name_converted_translit = NULL;
19995b7b453SJohn Marino const char *name;
20095b7b453SJohn Marino
20195b7b453SJohn Marino if (c_strcasecmp (locale_code, "UTF-8") != 0)
20295b7b453SJohn Marino {
20395b7b453SJohn Marino #if HAVE_ICONV
20495b7b453SJohn Marino name_converted = alloc_name_converted =
20595b7b453SJohn Marino xstr_iconv (name_utf8, "UTF-8", locale_code);
20695b7b453SJohn Marino
207200fbe8dSJohn Marino # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
208200fbe8dSJohn Marino && !defined __UCLIBC__) \
20995b7b453SJohn Marino || _LIBICONV_VERSION >= 0x0105
21095b7b453SJohn Marino {
21195b7b453SJohn Marino char *converted_translit;
21295b7b453SJohn Marino
21395b7b453SJohn Marino size_t len = strlen (locale_code);
21495b7b453SJohn Marino char *locale_code_translit = XNMALLOC (len + 10 + 1, char);
21595b7b453SJohn Marino memcpy (locale_code_translit, locale_code, len);
21695b7b453SJohn Marino memcpy (locale_code_translit + len, "//TRANSLIT", 10 + 1);
21795b7b453SJohn Marino
21895b7b453SJohn Marino converted_translit =
21995b7b453SJohn Marino xstr_iconv (name_utf8, "UTF-8", locale_code_translit);
22095b7b453SJohn Marino
22195b7b453SJohn Marino free (locale_code_translit);
22295b7b453SJohn Marino
22395b7b453SJohn Marino if (converted_translit != NULL)
22495b7b453SJohn Marino {
22595b7b453SJohn Marino # if !_LIBICONV_VERSION
22695b7b453SJohn Marino /* Don't use the transliteration if it added question marks.
22795b7b453SJohn Marino glibc's transliteration falls back to question marks; libiconv's
22895b7b453SJohn Marino transliteration does not.
22995b7b453SJohn Marino mbschr is equivalent to strchr in this case. */
23095b7b453SJohn Marino if (strchr (converted_translit, '?') != NULL)
23195b7b453SJohn Marino free (converted_translit);
23295b7b453SJohn Marino else
23395b7b453SJohn Marino # endif
23495b7b453SJohn Marino name_converted_translit = alloc_name_converted_translit =
23595b7b453SJohn Marino converted_translit;
23695b7b453SJohn Marino }
23795b7b453SJohn Marino }
23895b7b453SJohn Marino # endif
23995b7b453SJohn Marino #endif
24095b7b453SJohn Marino }
24195b7b453SJohn Marino else
24295b7b453SJohn Marino {
24395b7b453SJohn Marino name_converted = name_utf8;
24495b7b453SJohn Marino name_converted_translit = name_utf8;
24595b7b453SJohn Marino }
24695b7b453SJohn Marino
24795b7b453SJohn Marino /* The name in locale encoding. */
24895b7b453SJohn Marino name = (name_converted != NULL ? name_converted :
24995b7b453SJohn Marino name_converted_translit != NULL ? name_converted_translit :
25095b7b453SJohn Marino name_ascii);
25195b7b453SJohn Marino
25295b7b453SJohn Marino /* See whether we have a translation. Some translators have not understood
25395b7b453SJohn Marino that they should use the UTF-8 form of the name, if possible. So if the
25495b7b453SJohn Marino translator provided a no-op translation, we ignore it. */
25595b7b453SJohn Marino if (strcmp (translation, name_ascii) != 0)
25695b7b453SJohn Marino {
25795b7b453SJohn Marino /* See whether the translation contains the original name. */
25895b7b453SJohn Marino if (mbsstr_trimmed_wordbounded (translation, name_ascii)
25995b7b453SJohn Marino || (name_converted != NULL
26095b7b453SJohn Marino && mbsstr_trimmed_wordbounded (translation, name_converted))
26195b7b453SJohn Marino || (name_converted_translit != NULL
26295b7b453SJohn Marino && mbsstr_trimmed_wordbounded (translation, name_converted_translit)))
26395b7b453SJohn Marino {
26495b7b453SJohn Marino if (alloc_name_converted != NULL)
26595b7b453SJohn Marino free (alloc_name_converted);
26695b7b453SJohn Marino if (alloc_name_converted_translit != NULL)
26795b7b453SJohn Marino free (alloc_name_converted_translit);
26895b7b453SJohn Marino return translation;
26995b7b453SJohn Marino }
27095b7b453SJohn Marino else
27195b7b453SJohn Marino {
27295b7b453SJohn Marino /* Return "TRANSLATION (NAME)". */
27395b7b453SJohn Marino char *result =
27495b7b453SJohn Marino XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
27595b7b453SJohn Marino
27695b7b453SJohn Marino sprintf (result, "%s (%s)", translation, name);
27795b7b453SJohn Marino
27895b7b453SJohn Marino if (alloc_name_converted != NULL)
27995b7b453SJohn Marino free (alloc_name_converted);
28095b7b453SJohn Marino if (alloc_name_converted_translit != NULL)
28195b7b453SJohn Marino free (alloc_name_converted_translit);
28295b7b453SJohn Marino return result;
28395b7b453SJohn Marino }
28495b7b453SJohn Marino }
28595b7b453SJohn Marino else
28695b7b453SJohn Marino {
28795b7b453SJohn Marino if (alloc_name_converted != NULL && alloc_name_converted != name)
28895b7b453SJohn Marino free (alloc_name_converted);
28995b7b453SJohn Marino if (alloc_name_converted_translit != NULL
29095b7b453SJohn Marino && alloc_name_converted_translit != name)
29195b7b453SJohn Marino free (alloc_name_converted_translit);
29295b7b453SJohn Marino return name;
29395b7b453SJohn Marino }
29495b7b453SJohn Marino }
29595b7b453SJohn Marino
#ifdef TEST1
# include <locale.h>
/* Manual test driver for mbsstr_trimmed_wordbounded.
   Usage: PROGRAM STRING SUB
   Prints "found" when trim (SUB) occurs in STRING at word boundaries and
   prints nothing otherwise.  Exits with status 1 when fewer than two
   arguments are given, instead of dereferencing a missing argument.  */
int
main (int argc, char *argv[])
{
  if (argc < 3)
    {
      fprintf (stderr, "Usage: %s STRING SUB\n",
               argc > 0 && argv[0] != NULL ? argv[0] : "propername");
      return 1;
    }

  /* Use the user's locale, so that multibyte handling matches it.  */
  setlocale (LC_ALL, "");
  if (mbsstr_trimmed_wordbounded (argv[1], argv[2]))
    printf ("found\n");
  return 0;
}
#endif
30795b7b453SJohn Marino
#ifdef TEST2
# include <locale.h>
# include <stdio.h>
/* Manual test driver for proper_name_utf8: prints the localized form of a
   fixed sample name under the user's locale.  Command-line arguments are
   not used.  */
int
main (int argc, char *argv[])
{
  const char *shown;

  (void) argc;
  (void) argv;

  setlocale (LC_ALL, "");
  shown = proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard");
  printf ("%s\n", shown);
  return 0;
}
#endif
319