xref: /dflybsd-src/contrib/grep/lib/propername.c (revision 91b9ed38d3db6a8a8ac5b66da1d43e6e331e259a)
/* Localization of proper names.
   Copyright (C) 2006-2020 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2006.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
1795b7b453SJohn Marino 
/* Without this pragma, gcc 4.7.0 20111124 mistakenly suggests that
   the proper_name function might be a candidate for attribute 'const'.  */
20cf28ed85SJohn Marino #if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) || 4 < __GNUC__
21cf28ed85SJohn Marino # pragma GCC diagnostic ignored "-Wsuggest-attribute=const"
22cf28ed85SJohn Marino #endif
23cf28ed85SJohn Marino 
2495b7b453SJohn Marino #include <config.h>
2595b7b453SJohn Marino 
2695b7b453SJohn Marino /* Specification.  */
2795b7b453SJohn Marino #include "propername.h"
2895b7b453SJohn Marino 
2995b7b453SJohn Marino #include <ctype.h>
3095b7b453SJohn Marino #include <stdbool.h>
3195b7b453SJohn Marino #include <stdio.h>
3295b7b453SJohn Marino #include <stdlib.h>
3395b7b453SJohn Marino #include <string.h>
3495b7b453SJohn Marino #if HAVE_ICONV
3595b7b453SJohn Marino # include <iconv.h>
3695b7b453SJohn Marino #endif
3795b7b453SJohn Marino 
3895b7b453SJohn Marino #include "trim.h"
3995b7b453SJohn Marino #include "mbchar.h"
4095b7b453SJohn Marino #include "mbuiter.h"
4195b7b453SJohn Marino #include "localcharset.h"
4295b7b453SJohn Marino #include "c-strcase.h"
4395b7b453SJohn Marino #include "xstriconv.h"
4495b7b453SJohn Marino #include "xalloc.h"
4595b7b453SJohn Marino #include "gettext.h"
4695b7b453SJohn Marino 
4795b7b453SJohn Marino 
4895b7b453SJohn Marino /* Tests whether STRING contains trim (SUB), starting and ending at word
4995b7b453SJohn Marino    boundaries.
5095b7b453SJohn Marino    Here, instead of implementing Unicode Standard Annex #29 for determining
5195b7b453SJohn Marino    word boundaries, we assume that trim (SUB) starts and ends with words and
5295b7b453SJohn Marino    only test whether the part before it ends with a non-word and the part
5395b7b453SJohn Marino    after it starts with a non-word.  */
5495b7b453SJohn Marino static bool
mbsstr_trimmed_wordbounded(const char * string,const char * sub)5595b7b453SJohn Marino mbsstr_trimmed_wordbounded (const char *string, const char *sub)
5695b7b453SJohn Marino {
5795b7b453SJohn Marino   char *tsub = trim (sub);
5895b7b453SJohn Marino   bool found = false;
5995b7b453SJohn Marino 
6095b7b453SJohn Marino   for (; *string != '\0';)
6195b7b453SJohn Marino     {
6295b7b453SJohn Marino       const char *tsub_in_string = mbsstr (string, tsub);
6395b7b453SJohn Marino       if (tsub_in_string == NULL)
6495b7b453SJohn Marino         break;
6595b7b453SJohn Marino       else
6695b7b453SJohn Marino         {
6795b7b453SJohn Marino           if (MB_CUR_MAX > 1)
6895b7b453SJohn Marino             {
6995b7b453SJohn Marino               mbui_iterator_t string_iter;
7095b7b453SJohn Marino               bool word_boundary_before;
7195b7b453SJohn Marino               bool word_boundary_after;
7295b7b453SJohn Marino 
7395b7b453SJohn Marino               mbui_init (string_iter, string);
7495b7b453SJohn Marino               word_boundary_before = true;
7595b7b453SJohn Marino               if (mbui_cur_ptr (string_iter) < tsub_in_string)
7695b7b453SJohn Marino                 {
7795b7b453SJohn Marino                   mbchar_t last_char_before_tsub;
7895b7b453SJohn Marino                   do
7995b7b453SJohn Marino                     {
8095b7b453SJohn Marino                       if (!mbui_avail (string_iter))
8195b7b453SJohn Marino                         abort ();
8295b7b453SJohn Marino                       last_char_before_tsub = mbui_cur (string_iter);
8395b7b453SJohn Marino                       mbui_advance (string_iter);
8495b7b453SJohn Marino                     }
8595b7b453SJohn Marino                   while (mbui_cur_ptr (string_iter) < tsub_in_string);
8695b7b453SJohn Marino                   if (mb_isalnum (last_char_before_tsub))
8795b7b453SJohn Marino                     word_boundary_before = false;
8895b7b453SJohn Marino                 }
8995b7b453SJohn Marino 
9095b7b453SJohn Marino               mbui_init (string_iter, tsub_in_string);
9195b7b453SJohn Marino               {
9295b7b453SJohn Marino                 mbui_iterator_t tsub_iter;
9395b7b453SJohn Marino 
9495b7b453SJohn Marino                 for (mbui_init (tsub_iter, tsub);
9595b7b453SJohn Marino                      mbui_avail (tsub_iter);
9695b7b453SJohn Marino                      mbui_advance (tsub_iter))
9795b7b453SJohn Marino                   {
9895b7b453SJohn Marino                     if (!mbui_avail (string_iter))
9995b7b453SJohn Marino                       abort ();
10095b7b453SJohn Marino                     mbui_advance (string_iter);
10195b7b453SJohn Marino                   }
10295b7b453SJohn Marino               }
10395b7b453SJohn Marino               word_boundary_after = true;
10495b7b453SJohn Marino               if (mbui_avail (string_iter))
10595b7b453SJohn Marino                 {
10695b7b453SJohn Marino                   mbchar_t first_char_after_tsub = mbui_cur (string_iter);
10795b7b453SJohn Marino                   if (mb_isalnum (first_char_after_tsub))
10895b7b453SJohn Marino                     word_boundary_after = false;
10995b7b453SJohn Marino                 }
11095b7b453SJohn Marino 
11195b7b453SJohn Marino               if (word_boundary_before && word_boundary_after)
11295b7b453SJohn Marino                 {
11395b7b453SJohn Marino                   found = true;
11495b7b453SJohn Marino                   break;
11595b7b453SJohn Marino                 }
11695b7b453SJohn Marino 
11795b7b453SJohn Marino               mbui_init (string_iter, tsub_in_string);
11895b7b453SJohn Marino               if (!mbui_avail (string_iter))
11995b7b453SJohn Marino                 break;
12095b7b453SJohn Marino               string = tsub_in_string + mb_len (mbui_cur (string_iter));
12195b7b453SJohn Marino             }
12295b7b453SJohn Marino           else
12395b7b453SJohn Marino             {
12495b7b453SJohn Marino               bool word_boundary_before;
12595b7b453SJohn Marino               const char *p;
12695b7b453SJohn Marino               bool word_boundary_after;
12795b7b453SJohn Marino 
12895b7b453SJohn Marino               word_boundary_before = true;
12995b7b453SJohn Marino               if (string < tsub_in_string)
13095b7b453SJohn Marino                 if (isalnum ((unsigned char) tsub_in_string[-1]))
13195b7b453SJohn Marino                   word_boundary_before = false;
13295b7b453SJohn Marino 
13395b7b453SJohn Marino               p = tsub_in_string + strlen (tsub);
13495b7b453SJohn Marino               word_boundary_after = true;
13595b7b453SJohn Marino               if (*p != '\0')
13695b7b453SJohn Marino                 if (isalnum ((unsigned char) *p))
13795b7b453SJohn Marino                   word_boundary_after = false;
13895b7b453SJohn Marino 
13995b7b453SJohn Marino               if (word_boundary_before && word_boundary_after)
14095b7b453SJohn Marino                 {
14195b7b453SJohn Marino                   found = true;
14295b7b453SJohn Marino                   break;
14395b7b453SJohn Marino                 }
14495b7b453SJohn Marino 
14595b7b453SJohn Marino               if (*tsub_in_string == '\0')
14695b7b453SJohn Marino                 break;
14795b7b453SJohn Marino               string = tsub_in_string + 1;
14895b7b453SJohn Marino             }
14995b7b453SJohn Marino         }
15095b7b453SJohn Marino     }
15195b7b453SJohn Marino   free (tsub);
15295b7b453SJohn Marino   return found;
15395b7b453SJohn Marino }
15495b7b453SJohn Marino 
15595b7b453SJohn Marino /* Return the localization of NAME.  NAME is written in ASCII.  */
15695b7b453SJohn Marino 
15795b7b453SJohn Marino const char *
proper_name(const char * name)15895b7b453SJohn Marino proper_name (const char *name)
15995b7b453SJohn Marino {
16095b7b453SJohn Marino   /* See whether there is a translation.   */
16195b7b453SJohn Marino   const char *translation = gettext (name);
16295b7b453SJohn Marino 
16395b7b453SJohn Marino   if (translation != name)
16495b7b453SJohn Marino     {
16595b7b453SJohn Marino       /* See whether the translation contains the original name.  */
16695b7b453SJohn Marino       if (mbsstr_trimmed_wordbounded (translation, name))
16795b7b453SJohn Marino         return translation;
16895b7b453SJohn Marino       else
16995b7b453SJohn Marino         {
17095b7b453SJohn Marino           /* Return "TRANSLATION (NAME)".  */
17195b7b453SJohn Marino           char *result =
17295b7b453SJohn Marino             XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
17395b7b453SJohn Marino 
17495b7b453SJohn Marino           sprintf (result, "%s (%s)", translation, name);
17595b7b453SJohn Marino           return result;
17695b7b453SJohn Marino         }
17795b7b453SJohn Marino     }
17895b7b453SJohn Marino   else
17995b7b453SJohn Marino     return name;
18095b7b453SJohn Marino }
18195b7b453SJohn Marino 
18295b7b453SJohn Marino /* Return the localization of a name whose original writing is not ASCII.
18395b7b453SJohn Marino    NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal
18495b7b453SJohn Marino    escape sequences.  NAME_ASCII is a fallback written only with ASCII
18595b7b453SJohn Marino    characters.  */
18695b7b453SJohn Marino 
18795b7b453SJohn Marino const char *
proper_name_utf8(const char * name_ascii,const char * name_utf8)18895b7b453SJohn Marino proper_name_utf8 (const char *name_ascii, const char *name_utf8)
18995b7b453SJohn Marino {
19095b7b453SJohn Marino   /* See whether there is a translation.   */
19195b7b453SJohn Marino   const char *translation = gettext (name_ascii);
19295b7b453SJohn Marino 
19395b7b453SJohn Marino   /* Try to convert NAME_UTF8 to the locale encoding.  */
19495b7b453SJohn Marino   const char *locale_code = locale_charset ();
19595b7b453SJohn Marino   char *alloc_name_converted = NULL;
19695b7b453SJohn Marino   char *alloc_name_converted_translit = NULL;
19795b7b453SJohn Marino   const char *name_converted = NULL;
19895b7b453SJohn Marino   const char *name_converted_translit = NULL;
19995b7b453SJohn Marino   const char *name;
20095b7b453SJohn Marino 
20195b7b453SJohn Marino   if (c_strcasecmp (locale_code, "UTF-8") != 0)
20295b7b453SJohn Marino     {
20395b7b453SJohn Marino #if HAVE_ICONV
20495b7b453SJohn Marino       name_converted = alloc_name_converted =
20595b7b453SJohn Marino         xstr_iconv (name_utf8, "UTF-8", locale_code);
20695b7b453SJohn Marino 
207200fbe8dSJohn Marino # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
208200fbe8dSJohn Marino       && !defined __UCLIBC__) \
20995b7b453SJohn Marino      || _LIBICONV_VERSION >= 0x0105
21095b7b453SJohn Marino       {
21195b7b453SJohn Marino         char *converted_translit;
21295b7b453SJohn Marino 
21395b7b453SJohn Marino         size_t len = strlen (locale_code);
21495b7b453SJohn Marino         char *locale_code_translit = XNMALLOC (len + 10 + 1, char);
21595b7b453SJohn Marino         memcpy (locale_code_translit, locale_code, len);
21695b7b453SJohn Marino         memcpy (locale_code_translit + len, "//TRANSLIT", 10 + 1);
21795b7b453SJohn Marino 
21895b7b453SJohn Marino         converted_translit =
21995b7b453SJohn Marino           xstr_iconv (name_utf8, "UTF-8", locale_code_translit);
22095b7b453SJohn Marino 
22195b7b453SJohn Marino         free (locale_code_translit);
22295b7b453SJohn Marino 
22395b7b453SJohn Marino         if (converted_translit != NULL)
22495b7b453SJohn Marino           {
22595b7b453SJohn Marino #  if !_LIBICONV_VERSION
22695b7b453SJohn Marino             /* Don't use the transliteration if it added question marks.
22795b7b453SJohn Marino                glibc's transliteration falls back to question marks; libiconv's
22895b7b453SJohn Marino                transliteration does not.
22995b7b453SJohn Marino                mbschr is equivalent to strchr in this case.  */
23095b7b453SJohn Marino             if (strchr (converted_translit, '?') != NULL)
23195b7b453SJohn Marino               free (converted_translit);
23295b7b453SJohn Marino             else
23395b7b453SJohn Marino #  endif
23495b7b453SJohn Marino               name_converted_translit = alloc_name_converted_translit =
23595b7b453SJohn Marino                 converted_translit;
23695b7b453SJohn Marino           }
23795b7b453SJohn Marino       }
23895b7b453SJohn Marino # endif
23995b7b453SJohn Marino #endif
24095b7b453SJohn Marino     }
24195b7b453SJohn Marino   else
24295b7b453SJohn Marino     {
24395b7b453SJohn Marino       name_converted = name_utf8;
24495b7b453SJohn Marino       name_converted_translit = name_utf8;
24595b7b453SJohn Marino     }
24695b7b453SJohn Marino 
24795b7b453SJohn Marino   /* The name in locale encoding.  */
24895b7b453SJohn Marino   name = (name_converted != NULL ? name_converted :
24995b7b453SJohn Marino           name_converted_translit != NULL ? name_converted_translit :
25095b7b453SJohn Marino           name_ascii);
25195b7b453SJohn Marino 
25295b7b453SJohn Marino   /* See whether we have a translation.  Some translators have not understood
25395b7b453SJohn Marino      that they should use the UTF-8 form of the name, if possible.  So if the
25495b7b453SJohn Marino      translator provided a no-op translation, we ignore it.  */
25595b7b453SJohn Marino   if (strcmp (translation, name_ascii) != 0)
25695b7b453SJohn Marino     {
25795b7b453SJohn Marino       /* See whether the translation contains the original name.  */
25895b7b453SJohn Marino       if (mbsstr_trimmed_wordbounded (translation, name_ascii)
25995b7b453SJohn Marino           || (name_converted != NULL
26095b7b453SJohn Marino               && mbsstr_trimmed_wordbounded (translation, name_converted))
26195b7b453SJohn Marino           || (name_converted_translit != NULL
26295b7b453SJohn Marino               && mbsstr_trimmed_wordbounded (translation, name_converted_translit)))
26395b7b453SJohn Marino         {
26495b7b453SJohn Marino           if (alloc_name_converted != NULL)
26595b7b453SJohn Marino             free (alloc_name_converted);
26695b7b453SJohn Marino           if (alloc_name_converted_translit != NULL)
26795b7b453SJohn Marino             free (alloc_name_converted_translit);
26895b7b453SJohn Marino           return translation;
26995b7b453SJohn Marino         }
27095b7b453SJohn Marino       else
27195b7b453SJohn Marino         {
27295b7b453SJohn Marino           /* Return "TRANSLATION (NAME)".  */
27395b7b453SJohn Marino           char *result =
27495b7b453SJohn Marino             XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
27595b7b453SJohn Marino 
27695b7b453SJohn Marino           sprintf (result, "%s (%s)", translation, name);
27795b7b453SJohn Marino 
27895b7b453SJohn Marino           if (alloc_name_converted != NULL)
27995b7b453SJohn Marino             free (alloc_name_converted);
28095b7b453SJohn Marino           if (alloc_name_converted_translit != NULL)
28195b7b453SJohn Marino             free (alloc_name_converted_translit);
28295b7b453SJohn Marino           return result;
28395b7b453SJohn Marino         }
28495b7b453SJohn Marino     }
28595b7b453SJohn Marino   else
28695b7b453SJohn Marino     {
28795b7b453SJohn Marino       if (alloc_name_converted != NULL && alloc_name_converted != name)
28895b7b453SJohn Marino         free (alloc_name_converted);
28995b7b453SJohn Marino       if (alloc_name_converted_translit != NULL
29095b7b453SJohn Marino           && alloc_name_converted_translit != name)
29195b7b453SJohn Marino         free (alloc_name_converted_translit);
29295b7b453SJohn Marino       return name;
29395b7b453SJohn Marino     }
29495b7b453SJohn Marino }
29595b7b453SJohn Marino 
#ifdef TEST1
# include <locale.h>
/* Manual test driver for mbsstr_trimmed_wordbounded.
   Usage: PROGRAM STRING SUB
   Prints "found" when STRING contains the trimmed SUB at word boundaries,
   evaluated in the locale selected by the environment.  Exits with
   EXIT_FAILURE when fewer than two arguments are given.  */
int
main (int argc, char *argv[])
{
  /* argv[1] and argv[2] are only valid strings when argc >= 3.  */
  if (argc < 3)
    {
      fprintf (stderr, "Usage: %s STRING SUB\n",
               argc > 0 && argv[0] != NULL ? argv[0] : "propername");
      return EXIT_FAILURE;
    }

  setlocale (LC_ALL, "");
  if (mbsstr_trimmed_wordbounded (argv[1], argv[2]))
    printf ("found\n");
  return 0;
}
#endif
30795b7b453SJohn Marino 
#ifdef TEST2
# include <locale.h>
# include <stdio.h>
/* Manual test driver for proper_name_utf8: prints the localized form of a
   fixed sample name, using the locale selected by the environment.
   Command-line arguments are ignored.  */
int
main (int argc, char *argv[])
{
  const char *localized;

  /* The locale must be set before the lookup.  */
  setlocale (LC_ALL, "");
  localized = proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard");
  printf ("%s\n", localized);
  return 0;
}
#endif
319