xref: /netbsd-src/external/gpl2/grep/dist/intl/localcharset.c (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
1 /*	$NetBSD: localcharset.c,v 1.1.1.1 2016/01/10 21:36:18 christos Exp $	*/
2 
3 /* Determine a canonical name for the current locale's character encoding.
4 
5    Copyright (C) 2000-2002 Free Software Foundation, Inc.
6 
7    This program is free software; you can redistribute it and/or modify it
8    under the terms of the GNU Library General Public License as published
9    by the Free Software Foundation; either version 2, or (at your option)
10    any later version.
11 
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15    Library General Public License for more details.
16 
17    You should have received a copy of the GNU Library General Public
18    License along with this program; if not, write to the Free Software
19    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20    USA.  */
21 
22 /* Written by Bruno Haible <haible@clisp.cons.org>.  */
23 
24 #ifdef HAVE_CONFIG_H
25 # include <config.h>
26 #endif
27 
28 #if HAVE_STDDEF_H
29 # include <stddef.h>
30 #endif
31 
32 #include <stdio.h>
33 #if HAVE_STRING_H
34 # include <string.h>
35 #else
36 # include <strings.h>
37 #endif
38 #if HAVE_STDLIB_H
39 # include <stdlib.h>
40 #endif
41 
42 #if defined _WIN32 || defined __WIN32__
43 # undef WIN32   /* avoid warning on mingw32 */
44 # define WIN32
45 #endif
46 
47 #if defined __EMX__
48 /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
49 # define OS2
50 #endif
51 
52 #if !defined WIN32
53 # if HAVE_LANGINFO_CODESET
54 #  include <langinfo.h>
55 # else
56 #  if HAVE_SETLOCALE
57 #   include <locale.h>
58 #  endif
59 # endif
60 #elif defined WIN32
61 # define WIN32_LEAN_AND_MEAN
62 # include <windows.h>
63 #endif
64 #if defined OS2
65 # define INCL_DOS
66 # include <os2.h>
67 #endif
68 
69 #if defined _WIN32 || defined __WIN32__ || defined __EMX__ || defined __DJGPP__
70   /* Win32, OS/2, DOS */
71 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
72 #endif
73 
74 #ifndef DIRECTORY_SEPARATOR
75 # define DIRECTORY_SEPARATOR '/'
76 #endif
77 
78 #ifndef ISSLASH
79 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
80 #endif
81 
82 #ifdef HAVE_GETC_UNLOCKED
83 # undef getc
84 # define getc getc_unlocked
85 #endif
86 
87 /* The following static variable is declared 'volatile' to avoid a
88    possible multithread problem in the function get_charset_aliases. If we
89    are running in a threaded environment, and if two threads initialize
90    'charset_aliases' simultaneously, both will produce the same value,
91    and everything will be ok if the two assignments to 'charset_aliases'
92    are atomic. But I don't know what will happen if the two assignments mix.  */
93 #if __STDC__ != 1
94 # define volatile /* empty */
95 #endif
96 /* Pointer to the contents of the charset.alias file, if it has already been
97    read, else NULL.  Its format is:
98    ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
99 static const char * volatile charset_aliases;
100 
101 /* Return a pointer to the contents of the charset.alias file.  */
102 static const char *
103 get_charset_aliases ()
104 {
105   const char *cp;
106 
107   cp = charset_aliases;
108   if (cp == NULL)
109     {
110 #if !defined WIN32
111       FILE *fp;
112       const char *dir = LIBDIR;
113       const char *base = "charset.alias";
114       char *file_name;
115 
116       /* Concatenate dir and base into freshly allocated file_name.  */
117       {
118 	size_t dir_len = strlen (dir);
119 	size_t base_len = strlen (base);
120 	int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
121 	file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
122 	if (file_name != NULL)
123 	  {
124 	    memcpy (file_name, dir, dir_len);
125 	    if (add_slash)
126 	      file_name[dir_len] = DIRECTORY_SEPARATOR;
127 	    memcpy (file_name + dir_len + add_slash, base, base_len + 1);
128 	  }
129       }
130 
131       if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
132 	/* Out of memory or file not found, treat it as empty.  */
133 	cp = "";
134       else
135 	{
136 	  /* Parse the file's contents.  */
137 	  int c;
138 	  char buf1[50+1];
139 	  char buf2[50+1];
140 	  char *res_ptr = NULL;
141 	  size_t res_size = 0;
142 	  size_t l1, l2;
143 
144 	  for (;;)
145 	    {
146 	      c = getc (fp);
147 	      if (c == EOF)
148 		break;
149 	      if (c == '\n' || c == ' ' || c == '\t')
150 		continue;
151 	      if (c == '#')
152 		{
153 		  /* Skip comment, to end of line.  */
154 		  do
155 		    c = getc (fp);
156 		  while (!(c == EOF || c == '\n'));
157 		  if (c == EOF)
158 		    break;
159 		  continue;
160 		}
161 	      ungetc (c, fp);
162 	      if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
163 		break;
164 	      l1 = strlen (buf1);
165 	      l2 = strlen (buf2);
166 	      if (res_size == 0)
167 		{
168 		  res_size = l1 + 1 + l2 + 1;
169 		  res_ptr = (char *) malloc (res_size + 1);
170 		}
171 	      else
172 		{
173 		  res_size += l1 + 1 + l2 + 1;
174 		  res_ptr = (char *) realloc (res_ptr, res_size + 1);
175 		}
176 	      if (res_ptr == NULL)
177 		{
178 		  /* Out of memory. */
179 		  res_size = 0;
180 		  break;
181 		}
182 	      strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
183 	      strcpy (res_ptr + res_size - (l2 + 1), buf2);
184 	    }
185 	  fclose (fp);
186 	  if (res_size == 0)
187 	    cp = "";
188 	  else
189 	    {
190 	      *(res_ptr + res_size) = '\0';
191 	      cp = res_ptr;
192 	    }
193 	}
194 
195       if (file_name != NULL)
196 	free (file_name);
197 
198 #else
199 
200       /* To avoid the troubles of installing a separate file in the same
201 	 directory as the DLL and of retrieving the DLL's directory at
202 	 runtime, simply inline the aliases here.  */
203 
204 # if defined WIN32
205       cp = "CP936" "\0" "GBK" "\0"
206 	   "CP1361" "\0" "JOHAB" "\0";
207 # endif
208 #endif
209 
210       charset_aliases = cp;
211     }
212 
213   return cp;
214 }
215 
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.

   The raw encoding name is obtained in a platform-dependent way:
     - with nl_langinfo (CODESET) where HAVE_LANGINFO_CODESET is set;
     - otherwise from the LC_ALL, LC_CTYPE, LANG environment variables
       (first non-empty one wins);
     - on WIN32 as "CP<n>" where <n> is the value of GetACP ();
     - on OS/2 from the same environment variables, falling back to
       "CP<n>" from DosQueryCp () when none of them is set.
   It is then looked up in the table returned by get_charset_aliases ();
   an alias entry named "*" matches any name.

   The result is never NULL and must not be freed by the caller.
   If the canonical name cannot be determined, the result is a non-canonical
   name (possibly the empty string).  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset ()
{
  const char *codeset;
  const char *aliases;

#if !(defined WIN32 || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if HAVE_SETLOCALE && 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1". On others,
     you set it to "language_COUNTRY.charset". In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WIN32

  /* Room for "CP", up to 10 decimal digits and the terminating NUL.  */
  static char buf[2 + 10 + 1];

  /* Win32 has a function returning the locale's codepage as a number.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  /* Room for "CP", up to 10 decimal digits and the terminating NUL.  */
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          /* Copy the encoding without the trailer, provided it fits into
             buf together with its terminating NUL.  modifier >= dot here,
             so the difference is non-negative.  If it does not fit, fall
             through and resolve the whole locale name instead.  */
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* ULONG is assumed to be 'unsigned long' (as in the OS/2 toolkit
             headers); the cast makes the argument match the %lu conversion
             in any case.  */
          sprintf (buf, "CP%lu", (unsigned long) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table consists of (alias, canonical) string pairs
     and ends with an empty string, so each iteration skips two strings.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        /* The canonical name immediately follows the alias.  */
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  return codeset;
}
348