xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/src/read-mo.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* Reading binary .mo files.
2    Copyright (C) 1995-1998, 2000-2006 Free Software Foundation, Inc.
3    Written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, April 1995.
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 2, or (at your option)
8    any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program; if not, write to the Free Software Foundation,
17    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
18 
19 #ifdef HAVE_CONFIG_H
20 # include <config.h>
21 #endif
22 
23 /* Specification.  */
24 #include "read-mo.h"
25 
26 #include <errno.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stddef.h>
30 #include <stdlib.h>
31 #include <string.h>
32 
33 /* This include file describes the main part of binary .mo format.  */
34 #include "gmo.h"
35 
36 #include "error.h"
37 #include "xalloc.h"
38 #include "binary-io.h"
39 #include "exit.h"
40 #include "message.h"
41 #include "format.h"
42 #include "gettext.h"
43 
44 #define _(str) gettext (str)
45 
46 
/* In-memory image of a .mo file.  The whole file is slurped into one
   buffer up front, which is cheaper than seeking around in the stream
   for every table entry.  */
struct binary_mo_file
{
  /* Name of the file, used in diagnostics.  */
  const char *filename;

  /* The raw bytes of the file.  */
  char *data;

  /* Number of valid bytes in DATA.  */
  size_t size;

  /* Byte order in which the 32-bit numbers in DATA are stored.  */
  enum
  {
    MO_LITTLE_ENDIAN,
    MO_BIG_ENDIAN
  } endian;
};
56 
57 
58 /* Read the contents of the given input stream.  */
59 static void
read_binary_mo_file(struct binary_mo_file * bfp,FILE * fp,const char * filename)60 read_binary_mo_file (struct binary_mo_file *bfp,
61 		     FILE *fp, const char *filename)
62 {
63   char *buf = NULL;
64   size_t alloc = 0;
65   size_t size = 0;
66   size_t count;
67 
68   while (!feof (fp))
69     {
70       const size_t increment = 4096;
71       if (size + increment > alloc)
72 	{
73 	  alloc = alloc + alloc / 2;
74 	  if (alloc < size + increment)
75 	    alloc = size + increment;
76 	  buf = (char *) xrealloc (buf, alloc);
77 	}
78       count = fread (buf + size, 1, increment, fp);
79       if (count == 0)
80 	{
81 	  if (ferror (fp))
82 	    error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
83 		   filename);
84 	}
85       else
86 	size += count;
87     }
88   buf = (char *) xrealloc (buf, size);
89   bfp->filename = filename;
90   bfp->data = buf;
91   bfp->size = size;
92 }
93 
94 /* Get a 32-bit number from the file, at the given file position.  */
95 static nls_uint32
get_uint32(const struct binary_mo_file * bfp,size_t offset)96 get_uint32 (const struct binary_mo_file *bfp, size_t offset)
97 {
98   nls_uint32 b0, b1, b2, b3;
99 
100   if (offset + 4 > bfp->size)
101     error (EXIT_FAILURE, 0, _("file \"%s\" is truncated"), bfp->filename);
102 
103   b0 = *(unsigned char *) (bfp->data + offset + 0);
104   b1 = *(unsigned char *) (bfp->data + offset + 1);
105   b2 = *(unsigned char *) (bfp->data + offset + 2);
106   b3 = *(unsigned char *) (bfp->data + offset + 3);
107   if (bfp->endian == MO_LITTLE_ENDIAN)
108     return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
109   else
110     return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
111 }
112 
113 /* Get a static string from the file, at the given file position.  */
114 static char *
get_string(const struct binary_mo_file * bfp,size_t offset,size_t * lengthp)115 get_string (const struct binary_mo_file *bfp, size_t offset, size_t *lengthp)
116 {
117   /* See 'struct string_desc'.  */
118   nls_uint32 s_length = get_uint32 (bfp, offset);
119   nls_uint32 s_offset = get_uint32 (bfp, offset + 4);
120 
121   if (s_offset + s_length + 1 > bfp->size)
122     error (EXIT_FAILURE, 0, _("file \"%s\" is truncated"), bfp->filename);
123   if (bfp->data[s_offset + s_length] != '\0')
124     error (EXIT_FAILURE, 0,
125 	   _("file \"%s\" contains a not NUL terminated string"),
126 	   bfp->filename);
127 
128   *lengthp = s_length + 1;
129   return bfp->data + s_offset;
130 }
131 
132 /* Get a system dependent string from the file, at the given file position.  */
133 static char *
get_sysdep_string(const struct binary_mo_file * bfp,size_t offset,const struct mo_file_header * header,size_t * lengthp)134 get_sysdep_string (const struct binary_mo_file *bfp, size_t offset,
135 		   const struct mo_file_header *header, size_t *lengthp)
136 {
137   /* See 'struct sysdep_string'.  */
138   size_t length;
139   char *string;
140   size_t i;
141   char *p;
142   nls_uint32 s_offset;
143 
144   /* Compute the length.  */
145   length = 0;
146   for (i = 4; ; i += 8)
147     {
148       nls_uint32 segsize = get_uint32 (bfp, offset + i);
149       nls_uint32 sysdepref = get_uint32 (bfp, offset + i + 4);
150       nls_uint32 sysdep_segment_offset;
151       nls_uint32 ss_length;
152       nls_uint32 ss_offset;
153       size_t n;
154 
155       length += segsize;
156 
157       if (sysdepref == SEGMENTS_END)
158 	break;
159       if (sysdepref >= header->n_sysdep_segments)
160 	/* Invalid.  */
161 	  error (EXIT_FAILURE, 0, _("file \"%s\" is not in GNU .mo format"),
162 		 bfp->filename);
163       /* See 'struct sysdep_segment'.  */
164       sysdep_segment_offset = header->sysdep_segments_offset + sysdepref * 8;
165       ss_length = get_uint32 (bfp, sysdep_segment_offset);
166       ss_offset = get_uint32 (bfp, sysdep_segment_offset + 4);
167       if (ss_offset + ss_length > bfp->size)
168 	error (EXIT_FAILURE, 0, _("file \"%s\" is truncated"), bfp->filename);
169       if (!(ss_length > 0 && bfp->data[ss_offset + ss_length - 1] == '\0'))
170 	{
171 	  char location[30];
172 	  sprintf (location, "sysdep_segment[%u]", (unsigned int) sysdepref);
173 	  error (EXIT_FAILURE, 0,
174 		 _("file \"%s\" contains a not NUL terminated string, at %s"),
175 		 bfp->filename, location);
176 	}
177       n = strlen (bfp->data + ss_offset);
178       length += (n > 1 ? 1 + n + 1 : n);
179     }
180 
181   /* Allocate and fill the string.  */
182   string = (char *) xmalloc (length);
183   p = string;
184   s_offset = get_uint32 (bfp, offset);
185   for (i = 4; ; i += 8)
186     {
187       nls_uint32 segsize = get_uint32 (bfp, offset + i);
188       nls_uint32 sysdepref = get_uint32 (bfp, offset + i + 4);
189       nls_uint32 sysdep_segment_offset;
190       nls_uint32 ss_length;
191       nls_uint32 ss_offset;
192       size_t n;
193 
194       if (s_offset + segsize > bfp->size)
195 	error (EXIT_FAILURE, 0, _("file \"%s\" is truncated"), bfp->filename);
196       memcpy (p, bfp->data + s_offset, segsize);
197       p += segsize;
198       s_offset += segsize;
199 
200       if (sysdepref == SEGMENTS_END)
201 	break;
202       if (sysdepref >= header->n_sysdep_segments)
203 	abort ();
204       /* See 'struct sysdep_segment'.  */
205       sysdep_segment_offset = header->sysdep_segments_offset + sysdepref * 8;
206       ss_length = get_uint32 (bfp, sysdep_segment_offset);
207       ss_offset = get_uint32 (bfp, sysdep_segment_offset + 4);
208       if (ss_offset + ss_length > bfp->size)
209 	abort ();
210       if (!(ss_length > 0 && bfp->data[ss_offset + ss_length - 1] == '\0'))
211 	abort ();
212       n = strlen (bfp->data + ss_offset);
213       if (n > 1)
214 	*p++ = '<';
215       memcpy (p, bfp->data + ss_offset, n);
216       p += n;
217       if (n > 1)
218 	*p++ = '>';
219     }
220 
221   if (p != string + length)
222     abort ();
223 
224   *lengthp = length;
225   return string;
226 }
227 
228 /* Reads an existing .mo file and adds the messages to mlp.  */
229 void
read_mo_file(message_list_ty * mlp,const char * filename)230 read_mo_file (message_list_ty *mlp, const char *filename)
231 {
232   FILE *fp;
233   struct binary_mo_file bf;
234   struct mo_file_header header;
235   unsigned int i;
236   static lex_pos_ty pos = { __FILE__, __LINE__ };
237 
238   if (strcmp (filename, "-") == 0 || strcmp (filename, "/dev/stdin") == 0)
239     {
240       fp = stdin;
241       SET_BINARY (fileno (fp));
242     }
243   else
244     {
245       fp = fopen (filename, "rb");
246       if (fp == NULL)
247 	error (EXIT_FAILURE, errno,
248 	       _("error while opening \"%s\" for reading"), filename);
249     }
250 
251   /* Read the file contents into memory.  */
252   read_binary_mo_file (&bf, fp, filename);
253 
254   /* Get a 32-bit number from the file header.  */
255 # define GET_HEADER_FIELD(field) \
256     get_uint32 (&bf, offsetof (struct mo_file_header, field))
257 
258   /* We must grope the file to determine which endian it is.
259      Perversity of the universe tends towards maximum, so it will
260      probably not match the currently executing architecture.  */
261   bf.endian = MO_BIG_ENDIAN;
262   header.magic = GET_HEADER_FIELD (magic);
263   if (header.magic != _MAGIC)
264     {
265       bf.endian = MO_LITTLE_ENDIAN;
266       header.magic = GET_HEADER_FIELD (magic);
267       if (header.magic != _MAGIC)
268 	{
269 	unrecognised:
270 	  error (EXIT_FAILURE, 0, _("file \"%s\" is not in GNU .mo format"),
271 		 filename);
272 	}
273     }
274 
275   header.revision = GET_HEADER_FIELD (revision);
276 
277   /* We support only the major revisions 0 and 1.  */
278   switch (header.revision >> 16)
279     {
280     case 0:
281     case 1:
282       /* Fill the header parts that apply to major revisions 0 and 1.  */
283       header.nstrings = GET_HEADER_FIELD (nstrings);
284       header.orig_tab_offset = GET_HEADER_FIELD (orig_tab_offset);
285       header.trans_tab_offset = GET_HEADER_FIELD (trans_tab_offset);
286       header.hash_tab_size = GET_HEADER_FIELD (hash_tab_size);
287       header.hash_tab_offset = GET_HEADER_FIELD (hash_tab_offset);
288 
289       for (i = 0; i < header.nstrings; i++)
290 	{
291 	  message_ty *mp;
292 	  char *msgctxt;
293 	  char *msgid;
294 	  size_t msgid_len;
295 	  char *separator;
296 	  char *msgstr;
297 	  size_t msgstr_len;
298 
299 	  /* Read the msgctxt and msgid.  */
300 	  msgid = get_string (&bf, header.orig_tab_offset + i * 8,
301 			      &msgid_len);
302 	  /* Split into msgctxt and msgid.  */
303 	  separator = strchr (msgid, MSGCTXT_SEPARATOR);
304 	  if (separator != NULL)
305 	    {
306 	      /* The part before the MSGCTXT_SEPARATOR is the msgctxt.  */
307 	      *separator = '\0';
308 	      msgctxt = msgid;
309 	      msgid = separator + 1;
310 	      msgid_len -= msgid - msgctxt;
311 	    }
312 	  else
313 	    msgctxt = NULL;
314 
315 	  /* Read the msgstr.  */
316 	  msgstr = get_string (&bf, header.trans_tab_offset + i * 8,
317 			       &msgstr_len);
318 
319 	  mp = message_alloc (msgctxt,
320 			      msgid,
321 			      (strlen (msgid) + 1 < msgid_len
322 			       ? msgid + strlen (msgid) + 1
323 			       : NULL),
324 			      msgstr, msgstr_len,
325 			      &pos);
326 	  message_list_append (mlp, mp);
327 	}
328 
329       switch (header.revision & 0xffff)
330 	{
331 	case 0:
332 	  break;
333 	case 1:
334 	default:
335 	  /* Fill the header parts that apply to minor revision >= 1.  */
336 	  header.n_sysdep_segments = GET_HEADER_FIELD (n_sysdep_segments);
337 	  header.sysdep_segments_offset =
338 	    GET_HEADER_FIELD (sysdep_segments_offset);
339 	  header.n_sysdep_strings = GET_HEADER_FIELD (n_sysdep_strings);
340 	  header.orig_sysdep_tab_offset =
341 	    GET_HEADER_FIELD (orig_sysdep_tab_offset);
342 	  header.trans_sysdep_tab_offset =
343 	    GET_HEADER_FIELD (trans_sysdep_tab_offset);
344 
345 	  for (i = 0; i < header.n_sysdep_strings; i++)
346 	    {
347 	      message_ty *mp;
348 	      char *msgctxt;
349 	      char *msgid;
350 	      size_t msgid_len;
351 	      char *separator;
352 	      char *msgstr;
353 	      size_t msgstr_len;
354 	      nls_uint32 offset;
355 	      size_t f;
356 
357 	      /* Read the msgctxt and msgid.  */
358 	      offset = get_uint32 (&bf, header.orig_sysdep_tab_offset + i * 4);
359 	      msgid = get_sysdep_string (&bf, offset, &header, &msgid_len);
360 	      /* Split into msgctxt and msgid.  */
361 	      separator = strchr (msgid, MSGCTXT_SEPARATOR);
362 	      if (separator != NULL)
363 		{
364 		  /* The part before the MSGCTXT_SEPARATOR is the msgctxt.  */
365 		  *separator = '\0';
366 		  msgctxt = msgid;
367 		  msgid = separator + 1;
368 		  msgid_len -= msgid - msgctxt;
369 		}
370 	      else
371 		msgctxt = NULL;
372 
373 	      /* Read the msgstr.  */
374 	      offset = get_uint32 (&bf, header.trans_sysdep_tab_offset + i * 4);
375 	      msgstr = get_sysdep_string (&bf, offset, &header, &msgstr_len);
376 
377 	      mp = message_alloc (msgctxt,
378 				  msgid,
379 				  (strlen (msgid) + 1 < msgid_len
380 				   ? msgid + strlen (msgid) + 1
381 				   : NULL),
382 				  msgstr, msgstr_len,
383 				  &pos);
384 
385 	      /* Only messages with c-format or objc-format annotation are
386 		 recognized as having system-dependent strings by msgfmt.
387 		 Which one of the two, we don't know.  We have to guess,
388 		 assuming that c-format is more probable than objc-format and
389 		 that the .mo was likely produced by "msgfmt -c".  */
390 	      for (f = format_c; ; f = format_objc)
391 		{
392 		  bool valid = true;
393 		  struct formatstring_parser *parser = formatstring_parsers[f];
394 		  const char *str_end;
395 		  const char *str;
396 
397 		  str_end = msgid + msgid_len;
398 		  for (str = msgid; str < str_end; str += strlen (str) + 1)
399 		    {
400 		      char *invalid_reason = NULL;
401 		      void *descr = parser->parse (str, false, &invalid_reason);
402 
403 		      if (descr != NULL)
404 			parser->free (descr);
405 		      else
406 			{
407 			  free (invalid_reason);
408 			  valid = false;
409 			  break;
410 			}
411 		    }
412 		  if (valid)
413 		    {
414 		      str_end = msgstr + msgstr_len;
415 		      for (str = msgstr; str < str_end; str += strlen (str) + 1)
416 			{
417 			  char *invalid_reason = NULL;
418 			  void *descr =
419 			    parser->parse (str, true, &invalid_reason);
420 
421 			  if (descr != NULL)
422 			    parser->free (descr);
423 			  else
424 			    {
425 			      free (invalid_reason);
426 			      valid = false;
427 			      break;
428 			    }
429 			}
430 		    }
431 
432 		  if (valid)
433 		    {
434 		      /* Found the most likely among c-format, objc-format.  */
435 		      mp->is_format[f] = yes;
436 		      break;
437 		    }
438 
439 		  /* Try next f.  */
440 		  if (f == format_objc)
441 		    break;
442 		}
443 
444 	      message_list_append (mlp, mp);
445 	    }
446 	  break;
447 	}
448       break;
449 
450     default:
451       goto unrecognised;
452     }
453 
454   if (fp != stdin)
455     fclose (fp);
456 }
457