xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/src/filter-sr-latin.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* Recode Serbian text from Cyrillic to Latin script.
2    Copyright (C) 2006 Free Software Foundation, Inc.
3    Written by Danilo Šegan <danilo@gnome.org>, 2006,
4    and Bruno Haible <bruno@clisp.org>, 2006.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, write to the Free Software Foundation,
18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
19 
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
23 
24 /* Specification.  */
25 #include "filters.h"
26 
27 #include <stdlib.h>
28 
29 #include "xalloc.h"
30 
31 
/* Table for Serbian Cyrillic to Latin transcription.
   The table is indexed by the Unicode code point minus 0x0400, and covers
   the range U+0400..U+045F (96 entries).
   Each entry is the replacement, encoded in UTF-8, as a NUL-terminated
   string.  An empty string means that no replacement exists; such
   characters are copied to the output unchanged by serbian_to_latin.
   The entries for U+0409, U+040A, U+040F hold the mixed-case form ("Lj",
   "Nj", "Dž"); serbian_to_latin substitutes the all-uppercase form when
   the surrounding characters are uppercase.
   The longest table entry is three bytes long, hence the element size of
   3 + 1 bytes (including the terminating NUL).  */
static const char table[96][3 + 1] =
{
  /* U+0400 */ "",
  /* U+0401 */ "",
  /* U+0402 */ "\xC4\x90", /* "Đ" */
  /* U+0403 */ "",
  /* U+0404 */ "",
  /* U+0405 */ "",
  /* U+0406 */ "",
  /* U+0407 */ "",
  /* U+0408 */ "J",
  /* U+0409 */ "Lj",
  /* U+040A */ "Nj",
  /* U+040B */ "\xC4\x86", /* "Ć" */
  /* U+040C */ "",
  /* U+040D */ "",
  /* U+040E */ "",
  /* U+040F */ "D\xC5\xBE", /* "Dž" */
  /* U+0410 */ "A",
  /* U+0411 */ "B",
  /* U+0412 */ "V",
  /* U+0413 */ "G",
  /* U+0414 */ "D",
  /* U+0415 */ "E",
  /* U+0416 */ "\xC5\xBD", /* "Ž" */
  /* U+0417 */ "Z",
  /* U+0418 */ "I",
  /* U+0419 */ "",
  /* U+041A */ "K",
  /* U+041B */ "L",
  /* U+041C */ "M",
  /* U+041D */ "N",
  /* U+041E */ "O",
  /* U+041F */ "P",
  /* U+0420 */ "R",
  /* U+0421 */ "S",
  /* U+0422 */ "T",
  /* U+0423 */ "U",
  /* U+0424 */ "F",
  /* U+0425 */ "H",
  /* U+0426 */ "C",
  /* U+0427 */ "\xC4\x8C", /* "Č" */
  /* U+0428 */ "\xC5\xA0", /* "Š" */
  /* U+0429 */ "",
  /* U+042A */ "",
  /* U+042B */ "",
  /* U+042C */ "",
  /* U+042D */ "",
  /* U+042E */ "",
  /* U+042F */ "",
  /* U+0430 */ "a",
  /* U+0431 */ "b",
  /* U+0432 */ "v",
  /* U+0433 */ "g",
  /* U+0434 */ "d",
  /* U+0435 */ "e",
  /* U+0436 */ "\xC5\xBE", /* "ž" */
  /* U+0437 */ "z",
  /* U+0438 */ "i",
  /* U+0439 */ "",
  /* U+043A */ "k",
  /* U+043B */ "l",
  /* U+043C */ "m",
  /* U+043D */ "n",
  /* U+043E */ "o",
  /* U+043F */ "p",
  /* U+0440 */ "r",
  /* U+0441 */ "s",
  /* U+0442 */ "t",
  /* U+0443 */ "u",
  /* U+0444 */ "f",
  /* U+0445 */ "h",
  /* U+0446 */ "c",
  /* U+0447 */ "\xC4\x8D", /* "č" */
  /* U+0448 */ "\xC5\xA1", /* "š" */
  /* U+0449 */ "",
  /* U+044A */ "",
  /* U+044B */ "",
  /* U+044C */ "",
  /* U+044D */ "",
  /* U+044E */ "",
  /* U+044F */ "",
  /* U+0450 */ "",
  /* U+0451 */ "",
  /* U+0452 */ "\xC4\x91", /* "đ" */
  /* U+0453 */ "",
  /* U+0454 */ "",
  /* U+0455 */ "",
  /* U+0456 */ "",
  /* U+0457 */ "",
  /* U+0458 */ "j",
  /* U+0459 */ "lj",
  /* U+045A */ "nj",
  /* U+045B */ "\xC4\x87", /* "ć" */
  /* U+045C */ "",
  /* U+045D */ "",
  /* U+045E */ "",
  /* U+045F */ "d\xC5\xBE" /* "dž" */
};
134 
/* Quick test for an uppercase character in the range U+0041..U+005A.
   The argument must be a byte in the range 0..UCHAR_MAX.
   The subtraction followed by the truncation to 'unsigned char' makes every
   value below 'A' wrap around to a value larger than 'Z' - 'A', so that a
   single comparison checks both the lower and the upper bound.  The
   argument is evaluated exactly once.  */
#define IS_UPPERCASE_LATIN(byte) \
  ((unsigned char) ((byte) - 'A') <= 'Z' - 'A')
139 
/* Quick test for an uppercase character in the range U+0400..U+042F.
   The arguments must be bytes in the range 0..UCHAR_MAX.
   BYTE1 and BYTE2 are the two bytes of the UTF-8 encoding; the range
   corresponds to the byte sequences \xD0\x80..\xD0\xAF.  As with
   IS_UPPERCASE_LATIN, the truncation to 'unsigned char' lets a single
   comparison check both bounds of BYTE2.  BYTE1 is evaluated once; BYTE2
   is evaluated only if BYTE1 is 0xd0.  */
#define IS_UPPERCASE_CYRILLIC(byte1,byte2) \
  ((byte1) == 0xd0 && (unsigned char) ((byte2) - 0x80) < 0x30)
144 
145 void
serbian_to_latin(const char * input,size_t input_len,char ** output_p,size_t * output_len_p)146 serbian_to_latin (const char *input, size_t input_len,
147 		  char **output_p, size_t *output_len_p)
148 {
149   /* Loop through the input string, producing a replacement for each character.
150      Only characters in the range U+0400..U+045F (\xD0\x80..\xD1\x9F) need to
151      be handled, and more precisely only those for which a replacement exists
152      in the table.  Other characters are copied without modification.
153      The characters U+0409, U+040A, U+040F are transliterated to uppercase or
154      mixed-case replacements ("LJ" / "Lj", "NJ" / "Nj", "DŽ" / "Dž"), depending
155      on the case of the surrounding characters.
156      Since we assume UTF-8 encoding, the bytes \xD0..\xD1 can only occur at the
157      beginning of a character; the second and further bytes of a character are
158      all in the range \x80..\xBF.  */
159 
160   /* Since sequences of 2 bytes are sequences of at most 3 bytes, the size
161      of the output will be at most 1.5 * input_len.  */
162   size_t allocated = input_len + (input_len >> 1);
163   char *output = (char *) xmalloc (allocated);
164 
165   const char *input_end = input + input_len;
166   const char *ip;
167   char *op;
168 
169   for (ip = input, op = output; ip < input_end; )
170     {
171       unsigned char byte = (unsigned char) *ip;
172 
173       /* Test for the first byte of a Cyrillic character.  */
174       if ((byte >= 0xd0 && byte <= 0xd1) && (ip + 1 < input_end))
175 	{
176 	  unsigned char second_byte = (unsigned char) ip[1];
177 
178 	  /* Verify the second byte is valid.  */
179 	  if (second_byte >= 0x80 && second_byte < 0xc0)
180 	    {
181 	      unsigned int uc = ((byte & 0x1f) << 6) | (second_byte & 0x3f);
182 
183 	      if (uc >= 0x0400 && uc <= 0x045f)
184 		{
185 		  /* Look up replacement from the table.  */
186 		  const char *repl = table[uc - 0x0400];
187 
188 		  if (repl[0] != '\0')
189 		    {
190 		      /* Found a replacement.
191 			 Now handle the special cases.  */
192 		      if (uc == 0x0409 || uc == 0x040a || uc == 0x040f)
193 			if ((ip + 2 < input_end
194 			     && IS_UPPERCASE_LATIN ((unsigned char) ip[2]))
195 			    || (ip + 3 < input_end
196 				&& IS_UPPERCASE_CYRILLIC ((unsigned char) ip[2],
197 							  (unsigned char) ip[3]))
198 			    || (ip >= input + 1
199 				&& IS_UPPERCASE_LATIN ((unsigned char) ip[-1]))
200 			    || (ip >= input + 2
201 				&& IS_UPPERCASE_CYRILLIC ((unsigned char) ip[-2],
202 							  (unsigned char) ip[-1])))
203 			  {
204 			    /* Use the upper-case replacement instead of
205 			       the mixed-case replacement.  */
206 			    switch (uc)
207 			      {
208 			      case 0x0409:
209 				repl = "LJ"; break;
210 			      case 0x040a:
211 				repl = "NJ"; break;
212 			      case 0x040f:
213 				repl = "D\xC5\xBD"/* "DŽ" */; break;
214 			      default:
215 				abort ();
216 			      }
217 			  }
218 
219 		      /* Use the replacement.  */
220 		      *op++ = *repl++;
221 		      if (*repl != '\0')
222 			{
223 			  *op++ = *repl++;
224 			  if (*repl != '\0')
225 			    {
226 			      *op++ = *repl++;
227 			      /* All replacements have at most 3 bytes.  */
228 			      if (*repl != '\0')
229 				abort ();
230 			    }
231 			}
232 		      ip += 2;
233 		      continue;
234 		    }
235 		}
236 	    }
237 	}
238       *op++ = *ip++;
239     }
240 
241   {
242     size_t output_len = op - output;
243 
244     /* Verify that the allocated size was not exceeded.  */
245     if (output_len > allocated)
246       abort ();
247     /* Shrink the result.  */
248     if (output_len < allocated)
249       output = (char *) xrealloc (output, output_len);
250 
251     /* Done.  */
252     *output_p = output;
253     *output_len_p = output_len;
254   }
255 }
256