xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/gnulib-lib/linebreak.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* linebreak.c - line breaking of Unicode strings
2    Copyright (C) 2001-2003, 2006 Free Software Foundation, Inc.
3    Written by Bruno Haible <haible@clisp.cons.org>, 2001.
4 
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
9 
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 GNU General Public License for more details.
14 
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
18 
19 #include <config.h>
20 
21 /* Specification.  */
22 #include "linebreak.h"
23 
24 #include <stdlib.h>
25 #include <string.h>
26 #include "c-ctype.h"
27 #include "xsize.h"
28 
29 #include "utf8-ucs4.h"
30 
31 #ifdef unused
32 #include "utf16-ucs4.h"
33 
/* Fetch the next character of the UTF-32 string S into *PUC.
   A UTF-32 unit is always a complete character, so exactly one unit is
   read and N (the number of units available) is not examined.
   Returns the number of units consumed, which is always 1.  */
static inline int
u32_mbtouc (unsigned int *puc, const unsigned int *s, size_t n)
{
  (void) n; /* kept only for signature parity with u8_mbtouc/u16_mbtouc */
  *puc = *s;
  return 1;
}
40 #endif
41 
42 
43 /* Help GCC to generate good code for string comparisons with
44    immediate strings. */
45 #if defined (__GNUC__) && defined (__OPTIMIZE__)
46 
/* Final step of the STREQ comparison chain: the first nine characters of
   S1 and S2 are already known to be equal and non-NUL, so compare the
   remainders with strcmp.  Returns 1 if they are equal, 0 otherwise.  */
static inline int
streq9 (const char *s1, const char *s2)
{
  return strcmp (s1 + 9, s2 + 9) == 0;
}
52 
/* Step 8 of the STREQ chain: S1[0..7] is known to match S2[0..7].
   S28 is the character S2[8], passed as an immediate.
   Returns 1 if S1 equals S2, 0 otherwise.  */
static inline int
streq8 (const char *s1, const char *s2, char s28)
{
  if (s1[8] != s28)
    return 0;
  /* Both strings terminate at this position: they are equal.  */
  if (s28 == 0)
    return 1;
  return streq9 (s1, s2);
}
66 
/* Step 7 of the STREQ chain: S1[0..6] is known to match S2[0..6].
   S27 and S28 are the characters S2[7] and S2[8], passed as immediates.
   Returns 1 if S1 equals S2, 0 otherwise.  */
static inline int
streq7 (const char *s1, const char *s2, char s27, char s28)
{
  if (s1[7] != s27)
    return 0;
  /* Both strings terminate at this position: they are equal.  */
  if (s27 == 0)
    return 1;
  return streq8 (s1, s2, s28);
}
80 
/* Step 6 of the STREQ chain: S1[0..5] is known to match S2[0..5].
   S26..S28 are the characters S2[6]..S2[8], passed as immediates.
   Returns 1 if S1 equals S2, 0 otherwise.  */
static inline int
streq6 (const char *s1, const char *s2, char s26, char s27, char s28)
{
  if (s1[6] != s26)
    return 0;
  /* Both strings terminate at this position: they are equal.  */
  if (s26 == 0)
    return 1;
  return streq7 (s1, s2, s27, s28);
}
94 
/* Step 5 of the STREQ chain: S1[0..4] is known to match S2[0..4].
   S25..S28 are the characters S2[5]..S2[8], passed as immediates.
   Returns 1 if S1 equals S2, 0 otherwise.  */
static inline int
streq5 (const char *s1, const char *s2, char s25, char s26, char s27, char s28)
{
  if (s1[5] != s25)
    return 0;
  /* Both strings terminate at this position: they are equal.  */
  if (s25 == 0)
    return 1;
  return streq6 (s1, s2, s26, s27, s28);
}
108 
/* Step 4 of the STREQ chain: S1[0..3] is known to match S2[0..3].
   S24..S28 are the characters S2[4]..S2[8], passed as immediates.
   Returns 1 if S1 equals S2, 0 otherwise.  */
static inline int
streq4 (const char *s1, const char *s2, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[4] != s24)
    return 0;
  /* Both strings terminate at this position: they are equal.  */
  if (s24 == 0)
    return 1;
  return streq5 (s1, s2, s25, s26, s27, s28);
}
122 
/* Step 3 of the STREQ chain: S1[0..2] is known to match S2[0..2].
   S23..S28 are the characters S2[3]..S2[8], passed as immediates.
   Returns 1 if S1 equals S2, 0 otherwise.  */
static inline int
streq3 (const char *s1, const char *s2, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[3] != s23)
    return 0;
  /* Both strings terminate at this position: they are equal.  */
  if (s23 == 0)
    return 1;
  return streq4 (s1, s2, s24, s25, s26, s27, s28);
}
136 
/* Step 2 of the STREQ chain: S1[0..1] is known to match S2[0..1].
   S22..S28 are the characters S2[2]..S2[8], passed as immediates.
   Returns 1 if S1 equals S2, 0 otherwise.  */
static inline int
streq2 (const char *s1, const char *s2, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[2] != s22)
    return 0;
  /* Both strings terminate at this position: they are equal.  */
  if (s22 == 0)
    return 1;
  return streq3 (s1, s2, s23, s24, s25, s26, s27, s28);
}
150 
/* Step 1 of the STREQ chain: S1[0] is known to match S2[0].
   S21..S28 are the characters S2[1]..S2[8], passed as immediates.
   Returns 1 if S1 equals S2, 0 otherwise.  */
static inline int
streq1 (const char *s1, const char *s2, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[1] != s21)
    return 0;
  /* Both strings terminate at this position: they are equal.  */
  if (s21 == 0)
    return 1;
  return streq2 (s1, s2, s22, s23, s24, s25, s26, s27, s28);
}
164 
/* Entry point of the STREQ chain: compare S1 with S2 one character at a
   time.  S20..S28 are the characters S2[0]..S2[8], passed as immediates
   (zero-padded after the end of S2).
   Returns 1 if S1 equals S2, 0 otherwise.  */
static inline int
streq0 (const char *s1, const char *s2, char s20, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[0] != s20)
    return 0;
  /* Both strings are empty: they are equal.  */
  if (s20 == 0)
    return 1;
  return streq1 (s1, s2, s21, s22, s23, s24, s25, s26, s27, s28);
}
178 
/* Compare S1 with the string S2.  The first nine characters of S2 are also
   passed individually as S20..S28 (zero-padded) so that the inlined streq0
   chain can compare against immediate values.  */
#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
  streq0 (s1, s2, s20, s21, s22, s23, s24, s25, s26, s27, s28)
181 
182 #else
183 
/* Fallback used when the streq0 chain is not compiled in: a plain strcmp
   of S1 and S2.  The individual character arguments S20..S28 are ignored.  */
#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
  (strcmp ((s1), (s2)) == 0)
186 
187 #endif
188 
189 
/* Return 1 if ENCODING names one of the legacy CJK encodings listed below,
   0 otherwise.  The comparison is exact (case-sensitive); ENCODING must be
   a valid NUL-terminated string.  */
static int
is_cjk_encoding (const char *encoding)
{
  return (0
          /* Legacy Japanese encodings */
          || STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)
          /* Legacy Chinese encodings */
          || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
          || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
          || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
          || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
          /* Legacy Korean encodings */
          || STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
          || STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0)
          || STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0));
}
208 
/* Return 1 if ENCODING is exactly the string "UTF-8", 0 otherwise.
   ENCODING must be a valid NUL-terminated string.  */
static int
is_utf8_encoding (const char *encoding)
{
  return STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0);
}
216 
217 
/* Determine number of column positions required for UC.
   Forward declaration; the definition follows the non-spacing tables.  */
int uc_width (unsigned int uc, const char *encoding);
220 
/*
 * Non-spacing attribute table.
 * Consists of:
 * - Non-spacing characters; generated from PropList.txt or
 *   "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
 * - Format control characters; generated from
 *   "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
 * - Zero width characters; generated from
 *   "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
 *
 * Layout: 16 blocks of 64 bytes.  Each block is a bitmap covering 512
 * consecutive code points; which block belongs to which 512-code-point
 * range is given by nonspacing_table_ind.  Within a block, the flag for
 * code point UC is bit (UC & 7) of byte ((UC >> 3) & 63).
 */
static const unsigned char nonspacing_table_data[16*64] = {
  /* 0x0000-0x01ff */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x20, 0x00, 0x00, /* 0x0080-0x00bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */
  /* 0x0200-0x03ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */
  0xff, 0xff, 0xff, 0xe0, 0xff, 0xff, 0x00, 0x00, /* 0x0340-0x037f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */
  /* 0x0400-0x05ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */
  0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */
  0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */
  0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
  /* 0x0600-0x07ff */
  0x0f, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
  0x00, 0xf8, 0xff, 0x01, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
  0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
  0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
  0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
  0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */
  /* 0x0800-0x09ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */
  0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
  0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */
  /* 0x0a00-0x0bff */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */
  0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */
  0xbe, 0x21, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */
  0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */
  0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */
  0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */
  /* 0x0c00-0x0dff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */
  0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0c80-0x0cbf */
  0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */
  0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */
  0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */
  /* 0x0e00-0x0fff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */
  0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */
  0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */
  0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */
  0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */
  /* 0x1000-0x11ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */
  /* 0x1600-0x17ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */
  0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, /* 0x1700-0x173f */
  0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, /* 0x1740-0x177f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x3f, /* 0x1780-0x17bf */
  0x40, 0xfe, 0x0f, 0x20, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */
  /* 0x1800-0x19ff */
  0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */
  0x00, 0x00, 0x00, 0x00, 0x87, 0x0f, 0x04, 0x0e, /* 0x1900-0x193f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */
  /* 0x2000-0x21ff */
  0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */
  0x00, 0x00, 0x00, 0x00, 0x0f, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */
  0x00, 0x00, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00, /* 0x20c0-0x20ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */
  /* 0x3000-0x31ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */
  0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */
  /* 0xfa00-0xfbff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */
  0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */
  /* 0xfe00-0xffff */
  0xff, 0xff, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, /* 0xffc0-0xffff */
  /* 0x1d000-0x1d1ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d000-0x1d03f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d040-0x1d07f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d080-0x1d0bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d0c0-0x1d0ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d100-0x1d13f */
  0x00, 0x00, 0x00, 0x00, 0x80, 0x03, 0x00, 0xf8, /* 0x1d140-0x1d17f */
  0xe7, 0x0f, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, /* 0x1d180-0x1d1bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* 0x1d1c0-0x1d1ff */
};
/* Index into nonspacing_table_data, one entry per 512 code points
   (entry number is UC >> 9).  A value >= 0 is the number of the 64-byte
   block in nonspacing_table_data that holds the bitmap for that range;
   -1 means the range has no entries in the table.  */
static const signed char nonspacing_table_ind[240] = {
   0,  1,  2,  3,  4,  5,  6,  7, /* 0x0000-0x0fff */
   8, -1, -1,  9, 10, -1, -1, -1, /* 0x1000-0x1fff */
  11, -1, -1, -1, -1, -1, -1, -1, /* 0x2000-0x2fff */
  12, -1, -1, -1, -1, -1, -1, -1, /* 0x3000-0x3fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x4000-0x4fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x5000-0x5fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x6000-0x6fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x7000-0x7fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x8000-0x8fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x9000-0x9fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa000-0xafff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb000-0xbfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xc000-0xcfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xd000-0xdfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */
  -1, -1, -1, -1, -1, 13, -1, 14, /* 0xf000-0xffff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10000-0x10fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x11000-0x11fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x12000-0x12fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x13000-0x13fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x14000-0x14fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x15000-0x15fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x16000-0x16fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x17000-0x17fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x18000-0x18fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x19000-0x19fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1a000-0x1afff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1b000-0x1bfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1c000-0x1cfff */
  15, -1, -1, -1, -1, -1, -1, -1  /* 0x1d000-0x1dfff */
};
409 
410 /* Determine number of column positions required for UC. */
411 int
uc_width(unsigned int uc,const char * encoding)412 uc_width (unsigned int uc, const char *encoding)
413 {
414   /* Test for non-spacing or control character.  */
415   if ((uc >> 9) < 240)
416     {
417       int ind = nonspacing_table_ind[uc >> 9];
418       if (ind >= 0)
419 	if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1)
420 	  {
421 	    if (uc > 0 && uc < 0xa0)
422 	      return -1;
423 	    else
424 	      return 0;
425 	  }
426     }
427   else if ((uc >> 9) == (0xe0000 >> 9))
428     {
429       if (uc < 0xe0100
430 	  ? (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001)
431 	  : (uc <= 0xe01ef))
432 	return 0;
433     }
434   /* Test for double-width character.
435    * Generated from "grep '^....;[WF]' EastAsianWidth.txt"
436    * and            "grep '^....;[^WF]' EastAsianWidth.txt"
437    */
438   if (uc >= 0x1100
439       && ((uc < 0x1160) /* Hangul Jamo */
440 	  || (uc >= 0x2e80 && uc < 0x4dc0  /* CJK */
441 	      && !(uc == 0x303f))
442 	  || (uc >= 0x4e00 && uc < 0xa4d0) /* CJK ... Yi */
443 	  || (uc >= 0xac00 && uc < 0xd7a4) /* Hangul Syllables */
444 	  || (uc >= 0xf900 && uc < 0xfb00) /* CJK Compatibility Ideographs */
445 	  || (uc >= 0xfe30 && uc < 0xfe70) /* CJK Compatibility Forms */
446 	  || (uc >= 0xff00 && uc < 0xff61) /* Fullwidth Forms */
447 	  || (uc >= 0xffe0 && uc < 0xffe7)
448 	  || (uc >= 0x20000 && uc <= 0x2fffd) /* CJK, CJK Compatibility Ideographs */
449 	  || (uc >= 0x30000 && uc <= 0x3fffd)
450      )   )
451     return 2;
452   /* In ancient CJK encodings, Cyrillic and most other characters are
453      double-width as well.  */
454   if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
455       && is_cjk_encoding (encoding))
456     return 2;
457   return 1;
458 }
459 
460 
461 #ifdef unused
462 
/* Determine number of column positions required for first N units
   (or fewer if S ends before this) in S.  */

/* UTF-8 variant: S points to N bytes.  Scanning stops early at a NUL
   character.  Characters for which uc_width returns a negative value
   (control characters) contribute nothing to the total.  */
int
u8_width (const unsigned char *s, size_t n, const char *encoding)
{
  const unsigned char *s_end = s + n;
  int width = 0;

  while (s < s_end)
    {
      unsigned int uc;
      int w;

      s += u8_mbtouc (&uc, s, s_end - s);

      if (uc == 0)
        break; /* end of string reached */

      w = uc_width (uc, encoding);
      if (w >= 0) /* ignore control characters in the string */
        width += w;
    }

  return width;
}
489 
/* Determine the number of column positions required for the first N
   UTF-16 units of S (or fewer if a NUL character is found earlier).
   Characters for which uc_width returns a negative value (control
   characters) contribute nothing to the total.  */
int
u16_width (const unsigned short *s, size_t n, const char *encoding)
{
  const unsigned short *s_end = s + n;
  int width = 0;

  while (s < s_end)
    {
      unsigned int uc;
      int w;

      s += u16_mbtouc (&uc, s, s_end - s);

      if (uc == 0)
        break; /* end of string reached */

      w = uc_width (uc, encoding);
      if (w >= 0) /* ignore control characters in the string */
        width += w;
    }

  return width;
}
513 
/* Determine the number of column positions required for the first N
   UTF-32 units of S (or fewer if a NUL character is found earlier).
   Characters for which uc_width returns a negative value (control
   characters) contribute nothing to the total.  */
int
u32_width (const unsigned int *s, size_t n, const char *encoding)
{
  const unsigned int *s_end = s + n;
  int width = 0;

  while (s < s_end)
    {
      unsigned int uc = *s++;
      int w;

      if (uc == 0)
        break; /* end of string reached */

      w = uc_width (uc, encoding);
      if (w >= 0) /* ignore control characters in the string */
        width += w;
    }

  return width;
}
535 
536 #endif
537 
538 
539 /* Determine the line break points in S, and store the result at p[0..n-1].  */
540 /* We don't support line breaking of complex-context dependent characters
541    (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
542 
/* Line breaking classification.
   The enumerators are listed in the order of the line breaking classes;
   the numeric values are chosen so that the classes 1..19 can be used
   (minus one) as row/column indices into lbrk_table.  */

enum
{
  /* Values >= 20 are resolved at run time. */
  LBP_BK =  0, /* mandatory break */
/*LBP_CR,         carriage return - not used here because it's a DOSism */
/*LBP_LF,         line feed - not used here because it's a DOSism */
  LBP_CM = 20, /* attached characters and combining marks */
/*LBP_SG,         surrogates - not used here because they are not characters */
  LBP_ZW =  1, /* zero width space */
  LBP_IN =  2, /* inseparable */
  LBP_GL =  3, /* non-breaking (glue) */
  LBP_CB = 22, /* contingent break opportunity */
  LBP_SP = 21, /* space */
  LBP_BA =  4, /* break opportunity after */
  LBP_BB =  5, /* break opportunity before */
  LBP_B2 =  6, /* break opportunity before and after */
  LBP_HY =  7, /* hyphen */
  LBP_NS =  8, /* non starter */
  LBP_OP =  9, /* opening punctuation */
  LBP_CL = 10, /* closing punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_EX = 12, /* exclamation/interrogation */
  LBP_ID = 13, /* ideographic */
  LBP_NU = 14, /* numeric */
  LBP_IS = 15, /* infix separator (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_PR = 18, /* prefix (numeric) */
  LBP_PO = 19, /* postfix (numeric) */
  LBP_SA = 23, /* complex context (South East Asian) */
  LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
  LBP_XX = 25  /* unknown */
};
578 
579 #include "lbrkprop.h"
580 
581 static inline unsigned char
lbrkprop_lookup(unsigned int uc)582 lbrkprop_lookup (unsigned int uc)
583 {
584   unsigned int index1 = uc >> lbrkprop_header_0;
585   if (index1 < lbrkprop_header_1)
586     {
587       int lookup1 = lbrkprop.level1[index1];
588       if (lookup1 >= 0)
589         {
590           unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
591           int lookup2 = lbrkprop.level2[lookup1 + index2];
592           if (lookup2 >= 0)
593             {
594               unsigned int index3 = uc & lbrkprop_header_4;
595               return lbrkprop.level3[lookup2 + index3];
596             }
597         }
598     }
599   return LBP_XX;
600 }
601 
/* Table indexed by two line breaking classifications.
   The first index is the class of the character before the potential
   break, the second the class of the character after it; both are the
   LBP_* value minus one, so only the classes 1..19 are representable.  */
#define D 1  /* direct break opportunity, empty in table 7.3 of UTR #14 */
#define I 2  /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
#define P 3  /* prohibited break,           '^' in table 7.3 of UTR #14 */
static const unsigned char lbrk_table[19][19] = {
                                /* after */
        /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
/* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
/* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
/* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
/* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
/* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
/* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
/* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
/* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
/* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
/* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
/* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
/* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
/* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* "" */
/* before */
};
/* Note: The (B2,B2) entry should probably be D instead of P.  */
/* Note: The (PR,ID) entry should probably be D instead of I.  */
633 
634 void
u8_possible_linebreaks(const unsigned char * s,size_t n,const char * encoding,char * p)635 u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
636 {
637   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
638   const unsigned char *s_end = s + n;
639   int last_prop = LBP_BK; /* line break property of last non-space character */
640   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
641   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
642 
643   /* Don't break inside multibyte characters.  */
644   memset (p, UC_BREAK_PROHIBITED, n);
645 
646   while (s < s_end)
647     {
648       unsigned int uc;
649       int count = u8_mbtouc (&uc, s, s_end - s);
650       int prop = lbrkprop_lookup (uc);
651 
652       if (prop == LBP_BK)
653         {
654           /* Mandatory break.  */
655           *p = UC_BREAK_MANDATORY;
656           last_prop = LBP_BK;
657           seen_space = NULL;
658           seen_space2 = NULL;
659         }
660       else
661         {
662           char *q;
663 
664           /* Resolve property values whose behaviour is not fixed.  */
665           switch (prop)
666             {
667               case LBP_AI:
668                 /* Resolve ambiguous.  */
669                 prop = LBP_AI_REPLACEMENT;
670                 break;
671               case LBP_CB:
672                 /* This is arbitrary.  */
673                 prop = LBP_ID;
674                 break;
675               case LBP_SA:
676                 /* We don't handle complex scripts yet.
677                    Treat LBP_SA like LBP_XX.  */
678               case LBP_XX:
679                 /* This is arbitrary.  */
680                 prop = LBP_AL;
681                 break;
682             }
683 
684           /* Deal with combining characters.  */
685           q = p;
686           if (prop == LBP_CM)
687             {
688               /* Don't break just before a combining character.  */
689               *p = UC_BREAK_PROHIBITED;
690               /* A combining character turns a preceding space into LBP_AL.  */
691               if (seen_space != NULL)
692                 {
693                   q = seen_space;
694                   seen_space = seen_space2;
695                   prop = LBP_AL;
696                   goto lookup_via_table;
697                 }
698             }
699           else if (prop == LBP_SP)
700             {
701               /* Don't break just before a space.  */
702               *p = UC_BREAK_PROHIBITED;
703               seen_space2 = seen_space;
704               seen_space = p;
705             }
706           else
707             {
708              lookup_via_table:
709               /* prop must be usable as an index for table 7.3 of UTR #14.  */
710               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
711                 abort ();
712 
713               if (last_prop == LBP_BK)
714                 {
715                   /* Don't break at the beginning of a line.  */
716                   *q = UC_BREAK_PROHIBITED;
717                 }
718               else
719                 {
720                   switch (lbrk_table [last_prop-1] [prop-1])
721                     {
722                       case D:
723                         *q = UC_BREAK_POSSIBLE;
724                         break;
725                       case I:
726                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
727                         break;
728                       case P:
729                         *q = UC_BREAK_PROHIBITED;
730                         break;
731                       default:
732                         abort ();
733                     }
734                 }
735               last_prop = prop;
736               seen_space = NULL;
737               seen_space2 = NULL;
738             }
739         }
740 
741       s += count;
742       p += count;
743     }
744 }
745 
746 #ifdef unused
747 
748 void
u16_possible_linebreaks(const unsigned short * s,size_t n,const char * encoding,char * p)749 u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
750 {
751   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
752   const unsigned short *s_end = s + n;
753   int last_prop = LBP_BK; /* line break property of last non-space character */
754   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
755   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
756 
757   /* Don't break inside multibyte characters.  */
758   memset (p, UC_BREAK_PROHIBITED, n);
759 
760   while (s < s_end)
761     {
762       unsigned int uc;
763       int count = u16_mbtouc (&uc, s, s_end - s);
764       int prop = lbrkprop_lookup (uc);
765 
766       if (prop == LBP_BK)
767         {
768           /* Mandatory break.  */
769           *p = UC_BREAK_MANDATORY;
770           last_prop = LBP_BK;
771           seen_space = NULL;
772           seen_space2 = NULL;
773         }
774       else
775         {
776           char *q;
777 
778           /* Resolve property values whose behaviour is not fixed.  */
779           switch (prop)
780             {
781               case LBP_AI:
782                 /* Resolve ambiguous.  */
783                 prop = LBP_AI_REPLACEMENT;
784                 break;
785               case LBP_CB:
786                 /* This is arbitrary.  */
787                 prop = LBP_ID;
788                 break;
789               case LBP_SA:
790                 /* We don't handle complex scripts yet.
791                    Treat LBP_SA like LBP_XX.  */
792               case LBP_XX:
793                 /* This is arbitrary.  */
794                 prop = LBP_AL;
795                 break;
796             }
797 
798           /* Deal with combining characters.  */
799           q = p;
800           if (prop == LBP_CM)
801             {
802               /* Don't break just before a combining character.  */
803               *p = UC_BREAK_PROHIBITED;
804               /* A combining character turns a preceding space into LBP_AL.  */
805               if (seen_space != NULL)
806                 {
807                   q = seen_space;
808                   seen_space = seen_space2;
809                   prop = LBP_AL;
810                   goto lookup_via_table;
811                 }
812             }
813           else if (prop == LBP_SP)
814             {
815               /* Don't break just before a space.  */
816               *p = UC_BREAK_PROHIBITED;
817               seen_space2 = seen_space;
818               seen_space = p;
819             }
820           else
821             {
822              lookup_via_table:
823               /* prop must be usable as an index for table 7.3 of UTR #14.  */
824               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
825                 abort ();
826 
827               if (last_prop == LBP_BK)
828                 {
829                   /* Don't break at the beginning of a line.  */
830                   *q = UC_BREAK_PROHIBITED;
831                 }
832               else
833                 {
834                   switch (lbrk_table [last_prop-1] [prop-1])
835                     {
836                       case D:
837                         *q = UC_BREAK_POSSIBLE;
838                         break;
839                       case I:
840                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
841                         break;
842                       case P:
843                         *q = UC_BREAK_PROHIBITED;
844                         break;
845                       default:
846                         abort ();
847                     }
848                 }
849               last_prop = prop;
850               seen_space = NULL;
851               seen_space2 = NULL;
852             }
853         }
854 
855       s += count;
856       p += count;
857     }
858 }
859 
860 void
u32_possible_linebreaks(const unsigned int * s,size_t n,const char * encoding,char * p)861 u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
862 {
863   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
864   const unsigned int *s_end = s + n;
865   int last_prop = LBP_BK; /* line break property of last non-space character */
866   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
867   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
868 
869   while (s < s_end)
870     {
871       unsigned int uc = *s;
872       int prop = lbrkprop_lookup (uc);
873 
874       if (prop == LBP_BK)
875         {
876           /* Mandatory break.  */
877           *p = UC_BREAK_MANDATORY;
878           last_prop = LBP_BK;
879           seen_space = NULL;
880           seen_space2 = NULL;
881         }
882       else
883         {
884           char *q;
885 
886           /* Resolve property values whose behaviour is not fixed.  */
887           switch (prop)
888             {
889               case LBP_AI:
890                 /* Resolve ambiguous.  */
891                 prop = LBP_AI_REPLACEMENT;
892                 break;
893               case LBP_CB:
894                 /* This is arbitrary.  */
895                 prop = LBP_ID;
896                 break;
897               case LBP_SA:
898                 /* We don't handle complex scripts yet.
899                    Treat LBP_SA like LBP_XX.  */
900               case LBP_XX:
901                 /* This is arbitrary.  */
902                 prop = LBP_AL;
903                 break;
904             }
905 
906           /* Deal with combining characters.  */
907           q = p;
908           if (prop == LBP_CM)
909             {
910               /* Don't break just before a combining character.  */
911               *p = UC_BREAK_PROHIBITED;
912               /* A combining character turns a preceding space into LBP_AL.  */
913               if (seen_space != NULL)
914                 {
915                   q = seen_space;
916                   seen_space = seen_space2;
917                   prop = LBP_AL;
918                   goto lookup_via_table;
919                 }
920             }
921           else if (prop == LBP_SP)
922             {
923               /* Don't break just before a space.  */
924               *p = UC_BREAK_PROHIBITED;
925               seen_space2 = seen_space;
926               seen_space = p;
927             }
928           else
929             {
930              lookup_via_table:
931               /* prop must be usable as an index for table 7.3 of UTR #14.  */
932               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
933                 abort ();
934 
935               if (last_prop == LBP_BK)
936                 {
937                   /* Don't break at the beginning of a line.  */
938                   *q = UC_BREAK_PROHIBITED;
939                 }
940               else
941                 {
942                   switch (lbrk_table [last_prop-1] [prop-1])
943                     {
944                       case D:
945                         *q = UC_BREAK_POSSIBLE;
946                         break;
947                       case I:
948                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
949                         break;
950                       case P:
951                         *q = UC_BREAK_PROHIBITED;
952                         break;
953                       default:
954                         abort ();
955                     }
956                 }
957               last_prop = prop;
958               seen_space = NULL;
959               seen_space2 = NULL;
960             }
961         }
962 
963       s++;
964       p++;
965     }
966 }
967 
968 #endif
969 
970 
971 /* Choose the best line breaks, assuming the uc_width function.
972    Return the column after the end of the string.  */
973 
974 int
u8_width_linebreaks(const unsigned char * s,size_t n,int width,int start_column,int at_end_columns,const char * o,const char * encoding,char * p)975 u8_width_linebreaks (const unsigned char *s, size_t n,
976                      int width, int start_column, int at_end_columns,
977                      const char *o, const char *encoding,
978                      char *p)
979 {
980   const unsigned char *s_end;
981   char *last_p;
982   int last_column;
983   int piece_width;
984 
985   u8_possible_linebreaks (s, n, encoding, p);
986 
987   s_end = s + n;
988   last_p = NULL;
989   last_column = start_column;
990   piece_width = 0;
991   while (s < s_end)
992     {
993       unsigned int uc;
994       int count = u8_mbtouc (&uc, s, s_end - s);
995 
996       /* Respect the override.  */
997       if (o != NULL && *o != UC_BREAK_UNDEFINED)
998         *p = *o;
999 
1000       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1001         {
1002           /* An atomic piece of text ends here.  */
1003           if (last_p != NULL && last_column + piece_width > width)
1004             {
1005               /* Insert a line break.  */
1006               *last_p = UC_BREAK_POSSIBLE;
1007               last_column = 0;
1008             }
1009         }
1010 
1011       if (*p == UC_BREAK_MANDATORY)
1012         {
1013           /* uc is a line break character.  */
1014           /* Start a new piece at column 0.  */
1015           last_p = NULL;
1016           last_column = 0;
1017           piece_width = 0;
1018         }
1019       else
1020         {
1021           /* uc is not a line break character.  */
1022           int w;
1023 
1024           if (*p == UC_BREAK_POSSIBLE)
1025             {
1026               /* Start a new piece.  */
1027               last_p = p;
1028               last_column += piece_width;
1029               piece_width = 0;
1030               /* No line break for the moment, may be turned into
1031                  UC_BREAK_POSSIBLE later, via last_p. */
1032             }
1033 
1034           *p = UC_BREAK_PROHIBITED;
1035 
1036           w = uc_width (uc, encoding);
1037           if (w >= 0) /* ignore control characters in the string */
1038             piece_width += w;
1039          }
1040 
1041       s += count;
1042       p += count;
1043       if (o != NULL)
1044         o += count;
1045     }
1046 
1047   /* The last atomic piece of text ends here.  */
1048   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1049     {
1050       /* Insert a line break.  */
1051       *last_p = UC_BREAK_POSSIBLE;
1052       last_column = 0;
1053     }
1054 
1055   return last_column + piece_width;
1056 }
1057 
1058 #ifdef unused
1059 
1060 int
u16_width_linebreaks(const unsigned short * s,size_t n,int width,int start_column,int at_end_columns,const char * o,const char * encoding,char * p)1061 u16_width_linebreaks (const unsigned short *s, size_t n,
1062                       int width, int start_column, int at_end_columns,
1063                       const char *o, const char *encoding,
1064                       char *p)
1065 {
1066   const unsigned short *s_end;
1067   char *last_p;
1068   int last_column;
1069   int piece_width;
1070 
1071   u16_possible_linebreaks (s, n, encoding, p);
1072 
1073   s_end = s + n;
1074   last_p = NULL;
1075   last_column = start_column;
1076   piece_width = 0;
1077   while (s < s_end)
1078     {
1079       unsigned int uc;
1080       int count = u16_mbtouc (&uc, s, s_end - s);
1081 
1082       /* Respect the override.  */
1083       if (o != NULL && *o != UC_BREAK_UNDEFINED)
1084         *p = *o;
1085 
1086       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1087         {
1088           /* An atomic piece of text ends here.  */
1089           if (last_p != NULL && last_column + piece_width > width)
1090             {
1091               /* Insert a line break.  */
1092               *last_p = UC_BREAK_POSSIBLE;
1093               last_column = 0;
1094             }
1095         }
1096 
1097       if (*p == UC_BREAK_MANDATORY)
1098         {
1099           /* uc is a line break character.  */
1100           /* Start a new piece at column 0.  */
1101           last_p = NULL;
1102           last_column = 0;
1103           piece_width = 0;
1104         }
1105       else
1106         {
1107           /* uc is not a line break character.  */
1108           int w;
1109 
1110           if (*p == UC_BREAK_POSSIBLE)
1111             {
1112               /* Start a new piece.  */
1113               last_p = p;
1114               last_column += piece_width;
1115               piece_width = 0;
1116               /* No line break for the moment, may be turned into
1117                  UC_BREAK_POSSIBLE later, via last_p. */
1118             }
1119 
1120           *p = UC_BREAK_PROHIBITED;
1121 
1122           w = uc_width (uc, encoding);
1123           if (w >= 0) /* ignore control characters in the string */
1124             piece_width += w;
1125          }
1126 
1127       s += count;
1128       p += count;
1129       if (o != NULL)
1130         o += count;
1131     }
1132 
1133   /* The last atomic piece of text ends here.  */
1134   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1135     {
1136       /* Insert a line break.  */
1137       *last_p = UC_BREAK_POSSIBLE;
1138       last_column = 0;
1139     }
1140 
1141   return last_column + piece_width;
1142 }
1143 
1144 int
u32_width_linebreaks(const unsigned int * s,size_t n,int width,int start_column,int at_end_columns,const char * o,const char * encoding,char * p)1145 u32_width_linebreaks (const unsigned int *s, size_t n,
1146                       int width, int start_column, int at_end_columns,
1147                       const char *o, const char *encoding,
1148                       char *p)
1149 {
1150   const unsigned int *s_end;
1151   char *last_p;
1152   int last_column;
1153   int piece_width;
1154 
1155   u32_possible_linebreaks (s, n, encoding, p);
1156 
1157   s_end = s + n;
1158   last_p = NULL;
1159   last_column = start_column;
1160   piece_width = 0;
1161   while (s < s_end)
1162     {
1163       unsigned int uc = *s;
1164 
1165       /* Respect the override.  */
1166       if (o != NULL && *o != UC_BREAK_UNDEFINED)
1167         *p = *o;
1168 
1169       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1170         {
1171           /* An atomic piece of text ends here.  */
1172           if (last_p != NULL && last_column + piece_width > width)
1173             {
1174               /* Insert a line break.  */
1175               *last_p = UC_BREAK_POSSIBLE;
1176               last_column = 0;
1177             }
1178         }
1179 
1180       if (*p == UC_BREAK_MANDATORY)
1181         {
1182           /* uc is a line break character.  */
1183           /* Start a new piece at column 0.  */
1184           last_p = NULL;
1185           last_column = 0;
1186           piece_width = 0;
1187         }
1188       else
1189         {
1190           /* uc is not a line break character.  */
1191           int w;
1192 
1193           if (*p == UC_BREAK_POSSIBLE)
1194             {
1195               /* Start a new piece.  */
1196               last_p = p;
1197               last_column += piece_width;
1198               piece_width = 0;
1199               /* No line break for the moment, may be turned into
1200                  UC_BREAK_POSSIBLE later, via last_p. */
1201             }
1202 
1203           *p = UC_BREAK_PROHIBITED;
1204 
1205           w = uc_width (uc, encoding);
1206           if (w >= 0) /* ignore control characters in the string */
1207             piece_width += w;
1208          }
1209 
1210       s++;
1211       p++;
1212       if (o != NULL)
1213         o++;
1214     }
1215 
1216   /* The last atomic piece of text ends here.  */
1217   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1218     {
1219       /* Insert a line break.  */
1220       *last_p = UC_BREAK_POSSIBLE;
1221       last_column = 0;
1222     }
1223 
1224   return last_column + piece_width;
1225 }
1226 
1227 #endif
1228 
1229 
1230 #ifdef TEST1
1231 
1232 #include <stdio.h>
1233 
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.  The result is allocated with malloc/realloc and belongs to the
   caller.  On memory exhaustion or on a read error, a diagnostic is printed
   to stderr and the program exits with status 1.  */
char *
read_file (FILE *stream)
{
#define BUFSIZE 4096
  char *buf = NULL;
  /* Sizes are size_t so that inputs of 2 GiB and more neither overflow a
     signed int nor truncate the value returned by fread.  */
  size_t alloc = 0;
  size_t size = 0;

  for (;;)
    {
      size_t count;

      /* Make sure there is room for one more full chunk.  */
      if (size + BUFSIZE > alloc)
        {
          char *grown;

          alloc = alloc + alloc / 2;
          if (alloc < size + BUFSIZE)
            alloc = size + BUFSIZE;
          grown = realloc (buf, alloc);
          if (grown == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = grown;
        }

      count = fread (buf + size, 1, BUFSIZE, stream);
      size += count;

      /* A short read means either an error or end of file.  */
      if (count < BUFSIZE)
        {
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
          if (feof (stream))
            break;
        }
    }

  /* Trim the buffer to the exact size, plus the terminating NUL.  */
  {
    char *trimmed = realloc (buf, size + 1);
    if (trimmed == NULL)
      {
        fprintf (stderr, "out of memory\n");
        exit (1);
      }
    buf = trimmed;
  }
  buf[size] = '\0';
  return buf;
#undef BUFSIZE
}
1281 
1282 int
main(int argc,char * argv[])1283 main (int argc, char * argv[])
1284 {
1285   if (argc == 1)
1286     {
1287       /* Display all the break opportunities in the input string.  */
1288       char *input = read_file (stdin);
1289       int length = strlen (input);
1290       char *breaks = malloc (length);
1291       int i;
1292 
1293       u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
1294 
1295       for (i = 0; i < length; i++)
1296         {
1297           switch (breaks[i])
1298             {
1299               case UC_BREAK_POSSIBLE:
1300                 /* U+2027 in UTF-8 encoding */
1301                 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
1302                 break;
1303               case UC_BREAK_MANDATORY:
1304                 /* U+21B2 (or U+21B5) in UTF-8 encoding */
1305                 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
1306                 break;
1307               case UC_BREAK_PROHIBITED:
1308                 break;
1309               default:
1310                 abort ();
1311             }
1312           putc (input[i], stdout);
1313         }
1314 
1315       free (breaks);
1316 
1317       return 0;
1318     }
1319   else if (argc == 2)
1320     {
1321       /* Insert line breaks for a given width.  */
1322       int width = atoi (argv[1]);
1323       char *input = read_file (stdin);
1324       int length = strlen (input);
1325       char *breaks = malloc (length);
1326       int i;
1327 
1328       u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
1329 
1330       for (i = 0; i < length; i++)
1331         {
1332           switch (breaks[i])
1333             {
1334               case UC_BREAK_POSSIBLE:
1335                 putc ('\n', stdout);
1336                 break;
1337               case UC_BREAK_MANDATORY:
1338                 break;
1339               case UC_BREAK_PROHIBITED:
1340                 break;
1341               default:
1342                 abort ();
1343             }
1344           putc (input[i], stdout);
1345         }
1346 
1347       free (breaks);
1348 
1349       return 0;
1350     }
1351   else
1352     return 1;
1353 }
1354 
1355 #endif /* TEST1 */
1356 
1357 
1358 /* Now the same thing with an arbitrary encoding.
1359 
1360    We convert the input string to Unicode.
1361 
1362    The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
1363    UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
1364    \U0000FFFF.  UTF-16 and variants support only characters up to
1365    \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
1366    UCS-4 specification leaves doubts about endianness and byte order mark.
1367    glibc currently interprets it as big endian without byte order mark,
1368    but this is not backed by an RFC.  So we use UTF-8. It supports
1369    characters up to \U7FFFFFFF and is unambiguously defined.  */
1370 
1371 #if HAVE_ICONV
1372 
1373 #include <iconv.h>
1374 #include <errno.h>
1375 
1376 /* Luckily, the encoding's name is platform independent.  */
1377 #define UTF8_NAME "UTF-8"
1378 
1379 /* Return the length of a string after conversion through an iconv_t.  */
1380 static size_t
iconv_string_length(iconv_t cd,const char * s,size_t n)1381 iconv_string_length (iconv_t cd, const char *s, size_t n)
1382 {
1383 #define TMPBUFSIZE 4096
1384   size_t count = 0;
1385   char tmpbuf[TMPBUFSIZE];
1386   const char *inptr = s;
1387   size_t insize = n;
1388   while (insize > 0)
1389     {
1390       char *outptr = tmpbuf;
1391       size_t outsize = TMPBUFSIZE;
1392       size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
1393       if (res == (size_t)(-1) && errno != E2BIG)
1394         return (size_t)(-1);
1395       count += outptr - tmpbuf;
1396     }
1397   /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug.  */
1398 #if defined _LIBICONV_VERSION \
1399     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
1400   {
1401     char *outptr = tmpbuf;
1402     size_t outsize = TMPBUFSIZE;
1403     size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
1404     if (res == (size_t)(-1))
1405       return (size_t)(-1);
1406     count += outptr - tmpbuf;
1407   }
1408   /* Return to the initial state.  */
1409   iconv (cd, NULL, NULL, NULL, NULL);
1410 #endif
1411   return count;
1412 #undef TMPBUFSIZE
1413 }
1414 
1415 static void
iconv_string_keeping_offsets(iconv_t cd,const char * s,size_t n,size_t * offtable,char * t,size_t m)1416 iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
1417                               size_t *offtable, char *t, size_t m)
1418 {
1419   size_t i;
1420   const char *s_end;
1421   const char *inptr;
1422   char *outptr;
1423   size_t outsize;
1424   /* Avoid glibc-2.1 bug.  */
1425 #if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
1426   const size_t extra = 1;
1427 #else
1428   const size_t extra = 0;
1429 #endif
1430 
1431   for (i = 0; i < n; i++)
1432     offtable[i] = (size_t)(-1);
1433 
1434   s_end = s + n;
1435   inptr = s;
1436   outptr = t;
1437   outsize = m + extra;
1438   while (inptr < s_end)
1439     {
1440       const char *saved_inptr;
1441       size_t insize;
1442       size_t res;
1443 
1444       offtable[inptr - s] = outptr - t;
1445 
1446       saved_inptr = inptr;
1447       res = (size_t)(-1);
1448       for (insize = 1; inptr + insize <= s_end; insize++)
1449         {
1450           res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
1451           if (!(res == (size_t)(-1) && errno == EINVAL))
1452             break;
1453           /* We expect that no input bytes have been consumed so far.  */
1454           if (inptr != saved_inptr)
1455             abort ();
1456         }
1457       /* After we verified the convertibility and computed the translation's
1458          size m, there shouldn't be any conversion error here. */
1459       if (res == (size_t)(-1))
1460         abort ();
1461     }
1462   /* Avoid glibc-2.1 bug and Solaris 7 bug.  */
1463 #if defined _LIBICONV_VERSION \
1464     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
1465   if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
1466     abort ();
1467 #endif
1468   /* We should have produced exactly m output bytes.  */
1469   if (outsize != extra)
1470     abort ();
1471 }
1472 
1473 #endif /* HAVE_ICONV */
1474 
1475 #if C_CTYPE_ASCII
1476 
/* Tests whether the N bytes at S consist only of bytes accepted by
   c_isprint or c_isspace.  Returns 1 if so, 0 as soon as any other byte is
   found, as happens with 8-bit encodings or ISO-2022 encodings.  */
static int
is_all_ascii (const char *s, size_t n)
{
  const char *const end = s + n;
  const char *cursor;

  for (cursor = s; cursor < end; cursor++)
    {
      unsigned char c = (unsigned char) *cursor;

      if (c_isprint (c) || c_isspace (c))
        continue;
      return 0;
    }
  return 1;
}
1491 
1492 #endif /* C_CTYPE_ASCII */
1493 
1494 #if defined unused || defined TEST2
1495 
1496 void
mbs_possible_linebreaks(const char * s,size_t n,const char * encoding,char * p)1497 mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
1498                          char *p)
1499 {
1500   if (n == 0)
1501     return;
1502   if (is_utf8_encoding (encoding))
1503     u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1504   else
1505     {
1506 #if HAVE_ICONV
1507       iconv_t to_utf8;
1508       /* Avoid glibc-2.1 bug with EUC-KR.  */
1509 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1510       if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1511 	to_utf8 = (iconv_t)(-1);
1512       else
1513 # endif
1514       /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1515          GB18030.  */
1516 # if defined __sun && !defined _LIBICONV_VERSION
1517       if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1518           || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1519           || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1520           || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1521           || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1522           || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1523         to_utf8 = (iconv_t)(-1);
1524       else
1525 # endif
1526       to_utf8 = iconv_open (UTF8_NAME, encoding);
1527       if (to_utf8 != (iconv_t)(-1))
1528         {
1529           /* Determine the length of the resulting UTF-8 string.  */
1530           size_t m = iconv_string_length (to_utf8, s, n);
1531           if (m != (size_t)(-1))
1532             {
1533               /* Convert the string to UTF-8 and build a translation table
1534                  from offsets into s to offsets into the translated string.  */
1535 	      size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
1536               char *memory =
1537 		(size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1538               if (memory != NULL)
1539                 {
1540                   size_t *offtable = (size_t *) memory;
1541                   char *t = (char *) (offtable + n);
1542                   char *q = (char *) (t + m);
1543                   size_t i;
1544 
1545                   iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1546 
1547                   /* Determine the possible line breaks of the UTF-8 string.  */
1548                   u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
1549 
1550                   /* Translate the result back to the original string.  */
1551                   memset (p, UC_BREAK_PROHIBITED, n);
1552                   for (i = 0; i < n; i++)
1553                     if (offtable[i] != (size_t)(-1))
1554                       p[i] = q[offtable[i]];
1555 
1556                   free (memory);
1557                   iconv_close (to_utf8);
1558                   return;
1559                 }
1560             }
1561           iconv_close (to_utf8);
1562         }
1563 #endif
1564       /* Impossible to convert.  */
1565 #if C_CTYPE_ASCII
1566       if (is_all_ascii (s, n))
1567 	{
1568 	  /* ASCII is a subset of UTF-8.  */
1569 	  u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1570 	  return;
1571 	}
1572 #endif
1573       /* We have a non-ASCII string and cannot convert it.
1574 	 Don't produce line breaks except those already present in the
1575 	 input string.  All we assume here is that the encoding is
1576 	 minimally ASCII compatible.  */
1577       {
1578         const char *s_end = s + n;
1579         while (s < s_end)
1580           {
1581             *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
1582             s++;
1583             p++;
1584           }
1585       }
1586     }
1587 }
1588 
1589 #endif
1590 
1591 int
mbs_width_linebreaks(const char * s,size_t n,int width,int start_column,int at_end_columns,const char * o,const char * encoding,char * p)1592 mbs_width_linebreaks (const char *s, size_t n,
1593                       int width, int start_column, int at_end_columns,
1594                       const char *o, const char *encoding,
1595                       char *p)
1596 {
1597   if (n == 0)
1598     return start_column;
1599   if (is_utf8_encoding (encoding))
1600     return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1601   else
1602     {
1603 #if HAVE_ICONV
1604       iconv_t to_utf8;
1605       /* Avoid glibc-2.1 bug with EUC-KR.  */
1606 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1607       if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1608 	to_utf8 = (iconv_t)(-1);
1609       else
1610 # endif
1611       /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1612          GB18030.  */
1613 # if defined __sun && !defined _LIBICONV_VERSION
1614       if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1615           || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1616           || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1617           || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1618           || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1619           || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1620         to_utf8 = (iconv_t)(-1);
1621       else
1622 # endif
1623       to_utf8 = iconv_open (UTF8_NAME, encoding);
1624       if (to_utf8 != (iconv_t)(-1))
1625         {
1626           /* Determine the length of the resulting UTF-8 string.  */
1627           size_t m = iconv_string_length (to_utf8, s, n);
1628           if (m != (size_t)(-1))
1629             {
1630               /* Convert the string to UTF-8 and build a translation table
1631                  from offsets into s to offsets into the translated string.  */
1632 	      size_t memory_size =
1633 		xsum4 (xtimes (n, sizeof (size_t)), m, m,
1634 		       (o != NULL ? m : 0));
1635 	      char *memory =
1636 		(size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1637               if (memory != NULL)
1638                 {
1639                   size_t *offtable = (size_t *) memory;
1640                   char *t = (char *) (offtable + n);
1641                   char *q = (char *) (t + m);
1642                   char *o8 = (o != NULL ? (char *) (q + m) : NULL);
1643                   int res_column;
1644                   size_t i;
1645 
1646                   iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1647 
1648                   /* Translate the overrides to the UTF-8 string.  */
1649                   if (o != NULL)
1650                     {
1651                       memset (o8, UC_BREAK_UNDEFINED, m);
1652                       for (i = 0; i < n; i++)
1653                         if (offtable[i] != (size_t)(-1))
1654                           o8[offtable[i]] = o[i];
1655                     }
1656 
1657                   /* Determine the line breaks of the UTF-8 string.  */
1658                   res_column =
1659                     u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
1660 
1661                   /* Translate the result back to the original string.  */
1662                   memset (p, UC_BREAK_PROHIBITED, n);
1663                   for (i = 0; i < n; i++)
1664                     if (offtable[i] != (size_t)(-1))
1665                       p[i] = q[offtable[i]];
1666 
1667                   free (memory);
1668                   iconv_close (to_utf8);
1669                   return res_column;
1670                 }
1671             }
1672           iconv_close (to_utf8);
1673         }
1674 #endif
1675       /* Impossible to convert.  */
1676 #if C_CTYPE_ASCII
1677       if (is_all_ascii (s, n))
1678 	{
1679 	  /* ASCII is a subset of UTF-8.  */
1680 	  return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1681 	}
1682 #endif
1683       /* We have a non-ASCII string and cannot convert it.
1684 	 Don't produce line breaks except those already present in the
1685 	 input string.  All we assume here is that the encoding is
1686 	 minimally ASCII compatible.  */
1687       {
1688         const char *s_end = s + n;
1689         while (s < s_end)
1690           {
1691             *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
1692                   ? UC_BREAK_MANDATORY
1693                   : UC_BREAK_PROHIBITED);
1694             s++;
1695             p++;
1696             if (o != NULL)
1697               o++;
1698           }
1699         /* We cannot compute widths in this case.  */
1700         return start_column;
1701       }
1702     }
1703 }
1704 
1705 
1706 #ifdef TEST2
1707 
1708 #include <stdio.h>
1709 #include <locale.h>
1710 
/* Read everything that remains in STREAM and return it in a buffer obtained
   from realloc(), terminated with a NUL byte.  The caller releases the
   buffer with free().
   If memory runs out, "out of memory" is printed to stderr; if reading
   fails, the error is reported through perror().  In both cases the
   process exits with status 1, so the function never returns NULL.  */
char *
read_file (FILE *stream)
{
  /* Number of bytes requested from fread() per call.  */
  enum { CHUNK_SIZE = 4096 };
  char *buf = NULL;
  char *resized;
  size_t alloc = 0;	/* bytes currently allocated for buf */
  size_t size = 0;	/* bytes of buf filled so far; always <= alloc */

  while (! feof (stream))
    {
      size_t count;

      /* Ensure there is room for one more full chunk.  */
      if (alloc - size < CHUNK_SIZE)
        {
          size_t wanted = size + CHUNK_SIZE;
          size_t grown = alloc + alloc / 2;

          /* size_t arithmetic wraps around silently; treat a wrapped
             request like an allocation failure.  */
          if (wanted < size)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          /* Grow by 50%, unless that wrapped or is still too small.  */
          if (grown < alloc || grown < wanted)
            grown = wanted;

          /* Keep the old pointer until realloc() is known to succeed.  */
          resized = realloc (buf, grown);
          if (resized == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = resized;
          alloc = grown;
        }

      count = fread (buf + size, 1, CHUNK_SIZE, stream);
      size += count;

      /* A short read means either end of file or an error; only the
         latter is fatal.  */
      if (count < CHUNK_SIZE && ferror (stream))
        {
          perror ("fread");
          exit (1);
        }
    }

  /* Resize to the exact length plus the terminating NUL.  size + 1 cannot
     wrap because size <= alloc.  */
  resized = realloc (buf, size + 1);
  if (resized == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  buf = resized;
  buf[size] = '\0';
  return buf;
}
1758 
1759 int
main(int argc,char * argv[])1760 main (int argc, char * argv[])
1761 {
1762   setlocale (LC_CTYPE, "");
1763   if (argc == 1)
1764     {
1765       /* Display all the break opportunities in the input string.  */
1766       char *input = read_file (stdin);
1767       int length = strlen (input);
1768       char *breaks = malloc (length);
1769       int i;
1770 
1771       mbs_possible_linebreaks (input, length, locale_charset (), breaks);
1772 
1773       for (i = 0; i < length; i++)
1774         {
1775           switch (breaks[i])
1776             {
1777               case UC_BREAK_POSSIBLE:
1778                 putc ('|', stdout);
1779                 break;
1780               case UC_BREAK_MANDATORY:
1781                 break;
1782               case UC_BREAK_PROHIBITED:
1783                 break;
1784               default:
1785                 abort ();
1786             }
1787           putc (input[i], stdout);
1788         }
1789 
1790       free (breaks);
1791 
1792       return 0;
1793     }
1794   else if (argc == 2)
1795     {
1796       /* Insert line breaks for a given width.  */
1797       int width = atoi (argv[1]);
1798       char *input = read_file (stdin);
1799       int length = strlen (input);
1800       char *breaks = malloc (length);
1801       int i;
1802 
1803       mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
1804 
1805       for (i = 0; i < length; i++)
1806         {
1807           switch (breaks[i])
1808             {
1809               case UC_BREAK_POSSIBLE:
1810                 putc ('\n', stdout);
1811                 break;
1812               case UC_BREAK_MANDATORY:
1813                 break;
1814               case UC_BREAK_PROHIBITED:
1815                 break;
1816               default:
1817                 abort ();
1818             }
1819           putc (input[i], stdout);
1820         }
1821 
1822       free (breaks);
1823 
1824       return 0;
1825     }
1826   else
1827     return 1;
1828 }
1829 
1830 #endif /* TEST2 */
1831