1 /* linebreak.c - line breaking of Unicode strings
2 Copyright (C) 2001-2003, 2006 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
18
19 #include <config.h>
20
21 /* Specification. */
22 #include "linebreak.h"
23
24 #include <stdlib.h>
25 #include <string.h>
26 #include "c-ctype.h"
27 #include "xsize.h"
28
29 #include "utf8-ucs4.h"
30
31 #ifdef unused
32 #include "utf16-ucs4.h"
33
/* Fetch one character from the UCS-4 string S: the first unit is stored
   in *PUC and the number of units consumed, always 1, is returned.
   N is accepted but never examined.  */
static inline int
u32_mbtouc (unsigned int *puc, const unsigned int *s, size_t n)
{
  const unsigned int unit = s[0];

  puc[0] = unit;
  return 1;
}
40 #endif
41
42
43 /* Help GCC to generate good code for string comparisons with
44 immediate strings. */
45 #if defined (__GNUC__) && defined (__OPTIMIZE__)
46
/* Last stage of the unrolled comparison: compare S1 and S2 starting at
   byte 9 with strcmp.  Returns 1 if the two tails are equal, 0 otherwise.
   Both strings must therefore be at least 9 bytes long.  */
static inline int
streq9 (const char *s1, const char *s2)
{
  const char *tail1 = &s1[9];
  const char *tail2 = &s2[9];

  return !strcmp (tail1, tail2);
}
52
/* Stage 8 of the unrolled comparison.  S28 is the byte expected at index 8
   of S1 (the STREQ callers in this file pass the characters of the
   literal S2).  Returns 0 on a mismatch, 1 if the strings end here, and
   otherwise the result of streq9 on the remainder.  */
static inline int
streq8 (const char *s1, const char *s2, char s28)
{
  if (s1[8] != s28)
    return 0;
  if (s28 == 0)
    return 1;
  return streq9 (s1, s2);
}
66
/* Stage 7 of the unrolled comparison.  S27 is the byte expected at index 7
   of S1; S28 is handed on to the next stage.  Returns 0 on a mismatch,
   1 if the strings end here, and otherwise the result of streq8.  */
static inline int
streq7 (const char *s1, const char *s2, char s27, char s28)
{
  if (s1[7] != s27)
    return 0;
  if (s27 == 0)
    return 1;
  return streq8 (s1, s2, s28);
}
80
/* Stage 6 of the unrolled comparison.  S26 is the byte expected at index 6
   of S1; the remaining expected bytes are handed on.  Returns 0 on a
   mismatch, 1 if the strings end here, and otherwise the result of
   streq7.  */
static inline int
streq6 (const char *s1, const char *s2, char s26, char s27, char s28)
{
  if (s1[6] != s26)
    return 0;
  if (s26 == 0)
    return 1;
  return streq7 (s1, s2, s27, s28);
}
94
/* Stage 5 of the unrolled comparison.  S25 is the byte expected at index 5
   of S1; the remaining expected bytes are handed on.  Returns 0 on a
   mismatch, 1 if the strings end here, and otherwise the result of
   streq6.  */
static inline int
streq5 (const char *s1, const char *s2, char s25, char s26, char s27, char s28)
{
  if (s1[5] != s25)
    return 0;
  if (s25 == 0)
    return 1;
  return streq6 (s1, s2, s26, s27, s28);
}
108
/* Stage 4 of the unrolled comparison.  S24 is the byte expected at index 4
   of S1; the remaining expected bytes are handed on.  Returns 0 on a
   mismatch, 1 if the strings end here, and otherwise the result of
   streq5.  */
static inline int
streq4 (const char *s1, const char *s2, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[4] != s24)
    return 0;
  if (s24 == 0)
    return 1;
  return streq5 (s1, s2, s25, s26, s27, s28);
}
122
/* Stage 3 of the unrolled comparison.  S23 is the byte expected at index 3
   of S1; the remaining expected bytes are handed on.  Returns 0 on a
   mismatch, 1 if the strings end here, and otherwise the result of
   streq4.  */
static inline int
streq3 (const char *s1, const char *s2, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[3] != s23)
    return 0;
  if (s23 == 0)
    return 1;
  return streq4 (s1, s2, s24, s25, s26, s27, s28);
}
136
/* Stage 2 of the unrolled comparison.  S22 is the byte expected at index 2
   of S1; the remaining expected bytes are handed on.  Returns 0 on a
   mismatch, 1 if the strings end here, and otherwise the result of
   streq3.  */
static inline int
streq2 (const char *s1, const char *s2, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[2] != s22)
    return 0;
  if (s22 == 0)
    return 1;
  return streq3 (s1, s2, s23, s24, s25, s26, s27, s28);
}
150
/* Stage 1 of the unrolled comparison.  S21 is the byte expected at index 1
   of S1; the remaining expected bytes are handed on.  Returns 0 on a
   mismatch, 1 if the strings end here, and otherwise the result of
   streq2.  */
static inline int
streq1 (const char *s1, const char *s2, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[1] != s21)
    return 0;
  if (s21 == 0)
    return 1;
  return streq2 (s1, s2, s22, s23, s24, s25, s26, s27, s28);
}
164
/* Entry stage of the unrolled comparison.  S20 .. S28 are the bytes
   expected at indices 0 .. 8 of S1 (the STREQ callers in this file pass
   the characters of the literal S2, padded with 0).  Returns 0 on a
   mismatch at index 0, 1 if both strings are empty, and otherwise the
   result of streq1 on the following bytes.  */
static inline int
streq0 (const char *s1, const char *s2, char s20, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[0] != s20)
    return 0;
  if (s20 == 0)
    return 1;
  return streq1 (s1, s2, s21, s22, s23, s24, s25, s26, s27, s28);
}
178
/* STREQ (STR, LIT, C0 .. C8): test STR for equality with LIT, where
   C0 .. C8 repeat the first nine bytes of LIT (0 once LIT has ended).
   This variant routes the test through the streq0 chain.  */
#define STREQ(str,lit,c0,c1,c2,c3,c4,c5,c6,c7,c8) \
  streq0 (str, lit, c0, c1, c2, c3, c4, c5, c6, c7, c8)
181
182 #else
183
/* STREQ (STR, LIT, C0 .. C8): fallback variant.  The per-character
   arguments C0 .. C8 are ignored and the test is a plain strcmp.  */
#define STREQ(str,lit,c0,c1,c2,c3,c4,c5,c6,c7,c8) \
  (strcmp (str, lit) == 0)
186
187 #endif
188
189
/* Return 1 if ENCODING is exactly one of the legacy CJK encoding names
   tested below (the match is byte-for-byte, hence case-sensitive),
   0 otherwise.  */
static int
is_cjk_encoding (const char *encoding)
{
  /* Legacy Japanese encodings */
  if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
    return 1;

  /* Legacy Chinese encodings */
  if (STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0))
    return 1;
  if (STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0))
    return 1;
  if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
    return 1;
  if (STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
    return 1;

  /* Legacy Korean encodings */
  if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
    return 1;
  if (STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0))
    return 1;
  if (STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0))
    return 1;

  return 0;
}
208
/* Return 1 if ENCODING is exactly the string "UTF-8", 0 otherwise.  */
static int
is_utf8_encoding (const char *encoding)
{
  return STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0) ? 1 : 0;
}
216
217
218 /* Determine number of column positions required for UC.
   This is only the declaration; the definition follows the non-spacing
   tables further down in this file.  The definition returns -1 for
   control characters, 0 for non-spacing characters, 2 for double-width
   characters and 1 otherwise.  */
219 int uc_width (unsigned int uc, const char *encoding);
220
221 /*
 * Non-spacing attribute table.
 *
 * Layout, as read by uc_width: a bitmap with one bit per code point,
 * stored as sixteen blocks of 64 bytes; each block covers 512 consecutive
 * code points.  nonspacing_table_ind maps (uc >> 9) to the block number.
 * Inside a block the flag for code point uc is bit (uc & 7) of byte
 * ((uc >> 3) & 63).  For a flagged code point uc_width returns -1 when it
 * lies in 0x0001..0x009f and 0 otherwise.
 *
 * Consists of:
 * - Non-spacing characters; generated from PropList.txt or
 * "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
 * - Format control characters; generated from
 * "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
 * - Zero width characters; generated from
 * "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
 */
231 static const unsigned char nonspacing_table_data[16*64] = {
232 /* 0x0000-0x01ff */
233 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */
234 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */
235 0xff, 0xff, 0xff, 0xff, 0x00, 0x20, 0x00, 0x00, /* 0x0080-0x00bf */
236 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */
237 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */
238 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */
239 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */
240 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */
241 /* 0x0200-0x03ff */
242 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */
243 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */
244 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */
245 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */
246 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */
247 0xff, 0xff, 0xff, 0xe0, 0xff, 0xff, 0x00, 0x00, /* 0x0340-0x037f */
248 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */
249 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */
250 /* 0x0400-0x05ff */
251 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */
252 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */
253 0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */
254 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */
255 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */
256 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */
257 0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */
258 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
259 /* 0x0600-0x07ff */
260 0x0f, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
261 0x00, 0xf8, 0xff, 0x01, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
262 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
263 0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
264 0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
265 0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
266 0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
267 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */
268 /* 0x0800-0x09ff */
269 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */
270 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
271 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
272 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */
273 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */
274 0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
275 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
276 0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */
277 /* 0x0a00-0x0bff */
278 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */
279 0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */
280 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */
281 0xbe, 0x21, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */
282 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */
283 0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */
284 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */
285 0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */
286 /* 0x0c00-0x0dff */
287 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */
288 0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */
289 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0c80-0x0cbf */
290 0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */
291 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */
292 0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */
293 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */
294 0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */
295 /* 0x0e00-0x0fff */
296 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */
297 0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */
298 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */
299 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */
300 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */
301 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */
302 0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */
303 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */
304 /* 0x1000-0x11ff */
305 0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */
306 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */
307 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */
308 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */
309 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */
310 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */
311 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */
312 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */
313 /* 0x1600-0x17ff */
314 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */
315 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */
316 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */
317 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */
318 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, /* 0x1700-0x173f */
319 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, /* 0x1740-0x177f */
320 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x3f, /* 0x1780-0x17bf */
321 0x40, 0xfe, 0x0f, 0x20, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */
322 /* 0x1800-0x19ff */
323 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */
324 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */
325 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
326 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */
327 0x00, 0x00, 0x00, 0x00, 0x87, 0x0f, 0x04, 0x0e, /* 0x1900-0x193f */
328 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */
329 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */
330 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */
331 /* 0x2000-0x21ff */
332 0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */
333 0x00, 0x00, 0x00, 0x00, 0x0f, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */
334 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */
335 0x00, 0x00, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00, /* 0x20c0-0x20ff */
336 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */
337 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */
338 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */
339 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */
340 /* 0x3000-0x31ff */
341 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */
342 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */
343 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */
344 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */
345 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */
346 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
347 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */
348 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */
349 /* 0xfa00-0xfbff */
350 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */
351 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */
352 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */
353 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */
354 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */
355 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */
356 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */
357 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */
358 /* 0xfe00-0xffff */
359 0xff, 0xff, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */
360 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */
361 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */
362 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */
363 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */
364 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */
365 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
366 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, /* 0xffc0-0xffff */
367 /* 0x1d000-0x1d1ff */
368 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d000-0x1d03f */
369 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d040-0x1d07f */
370 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d080-0x1d0bf */
371 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d0c0-0x1d0ff */
372 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d100-0x1d13f */
373 0x00, 0x00, 0x00, 0x00, 0x80, 0x03, 0x00, 0xf8, /* 0x1d140-0x1d17f */
374 0xe7, 0x0f, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, /* 0x1d180-0x1d1bf */
375 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* 0x1d1c0-0x1d1ff */
376 };
/* Block index for nonspacing_table_data.  Entry (uc >> 9) is the number of
   the 64-byte block of nonspacing_table_data that covers the 512 code
   points starting at (uc >> 9) << 9, or -1 when no block is stored for
   that range (uc_width then treats none of its code points as flagged).
   The 240 entries cover 0x0000-0x1dfff; each row below spans 0x1000 code
   points.  */
377 static const signed char nonspacing_table_ind[240] = {
378 0, 1, 2, 3, 4, 5, 6, 7, /* 0x0000-0x0fff */
379 8, -1, -1, 9, 10, -1, -1, -1, /* 0x1000-0x1fff */
380 11, -1, -1, -1, -1, -1, -1, -1, /* 0x2000-0x2fff */
381 12, -1, -1, -1, -1, -1, -1, -1, /* 0x3000-0x3fff */
382 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x4000-0x4fff */
383 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x5000-0x5fff */
384 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x6000-0x6fff */
385 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x7000-0x7fff */
386 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x8000-0x8fff */
387 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x9000-0x9fff */
388 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa000-0xafff */
389 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb000-0xbfff */
390 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xc000-0xcfff */
391 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xd000-0xdfff */
392 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */
393 -1, -1, -1, -1, -1, 13, -1, 14, /* 0xf000-0xffff */
394 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10000-0x10fff */
395 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x11000-0x11fff */
396 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x12000-0x12fff */
397 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x13000-0x13fff */
398 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x14000-0x14fff */
399 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x15000-0x15fff */
400 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x16000-0x16fff */
401 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x17000-0x17fff */
402 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x18000-0x18fff */
403 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x19000-0x19fff */
404 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1a000-0x1afff */
405 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1b000-0x1bfff */
406 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1c000-0x1cfff */
407 15, -1, -1, -1, -1, -1, -1, -1 /* 0x1d000-0x1dfff */
408 };
409
410 /* Determine number of column positions required for UC. */
411 int
uc_width(unsigned int uc,const char * encoding)412 uc_width (unsigned int uc, const char *encoding)
413 {
414 /* Test for non-spacing or control character. */
415 if ((uc >> 9) < 240)
416 {
417 int ind = nonspacing_table_ind[uc >> 9];
418 if (ind >= 0)
419 if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1)
420 {
421 if (uc > 0 && uc < 0xa0)
422 return -1;
423 else
424 return 0;
425 }
426 }
427 else if ((uc >> 9) == (0xe0000 >> 9))
428 {
429 if (uc < 0xe0100
430 ? (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001)
431 : (uc <= 0xe01ef))
432 return 0;
433 }
434 /* Test for double-width character.
435 * Generated from "grep '^....;[WF]' EastAsianWidth.txt"
436 * and "grep '^....;[^WF]' EastAsianWidth.txt"
437 */
438 if (uc >= 0x1100
439 && ((uc < 0x1160) /* Hangul Jamo */
440 || (uc >= 0x2e80 && uc < 0x4dc0 /* CJK */
441 && !(uc == 0x303f))
442 || (uc >= 0x4e00 && uc < 0xa4d0) /* CJK ... Yi */
443 || (uc >= 0xac00 && uc < 0xd7a4) /* Hangul Syllables */
444 || (uc >= 0xf900 && uc < 0xfb00) /* CJK Compatibility Ideographs */
445 || (uc >= 0xfe30 && uc < 0xfe70) /* CJK Compatibility Forms */
446 || (uc >= 0xff00 && uc < 0xff61) /* Fullwidth Forms */
447 || (uc >= 0xffe0 && uc < 0xffe7)
448 || (uc >= 0x20000 && uc <= 0x2fffd) /* CJK, CJK Compatibility Ideographs */
449 || (uc >= 0x30000 && uc <= 0x3fffd)
450 ) )
451 return 2;
452 /* In ancient CJK encodings, Cyrillic and most other characters are
453 double-width as well. */
454 if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
455 && is_cjk_encoding (encoding))
456 return 2;
457 return 1;
458 }
459
460
461 #ifdef unused
462
/* Determine number of column positions required for first N units
   (or fewer if S ends before this) in S.
   Characters are decoded with u8_mbtouc; a zero character ends the scan,
   and characters for which uc_width returns a negative value (control
   characters) contribute nothing.  */

int
u8_width (const unsigned char *s, size_t n, const char *encoding)
{
  const unsigned char *const limit = s + n;
  const unsigned char *cursor;
  int total = 0;

  for (cursor = s; cursor < limit; )
    {
      unsigned int uc;
      int cols;

      cursor += u8_mbtouc (&uc, cursor, limit - cursor);

      if (uc == 0)
        break;			/* end of string reached */

      cols = uc_width (uc, encoding);
      if (cols < 0)
        continue;		/* ignore control characters in the string */
      total += cols;
    }

  return total;
}
489
/* Number of column positions required for the first N units of the
   UTF-16 string S, or fewer if a zero character is met first.
   Characters are decoded with u16_mbtouc; those for which uc_width
   returns a negative value (control characters) contribute nothing.  */
int
u16_width (const unsigned short *s, size_t n, const char *encoding)
{
  const unsigned short *const limit = s + n;
  const unsigned short *cursor;
  int total = 0;

  for (cursor = s; cursor < limit; )
    {
      unsigned int uc;
      int cols;

      cursor += u16_mbtouc (&uc, cursor, limit - cursor);

      if (uc == 0)
        break;			/* end of string reached */

      cols = uc_width (uc, encoding);
      if (cols < 0)
        continue;		/* ignore control characters in the string */
      total += cols;
    }

  return total;
}
513
/* Number of column positions required for the first N units of the
   UCS-4 string S, or fewer if a zero unit is met first.  Units for which
   uc_width returns a negative value (control characters) contribute
   nothing.  */
int
u32_width (const unsigned int *s, size_t n, const char *encoding)
{
  const unsigned int *const limit = s + n;
  const unsigned int *cursor;
  int total = 0;

  for (cursor = s; cursor < limit; cursor++)
    {
      const unsigned int uc = *cursor;
      int cols;

      if (uc == 0)
        break;			/* end of string reached */

      cols = uc_width (uc, encoding);
      if (cols < 0)
        continue;		/* ignore control characters in the string */
      total += cols;
    }

  return total;
}
535
536 #endif
537
538
539 /* Determine the line break points in S, and store the result at p[0..n-1]. */
540 /* We don't support line breaking of complex-context dependent characters
541 (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
542
543 /* Line breaking classification. */
544
/* Line breaking classes, listed by numeric value.
   LBP_BK (0) is handled separately as a mandatory break.
   Values 1 .. 19 are used, minus one, as row and column indices of
   lbrk_table.  Values >= 20 are resolved at run time.
   Not represented: CR and LF (not used here because they are DOSisms) and
   SG (surrogates are not characters).  */
enum
{
  LBP_BK = 0,	/* mandatory break */

  /* Classes that index lbrk_table.  */
  LBP_ZW = 1,	/* zero width space */
  LBP_IN = 2,	/* inseparable */
  LBP_GL = 3,	/* non-breaking (glue) */
  LBP_BA = 4,	/* break opportunity after */
  LBP_BB = 5,	/* break opportunity before */
  LBP_B2 = 6,	/* break opportunity before and after */
  LBP_HY = 7,	/* hyphen */
  LBP_NS = 8,	/* non starter */
  LBP_OP = 9,	/* opening punctuation */
  LBP_CL = 10,	/* closing punctuation */
  LBP_QU = 11,	/* ambiguous quotation */
  LBP_EX = 12,	/* exclamation/interrogation */
  LBP_ID = 13,	/* ideographic */
  LBP_NU = 14,	/* numeric */
  LBP_IS = 15,	/* infix separator (numeric) */
  LBP_SY = 16,	/* symbols allowing breaks */
  LBP_AL = 17,	/* ordinary alphabetic and symbol characters */
  LBP_PR = 18,	/* prefix (numeric) */
  LBP_PO = 19,	/* postfix (numeric) */

  /* Classes resolved at run time.  */
  LBP_CM = 20,	/* attached characters and combining marks */
  LBP_SP = 21,	/* space */
  LBP_CB = 22,	/* contingent break opportunity */
  LBP_SA = 23,	/* complex context (South East Asian) */
  LBP_AI = 24,	/* ambiguous (alphabetic or ideograph) */
  LBP_XX = 25	/* unknown */
};
578
579 #include "lbrkprop.h"
580
581 static inline unsigned char
lbrkprop_lookup(unsigned int uc)582 lbrkprop_lookup (unsigned int uc)
583 {
584 unsigned int index1 = uc >> lbrkprop_header_0;
585 if (index1 < lbrkprop_header_1)
586 {
587 int lookup1 = lbrkprop.level1[index1];
588 if (lookup1 >= 0)
589 {
590 unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
591 int lookup2 = lbrkprop.level2[lookup1 + index2];
592 if (lookup2 >= 0)
593 {
594 unsigned int index3 = uc & lbrkprop_header_4;
595 return lbrkprop.level3[lookup2 + index3];
596 }
597 }
598 }
599 return LBP_XX;
600 }
601
602 /* Table indexed by two line breaking classifications.
   The *_possible_linebreaks functions read it as
   lbrk_table[last_prop-1][prop-1]: the row is the class of the preceding
   non-space character ("before"), the column is the class of the
   character being examined ("after"); both must lie in 1 .. 19.
   An entry D yields UC_BREAK_POSSIBLE, P yields UC_BREAK_PROHIBITED, and
   I yields UC_BREAK_POSSIBLE only if a space was seen between the two
   characters.  */
603 #define D 1 /* direct break opportunity, empty in table 7.3 of UTR #14 */
604 #define I 2 /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
605 #define P 3 /* prohibited break, '^' in table 7.3 of UTR #14 */
606 static const unsigned char lbrk_table[19][19] = {
607 /* after */
608 /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
609 /* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
610 /* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
611 /* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
612 /* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
613 /* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
614 /* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
615 /* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
616 /* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
617 /* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
618 /* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
619 /* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
620 /* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
621 /* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
622 /* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
623 /* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
624 /* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
625 /* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
626 /* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
627 /* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
628 /* "" */
629 /* before */
630 };
631 /* Note: The (B2,B2) entry should probably be D instead of P. */
632 /* Note: The (PR,ID) entry should probably be D instead of I. */
633
634 void
u8_possible_linebreaks(const unsigned char * s,size_t n,const char * encoding,char * p)635 u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
636 {
637 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
638 const unsigned char *s_end = s + n;
639 int last_prop = LBP_BK; /* line break property of last non-space character */
640 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
641 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
642
643 /* Don't break inside multibyte characters. */
644 memset (p, UC_BREAK_PROHIBITED, n);
645
646 while (s < s_end)
647 {
648 unsigned int uc;
649 int count = u8_mbtouc (&uc, s, s_end - s);
650 int prop = lbrkprop_lookup (uc);
651
652 if (prop == LBP_BK)
653 {
654 /* Mandatory break. */
655 *p = UC_BREAK_MANDATORY;
656 last_prop = LBP_BK;
657 seen_space = NULL;
658 seen_space2 = NULL;
659 }
660 else
661 {
662 char *q;
663
664 /* Resolve property values whose behaviour is not fixed. */
665 switch (prop)
666 {
667 case LBP_AI:
668 /* Resolve ambiguous. */
669 prop = LBP_AI_REPLACEMENT;
670 break;
671 case LBP_CB:
672 /* This is arbitrary. */
673 prop = LBP_ID;
674 break;
675 case LBP_SA:
676 /* We don't handle complex scripts yet.
677 Treat LBP_SA like LBP_XX. */
678 case LBP_XX:
679 /* This is arbitrary. */
680 prop = LBP_AL;
681 break;
682 }
683
684 /* Deal with combining characters. */
685 q = p;
686 if (prop == LBP_CM)
687 {
688 /* Don't break just before a combining character. */
689 *p = UC_BREAK_PROHIBITED;
690 /* A combining character turns a preceding space into LBP_AL. */
691 if (seen_space != NULL)
692 {
693 q = seen_space;
694 seen_space = seen_space2;
695 prop = LBP_AL;
696 goto lookup_via_table;
697 }
698 }
699 else if (prop == LBP_SP)
700 {
701 /* Don't break just before a space. */
702 *p = UC_BREAK_PROHIBITED;
703 seen_space2 = seen_space;
704 seen_space = p;
705 }
706 else
707 {
708 lookup_via_table:
709 /* prop must be usable as an index for table 7.3 of UTR #14. */
710 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
711 abort ();
712
713 if (last_prop == LBP_BK)
714 {
715 /* Don't break at the beginning of a line. */
716 *q = UC_BREAK_PROHIBITED;
717 }
718 else
719 {
720 switch (lbrk_table [last_prop-1] [prop-1])
721 {
722 case D:
723 *q = UC_BREAK_POSSIBLE;
724 break;
725 case I:
726 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
727 break;
728 case P:
729 *q = UC_BREAK_PROHIBITED;
730 break;
731 default:
732 abort ();
733 }
734 }
735 last_prop = prop;
736 seen_space = NULL;
737 seen_space2 = NULL;
738 }
739 }
740
741 s += count;
742 p += count;
743 }
744 }
745
746 #ifdef unused
747
748 void
u16_possible_linebreaks(const unsigned short * s,size_t n,const char * encoding,char * p)749 u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
750 {
751 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
752 const unsigned short *s_end = s + n;
753 int last_prop = LBP_BK; /* line break property of last non-space character */
754 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
755 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
756
757 /* Don't break inside multibyte characters. */
758 memset (p, UC_BREAK_PROHIBITED, n);
759
760 while (s < s_end)
761 {
762 unsigned int uc;
763 int count = u16_mbtouc (&uc, s, s_end - s);
764 int prop = lbrkprop_lookup (uc);
765
766 if (prop == LBP_BK)
767 {
768 /* Mandatory break. */
769 *p = UC_BREAK_MANDATORY;
770 last_prop = LBP_BK;
771 seen_space = NULL;
772 seen_space2 = NULL;
773 }
774 else
775 {
776 char *q;
777
778 /* Resolve property values whose behaviour is not fixed. */
779 switch (prop)
780 {
781 case LBP_AI:
782 /* Resolve ambiguous. */
783 prop = LBP_AI_REPLACEMENT;
784 break;
785 case LBP_CB:
786 /* This is arbitrary. */
787 prop = LBP_ID;
788 break;
789 case LBP_SA:
790 /* We don't handle complex scripts yet.
791 Treat LBP_SA like LBP_XX. */
792 case LBP_XX:
793 /* This is arbitrary. */
794 prop = LBP_AL;
795 break;
796 }
797
798 /* Deal with combining characters. */
799 q = p;
800 if (prop == LBP_CM)
801 {
802 /* Don't break just before a combining character. */
803 *p = UC_BREAK_PROHIBITED;
804 /* A combining character turns a preceding space into LBP_AL. */
805 if (seen_space != NULL)
806 {
807 q = seen_space;
808 seen_space = seen_space2;
809 prop = LBP_AL;
810 goto lookup_via_table;
811 }
812 }
813 else if (prop == LBP_SP)
814 {
815 /* Don't break just before a space. */
816 *p = UC_BREAK_PROHIBITED;
817 seen_space2 = seen_space;
818 seen_space = p;
819 }
820 else
821 {
822 lookup_via_table:
823 /* prop must be usable as an index for table 7.3 of UTR #14. */
824 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
825 abort ();
826
827 if (last_prop == LBP_BK)
828 {
829 /* Don't break at the beginning of a line. */
830 *q = UC_BREAK_PROHIBITED;
831 }
832 else
833 {
834 switch (lbrk_table [last_prop-1] [prop-1])
835 {
836 case D:
837 *q = UC_BREAK_POSSIBLE;
838 break;
839 case I:
840 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
841 break;
842 case P:
843 *q = UC_BREAK_PROHIBITED;
844 break;
845 default:
846 abort ();
847 }
848 }
849 last_prop = prop;
850 seen_space = NULL;
851 seen_space2 = NULL;
852 }
853 }
854
855 s += count;
856 p += count;
857 }
858 }
859
860 void
u32_possible_linebreaks(const unsigned int * s,size_t n,const char * encoding,char * p)861 u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
862 {
863 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
864 const unsigned int *s_end = s + n;
865 int last_prop = LBP_BK; /* line break property of last non-space character */
866 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
867 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
868
869 while (s < s_end)
870 {
871 unsigned int uc = *s;
872 int prop = lbrkprop_lookup (uc);
873
874 if (prop == LBP_BK)
875 {
876 /* Mandatory break. */
877 *p = UC_BREAK_MANDATORY;
878 last_prop = LBP_BK;
879 seen_space = NULL;
880 seen_space2 = NULL;
881 }
882 else
883 {
884 char *q;
885
886 /* Resolve property values whose behaviour is not fixed. */
887 switch (prop)
888 {
889 case LBP_AI:
890 /* Resolve ambiguous. */
891 prop = LBP_AI_REPLACEMENT;
892 break;
893 case LBP_CB:
894 /* This is arbitrary. */
895 prop = LBP_ID;
896 break;
897 case LBP_SA:
898 /* We don't handle complex scripts yet.
899 Treat LBP_SA like LBP_XX. */
900 case LBP_XX:
901 /* This is arbitrary. */
902 prop = LBP_AL;
903 break;
904 }
905
906 /* Deal with combining characters. */
907 q = p;
908 if (prop == LBP_CM)
909 {
910 /* Don't break just before a combining character. */
911 *p = UC_BREAK_PROHIBITED;
912 /* A combining character turns a preceding space into LBP_AL. */
913 if (seen_space != NULL)
914 {
915 q = seen_space;
916 seen_space = seen_space2;
917 prop = LBP_AL;
918 goto lookup_via_table;
919 }
920 }
921 else if (prop == LBP_SP)
922 {
923 /* Don't break just before a space. */
924 *p = UC_BREAK_PROHIBITED;
925 seen_space2 = seen_space;
926 seen_space = p;
927 }
928 else
929 {
930 lookup_via_table:
931 /* prop must be usable as an index for table 7.3 of UTR #14. */
932 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
933 abort ();
934
935 if (last_prop == LBP_BK)
936 {
937 /* Don't break at the beginning of a line. */
938 *q = UC_BREAK_PROHIBITED;
939 }
940 else
941 {
942 switch (lbrk_table [last_prop-1] [prop-1])
943 {
944 case D:
945 *q = UC_BREAK_POSSIBLE;
946 break;
947 case I:
948 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
949 break;
950 case P:
951 *q = UC_BREAK_PROHIBITED;
952 break;
953 default:
954 abort ();
955 }
956 }
957 last_prop = prop;
958 seen_space = NULL;
959 seen_space2 = NULL;
960 }
961 }
962
963 s++;
964 p++;
965 }
966 }
967
968 #endif
969
970
971 /* Choose the best line breaks, assuming the uc_width function.
972 Return the column after the end of the string. */
973
974 int
u8_width_linebreaks(const unsigned char * s,size_t n,int width,int start_column,int at_end_columns,const char * o,const char * encoding,char * p)975 u8_width_linebreaks (const unsigned char *s, size_t n,
976 int width, int start_column, int at_end_columns,
977 const char *o, const char *encoding,
978 char *p)
979 {
980 const unsigned char *s_end;
981 char *last_p;
982 int last_column;
983 int piece_width;
984
985 u8_possible_linebreaks (s, n, encoding, p);
986
987 s_end = s + n;
988 last_p = NULL;
989 last_column = start_column;
990 piece_width = 0;
991 while (s < s_end)
992 {
993 unsigned int uc;
994 int count = u8_mbtouc (&uc, s, s_end - s);
995
996 /* Respect the override. */
997 if (o != NULL && *o != UC_BREAK_UNDEFINED)
998 *p = *o;
999
1000 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1001 {
1002 /* An atomic piece of text ends here. */
1003 if (last_p != NULL && last_column + piece_width > width)
1004 {
1005 /* Insert a line break. */
1006 *last_p = UC_BREAK_POSSIBLE;
1007 last_column = 0;
1008 }
1009 }
1010
1011 if (*p == UC_BREAK_MANDATORY)
1012 {
1013 /* uc is a line break character. */
1014 /* Start a new piece at column 0. */
1015 last_p = NULL;
1016 last_column = 0;
1017 piece_width = 0;
1018 }
1019 else
1020 {
1021 /* uc is not a line break character. */
1022 int w;
1023
1024 if (*p == UC_BREAK_POSSIBLE)
1025 {
1026 /* Start a new piece. */
1027 last_p = p;
1028 last_column += piece_width;
1029 piece_width = 0;
1030 /* No line break for the moment, may be turned into
1031 UC_BREAK_POSSIBLE later, via last_p. */
1032 }
1033
1034 *p = UC_BREAK_PROHIBITED;
1035
1036 w = uc_width (uc, encoding);
1037 if (w >= 0) /* ignore control characters in the string */
1038 piece_width += w;
1039 }
1040
1041 s += count;
1042 p += count;
1043 if (o != NULL)
1044 o += count;
1045 }
1046
1047 /* The last atomic piece of text ends here. */
1048 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1049 {
1050 /* Insert a line break. */
1051 *last_p = UC_BREAK_POSSIBLE;
1052 last_column = 0;
1053 }
1054
1055 return last_column + piece_width;
1056 }
1057
1058 #ifdef unused
1059
1060 int
u16_width_linebreaks(const unsigned short * s,size_t n,int width,int start_column,int at_end_columns,const char * o,const char * encoding,char * p)1061 u16_width_linebreaks (const unsigned short *s, size_t n,
1062 int width, int start_column, int at_end_columns,
1063 const char *o, const char *encoding,
1064 char *p)
1065 {
1066 const unsigned short *s_end;
1067 char *last_p;
1068 int last_column;
1069 int piece_width;
1070
1071 u16_possible_linebreaks (s, n, encoding, p);
1072
1073 s_end = s + n;
1074 last_p = NULL;
1075 last_column = start_column;
1076 piece_width = 0;
1077 while (s < s_end)
1078 {
1079 unsigned int uc;
1080 int count = u16_mbtouc (&uc, s, s_end - s);
1081
1082 /* Respect the override. */
1083 if (o != NULL && *o != UC_BREAK_UNDEFINED)
1084 *p = *o;
1085
1086 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1087 {
1088 /* An atomic piece of text ends here. */
1089 if (last_p != NULL && last_column + piece_width > width)
1090 {
1091 /* Insert a line break. */
1092 *last_p = UC_BREAK_POSSIBLE;
1093 last_column = 0;
1094 }
1095 }
1096
1097 if (*p == UC_BREAK_MANDATORY)
1098 {
1099 /* uc is a line break character. */
1100 /* Start a new piece at column 0. */
1101 last_p = NULL;
1102 last_column = 0;
1103 piece_width = 0;
1104 }
1105 else
1106 {
1107 /* uc is not a line break character. */
1108 int w;
1109
1110 if (*p == UC_BREAK_POSSIBLE)
1111 {
1112 /* Start a new piece. */
1113 last_p = p;
1114 last_column += piece_width;
1115 piece_width = 0;
1116 /* No line break for the moment, may be turned into
1117 UC_BREAK_POSSIBLE later, via last_p. */
1118 }
1119
1120 *p = UC_BREAK_PROHIBITED;
1121
1122 w = uc_width (uc, encoding);
1123 if (w >= 0) /* ignore control characters in the string */
1124 piece_width += w;
1125 }
1126
1127 s += count;
1128 p += count;
1129 if (o != NULL)
1130 o += count;
1131 }
1132
1133 /* The last atomic piece of text ends here. */
1134 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1135 {
1136 /* Insert a line break. */
1137 *last_p = UC_BREAK_POSSIBLE;
1138 last_column = 0;
1139 }
1140
1141 return last_column + piece_width;
1142 }
1143
1144 int
u32_width_linebreaks(const unsigned int * s,size_t n,int width,int start_column,int at_end_columns,const char * o,const char * encoding,char * p)1145 u32_width_linebreaks (const unsigned int *s, size_t n,
1146 int width, int start_column, int at_end_columns,
1147 const char *o, const char *encoding,
1148 char *p)
1149 {
1150 const unsigned int *s_end;
1151 char *last_p;
1152 int last_column;
1153 int piece_width;
1154
1155 u32_possible_linebreaks (s, n, encoding, p);
1156
1157 s_end = s + n;
1158 last_p = NULL;
1159 last_column = start_column;
1160 piece_width = 0;
1161 while (s < s_end)
1162 {
1163 unsigned int uc = *s;
1164
1165 /* Respect the override. */
1166 if (o != NULL && *o != UC_BREAK_UNDEFINED)
1167 *p = *o;
1168
1169 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1170 {
1171 /* An atomic piece of text ends here. */
1172 if (last_p != NULL && last_column + piece_width > width)
1173 {
1174 /* Insert a line break. */
1175 *last_p = UC_BREAK_POSSIBLE;
1176 last_column = 0;
1177 }
1178 }
1179
1180 if (*p == UC_BREAK_MANDATORY)
1181 {
1182 /* uc is a line break character. */
1183 /* Start a new piece at column 0. */
1184 last_p = NULL;
1185 last_column = 0;
1186 piece_width = 0;
1187 }
1188 else
1189 {
1190 /* uc is not a line break character. */
1191 int w;
1192
1193 if (*p == UC_BREAK_POSSIBLE)
1194 {
1195 /* Start a new piece. */
1196 last_p = p;
1197 last_column += piece_width;
1198 piece_width = 0;
1199 /* No line break for the moment, may be turned into
1200 UC_BREAK_POSSIBLE later, via last_p. */
1201 }
1202
1203 *p = UC_BREAK_PROHIBITED;
1204
1205 w = uc_width (uc, encoding);
1206 if (w >= 0) /* ignore control characters in the string */
1207 piece_width += w;
1208 }
1209
1210 s++;
1211 p++;
1212 if (o != NULL)
1213 o++;
1214 }
1215
1216 /* The last atomic piece of text ends here. */
1217 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1218 {
1219 /* Insert a line break. */
1220 *last_p = UC_BREAK_POSSIBLE;
1221 last_column = 0;
1222 }
1223
1224 return last_column + piece_width;
1225 }
1226
1227 #endif
1228
1229
1230 #ifdef TEST1
1231
1232 #include <stdio.h>
1233
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.
   The result is allocated with realloc; the caller owns it and releases it
   with free.  On a read error or when memory is exhausted, a message is
   written to stderr and the program exits with status 1, so the function
   never returns NULL.  */
char *
read_file (FILE *stream)
{
#define BUFSIZE 4096
  char *buf = NULL;
  /* Sizes are kept in size_t: fread returns size_t, and an int would
     overflow on inputs that approach INT_MAX bytes.  */
  size_t alloc = 0;
  size_t size = 0;

  while (! feof (stream))
    {
      size_t count;

      /* Make sure there is room for one more chunk of BUFSIZE bytes.  */
      if (size + BUFSIZE > alloc)
        {
          char *grown;

          /* Grow geometrically, but by at least one chunk.  */
          alloc = alloc + alloc / 2;
          if (alloc < size + BUFSIZE)
            alloc = size + BUFSIZE;
          grown = realloc (buf, alloc);
          if (grown == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = grown;
        }

      count = fread (buf + size, 1, BUFSIZE, stream);
      if (count == 0)
        {
          /* Nothing read: either end of file (the loop condition ends the
             loop) or an error.  */
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
        }
      else
        size += count;
    }

  /* Resize to the exact size, with room for the terminating NUL.  */
  {
    char *exact = realloc (buf, size + 1);
    if (exact == NULL)
      {
        fprintf (stderr, "out of memory\n");
        exit (1);
      }
    buf = exact;
  }
  buf[size] = '\0';
  return buf;
#undef BUFSIZE
}
1281
1282 int
main(int argc,char * argv[])1283 main (int argc, char * argv[])
1284 {
1285 if (argc == 1)
1286 {
1287 /* Display all the break opportunities in the input string. */
1288 char *input = read_file (stdin);
1289 int length = strlen (input);
1290 char *breaks = malloc (length);
1291 int i;
1292
1293 u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
1294
1295 for (i = 0; i < length; i++)
1296 {
1297 switch (breaks[i])
1298 {
1299 case UC_BREAK_POSSIBLE:
1300 /* U+2027 in UTF-8 encoding */
1301 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
1302 break;
1303 case UC_BREAK_MANDATORY:
1304 /* U+21B2 (or U+21B5) in UTF-8 encoding */
1305 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
1306 break;
1307 case UC_BREAK_PROHIBITED:
1308 break;
1309 default:
1310 abort ();
1311 }
1312 putc (input[i], stdout);
1313 }
1314
1315 free (breaks);
1316
1317 return 0;
1318 }
1319 else if (argc == 2)
1320 {
1321 /* Insert line breaks for a given width. */
1322 int width = atoi (argv[1]);
1323 char *input = read_file (stdin);
1324 int length = strlen (input);
1325 char *breaks = malloc (length);
1326 int i;
1327
1328 u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
1329
1330 for (i = 0; i < length; i++)
1331 {
1332 switch (breaks[i])
1333 {
1334 case UC_BREAK_POSSIBLE:
1335 putc ('\n', stdout);
1336 break;
1337 case UC_BREAK_MANDATORY:
1338 break;
1339 case UC_BREAK_PROHIBITED:
1340 break;
1341 default:
1342 abort ();
1343 }
1344 putc (input[i], stdout);
1345 }
1346
1347 free (breaks);
1348
1349 return 0;
1350 }
1351 else
1352 return 1;
1353 }
1354
1355 #endif /* TEST1 */
1356
1357
1358 /* Now the same thing with an arbitrary encoding.
1359
1360 We convert the input string to Unicode.
1361
1362 The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
1363 UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to
1364 \U0000FFFF. UTF-16 and variants support only characters up to
1365 \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1.
1366 UCS-4 specification leaves doubts about endianness and byte order mark.
1367 glibc currently interprets it as big endian without byte order mark,
1368 but this is not backed by an RFC. So we use UTF-8. It supports
1369 characters up to \U7FFFFFFF and is unambiguously defined. */
1370
1371 #if HAVE_ICONV
1372
1373 #include <iconv.h>
1374 #include <errno.h>
1375
1376 /* Luckily, the encoding's name is platform independent. */
1377 #define UTF8_NAME "UTF-8"
1378
1379 /* Return the length of a string after conversion through an iconv_t. */
1380 static size_t
iconv_string_length(iconv_t cd,const char * s,size_t n)1381 iconv_string_length (iconv_t cd, const char *s, size_t n)
1382 {
1383 #define TMPBUFSIZE 4096
1384 size_t count = 0;
1385 char tmpbuf[TMPBUFSIZE];
1386 const char *inptr = s;
1387 size_t insize = n;
1388 while (insize > 0)
1389 {
1390 char *outptr = tmpbuf;
1391 size_t outsize = TMPBUFSIZE;
1392 size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
1393 if (res == (size_t)(-1) && errno != E2BIG)
1394 return (size_t)(-1);
1395 count += outptr - tmpbuf;
1396 }
1397 /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug. */
1398 #if defined _LIBICONV_VERSION \
1399 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
1400 {
1401 char *outptr = tmpbuf;
1402 size_t outsize = TMPBUFSIZE;
1403 size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
1404 if (res == (size_t)(-1))
1405 return (size_t)(-1);
1406 count += outptr - tmpbuf;
1407 }
1408 /* Return to the initial state. */
1409 iconv (cd, NULL, NULL, NULL, NULL);
1410 #endif
1411 return count;
1412 #undef TMPBUFSIZE
1413 }
1414
1415 static void
iconv_string_keeping_offsets(iconv_t cd,const char * s,size_t n,size_t * offtable,char * t,size_t m)1416 iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
1417 size_t *offtable, char *t, size_t m)
1418 {
1419 size_t i;
1420 const char *s_end;
1421 const char *inptr;
1422 char *outptr;
1423 size_t outsize;
1424 /* Avoid glibc-2.1 bug. */
1425 #if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
1426 const size_t extra = 1;
1427 #else
1428 const size_t extra = 0;
1429 #endif
1430
1431 for (i = 0; i < n; i++)
1432 offtable[i] = (size_t)(-1);
1433
1434 s_end = s + n;
1435 inptr = s;
1436 outptr = t;
1437 outsize = m + extra;
1438 while (inptr < s_end)
1439 {
1440 const char *saved_inptr;
1441 size_t insize;
1442 size_t res;
1443
1444 offtable[inptr - s] = outptr - t;
1445
1446 saved_inptr = inptr;
1447 res = (size_t)(-1);
1448 for (insize = 1; inptr + insize <= s_end; insize++)
1449 {
1450 res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
1451 if (!(res == (size_t)(-1) && errno == EINVAL))
1452 break;
1453 /* We expect that no input bytes have been consumed so far. */
1454 if (inptr != saved_inptr)
1455 abort ();
1456 }
1457 /* After we verified the convertibility and computed the translation's
1458 size m, there shouldn't be any conversion error here. */
1459 if (res == (size_t)(-1))
1460 abort ();
1461 }
1462 /* Avoid glibc-2.1 bug and Solaris 7 bug. */
1463 #if defined _LIBICONV_VERSION \
1464 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
1465 if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
1466 abort ();
1467 #endif
1468 /* We should have produced exactly m output bytes. */
1469 if (outsize != extra)
1470 abort ();
1471 }
1472
1473 #endif /* HAVE_ICONV */
1474
1475 #if C_CTYPE_ASCII
1476
/* Tests whether a string is entirely ASCII.  Returns 1 if yes.
   Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding.
   More precisely, the result is 1 exactly when each of the N bytes at S
   satisfies c_isprint or c_isspace.  */
static int
is_all_ascii (const char *s, size_t n)
{
  size_t i;

  for (i = 0; i < n; i++)
    {
      unsigned char c = (unsigned char) s[i];

      if (!c_isprint (c) && !c_isspace (c))
        return 0;
    }
  return 1;
}
1491
1492 #endif /* C_CTYPE_ASCII */
1493
1494 #if defined unused || defined TEST2
1495
1496 void
mbs_possible_linebreaks(const char * s,size_t n,const char * encoding,char * p)1497 mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
1498 char *p)
1499 {
1500 if (n == 0)
1501 return;
1502 if (is_utf8_encoding (encoding))
1503 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1504 else
1505 {
1506 #if HAVE_ICONV
1507 iconv_t to_utf8;
1508 /* Avoid glibc-2.1 bug with EUC-KR. */
1509 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1510 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1511 to_utf8 = (iconv_t)(-1);
1512 else
1513 # endif
1514 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1515 GB18030. */
1516 # if defined __sun && !defined _LIBICONV_VERSION
1517 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1518 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1519 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1520 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1521 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1522 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1523 to_utf8 = (iconv_t)(-1);
1524 else
1525 # endif
1526 to_utf8 = iconv_open (UTF8_NAME, encoding);
1527 if (to_utf8 != (iconv_t)(-1))
1528 {
1529 /* Determine the length of the resulting UTF-8 string. */
1530 size_t m = iconv_string_length (to_utf8, s, n);
1531 if (m != (size_t)(-1))
1532 {
1533 /* Convert the string to UTF-8 and build a translation table
1534 from offsets into s to offsets into the translated string. */
1535 size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
1536 char *memory =
1537 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1538 if (memory != NULL)
1539 {
1540 size_t *offtable = (size_t *) memory;
1541 char *t = (char *) (offtable + n);
1542 char *q = (char *) (t + m);
1543 size_t i;
1544
1545 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1546
1547 /* Determine the possible line breaks of the UTF-8 string. */
1548 u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
1549
1550 /* Translate the result back to the original string. */
1551 memset (p, UC_BREAK_PROHIBITED, n);
1552 for (i = 0; i < n; i++)
1553 if (offtable[i] != (size_t)(-1))
1554 p[i] = q[offtable[i]];
1555
1556 free (memory);
1557 iconv_close (to_utf8);
1558 return;
1559 }
1560 }
1561 iconv_close (to_utf8);
1562 }
1563 #endif
1564 /* Impossible to convert. */
1565 #if C_CTYPE_ASCII
1566 if (is_all_ascii (s, n))
1567 {
1568 /* ASCII is a subset of UTF-8. */
1569 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1570 return;
1571 }
1572 #endif
1573 /* We have a non-ASCII string and cannot convert it.
1574 Don't produce line breaks except those already present in the
1575 input string. All we assume here is that the encoding is
1576 minimally ASCII compatible. */
1577 {
1578 const char *s_end = s + n;
1579 while (s < s_end)
1580 {
1581 *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
1582 s++;
1583 p++;
1584 }
1585 }
1586 }
1587 }
1588
1589 #endif
1590
1591 int
mbs_width_linebreaks(const char * s,size_t n,int width,int start_column,int at_end_columns,const char * o,const char * encoding,char * p)1592 mbs_width_linebreaks (const char *s, size_t n,
1593 int width, int start_column, int at_end_columns,
1594 const char *o, const char *encoding,
1595 char *p)
1596 {
1597 if (n == 0)
1598 return start_column;
1599 if (is_utf8_encoding (encoding))
1600 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1601 else
1602 {
1603 #if HAVE_ICONV
1604 iconv_t to_utf8;
1605 /* Avoid glibc-2.1 bug with EUC-KR. */
1606 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1607 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1608 to_utf8 = (iconv_t)(-1);
1609 else
1610 # endif
1611 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1612 GB18030. */
1613 # if defined __sun && !defined _LIBICONV_VERSION
1614 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1615 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1616 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1617 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1618 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1619 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1620 to_utf8 = (iconv_t)(-1);
1621 else
1622 # endif
1623 to_utf8 = iconv_open (UTF8_NAME, encoding);
1624 if (to_utf8 != (iconv_t)(-1))
1625 {
1626 /* Determine the length of the resulting UTF-8 string. */
1627 size_t m = iconv_string_length (to_utf8, s, n);
1628 if (m != (size_t)(-1))
1629 {
1630 /* Convert the string to UTF-8 and build a translation table
1631 from offsets into s to offsets into the translated string. */
1632 size_t memory_size =
1633 xsum4 (xtimes (n, sizeof (size_t)), m, m,
1634 (o != NULL ? m : 0));
1635 char *memory =
1636 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1637 if (memory != NULL)
1638 {
1639 size_t *offtable = (size_t *) memory;
1640 char *t = (char *) (offtable + n);
1641 char *q = (char *) (t + m);
1642 char *o8 = (o != NULL ? (char *) (q + m) : NULL);
1643 int res_column;
1644 size_t i;
1645
1646 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1647
1648 /* Translate the overrides to the UTF-8 string. */
1649 if (o != NULL)
1650 {
1651 memset (o8, UC_BREAK_UNDEFINED, m);
1652 for (i = 0; i < n; i++)
1653 if (offtable[i] != (size_t)(-1))
1654 o8[offtable[i]] = o[i];
1655 }
1656
1657 /* Determine the line breaks of the UTF-8 string. */
1658 res_column =
1659 u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
1660
1661 /* Translate the result back to the original string. */
1662 memset (p, UC_BREAK_PROHIBITED, n);
1663 for (i = 0; i < n; i++)
1664 if (offtable[i] != (size_t)(-1))
1665 p[i] = q[offtable[i]];
1666
1667 free (memory);
1668 iconv_close (to_utf8);
1669 return res_column;
1670 }
1671 }
1672 iconv_close (to_utf8);
1673 }
1674 #endif
1675 /* Impossible to convert. */
1676 #if C_CTYPE_ASCII
1677 if (is_all_ascii (s, n))
1678 {
1679 /* ASCII is a subset of UTF-8. */
1680 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1681 }
1682 #endif
1683 /* We have a non-ASCII string and cannot convert it.
1684 Don't produce line breaks except those already present in the
1685 input string. All we assume here is that the encoding is
1686 minimally ASCII compatible. */
1687 {
1688 const char *s_end = s + n;
1689 while (s < s_end)
1690 {
1691 *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
1692 ? UC_BREAK_MANDATORY
1693 : UC_BREAK_PROHIBITED);
1694 s++;
1695 p++;
1696 if (o != NULL)
1697 o++;
1698 }
1699 /* We cannot compute widths in this case. */
1700 return start_column;
1701 }
1702 }
1703 }
1704
1705
1706 #ifdef TEST2
1707
1708 #include <stdio.h>
1709 #include <locale.h>
1710
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.
   The result is allocated with realloc; the caller owns it and releases it
   with free.  On a read error or when memory is exhausted, a message is
   written to stderr and the program exits with status 1, so the function
   never returns NULL.  */
char *
read_file (FILE *stream)
{
#define BUFSIZE 4096
  char *buf = NULL;
  /* Sizes are kept in size_t: fread returns size_t, and an int would
     overflow on inputs that approach INT_MAX bytes.  */
  size_t alloc = 0;
  size_t size = 0;

  while (! feof (stream))
    {
      size_t count;

      /* Make sure there is room for one more chunk of BUFSIZE bytes.  */
      if (size + BUFSIZE > alloc)
        {
          char *grown;

          /* Grow geometrically, but by at least one chunk.  */
          alloc = alloc + alloc / 2;
          if (alloc < size + BUFSIZE)
            alloc = size + BUFSIZE;
          grown = realloc (buf, alloc);
          if (grown == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = grown;
        }

      count = fread (buf + size, 1, BUFSIZE, stream);
      if (count == 0)
        {
          /* Nothing read: either end of file (the loop condition ends the
             loop) or an error.  */
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
        }
      else
        size += count;
    }

  /* Resize to the exact size, with room for the terminating NUL.  */
  {
    char *exact = realloc (buf, size + 1);
    if (exact == NULL)
      {
        fprintf (stderr, "out of memory\n");
        exit (1);
      }
    buf = exact;
  }
  buf[size] = '\0';
  return buf;
#undef BUFSIZE
}
1758
1759 int
main(int argc,char * argv[])1760 main (int argc, char * argv[])
1761 {
1762 setlocale (LC_CTYPE, "");
1763 if (argc == 1)
1764 {
1765 /* Display all the break opportunities in the input string. */
1766 char *input = read_file (stdin);
1767 int length = strlen (input);
1768 char *breaks = malloc (length);
1769 int i;
1770
1771 mbs_possible_linebreaks (input, length, locale_charset (), breaks);
1772
1773 for (i = 0; i < length; i++)
1774 {
1775 switch (breaks[i])
1776 {
1777 case UC_BREAK_POSSIBLE:
1778 putc ('|', stdout);
1779 break;
1780 case UC_BREAK_MANDATORY:
1781 break;
1782 case UC_BREAK_PROHIBITED:
1783 break;
1784 default:
1785 abort ();
1786 }
1787 putc (input[i], stdout);
1788 }
1789
1790 free (breaks);
1791
1792 return 0;
1793 }
1794 else if (argc == 2)
1795 {
1796 /* Insert line breaks for a given width. */
1797 int width = atoi (argv[1]);
1798 char *input = read_file (stdin);
1799 int length = strlen (input);
1800 char *breaks = malloc (length);
1801 int i;
1802
1803 mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
1804
1805 for (i = 0; i < length; i++)
1806 {
1807 switch (breaks[i])
1808 {
1809 case UC_BREAK_POSSIBLE:
1810 putc ('\n', stdout);
1811 break;
1812 case UC_BREAK_MANDATORY:
1813 break;
1814 case UC_BREAK_PROHIBITED:
1815 break;
1816 default:
1817 abort ();
1818 }
1819 putc (input[i], stdout);
1820 }
1821
1822 free (breaks);
1823
1824 return 0;
1825 }
1826 else
1827 return 1;
1828 }
1829
1830 #endif /* TEST2 */
1831