1 /* linebreak.c - line breaking of Unicode strings 2 Copyright (C) 2001-2003, 2006 Free Software Foundation, Inc. 3 Written by Bruno Haible <haible@clisp.cons.org>, 2001. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program; if not, write to the Free Software Foundation, 17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 18 19 #include <config.h> 20 21 /* Specification. */ 22 #include "linebreak.h" 23 24 #include <stdlib.h> 25 #include <string.h> 26 #include "c-ctype.h" 27 #include "xsize.h" 28 29 #include "utf8-ucs4.h" 30 31 #ifdef unused 32 #include "utf16-ucs4.h" 33 34 static inline int 35 u32_mbtouc (unsigned int *puc, const unsigned int *s, size_t n) 36 { 37 *puc = *s; 38 return 1; 39 } 40 #endif 41 42 43 /* Help GCC to generate good code for string comparisons with 44 immediate strings. 
/* STREQ (s1, s2, c0, ..., c8) is nonzero iff the strings S1 and S2 are
   equal.  The caller repeats the first nine characters of S2 as the
   arguments C0..C8, padding with 0.  With an optimizing GCC the
   comparison is unrolled into single-character tests against these
   constants (streq0 .. streq8), and only strings longer than nine
   characters reach strcmp (streq9).  Otherwise STREQ is plain strcmp.  */
#if defined (__GNUC__) && defined (__OPTIMIZE__)

/* Compare the tails of S1 and S2 starting at offset 9.  */
static inline int
streq9 (const char *s1, const char *s2)
{
  return strcmp (s1 + 9, s2 + 9) == 0;
}

/* Each streqK compares S1[K] with the expected character.  A matching
   NUL means both strings end here, hence they are equal; a matching
   non-NUL character continues with position K+1.  */
static inline int
streq8 (const char *s1, const char *s2, char s28)
{
  if (s1[8] == s28)
    {
      if (s28 == 0)
        return 1;
      else
        return streq9 (s1, s2);
    }
  else
    return 0;
}

static inline int
streq7 (const char *s1, const char *s2, char s27, char s28)
{
  if (s1[7] == s27)
    {
      if (s27 == 0)
        return 1;
      else
        return streq8 (s1, s2, s28);
    }
  else
    return 0;
}

static inline int
streq6 (const char *s1, const char *s2, char s26, char s27, char s28)
{
  if (s1[6] == s26)
    {
      if (s26 == 0)
        return 1;
      else
        return streq7 (s1, s2, s27, s28);
    }
  else
    return 0;
}

static inline int
streq5 (const char *s1, const char *s2, char s25, char s26, char s27, char s28)
{
  if (s1[5] == s25)
    {
      if (s25 == 0)
        return 1;
      else
        return streq6 (s1, s2, s26, s27, s28);
    }
  else
    return 0;
}

static inline int
streq4 (const char *s1, const char *s2, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[4] == s24)
    {
      if (s24 == 0)
        return 1;
      else
        return streq5 (s1, s2, s25, s26, s27, s28);
    }
  else
    return 0;
}

static inline int
streq3 (const char *s1, const char *s2, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[3] == s23)
    {
      if (s23 == 0)
        return 1;
      else
        return streq4 (s1, s2, s24, s25, s26, s27, s28);
    }
  else
    return 0;
}

static inline int
streq2 (const char *s1, const char *s2, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[2] == s22)
    {
      if (s22 == 0)
        return 1;
      else
        return streq3 (s1, s2, s23, s24, s25, s26, s27, s28);
    }
  else
    return 0;
}

static inline int
streq1 (const char *s1, const char *s2, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[1] == s21)
    {
      if (s21 == 0)
        return 1;
      else
        return streq2 (s1, s2, s22, s23, s24, s25, s26, s27, s28);
    }
  else
    return 0;
}

static inline int
streq0 (const char *s1, const char *s2, char s20, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[0] == s20)
    {
      if (s20 == 0)
        return 1;
      else
        return streq1 (s1, s2, s21, s22, s23, s24, s25, s26, s27, s28);
    }
  else
    return 0;
}

#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
  streq0 (s1, s2, s20, s21, s22, s23, s24, s25, s26, s27, s28)

#else

#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
  (strcmp (s1, s2) == 0)

#endif


/* Return 1 if ENCODING is exactly one of the legacy CJK encoding names
   listed below (the comparison is case-sensitive), 0 otherwise.
   ENCODING must be a valid NUL-terminated string.  */
static int
is_cjk_encoding (const char *encoding)
{
  if (0
      /* Legacy Japanese encodings */
      || STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)
      /* Legacy Chinese encodings */
      || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
      || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
      || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
      || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
      /* Legacy Korean encodings */
      || STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
      || STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0)
      || STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0))
    return 1;
  return 0;
}

/* Return 1 if ENCODING is exactly "UTF-8", 0 otherwise.  */
static int
is_utf8_encoding (const char *encoding)
{
  if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0))
    return 1;
  return 0;
}


/* Determine number of column positions required for UC.  */
/* uc_width is declared here, ahead of the lookup tables that precede
   its definition.  */
int uc_width (unsigned int uc, const char *encoding);

/*
 * Non-spacing attribute table.
 * Consists of:
 * - Non-spacing characters; generated from PropList.txt or
 *   "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
 * - Format control characters; generated from
 *   "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
 * - Zero width characters; generated from
 *   "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
 *
 * Layout: a bitmap with one bit per code point, stored as 16 blocks of
 * 64 bytes; each block covers 0x200 consecutive code points.  Within a
 * block, code point UC occupies bit (UC & 7) of byte ((UC >> 3) & 63).
 * Which block (if any) covers a code point is given by
 * nonspacing_table_ind below.
 */
static const unsigned char nonspacing_table_data[16*64] = {
  /* 0x0000-0x01ff */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x20, 0x00, 0x00, /* 0x0080-0x00bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */
  /* 0x0200-0x03ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */
  0xff, 0xff, 0xff, 0xe0, 0xff, 0xff, 0x00, 0x00, /* 0x0340-0x037f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */
  /* 0x0400-0x05ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */
  0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */
  0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */
  0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
  /* 0x0600-0x07ff */
  0x0f, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
  0x00, 0xf8, 0xff, 0x01, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
  0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
  0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
  0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
  0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */
  /* 0x0800-0x09ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */
  0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
  0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */
  /* 0x0a00-0x0bff */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */
  0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */
  0xbe, 0x21, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */
  0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */
  0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */
  0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */
  /* 0x0c00-0x0dff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */
  0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0c80-0x0cbf */
  0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */
  0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */
  0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */
  /* 0x0e00-0x0fff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */
  0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */
  0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */
  0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */
  0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */
  /* 0x1000-0x11ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */
  /* 0x1600-0x17ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */
  0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, /* 0x1700-0x173f */
  0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, /* 0x1740-0x177f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x3f, /* 0x1780-0x17bf */
  0x40, 0xfe, 0x0f, 0x20, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */
  /* 0x1800-0x19ff */
  0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */
  0x00, 0x00, 0x00, 0x00, 0x87, 0x0f, 0x04, 0x0e, /* 0x1900-0x193f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */
  /* 0x2000-0x21ff */
  0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */
  0x00, 0x00, 0x00, 0x00, 0x0f, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */
  0x00, 0x00, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00, /* 0x20c0-0x20ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */
  /* 0x3000-0x31ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */
  0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */
  /* 0xfa00-0xfbff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */
  0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */
  /* 0xfe00-0xffff */
  0xff, 0xff, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, /* 0xffc0-0xffff */
  /* 0x1d000-0x1d1ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d000-0x1d03f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d040-0x1d07f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d080-0x1d0bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d0c0-0x1d0ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d100-0x1d13f */
  0x00, 0x00, 0x00, 0x00, 0x80, 0x03, 0x00, 0xf8, /* 0x1d140-0x1d17f */
  0xe7, 0x0f, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, /* 0x1d180-0x1d1bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* 0x1d1c0-0x1d1ff */
};

/* Block index for nonspacing_table_data, indexed by (UC >> 9); one entry
   per 0x200 code points, covering 0x0000-0x1dfff.  A value >= 0 is the
   number of the 64-byte block holding the bitmap for that range; -1
   means no code point in the range is flagged.  */
static const signed char nonspacing_table_ind[240] = {
   0,  1,  2,  3,  4,  5,  6,  7, /* 0x0000-0x0fff */
   8, -1, -1,  9, 10, -1, -1, -1, /* 0x1000-0x1fff */
  11, -1, -1, -1, -1, -1, -1, -1, /* 0x2000-0x2fff */
  12, -1, -1, -1, -1, -1, -1, -1, /* 0x3000-0x3fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x4000-0x4fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x5000-0x5fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x6000-0x6fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x7000-0x7fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x8000-0x8fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x9000-0x9fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa000-0xafff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb000-0xbfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xc000-0xcfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xd000-0xdfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */
  -1, -1, -1, -1, -1, 13, -1, 14, /* 0xf000-0xffff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10000-0x10fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x11000-0x11fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x12000-0x12fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x13000-0x13fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x14000-0x14fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x15000-0x15fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x16000-0x16fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x17000-0x17fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x18000-0x18fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x19000-0x19fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1a000-0x1afff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1b000-0x1bfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1c000-0x1cfff */
  15, -1, -1, -1, -1, -1, -1, -1  /* 0x1d000-0x1dfff */
};

/* Determine number of column positions required for UC.  */
*/ 411 int 412 uc_width (unsigned int uc, const char *encoding) 413 { 414 /* Test for non-spacing or control character. */ 415 if ((uc >> 9) < 240) 416 { 417 int ind = nonspacing_table_ind[uc >> 9]; 418 if (ind >= 0) 419 if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1) 420 { 421 if (uc > 0 && uc < 0xa0) 422 return -1; 423 else 424 return 0; 425 } 426 } 427 else if ((uc >> 9) == (0xe0000 >> 9)) 428 { 429 if (uc < 0xe0100 430 ? (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001) 431 : (uc <= 0xe01ef)) 432 return 0; 433 } 434 /* Test for double-width character. 435 * Generated from "grep '^....;[WF]' EastAsianWidth.txt" 436 * and "grep '^....;[^WF]' EastAsianWidth.txt" 437 */ 438 if (uc >= 0x1100 439 && ((uc < 0x1160) /* Hangul Jamo */ 440 || (uc >= 0x2e80 && uc < 0x4dc0 /* CJK */ 441 && !(uc == 0x303f)) 442 || (uc >= 0x4e00 && uc < 0xa4d0) /* CJK ... Yi */ 443 || (uc >= 0xac00 && uc < 0xd7a4) /* Hangul Syllables */ 444 || (uc >= 0xf900 && uc < 0xfb00) /* CJK Compatibility Ideographs */ 445 || (uc >= 0xfe30 && uc < 0xfe70) /* CJK Compatibility Forms */ 446 || (uc >= 0xff00 && uc < 0xff61) /* Fullwidth Forms */ 447 || (uc >= 0xffe0 && uc < 0xffe7) 448 || (uc >= 0x20000 && uc <= 0x2fffd) /* CJK, CJK Compatibility Ideographs */ 449 || (uc >= 0x30000 && uc <= 0x3fffd) 450 ) ) 451 return 2; 452 /* In ancient CJK encodings, Cyrillic and most other characters are 453 double-width as well. */ 454 if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9 455 && is_cjk_encoding (encoding)) 456 return 2; 457 return 1; 458 } 459 460 461 #ifdef unused 462 463 /* Determine number of column positions required for first N units 464 (or fewer if S ends before this) in S. 
*/ 465 466 int 467 u8_width (const unsigned char *s, size_t n, const char *encoding) 468 { 469 const unsigned char *s_end = s + n; 470 int width = 0; 471 472 while (s < s_end) 473 { 474 unsigned int uc; 475 int w; 476 477 s += u8_mbtouc (&uc, s, s_end - s); 478 479 if (uc == 0) 480 break; /* end of string reached */ 481 482 w = uc_width (uc, encoding); 483 if (w >= 0) /* ignore control characters in the string */ 484 width += w; 485 } 486 487 return width; 488 } 489 490 int 491 u16_width (const unsigned short *s, size_t n, const char *encoding) 492 { 493 const unsigned short *s_end = s + n; 494 int width = 0; 495 496 while (s < s_end) 497 { 498 unsigned int uc; 499 int w; 500 501 s += u16_mbtouc (&uc, s, s_end - s); 502 503 if (uc == 0) 504 break; /* end of string reached */ 505 506 w = uc_width (uc, encoding); 507 if (w >= 0) /* ignore control characters in the string */ 508 width += w; 509 } 510 511 return width; 512 } 513 514 int 515 u32_width (const unsigned int *s, size_t n, const char *encoding) 516 { 517 const unsigned int *s_end = s + n; 518 int width = 0; 519 520 while (s < s_end) 521 { 522 unsigned int uc = *s++; 523 int w; 524 525 if (uc == 0) 526 break; /* end of string reached */ 527 528 w = uc_width (uc, encoding); 529 if (w >= 0) /* ignore control characters in the string */ 530 width += w; 531 } 532 533 return width; 534 } 535 536 #endif 537 538 539 /* Determine the line break points in S, and store the result at p[0..n-1]. */ 540 /* We don't support line breaking of complex-context dependent characters 541 (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */ 542 543 /* Line breaking classification. */ 544 545 enum 546 { 547 /* Values >= 20 are resolved at run time. 
*/ 548 LBP_BK = 0, /* mandatory break */ 549 /*LBP_CR, carriage return - not used here because it's a DOSism */ 550 /*LBP_LF, line feed - not used here because it's a DOSism */ 551 LBP_CM = 20, /* attached characters and combining marks */ 552 /*LBP_SG, surrogates - not used here because they are not characters */ 553 LBP_ZW = 1, /* zero width space */ 554 LBP_IN = 2, /* inseparable */ 555 LBP_GL = 3, /* non-breaking (glue) */ 556 LBP_CB = 22, /* contingent break opportunity */ 557 LBP_SP = 21, /* space */ 558 LBP_BA = 4, /* break opportunity after */ 559 LBP_BB = 5, /* break opportunity before */ 560 LBP_B2 = 6, /* break opportunity before and after */ 561 LBP_HY = 7, /* hyphen */ 562 LBP_NS = 8, /* non starter */ 563 LBP_OP = 9, /* opening punctuation */ 564 LBP_CL = 10, /* closing punctuation */ 565 LBP_QU = 11, /* ambiguous quotation */ 566 LBP_EX = 12, /* exclamation/interrogation */ 567 LBP_ID = 13, /* ideographic */ 568 LBP_NU = 14, /* numeric */ 569 LBP_IS = 15, /* infix separator (numeric) */ 570 LBP_SY = 16, /* symbols allowing breaks */ 571 LBP_AL = 17, /* ordinary alphabetic and symbol characters */ 572 LBP_PR = 18, /* prefix (numeric) */ 573 LBP_PO = 19, /* postfix (numeric) */ 574 LBP_SA = 23, /* complex context (South East Asian) */ 575 LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */ 576 LBP_XX = 25 /* unknown */ 577 }; 578 579 #include "lbrkprop.h" 580 581 static inline unsigned char 582 lbrkprop_lookup (unsigned int uc) 583 { 584 unsigned int index1 = uc >> lbrkprop_header_0; 585 if (index1 < lbrkprop_header_1) 586 { 587 int lookup1 = lbrkprop.level1[index1]; 588 if (lookup1 >= 0) 589 { 590 unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3; 591 int lookup2 = lbrkprop.level2[lookup1 + index2]; 592 if (lookup2 >= 0) 593 { 594 unsigned int index3 = uc & lbrkprop_header_4; 595 return lbrkprop.level3[lookup2 + index3]; 596 } 597 } 598 } 599 return LBP_XX; 600 } 601 602 /* Table indexed by two line breaking classifications. 
/* lbrk_table[BEFORE-1][AFTER-1] classifies the position between a
   character of class BEFORE and a following character of class AFTER,
   where both classes are LBP_* values in the range 1..19.  Rows are the
   "before" class, columns the "after" class.  */
#define D 1  /* direct break opportunity, empty in table 7.3 of UTR #14 */
#define I 2  /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
#define P 3  /* prohibited break,           '^' in table 7.3 of UTR #14 */
static const unsigned char lbrk_table[19][19] = {
                                /* after */
        /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
/* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
/* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
/* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
/* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
/* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
/* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
/* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
/* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
/* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
/* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
/* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
/* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
/* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* "" */
/* before */
};
/* Note: The (B2,B2) entry should probably be D instead of P.  */
/* Note: The (PR,ID) entry should probably be D instead of I.  */
*/ 633 634 void 635 u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p) 636 { 637 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); 638 const unsigned char *s_end = s + n; 639 int last_prop = LBP_BK; /* line break property of last non-space character */ 640 char *seen_space = NULL; /* Was a space seen after the last non-space character? */ 641 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ 642 643 /* Don't break inside multibyte characters. */ 644 memset (p, UC_BREAK_PROHIBITED, n); 645 646 while (s < s_end) 647 { 648 unsigned int uc; 649 int count = u8_mbtouc (&uc, s, s_end - s); 650 int prop = lbrkprop_lookup (uc); 651 652 if (prop == LBP_BK) 653 { 654 /* Mandatory break. */ 655 *p = UC_BREAK_MANDATORY; 656 last_prop = LBP_BK; 657 seen_space = NULL; 658 seen_space2 = NULL; 659 } 660 else 661 { 662 char *q; 663 664 /* Resolve property values whose behaviour is not fixed. */ 665 switch (prop) 666 { 667 case LBP_AI: 668 /* Resolve ambiguous. */ 669 prop = LBP_AI_REPLACEMENT; 670 break; 671 case LBP_CB: 672 /* This is arbitrary. */ 673 prop = LBP_ID; 674 break; 675 case LBP_SA: 676 /* We don't handle complex scripts yet. 677 Treat LBP_SA like LBP_XX. */ 678 case LBP_XX: 679 /* This is arbitrary. */ 680 prop = LBP_AL; 681 break; 682 } 683 684 /* Deal with combining characters. */ 685 q = p; 686 if (prop == LBP_CM) 687 { 688 /* Don't break just before a combining character. */ 689 *p = UC_BREAK_PROHIBITED; 690 /* A combining character turns a preceding space into LBP_AL. */ 691 if (seen_space != NULL) 692 { 693 q = seen_space; 694 seen_space = seen_space2; 695 prop = LBP_AL; 696 goto lookup_via_table; 697 } 698 } 699 else if (prop == LBP_SP) 700 { 701 /* Don't break just before a space. */ 702 *p = UC_BREAK_PROHIBITED; 703 seen_space2 = seen_space; 704 seen_space = p; 705 } 706 else 707 { 708 lookup_via_table: 709 /* prop must be usable as an index for table 7.3 of UTR #14. 
*/ 710 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0]))) 711 abort (); 712 713 if (last_prop == LBP_BK) 714 { 715 /* Don't break at the beginning of a line. */ 716 *q = UC_BREAK_PROHIBITED; 717 } 718 else 719 { 720 switch (lbrk_table [last_prop-1] [prop-1]) 721 { 722 case D: 723 *q = UC_BREAK_POSSIBLE; 724 break; 725 case I: 726 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); 727 break; 728 case P: 729 *q = UC_BREAK_PROHIBITED; 730 break; 731 default: 732 abort (); 733 } 734 } 735 last_prop = prop; 736 seen_space = NULL; 737 seen_space2 = NULL; 738 } 739 } 740 741 s += count; 742 p += count; 743 } 744 } 745 746 #ifdef unused 747 748 void 749 u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p) 750 { 751 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); 752 const unsigned short *s_end = s + n; 753 int last_prop = LBP_BK; /* line break property of last non-space character */ 754 char *seen_space = NULL; /* Was a space seen after the last non-space character? */ 755 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ 756 757 /* Don't break inside multibyte characters. */ 758 memset (p, UC_BREAK_PROHIBITED, n); 759 760 while (s < s_end) 761 { 762 unsigned int uc; 763 int count = u16_mbtouc (&uc, s, s_end - s); 764 int prop = lbrkprop_lookup (uc); 765 766 if (prop == LBP_BK) 767 { 768 /* Mandatory break. */ 769 *p = UC_BREAK_MANDATORY; 770 last_prop = LBP_BK; 771 seen_space = NULL; 772 seen_space2 = NULL; 773 } 774 else 775 { 776 char *q; 777 778 /* Resolve property values whose behaviour is not fixed. */ 779 switch (prop) 780 { 781 case LBP_AI: 782 /* Resolve ambiguous. */ 783 prop = LBP_AI_REPLACEMENT; 784 break; 785 case LBP_CB: 786 /* This is arbitrary. */ 787 prop = LBP_ID; 788 break; 789 case LBP_SA: 790 /* We don't handle complex scripts yet. 791 Treat LBP_SA like LBP_XX. */ 792 case LBP_XX: 793 /* This is arbitrary. 
*/ 794 prop = LBP_AL; 795 break; 796 } 797 798 /* Deal with combining characters. */ 799 q = p; 800 if (prop == LBP_CM) 801 { 802 /* Don't break just before a combining character. */ 803 *p = UC_BREAK_PROHIBITED; 804 /* A combining character turns a preceding space into LBP_AL. */ 805 if (seen_space != NULL) 806 { 807 q = seen_space; 808 seen_space = seen_space2; 809 prop = LBP_AL; 810 goto lookup_via_table; 811 } 812 } 813 else if (prop == LBP_SP) 814 { 815 /* Don't break just before a space. */ 816 *p = UC_BREAK_PROHIBITED; 817 seen_space2 = seen_space; 818 seen_space = p; 819 } 820 else 821 { 822 lookup_via_table: 823 /* prop must be usable as an index for table 7.3 of UTR #14. */ 824 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0]))) 825 abort (); 826 827 if (last_prop == LBP_BK) 828 { 829 /* Don't break at the beginning of a line. */ 830 *q = UC_BREAK_PROHIBITED; 831 } 832 else 833 { 834 switch (lbrk_table [last_prop-1] [prop-1]) 835 { 836 case D: 837 *q = UC_BREAK_POSSIBLE; 838 break; 839 case I: 840 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); 841 break; 842 case P: 843 *q = UC_BREAK_PROHIBITED; 844 break; 845 default: 846 abort (); 847 } 848 } 849 last_prop = prop; 850 seen_space = NULL; 851 seen_space2 = NULL; 852 } 853 } 854 855 s += count; 856 p += count; 857 } 858 } 859 860 void 861 u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p) 862 { 863 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); 864 const unsigned int *s_end = s + n; 865 int last_prop = LBP_BK; /* line break property of last non-space character */ 866 char *seen_space = NULL; /* Was a space seen after the last non-space character? */ 867 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ 868 869 while (s < s_end) 870 { 871 unsigned int uc = *s; 872 int prop = lbrkprop_lookup (uc); 873 874 if (prop == LBP_BK) 875 { 876 /* Mandatory break. 
*/ 877 *p = UC_BREAK_MANDATORY; 878 last_prop = LBP_BK; 879 seen_space = NULL; 880 seen_space2 = NULL; 881 } 882 else 883 { 884 char *q; 885 886 /* Resolve property values whose behaviour is not fixed. */ 887 switch (prop) 888 { 889 case LBP_AI: 890 /* Resolve ambiguous. */ 891 prop = LBP_AI_REPLACEMENT; 892 break; 893 case LBP_CB: 894 /* This is arbitrary. */ 895 prop = LBP_ID; 896 break; 897 case LBP_SA: 898 /* We don't handle complex scripts yet. 899 Treat LBP_SA like LBP_XX. */ 900 case LBP_XX: 901 /* This is arbitrary. */ 902 prop = LBP_AL; 903 break; 904 } 905 906 /* Deal with combining characters. */ 907 q = p; 908 if (prop == LBP_CM) 909 { 910 /* Don't break just before a combining character. */ 911 *p = UC_BREAK_PROHIBITED; 912 /* A combining character turns a preceding space into LBP_AL. */ 913 if (seen_space != NULL) 914 { 915 q = seen_space; 916 seen_space = seen_space2; 917 prop = LBP_AL; 918 goto lookup_via_table; 919 } 920 } 921 else if (prop == LBP_SP) 922 { 923 /* Don't break just before a space. */ 924 *p = UC_BREAK_PROHIBITED; 925 seen_space2 = seen_space; 926 seen_space = p; 927 } 928 else 929 { 930 lookup_via_table: 931 /* prop must be usable as an index for table 7.3 of UTR #14. */ 932 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0]))) 933 abort (); 934 935 if (last_prop == LBP_BK) 936 { 937 /* Don't break at the beginning of a line. */ 938 *q = UC_BREAK_PROHIBITED; 939 } 940 else 941 { 942 switch (lbrk_table [last_prop-1] [prop-1]) 943 { 944 case D: 945 *q = UC_BREAK_POSSIBLE; 946 break; 947 case I: 948 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); 949 break; 950 case P: 951 *q = UC_BREAK_PROHIBITED; 952 break; 953 default: 954 abort (); 955 } 956 } 957 last_prop = prop; 958 seen_space = NULL; 959 seen_space2 = NULL; 960 } 961 } 962 963 s++; 964 p++; 965 } 966 } 967 968 #endif 969 970 971 /* Choose the best line breaks, assuming the uc_width function. 
972 Return the column after the end of the string. */ 973 974 int 975 u8_width_linebreaks (const unsigned char *s, size_t n, 976 int width, int start_column, int at_end_columns, 977 const char *o, const char *encoding, 978 char *p) 979 { 980 const unsigned char *s_end; 981 char *last_p; 982 int last_column; 983 int piece_width; 984 985 u8_possible_linebreaks (s, n, encoding, p); 986 987 s_end = s + n; 988 last_p = NULL; 989 last_column = start_column; 990 piece_width = 0; 991 while (s < s_end) 992 { 993 unsigned int uc; 994 int count = u8_mbtouc (&uc, s, s_end - s); 995 996 /* Respect the override. */ 997 if (o != NULL && *o != UC_BREAK_UNDEFINED) 998 *p = *o; 999 1000 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY) 1001 { 1002 /* An atomic piece of text ends here. */ 1003 if (last_p != NULL && last_column + piece_width > width) 1004 { 1005 /* Insert a line break. */ 1006 *last_p = UC_BREAK_POSSIBLE; 1007 last_column = 0; 1008 } 1009 } 1010 1011 if (*p == UC_BREAK_MANDATORY) 1012 { 1013 /* uc is a line break character. */ 1014 /* Start a new piece at column 0. */ 1015 last_p = NULL; 1016 last_column = 0; 1017 piece_width = 0; 1018 } 1019 else 1020 { 1021 /* uc is not a line break character. */ 1022 int w; 1023 1024 if (*p == UC_BREAK_POSSIBLE) 1025 { 1026 /* Start a new piece. */ 1027 last_p = p; 1028 last_column += piece_width; 1029 piece_width = 0; 1030 /* No line break for the moment, may be turned into 1031 UC_BREAK_POSSIBLE later, via last_p. */ 1032 } 1033 1034 *p = UC_BREAK_PROHIBITED; 1035 1036 w = uc_width (uc, encoding); 1037 if (w >= 0) /* ignore control characters in the string */ 1038 piece_width += w; 1039 } 1040 1041 s += count; 1042 p += count; 1043 if (o != NULL) 1044 o += count; 1045 } 1046 1047 /* The last atomic piece of text ends here. */ 1048 if (last_p != NULL && last_column + piece_width + at_end_columns > width) 1049 { 1050 /* Insert a line break. 
*/ 1051 *last_p = UC_BREAK_POSSIBLE; 1052 last_column = 0; 1053 } 1054 1055 return last_column + piece_width; 1056 } 1057 1058 #ifdef unused 1059 1060 int 1061 u16_width_linebreaks (const unsigned short *s, size_t n, 1062 int width, int start_column, int at_end_columns, 1063 const char *o, const char *encoding, 1064 char *p) 1065 { 1066 const unsigned short *s_end; 1067 char *last_p; 1068 int last_column; 1069 int piece_width; 1070 1071 u16_possible_linebreaks (s, n, encoding, p); 1072 1073 s_end = s + n; 1074 last_p = NULL; 1075 last_column = start_column; 1076 piece_width = 0; 1077 while (s < s_end) 1078 { 1079 unsigned int uc; 1080 int count = u16_mbtouc (&uc, s, s_end - s); 1081 1082 /* Respect the override. */ 1083 if (o != NULL && *o != UC_BREAK_UNDEFINED) 1084 *p = *o; 1085 1086 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY) 1087 { 1088 /* An atomic piece of text ends here. */ 1089 if (last_p != NULL && last_column + piece_width > width) 1090 { 1091 /* Insert a line break. */ 1092 *last_p = UC_BREAK_POSSIBLE; 1093 last_column = 0; 1094 } 1095 } 1096 1097 if (*p == UC_BREAK_MANDATORY) 1098 { 1099 /* uc is a line break character. */ 1100 /* Start a new piece at column 0. */ 1101 last_p = NULL; 1102 last_column = 0; 1103 piece_width = 0; 1104 } 1105 else 1106 { 1107 /* uc is not a line break character. */ 1108 int w; 1109 1110 if (*p == UC_BREAK_POSSIBLE) 1111 { 1112 /* Start a new piece. */ 1113 last_p = p; 1114 last_column += piece_width; 1115 piece_width = 0; 1116 /* No line break for the moment, may be turned into 1117 UC_BREAK_POSSIBLE later, via last_p. */ 1118 } 1119 1120 *p = UC_BREAK_PROHIBITED; 1121 1122 w = uc_width (uc, encoding); 1123 if (w >= 0) /* ignore control characters in the string */ 1124 piece_width += w; 1125 } 1126 1127 s += count; 1128 p += count; 1129 if (o != NULL) 1130 o += count; 1131 } 1132 1133 /* The last atomic piece of text ends here. 
*/ 1134 if (last_p != NULL && last_column + piece_width + at_end_columns > width) 1135 { 1136 /* Insert a line break. */ 1137 *last_p = UC_BREAK_POSSIBLE; 1138 last_column = 0; 1139 } 1140 1141 return last_column + piece_width; 1142 } 1143 1144 int 1145 u32_width_linebreaks (const unsigned int *s, size_t n, 1146 int width, int start_column, int at_end_columns, 1147 const char *o, const char *encoding, 1148 char *p) 1149 { 1150 const unsigned int *s_end; 1151 char *last_p; 1152 int last_column; 1153 int piece_width; 1154 1155 u32_possible_linebreaks (s, n, encoding, p); 1156 1157 s_end = s + n; 1158 last_p = NULL; 1159 last_column = start_column; 1160 piece_width = 0; 1161 while (s < s_end) 1162 { 1163 unsigned int uc = *s; 1164 1165 /* Respect the override. */ 1166 if (o != NULL && *o != UC_BREAK_UNDEFINED) 1167 *p = *o; 1168 1169 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY) 1170 { 1171 /* An atomic piece of text ends here. */ 1172 if (last_p != NULL && last_column + piece_width > width) 1173 { 1174 /* Insert a line break. */ 1175 *last_p = UC_BREAK_POSSIBLE; 1176 last_column = 0; 1177 } 1178 } 1179 1180 if (*p == UC_BREAK_MANDATORY) 1181 { 1182 /* uc is a line break character. */ 1183 /* Start a new piece at column 0. */ 1184 last_p = NULL; 1185 last_column = 0; 1186 piece_width = 0; 1187 } 1188 else 1189 { 1190 /* uc is not a line break character. */ 1191 int w; 1192 1193 if (*p == UC_BREAK_POSSIBLE) 1194 { 1195 /* Start a new piece. */ 1196 last_p = p; 1197 last_column += piece_width; 1198 piece_width = 0; 1199 /* No line break for the moment, may be turned into 1200 UC_BREAK_POSSIBLE later, via last_p. */ 1201 } 1202 1203 *p = UC_BREAK_PROHIBITED; 1204 1205 w = uc_width (uc, encoding); 1206 if (w >= 0) /* ignore control characters in the string */ 1207 piece_width += w; 1208 } 1209 1210 s++; 1211 p++; 1212 if (o != NULL) 1213 o++; 1214 } 1215 1216 /* The last atomic piece of text ends here. 
*/ 1217 if (last_p != NULL && last_column + piece_width + at_end_columns > width) 1218 { 1219 /* Insert a line break. */ 1220 *last_p = UC_BREAK_POSSIBLE; 1221 last_column = 0; 1222 } 1223 1224 return last_column + piece_width; 1225 } 1226 1227 #endif 1228 1229 1230 #ifdef TEST1 1231 1232 #include <stdio.h> 1233 1234 /* Read the contents of an input stream, and return it, terminated with a NUL 1235 byte. */ 1236 char * 1237 read_file (FILE *stream) 1238 { 1239 #define BUFSIZE 4096 1240 char *buf = NULL; 1241 int alloc = 0; 1242 int size = 0; 1243 int count; 1244 1245 while (! feof (stream)) 1246 { 1247 if (size + BUFSIZE > alloc) 1248 { 1249 alloc = alloc + alloc / 2; 1250 if (alloc < size + BUFSIZE) 1251 alloc = size + BUFSIZE; 1252 buf = realloc (buf, alloc); 1253 if (buf == NULL) 1254 { 1255 fprintf (stderr, "out of memory\n"); 1256 exit (1); 1257 } 1258 } 1259 count = fread (buf + size, 1, BUFSIZE, stream); 1260 if (count == 0) 1261 { 1262 if (ferror (stream)) 1263 { 1264 perror ("fread"); 1265 exit (1); 1266 } 1267 } 1268 else 1269 size += count; 1270 } 1271 buf = realloc (buf, size + 1); 1272 if (buf == NULL) 1273 { 1274 fprintf (stderr, "out of memory\n"); 1275 exit (1); 1276 } 1277 buf[size] = '\0'; 1278 return buf; 1279 #undef BUFSIZE 1280 } 1281 1282 int 1283 main (int argc, char * argv[]) 1284 { 1285 if (argc == 1) 1286 { 1287 /* Display all the break opportunities in the input string. 
*/ 1288 char *input = read_file (stdin); 1289 int length = strlen (input); 1290 char *breaks = malloc (length); 1291 int i; 1292 1293 u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks); 1294 1295 for (i = 0; i < length; i++) 1296 { 1297 switch (breaks[i]) 1298 { 1299 case UC_BREAK_POSSIBLE: 1300 /* U+2027 in UTF-8 encoding */ 1301 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout); 1302 break; 1303 case UC_BREAK_MANDATORY: 1304 /* U+21B2 (or U+21B5) in UTF-8 encoding */ 1305 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout); 1306 break; 1307 case UC_BREAK_PROHIBITED: 1308 break; 1309 default: 1310 abort (); 1311 } 1312 putc (input[i], stdout); 1313 } 1314 1315 free (breaks); 1316 1317 return 0; 1318 } 1319 else if (argc == 2) 1320 { 1321 /* Insert line breaks for a given width. */ 1322 int width = atoi (argv[1]); 1323 char *input = read_file (stdin); 1324 int length = strlen (input); 1325 char *breaks = malloc (length); 1326 int i; 1327 1328 u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks); 1329 1330 for (i = 0; i < length; i++) 1331 { 1332 switch (breaks[i]) 1333 { 1334 case UC_BREAK_POSSIBLE: 1335 putc ('\n', stdout); 1336 break; 1337 case UC_BREAK_MANDATORY: 1338 break; 1339 case UC_BREAK_PROHIBITED: 1340 break; 1341 default: 1342 abort (); 1343 } 1344 putc (input[i], stdout); 1345 } 1346 1347 free (breaks); 1348 1349 return 0; 1350 } 1351 else 1352 return 1; 1353 } 1354 1355 #endif /* TEST1 */ 1356 1357 1358 /* Now the same thing with an arbitrary encoding. 1359 1360 We convert the input string to Unicode. 1361 1362 The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16, 1363 UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to 1364 \U0000FFFF. UTF-16 and variants support only characters up to 1365 \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1. 1366 UCS-4 specification leaves doubts about endianness and byte order mark. 
1367 glibc currently interprets it as big endian without byte order mark, 1368 but this is not backed by an RFC. So we use UTF-8. It supports 1369 characters up to \U7FFFFFFF and is unambiguously defined. */ 1370 1371 #if HAVE_ICONV 1372 1373 #include <iconv.h> 1374 #include <errno.h> 1375 1376 /* Luckily, the encoding's name is platform independent. */ 1377 #define UTF8_NAME "UTF-8" 1378 1379 /* Return the length of a string after conversion through an iconv_t. */ 1380 static size_t 1381 iconv_string_length (iconv_t cd, const char *s, size_t n) 1382 { 1383 #define TMPBUFSIZE 4096 1384 size_t count = 0; 1385 char tmpbuf[TMPBUFSIZE]; 1386 const char *inptr = s; 1387 size_t insize = n; 1388 while (insize > 0) 1389 { 1390 char *outptr = tmpbuf; 1391 size_t outsize = TMPBUFSIZE; 1392 size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize); 1393 if (res == (size_t)(-1) && errno != E2BIG) 1394 return (size_t)(-1); 1395 count += outptr - tmpbuf; 1396 } 1397 /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug. */ 1398 #if defined _LIBICONV_VERSION \ 1399 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun) 1400 { 1401 char *outptr = tmpbuf; 1402 size_t outsize = TMPBUFSIZE; 1403 size_t res = iconv (cd, NULL, NULL, &outptr, &outsize); 1404 if (res == (size_t)(-1)) 1405 return (size_t)(-1); 1406 count += outptr - tmpbuf; 1407 } 1408 /* Return to the initial state. */ 1409 iconv (cd, NULL, NULL, NULL, NULL); 1410 #endif 1411 return count; 1412 #undef TMPBUFSIZE 1413 } 1414 1415 static void 1416 iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n, 1417 size_t *offtable, char *t, size_t m) 1418 { 1419 size_t i; 1420 const char *s_end; 1421 const char *inptr; 1422 char *outptr; 1423 size_t outsize; 1424 /* Avoid glibc-2.1 bug. 
*/ 1425 #if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) 1426 const size_t extra = 1; 1427 #else 1428 const size_t extra = 0; 1429 #endif 1430 1431 for (i = 0; i < n; i++) 1432 offtable[i] = (size_t)(-1); 1433 1434 s_end = s + n; 1435 inptr = s; 1436 outptr = t; 1437 outsize = m + extra; 1438 while (inptr < s_end) 1439 { 1440 const char *saved_inptr; 1441 size_t insize; 1442 size_t res; 1443 1444 offtable[inptr - s] = outptr - t; 1445 1446 saved_inptr = inptr; 1447 res = (size_t)(-1); 1448 for (insize = 1; inptr + insize <= s_end; insize++) 1449 { 1450 res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize); 1451 if (!(res == (size_t)(-1) && errno == EINVAL)) 1452 break; 1453 /* We expect that no input bytes have been consumed so far. */ 1454 if (inptr != saved_inptr) 1455 abort (); 1456 } 1457 /* After we verified the convertibility and computed the translation's 1458 size m, there shouldn't be any conversion error here. */ 1459 if (res == (size_t)(-1)) 1460 abort (); 1461 } 1462 /* Avoid glibc-2.1 bug and Solaris 7 bug. */ 1463 #if defined _LIBICONV_VERSION \ 1464 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun) 1465 if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1)) 1466 abort (); 1467 #endif 1468 /* We should have produced exactly m output bytes. */ 1469 if (outsize != extra) 1470 abort (); 1471 } 1472 1473 #endif /* HAVE_ICONV */ 1474 1475 #if C_CTYPE_ASCII 1476 1477 /* Tests whether a string is entirely ASCII. Returns 1 if yes. 1478 Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding. 
*/ 1479 static int 1480 is_all_ascii (const char *s, size_t n) 1481 { 1482 for (; n > 0; s++, n--) 1483 { 1484 unsigned char c = (unsigned char) *s; 1485 1486 if (!(c_isprint (c) || c_isspace (c))) 1487 return 0; 1488 } 1489 return 1; 1490 } 1491 1492 #endif /* C_CTYPE_ASCII */ 1493 1494 #if defined unused || defined TEST2 1495 1496 void 1497 mbs_possible_linebreaks (const char *s, size_t n, const char *encoding, 1498 char *p) 1499 { 1500 if (n == 0) 1501 return; 1502 if (is_utf8_encoding (encoding)) 1503 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p); 1504 else 1505 { 1506 #if HAVE_ICONV 1507 iconv_t to_utf8; 1508 /* Avoid glibc-2.1 bug with EUC-KR. */ 1509 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION 1510 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)) 1511 to_utf8 = (iconv_t)(-1); 1512 else 1513 # endif 1514 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK, 1515 GB18030. */ 1516 # if defined __sun && !defined _LIBICONV_VERSION 1517 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) 1518 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0) 1519 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0) 1520 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C') 1521 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0) 1522 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) 1523 to_utf8 = (iconv_t)(-1); 1524 else 1525 # endif 1526 to_utf8 = iconv_open (UTF8_NAME, encoding); 1527 if (to_utf8 != (iconv_t)(-1)) 1528 { 1529 /* Determine the length of the resulting UTF-8 string. */ 1530 size_t m = iconv_string_length (to_utf8, s, n); 1531 if (m != (size_t)(-1)) 1532 { 1533 /* Convert the string to UTF-8 and build a translation table 1534 from offsets into s to offsets into the translated string. 
*/ 1535 size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m); 1536 char *memory = 1537 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL); 1538 if (memory != NULL) 1539 { 1540 size_t *offtable = (size_t *) memory; 1541 char *t = (char *) (offtable + n); 1542 char *q = (char *) (t + m); 1543 size_t i; 1544 1545 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m); 1546 1547 /* Determine the possible line breaks of the UTF-8 string. */ 1548 u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q); 1549 1550 /* Translate the result back to the original string. */ 1551 memset (p, UC_BREAK_PROHIBITED, n); 1552 for (i = 0; i < n; i++) 1553 if (offtable[i] != (size_t)(-1)) 1554 p[i] = q[offtable[i]]; 1555 1556 free (memory); 1557 iconv_close (to_utf8); 1558 return; 1559 } 1560 } 1561 iconv_close (to_utf8); 1562 } 1563 #endif 1564 /* Impossible to convert. */ 1565 #if C_CTYPE_ASCII 1566 if (is_all_ascii (s, n)) 1567 { 1568 /* ASCII is a subset of UTF-8. */ 1569 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p); 1570 return; 1571 } 1572 #endif 1573 /* We have a non-ASCII string and cannot convert it. 1574 Don't produce line breaks except those already present in the 1575 input string. All we assume here is that the encoding is 1576 minimally ASCII compatible. */ 1577 { 1578 const char *s_end = s + n; 1579 while (s < s_end) 1580 { 1581 *p = (*s == '\n' ? 
UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED); 1582 s++; 1583 p++; 1584 } 1585 } 1586 } 1587 } 1588 1589 #endif 1590 1591 int 1592 mbs_width_linebreaks (const char *s, size_t n, 1593 int width, int start_column, int at_end_columns, 1594 const char *o, const char *encoding, 1595 char *p) 1596 { 1597 if (n == 0) 1598 return start_column; 1599 if (is_utf8_encoding (encoding)) 1600 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p); 1601 else 1602 { 1603 #if HAVE_ICONV 1604 iconv_t to_utf8; 1605 /* Avoid glibc-2.1 bug with EUC-KR. */ 1606 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION 1607 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)) 1608 to_utf8 = (iconv_t)(-1); 1609 else 1610 # endif 1611 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK, 1612 GB18030. */ 1613 # if defined __sun && !defined _LIBICONV_VERSION 1614 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) 1615 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0) 1616 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0) 1617 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C') 1618 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0) 1619 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) 1620 to_utf8 = (iconv_t)(-1); 1621 else 1622 # endif 1623 to_utf8 = iconv_open (UTF8_NAME, encoding); 1624 if (to_utf8 != (iconv_t)(-1)) 1625 { 1626 /* Determine the length of the resulting UTF-8 string. */ 1627 size_t m = iconv_string_length (to_utf8, s, n); 1628 if (m != (size_t)(-1)) 1629 { 1630 /* Convert the string to UTF-8 and build a translation table 1631 from offsets into s to offsets into the translated string. */ 1632 size_t memory_size = 1633 xsum4 (xtimes (n, sizeof (size_t)), m, m, 1634 (o != NULL ? m : 0)); 1635 char *memory = 1636 (size_in_bounds_p (memory_size) ? 
malloc (memory_size) : NULL); 1637 if (memory != NULL) 1638 { 1639 size_t *offtable = (size_t *) memory; 1640 char *t = (char *) (offtable + n); 1641 char *q = (char *) (t + m); 1642 char *o8 = (o != NULL ? (char *) (q + m) : NULL); 1643 int res_column; 1644 size_t i; 1645 1646 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m); 1647 1648 /* Translate the overrides to the UTF-8 string. */ 1649 if (o != NULL) 1650 { 1651 memset (o8, UC_BREAK_UNDEFINED, m); 1652 for (i = 0; i < n; i++) 1653 if (offtable[i] != (size_t)(-1)) 1654 o8[offtable[i]] = o[i]; 1655 } 1656 1657 /* Determine the line breaks of the UTF-8 string. */ 1658 res_column = 1659 u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q); 1660 1661 /* Translate the result back to the original string. */ 1662 memset (p, UC_BREAK_PROHIBITED, n); 1663 for (i = 0; i < n; i++) 1664 if (offtable[i] != (size_t)(-1)) 1665 p[i] = q[offtable[i]]; 1666 1667 free (memory); 1668 iconv_close (to_utf8); 1669 return res_column; 1670 } 1671 } 1672 iconv_close (to_utf8); 1673 } 1674 #endif 1675 /* Impossible to convert. */ 1676 #if C_CTYPE_ASCII 1677 if (is_all_ascii (s, n)) 1678 { 1679 /* ASCII is a subset of UTF-8. */ 1680 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p); 1681 } 1682 #endif 1683 /* We have a non-ASCII string and cannot convert it. 1684 Don't produce line breaks except those already present in the 1685 input string. All we assume here is that the encoding is 1686 minimally ASCII compatible. */ 1687 { 1688 const char *s_end = s + n; 1689 while (s < s_end) 1690 { 1691 *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n' 1692 ? UC_BREAK_MANDATORY 1693 : UC_BREAK_PROHIBITED); 1694 s++; 1695 p++; 1696 if (o != NULL) 1697 o++; 1698 } 1699 /* We cannot compute widths in this case. 
*/ 1700 return start_column; 1701 } 1702 } 1703 } 1704 1705 1706 #ifdef TEST2 1707 1708 #include <stdio.h> 1709 #include <locale.h> 1710 1711 /* Read the contents of an input stream, and return it, terminated with a NUL 1712 byte. */ 1713 char * 1714 read_file (FILE *stream) 1715 { 1716 #define BUFSIZE 4096 1717 char *buf = NULL; 1718 int alloc = 0; 1719 int size = 0; 1720 int count; 1721 1722 while (! feof (stream)) 1723 { 1724 if (size + BUFSIZE > alloc) 1725 { 1726 alloc = alloc + alloc / 2; 1727 if (alloc < size + BUFSIZE) 1728 alloc = size + BUFSIZE; 1729 buf = realloc (buf, alloc); 1730 if (buf == NULL) 1731 { 1732 fprintf (stderr, "out of memory\n"); 1733 exit (1); 1734 } 1735 } 1736 count = fread (buf + size, 1, BUFSIZE, stream); 1737 if (count == 0) 1738 { 1739 if (ferror (stream)) 1740 { 1741 perror ("fread"); 1742 exit (1); 1743 } 1744 } 1745 else 1746 size += count; 1747 } 1748 buf = realloc (buf, size + 1); 1749 if (buf == NULL) 1750 { 1751 fprintf (stderr, "out of memory\n"); 1752 exit (1); 1753 } 1754 buf[size] = '\0'; 1755 return buf; 1756 #undef BUFSIZE 1757 } 1758 1759 int 1760 main (int argc, char * argv[]) 1761 { 1762 setlocale (LC_CTYPE, ""); 1763 if (argc == 1) 1764 { 1765 /* Display all the break opportunities in the input string. */ 1766 char *input = read_file (stdin); 1767 int length = strlen (input); 1768 char *breaks = malloc (length); 1769 int i; 1770 1771 mbs_possible_linebreaks (input, length, locale_charset (), breaks); 1772 1773 for (i = 0; i < length; i++) 1774 { 1775 switch (breaks[i]) 1776 { 1777 case UC_BREAK_POSSIBLE: 1778 putc ('|', stdout); 1779 break; 1780 case UC_BREAK_MANDATORY: 1781 break; 1782 case UC_BREAK_PROHIBITED: 1783 break; 1784 default: 1785 abort (); 1786 } 1787 putc (input[i], stdout); 1788 } 1789 1790 free (breaks); 1791 1792 return 0; 1793 } 1794 else if (argc == 2) 1795 { 1796 /* Insert line breaks for a given width. 
*/ 1797 int width = atoi (argv[1]); 1798 char *input = read_file (stdin); 1799 int length = strlen (input); 1800 char *breaks = malloc (length); 1801 int i; 1802 1803 mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks); 1804 1805 for (i = 0; i < length; i++) 1806 { 1807 switch (breaks[i]) 1808 { 1809 case UC_BREAK_POSSIBLE: 1810 putc ('\n', stdout); 1811 break; 1812 case UC_BREAK_MANDATORY: 1813 break; 1814 case UC_BREAK_PROHIBITED: 1815 break; 1816 default: 1817 abort (); 1818 } 1819 putc (input[i], stdout); 1820 } 1821 1822 free (breaks); 1823 1824 return 0; 1825 } 1826 else 1827 return 1; 1828 } 1829 1830 #endif /* TEST2 */ 1831