1 /* Generate a Unicode conforming Line Break Properties tables from a 2 UnicodeData file. 3 Written by Bruno Haible <bruno@clisp.org>, 2000-2004. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program; if not, write to the Free Software Foundation, 17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 18 19 /* Usage example: 20 $ gen-lbrkprop /usr/local/share/Unidata/UnicodeData.txt \ 21 Combining.txt \ 22 /usr/local/share/Unidata/EastAsianWidth.txt \ 23 /usr/local/share/Unidata/LineBreak.txt \ 24 3.1.0 25 */ 26 27 #include <stdio.h> 28 #include <stdlib.h> 29 #include <stdbool.h> 30 #include <stdint.h> 31 #include <string.h> 32 #include <time.h> 33 34 /* This structure represents one line in the UnicodeData.txt file. 
/* Attributes of a single code point: the parsed fields of one line of the
   UnicodeData.txt file.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining classes */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  int mirrored;               /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};

/* Missing fields are represented with "" for strings, and NONE for
   characters.  */
#define NONE (~(unsigned int)0)

/* The entire contents of the UnicodeData.txt file.  */
struct unicode_attribute unicode_attributes [0x110000];

/* Returns a freshly allocated copy of FIELD.
   Exits with an error message when memory is exhausted, so that a failed
   allocation can never be stored as a NULL attribute.  */
static char *
copy_field (const char *field)
{
  size_t size = strlen (field) + 1;
  char *copy = malloc (size);

  if (copy == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  memcpy (copy, field, size);
  return copy;
}

/* Like copy_field, but an empty FIELD is represented by the shared literal ""
   instead of a new allocation.  */
static const char *
copy_optional_field (const char *field)
{
  return (field[0] == '\0' ? "" : copy_field (field));
}

/* Parses FIELD as a hexadecimal code point.  An empty FIELD yields NONE.  */
static unsigned int
parse_mapping (const char *field)
{
  return (field[0] == '\0' ? NONE : (unsigned int) strtoul (field, NULL, 16));
}

/* Stores in unicode_attributes[i] the values from the given fields.
   FIELD1..FIELD14 are the fields of a UnicodeData.txt line that follow the
   code point, in file order.  The strings are copied, so the caller may
   reuse its buffers afterwards.  The name is copied unconditionally (even
   when empty), so it is non-NULL for every code point stored here.
   Exits with an error message if I is not below 0x110000.  */
static void
fill_attribute (unsigned int i,
                const char *field1, const char *field2,
                const char *field3, const char *field4,
                const char *field5, const char *field6,
                const char *field7, const char *field8,
                const char *field9, const char *field10,
                const char *field11, const char *field12,
                const char *field13, const char *field14)
{
  struct unicode_attribute * uni;

  if (i >= 0x110000)
    {
      fprintf (stderr, "index too large\n");
      exit (1);
    }
  uni = &unicode_attributes[i];
  /* Copy the strings.  */
  uni->name          = copy_field (field1);
  uni->category      = copy_optional_field (field2);
  uni->combining     = copy_optional_field (field3);
  uni->bidi          = copy_optional_field (field4);
  uni->decomposition = copy_optional_field (field5);
  uni->decdigit      = copy_optional_field (field6);
  uni->digit         = copy_optional_field (field7);
  uni->numeric       = copy_optional_field (field8);
  /* Only the first character is examined: 'Y' means mirrored.  */
  uni->mirrored      = (field9[0] == 'Y');
  uni->oldname       = copy_optional_field (field10);
  uni->comment       = copy_optional_field (field11);
  /* The case mappings are hexadecimal code points, or empty.  */
  uni->upper = parse_mapping (field12);
  uni->lower = parse_mapping (field13);
  uni->title = parse_mapping (field14);
}

/* Maximum length of a field in the UnicodeData.txt file.  */
#define FIELDLEN 120

/* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM; the delimiter itself is consumed.
   Carriage returns are dropped.  A field of FIELDLEN - 1 or more characters
   aborts the program with "field too long".
   Returns 1 when a field was successfully read, otherwise (end of file or
   read error before DELIM was seen) 0.  BUFFER is NUL-terminated in both
   cases.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  while ((c = getc (stream)) != EOF && c != delim)
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Put c into the buffer.  */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field too long\n");
          exit (1);
        }
      *buffer++ = (char) c;
    }

  /* Terminate even on the EOF path, so that the caller never sees an
     unterminated buffer.  */
  *buffer = '\0';

  return (c == EOF ? 0 : 1);
}
*/ 133 static void 134 fill_attributes (const char *unicodedata_filename) 135 { 136 unsigned int i, j; 137 FILE *stream; 138 char field0[FIELDLEN]; 139 char field1[FIELDLEN]; 140 char field2[FIELDLEN]; 141 char field3[FIELDLEN]; 142 char field4[FIELDLEN]; 143 char field5[FIELDLEN]; 144 char field6[FIELDLEN]; 145 char field7[FIELDLEN]; 146 char field8[FIELDLEN]; 147 char field9[FIELDLEN]; 148 char field10[FIELDLEN]; 149 char field11[FIELDLEN]; 150 char field12[FIELDLEN]; 151 char field13[FIELDLEN]; 152 char field14[FIELDLEN]; 153 int lineno = 0; 154 155 for (i = 0; i < 0x110000; i++) 156 unicode_attributes[i].name = NULL; 157 158 stream = fopen (unicodedata_filename, "r"); 159 if (stream == NULL) 160 { 161 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename); 162 exit (1); 163 } 164 165 for (;;) 166 { 167 int n; 168 169 lineno++; 170 n = getfield (stream, field0, ';'); 171 n += getfield (stream, field1, ';'); 172 n += getfield (stream, field2, ';'); 173 n += getfield (stream, field3, ';'); 174 n += getfield (stream, field4, ';'); 175 n += getfield (stream, field5, ';'); 176 n += getfield (stream, field6, ';'); 177 n += getfield (stream, field7, ';'); 178 n += getfield (stream, field8, ';'); 179 n += getfield (stream, field9, ';'); 180 n += getfield (stream, field10, ';'); 181 n += getfield (stream, field11, ';'); 182 n += getfield (stream, field12, ';'); 183 n += getfield (stream, field13, ';'); 184 n += getfield (stream, field14, '\n'); 185 if (n == 0) 186 break; 187 if (n != 15) 188 { 189 fprintf (stderr, "short line in'%s':%d\n", 190 unicodedata_filename, lineno); 191 exit (1); 192 } 193 i = strtoul (field0, NULL, 16); 194 if (field1[0] == '<' 195 && strlen (field1) >= 9 196 && !strcmp (field1 + strlen(field1) - 8, ", First>")) 197 { 198 /* Deal with a range. 
*/ 199 lineno++; 200 n = getfield (stream, field0, ';'); 201 n += getfield (stream, field1, ';'); 202 n += getfield (stream, field2, ';'); 203 n += getfield (stream, field3, ';'); 204 n += getfield (stream, field4, ';'); 205 n += getfield (stream, field5, ';'); 206 n += getfield (stream, field6, ';'); 207 n += getfield (stream, field7, ';'); 208 n += getfield (stream, field8, ';'); 209 n += getfield (stream, field9, ';'); 210 n += getfield (stream, field10, ';'); 211 n += getfield (stream, field11, ';'); 212 n += getfield (stream, field12, ';'); 213 n += getfield (stream, field13, ';'); 214 n += getfield (stream, field14, '\n'); 215 if (n != 15) 216 { 217 fprintf (stderr, "missing end range in '%s':%d\n", 218 unicodedata_filename, lineno); 219 exit (1); 220 } 221 if (!(field1[0] == '<' 222 && strlen (field1) >= 8 223 && !strcmp (field1 + strlen (field1) - 7, ", Last>"))) 224 { 225 fprintf (stderr, "missing end range in '%s':%d\n", 226 unicodedata_filename, lineno); 227 exit (1); 228 } 229 field1[strlen (field1) - 7] = '\0'; 230 j = strtoul (field0, NULL, 16); 231 for (; i <= j; i++) 232 fill_attribute (i, field1+1, field2, field3, field4, field5, 233 field6, field7, field8, field9, field10, 234 field11, field12, field13, field14); 235 } 236 else 237 { 238 /* Single character line */ 239 fill_attribute (i, field1, field2, field3, field4, field5, 240 field6, field7, field8, field9, field10, 241 field11, field12, field13, field14); 242 } 243 } 244 if (ferror (stream) || fclose (stream)) 245 { 246 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename); 247 exit (1); 248 } 249 } 250 251 /* The combining property from the PropList.txt file. */ 252 char unicode_combining[0x110000]; 253 254 /* Stores in unicode_combining[] the Combining property from the 255 Unicode 3.0 PropList.txt file. 
*/ 256 static void 257 fill_combining (const char *proplist_filename) 258 { 259 unsigned int i; 260 FILE *stream; 261 char buf[100+1]; 262 263 for (i = 0; i < 0x110000; i++) 264 unicode_combining[i] = 0; 265 266 stream = fopen (proplist_filename, "r"); 267 if (stream == NULL) 268 { 269 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename); 270 exit (1); 271 } 272 273 /* Search for the "Property dump for: 0x20000004 (Combining)" line. */ 274 do 275 { 276 if (fscanf (stream, "%100[^\n]\n", buf) < 1) 277 { 278 fprintf (stderr, "no combining property found in '%s'\n", 279 proplist_filename); 280 exit (1); 281 } 282 } 283 while (strstr (buf, "(Combining)") == NULL); 284 285 for (;;) 286 { 287 unsigned int i1, i2; 288 289 if (fscanf (stream, "%100[^\n]\n", buf) < 1) 290 { 291 fprintf (stderr, "premature end of combining property in '%s'\n", 292 proplist_filename); 293 exit (1); 294 } 295 if (buf[0] == '*') 296 break; 297 if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.') 298 { 299 if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2) 300 { 301 fprintf (stderr, "parse error in combining property in '%s'\n", 302 proplist_filename); 303 exit (1); 304 } 305 } 306 else if (strlen (buf) >= 4) 307 { 308 if (sscanf (buf, "%4X", &i1) < 1) 309 { 310 fprintf (stderr, "parse error in combining property in '%s'\n", 311 proplist_filename); 312 exit (1); 313 } 314 i2 = i1; 315 } 316 else 317 { 318 fprintf (stderr, "parse error in combining property in '%s'\n", 319 proplist_filename); 320 exit (1); 321 } 322 for (i = i1; i <= i2; i++) 323 unicode_combining[i] = 1; 324 } 325 if (ferror (stream) || fclose (stream)) 326 { 327 fprintf (stderr, "error reading from '%s'\n", proplist_filename); 328 exit (1); 329 } 330 } 331 332 /* The width property from the EastAsianWidth.txt file. 333 Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */ 334 const char * unicode_width[0x110000]; 335 336 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt 337 file. 
*/ 338 static void 339 fill_width (const char *width_filename) 340 { 341 unsigned int i, j; 342 FILE *stream; 343 char field0[FIELDLEN]; 344 char field1[FIELDLEN]; 345 char field2[FIELDLEN]; 346 int lineno = 0; 347 348 for (i = 0; i < 0x110000; i++) 349 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL); 350 351 stream = fopen (width_filename, "r"); 352 if (stream == NULL) 353 { 354 fprintf (stderr, "error during fopen of '%s'\n", width_filename); 355 exit (1); 356 } 357 358 for (;;) 359 { 360 int n; 361 int c; 362 363 lineno++; 364 c = getc (stream); 365 if (c == EOF) 366 break; 367 if (c == '#') 368 { 369 do c = getc (stream); while (c != EOF && c != '\n'); 370 continue; 371 } 372 ungetc (c, stream); 373 n = getfield (stream, field0, ';'); 374 n += getfield (stream, field1, ' '); 375 n += getfield (stream, field2, '\n'); 376 if (n == 0) 377 break; 378 if (n != 3) 379 { 380 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno); 381 exit (1); 382 } 383 i = strtoul (field0, NULL, 16); 384 if (strstr (field0, "..") != NULL) 385 { 386 /* Deal with a range. */ 387 j = strtoul (strstr (field0, "..") + 2, NULL, 16); 388 for (; i <= j; i++) 389 unicode_width[i] = strdup (field1); 390 } 391 else 392 { 393 /* Single character line. */ 394 unicode_width[i] = strdup (field1); 395 } 396 } 397 if (ferror (stream) || fclose (stream)) 398 { 399 fprintf (stderr, "error reading from '%s'\n", width_filename); 400 exit (1); 401 } 402 } 403 404 /* Line breaking classification. */ 405 406 enum 407 { 408 /* Values >= 20 are resolved at run time. 
*/ 409 LBP_BK = 0, /* mandatory break */ 410 /*LBP_CR, carriage return - not used here because it's a DOSism */ 411 /*LBP_LF, line feed - not used here because it's a DOSism */ 412 LBP_CM = 20, /* attached characters and combining marks */ 413 /*LBP_SG, surrogates - not used here because they are not characters */ 414 LBP_ZW = 1, /* zero width space */ 415 LBP_IN = 2, /* inseparable */ 416 LBP_GL = 3, /* non-breaking (glue) */ 417 LBP_CB = 22, /* contingent break opportunity */ 418 LBP_SP = 21, /* space */ 419 LBP_BA = 4, /* break opportunity after */ 420 LBP_BB = 5, /* break opportunity before */ 421 LBP_B2 = 6, /* break opportunity before and after */ 422 LBP_HY = 7, /* hyphen */ 423 LBP_NS = 8, /* non starter */ 424 LBP_OP = 9, /* opening punctuation */ 425 LBP_CL = 10, /* closing punctuation */ 426 LBP_QU = 11, /* ambiguous quotation */ 427 LBP_EX = 12, /* exclamation/interrogation */ 428 LBP_ID = 13, /* ideographic */ 429 LBP_NU = 14, /* numeric */ 430 LBP_IS = 15, /* infix separator (numeric) */ 431 LBP_SY = 16, /* symbols allowing breaks */ 432 LBP_AL = 17, /* ordinary alphabetic and symbol characters */ 433 LBP_PR = 18, /* prefix (numeric) */ 434 LBP_PO = 19, /* postfix (numeric) */ 435 LBP_SA = 23, /* complex context (South East Asian) */ 436 LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */ 437 LBP_XX = 25 /* unknown */ 438 }; 439 440 /* Returns the line breaking classification for ch, as a bit mask. 
*/ 441 static int 442 get_lbp (unsigned int ch) 443 { 444 int attr = 0; 445 446 if (unicode_attributes[ch].name != NULL) 447 { 448 /* mandatory break */ 449 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */ 450 || ch == 0x000C /* form feed */ 451 || ch == 0x2028 /* LINE SEPARATOR */ 452 || ch == 0x2029 /* PARAGRAPH SEPARATOR */) 453 attr |= 1 << LBP_BK; 454 455 /* zero width space */ 456 if (ch == 0x200B /* ZERO WIDTH SPACE */) 457 attr |= 1 << LBP_ZW; 458 459 /* inseparable */ 460 if (ch == 0x2024 /* ONE DOT LEADER */ 461 || ch == 0x2025 /* TWO DOT LEADER */ 462 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */) 463 attr |= 1 << LBP_IN; 464 465 /* non-breaking (glue) */ 466 if (ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */ 467 || ch == 0x00A0 /* NO-BREAK SPACE */ 468 || ch == 0x202F /* NARROW NO-BREAK SPACE */ 469 || ch == 0x2007 /* FIGURE SPACE */ 470 || ch == 0x2011 /* NON-BREAKING HYPHEN */ 471 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */) 472 attr |= 1 << LBP_GL; 473 474 /* contingent break opportunity */ 475 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */) 476 attr |= 1 << LBP_CB; 477 478 /* space */ 479 if (ch == 0x0020 /* SPACE */) 480 attr |= 1 << LBP_SP; 481 482 /* break opportunity after */ 483 if (ch == 0x2000 /* EN QUAD */ 484 || ch == 0x2001 /* EM QUAD */ 485 || ch == 0x2002 /* EN SPACE */ 486 || ch == 0x2003 /* EM SPACE */ 487 || ch == 0x2004 /* THREE-PER-EM SPACE */ 488 || ch == 0x2005 /* FOUR-PER-EM SPACE */ 489 || ch == 0x2006 /* SIX-PER-EM SPACE */ 490 || ch == 0x2008 /* PUNCTUATION SPACE */ 491 || ch == 0x2009 /* THIN SPACE */ 492 || ch == 0x200A /* HAIR SPACE */ 493 || ch == 0x0009 /* tab */ 494 || ch == 0x058A /* ARMENIAN HYPHEN */ 495 || ch == 0x2010 /* HYPHEN */ 496 || ch == 0x2012 /* FIGURE DASH */ 497 || ch == 0x2013 /* EN DASH */ 498 || ch == 0x00AD /* SOFT HYPHEN */ 499 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */ 500 || ch == 0x1361 /* ETHIOPIC WORDSPACE */ 501 || ch == 0x1680 /* OGHAM SPACE MARK */ 502 || 
ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */ 503 || ch == 0x2027 /* HYPHENATION POINT */ 504 || ch == 0x007C /* VERTICAL LINE */) 505 attr |= 1 << LBP_BA; 506 507 /* break opportunity before */ 508 if (ch == 0x00B4 /* ACUTE ACCENT */ 509 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */ 510 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */ 511 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */) 512 attr |= 1 << LBP_BB; 513 514 /* break opportunity before and after */ 515 if (ch == 0x2014 /* EM DASH */) 516 attr |= 1 << LBP_B2; 517 518 /* hyphen */ 519 if (ch == 0x002D /* HYPHEN-MINUS */) 520 attr |= 1 << LBP_HY; 521 522 /* exclamation/interrogation */ 523 if (ch == 0x0021 /* EXCLAMATION MARK */ 524 || ch == 0x003F /* QUESTION MARK */ 525 || ch == 0xFE56 /* SMALL QUESTION MARK */ 526 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */ 527 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */ 528 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */) 529 attr |= 1 << LBP_EX; 530 531 /* opening punctuation */ 532 if (unicode_attributes[ch].category[0] == 'P' 533 && unicode_attributes[ch].category[1] == 's') 534 attr |= 1 << LBP_OP; 535 536 /* closing punctuation */ 537 if (ch == 0x3001 /* IDEOGRAPHIC COMMA */ 538 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */ 539 || ch == 0xFE50 /* SMALL COMMA */ 540 || ch == 0xFE52 /* SMALL FULL STOP */ 541 || ch == 0xFF0C /* FULLWIDTH COMMA */ 542 || ch == 0xFF0E /* FULLWIDTH FULL STOP */ 543 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */ 544 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */ 545 || (unicode_attributes[ch].category[0] == 'P' 546 && unicode_attributes[ch].category[1] == 'e')) 547 attr |= 1 << LBP_CL; 548 549 /* ambiguous quotation */ 550 if (ch == 0x0022 /* QUOTATION MARK */ 551 || ch == 0x0027 /* APOSTROPHE */ 552 || (unicode_attributes[ch].category[0] == 'P' 553 && (unicode_attributes[ch].category[1] == 'f' 554 || unicode_attributes[ch].category[1] == 'i'))) 555 attr |= 1 << LBP_QU; 556 557 /* attached characters and combining 
marks */ 558 if ((unicode_attributes[ch].category[0] == 'M' 559 && (unicode_attributes[ch].category[1] == 'n' 560 || unicode_attributes[ch].category[1] == 'c' 561 || unicode_attributes[ch].category[1] == 'e')) 562 || (ch >= 0x1160 && ch <= 0x11F9) 563 || (unicode_attributes[ch].category[0] == 'C' 564 && (unicode_attributes[ch].category[1] == 'c' 565 || unicode_attributes[ch].category[1] == 'f'))) 566 if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL)))) 567 attr |= 1 << LBP_CM; 568 569 /* non starter */ 570 if (ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */ 571 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */ 572 || ch == 0x17D4 /* KHMER SIGN KHAN */ 573 || ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */ 574 || ch == 0x17D7 /* KHMER SIGN LEK TOO */ 575 || ch == 0x17D8 /* KHMER SIGN BEYYAL */ 576 || ch == 0x17D9 /* KHMER SIGN PHNAEK MUAN */ 577 || ch == 0x17DA /* KHMER SIGN KOOMUUT */ 578 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */ 579 || ch == 0x2044 /* FRACTION SLASH */ 580 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */ 581 || ch == 0x301C /* WAVE DASH */ 582 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */ 583 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */ 584 || ch == 0x309D /* HIRAGANA ITERATION MARK */ 585 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */ 586 || ch == 0x30FB /* KATAKANA MIDDLE DOT */ 587 || ch == 0x30FD /* KATAKANA ITERATION MARK */ 588 || ch == 0xFE54 /* SMALL SEMICOLON */ 589 || ch == 0xFE55 /* SMALL COLON */ 590 || ch == 0xFF1A /* FULLWIDTH COLON */ 591 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */ 592 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */ 593 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */ 594 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */ 595 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */ 596 || (unicode_attributes[ch].category[0] == 'L' 597 && unicode_attributes[ch].category[1] == 'm' 598 && (unicode_width[ch][0] == 'W' 599 || 
unicode_width[ch][0] == 'H')) 600 || (unicode_attributes[ch].category[0] == 'S' 601 && unicode_attributes[ch].category[1] == 'k' 602 && unicode_width[ch][0] == 'W') 603 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL 604 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL) 605 attr |= 1 << LBP_NS; 606 607 /* numeric */ 608 if (unicode_attributes[ch].category[0] == 'N' 609 && unicode_attributes[ch].category[1] == 'd' 610 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL) 611 attr |= 1 << LBP_NU; 612 613 /* infix separator (numeric) */ 614 if (ch == 0x002C /* COMMA */ 615 || ch == 0x002E /* FULL STOP */ 616 || ch == 0x003A /* COLON */ 617 || ch == 0x003B /* SEMICOLON */ 618 || ch == 0x0589 /* ARMENIAN FULL STOP */) 619 attr |= 1 << LBP_IS; 620 621 /* symbols allowing breaks */ 622 if (ch == 0x002F /* SOLIDUS */) 623 attr |= 1 << LBP_SY; 624 625 /* postfix (numeric) */ 626 if (ch == 0x0025 /* PERCENT SIGN */ 627 || ch == 0x00A2 /* CENT SIGN */ 628 || ch == 0x00B0 /* DEGREE SIGN */ 629 || ch == 0x2030 /* PER MILLE SIGN */ 630 || ch == 0x2031 /* PER TEN THOUSAND SIGN */ 631 || ch == 0x2032 /* PRIME */ 632 || ch == 0x2033 /* DOUBLE PRIME */ 633 || ch == 0x2034 /* TRIPLE PRIME */ 634 || ch == 0x2035 /* REVERSED PRIME */ 635 || ch == 0x2036 /* REVERSED DOUBLE PRIME */ 636 || ch == 0x2037 /* REVERSED TRIPLE PRIME */ 637 || ch == 0x20A7 /* PESETA SIGN */ 638 || ch == 0x2103 /* DEGREE CELSIUS */ 639 || ch == 0x2109 /* DEGREE FAHRENHEIT */ 640 || ch == 0x2126 /* OHM SIGN */ 641 || ch == 0xFE6A /* SMALL PERCENT SIGN */ 642 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */ 643 || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */) 644 attr |= 1 << LBP_PO; 645 646 /* prefix (numeric) */ 647 if (ch == 0x002B /* PLUS SIGN */ 648 || ch == 0x005C /* REVERSE SOLIDUS */ 649 || ch == 0x00B1 /* PLUS-MINUS SIGN */ 650 || ch == 0x2116 /* NUMERO SIGN */ 651 || ch == 0x2212 /* MINUS SIGN */ 652 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */ 653 || 
(unicode_attributes[ch].category[0] == 'S' 654 && unicode_attributes[ch].category[1] == 'c')) 655 if (!(attr & (1 << LBP_PO))) 656 attr |= 1 << LBP_PR; 657 658 /* complex context (South East Asian) */ 659 if (((ch >= 0x0E00 && ch <= 0x0EFF) 660 || (ch >= 0x1000 && ch <= 0x109F) 661 || (ch >= 0x1780 && ch <= 0x17FF)) 662 && unicode_attributes[ch].category[0] == 'L' 663 && (unicode_attributes[ch].category[1] == 'm' 664 || unicode_attributes[ch].category[1] == 'o')) 665 if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_NU) | (1 << LBP_BA) | (1 << LBP_PR)))) 666 attr |= 1 << LBP_SA; 667 668 /* ideographic */ 669 if ((ch >= 0x1100 && ch <= 0x115F) /* HANGUL CHOSEONG */ 670 || (ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */ 671 || ch == 0x3000 /* IDEOGRAPHIC SPACE */ 672 || (ch >= 0x3130 && ch <= 0x318F) /* HANGUL LETTER */ 673 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */ 674 || (ch >= 0x4E00 && ch <= 0x9FAF) /* CJK Ideograph */ 675 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK COMPATIBILITY IDEOGRAPH */ 676 || (ch >= 0xAC00 && ch <= 0xD7AF) /* HANGUL SYLLABLE */ 677 || (ch >= 0xA000 && ch <= 0xA48C) /* YI SYLLABLE */ 678 || (ch >= 0xA490 && ch <= 0xA4C6) /* YI RADICAL */ 679 || ch == 0xFE62 /* SMALL PLUS SIGN */ 680 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */ 681 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */ 682 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */ 683 || ch == 0xFE66 /* SMALL EQUALS SIGN */ 684 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */ 685 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */ 686 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */ 687 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL 688 || (ch >= 0x3000 && ch <= 0x33FF 689 && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL)))) 690 /* Extra characters for compatibility with Unicode LineBreak.txt. 
*/ 691 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */ 692 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */ 693 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */ 694 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */ 695 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */ 696 || ch == 0xFE49 /* DASHED OVERLINE */ 697 || ch == 0xFE4A /* CENTRELINE OVERLINE */ 698 || ch == 0xFE4B /* WAVY OVERLINE */ 699 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */ 700 || ch == 0xFE4D /* DASHED LOW LINE */ 701 || ch == 0xFE4E /* CENTRELINE LOW LINE */ 702 || ch == 0xFE4F /* WAVY LOW LINE */ 703 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */ 704 || ch == 0xFE58 /* SMALL EM DASH */ 705 || ch == 0xFE5F /* SMALL NUMBER SIGN */ 706 || ch == 0xFE60 /* SMALL AMPERSAND */ 707 || ch == 0xFE61 /* SMALL ASTERISK */ 708 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */ 709 || ch == 0xFE6B /* SMALL COMMERCIAL AT */ 710 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */ 711 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */ 712 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */ 713 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */ 714 || ch == 0xFF0A /* FULLWIDTH ASTERISK */ 715 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */ 716 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */ 717 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */ 718 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */ 719 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */ 720 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */ 721 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */ 722 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */ 723 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */ 724 || ch == 0xFF3F /* FULLWIDTH LOW LINE */ 725 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */ 726 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */ 727 || ch == 0xFF5E /* FULLWIDTH TILDE */ 728 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */ 729 || ch == 0xFFE3 /* FULLWIDTH MACRON */ 730 || ch == 0xFFE4) /* FULLWIDTH BROKEN BAR */ 731 { 732 /* ambiguous (ideograph) ? 
*/ 733 if (unicode_width[ch] != NULL 734 && unicode_width[ch][0] == 'A') 735 attr |= 1 << LBP_AI; 736 else 737 attr |= 1 << LBP_ID; 738 } 739 740 /* ordinary alphabetic and symbol characters */ 741 if ((unicode_attributes[ch].category[0] == 'L' 742 && (unicode_attributes[ch].category[1] == 'u' 743 || unicode_attributes[ch].category[1] == 'l' 744 || unicode_attributes[ch].category[1] == 't' 745 || unicode_attributes[ch].category[1] == 'm' 746 || unicode_attributes[ch].category[1] == 'o')) 747 || (unicode_attributes[ch].category[0] == 'S' 748 && (unicode_attributes[ch].category[1] == 'm' 749 || unicode_attributes[ch].category[1] == 'c' 750 || unicode_attributes[ch].category[1] == 'k' 751 || unicode_attributes[ch].category[1] == 'o')) 752 /* Extra characters for compatibility with Unicode LineBreak.txt. */ 753 || ch == 0x0023 /* NUMBER SIGN */ 754 || ch == 0x0026 /* AMPERSAND */ 755 || ch == 0x002A /* ASTERISK */ 756 || ch == 0x0040 /* COMMERCIAL AT */ 757 || ch == 0x005F /* LOW LINE */ 758 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */ 759 || ch == 0x00B2 /* SUPERSCRIPT TWO */ 760 || ch == 0x00B3 /* SUPERSCRIPT THREE */ 761 || ch == 0x00B7 /* MIDDLE DOT */ 762 || ch == 0x00B9 /* SUPERSCRIPT ONE */ 763 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */ 764 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */ 765 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */ 766 || ch == 0x00BF /* INVERTED QUESTION MARK */ 767 || ch == 0x037E /* GREEK QUESTION MARK */ 768 || ch == 0x0387 /* GREEK ANO TELEIA */ 769 || ch == 0x055A /* ARMENIAN APOSTROPHE */ 770 || ch == 0x055B /* ARMENIAN EMPHASIS MARK */ 771 || ch == 0x055C /* ARMENIAN EXCLAMATION MARK */ 772 || ch == 0x055D /* ARMENIAN COMMA */ 773 || ch == 0x055E /* ARMENIAN QUESTION MARK */ 774 || ch == 0x055F /* ARMENIAN ABBREVIATION MARK */ 775 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */ 776 || ch == 0x05C0 /* HEBREW PUNCTUATION PASEQ */ 777 || ch == 0x05C3 /* HEBREW PUNCTUATION SOF PASUQ */ 778 || ch == 0x05F3 /* HEBREW 
PUNCTUATION GERESH */ 779 || ch == 0x05F4 /* HEBREW PUNCTUATION GERSHAYIM */ 780 || ch == 0x060C /* ARABIC COMMA */ 781 || ch == 0x061B /* ARABIC SEMICOLON */ 782 || ch == 0x061F /* ARABIC QUESTION MARK */ 783 || ch == 0x066A /* ARABIC PERCENT SIGN */ 784 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */ 785 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */ 786 || ch == 0x066D /* ARABIC FIVE POINTED STAR */ 787 || ch == 0x06D4 /* ARABIC FULL STOP */ 788 || ch == 0x0700 /* SYRIAC END OF PARAGRAPH */ 789 || ch == 0x0701 /* SYRIAC SUPRALINEAR FULL STOP */ 790 || ch == 0x0702 /* SYRIAC SUBLINEAR FULL STOP */ 791 || ch == 0x0703 /* SYRIAC SUPRALINEAR COLON */ 792 || ch == 0x0704 /* SYRIAC SUBLINEAR COLON */ 793 || ch == 0x0705 /* SYRIAC HORIZONTAL COLON */ 794 || ch == 0x0706 /* SYRIAC COLON SKEWED LEFT */ 795 || ch == 0x0707 /* SYRIAC COLON SKEWED RIGHT */ 796 || ch == 0x0708 /* SYRIAC SUPRALINEAR COLON SKEWED LEFT */ 797 || ch == 0x0709 /* SYRIAC SUBLINEAR COLON SKEWED RIGHT */ 798 || ch == 0x070A /* SYRIAC CONTRACTION */ 799 || ch == 0x070B /* SYRIAC HARKLEAN OBELUS */ 800 || ch == 0x070C /* SYRIAC HARKLEAN METOBELUS */ 801 || ch == 0x070D /* SYRIAC HARKLEAN ASTERISCUS */ 802 || ch == 0x0964 /* DEVANAGARI DANDA */ 803 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */ 804 || ch == 0x0970 /* DEVANAGARI ABBREVIATION SIGN */ 805 || ch == 0x09F4 /* BENGALI CURRENCY NUMERATOR ONE */ 806 || ch == 0x09F5 /* BENGALI CURRENCY NUMERATOR TWO */ 807 || ch == 0x09F6 /* BENGALI CURRENCY NUMERATOR THREE */ 808 || ch == 0x09F7 /* BENGALI CURRENCY NUMERATOR FOUR */ 809 || ch == 0x09F8 /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */ 810 || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */ 811 || ch == 0x0BF0 /* TAMIL NUMBER TEN */ 812 || ch == 0x0BF1 /* TAMIL NUMBER ONE HUNDRED */ 813 || ch == 0x0BF2 /* TAMIL NUMBER ONE THOUSAND */ 814 || ch == 0x0DF4 /* SINHALA PUNCTUATION KUNDDALIYA */ 815 || ch == 0x0E4F /* THAI CHARACTER FONGMAN */ 816 || ch == 0x0F04 /* TIBETAN MARK 
INITIAL YIG MGO MDUN MA */ 817 || ch == 0x0F05 /* TIBETAN MARK CLOSING YIG MGO SGAB MA */ 818 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */ 819 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */ 820 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */ 821 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */ 822 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */ 823 || ch == 0x0F0D /* TIBETAN MARK SHAD */ 824 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */ 825 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */ 826 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */ 827 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */ 828 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */ 829 || ch == 0x0F2A /* TIBETAN DIGIT HALF ONE */ 830 || ch == 0x0F2B /* TIBETAN DIGIT HALF TWO */ 831 || ch == 0x0F2C /* TIBETAN DIGIT HALF THREE */ 832 || ch == 0x0F2D /* TIBETAN DIGIT HALF FOUR */ 833 || ch == 0x0F2E /* TIBETAN DIGIT HALF FIVE */ 834 || ch == 0x0F2F /* TIBETAN DIGIT HALF SIX */ 835 || ch == 0x0F30 /* TIBETAN DIGIT HALF SEVEN */ 836 || ch == 0x0F31 /* TIBETAN DIGIT HALF EIGHT */ 837 || ch == 0x0F32 /* TIBETAN DIGIT HALF NINE */ 838 || ch == 0x0F33 /* TIBETAN DIGIT HALF ZERO */ 839 || ch == 0x0F85 /* TIBETAN MARK PALUTA */ 840 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */ 841 || ch == 0x104B /* MYANMAR SIGN SECTION */ 842 || ch == 0x104C /* MYANMAR SYMBOL LOCATIVE */ 843 || ch == 0x104D /* MYANMAR SYMBOL COMPLETED */ 844 || ch == 0x104E /* MYANMAR SYMBOL AFOREMENTIONED */ 845 || ch == 0x104F /* MYANMAR SYMBOL GENITIVE */ 846 || ch == 0x10FB /* GEORGIAN PARAGRAPH SEPARATOR */ 847 || ch == 0x1362 /* ETHIOPIC FULL STOP */ 848 || ch == 0x1363 /* ETHIOPIC COMMA */ 849 || ch == 0x1364 /* ETHIOPIC SEMICOLON */ 850 || ch == 0x1365 /* ETHIOPIC COLON */ 851 || ch == 0x1366 /* ETHIOPIC PREFACE COLON */ 852 || ch == 0x1367 /* ETHIOPIC QUESTION MARK */ 853 || ch == 0x1368 /* ETHIOPIC PARAGRAPH SEPARATOR */ 854 || ch == 0x1372 /* ETHIOPIC NUMBER TEN */ 855 || ch == 0x1373 /* 
ETHIOPIC NUMBER TWENTY */ 856 || ch == 0x1374 /* ETHIOPIC NUMBER THIRTY */ 857 || ch == 0x1375 /* ETHIOPIC NUMBER FORTY */ 858 || ch == 0x1376 /* ETHIOPIC NUMBER FIFTY */ 859 || ch == 0x1377 /* ETHIOPIC NUMBER SIXTY */ 860 || ch == 0x1378 /* ETHIOPIC NUMBER SEVENTY */ 861 || ch == 0x1379 /* ETHIOPIC NUMBER EIGHTY */ 862 || ch == 0x137A /* ETHIOPIC NUMBER NINETY */ 863 || ch == 0x137B /* ETHIOPIC NUMBER HUNDRED */ 864 || ch == 0x137C /* ETHIOPIC NUMBER TEN THOUSAND */ 865 || ch == 0x166D /* CANADIAN SYLLABICS CHI SIGN */ 866 || ch == 0x166E /* CANADIAN SYLLABICS FULL STOP */ 867 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */ 868 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */ 869 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */ 870 || ch == 0x16EE /* RUNIC ARLAUG SYMBOL */ 871 || ch == 0x16EF /* RUNIC TVIMADUR SYMBOL */ 872 || ch == 0x16F0 /* RUNIC BELGTHOR SYMBOL */ 873 || ch == 0x17DC /* KHMER SIGN AVAKRAHASANYA */ 874 || ch == 0x1800 /* MONGOLIAN BIRGA */ 875 || ch == 0x1801 /* MONGOLIAN ELLIPSIS */ 876 || ch == 0x1802 /* MONGOLIAN COMMA */ 877 || ch == 0x1803 /* MONGOLIAN FULL STOP */ 878 || ch == 0x1804 /* MONGOLIAN COLON */ 879 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */ 880 || ch == 0x1807 /* MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER */ 881 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */ 882 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */ 883 || ch == 0x180A /* MONGOLIAN NIRUGU */ 884 || ch == 0x2015 /* HORIZONTAL BAR */ 885 || ch == 0x2016 /* DOUBLE VERTICAL LINE */ 886 || ch == 0x2017 /* DOUBLE LOW LINE */ 887 || ch == 0x2020 /* DAGGER */ 888 || ch == 0x2021 /* DOUBLE DAGGER */ 889 || ch == 0x2022 /* BULLET */ 890 || ch == 0x2023 /* TRIANGULAR BULLET */ 891 || ch == 0x2038 /* CARET */ 892 || ch == 0x203B /* REFERENCE MARK */ 893 || ch == 0x203D /* INTERROBANG */ 894 || ch == 0x203E /* OVERLINE */ 895 || ch == 0x203F /* UNDERTIE */ 896 || ch == 0x2040 /* CHARACTER TIE */ 897 || ch == 0x2041 /* CARET INSERTION POINT */ 898 || ch == 0x2042 /* ASTERISM */ 899 || ch == 
0x2043 /* HYPHEN BULLET */ 900 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */ 901 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */ 902 || ch == 0x204A /* TIRONIAN SIGN ET */ 903 || ch == 0x204B /* REVERSED PILCROW SIGN */ 904 || ch == 0x204C /* BLACK LEFTWARDS BULLET */ 905 || ch == 0x204D /* BLACK RIGHTWARDS BULLET */ 906 || ch == 0x2070 /* SUPERSCRIPT ZERO */ 907 || ch == 0x2074 /* SUPERSCRIPT FOUR */ 908 || ch == 0x2075 /* SUPERSCRIPT FIVE */ 909 || ch == 0x2076 /* SUPERSCRIPT SIX */ 910 || ch == 0x2077 /* SUPERSCRIPT SEVEN */ 911 || ch == 0x2078 /* SUPERSCRIPT EIGHT */ 912 || ch == 0x2079 /* SUPERSCRIPT NINE */ 913 || ch == 0x2080 /* SUBSCRIPT ZERO */ 914 || ch == 0x2081 /* SUBSCRIPT ONE */ 915 || ch == 0x2082 /* SUBSCRIPT TWO */ 916 || ch == 0x2083 /* SUBSCRIPT THREE */ 917 || ch == 0x2084 /* SUBSCRIPT FOUR */ 918 || ch == 0x2085 /* SUBSCRIPT FIVE */ 919 || ch == 0x2086 /* SUBSCRIPT SIX */ 920 || ch == 0x2087 /* SUBSCRIPT SEVEN */ 921 || ch == 0x2088 /* SUBSCRIPT EIGHT */ 922 || ch == 0x2089 /* SUBSCRIPT NINE */ 923 || (ch >= 0x2153 && ch <= 0x215E) /* VULGAR FRACTION */ 924 || ch == 0x215F /* FRACTION NUMERATOR ONE */ 925 || (ch >= 0x2160 && ch <= 0x2183) /* ROMAN NUMERAL */ 926 || (ch >= 0x2460 && ch <= 0x2473) /* CIRCLED NUMBER */ 927 || (ch >= 0x2474 && ch <= 0x2487) /* PARENTHESIZED NUMBER */ 928 || (ch >= 0x2488 && ch <= 0x249B) /* NUMBER FULL STOP */ 929 || ch == 0x24EA /* CIRCLED DIGIT ZERO */ 930 || (ch >= 0x2776 && ch <= 0x2793) /* DINGBAT CIRCLED DIGIT */ 931 || ch == 0x10320 /* OLD ITALIC NUMERAL ONE */ 932 || ch == 0x10321 /* OLD ITALIC NUMERAL FIVE */ 933 || ch == 0x10322 /* OLD ITALIC NUMERAL TEN */ 934 || ch == 0x10323 /* OLD ITALIC NUMERAL FIFTY */ 935 || ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */ 936 if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_ID) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SA) | (1 << LBP_CB)))) 937 { 938 /* ambiguous (alphabetic) ? 
*/ 939 if (unicode_width[ch] != NULL 940 && unicode_width[ch][0] == 'A') 941 attr |= 1 << LBP_AI; 942 else 943 attr |= 1 << LBP_AL; 944 } 945 } 946 947 if (attr == 0) 948 /* unknown */ 949 attr |= 1 << LBP_XX; 950 951 return attr; 952 } 953 954 /* Output the line breaking properties in a human readable format. */ 955 static void 956 debug_output_lbp (FILE *stream) 957 { 958 unsigned int i; 959 960 for (i = 0; i < 0x110000; i++) 961 { 962 int attr = get_lbp (i); 963 if (attr != 1 << LBP_XX) 964 { 965 fprintf (stream, "0x%04X", i); 966 #define PRINT_BIT(attr,bit) \ 967 if (attr & (1 << bit)) fprintf (stream, " " #bit); 968 PRINT_BIT(attr,LBP_BK); 969 PRINT_BIT(attr,LBP_CM); 970 PRINT_BIT(attr,LBP_ZW); 971 PRINT_BIT(attr,LBP_IN); 972 PRINT_BIT(attr,LBP_GL); 973 PRINT_BIT(attr,LBP_CB); 974 PRINT_BIT(attr,LBP_SP); 975 PRINT_BIT(attr,LBP_BA); 976 PRINT_BIT(attr,LBP_BB); 977 PRINT_BIT(attr,LBP_B2); 978 PRINT_BIT(attr,LBP_HY); 979 PRINT_BIT(attr,LBP_NS); 980 PRINT_BIT(attr,LBP_OP); 981 PRINT_BIT(attr,LBP_CL); 982 PRINT_BIT(attr,LBP_QU); 983 PRINT_BIT(attr,LBP_EX); 984 PRINT_BIT(attr,LBP_ID); 985 PRINT_BIT(attr,LBP_NU); 986 PRINT_BIT(attr,LBP_IS); 987 PRINT_BIT(attr,LBP_SY); 988 PRINT_BIT(attr,LBP_AL); 989 PRINT_BIT(attr,LBP_PR); 990 PRINT_BIT(attr,LBP_PO); 991 PRINT_BIT(attr,LBP_SA); 992 PRINT_BIT(attr,LBP_XX); 993 PRINT_BIT(attr,LBP_AI); 994 #undef PRINT_BIT 995 fprintf (stream, "\n"); 996 } 997 } 998 } 999 1000 static void 1001 debug_output_tables (const char *filename) 1002 { 1003 FILE *stream; 1004 1005 stream = fopen (filename, "w"); 1006 if (stream == NULL) 1007 { 1008 fprintf (stderr, "cannot open '%s' for writing\n", filename); 1009 exit (1); 1010 } 1011 1012 debug_output_lbp (stream); 1013 1014 if (ferror (stream) || fclose (stream)) 1015 { 1016 fprintf (stderr, "error writing to '%s'\n", filename); 1017 exit (1); 1018 } 1019 } 1020 1021 /* The line breaking property from the LineBreak.txt file. 
*/ 1022 int unicode_org_lbp[0x110000]; 1023 1024 /* Stores in unicode_org_lbp[] the line breaking property from the 1025 LineBreak.txt file. */ 1026 static void 1027 fill_org_lbp (const char *linebreak_filename) 1028 { 1029 unsigned int i, j; 1030 FILE *stream; 1031 char field0[FIELDLEN]; 1032 char field1[FIELDLEN]; 1033 char field2[FIELDLEN]; 1034 int lineno = 0; 1035 1036 for (i = 0; i < 0x110000; i++) 1037 unicode_org_lbp[i] = LBP_XX; 1038 1039 stream = fopen (linebreak_filename, "r"); 1040 if (stream == NULL) 1041 { 1042 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename); 1043 exit (1); 1044 } 1045 1046 for (;;) 1047 { 1048 int n; 1049 int c; 1050 int value; 1051 1052 lineno++; 1053 c = getc (stream); 1054 if (c == EOF) 1055 break; 1056 if (c == '#') 1057 { 1058 do c = getc (stream); while (c != EOF && c != '\n'); 1059 continue; 1060 } 1061 ungetc (c, stream); 1062 n = getfield (stream, field0, ';'); 1063 n += getfield (stream, field1, ' '); 1064 n += getfield (stream, field2, '\n'); 1065 if (n == 0) 1066 break; 1067 if (n != 3) 1068 { 1069 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename, 1070 lineno); 1071 exit (1); 1072 } 1073 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit; 1074 if (false) {} 1075 TRY(LBP_BK) 1076 TRY(LBP_CM) 1077 TRY(LBP_ZW) 1078 TRY(LBP_IN) 1079 TRY(LBP_GL) 1080 TRY(LBP_CB) 1081 TRY(LBP_SP) 1082 TRY(LBP_BA) 1083 TRY(LBP_BB) 1084 TRY(LBP_B2) 1085 TRY(LBP_HY) 1086 TRY(LBP_NS) 1087 TRY(LBP_OP) 1088 TRY(LBP_CL) 1089 TRY(LBP_QU) 1090 TRY(LBP_EX) 1091 TRY(LBP_ID) 1092 TRY(LBP_NU) 1093 TRY(LBP_IS) 1094 TRY(LBP_SY) 1095 TRY(LBP_AL) 1096 TRY(LBP_PR) 1097 TRY(LBP_PO) 1098 TRY(LBP_SA) 1099 TRY(LBP_XX) 1100 TRY(LBP_AI) 1101 #undef TRY 1102 else if (strcmp (field1, "LF") == 0) value = LBP_BK; 1103 else if (strcmp (field1, "CR") == 0) value = LBP_BK; 1104 else if (strcmp (field1, "SG") == 0) value = LBP_XX; 1105 else 1106 { 1107 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n", 1108 
field1, linebreak_filename, lineno); 1109 exit (1); 1110 } 1111 i = strtoul (field0, NULL, 16); 1112 if (strstr (field0, "..") != NULL) 1113 { 1114 /* Deal with a range. */ 1115 j = strtoul (strstr (field0, "..") + 2, NULL, 16); 1116 for (; i <= j; i++) 1117 unicode_org_lbp[i] = value; 1118 } 1119 else 1120 { 1121 /* Single character line. */ 1122 unicode_org_lbp[i] = value; 1123 } 1124 } 1125 if (ferror (stream) || fclose (stream)) 1126 { 1127 fprintf (stderr, "error reading from '%s'\n", linebreak_filename); 1128 exit (1); 1129 } 1130 } 1131 1132 /* Output the line breaking properties in a human readable format. */ 1133 static void 1134 debug_output_org_lbp (FILE *stream) 1135 { 1136 unsigned int i; 1137 1138 for (i = 0; i < 0x110000; i++) 1139 { 1140 int attr = unicode_org_lbp[i]; 1141 if (attr != LBP_XX) 1142 { 1143 fprintf (stream, "0x%04X", i); 1144 #define PRINT_BIT(attr,bit) \ 1145 if (attr == bit) fprintf (stream, " " #bit); 1146 PRINT_BIT(attr,LBP_BK); 1147 PRINT_BIT(attr,LBP_CM); 1148 PRINT_BIT(attr,LBP_ZW); 1149 PRINT_BIT(attr,LBP_IN); 1150 PRINT_BIT(attr,LBP_GL); 1151 PRINT_BIT(attr,LBP_CB); 1152 PRINT_BIT(attr,LBP_SP); 1153 PRINT_BIT(attr,LBP_BA); 1154 PRINT_BIT(attr,LBP_BB); 1155 PRINT_BIT(attr,LBP_B2); 1156 PRINT_BIT(attr,LBP_HY); 1157 PRINT_BIT(attr,LBP_NS); 1158 PRINT_BIT(attr,LBP_OP); 1159 PRINT_BIT(attr,LBP_CL); 1160 PRINT_BIT(attr,LBP_QU); 1161 PRINT_BIT(attr,LBP_EX); 1162 PRINT_BIT(attr,LBP_ID); 1163 PRINT_BIT(attr,LBP_NU); 1164 PRINT_BIT(attr,LBP_IS); 1165 PRINT_BIT(attr,LBP_SY); 1166 PRINT_BIT(attr,LBP_AL); 1167 PRINT_BIT(attr,LBP_PR); 1168 PRINT_BIT(attr,LBP_PO); 1169 PRINT_BIT(attr,LBP_SA); 1170 PRINT_BIT(attr,LBP_XX); 1171 PRINT_BIT(attr,LBP_AI); 1172 #undef PRINT_BIT 1173 fprintf (stream, "\n"); 1174 } 1175 } 1176 } 1177 1178 static void 1179 debug_output_org_tables (const char *filename) 1180 { 1181 FILE *stream; 1182 1183 stream = fopen (filename, "w"); 1184 if (stream == NULL) 1185 { 1186 fprintf (stderr, "cannot open '%s' for 
writing\n", filename); 1187 exit (1); 1188 } 1189 1190 debug_output_org_lbp (stream); 1191 1192 if (ferror (stream) || fclose (stream)) 1193 { 1194 fprintf (stderr, "error writing to '%s'\n", filename); 1195 exit (1); 1196 } 1197 } 1198 1199 /* Construction of sparse 3-level tables. */ 1200 #define TABLE lbp_table 1201 #define ELEMENT unsigned char 1202 #define DEFAULT LBP_XX 1203 #define xmalloc malloc 1204 #define xrealloc realloc 1205 #include "3level.h" 1206 1207 static void 1208 output_lbp (FILE *stream) 1209 { 1210 unsigned int i; 1211 struct lbp_table t; 1212 unsigned int level1_offset, level2_offset, level3_offset; 1213 1214 t.p = 7; 1215 t.q = 9; 1216 lbp_table_init (&t); 1217 1218 for (i = 0; i < 0x110000; i++) 1219 { 1220 int attr = get_lbp (i); 1221 1222 /* Now attr should contain exactly one bit. */ 1223 if (attr == 0 || ((attr & (attr - 1)) != 0)) 1224 abort (); 1225 1226 if (attr != 1 << LBP_XX) 1227 { 1228 unsigned int log2_attr; 1229 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++); 1230 1231 lbp_table_add (&t, i, log2_attr); 1232 } 1233 } 1234 1235 lbp_table_finalize (&t); 1236 1237 level1_offset = 1238 5 * sizeof (uint32_t); 1239 level2_offset = 1240 5 * sizeof (uint32_t) 1241 + t.level1_size * sizeof (uint32_t); 1242 level3_offset = 1243 5 * sizeof (uint32_t) 1244 + t.level1_size * sizeof (uint32_t) 1245 + (t.level2_size << t.q) * sizeof (uint32_t); 1246 1247 for (i = 0; i < 5; i++) 1248 fprintf (stream, "#define lbrkprop_header_%d %d\n", i, 1249 ((uint32_t *) t.result)[i]); 1250 fprintf (stream, "static const\n"); 1251 fprintf (stream, "struct\n"); 1252 fprintf (stream, " {\n"); 1253 fprintf (stream, " int level1[%d];\n", t.level1_size); 1254 fprintf (stream, " int level2[%d << %d];\n", t.level2_size, t.q); 1255 fprintf (stream, " unsigned char level3[%d << %d];\n", t.level3_size, t.p); 1256 fprintf (stream, " }\n"); 1257 fprintf (stream, "lbrkprop =\n"); 1258 fprintf (stream, "{\n"); 1259 fprintf (stream, " {"); 1260 for (i = 0; i < 
t.level1_size; i++) 1261 { 1262 uint32_t offset; 1263 if (i > 0 && (i % 8) == 0) 1264 fprintf (stream, "\n "); 1265 offset = ((uint32_t *) (t.result + level1_offset))[i]; 1266 fprintf (stream, " %5d%s", 1267 offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t), 1268 (i+1 < t.level1_size ? "," : "")); 1269 } 1270 fprintf (stream, " },\n"); 1271 fprintf (stream, " {"); 1272 if (t.level2_size << t.q > 8) 1273 fprintf (stream, "\n "); 1274 for (i = 0; i < t.level2_size << t.q; i++) 1275 { 1276 uint32_t offset; 1277 if (i > 0 && (i % 8) == 0) 1278 fprintf (stream, "\n "); 1279 offset = ((uint32_t *) (t.result + level2_offset))[i]; 1280 fprintf (stream, " %5d%s", 1281 offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t), 1282 (i+1 < t.level2_size << t.q ? "," : "")); 1283 } 1284 if (t.level2_size << t.q > 8) 1285 fprintf (stream, "\n "); 1286 fprintf (stream, " },\n"); 1287 fprintf (stream, " {"); 1288 if (t.level3_size << t.p > 8) 1289 fprintf (stream, "\n "); 1290 for (i = 0; i < t.level3_size << t.p; i++) 1291 { 1292 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i]; 1293 const char *value_string; 1294 switch (value) 1295 { 1296 #define CASE(x) case x: value_string = #x; break; 1297 CASE(LBP_BK); 1298 CASE(LBP_CM); 1299 CASE(LBP_ZW); 1300 CASE(LBP_IN); 1301 CASE(LBP_GL); 1302 CASE(LBP_CB); 1303 CASE(LBP_SP); 1304 CASE(LBP_BA); 1305 CASE(LBP_BB); 1306 CASE(LBP_B2); 1307 CASE(LBP_HY); 1308 CASE(LBP_NS); 1309 CASE(LBP_OP); 1310 CASE(LBP_CL); 1311 CASE(LBP_QU); 1312 CASE(LBP_EX); 1313 CASE(LBP_ID); 1314 CASE(LBP_NU); 1315 CASE(LBP_IS); 1316 CASE(LBP_SY); 1317 CASE(LBP_AL); 1318 CASE(LBP_PR); 1319 CASE(LBP_PO); 1320 CASE(LBP_SA); 1321 CASE(LBP_XX); 1322 CASE(LBP_AI); 1323 #undef CASE 1324 default: 1325 abort (); 1326 } 1327 if (i > 0 && (i % 8) == 0) 1328 fprintf (stream, "\n "); 1329 fprintf (stream, " %s%s", value_string, 1330 (i+1 < t.level3_size << t.p ? 
"," : "")); 1331 } 1332 if (t.level3_size << t.p > 8) 1333 fprintf (stream, "\n "); 1334 fprintf (stream, " }\n"); 1335 fprintf (stream, "};\n"); 1336 } 1337 1338 static void 1339 output_tables (const char *filename, const char *version) 1340 { 1341 FILE *stream; 1342 1343 stream = fopen (filename, "w"); 1344 if (stream == NULL) 1345 { 1346 fprintf (stderr, "cannot open '%s' for writing\n", filename); 1347 exit (1); 1348 } 1349 1350 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n"); 1351 fprintf (stream, "/* Generated automatically by gen-lbrkprop for Unicode %s. */\n", 1352 version); 1353 fprintf (stream, "\n"); 1354 1355 /* Put a GPL header on it. The gnulib module is under LGPL (although it 1356 still carries the GPL header), and it's gnulib-tool which replaces the 1357 GPL header with an LGPL header. */ 1358 fprintf (stream, "/* Copyright (C) 2000-2004 Free Software Foundation, Inc.\n"); 1359 fprintf (stream, "\n"); 1360 fprintf (stream, "This program is free software; you can redistribute it and/or modify\n"); 1361 fprintf (stream, "it under the terms of the GNU General Public License as published by\n"); 1362 fprintf (stream, "the Free Software Foundation; either version 2, or (at your option)\n"); 1363 fprintf (stream, "any later version.\n"); 1364 fprintf (stream, "\n"); 1365 fprintf (stream, "This program is distributed in the hope that it will be useful,\n"); 1366 fprintf (stream, "but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); 1367 fprintf (stream, "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); 1368 fprintf (stream, "GNU General Public License for more details.\n"); 1369 fprintf (stream, "\n"); 1370 fprintf (stream, "You should have received a copy of the GNU General Public License\n"); 1371 fprintf (stream, "along with this program; if not, write to the Free Software\n"); 1372 fprintf (stream, "Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/\n"); 1373 fprintf (stream, "\n"); 1374 1375 output_lbp (stream); 1376 1377 if (ferror (stream) || fclose (stream)) 1378 { 1379 fprintf (stderr, "error writing to '%s'\n", filename); 1380 exit (1); 1381 } 1382 } 1383 1384 int 1385 main (int argc, char * argv[]) 1386 { 1387 if (argc != 6) 1388 { 1389 fprintf (stderr, "Usage: %s UnicodeData.txt Combining.txt EastAsianWidth.txt LineBreak.txt version\n", 1390 argv[0]); 1391 exit (1); 1392 } 1393 1394 fill_attributes (argv[1]); 1395 fill_combining (argv[2]); 1396 fill_width (argv[3]); 1397 fill_org_lbp (argv[4]); 1398 1399 debug_output_tables ("lbrkprop.txt"); 1400 debug_output_org_tables ("lbrkprop_org.txt"); 1401 1402 output_tables ("lbrkprop.h", argv[5]); 1403 1404 return 0; 1405 } 1406