1 /* $NetBSD: ucdata.c,v 1.1.1.4 2014/05/28 09:58:44 tron Exp $ */ 2 3 /* $OpenLDAP$ */ 4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>. 5 * 6 * Copyright 1998-2014 The OpenLDAP Foundation. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted only as authorized by the OpenLDAP 11 * Public License. 12 * 13 * A copy of this license is available in file LICENSE in the 14 * top-level directory of the distribution or, alternatively, at 15 * <http://www.OpenLDAP.org/license.html>. 16 */ 17 /* Copyright 2001 Computing Research Labs, New Mexico State University 18 * 19 * Permission is hereby granted, free of charge, to any person obtaining a 20 * copy of this software and associated documentation files (the "Software"), 21 * to deal in the Software without restriction, including without limitation 22 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 23 * and/or sell copies of the Software, and to permit persons to whom the 24 * Software is furnished to do so, subject to the following conditions: 25 * 26 * The above copyright notice and this permission notice shall be included in 27 * all copies or substantial portions of the Software. 28 * 29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 32 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY 33 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 34 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 35 * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 36 */ 37 /* Id: ucdata.c,v 1.4 2001/01/02 18:46:20 mleisher Exp " */ 38 39 #include "portable.h" 40 #include "ldap_config.h" 41 42 #include <stdio.h> 43 #include <ac/stdlib.h> 44 #include <ac/string.h> 45 #include <ac/unistd.h> 46 47 #include <ac/bytes.h> 48 49 #include "lber_pvt.h" 50 #include "ucdata.h" 51 52 #ifndef HARDCODE_DATA 53 #define HARDCODE_DATA 1 54 #endif 55 56 #if HARDCODE_DATA 57 #include "uctable.h" 58 #endif 59 60 /************************************************************************** 61 * 62 * Miscellaneous types, data, and support functions. 63 * 64 **************************************************************************/ 65 66 typedef struct { 67 ac_uint2 bom; 68 ac_uint2 cnt; 69 union { 70 ac_uint4 bytes; 71 ac_uint2 len[2]; 72 } size; 73 } _ucheader_t; 74 75 /* 76 * A simple array of 32-bit masks for lookup. 77 */ 78 static ac_uint4 masks32[32] = { 79 0x00000001UL, 0x00000002UL, 0x00000004UL, 0x00000008UL, 80 0x00000010UL, 0x00000020UL, 0x00000040UL, 0x00000080UL, 81 0x00000100UL, 0x00000200UL, 0x00000400UL, 0x00000800UL, 82 0x00001000UL, 0x00002000UL, 0x00004000UL, 0x00008000UL, 83 0x00010000UL, 0x00020000UL, 0x00040000UL, 0x00080000UL, 84 0x00100000UL, 0x00200000UL, 0x00400000UL, 0x00800000UL, 85 0x01000000UL, 0x02000000UL, 0x04000000UL, 0x08000000UL, 86 0x10000000UL, 0x20000000UL, 0x40000000UL, 0x80000000UL 87 }; 88 89 #define endian_short(cc) (((cc) >> 8) | (((cc) & 0xff) << 8)) 90 #define endian_long(cc) ((((cc) & 0xff) << 24)|((((cc) >> 8) & 0xff) << 16)|\ 91 ((((cc) >> 16) & 0xff) << 8)|((cc) >> 24)) 92 93 #if !HARDCODE_DATA 94 static FILE * 95 _ucopenfile(char *paths, char *filename, char *mode) 96 { 97 FILE *f; 98 char *fp, *dp, *pp, path[BUFSIZ]; 99 100 if (filename == 0 || *filename == 0) 101 return 0; 102 103 dp = paths; 104 while (dp && *dp) { 105 pp = path; 106 while (*dp && *dp != ':') 107 *pp++ = *dp++; 108 *pp++ = *LDAP_DIRSEP; 109 110 fp = filename; 111 while (*fp) 112 *pp++ = *fp++; 113 *pp = 0; 114 115 if ((f = fopen(path, mode)) != 0) 116 return f; 117 118 if (*dp == ':') 119 dp++; 120 } 121 122 return 0; 123 } 124 #endif 125 126 /************************************************************************** 127 * 128 * Support for the character properties. 129 * 130 **************************************************************************/ 131 132 #if !HARDCODE_DATA 133 134 static ac_uint4 _ucprop_size; 135 static ac_uint2 *_ucprop_offsets; 136 static ac_uint4 *_ucprop_ranges; 137 138 /* 139 * Return -1 on error, 0 if okay 140 */ 141 static int 142 _ucprop_load(char *paths, int reload) 143 { 144 FILE *in; 145 ac_uint4 size, i; 146 _ucheader_t hdr; 147 148 if (_ucprop_size > 0) { 149 if (!reload) 150 /* 151 * The character properties have already been loaded. 152 */ 153 return 0; 154 155 /* 156 * Unload the current character property data in preparation for 157 * loading a new copy. Only the first array has to be deallocated 158 * because all the memory for the arrays is allocated as a single 159 * block. 160 */ 161 free((char *) _ucprop_offsets); 162 _ucprop_size = 0; 163 } 164 165 if ((in = _ucopenfile(paths, "ctype.dat", "rb")) == 0) 166 return -1; 167 168 /* 169 * Load the header. 170 */ 171 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 172 173 if (hdr.bom == 0xfffe) { 174 hdr.cnt = endian_short(hdr.cnt); 175 hdr.size.bytes = endian_long(hdr.size.bytes); 176 } 177 178 if ((_ucprop_size = hdr.cnt) == 0) { 179 fclose(in); 180 return -1; 181 } 182 183 /* 184 * Allocate all the storage needed for the lookup table. 185 */ 186 _ucprop_offsets = (ac_uint2 *) malloc(hdr.size.bytes); 187 188 /* 189 * Calculate the offset into the storage for the ranges. The offsets 190 * array is on a 4-byte boundary and one larger than the value provided in 191 * the header count field. This means the offset to the ranges must be 192 * calculated after aligning the count to a 4-byte boundary. 193 */ 194 if ((size = ((hdr.cnt + 1) * sizeof(ac_uint2))) & 3) 195 size += 4 - (size & 3); 196 size >>= 1; 197 _ucprop_ranges = (ac_uint4 *) (_ucprop_offsets + size); 198 199 /* 200 * Load the offset array. 201 */ 202 fread((char *) _ucprop_offsets, sizeof(ac_uint2), size, in); 203 204 /* 205 * Do an endian swap if necessary. Don't forget there is an extra node on 206 * the end with the final index. 207 */ 208 if (hdr.bom == 0xfffe) { 209 for (i = 0; i <= _ucprop_size; i++) 210 _ucprop_offsets[i] = endian_short(_ucprop_offsets[i]); 211 } 212 213 /* 214 * Load the ranges. The number of elements is in the last array position 215 * of the offsets. 216 */ 217 fread((char *) _ucprop_ranges, sizeof(ac_uint4), 218 _ucprop_offsets[_ucprop_size], in); 219 220 fclose(in); 221 222 /* 223 * Do an endian swap if necessary. 224 */ 225 if (hdr.bom == 0xfffe) { 226 for (i = 0; i < _ucprop_offsets[_ucprop_size]; i++) 227 _ucprop_ranges[i] = endian_long(_ucprop_ranges[i]); 228 } 229 return 0; 230 } 231 232 static void 233 _ucprop_unload(void) 234 { 235 if (_ucprop_size == 0) 236 return; 237 238 /* 239 * Only need to free the offsets because the memory is allocated as a 240 * single block. 241 */ 242 free((char *) _ucprop_offsets); 243 _ucprop_size = 0; 244 } 245 #endif 246 247 static int 248 _ucprop_lookup(ac_uint4 code, ac_uint4 n) 249 { 250 long l, r, m; 251 252 if (_ucprop_size == 0) 253 return 0; 254 255 /* 256 * There is an extra node on the end of the offsets to allow this routine 257 * to work right. If the index is 0xffff, then there are no nodes for the 258 * property. 259 */ 260 if ((l = _ucprop_offsets[n]) == 0xffff) 261 return 0; 262 263 /* 264 * Locate the next offset that is not 0xffff. The sentinel at the end of 265 * the array is the max index value. 266 */ 267 for (m = 1; 268 n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++) ; 269 270 r = _ucprop_offsets[n + m] - 1; 271 272 while (l <= r) { 273 /* 274 * Determine a "mid" point and adjust to make sure the mid point is at 275 * the beginning of a range pair. 276 */ 277 m = (l + r) >> 1; 278 m -= (m & 1); 279 if (code > _ucprop_ranges[m + 1]) 280 l = m + 2; 281 else if (code < _ucprop_ranges[m]) 282 r = m - 2; 283 else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1]) 284 return 1; 285 } 286 return 0; 287 } 288 289 int 290 ucisprop(ac_uint4 code, ac_uint4 mask1, ac_uint4 mask2) 291 { 292 ac_uint4 i; 293 294 if (mask1 == 0 && mask2 == 0) 295 return 0; 296 297 for (i = 0; mask1 && i < 32; i++) { 298 if ((mask1 & masks32[i]) && _ucprop_lookup(code, i)) 299 return 1; 300 } 301 302 for (i = 32; mask2 && i < _ucprop_size; i++) { 303 if ((mask2 & masks32[i & 31]) && _ucprop_lookup(code, i)) 304 return 1; 305 } 306 307 return 0; 308 } 309 310 /************************************************************************** 311 * 312 * Support for case mapping. 313 * 314 **************************************************************************/ 315 316 #if !HARDCODE_DATA 317 318 /* These record the number of slots in the map. 319 * There are 3 words per slot. 320 */ 321 static ac_uint4 _uccase_size; 322 static ac_uint2 _uccase_len[2]; 323 static ac_uint4 *_uccase_map; 324 325 /* 326 * Return -1 on error, 0 if okay 327 */ 328 static int 329 _uccase_load(char *paths, int reload) 330 { 331 FILE *in; 332 ac_uint4 i; 333 _ucheader_t hdr; 334 335 if (_uccase_size > 0) { 336 if (!reload) 337 /* 338 * The case mappings have already been loaded. 339 */ 340 return 0; 341 342 free((char *) _uccase_map); 343 _uccase_size = 0; 344 } 345 346 if ((in = _ucopenfile(paths, "case.dat", "rb")) == 0) 347 return -1; 348 349 /* 350 * Load the header. 351 */ 352 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 353 354 if (hdr.bom == 0xfffe) { 355 hdr.cnt = endian_short(hdr.cnt); 356 hdr.size.len[0] = endian_short(hdr.size.len[0]); 357 hdr.size.len[1] = endian_short(hdr.size.len[1]); 358 } 359 360 /* 361 * Set the node count and lengths of the upper and lower case mapping 362 * tables. 363 */ 364 _uccase_size = hdr.cnt; 365 _uccase_len[0] = hdr.size.len[0]; 366 _uccase_len[1] = hdr.size.len[1]; 367 368 _uccase_map = (ac_uint4 *) 369 malloc(_uccase_size * 3 * sizeof(ac_uint4)); 370 371 /* 372 * Load the case mapping table. 373 */ 374 fread((char *) _uccase_map, sizeof(ac_uint4), _uccase_size * 3, in); 375 376 /* 377 * Do an endian swap if necessary. 378 */ 379 if (hdr.bom == 0xfffe) { 380 for (i = 0; i < _uccase_size * 3; i++) 381 _uccase_map[i] = endian_long(_uccase_map[i]); 382 } 383 fclose(in); 384 return 0; 385 } 386 387 static void 388 _uccase_unload(void) 389 { 390 if (_uccase_size == 0) 391 return; 392 393 free((char *) _uccase_map); 394 _uccase_size = 0; 395 } 396 #endif 397 398 static ac_uint4 399 _uccase_lookup(ac_uint4 code, long l, long r, int field) 400 { 401 long m; 402 const ac_uint4 *tmp; 403 404 /* 405 * Do the binary search. 406 */ 407 while (l <= r) { 408 /* 409 * Determine a "mid" point and adjust to make sure the mid point is at 410 * the beginning of a case mapping triple. 411 */ 412 m = (l + r) >> 1; 413 tmp = &_uccase_map[m*3]; 414 if (code > *tmp) 415 l = m + 1; 416 else if (code < *tmp) 417 r = m - 1; 418 else if (code == *tmp) 419 return tmp[field]; 420 } 421 422 return code; 423 } 424 425 ac_uint4 426 uctoupper(ac_uint4 code) 427 { 428 int field; 429 long l, r; 430 431 if (ucisupper(code)) 432 return code; 433 434 if (ucislower(code)) { 435 /* 436 * The character is lower case. 437 */ 438 field = 2; 439 l = _uccase_len[0]; 440 r = (l + _uccase_len[1]) - 1; 441 } else { 442 /* 443 * The character is title case. 444 */ 445 field = 1; 446 l = _uccase_len[0] + _uccase_len[1]; 447 r = _uccase_size - 1; 448 } 449 return _uccase_lookup(code, l, r, field); 450 } 451 452 ac_uint4 453 uctolower(ac_uint4 code) 454 { 455 int field; 456 long l, r; 457 458 if (ucislower(code)) 459 return code; 460 461 if (ucisupper(code)) { 462 /* 463 * The character is upper case. 464 */ 465 field = 1; 466 l = 0; 467 r = _uccase_len[0] - 1; 468 } else { 469 /* 470 * The character is title case. 471 */ 472 field = 2; 473 l = _uccase_len[0] + _uccase_len[1]; 474 r = _uccase_size - 1; 475 } 476 return _uccase_lookup(code, l, r, field); 477 } 478 479 ac_uint4 480 uctotitle(ac_uint4 code) 481 { 482 int field; 483 long l, r; 484 485 if (ucistitle(code)) 486 return code; 487 488 /* 489 * The offset will always be the same for converting to title case. 490 */ 491 field = 2; 492 493 if (ucisupper(code)) { 494 /* 495 * The character is upper case. 496 */ 497 l = 0; 498 r = _uccase_len[0] - 1; 499 } else { 500 /* 501 * The character is lower case. 502 */ 503 l = _uccase_len[0]; 504 r = (l + _uccase_len[1]) - 1; 505 } 506 return _uccase_lookup(code, l, r, field); 507 } 508 509 /************************************************************************** 510 * 511 * Support for compositions. 512 * 513 **************************************************************************/ 514 515 #if !HARDCODE_DATA 516 517 static ac_uint4 _uccomp_size; 518 static ac_uint4 *_uccomp_data; 519 520 /* 521 * Return -1 on error, 0 if okay 522 */ 523 static int 524 _uccomp_load(char *paths, int reload) 525 { 526 FILE *in; 527 ac_uint4 size, i; 528 _ucheader_t hdr; 529 530 if (_uccomp_size > 0) { 531 if (!reload) 532 /* 533 * The compositions have already been loaded. 534 */ 535 return 0; 536 537 free((char *) _uccomp_data); 538 _uccomp_size = 0; 539 } 540 541 if ((in = _ucopenfile(paths, "comp.dat", "rb")) == 0) 542 return -1; 543 544 /* 545 * Load the header. 546 */ 547 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 548 549 if (hdr.bom == 0xfffe) { 550 hdr.cnt = endian_short(hdr.cnt); 551 hdr.size.bytes = endian_long(hdr.size.bytes); 552 } 553 554 _uccomp_size = hdr.cnt; 555 _uccomp_data = (ac_uint4 *) malloc(hdr.size.bytes); 556 557 /* 558 * Read the composition data in. 559 */ 560 size = hdr.size.bytes / sizeof(ac_uint4); 561 fread((char *) _uccomp_data, sizeof(ac_uint4), size, in); 562 563 /* 564 * Do an endian swap if necessary. 565 */ 566 if (hdr.bom == 0xfffe) { 567 for (i = 0; i < size; i++) 568 _uccomp_data[i] = endian_long(_uccomp_data[i]); 569 } 570 571 /* 572 * Assume that the data is ordered on count, so that all compositions 573 * of length 2 come first. Only handling length 2 for now. 574 */ 575 for (i = 1; i < size; i += 4) 576 if (_uccomp_data[i] != 2) 577 break; 578 _uccomp_size = i - 1; 579 580 fclose(in); 581 return 0; 582 } 583 584 static void 585 _uccomp_unload(void) 586 { 587 if (_uccomp_size == 0) 588 return; 589 590 free((char *) _uccomp_data); 591 _uccomp_size = 0; 592 } 593 #endif 594 595 int 596 uccomp(ac_uint4 node1, ac_uint4 node2, ac_uint4 *comp) 597 { 598 int l, r, m; 599 600 l = 0; 601 r = _uccomp_size - 1; 602 603 while (l <= r) { 604 m = ((r + l) >> 1); 605 m -= m & 3; 606 if (node1 > _uccomp_data[m+2]) 607 l = m + 4; 608 else if (node1 < _uccomp_data[m+2]) 609 r = m - 4; 610 else if (node2 > _uccomp_data[m+3]) 611 l = m + 4; 612 else if (node2 < _uccomp_data[m+3]) 613 r = m - 4; 614 else { 615 *comp = _uccomp_data[m]; 616 return 1; 617 } 618 } 619 return 0; 620 } 621 622 int 623 uccomp_hangul(ac_uint4 *str, int len) 624 { 625 const int SBase = 0xAC00, LBase = 0x1100, 626 VBase = 0x1161, TBase = 0x11A7, 627 LCount = 19, VCount = 21, TCount = 28, 628 NCount = VCount * TCount, /* 588 */ 629 SCount = LCount * NCount; /* 11172 */ 630 631 int i, rlen; 632 ac_uint4 ch, last, lindex, sindex; 633 634 last = str[0]; 635 rlen = 1; 636 for ( i = 1; i < len; i++ ) { 637 ch = str[i]; 638 639 /* check if two current characters are L and V */ 640 lindex = last - LBase; 641 if (lindex < (ac_uint4) LCount) { 642 ac_uint4 vindex = ch - VBase; 643 if (vindex < (ac_uint4) VCount) { 644 /* make syllable of form LV */ 645 last = SBase + (lindex * VCount + vindex) * TCount; 646 str[rlen-1] = last; /* reset last */ 647 continue; 648 } 649 } 650 651 /* check if two current characters are LV and T */ 652 sindex = last - SBase; 653 if (sindex < (ac_uint4) SCount 654 && (sindex % TCount) == 0) 655 { 656 ac_uint4 tindex = ch - TBase; 657 if (tindex <= (ac_uint4) TCount) { 658 /* make syllable of form LVT */ 659 last += tindex; 660 str[rlen-1] = last; /* reset last */ 661 continue; 662 } 663 } 664 665 /* if neither case was true, just add the character */ 666 last = ch; 667 str[rlen] = ch; 668 rlen++; 669 } 670 return rlen; 671 } 672 673 int 674 uccanoncomp(ac_uint4 *str, int len) 675 { 676 int i, stpos, copos; 677 ac_uint4 cl, prevcl, st, ch, co; 678 679 st = str[0]; 680 stpos = 0; 681 copos = 1; 682 prevcl = uccombining_class(st) == 0 ? 0 : 256; 683 684 for (i = 1; i < len; i++) { 685 ch = str[i]; 686 cl = uccombining_class(ch); 687 if (uccomp(st, ch, &co) && (prevcl < cl || prevcl == 0)) 688 st = str[stpos] = co; 689 else { 690 if (cl == 0) { 691 stpos = copos; 692 st = ch; 693 } 694 prevcl = cl; 695 str[copos++] = ch; 696 } 697 } 698 699 return uccomp_hangul(str, copos); 700 } 701 702 /************************************************************************** 703 * 704 * Support for decompositions. 705 * 706 **************************************************************************/ 707 708 #if !HARDCODE_DATA 709 710 static ac_uint4 _ucdcmp_size; 711 static ac_uint4 *_ucdcmp_nodes; 712 static ac_uint4 *_ucdcmp_decomp; 713 714 static ac_uint4 _uckdcmp_size; 715 static ac_uint4 *_uckdcmp_nodes; 716 static ac_uint4 *_uckdcmp_decomp; 717 718 /* 719 * Return -1 on error, 0 if okay 720 */ 721 static int 722 _ucdcmp_load(char *paths, int reload) 723 { 724 FILE *in; 725 ac_uint4 size, i; 726 _ucheader_t hdr; 727 728 if (_ucdcmp_size > 0) { 729 if (!reload) 730 /* 731 * The decompositions have already been loaded. 732 */ 733 return 0; 734 735 free((char *) _ucdcmp_nodes); 736 _ucdcmp_size = 0; 737 } 738 739 if ((in = _ucopenfile(paths, "decomp.dat", "rb")) == 0) 740 return -1; 741 742 /* 743 * Load the header. 744 */ 745 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 746 747 if (hdr.bom == 0xfffe) { 748 hdr.cnt = endian_short(hdr.cnt); 749 hdr.size.bytes = endian_long(hdr.size.bytes); 750 } 751 752 _ucdcmp_size = hdr.cnt << 1; 753 _ucdcmp_nodes = (ac_uint4 *) malloc(hdr.size.bytes); 754 _ucdcmp_decomp = _ucdcmp_nodes + (_ucdcmp_size + 1); 755 756 /* 757 * Read the decomposition data in. 758 */ 759 size = hdr.size.bytes / sizeof(ac_uint4); 760 fread((char *) _ucdcmp_nodes, sizeof(ac_uint4), size, in); 761 762 /* 763 * Do an endian swap if necessary. 764 */ 765 if (hdr.bom == 0xfffe) { 766 for (i = 0; i < size; i++) 767 _ucdcmp_nodes[i] = endian_long(_ucdcmp_nodes[i]); 768 } 769 fclose(in); 770 return 0; 771 } 772 773 /* 774 * Return -1 on error, 0 if okay 775 */ 776 static int 777 _uckdcmp_load(char *paths, int reload) 778 { 779 FILE *in; 780 ac_uint4 size, i; 781 _ucheader_t hdr; 782 783 if (_uckdcmp_size > 0) { 784 if (!reload) 785 /* 786 * The decompositions have already been loaded. 787 */ 788 return 0; 789 790 free((char *) _uckdcmp_nodes); 791 _uckdcmp_size = 0; 792 } 793 794 if ((in = _ucopenfile(paths, "kdecomp.dat", "rb")) == 0) 795 return -1; 796 797 /* 798 * Load the header. 799 */ 800 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 801 802 if (hdr.bom == 0xfffe) { 803 hdr.cnt = endian_short(hdr.cnt); 804 hdr.size.bytes = endian_long(hdr.size.bytes); 805 } 806 807 _uckdcmp_size = hdr.cnt << 1; 808 _uckdcmp_nodes = (ac_uint4 *) malloc(hdr.size.bytes); 809 _uckdcmp_decomp = _uckdcmp_nodes + (_uckdcmp_size + 1); 810 811 /* 812 * Read the decomposition data in. 813 */ 814 size = hdr.size.bytes / sizeof(ac_uint4); 815 fread((char *) _uckdcmp_nodes, sizeof(ac_uint4), size, in); 816 817 /* 818 * Do an endian swap if necessary. 819 */ 820 if (hdr.bom == 0xfffe) { 821 for (i = 0; i < size; i++) 822 _uckdcmp_nodes[i] = endian_long(_uckdcmp_nodes[i]); 823 } 824 fclose(in); 825 return 0; 826 } 827 828 static void 829 _ucdcmp_unload(void) 830 { 831 if (_ucdcmp_size == 0) 832 return; 833 834 /* 835 * Only need to free the offsets because the memory is allocated as a 836 * single block. 837 */ 838 free((char *) _ucdcmp_nodes); 839 _ucdcmp_size = 0; 840 } 841 842 static void 843 _uckdcmp_unload(void) 844 { 845 if (_uckdcmp_size == 0) 846 return; 847 848 /* 849 * Only need to free the offsets because the memory is allocated as a 850 * single block. 851 */ 852 free((char *) _uckdcmp_nodes); 853 _uckdcmp_size = 0; 854 } 855 #endif 856 857 int 858 ucdecomp(ac_uint4 code, ac_uint4 *num, ac_uint4 **decomp) 859 { 860 long l, r, m; 861 862 if (code < _ucdcmp_nodes[0]) { 863 return 0; 864 } 865 866 l = 0; 867 r = _ucdcmp_nodes[_ucdcmp_size] - 1; 868 869 while (l <= r) { 870 /* 871 * Determine a "mid" point and adjust to make sure the mid point is at 872 * the beginning of a code+offset pair. 873 */ 874 m = (l + r) >> 1; 875 m -= (m & 1); 876 if (code > _ucdcmp_nodes[m]) 877 l = m + 2; 878 else if (code < _ucdcmp_nodes[m]) 879 r = m - 2; 880 else if (code == _ucdcmp_nodes[m]) { 881 *num = _ucdcmp_nodes[m + 3] - _ucdcmp_nodes[m + 1]; 882 *decomp = (ac_uint4*)&_ucdcmp_decomp[_ucdcmp_nodes[m + 1]]; 883 return 1; 884 } 885 } 886 return 0; 887 } 888 889 int 890 uckdecomp(ac_uint4 code, ac_uint4 *num, ac_uint4 **decomp) 891 { 892 long l, r, m; 893 894 if (code < _uckdcmp_nodes[0]) { 895 return 0; 896 } 897 898 l = 0; 899 r = _uckdcmp_nodes[_uckdcmp_size] - 1; 900 901 while (l <= r) { 902 /* 903 * Determine a "mid" point and adjust to make sure the mid point is at 904 * the beginning of a code+offset pair. 905 */ 906 m = (l + r) >> 1; 907 m -= (m & 1); 908 if (code > _uckdcmp_nodes[m]) 909 l = m + 2; 910 else if (code < _uckdcmp_nodes[m]) 911 r = m - 2; 912 else if (code == _uckdcmp_nodes[m]) { 913 *num = _uckdcmp_nodes[m + 3] - _uckdcmp_nodes[m + 1]; 914 *decomp = (ac_uint4*)&_uckdcmp_decomp[_uckdcmp_nodes[m + 1]]; 915 return 1; 916 } 917 } 918 return 0; 919 } 920 921 int 922 ucdecomp_hangul(ac_uint4 code, ac_uint4 *num, ac_uint4 decomp[]) 923 { 924 if (!ucishangul(code)) 925 return 0; 926 927 code -= 0xac00; 928 decomp[0] = 0x1100 + (ac_uint4) (code / 588); 929 decomp[1] = 0x1161 + (ac_uint4) ((code % 588) / 28); 930 decomp[2] = 0x11a7 + (ac_uint4) (code % 28); 931 *num = (decomp[2] != 0x11a7) ? 3 : 2; 932 933 return 1; 934 } 935 936 /* mode == 0 for canonical, mode == 1 for compatibility */ 937 static int 938 uccanoncompatdecomp(const ac_uint4 *in, int inlen, 939 ac_uint4 **out, int *outlen, short mode, void *ctx) 940 { 941 int l, size; 942 unsigned i, j, k; 943 ac_uint4 num, class, *decomp, hangdecomp[3]; 944 945 size = inlen * 2; 946 *out = (ac_uint4 *) ber_memalloc_x(size * sizeof(**out), ctx); 947 if (*out == NULL) 948 return *outlen = -1; 949 950 i = 0; 951 for (j = 0; j < (unsigned) inlen; j++) { 952 if (mode ? uckdecomp(in[j], &num, &decomp) : ucdecomp(in[j], &num, &decomp)) { 953 if ( size - i < num) { 954 size = inlen + i - j + num - 1; 955 *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx ); 956 if (*out == NULL) 957 return *outlen = -1; 958 } 959 for (k = 0; k < num; k++) { 960 class = uccombining_class(decomp[k]); 961 if (class == 0) { 962 (*out)[i] = decomp[k]; 963 } else { 964 for (l = i; l > 0; l--) 965 if (class >= uccombining_class((*out)[l-1])) 966 break; 967 AC_MEMCPY(*out + l + 1, *out + l, (i - l) * sizeof(**out)); 968 (*out)[l] = decomp[k]; 969 } 970 i++; 971 } 972 } else if (ucdecomp_hangul(in[j], &num, hangdecomp)) { 973 if (size - i < num) { 974 size = inlen + i - j + num - 1; 975 *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx); 976 if (*out == NULL) 977 return *outlen = -1; 978 } 979 for (k = 0; k < num; k++) { 980 (*out)[i] = hangdecomp[k]; 981 i++; 982 } 983 } else { 984 if (size - i < 1) { 985 size = inlen + i - j; 986 *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx); 987 if (*out == NULL) 988 return *outlen = -1; 989 } 990 class = uccombining_class(in[j]); 991 if (class == 0) { 992 (*out)[i] = in[j]; 993 } else { 994 for (l = i; l > 0; l--) 995 if (class >= uccombining_class((*out)[l-1])) 996 break; 997 AC_MEMCPY(*out + l + 1, *out + l, (i - l) * sizeof(**out)); 998 (*out)[l] = in[j]; 999 } 1000 i++; 1001 } 1002 } 1003 return *outlen = i; 1004 } 1005 1006 int 1007 uccanondecomp(const ac_uint4 *in, int inlen, 1008 ac_uint4 **out, int *outlen, void *ctx) 1009 { 1010 return uccanoncompatdecomp(in, inlen, out, outlen, 0, ctx); 1011 } 1012 1013 int 1014 uccompatdecomp(const ac_uint4 *in, int inlen, 1015 ac_uint4 **out, int *outlen, void *ctx) 1016 { 1017 return uccanoncompatdecomp(in, inlen, out, outlen, 1, ctx); 1018 } 1019 1020 /************************************************************************** 1021 * 1022 * Support for combining classes. 1023 * 1024 **************************************************************************/ 1025 1026 #if !HARDCODE_DATA 1027 static ac_uint4 _uccmcl_size; 1028 static ac_uint4 *_uccmcl_nodes; 1029 1030 /* 1031 * Return -1 on error, 0 if okay 1032 */ 1033 static int 1034 _uccmcl_load(char *paths, int reload) 1035 { 1036 FILE *in; 1037 ac_uint4 i; 1038 _ucheader_t hdr; 1039 1040 if (_uccmcl_size > 0) { 1041 if (!reload) 1042 /* 1043 * The combining classes have already been loaded. 1044 */ 1045 return 0; 1046 1047 free((char *) _uccmcl_nodes); 1048 _uccmcl_size = 0; 1049 } 1050 1051 if ((in = _ucopenfile(paths, "cmbcl.dat", "rb")) == 0) 1052 return -1; 1053 1054 /* 1055 * Load the header. 1056 */ 1057 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 1058 1059 if (hdr.bom == 0xfffe) { 1060 hdr.cnt = endian_short(hdr.cnt); 1061 hdr.size.bytes = endian_long(hdr.size.bytes); 1062 } 1063 1064 _uccmcl_size = hdr.cnt * 3; 1065 _uccmcl_nodes = (ac_uint4 *) malloc(hdr.size.bytes); 1066 1067 /* 1068 * Read the combining classes in. 1069 */ 1070 fread((char *) _uccmcl_nodes, sizeof(ac_uint4), _uccmcl_size, in); 1071 1072 /* 1073 * Do an endian swap if necessary. 1074 */ 1075 if (hdr.bom == 0xfffe) { 1076 for (i = 0; i < _uccmcl_size; i++) 1077 _uccmcl_nodes[i] = endian_long(_uccmcl_nodes[i]); 1078 } 1079 fclose(in); 1080 return 0; 1081 } 1082 1083 static void 1084 _uccmcl_unload(void) 1085 { 1086 if (_uccmcl_size == 0) 1087 return; 1088 1089 free((char *) _uccmcl_nodes); 1090 _uccmcl_size = 0; 1091 } 1092 #endif 1093 1094 ac_uint4 1095 uccombining_class(ac_uint4 code) 1096 { 1097 long l, r, m; 1098 1099 l = 0; 1100 r = _uccmcl_size - 1; 1101 1102 while (l <= r) { 1103 m = (l + r) >> 1; 1104 m -= (m % 3); 1105 if (code > _uccmcl_nodes[m + 1]) 1106 l = m + 3; 1107 else if (code < _uccmcl_nodes[m]) 1108 r = m - 3; 1109 else if (code >= _uccmcl_nodes[m] && code <= _uccmcl_nodes[m + 1]) 1110 return _uccmcl_nodes[m + 2]; 1111 } 1112 return 0; 1113 } 1114 1115 /************************************************************************** 1116 * 1117 * Support for numeric values. 1118 * 1119 **************************************************************************/ 1120 1121 #if !HARDCODE_DATA 1122 static ac_uint4 *_ucnum_nodes; 1123 static ac_uint4 _ucnum_size; 1124 static short *_ucnum_vals; 1125 1126 /* 1127 * Return -1 on error, 0 if okay 1128 */ 1129 static int 1130 _ucnumb_load(char *paths, int reload) 1131 { 1132 FILE *in; 1133 ac_uint4 size, i; 1134 _ucheader_t hdr; 1135 1136 if (_ucnum_size > 0) { 1137 if (!reload) 1138 /* 1139 * The numbers have already been loaded. 1140 */ 1141 return 0; 1142 1143 free((char *) _ucnum_nodes); 1144 _ucnum_size = 0; 1145 } 1146 1147 if ((in = _ucopenfile(paths, "num.dat", "rb")) == 0) 1148 return -1; 1149 1150 /* 1151 * Load the header. 1152 */ 1153 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 1154 1155 if (hdr.bom == 0xfffe) { 1156 hdr.cnt = endian_short(hdr.cnt); 1157 hdr.size.bytes = endian_long(hdr.size.bytes); 1158 } 1159 1160 _ucnum_size = hdr.cnt; 1161 _ucnum_nodes = (ac_uint4 *) malloc(hdr.size.bytes); 1162 _ucnum_vals = (short *) (_ucnum_nodes + _ucnum_size); 1163 1164 /* 1165 * Read the combining classes in. 1166 */ 1167 fread((char *) _ucnum_nodes, sizeof(unsigned char), hdr.size.bytes, in); 1168 1169 /* 1170 * Do an endian swap if necessary. 1171 */ 1172 if (hdr.bom == 0xfffe) { 1173 for (i = 0; i < _ucnum_size; i++) 1174 _ucnum_nodes[i] = endian_long(_ucnum_nodes[i]); 1175 1176 /* 1177 * Determine the number of values that have to be adjusted. 1178 */ 1179 size = (hdr.size.bytes - 1180 (_ucnum_size * (sizeof(ac_uint4) << 1))) / 1181 sizeof(short); 1182 1183 for (i = 0; i < size; i++) 1184 _ucnum_vals[i] = endian_short(_ucnum_vals[i]); 1185 } 1186 fclose(in); 1187 return 0; 1188 } 1189 1190 static void 1191 _ucnumb_unload(void) 1192 { 1193 if (_ucnum_size == 0) 1194 return; 1195 1196 free((char *) _ucnum_nodes); 1197 _ucnum_size = 0; 1198 } 1199 #endif 1200 1201 int 1202 ucnumber_lookup(ac_uint4 code, struct ucnumber *num) 1203 { 1204 long l, r, m; 1205 short *vp; 1206 1207 l = 0; 1208 r = _ucnum_size - 1; 1209 while (l <= r) { 1210 /* 1211 * Determine a "mid" point and adjust to make sure the mid point is at 1212 * the beginning of a code+offset pair. 1213 */ 1214 m = (l + r) >> 1; 1215 m -= (m & 1); 1216 if (code > _ucnum_nodes[m]) 1217 l = m + 2; 1218 else if (code < _ucnum_nodes[m]) 1219 r = m - 2; 1220 else { 1221 vp = (short *)_ucnum_vals + _ucnum_nodes[m + 1]; 1222 num->numerator = (int) *vp++; 1223 num->denominator = (int) *vp; 1224 return 1; 1225 } 1226 } 1227 return 0; 1228 } 1229 1230 int 1231 ucdigit_lookup(ac_uint4 code, int *digit) 1232 { 1233 long l, r, m; 1234 short *vp; 1235 1236 l = 0; 1237 r = _ucnum_size - 1; 1238 while (l <= r) { 1239 /* 1240 * Determine a "mid" point and adjust to make sure the mid point is at 1241 * the beginning of a code+offset pair. 1242 */ 1243 m = (l + r) >> 1; 1244 m -= (m & 1); 1245 if (code > _ucnum_nodes[m]) 1246 l = m + 2; 1247 else if (code < _ucnum_nodes[m]) 1248 r = m - 2; 1249 else { 1250 vp = (short *)_ucnum_vals + _ucnum_nodes[m + 1]; 1251 if (*vp == *(vp + 1)) { 1252 *digit = *vp; 1253 return 1; 1254 } 1255 return 0; 1256 } 1257 } 1258 return 0; 1259 } 1260 1261 struct ucnumber 1262 ucgetnumber(ac_uint4 code) 1263 { 1264 struct ucnumber num; 1265 1266 /* 1267 * Initialize with some arbitrary value, because the caller simply cannot 1268 * tell for sure if the code is a number without calling the ucisnumber() 1269 * macro before calling this function. 1270 */ 1271 num.numerator = num.denominator = -111; 1272 1273 (void) ucnumber_lookup(code, &num); 1274 1275 return num; 1276 } 1277 1278 int 1279 ucgetdigit(ac_uint4 code) 1280 { 1281 int dig; 1282 1283 /* 1284 * Initialize with some arbitrary value, because the caller simply cannot 1285 * tell for sure if the code is a number without calling the ucisdigit() 1286 * macro before calling this function. 1287 */ 1288 dig = -111; 1289 1290 (void) ucdigit_lookup(code, &dig); 1291 1292 return dig; 1293 } 1294 1295 /************************************************************************** 1296 * 1297 * Setup and cleanup routines. 1298 * 1299 **************************************************************************/ 1300 1301 #if HARDCODE_DATA 1302 int ucdata_load(char *paths, int masks) { return 0; } 1303 void ucdata_unload(int masks) { } 1304 int ucdata_reload(char *paths, int masks) { return 0; } 1305 #else 1306 /* 1307 * Return 0 if okay, negative on error 1308 */ 1309 int 1310 ucdata_load(char *paths, int masks) 1311 { 1312 int error = 0; 1313 1314 if (masks & UCDATA_CTYPE) 1315 error |= _ucprop_load(paths, 0) < 0 ? UCDATA_CTYPE : 0; 1316 if (masks & UCDATA_CASE) 1317 error |= _uccase_load(paths, 0) < 0 ? UCDATA_CASE : 0; 1318 if (masks & UCDATA_DECOMP) 1319 error |= _ucdcmp_load(paths, 0) < 0 ? UCDATA_DECOMP : 0; 1320 if (masks & UCDATA_CMBCL) 1321 error |= _uccmcl_load(paths, 0) < 0 ? UCDATA_CMBCL : 0; 1322 if (masks & UCDATA_NUM) 1323 error |= _ucnumb_load(paths, 0) < 0 ? UCDATA_NUM : 0; 1324 if (masks & UCDATA_COMP) 1325 error |= _uccomp_load(paths, 0) < 0 ? UCDATA_COMP : 0; 1326 if (masks & UCDATA_KDECOMP) 1327 error |= _uckdcmp_load(paths, 0) < 0 ? UCDATA_KDECOMP : 0; 1328 1329 return -error; 1330 } 1331 1332 void 1333 ucdata_unload(int masks) 1334 { 1335 if (masks & UCDATA_CTYPE) 1336 _ucprop_unload(); 1337 if (masks & UCDATA_CASE) 1338 _uccase_unload(); 1339 if (masks & UCDATA_DECOMP) 1340 _ucdcmp_unload(); 1341 if (masks & UCDATA_CMBCL) 1342 _uccmcl_unload(); 1343 if (masks & UCDATA_NUM) 1344 _ucnumb_unload(); 1345 if (masks & UCDATA_COMP) 1346 _uccomp_unload(); 1347 if (masks & UCDATA_KDECOMP) 1348 _uckdcmp_unload(); 1349 } 1350 1351 /* 1352 * Return 0 if okay, negative on error 1353 */ 1354 int 1355 ucdata_reload(char *paths, int masks) 1356 { 1357 int error = 0; 1358 1359 if (masks & UCDATA_CTYPE) 1360 error |= _ucprop_load(paths, 1) < 0 ? UCDATA_CTYPE : 0; 1361 if (masks & UCDATA_CASE) 1362 error |= _uccase_load(paths, 1) < 0 ? UCDATA_CASE : 0; 1363 if (masks & UCDATA_DECOMP) 1364 error |= _ucdcmp_load(paths, 1) < 0 ? UCDATA_DECOMP : 0; 1365 if (masks & UCDATA_CMBCL) 1366 error |= _uccmcl_load(paths, 1) < 0 ? UCDATA_CMBCL : 0; 1367 if (masks & UCDATA_NUM) 1368 error |= _ucnumb_load(paths, 1) < 0 ? UCDATA_NUM : 0; 1369 if (masks & UCDATA_COMP) 1370 error |= _uccomp_load(paths, 1) < 0 ? UCDATA_COMP : 0; 1371 if (masks & UCDATA_KDECOMP) 1372 error |= _uckdcmp_load(paths, 1) < 0 ? UCDATA_KDECOMP : 0; 1373 1374 return -error; 1375 } 1376 #endif 1377 1378 #ifdef TEST 1379 1380 void 1381 main(void) 1382 { 1383 int dig; 1384 ac_uint4 i, lo, *dec; 1385 struct ucnumber num; 1386 1387 /* ucdata_setup("."); */ 1388 1389 if (ucisweak(0x30)) 1390 printf("WEAK\n"); 1391 else 1392 printf("NOT WEAK\n"); 1393 1394 printf("LOWER 0x%04lX\n", uctolower(0xff3a)); 1395 printf("UPPER 0x%04lX\n", uctoupper(0xff5a)); 1396 1397 if (ucisalpha(0x1d5)) 1398 printf("ALPHA\n"); 1399 else 1400 printf("NOT ALPHA\n"); 1401 1402 if (ucisupper(0x1d5)) { 1403 printf("UPPER\n"); 1404 lo = uctolower(0x1d5); 1405 printf("0x%04lx\n", lo); 1406 lo = uctotitle(0x1d5); 1407 printf("0x%04lx\n", lo); 1408 } else 1409 printf("NOT UPPER\n"); 1410 1411 if (ucistitle(0x1d5)) 1412 printf("TITLE\n"); 1413 else 1414 printf("NOT TITLE\n"); 1415 1416 if (uciscomposite(0x1d5)) 1417 printf("COMPOSITE\n"); 1418 else 1419 printf("NOT COMPOSITE\n"); 1420 1421 if (ucdecomp(0x1d5, &lo, &dec)) { 1422 for (i = 0; i < lo; i++) 1423 printf("0x%04lx ", dec[i]); 1424 putchar('\n'); 1425 } 1426 1427 if ((lo = uccombining_class(0x41)) != 0) 1428 printf("0x41 CCL %ld\n", lo); 1429 1430 if (ucisxdigit(0xfeff)) 1431 printf("0xFEFF HEX DIGIT\n"); 1432 else 1433 printf("0xFEFF NOT HEX DIGIT\n"); 1434 1435 if (ucisdefined(0x10000)) 1436 printf("0x10000 DEFINED\n"); 1437 else 1438 printf("0x10000 NOT DEFINED\n"); 1439 1440 if (ucnumber_lookup(0x30, &num)) { 1441 if (num.denominator != 1) 1442 printf("UCNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator); 1443 else 1444 printf("UCNUMBER: 0x30 = %d\n", num.numerator); 1445 } else 1446 printf("UCNUMBER: 0x30 NOT A NUMBER\n"); 1447 1448 if (ucnumber_lookup(0xbc, &num)) { 1449 if (num.denominator != 1) 1450 printf("UCNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator); 1451 else 1452 printf("UCNUMBER: 0xbc = %d\n", num.numerator); 1453 } else 1454 printf("UCNUMBER: 0xbc NOT A NUMBER\n"); 1455 1456 1457 if (ucnumber_lookup(0xff19, &num)) { 1458 if (num.denominator != 1) 1459 printf("UCNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator); 1460 else 1461 printf("UCNUMBER: 0xff19 = %d\n", num.numerator); 1462 } else 1463 printf("UCNUMBER: 0xff19 NOT A NUMBER\n"); 1464 1465 if (ucnumber_lookup(0x4e00, &num)) { 1466 if (num.denominator != 1) 1467 printf("UCNUMBER: 0x4e00 = %d/%d\n", num.numerator, num.denominator); 1468 else 1469 printf("UCNUMBER: 0x4e00 = %d\n", num.numerator); 1470 } else 1471 printf("UCNUMBER: 0x4e00 NOT A NUMBER\n"); 1472 1473 if (ucdigit_lookup(0x06f9, &dig)) 1474 printf("UCDIGIT: 0x6f9 = %d\n", dig); 1475 else 1476 printf("UCDIGIT: 0x6f9 NOT A NUMBER\n"); 1477 1478 dig = ucgetdigit(0x0969); 1479 printf("UCGETDIGIT: 0x969 = %d\n", dig); 1480 1481 num = ucgetnumber(0x30); 1482 if (num.denominator != 1) 1483 printf("UCGETNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator); 1484 else 1485 printf("UCGETNUMBER: 0x30 = %d\n", num.numerator); 1486 1487 num = ucgetnumber(0xbc); 1488 if (num.denominator != 1) 1489 printf("UCGETNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator); 1490 else 1491 printf("UCGETNUMBER: 0xbc = %d\n", num.numerator); 1492 1493 num = ucgetnumber(0xff19); 1494 if (num.denominator != 1) 1495 printf("UCGETNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator); 1496 else 1497 printf("UCGETNUMBER: 0xff19 = %d\n", num.numerator); 1498 1499 /* ucdata_cleanup(); */ 1500 exit(0); 1501 } 1502 1503 #endif /* TEST */ 1504