1 /* $NetBSD: ucdata.c,v 1.1.1.6 2018/02/06 01:53:07 christos Exp $ */ 2 3 /* $OpenLDAP$ */ 4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>. 5 * 6 * Copyright 1998-2017 The OpenLDAP Foundation. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted only as authorized by the OpenLDAP 11 * Public License. 12 * 13 * A copy of this license is available in file LICENSE in the 14 * top-level directory of the distribution or, alternatively, at 15 * <http://www.OpenLDAP.org/license.html>. 16 */ 17 /* Copyright 2001 Computing Research Labs, New Mexico State University 18 * 19 * Permission is hereby granted, free of charge, to any person obtaining a 20 * copy of this software and associated documentation files (the "Software"), 21 * to deal in the Software without restriction, including without limitation 22 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 23 * and/or sell copies of the Software, and to permit persons to whom the 24 * Software is furnished to do so, subject to the following conditions: 25 * 26 * The above copyright notice and this permission notice shall be included in 27 * all copies or substantial portions of the Software. 28 * 29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 32 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY 33 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 34 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 35 * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 36 */ 37 /* Id: ucdata.c,v 1.4 2001/01/02 18:46:20 mleisher Exp " */ 38 39 #include <sys/cdefs.h> 40 __RCSID("$NetBSD: ucdata.c,v 1.1.1.6 2018/02/06 01:53:07 christos Exp $"); 41 42 #include "portable.h" 43 #include "ldap_config.h" 44 45 #include <stdio.h> 46 #include <ac/stdlib.h> 47 #include <ac/string.h> 48 #include <ac/unistd.h> 49 50 #include <ac/bytes.h> 51 52 #include "lber_pvt.h" 53 #include "ucdata.h" 54 55 #ifndef HARDCODE_DATA 56 #define HARDCODE_DATA 1 57 #endif 58 59 #if HARDCODE_DATA 60 #include "uctable.h" 61 #endif 62 63 /************************************************************************** 64 * 65 * Miscellaneous types, data, and support functions. 66 * 67 **************************************************************************/ 68 69 typedef struct { 70 ac_uint2 bom; 71 ac_uint2 cnt; 72 union { 73 ac_uint4 bytes; 74 ac_uint2 len[2]; 75 } size; 76 } _ucheader_t; 77 78 /* 79 * A simple array of 32-bit masks for lookup. 80 */ 81 static ac_uint4 masks32[32] = { 82 0x00000001UL, 0x00000002UL, 0x00000004UL, 0x00000008UL, 83 0x00000010UL, 0x00000020UL, 0x00000040UL, 0x00000080UL, 84 0x00000100UL, 0x00000200UL, 0x00000400UL, 0x00000800UL, 85 0x00001000UL, 0x00002000UL, 0x00004000UL, 0x00008000UL, 86 0x00010000UL, 0x00020000UL, 0x00040000UL, 0x00080000UL, 87 0x00100000UL, 0x00200000UL, 0x00400000UL, 0x00800000UL, 88 0x01000000UL, 0x02000000UL, 0x04000000UL, 0x08000000UL, 89 0x10000000UL, 0x20000000UL, 0x40000000UL, 0x80000000UL 90 }; 91 92 #define endian_short(cc) (((cc) >> 8) | (((cc) & 0xff) << 8)) 93 #define endian_long(cc) ((((cc) & 0xff) << 24)|((((cc) >> 8) & 0xff) << 16)|\ 94 ((((cc) >> 16) & 0xff) << 8)|((cc) >> 24)) 95 96 #if !HARDCODE_DATA 97 static FILE * 98 _ucopenfile(char *paths, char *filename, char *mode) 99 { 100 FILE *f; 101 char *fp, *dp, *pp, path[BUFSIZ]; 102 103 if (filename == 0 || *filename == 0) 104 return 0; 105 106 dp = paths; 107 while (dp && *dp) { 108 pp = path; 109 while (*dp && *dp != ':') 110 *pp++ = *dp++; 111 *pp++ = *LDAP_DIRSEP; 112 113 fp = filename; 114 while (*fp) 115 *pp++ = *fp++; 116 *pp = 0; 117 118 if ((f = fopen(path, mode)) != 0) 119 return f; 120 121 if (*dp == ':') 122 dp++; 123 } 124 125 return 0; 126 } 127 #endif 128 129 /************************************************************************** 130 * 131 * Support for the character properties. 132 * 133 **************************************************************************/ 134 135 #if !HARDCODE_DATA 136 137 static ac_uint4 _ucprop_size; 138 static ac_uint2 *_ucprop_offsets; 139 static ac_uint4 *_ucprop_ranges; 140 141 /* 142 * Return -1 on error, 0 if okay 143 */ 144 static int 145 _ucprop_load(char *paths, int reload) 146 { 147 FILE *in; 148 ac_uint4 size, i; 149 _ucheader_t hdr; 150 151 if (_ucprop_size > 0) { 152 if (!reload) 153 /* 154 * The character properties have already been loaded. 155 */ 156 return 0; 157 158 /* 159 * Unload the current character property data in preparation for 160 * loading a new copy. Only the first array has to be deallocated 161 * because all the memory for the arrays is allocated as a single 162 * block. 163 */ 164 free((char *) _ucprop_offsets); 165 _ucprop_size = 0; 166 } 167 168 if ((in = _ucopenfile(paths, "ctype.dat", "rb")) == 0) 169 return -1; 170 171 /* 172 * Load the header. 173 */ 174 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 175 176 if (hdr.bom == 0xfffe) { 177 hdr.cnt = endian_short(hdr.cnt); 178 hdr.size.bytes = endian_long(hdr.size.bytes); 179 } 180 181 if ((_ucprop_size = hdr.cnt) == 0) { 182 fclose(in); 183 return -1; 184 } 185 186 /* 187 * Allocate all the storage needed for the lookup table. 188 */ 189 _ucprop_offsets = (ac_uint2 *) malloc(hdr.size.bytes); 190 191 /* 192 * Calculate the offset into the storage for the ranges. The offsets 193 * array is on a 4-byte boundary and one larger than the value provided in 194 * the header count field. This means the offset to the ranges must be 195 * calculated after aligning the count to a 4-byte boundary. 196 */ 197 if ((size = ((hdr.cnt + 1) * sizeof(ac_uint2))) & 3) 198 size += 4 - (size & 3); 199 size >>= 1; 200 _ucprop_ranges = (ac_uint4 *) (_ucprop_offsets + size); 201 202 /* 203 * Load the offset array. 204 */ 205 fread((char *) _ucprop_offsets, sizeof(ac_uint2), size, in); 206 207 /* 208 * Do an endian swap if necessary. Don't forget there is an extra node on 209 * the end with the final index. 210 */ 211 if (hdr.bom == 0xfffe) { 212 for (i = 0; i <= _ucprop_size; i++) 213 _ucprop_offsets[i] = endian_short(_ucprop_offsets[i]); 214 } 215 216 /* 217 * Load the ranges. The number of elements is in the last array position 218 * of the offsets. 219 */ 220 fread((char *) _ucprop_ranges, sizeof(ac_uint4), 221 _ucprop_offsets[_ucprop_size], in); 222 223 fclose(in); 224 225 /* 226 * Do an endian swap if necessary. 227 */ 228 if (hdr.bom == 0xfffe) { 229 for (i = 0; i < _ucprop_offsets[_ucprop_size]; i++) 230 _ucprop_ranges[i] = endian_long(_ucprop_ranges[i]); 231 } 232 return 0; 233 } 234 235 static void 236 _ucprop_unload(void) 237 { 238 if (_ucprop_size == 0) 239 return; 240 241 /* 242 * Only need to free the offsets because the memory is allocated as a 243 * single block. 244 */ 245 free((char *) _ucprop_offsets); 246 _ucprop_size = 0; 247 } 248 #endif 249 250 static int 251 _ucprop_lookup(ac_uint4 code, ac_uint4 n) 252 { 253 long l, r, m; 254 255 if (_ucprop_size == 0) 256 return 0; 257 258 /* 259 * There is an extra node on the end of the offsets to allow this routine 260 * to work right. If the index is 0xffff, then there are no nodes for the 261 * property. 262 */ 263 if ((l = _ucprop_offsets[n]) == 0xffff) 264 return 0; 265 266 /* 267 * Locate the next offset that is not 0xffff. The sentinel at the end of 268 * the array is the max index value. 269 */ 270 for (m = 1; 271 n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++) ; 272 273 r = _ucprop_offsets[n + m] - 1; 274 275 while (l <= r) { 276 /* 277 * Determine a "mid" point and adjust to make sure the mid point is at 278 * the beginning of a range pair. 279 */ 280 m = (l + r) >> 1; 281 m -= (m & 1); 282 if (code > _ucprop_ranges[m + 1]) 283 l = m + 2; 284 else if (code < _ucprop_ranges[m]) 285 r = m - 2; 286 else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1]) 287 return 1; 288 } 289 return 0; 290 } 291 292 int 293 ucisprop(ac_uint4 code, ac_uint4 mask1, ac_uint4 mask2) 294 { 295 ac_uint4 i; 296 297 if (mask1 == 0 && mask2 == 0) 298 return 0; 299 300 for (i = 0; mask1 && i < 32; i++) { 301 if ((mask1 & masks32[i]) && _ucprop_lookup(code, i)) 302 return 1; 303 } 304 305 for (i = 32; mask2 && i < _ucprop_size; i++) { 306 if ((mask2 & masks32[i & 31]) && _ucprop_lookup(code, i)) 307 return 1; 308 } 309 310 return 0; 311 } 312 313 /************************************************************************** 314 * 315 * Support for case mapping. 316 * 317 **************************************************************************/ 318 319 #if !HARDCODE_DATA 320 321 /* These record the number of slots in the map. 322 * There are 3 words per slot. 323 */ 324 static ac_uint4 _uccase_size; 325 static ac_uint2 _uccase_len[2]; 326 static ac_uint4 *_uccase_map; 327 328 /* 329 * Return -1 on error, 0 if okay 330 */ 331 static int 332 _uccase_load(char *paths, int reload) 333 { 334 FILE *in; 335 ac_uint4 i; 336 _ucheader_t hdr; 337 338 if (_uccase_size > 0) { 339 if (!reload) 340 /* 341 * The case mappings have already been loaded. 342 */ 343 return 0; 344 345 free((char *) _uccase_map); 346 _uccase_size = 0; 347 } 348 349 if ((in = _ucopenfile(paths, "case.dat", "rb")) == 0) 350 return -1; 351 352 /* 353 * Load the header. 354 */ 355 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 356 357 if (hdr.bom == 0xfffe) { 358 hdr.cnt = endian_short(hdr.cnt); 359 hdr.size.len[0] = endian_short(hdr.size.len[0]); 360 hdr.size.len[1] = endian_short(hdr.size.len[1]); 361 } 362 363 /* 364 * Set the node count and lengths of the upper and lower case mapping 365 * tables. 366 */ 367 _uccase_size = hdr.cnt; 368 _uccase_len[0] = hdr.size.len[0]; 369 _uccase_len[1] = hdr.size.len[1]; 370 371 _uccase_map = (ac_uint4 *) 372 malloc(_uccase_size * 3 * sizeof(ac_uint4)); 373 374 /* 375 * Load the case mapping table. 376 */ 377 fread((char *) _uccase_map, sizeof(ac_uint4), _uccase_size * 3, in); 378 379 /* 380 * Do an endian swap if necessary. 381 */ 382 if (hdr.bom == 0xfffe) { 383 for (i = 0; i < _uccase_size * 3; i++) 384 _uccase_map[i] = endian_long(_uccase_map[i]); 385 } 386 fclose(in); 387 return 0; 388 } 389 390 static void 391 _uccase_unload(void) 392 { 393 if (_uccase_size == 0) 394 return; 395 396 free((char *) _uccase_map); 397 _uccase_size = 0; 398 } 399 #endif 400 401 static ac_uint4 402 _uccase_lookup(ac_uint4 code, long l, long r, int field) 403 { 404 long m; 405 const ac_uint4 *tmp; 406 407 /* 408 * Do the binary search. 409 */ 410 while (l <= r) { 411 /* 412 * Determine a "mid" point and adjust to make sure the mid point is at 413 * the beginning of a case mapping triple. 414 */ 415 m = (l + r) >> 1; 416 tmp = &_uccase_map[m*3]; 417 if (code > *tmp) 418 l = m + 1; 419 else if (code < *tmp) 420 r = m - 1; 421 else if (code == *tmp) 422 return tmp[field]; 423 } 424 425 return code; 426 } 427 428 ac_uint4 429 uctoupper(ac_uint4 code) 430 { 431 int field; 432 long l, r; 433 434 if (ucisupper(code)) 435 return code; 436 437 if (ucislower(code)) { 438 /* 439 * The character is lower case. 440 */ 441 field = 2; 442 l = _uccase_len[0]; 443 r = (l + _uccase_len[1]) - 1; 444 } else { 445 /* 446 * The character is title case. 447 */ 448 field = 1; 449 l = _uccase_len[0] + _uccase_len[1]; 450 r = _uccase_size - 1; 451 } 452 return _uccase_lookup(code, l, r, field); 453 } 454 455 ac_uint4 456 uctolower(ac_uint4 code) 457 { 458 int field; 459 long l, r; 460 461 if (ucislower(code)) 462 return code; 463 464 if (ucisupper(code)) { 465 /* 466 * The character is upper case. 467 */ 468 field = 1; 469 l = 0; 470 r = _uccase_len[0] - 1; 471 } else { 472 /* 473 * The character is title case. 474 */ 475 field = 2; 476 l = _uccase_len[0] + _uccase_len[1]; 477 r = _uccase_size - 1; 478 } 479 return _uccase_lookup(code, l, r, field); 480 } 481 482 ac_uint4 483 uctotitle(ac_uint4 code) 484 { 485 int field; 486 long l, r; 487 488 if (ucistitle(code)) 489 return code; 490 491 /* 492 * The offset will always be the same for converting to title case. 493 */ 494 field = 2; 495 496 if (ucisupper(code)) { 497 /* 498 * The character is upper case. 499 */ 500 l = 0; 501 r = _uccase_len[0] - 1; 502 } else { 503 /* 504 * The character is lower case. 505 */ 506 l = _uccase_len[0]; 507 r = (l + _uccase_len[1]) - 1; 508 } 509 return _uccase_lookup(code, l, r, field); 510 } 511 512 /************************************************************************** 513 * 514 * Support for compositions. 515 * 516 **************************************************************************/ 517 518 #if !HARDCODE_DATA 519 520 static ac_uint4 _uccomp_size; 521 static ac_uint4 *_uccomp_data; 522 523 /* 524 * Return -1 on error, 0 if okay 525 */ 526 static int 527 _uccomp_load(char *paths, int reload) 528 { 529 FILE *in; 530 ac_uint4 size, i; 531 _ucheader_t hdr; 532 533 if (_uccomp_size > 0) { 534 if (!reload) 535 /* 536 * The compositions have already been loaded. 537 */ 538 return 0; 539 540 free((char *) _uccomp_data); 541 _uccomp_size = 0; 542 } 543 544 if ((in = _ucopenfile(paths, "comp.dat", "rb")) == 0) 545 return -1; 546 547 /* 548 * Load the header. 549 */ 550 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 551 552 if (hdr.bom == 0xfffe) { 553 hdr.cnt = endian_short(hdr.cnt); 554 hdr.size.bytes = endian_long(hdr.size.bytes); 555 } 556 557 _uccomp_size = hdr.cnt; 558 _uccomp_data = (ac_uint4 *) malloc(hdr.size.bytes); 559 560 /* 561 * Read the composition data in. 562 */ 563 size = hdr.size.bytes / sizeof(ac_uint4); 564 fread((char *) _uccomp_data, sizeof(ac_uint4), size, in); 565 566 /* 567 * Do an endian swap if necessary. 568 */ 569 if (hdr.bom == 0xfffe) { 570 for (i = 0; i < size; i++) 571 _uccomp_data[i] = endian_long(_uccomp_data[i]); 572 } 573 574 /* 575 * Assume that the data is ordered on count, so that all compositions 576 * of length 2 come first. Only handling length 2 for now. 577 */ 578 for (i = 1; i < size; i += 4) 579 if (_uccomp_data[i] != 2) 580 break; 581 _uccomp_size = i - 1; 582 583 fclose(in); 584 return 0; 585 } 586 587 static void 588 _uccomp_unload(void) 589 { 590 if (_uccomp_size == 0) 591 return; 592 593 free((char *) _uccomp_data); 594 _uccomp_size = 0; 595 } 596 #endif 597 598 int 599 uccomp(ac_uint4 node1, ac_uint4 node2, ac_uint4 *comp) 600 { 601 int l, r, m; 602 603 l = 0; 604 r = _uccomp_size - 1; 605 606 while (l <= r) { 607 m = ((r + l) >> 1); 608 m -= m & 3; 609 if (node1 > _uccomp_data[m+2]) 610 l = m + 4; 611 else if (node1 < _uccomp_data[m+2]) 612 r = m - 4; 613 else if (node2 > _uccomp_data[m+3]) 614 l = m + 4; 615 else if (node2 < _uccomp_data[m+3]) 616 r = m - 4; 617 else { 618 *comp = _uccomp_data[m]; 619 return 1; 620 } 621 } 622 return 0; 623 } 624 625 int 626 uccomp_hangul(ac_uint4 *str, int len) 627 { 628 const int SBase = 0xAC00, LBase = 0x1100, 629 VBase = 0x1161, TBase = 0x11A7, 630 LCount = 19, VCount = 21, TCount = 28, 631 NCount = VCount * TCount, /* 588 */ 632 SCount = LCount * NCount; /* 11172 */ 633 634 int i, rlen; 635 ac_uint4 ch, last, lindex, sindex; 636 637 last = str[0]; 638 rlen = 1; 639 for ( i = 1; i < len; i++ ) { 640 ch = str[i]; 641 642 /* check if two current characters are L and V */ 643 lindex = last - LBase; 644 if (lindex < (ac_uint4) LCount) { 645 ac_uint4 vindex = ch - VBase; 646 if (vindex < (ac_uint4) VCount) { 647 /* make syllable of form LV */ 648 last = SBase + (lindex * VCount + vindex) * TCount; 649 str[rlen-1] = last; /* reset last */ 650 continue; 651 } 652 } 653 654 /* check if two current characters are LV and T */ 655 sindex = last - SBase; 656 if (sindex < (ac_uint4) SCount 657 && (sindex % TCount) == 0) 658 { 659 ac_uint4 tindex = ch - TBase; 660 if (tindex <= (ac_uint4) TCount) { 661 /* make syllable of form LVT */ 662 last += tindex; 663 str[rlen-1] = last; /* reset last */ 664 continue; 665 } 666 } 667 668 /* if neither case was true, just add the character */ 669 last = ch; 670 str[rlen] = ch; 671 rlen++; 672 } 673 return rlen; 674 } 675 676 int 677 uccanoncomp(ac_uint4 *str, int len) 678 { 679 int i, stpos, copos; 680 ac_uint4 cl, prevcl, st, ch, co; 681 682 st = str[0]; 683 stpos = 0; 684 copos = 1; 685 prevcl = uccombining_class(st) == 0 ? 0 : 256; 686 687 for (i = 1; i < len; i++) { 688 ch = str[i]; 689 cl = uccombining_class(ch); 690 if (uccomp(st, ch, &co) && (prevcl < cl || prevcl == 0)) 691 st = str[stpos] = co; 692 else { 693 if (cl == 0) { 694 stpos = copos; 695 st = ch; 696 } 697 prevcl = cl; 698 str[copos++] = ch; 699 } 700 } 701 702 return uccomp_hangul(str, copos); 703 } 704 705 /************************************************************************** 706 * 707 * Support for decompositions. 708 * 709 **************************************************************************/ 710 711 #if !HARDCODE_DATA 712 713 static ac_uint4 _ucdcmp_size; 714 static ac_uint4 *_ucdcmp_nodes; 715 static ac_uint4 *_ucdcmp_decomp; 716 717 static ac_uint4 _uckdcmp_size; 718 static ac_uint4 *_uckdcmp_nodes; 719 static ac_uint4 *_uckdcmp_decomp; 720 721 /* 722 * Return -1 on error, 0 if okay 723 */ 724 static int 725 _ucdcmp_load(char *paths, int reload) 726 { 727 FILE *in; 728 ac_uint4 size, i; 729 _ucheader_t hdr; 730 731 if (_ucdcmp_size > 0) { 732 if (!reload) 733 /* 734 * The decompositions have already been loaded. 735 */ 736 return 0; 737 738 free((char *) _ucdcmp_nodes); 739 _ucdcmp_size = 0; 740 } 741 742 if ((in = _ucopenfile(paths, "decomp.dat", "rb")) == 0) 743 return -1; 744 745 /* 746 * Load the header. 747 */ 748 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 749 750 if (hdr.bom == 0xfffe) { 751 hdr.cnt = endian_short(hdr.cnt); 752 hdr.size.bytes = endian_long(hdr.size.bytes); 753 } 754 755 _ucdcmp_size = hdr.cnt << 1; 756 _ucdcmp_nodes = (ac_uint4 *) malloc(hdr.size.bytes); 757 _ucdcmp_decomp = _ucdcmp_nodes + (_ucdcmp_size + 1); 758 759 /* 760 * Read the decomposition data in. 761 */ 762 size = hdr.size.bytes / sizeof(ac_uint4); 763 fread((char *) _ucdcmp_nodes, sizeof(ac_uint4), size, in); 764 765 /* 766 * Do an endian swap if necessary. 767 */ 768 if (hdr.bom == 0xfffe) { 769 for (i = 0; i < size; i++) 770 _ucdcmp_nodes[i] = endian_long(_ucdcmp_nodes[i]); 771 } 772 fclose(in); 773 return 0; 774 } 775 776 /* 777 * Return -1 on error, 0 if okay 778 */ 779 static int 780 _uckdcmp_load(char *paths, int reload) 781 { 782 FILE *in; 783 ac_uint4 size, i; 784 _ucheader_t hdr; 785 786 if (_uckdcmp_size > 0) { 787 if (!reload) 788 /* 789 * The decompositions have already been loaded. 790 */ 791 return 0; 792 793 free((char *) _uckdcmp_nodes); 794 _uckdcmp_size = 0; 795 } 796 797 if ((in = _ucopenfile(paths, "kdecomp.dat", "rb")) == 0) 798 return -1; 799 800 /* 801 * Load the header. 802 */ 803 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 804 805 if (hdr.bom == 0xfffe) { 806 hdr.cnt = endian_short(hdr.cnt); 807 hdr.size.bytes = endian_long(hdr.size.bytes); 808 } 809 810 _uckdcmp_size = hdr.cnt << 1; 811 _uckdcmp_nodes = (ac_uint4 *) malloc(hdr.size.bytes); 812 _uckdcmp_decomp = _uckdcmp_nodes + (_uckdcmp_size + 1); 813 814 /* 815 * Read the decomposition data in. 816 */ 817 size = hdr.size.bytes / sizeof(ac_uint4); 818 fread((char *) _uckdcmp_nodes, sizeof(ac_uint4), size, in); 819 820 /* 821 * Do an endian swap if necessary. 822 */ 823 if (hdr.bom == 0xfffe) { 824 for (i = 0; i < size; i++) 825 _uckdcmp_nodes[i] = endian_long(_uckdcmp_nodes[i]); 826 } 827 fclose(in); 828 return 0; 829 } 830 831 static void 832 _ucdcmp_unload(void) 833 { 834 if (_ucdcmp_size == 0) 835 return; 836 837 /* 838 * Only need to free the offsets because the memory is allocated as a 839 * single block. 840 */ 841 free((char *) _ucdcmp_nodes); 842 _ucdcmp_size = 0; 843 } 844 845 static void 846 _uckdcmp_unload(void) 847 { 848 if (_uckdcmp_size == 0) 849 return; 850 851 /* 852 * Only need to free the offsets because the memory is allocated as a 853 * single block. 854 */ 855 free((char *) _uckdcmp_nodes); 856 _uckdcmp_size = 0; 857 } 858 #endif 859 860 int 861 ucdecomp(ac_uint4 code, ac_uint4 *num, ac_uint4 **decomp) 862 { 863 long l, r, m; 864 865 if (code < _ucdcmp_nodes[0]) { 866 return 0; 867 } 868 869 l = 0; 870 r = _ucdcmp_nodes[_ucdcmp_size] - 1; 871 872 while (l <= r) { 873 /* 874 * Determine a "mid" point and adjust to make sure the mid point is at 875 * the beginning of a code+offset pair. 876 */ 877 m = (l + r) >> 1; 878 m -= (m & 1); 879 if (code > _ucdcmp_nodes[m]) 880 l = m + 2; 881 else if (code < _ucdcmp_nodes[m]) 882 r = m - 2; 883 else if (code == _ucdcmp_nodes[m]) { 884 *num = _ucdcmp_nodes[m + 3] - _ucdcmp_nodes[m + 1]; 885 *decomp = (ac_uint4*)&_ucdcmp_decomp[_ucdcmp_nodes[m + 1]]; 886 return 1; 887 } 888 } 889 return 0; 890 } 891 892 int 893 uckdecomp(ac_uint4 code, ac_uint4 *num, ac_uint4 **decomp) 894 { 895 long l, r, m; 896 897 if (code < _uckdcmp_nodes[0]) { 898 return 0; 899 } 900 901 l = 0; 902 r = _uckdcmp_nodes[_uckdcmp_size] - 1; 903 904 while (l <= r) { 905 /* 906 * Determine a "mid" point and adjust to make sure the mid point is at 907 * the beginning of a code+offset pair. 908 */ 909 m = (l + r) >> 1; 910 m -= (m & 1); 911 if (code > _uckdcmp_nodes[m]) 912 l = m + 2; 913 else if (code < _uckdcmp_nodes[m]) 914 r = m - 2; 915 else if (code == _uckdcmp_nodes[m]) { 916 *num = _uckdcmp_nodes[m + 3] - _uckdcmp_nodes[m + 1]; 917 *decomp = (ac_uint4*)&_uckdcmp_decomp[_uckdcmp_nodes[m + 1]]; 918 return 1; 919 } 920 } 921 return 0; 922 } 923 924 int 925 ucdecomp_hangul(ac_uint4 code, ac_uint4 *num, ac_uint4 decomp[]) 926 { 927 if (!ucishangul(code)) 928 return 0; 929 930 code -= 0xac00; 931 decomp[0] = 0x1100 + (ac_uint4) (code / 588); 932 decomp[1] = 0x1161 + (ac_uint4) ((code % 588) / 28); 933 decomp[2] = 0x11a7 + (ac_uint4) (code % 28); 934 *num = (decomp[2] != 0x11a7) ? 3 : 2; 935 936 return 1; 937 } 938 939 /* mode == 0 for canonical, mode == 1 for compatibility */ 940 static int 941 uccanoncompatdecomp(const ac_uint4 *in, int inlen, 942 ac_uint4 **out, int *outlen, short mode, void *ctx) 943 { 944 int l, size; 945 unsigned i, j, k; 946 ac_uint4 num, class, *decomp, hangdecomp[3]; 947 948 size = inlen * 2; 949 *out = (ac_uint4 *) ber_memalloc_x(size * sizeof(**out), ctx); 950 if (*out == NULL) 951 return *outlen = -1; 952 953 i = 0; 954 for (j = 0; j < (unsigned) inlen; j++) { 955 if (mode ? uckdecomp(in[j], &num, &decomp) : ucdecomp(in[j], &num, &decomp)) { 956 if ( size - i < num) { 957 size = inlen + i - j + num - 1; 958 *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx ); 959 if (*out == NULL) 960 return *outlen = -1; 961 } 962 for (k = 0; k < num; k++) { 963 class = uccombining_class(decomp[k]); 964 if (class == 0) { 965 (*out)[i] = decomp[k]; 966 } else { 967 for (l = i; l > 0; l--) 968 if (class >= uccombining_class((*out)[l-1])) 969 break; 970 AC_MEMCPY(*out + l + 1, *out + l, (i - l) * sizeof(**out)); 971 (*out)[l] = decomp[k]; 972 } 973 i++; 974 } 975 } else if (ucdecomp_hangul(in[j], &num, hangdecomp)) { 976 if (size - i < num) { 977 size = inlen + i - j + num - 1; 978 *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx); 979 if (*out == NULL) 980 return *outlen = -1; 981 } 982 for (k = 0; k < num; k++) { 983 (*out)[i] = hangdecomp[k]; 984 i++; 985 } 986 } else { 987 if (size - i < 1) { 988 size = inlen + i - j; 989 *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx); 990 if (*out == NULL) 991 return *outlen = -1; 992 } 993 class = uccombining_class(in[j]); 994 if (class == 0) { 995 (*out)[i] = in[j]; 996 } else { 997 for (l = i; l > 0; l--) 998 if (class >= uccombining_class((*out)[l-1])) 999 break; 1000 AC_MEMCPY(*out + l + 1, *out + l, (i - l) * sizeof(**out)); 1001 (*out)[l] = in[j]; 1002 } 1003 i++; 1004 } 1005 } 1006 return *outlen = i; 1007 } 1008 1009 int 1010 uccanondecomp(const ac_uint4 *in, int inlen, 1011 ac_uint4 **out, int *outlen, void *ctx) 1012 { 1013 return uccanoncompatdecomp(in, inlen, out, outlen, 0, ctx); 1014 } 1015 1016 int 1017 uccompatdecomp(const ac_uint4 *in, int inlen, 1018 ac_uint4 **out, int *outlen, void *ctx) 1019 { 1020 return uccanoncompatdecomp(in, inlen, out, outlen, 1, ctx); 1021 } 1022 1023 /************************************************************************** 1024 * 1025 * Support for combining classes. 1026 * 1027 **************************************************************************/ 1028 1029 #if !HARDCODE_DATA 1030 static ac_uint4 _uccmcl_size; 1031 static ac_uint4 *_uccmcl_nodes; 1032 1033 /* 1034 * Return -1 on error, 0 if okay 1035 */ 1036 static int 1037 _uccmcl_load(char *paths, int reload) 1038 { 1039 FILE *in; 1040 ac_uint4 i; 1041 _ucheader_t hdr; 1042 1043 if (_uccmcl_size > 0) { 1044 if (!reload) 1045 /* 1046 * The combining classes have already been loaded. 1047 */ 1048 return 0; 1049 1050 free((char *) _uccmcl_nodes); 1051 _uccmcl_size = 0; 1052 } 1053 1054 if ((in = _ucopenfile(paths, "cmbcl.dat", "rb")) == 0) 1055 return -1; 1056 1057 /* 1058 * Load the header. 1059 */ 1060 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 1061 1062 if (hdr.bom == 0xfffe) { 1063 hdr.cnt = endian_short(hdr.cnt); 1064 hdr.size.bytes = endian_long(hdr.size.bytes); 1065 } 1066 1067 _uccmcl_size = hdr.cnt * 3; 1068 _uccmcl_nodes = (ac_uint4 *) malloc(hdr.size.bytes); 1069 1070 /* 1071 * Read the combining classes in. 1072 */ 1073 fread((char *) _uccmcl_nodes, sizeof(ac_uint4), _uccmcl_size, in); 1074 1075 /* 1076 * Do an endian swap if necessary. 1077 */ 1078 if (hdr.bom == 0xfffe) { 1079 for (i = 0; i < _uccmcl_size; i++) 1080 _uccmcl_nodes[i] = endian_long(_uccmcl_nodes[i]); 1081 } 1082 fclose(in); 1083 return 0; 1084 } 1085 1086 static void 1087 _uccmcl_unload(void) 1088 { 1089 if (_uccmcl_size == 0) 1090 return; 1091 1092 free((char *) _uccmcl_nodes); 1093 _uccmcl_size = 0; 1094 } 1095 #endif 1096 1097 ac_uint4 1098 uccombining_class(ac_uint4 code) 1099 { 1100 long l, r, m; 1101 1102 l = 0; 1103 r = _uccmcl_size - 1; 1104 1105 while (l <= r) { 1106 m = (l + r) >> 1; 1107 m -= (m % 3); 1108 if (code > _uccmcl_nodes[m + 1]) 1109 l = m + 3; 1110 else if (code < _uccmcl_nodes[m]) 1111 r = m - 3; 1112 else if (code >= _uccmcl_nodes[m] && code <= _uccmcl_nodes[m + 1]) 1113 return _uccmcl_nodes[m + 2]; 1114 } 1115 return 0; 1116 } 1117 1118 /************************************************************************** 1119 * 1120 * Support for numeric values. 1121 * 1122 **************************************************************************/ 1123 1124 #if !HARDCODE_DATA 1125 static ac_uint4 *_ucnum_nodes; 1126 static ac_uint4 _ucnum_size; 1127 static short *_ucnum_vals; 1128 1129 /* 1130 * Return -1 on error, 0 if okay 1131 */ 1132 static int 1133 _ucnumb_load(char *paths, int reload) 1134 { 1135 FILE *in; 1136 ac_uint4 size, i; 1137 _ucheader_t hdr; 1138 1139 if (_ucnum_size > 0) { 1140 if (!reload) 1141 /* 1142 * The numbers have already been loaded. 1143 */ 1144 return 0; 1145 1146 free((char *) _ucnum_nodes); 1147 _ucnum_size = 0; 1148 } 1149 1150 if ((in = _ucopenfile(paths, "num.dat", "rb")) == 0) 1151 return -1; 1152 1153 /* 1154 * Load the header. 1155 */ 1156 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 1157 1158 if (hdr.bom == 0xfffe) { 1159 hdr.cnt = endian_short(hdr.cnt); 1160 hdr.size.bytes = endian_long(hdr.size.bytes); 1161 } 1162 1163 _ucnum_size = hdr.cnt; 1164 _ucnum_nodes = (ac_uint4 *) malloc(hdr.size.bytes); 1165 _ucnum_vals = (short *) (_ucnum_nodes + _ucnum_size); 1166 1167 /* 1168 * Read the combining classes in. 1169 */ 1170 fread((char *) _ucnum_nodes, sizeof(unsigned char), hdr.size.bytes, in); 1171 1172 /* 1173 * Do an endian swap if necessary. 1174 */ 1175 if (hdr.bom == 0xfffe) { 1176 for (i = 0; i < _ucnum_size; i++) 1177 _ucnum_nodes[i] = endian_long(_ucnum_nodes[i]); 1178 1179 /* 1180 * Determine the number of values that have to be adjusted. 1181 */ 1182 size = (hdr.size.bytes - 1183 (_ucnum_size * (sizeof(ac_uint4) << 1))) / 1184 sizeof(short); 1185 1186 for (i = 0; i < size; i++) 1187 _ucnum_vals[i] = endian_short(_ucnum_vals[i]); 1188 } 1189 fclose(in); 1190 return 0; 1191 } 1192 1193 static void 1194 _ucnumb_unload(void) 1195 { 1196 if (_ucnum_size == 0) 1197 return; 1198 1199 free((char *) _ucnum_nodes); 1200 _ucnum_size = 0; 1201 } 1202 #endif 1203 1204 int 1205 ucnumber_lookup(ac_uint4 code, struct ucnumber *num) 1206 { 1207 long l, r, m; 1208 short *vp; 1209 1210 l = 0; 1211 r = _ucnum_size - 1; 1212 while (l <= r) { 1213 /* 1214 * Determine a "mid" point and adjust to make sure the mid point is at 1215 * the beginning of a code+offset pair. 1216 */ 1217 m = (l + r) >> 1; 1218 m -= (m & 1); 1219 if (code > _ucnum_nodes[m]) 1220 l = m + 2; 1221 else if (code < _ucnum_nodes[m]) 1222 r = m - 2; 1223 else { 1224 vp = (short *)_ucnum_vals + _ucnum_nodes[m + 1]; 1225 num->numerator = (int) *vp++; 1226 num->denominator = (int) *vp; 1227 return 1; 1228 } 1229 } 1230 return 0; 1231 } 1232 1233 int 1234 ucdigit_lookup(ac_uint4 code, int *digit) 1235 { 1236 long l, r, m; 1237 short *vp; 1238 1239 l = 0; 1240 r = _ucnum_size - 1; 1241 while (l <= r) { 1242 /* 1243 * Determine a "mid" point and adjust to make sure the mid point is at 1244 * the beginning of a code+offset pair. 1245 */ 1246 m = (l + r) >> 1; 1247 m -= (m & 1); 1248 if (code > _ucnum_nodes[m]) 1249 l = m + 2; 1250 else if (code < _ucnum_nodes[m]) 1251 r = m - 2; 1252 else { 1253 vp = (short *)_ucnum_vals + _ucnum_nodes[m + 1]; 1254 if (*vp == *(vp + 1)) { 1255 *digit = *vp; 1256 return 1; 1257 } 1258 return 0; 1259 } 1260 } 1261 return 0; 1262 } 1263 1264 struct ucnumber 1265 ucgetnumber(ac_uint4 code) 1266 { 1267 struct ucnumber num; 1268 1269 /* 1270 * Initialize with some arbitrary value, because the caller simply cannot 1271 * tell for sure if the code is a number without calling the ucisnumber() 1272 * macro before calling this function. 1273 */ 1274 num.numerator = num.denominator = -111; 1275 1276 (void) ucnumber_lookup(code, &num); 1277 1278 return num; 1279 } 1280 1281 int 1282 ucgetdigit(ac_uint4 code) 1283 { 1284 int dig; 1285 1286 /* 1287 * Initialize with some arbitrary value, because the caller simply cannot 1288 * tell for sure if the code is a number without calling the ucisdigit() 1289 * macro before calling this function. 1290 */ 1291 dig = -111; 1292 1293 (void) ucdigit_lookup(code, &dig); 1294 1295 return dig; 1296 } 1297 1298 /************************************************************************** 1299 * 1300 * Setup and cleanup routines. 1301 * 1302 **************************************************************************/ 1303 1304 #if HARDCODE_DATA 1305 int ucdata_load(char *paths, int masks) { return 0; } 1306 void ucdata_unload(int masks) { } 1307 int ucdata_reload(char *paths, int masks) { return 0; } 1308 #else 1309 /* 1310 * Return 0 if okay, negative on error 1311 */ 1312 int 1313 ucdata_load(char *paths, int masks) 1314 { 1315 int error = 0; 1316 1317 if (masks & UCDATA_CTYPE) 1318 error |= _ucprop_load(paths, 0) < 0 ? UCDATA_CTYPE : 0; 1319 if (masks & UCDATA_CASE) 1320 error |= _uccase_load(paths, 0) < 0 ? UCDATA_CASE : 0; 1321 if (masks & UCDATA_DECOMP) 1322 error |= _ucdcmp_load(paths, 0) < 0 ? UCDATA_DECOMP : 0; 1323 if (masks & UCDATA_CMBCL) 1324 error |= _uccmcl_load(paths, 0) < 0 ? UCDATA_CMBCL : 0; 1325 if (masks & UCDATA_NUM) 1326 error |= _ucnumb_load(paths, 0) < 0 ? UCDATA_NUM : 0; 1327 if (masks & UCDATA_COMP) 1328 error |= _uccomp_load(paths, 0) < 0 ? UCDATA_COMP : 0; 1329 if (masks & UCDATA_KDECOMP) 1330 error |= _uckdcmp_load(paths, 0) < 0 ? UCDATA_KDECOMP : 0; 1331 1332 return -error; 1333 } 1334 1335 void 1336 ucdata_unload(int masks) 1337 { 1338 if (masks & UCDATA_CTYPE) 1339 _ucprop_unload(); 1340 if (masks & UCDATA_CASE) 1341 _uccase_unload(); 1342 if (masks & UCDATA_DECOMP) 1343 _ucdcmp_unload(); 1344 if (masks & UCDATA_CMBCL) 1345 _uccmcl_unload(); 1346 if (masks & UCDATA_NUM) 1347 _ucnumb_unload(); 1348 if (masks & UCDATA_COMP) 1349 _uccomp_unload(); 1350 if (masks & UCDATA_KDECOMP) 1351 _uckdcmp_unload(); 1352 } 1353 1354 /* 1355 * Return 0 if okay, negative on error 1356 */ 1357 int 1358 ucdata_reload(char *paths, int masks) 1359 { 1360 int error = 0; 1361 1362 if (masks & UCDATA_CTYPE) 1363 error |= _ucprop_load(paths, 1) < 0 ? UCDATA_CTYPE : 0; 1364 if (masks & UCDATA_CASE) 1365 error |= _uccase_load(paths, 1) < 0 ? UCDATA_CASE : 0; 1366 if (masks & UCDATA_DECOMP) 1367 error |= _ucdcmp_load(paths, 1) < 0 ? UCDATA_DECOMP : 0; 1368 if (masks & UCDATA_CMBCL) 1369 error |= _uccmcl_load(paths, 1) < 0 ? UCDATA_CMBCL : 0; 1370 if (masks & UCDATA_NUM) 1371 error |= _ucnumb_load(paths, 1) < 0 ? UCDATA_NUM : 0; 1372 if (masks & UCDATA_COMP) 1373 error |= _uccomp_load(paths, 1) < 0 ? UCDATA_COMP : 0; 1374 if (masks & UCDATA_KDECOMP) 1375 error |= _uckdcmp_load(paths, 1) < 0 ? UCDATA_KDECOMP : 0; 1376 1377 return -error; 1378 } 1379 #endif 1380 1381 #ifdef TEST 1382 1383 void 1384 main(void) 1385 { 1386 int dig; 1387 ac_uint4 i, lo, *dec; 1388 struct ucnumber num; 1389 1390 /* ucdata_setup("."); */ 1391 1392 if (ucisweak(0x30)) 1393 printf("WEAK\n"); 1394 else 1395 printf("NOT WEAK\n"); 1396 1397 printf("LOWER 0x%04lX\n", uctolower(0xff3a)); 1398 printf("UPPER 0x%04lX\n", uctoupper(0xff5a)); 1399 1400 if (ucisalpha(0x1d5)) 1401 printf("ALPHA\n"); 1402 else 1403 printf("NOT ALPHA\n"); 1404 1405 if (ucisupper(0x1d5)) { 1406 printf("UPPER\n"); 1407 lo = uctolower(0x1d5); 1408 printf("0x%04lx\n", lo); 1409 lo = uctotitle(0x1d5); 1410 printf("0x%04lx\n", lo); 1411 } else 1412 printf("NOT UPPER\n"); 1413 1414 if (ucistitle(0x1d5)) 1415 printf("TITLE\n"); 1416 else 1417 printf("NOT TITLE\n"); 1418 1419 if (uciscomposite(0x1d5)) 1420 printf("COMPOSITE\n"); 1421 else 1422 printf("NOT COMPOSITE\n"); 1423 1424 if (ucdecomp(0x1d5, &lo, &dec)) { 1425 for (i = 0; i < lo; i++) 1426 printf("0x%04lx ", dec[i]); 1427 putchar('\n'); 1428 } 1429 1430 if ((lo = uccombining_class(0x41)) != 0) 1431 printf("0x41 CCL %ld\n", lo); 1432 1433 if (ucisxdigit(0xfeff)) 1434 printf("0xFEFF HEX DIGIT\n"); 1435 else 1436 printf("0xFEFF NOT HEX DIGIT\n"); 1437 1438 if (ucisdefined(0x10000)) 1439 printf("0x10000 DEFINED\n"); 1440 else 1441 printf("0x10000 NOT DEFINED\n"); 1442 1443 if (ucnumber_lookup(0x30, &num)) { 1444 if (num.denominator != 1) 1445 printf("UCNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator); 1446 else 1447 printf("UCNUMBER: 0x30 = %d\n", num.numerator); 1448 } else 1449 printf("UCNUMBER: 0x30 NOT A NUMBER\n"); 1450 1451 if (ucnumber_lookup(0xbc, &num)) { 1452 if (num.denominator != 1) 1453 printf("UCNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator); 1454 else 1455 printf("UCNUMBER: 0xbc = %d\n", num.numerator); 1456 } else 1457 printf("UCNUMBER: 0xbc NOT A NUMBER\n"); 1458 1459 1460 if (ucnumber_lookup(0xff19, &num)) { 1461 if (num.denominator != 1) 1462 printf("UCNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator); 1463 else 1464 printf("UCNUMBER: 0xff19 = %d\n", num.numerator); 1465 } else 1466 printf("UCNUMBER: 0xff19 NOT A NUMBER\n"); 1467 1468 if (ucnumber_lookup(0x4e00, &num)) { 1469 if (num.denominator != 1) 1470 printf("UCNUMBER: 0x4e00 = %d/%d\n", num.numerator, num.denominator); 1471 else 1472 printf("UCNUMBER: 0x4e00 = %d\n", num.numerator); 1473 } else 1474 printf("UCNUMBER: 0x4e00 NOT A NUMBER\n"); 1475 1476 if (ucdigit_lookup(0x06f9, &dig)) 1477 printf("UCDIGIT: 0x6f9 = %d\n", dig); 1478 else 1479 printf("UCDIGIT: 0x6f9 NOT A NUMBER\n"); 1480 1481 dig = ucgetdigit(0x0969); 1482 printf("UCGETDIGIT: 0x969 = %d\n", dig); 1483 1484 num = ucgetnumber(0x30); 1485 if (num.denominator != 1) 1486 printf("UCGETNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator); 1487 else 1488 printf("UCGETNUMBER: 0x30 = %d\n", num.numerator); 1489 1490 num = ucgetnumber(0xbc); 1491 if (num.denominator != 1) 1492 printf("UCGETNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator); 1493 else 1494 printf("UCGETNUMBER: 0xbc = %d\n", num.numerator); 1495 1496 num = ucgetnumber(0xff19); 1497 if (num.denominator != 1) 1498 printf("UCGETNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator); 1499 else 1500 printf("UCGETNUMBER: 0xff19 = %d\n", num.numerator); 1501 1502 /* ucdata_cleanup(); */ 1503 exit(0); 1504 } 1505 1506 #endif /* TEST */ 1507