xref: /netbsd-src/external/bsd/openldap/dist/libraries/liblunicode/ucdata/ucdata.c (revision b7b7574d3bf8eeb51a1fa3977b59142ec6434a55)
1 /*	$NetBSD: ucdata.c,v 1.1.1.4 2014/05/28 09:58:44 tron Exp $	*/
2 
3 /* $OpenLDAP$ */
4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5  *
6  * Copyright 1998-2014 The OpenLDAP Foundation.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted only as authorized by the OpenLDAP
11  * Public License.
12  *
13  * A copy of this license is available in file LICENSE in the
14  * top-level directory of the distribution or, alternatively, at
15  * <http://www.OpenLDAP.org/license.html>.
16  */
17 /* Copyright 2001 Computing Research Labs, New Mexico State University
18  *
19  * Permission is hereby granted, free of charge, to any person obtaining a
20  * copy of this software and associated documentation files (the "Software"),
21  * to deal in the Software without restriction, including without limitation
22  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
23  * and/or sell copies of the Software, and to permit persons to whom the
24  * Software is furnished to do so, subject to the following conditions:
25  *
26  * The above copyright notice and this permission notice shall be included in
27  * all copies or substantial portions of the Software.
28  *
29  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
32  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
33  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
34  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
35  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
36  */
37 /* Id: ucdata.c,v 1.4 2001/01/02 18:46:20 mleisher Exp " */
38 
39 #include "portable.h"
40 #include "ldap_config.h"
41 
42 #include <stdio.h>
43 #include <ac/stdlib.h>
44 #include <ac/string.h>
45 #include <ac/unistd.h>
46 
47 #include <ac/bytes.h>
48 
49 #include "lber_pvt.h"
50 #include "ucdata.h"
51 
52 #ifndef HARDCODE_DATA
53 #define	HARDCODE_DATA	1
54 #endif
55 
56 #if HARDCODE_DATA
57 #include "uctable.h"
58 #endif
59 
60 /**************************************************************************
61  *
62  * Miscellaneous types, data, and support functions.
63  *
64  **************************************************************************/
65 
/*
 * Header read from the start of every binary data file by the _uc*_load()
 * routines in this file.
 */
typedef struct {
    ac_uint2 bom;	/* byte order mark; loaders byte-swap when it reads as 0xfffe */
    ac_uint2 cnt;	/* element count; each loader interprets it for its own table */
    union {
        ac_uint4 bytes;		/* payload size in bytes (most loaders) */
        ac_uint2 len[2];	/* two section lengths (case-mapping loader) */
    } size;
} _ucheader_t;
74 
/*
 * Single-bit masks for lookup: masks32[i] has only bit i set.  ucisprop()
 * uses the table to test individual bits of its two property mask words.
 */
static ac_uint4 masks32[32] = {
	0x00000001UL, 0x00000002UL, 0x00000004UL, 0x00000008UL,
	0x00000010UL, 0x00000020UL, 0x00000040UL, 0x00000080UL,
	0x00000100UL, 0x00000200UL, 0x00000400UL, 0x00000800UL,
	0x00001000UL, 0x00002000UL, 0x00004000UL, 0x00008000UL,
	0x00010000UL, 0x00020000UL, 0x00040000UL, 0x00080000UL,
	0x00100000UL, 0x00200000UL, 0x00400000UL, 0x00800000UL,
	0x01000000UL, 0x02000000UL, 0x04000000UL, 0x08000000UL,
	0x10000000UL, 0x20000000UL, 0x40000000UL, 0x80000000UL
};
88 
/*
 * Byte-order reversal helpers, used when a data file was written on a
 * machine of the opposite endianness.
 *
 * endian_short() exchanges the two low-order bytes of a 16-bit value and
 * endian_long() reverses the four bytes of a 32-bit value.  Every byte is
 * masked after it is shifted, so a sign-extended (negative) argument such as
 * a `short' table value cannot leak 1-bits into the result.  The argument is
 * evaluated more than once and must therefore be free of side effects.
 */
#define endian_short(cc) ((((cc) >> 8) & 0xff) | (((cc) & 0xff) << 8))
#define endian_long(cc) ((((cc) & 0xff) << 24)|((((cc) >> 8) & 0xff) << 16)|\
                        ((((cc) >> 16) & 0xff) << 8)|(((cc) >> 24) & 0xff))
92 
93 #if !HARDCODE_DATA
94 static FILE *
95 _ucopenfile(char *paths, char *filename, char *mode)
96 {
97     FILE *f;
98     char *fp, *dp, *pp, path[BUFSIZ];
99 
100     if (filename == 0 || *filename == 0)
101       return 0;
102 
103     dp = paths;
104     while (dp && *dp) {
105         pp = path;
106         while (*dp && *dp != ':')
107           *pp++ = *dp++;
108         *pp++ = *LDAP_DIRSEP;
109 
110         fp = filename;
111         while (*fp)
112           *pp++ = *fp++;
113         *pp = 0;
114 
115         if ((f = fopen(path, mode)) != 0)
116           return f;
117 
118         if (*dp == ':')
119           dp++;
120     }
121 
122     return 0;
123 }
124 #endif
125 
126 /**************************************************************************
127  *
128  * Support for the character properties.
129  *
130  **************************************************************************/
131 
132 #if !HARDCODE_DATA
133 
/*
 * Character property tables filled in by _ucprop_load().
 */
static ac_uint4 _ucprop_size;		/* number of properties; 0 means nothing is loaded */
static ac_uint2 *_ucprop_offsets;	/* per-property offsets; start of the allocation */
static ac_uint4 *_ucprop_ranges;	/* range data; points into the offsets allocation */
137 
138 /*
139  * Return -1 on error, 0 if okay
140  */
141 static int
142 _ucprop_load(char *paths, int reload)
143 {
144     FILE *in;
145     ac_uint4 size, i;
146     _ucheader_t hdr;
147 
148     if (_ucprop_size > 0) {
149         if (!reload)
150           /*
151            * The character properties have already been loaded.
152            */
153           return 0;
154 
155         /*
156          * Unload the current character property data in preparation for
157          * loading a new copy.  Only the first array has to be deallocated
158          * because all the memory for the arrays is allocated as a single
159          * block.
160          */
161         free((char *) _ucprop_offsets);
162         _ucprop_size = 0;
163     }
164 
165     if ((in = _ucopenfile(paths, "ctype.dat", "rb")) == 0)
166       return -1;
167 
168     /*
169      * Load the header.
170      */
171     fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
172 
173     if (hdr.bom == 0xfffe) {
174         hdr.cnt = endian_short(hdr.cnt);
175         hdr.size.bytes = endian_long(hdr.size.bytes);
176     }
177 
178     if ((_ucprop_size = hdr.cnt) == 0) {
179         fclose(in);
180         return -1;
181     }
182 
183     /*
184      * Allocate all the storage needed for the lookup table.
185      */
186     _ucprop_offsets = (ac_uint2 *) malloc(hdr.size.bytes);
187 
188     /*
189      * Calculate the offset into the storage for the ranges.  The offsets
190      * array is on a 4-byte boundary and one larger than the value provided in
191      * the header count field.  This means the offset to the ranges must be
192      * calculated after aligning the count to a 4-byte boundary.
193      */
194     if ((size = ((hdr.cnt + 1) * sizeof(ac_uint2))) & 3)
195       size += 4 - (size & 3);
196     size >>= 1;
197     _ucprop_ranges = (ac_uint4 *) (_ucprop_offsets + size);
198 
199     /*
200      * Load the offset array.
201      */
202     fread((char *) _ucprop_offsets, sizeof(ac_uint2), size, in);
203 
204     /*
205      * Do an endian swap if necessary.  Don't forget there is an extra node on
206      * the end with the final index.
207      */
208     if (hdr.bom == 0xfffe) {
209         for (i = 0; i <= _ucprop_size; i++)
210           _ucprop_offsets[i] = endian_short(_ucprop_offsets[i]);
211     }
212 
213     /*
214      * Load the ranges.  The number of elements is in the last array position
215      * of the offsets.
216      */
217     fread((char *) _ucprop_ranges, sizeof(ac_uint4),
218           _ucprop_offsets[_ucprop_size], in);
219 
220     fclose(in);
221 
222     /*
223      * Do an endian swap if necessary.
224      */
225     if (hdr.bom == 0xfffe) {
226         for (i = 0; i < _ucprop_offsets[_ucprop_size]; i++)
227           _ucprop_ranges[i] = endian_long(_ucprop_ranges[i]);
228     }
229     return 0;
230 }
231 
232 static void
233 _ucprop_unload(void)
234 {
235     if (_ucprop_size == 0)
236       return;
237 
238     /*
239      * Only need to free the offsets because the memory is allocated as a
240      * single block.
241      */
242     free((char *) _ucprop_offsets);
243     _ucprop_size = 0;
244 }
245 #endif
246 
247 static int
248 _ucprop_lookup(ac_uint4 code, ac_uint4 n)
249 {
250     long l, r, m;
251 
252     if (_ucprop_size == 0)
253       return 0;
254 
255     /*
256      * There is an extra node on the end of the offsets to allow this routine
257      * to work right.  If the index is 0xffff, then there are no nodes for the
258      * property.
259      */
260     if ((l = _ucprop_offsets[n]) == 0xffff)
261       return 0;
262 
263     /*
264      * Locate the next offset that is not 0xffff.  The sentinel at the end of
265      * the array is the max index value.
266      */
267     for (m = 1;
268          n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++) ;
269 
270     r = _ucprop_offsets[n + m] - 1;
271 
272     while (l <= r) {
273         /*
274          * Determine a "mid" point and adjust to make sure the mid point is at
275          * the beginning of a range pair.
276          */
277         m = (l + r) >> 1;
278         m -= (m & 1);
279         if (code > _ucprop_ranges[m + 1])
280           l = m + 2;
281         else if (code < _ucprop_ranges[m])
282           r = m - 2;
283         else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
284           return 1;
285     }
286     return 0;
287 }
288 
289 int
290 ucisprop(ac_uint4 code, ac_uint4 mask1, ac_uint4 mask2)
291 {
292     ac_uint4 i;
293 
294     if (mask1 == 0 && mask2 == 0)
295       return 0;
296 
297     for (i = 0; mask1 && i < 32; i++) {
298         if ((mask1 & masks32[i]) && _ucprop_lookup(code, i))
299           return 1;
300     }
301 
302     for (i = 32; mask2 && i < _ucprop_size; i++) {
303         if ((mask2 & masks32[i & 31]) && _ucprop_lookup(code, i))
304           return 1;
305     }
306 
307     return 0;
308 }
309 
310 /**************************************************************************
311  *
312  * Support for case mapping.
313  *
314  **************************************************************************/
315 
316 #if !HARDCODE_DATA
317 
/*
 * Case mapping table filled in by _uccase_load().
 *
 * _uccase_size is the number of slots in the map and each slot is made of
 * 3 words.  _uccase_len[0] and _uccase_len[1] hold the lengths, in slots,
 * of the first two sections of the map; uctoupper(), uctolower() and
 * uctotitle() treat them as the upper case and lower case sections and the
 * remaining slots as the title case section.
 */
static ac_uint4 _uccase_size;
static ac_uint2 _uccase_len[2];
static ac_uint4 *_uccase_map;
324 
325 /*
326  * Return -1 on error, 0 if okay
327  */
328 static int
329 _uccase_load(char *paths, int reload)
330 {
331     FILE *in;
332     ac_uint4 i;
333     _ucheader_t hdr;
334 
335     if (_uccase_size > 0) {
336         if (!reload)
337           /*
338            * The case mappings have already been loaded.
339            */
340           return 0;
341 
342         free((char *) _uccase_map);
343         _uccase_size = 0;
344     }
345 
346     if ((in = _ucopenfile(paths, "case.dat", "rb")) == 0)
347       return -1;
348 
349     /*
350      * Load the header.
351      */
352     fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
353 
354     if (hdr.bom == 0xfffe) {
355         hdr.cnt = endian_short(hdr.cnt);
356         hdr.size.len[0] = endian_short(hdr.size.len[0]);
357         hdr.size.len[1] = endian_short(hdr.size.len[1]);
358     }
359 
360     /*
361      * Set the node count and lengths of the upper and lower case mapping
362      * tables.
363      */
364     _uccase_size = hdr.cnt;
365     _uccase_len[0] = hdr.size.len[0];
366     _uccase_len[1] = hdr.size.len[1];
367 
368     _uccase_map = (ac_uint4 *)
369         malloc(_uccase_size * 3 * sizeof(ac_uint4));
370 
371     /*
372      * Load the case mapping table.
373      */
374     fread((char *) _uccase_map, sizeof(ac_uint4), _uccase_size * 3, in);
375 
376     /*
377      * Do an endian swap if necessary.
378      */
379     if (hdr.bom == 0xfffe) {
380         for (i = 0; i < _uccase_size * 3; i++)
381           _uccase_map[i] = endian_long(_uccase_map[i]);
382     }
383     fclose(in);
384     return 0;
385 }
386 
387 static void
388 _uccase_unload(void)
389 {
390     if (_uccase_size == 0)
391       return;
392 
393     free((char *) _uccase_map);
394     _uccase_size = 0;
395 }
396 #endif
397 
398 static ac_uint4
399 _uccase_lookup(ac_uint4 code, long l, long r, int field)
400 {
401     long m;
402 	const ac_uint4 *tmp;
403 
404     /*
405      * Do the binary search.
406      */
407     while (l <= r) {
408         /*
409          * Determine a "mid" point and adjust to make sure the mid point is at
410          * the beginning of a case mapping triple.
411          */
412         m = (l + r) >> 1;
413 		tmp = &_uccase_map[m*3];
414         if (code > *tmp)
415           l = m + 1;
416         else if (code < *tmp)
417           r = m - 1;
418         else if (code == *tmp)
419           return tmp[field];
420     }
421 
422     return code;
423 }
424 
425 ac_uint4
426 uctoupper(ac_uint4 code)
427 {
428     int field;
429     long l, r;
430 
431     if (ucisupper(code))
432       return code;
433 
434     if (ucislower(code)) {
435         /*
436          * The character is lower case.
437          */
438         field = 2;
439         l = _uccase_len[0];
440         r = (l + _uccase_len[1]) - 1;
441     } else {
442         /*
443          * The character is title case.
444          */
445         field = 1;
446         l = _uccase_len[0] + _uccase_len[1];
447         r = _uccase_size - 1;
448     }
449     return _uccase_lookup(code, l, r, field);
450 }
451 
452 ac_uint4
453 uctolower(ac_uint4 code)
454 {
455     int field;
456     long l, r;
457 
458     if (ucislower(code))
459       return code;
460 
461     if (ucisupper(code)) {
462         /*
463          * The character is upper case.
464          */
465         field = 1;
466         l = 0;
467         r = _uccase_len[0] - 1;
468     } else {
469         /*
470          * The character is title case.
471          */
472         field = 2;
473         l = _uccase_len[0] + _uccase_len[1];
474         r = _uccase_size - 1;
475     }
476     return _uccase_lookup(code, l, r, field);
477 }
478 
479 ac_uint4
480 uctotitle(ac_uint4 code)
481 {
482     int field;
483     long l, r;
484 
485     if (ucistitle(code))
486       return code;
487 
488     /*
489      * The offset will always be the same for converting to title case.
490      */
491     field = 2;
492 
493     if (ucisupper(code)) {
494         /*
495          * The character is upper case.
496          */
497         l = 0;
498         r = _uccase_len[0] - 1;
499     } else {
500         /*
501          * The character is lower case.
502          */
503         l = _uccase_len[0];
504         r = (l + _uccase_len[1]) - 1;
505     }
506     return _uccase_lookup(code, l, r, field);
507 }
508 
509 /**************************************************************************
510  *
511  * Support for compositions.
512  *
513  **************************************************************************/
514 
515 #if !HARDCODE_DATA
516 
/*
 * Composition table filled in by _uccomp_load().  _uccomp_size counts the
 * leading words of _uccomp_data that uccomp() searches; entries are 4 words
 * each.
 */
static ac_uint4  _uccomp_size;
static ac_uint4 *_uccomp_data;
519 
520 /*
521  * Return -1 on error, 0 if okay
522  */
523 static int
524 _uccomp_load(char *paths, int reload)
525 {
526     FILE *in;
527     ac_uint4 size, i;
528     _ucheader_t hdr;
529 
530     if (_uccomp_size > 0) {
531         if (!reload)
532             /*
533              * The compositions have already been loaded.
534              */
535             return 0;
536 
537         free((char *) _uccomp_data);
538         _uccomp_size = 0;
539     }
540 
541     if ((in = _ucopenfile(paths, "comp.dat", "rb")) == 0)
542         return -1;
543 
544     /*
545      * Load the header.
546      */
547     fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
548 
549     if (hdr.bom == 0xfffe) {
550         hdr.cnt = endian_short(hdr.cnt);
551         hdr.size.bytes = endian_long(hdr.size.bytes);
552     }
553 
554     _uccomp_size = hdr.cnt;
555     _uccomp_data = (ac_uint4 *) malloc(hdr.size.bytes);
556 
557     /*
558      * Read the composition data in.
559      */
560     size = hdr.size.bytes / sizeof(ac_uint4);
561     fread((char *) _uccomp_data, sizeof(ac_uint4), size, in);
562 
563     /*
564      * Do an endian swap if necessary.
565      */
566     if (hdr.bom == 0xfffe) {
567         for (i = 0; i < size; i++)
568             _uccomp_data[i] = endian_long(_uccomp_data[i]);
569     }
570 
571     /*
572      * Assume that the data is ordered on count, so that all compositions
573      * of length 2 come first. Only handling length 2 for now.
574      */
575     for (i = 1; i < size; i += 4)
576       if (_uccomp_data[i] != 2)
577         break;
578     _uccomp_size = i - 1;
579 
580     fclose(in);
581     return 0;
582 }
583 
584 static void
585 _uccomp_unload(void)
586 {
587     if (_uccomp_size == 0)
588         return;
589 
590     free((char *) _uccomp_data);
591     _uccomp_size = 0;
592 }
593 #endif
594 
595 int
596 uccomp(ac_uint4 node1, ac_uint4 node2, ac_uint4 *comp)
597 {
598     int l, r, m;
599 
600     l = 0;
601     r = _uccomp_size - 1;
602 
603     while (l <= r) {
604         m = ((r + l) >> 1);
605         m -= m & 3;
606         if (node1 > _uccomp_data[m+2])
607           l = m + 4;
608         else if (node1 < _uccomp_data[m+2])
609           r = m - 4;
610         else if (node2 > _uccomp_data[m+3])
611           l = m + 4;
612         else if (node2 < _uccomp_data[m+3])
613           r = m - 4;
614         else {
615             *comp = _uccomp_data[m];
616             return 1;
617         }
618     }
619     return 0;
620 }
621 
622 int
623 uccomp_hangul(ac_uint4 *str, int len)
624 {
625     const int SBase = 0xAC00, LBase = 0x1100,
626         VBase = 0x1161, TBase = 0x11A7,
627         LCount = 19, VCount = 21, TCount = 28,
628         NCount = VCount * TCount,   /* 588 */
629         SCount = LCount * NCount;   /* 11172 */
630 
631     int i, rlen;
632     ac_uint4 ch, last, lindex, sindex;
633 
634     last = str[0];
635     rlen = 1;
636     for ( i = 1; i < len; i++ ) {
637         ch = str[i];
638 
639         /* check if two current characters are L and V */
640         lindex = last - LBase;
641         if (lindex < (ac_uint4) LCount) {
642             ac_uint4 vindex = ch - VBase;
643             if (vindex < (ac_uint4) VCount) {
644                 /* make syllable of form LV */
645                 last = SBase + (lindex * VCount + vindex) * TCount;
646                 str[rlen-1] = last; /* reset last */
647                 continue;
648             }
649         }
650 
651         /* check if two current characters are LV and T */
652         sindex = last - SBase;
653         if (sindex < (ac_uint4) SCount
654 			&& (sindex % TCount) == 0)
655 		{
656             ac_uint4 tindex = ch - TBase;
657             if (tindex <= (ac_uint4) TCount) {
658                 /* make syllable of form LVT */
659                 last += tindex;
660                 str[rlen-1] = last; /* reset last */
661                 continue;
662             }
663         }
664 
665         /* if neither case was true, just add the character */
666         last = ch;
667         str[rlen] = ch;
668         rlen++;
669     }
670     return rlen;
671 }
672 
673 int
674 uccanoncomp(ac_uint4 *str, int len)
675 {
676     int i, stpos, copos;
677     ac_uint4 cl, prevcl, st, ch, co;
678 
679     st = str[0];
680     stpos = 0;
681     copos = 1;
682     prevcl = uccombining_class(st) == 0 ? 0 : 256;
683 
684     for (i = 1; i < len; i++) {
685         ch = str[i];
686         cl = uccombining_class(ch);
687         if (uccomp(st, ch, &co) && (prevcl < cl || prevcl == 0))
688           st = str[stpos] = co;
689         else {
690             if (cl == 0) {
691                 stpos = copos;
692                 st = ch;
693             }
694             prevcl = cl;
695             str[copos++] = ch;
696         }
697     }
698 
699     return uccomp_hangul(str, copos);
700 }
701 
702 /**************************************************************************
703  *
704  * Support for decompositions.
705  *
706  **************************************************************************/
707 
708 #if !HARDCODE_DATA
709 
/*
 * Canonical decomposition table filled in by _ucdcmp_load().  The nodes
 * array holds _ucdcmp_size words followed by one extra word; the
 * decompositions start right behind it in the same allocation.
 */
static ac_uint4  _ucdcmp_size;
static ac_uint4 *_ucdcmp_nodes;
static ac_uint4 *_ucdcmp_decomp;

/*
 * Compatibility decomposition table filled in by _uckdcmp_load(), laid out
 * the same way.
 */
static ac_uint4  _uckdcmp_size;
static ac_uint4 *_uckdcmp_nodes;
static ac_uint4 *_uckdcmp_decomp;
717 
718 /*
719  * Return -1 on error, 0 if okay
720  */
721 static int
722 _ucdcmp_load(char *paths, int reload)
723 {
724     FILE *in;
725     ac_uint4 size, i;
726     _ucheader_t hdr;
727 
728     if (_ucdcmp_size > 0) {
729         if (!reload)
730             /*
731              * The decompositions have already been loaded.
732              */
733           return 0;
734 
735         free((char *) _ucdcmp_nodes);
736         _ucdcmp_size = 0;
737     }
738 
739     if ((in = _ucopenfile(paths, "decomp.dat", "rb")) == 0)
740         return -1;
741 
742     /*
743      * Load the header.
744      */
745     fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
746 
747     if (hdr.bom == 0xfffe) {
748         hdr.cnt = endian_short(hdr.cnt);
749         hdr.size.bytes = endian_long(hdr.size.bytes);
750     }
751 
752     _ucdcmp_size = hdr.cnt << 1;
753     _ucdcmp_nodes = (ac_uint4 *) malloc(hdr.size.bytes);
754     _ucdcmp_decomp = _ucdcmp_nodes + (_ucdcmp_size + 1);
755 
756     /*
757      * Read the decomposition data in.
758      */
759     size = hdr.size.bytes / sizeof(ac_uint4);
760     fread((char *) _ucdcmp_nodes, sizeof(ac_uint4), size, in);
761 
762     /*
763      * Do an endian swap if necessary.
764      */
765     if (hdr.bom == 0xfffe) {
766         for (i = 0; i < size; i++)
767             _ucdcmp_nodes[i] = endian_long(_ucdcmp_nodes[i]);
768     }
769     fclose(in);
770     return 0;
771 }
772 
773 /*
774  * Return -1 on error, 0 if okay
775  */
776 static int
777 _uckdcmp_load(char *paths, int reload)
778 {
779     FILE *in;
780     ac_uint4 size, i;
781     _ucheader_t hdr;
782 
783     if (_uckdcmp_size > 0) {
784         if (!reload)
785             /*
786              * The decompositions have already been loaded.
787              */
788           return 0;
789 
790         free((char *) _uckdcmp_nodes);
791         _uckdcmp_size = 0;
792     }
793 
794     if ((in = _ucopenfile(paths, "kdecomp.dat", "rb")) == 0)
795         return -1;
796 
797     /*
798      * Load the header.
799      */
800     fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
801 
802     if (hdr.bom == 0xfffe) {
803         hdr.cnt = endian_short(hdr.cnt);
804         hdr.size.bytes = endian_long(hdr.size.bytes);
805     }
806 
807     _uckdcmp_size = hdr.cnt << 1;
808     _uckdcmp_nodes = (ac_uint4 *) malloc(hdr.size.bytes);
809     _uckdcmp_decomp = _uckdcmp_nodes + (_uckdcmp_size + 1);
810 
811     /*
812      * Read the decomposition data in.
813      */
814     size = hdr.size.bytes / sizeof(ac_uint4);
815     fread((char *) _uckdcmp_nodes, sizeof(ac_uint4), size, in);
816 
817     /*
818      * Do an endian swap if necessary.
819      */
820     if (hdr.bom == 0xfffe) {
821         for (i = 0; i < size; i++)
822             _uckdcmp_nodes[i] = endian_long(_uckdcmp_nodes[i]);
823     }
824     fclose(in);
825     return 0;
826 }
827 
828 static void
829 _ucdcmp_unload(void)
830 {
831     if (_ucdcmp_size == 0)
832       return;
833 
834     /*
835      * Only need to free the offsets because the memory is allocated as a
836      * single block.
837      */
838     free((char *) _ucdcmp_nodes);
839     _ucdcmp_size = 0;
840 }
841 
842 static void
843 _uckdcmp_unload(void)
844 {
845     if (_uckdcmp_size == 0)
846       return;
847 
848     /*
849      * Only need to free the offsets because the memory is allocated as a
850      * single block.
851      */
852     free((char *) _uckdcmp_nodes);
853     _uckdcmp_size = 0;
854 }
855 #endif
856 
857 int
858 ucdecomp(ac_uint4 code, ac_uint4 *num, ac_uint4 **decomp)
859 {
860     long l, r, m;
861 
862     if (code < _ucdcmp_nodes[0]) {
863 	return 0;
864     }
865 
866     l = 0;
867     r = _ucdcmp_nodes[_ucdcmp_size] - 1;
868 
869     while (l <= r) {
870         /*
871          * Determine a "mid" point and adjust to make sure the mid point is at
872          * the beginning of a code+offset pair.
873          */
874         m = (l + r) >> 1;
875         m -= (m & 1);
876         if (code > _ucdcmp_nodes[m])
877           l = m + 2;
878         else if (code < _ucdcmp_nodes[m])
879           r = m - 2;
880         else if (code == _ucdcmp_nodes[m]) {
881             *num = _ucdcmp_nodes[m + 3] - _ucdcmp_nodes[m + 1];
882             *decomp = (ac_uint4*)&_ucdcmp_decomp[_ucdcmp_nodes[m + 1]];
883             return 1;
884         }
885     }
886     return 0;
887 }
888 
889 int
890 uckdecomp(ac_uint4 code, ac_uint4 *num, ac_uint4 **decomp)
891 {
892     long l, r, m;
893 
894     if (code < _uckdcmp_nodes[0]) {
895 	return 0;
896     }
897 
898     l = 0;
899     r = _uckdcmp_nodes[_uckdcmp_size] - 1;
900 
901     while (l <= r) {
902         /*
903          * Determine a "mid" point and adjust to make sure the mid point is at
904          * the beginning of a code+offset pair.
905          */
906         m = (l + r) >> 1;
907         m -= (m & 1);
908         if (code > _uckdcmp_nodes[m])
909           l = m + 2;
910         else if (code < _uckdcmp_nodes[m])
911           r = m - 2;
912         else if (code == _uckdcmp_nodes[m]) {
913             *num = _uckdcmp_nodes[m + 3] - _uckdcmp_nodes[m + 1];
914             *decomp = (ac_uint4*)&_uckdcmp_decomp[_uckdcmp_nodes[m + 1]];
915             return 1;
916         }
917     }
918     return 0;
919 }
920 
921 int
922 ucdecomp_hangul(ac_uint4 code, ac_uint4 *num, ac_uint4 decomp[])
923 {
924     if (!ucishangul(code))
925       return 0;
926 
927     code -= 0xac00;
928     decomp[0] = 0x1100 + (ac_uint4) (code / 588);
929     decomp[1] = 0x1161 + (ac_uint4) ((code % 588) / 28);
930     decomp[2] = 0x11a7 + (ac_uint4) (code % 28);
931     *num = (decomp[2] != 0x11a7) ? 3 : 2;
932 
933     return 1;
934 }
935 
/*
 * Decompose the inlen code points of `in' into a newly allocated buffer.
 *
 * mode == 0 for canonical, mode == 1 for compatibility
 *
 * The buffer is obtained from ber_memalloc_x() with `ctx', grown with
 * ber_memrealloc_x() when needed, and returned through *out.  The number of
 * code points produced is stored in *outlen and also returned.  When an
 * allocation fails, *outlen is set to -1 and -1 is returned.
 *
 * Code points with a non-zero combining class are inserted behind the
 * preceding run of output so that classes end up in non-decreasing order;
 * equal classes keep their input order.
 */
static int
uccanoncompatdecomp(const ac_uint4 *in, int inlen,
		    ac_uint4 **out, int *outlen, short mode, void *ctx)
{
    int l, size;
	unsigned i, j, k;
    ac_uint4 num, class, *decomp, hangdecomp[3];

    /* Initial guess: twice the input length. */
    size = inlen * 2;
    *out = (ac_uint4 *) ber_memalloc_x(size * sizeof(**out), ctx);
    if (*out == NULL)
        return *outlen = -1;

    /* i is the number of code points written so far, j the input index. */
    i = 0;
    for (j = 0; j < (unsigned) inlen; j++) {
	if (mode ? uckdecomp(in[j], &num, &decomp) : ucdecomp(in[j], &num, &decomp)) {
            /*
             * Table decomposition.  Grow the buffer when fewer than num
             * slots are free; the new size leaves one slot for each input
             * code point not yet handled.
             * NOTE(review): the old pointer is overwritten, so it is lost if
             * ber_memrealloc_x() fails without freeing it -- confirm.
             */
            if ( size - i < num) {
                size = inlen + i - j + num - 1;
                *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx );
                if (*out == NULL)
                    return *outlen = -1;
            }
            for (k = 0; k < num; k++) {
                class = uccombining_class(decomp[k]);
                if (class == 0) {
                    (*out)[i] = decomp[k];
                } else {
                    /*
                     * Walk back over output with a higher class and insert
                     * in front of it.
                     * NOTE(review): source and destination overlap; this
                     * relies on AC_MEMCPY behaving like memmove -- confirm.
                     */
                    for (l = i; l > 0; l--)
                        if (class >= uccombining_class((*out)[l-1]))
                            break;
                    AC_MEMCPY(*out + l + 1, *out + l, (i - l) * sizeof(**out));
                    (*out)[l] = decomp[k];
                }
                i++;
            }
        } else if (ucdecomp_hangul(in[j], &num, hangdecomp)) {
            /* Hangul syllable: append its 2 or 3 jamo as they are. */
            if (size - i < num) {
                size = inlen + i - j + num - 1;
                *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx);
                if (*out == NULL)
                    return *outlen = -1;
            }
            for (k = 0; k < num; k++) {
                (*out)[i] = hangdecomp[k];
                i++;
            }
        } else {
            /* No decomposition: copy the code point, ordered by class. */
            if (size - i < 1) {
                size = inlen + i - j;
                *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx);
                if (*out == NULL)
                    return *outlen = -1;
            }
            class = uccombining_class(in[j]);
            if (class == 0) {
                (*out)[i] = in[j];
            } else {
                for (l = i; l > 0; l--)
                    if (class >= uccombining_class((*out)[l-1]))
                        break;
                AC_MEMCPY(*out + l + 1, *out + l, (i - l) * sizeof(**out));
                (*out)[l] = in[j];
            }
            i++;
        }
    }
    return *outlen = i;
}
1005 
/*
 * Canonical decomposition of `in' (inlen code points) into a buffer
 * allocated with `ctx' and returned through *out.  Returns the value that
 * is also stored in *outlen: the number of code points produced, or -1
 * when an allocation fails.
 */
int
uccanondecomp(const ac_uint4 *in, int inlen,
              ac_uint4 **out, int *outlen, void *ctx)
{
    return uccanoncompatdecomp(in, inlen, out, outlen, 0, ctx);
}
1012 
/*
 * Compatibility decomposition of `in' (inlen code points) into a buffer
 * allocated with `ctx' and returned through *out.  Returns the value that
 * is also stored in *outlen: the number of code points produced, or -1
 * when an allocation fails.
 */
int
uccompatdecomp(const ac_uint4 *in, int inlen,
	       ac_uint4 **out, int *outlen, void *ctx)
{
    return uccanoncompatdecomp(in, inlen, out, outlen, 1, ctx);
}
1019 
1020 /**************************************************************************
1021  *
1022  * Support for combining classes.
1023  *
1024  **************************************************************************/
1025 
1026 #if !HARDCODE_DATA
/*
 * Combining class table filled in by _uccmcl_load().  _uccmcl_size is the
 * number of words in _uccmcl_nodes, three words per entry.
 */
static ac_uint4  _uccmcl_size;
static ac_uint4 *_uccmcl_nodes;
1029 
1030 /*
1031  * Return -1 on error, 0 if okay
1032  */
1033 static int
1034 _uccmcl_load(char *paths, int reload)
1035 {
1036     FILE *in;
1037     ac_uint4 i;
1038     _ucheader_t hdr;
1039 
1040     if (_uccmcl_size > 0) {
1041         if (!reload)
1042             /*
1043              * The combining classes have already been loaded.
1044              */
1045             return 0;
1046 
1047         free((char *) _uccmcl_nodes);
1048         _uccmcl_size = 0;
1049     }
1050 
1051     if ((in = _ucopenfile(paths, "cmbcl.dat", "rb")) == 0)
1052         return -1;
1053 
1054     /*
1055      * Load the header.
1056      */
1057     fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
1058 
1059     if (hdr.bom == 0xfffe) {
1060         hdr.cnt = endian_short(hdr.cnt);
1061         hdr.size.bytes = endian_long(hdr.size.bytes);
1062     }
1063 
1064     _uccmcl_size = hdr.cnt * 3;
1065     _uccmcl_nodes = (ac_uint4 *) malloc(hdr.size.bytes);
1066 
1067     /*
1068      * Read the combining classes in.
1069      */
1070     fread((char *) _uccmcl_nodes, sizeof(ac_uint4), _uccmcl_size, in);
1071 
1072     /*
1073      * Do an endian swap if necessary.
1074      */
1075     if (hdr.bom == 0xfffe) {
1076         for (i = 0; i < _uccmcl_size; i++)
1077             _uccmcl_nodes[i] = endian_long(_uccmcl_nodes[i]);
1078     }
1079     fclose(in);
1080     return 0;
1081 }
1082 
1083 static void
1084 _uccmcl_unload(void)
1085 {
1086     if (_uccmcl_size == 0)
1087       return;
1088 
1089     free((char *) _uccmcl_nodes);
1090     _uccmcl_size = 0;
1091 }
1092 #endif
1093 
1094 ac_uint4
1095 uccombining_class(ac_uint4 code)
1096 {
1097     long l, r, m;
1098 
1099     l = 0;
1100     r = _uccmcl_size - 1;
1101 
1102     while (l <= r) {
1103         m = (l + r) >> 1;
1104         m -= (m % 3);
1105         if (code > _uccmcl_nodes[m + 1])
1106           l = m + 3;
1107         else if (code < _uccmcl_nodes[m])
1108           r = m - 3;
1109         else if (code >= _uccmcl_nodes[m] && code <= _uccmcl_nodes[m + 1])
1110           return _uccmcl_nodes[m + 2];
1111     }
1112     return 0;
1113 }
1114 
1115 /**************************************************************************
1116  *
1117  * Support for numeric values.
1118  *
1119  **************************************************************************/
1120 
1121 #if !HARDCODE_DATA
/*
 * Numeric value table filled in by _ucnumb_load().  _ucnum_size is the
 * number of words in _ucnum_nodes; the short values start right behind the
 * nodes in the same allocation.
 */
static ac_uint4 *_ucnum_nodes;
static ac_uint4 _ucnum_size;
static short *_ucnum_vals;
1125 
1126 /*
1127  * Return -1 on error, 0 if okay
1128  */
1129 static int
1130 _ucnumb_load(char *paths, int reload)
1131 {
1132     FILE *in;
1133     ac_uint4 size, i;
1134     _ucheader_t hdr;
1135 
1136     if (_ucnum_size > 0) {
1137         if (!reload)
1138           /*
1139            * The numbers have already been loaded.
1140            */
1141           return 0;
1142 
1143         free((char *) _ucnum_nodes);
1144         _ucnum_size = 0;
1145     }
1146 
1147     if ((in = _ucopenfile(paths, "num.dat", "rb")) == 0)
1148       return -1;
1149 
1150     /*
1151      * Load the header.
1152      */
1153     fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
1154 
1155     if (hdr.bom == 0xfffe) {
1156         hdr.cnt = endian_short(hdr.cnt);
1157         hdr.size.bytes = endian_long(hdr.size.bytes);
1158     }
1159 
1160     _ucnum_size = hdr.cnt;
1161     _ucnum_nodes = (ac_uint4 *) malloc(hdr.size.bytes);
1162     _ucnum_vals = (short *) (_ucnum_nodes + _ucnum_size);
1163 
1164     /*
1165      * Read the combining classes in.
1166      */
1167     fread((char *) _ucnum_nodes, sizeof(unsigned char), hdr.size.bytes, in);
1168 
1169     /*
1170      * Do an endian swap if necessary.
1171      */
1172     if (hdr.bom == 0xfffe) {
1173         for (i = 0; i < _ucnum_size; i++)
1174           _ucnum_nodes[i] = endian_long(_ucnum_nodes[i]);
1175 
1176         /*
1177          * Determine the number of values that have to be adjusted.
1178          */
1179         size = (hdr.size.bytes -
1180                 (_ucnum_size * (sizeof(ac_uint4) << 1))) /
1181             sizeof(short);
1182 
1183         for (i = 0; i < size; i++)
1184           _ucnum_vals[i] = endian_short(_ucnum_vals[i]);
1185     }
1186     fclose(in);
1187     return 0;
1188 }
1189 
1190 static void
1191 _ucnumb_unload(void)
1192 {
1193     if (_ucnum_size == 0)
1194       return;
1195 
1196     free((char *) _ucnum_nodes);
1197     _ucnum_size = 0;
1198 }
1199 #endif
1200 
1201 int
1202 ucnumber_lookup(ac_uint4 code, struct ucnumber *num)
1203 {
1204     long l, r, m;
1205     short *vp;
1206 
1207     l = 0;
1208     r = _ucnum_size - 1;
1209     while (l <= r) {
1210         /*
1211          * Determine a "mid" point and adjust to make sure the mid point is at
1212          * the beginning of a code+offset pair.
1213          */
1214         m = (l + r) >> 1;
1215         m -= (m & 1);
1216         if (code > _ucnum_nodes[m])
1217           l = m + 2;
1218         else if (code < _ucnum_nodes[m])
1219           r = m - 2;
1220         else {
1221             vp = (short *)_ucnum_vals + _ucnum_nodes[m + 1];
1222             num->numerator = (int) *vp++;
1223             num->denominator = (int) *vp;
1224             return 1;
1225         }
1226     }
1227     return 0;
1228 }
1229 
1230 int
1231 ucdigit_lookup(ac_uint4 code, int *digit)
1232 {
1233     long l, r, m;
1234     short *vp;
1235 
1236     l = 0;
1237     r = _ucnum_size - 1;
1238     while (l <= r) {
1239         /*
1240          * Determine a "mid" point and adjust to make sure the mid point is at
1241          * the beginning of a code+offset pair.
1242          */
1243         m = (l + r) >> 1;
1244         m -= (m & 1);
1245         if (code > _ucnum_nodes[m])
1246           l = m + 2;
1247         else if (code < _ucnum_nodes[m])
1248           r = m - 2;
1249         else {
1250             vp = (short *)_ucnum_vals + _ucnum_nodes[m + 1];
1251             if (*vp == *(vp + 1)) {
1252               *digit = *vp;
1253               return 1;
1254             }
1255             return 0;
1256         }
1257     }
1258     return 0;
1259 }
1260 
1261 struct ucnumber
1262 ucgetnumber(ac_uint4 code)
1263 {
1264     struct ucnumber num;
1265 
1266     /*
1267      * Initialize with some arbitrary value, because the caller simply cannot
1268      * tell for sure if the code is a number without calling the ucisnumber()
1269      * macro before calling this function.
1270      */
1271     num.numerator = num.denominator = -111;
1272 
1273     (void) ucnumber_lookup(code, &num);
1274 
1275     return num;
1276 }
1277 
1278 int
1279 ucgetdigit(ac_uint4 code)
1280 {
1281     int dig;
1282 
1283     /*
1284      * Initialize with some arbitrary value, because the caller simply cannot
1285      * tell for sure if the code is a number without calling the ucisdigit()
1286      * macro before calling this function.
1287      */
1288     dig = -111;
1289 
1290     (void) ucdigit_lookup(code, &dig);
1291 
1292     return dig;
1293 }
1294 
1295 /**************************************************************************
1296  *
1297  * Setup and cleanup routines.
1298  *
1299  **************************************************************************/
1300 
1301 #if HARDCODE_DATA
/*
 * Stub used when HARDCODE_DATA is set: does nothing and reports success.
 */
int
ucdata_load(char *paths, int masks)
{
    (void) paths;
    (void) masks;
    return 0;
}
/*
 * Stub used when HARDCODE_DATA is set: there is nothing to release.
 */
void
ucdata_unload(int masks)
{
    (void) masks;
}
/*
 * Stub used when HARDCODE_DATA is set: does nothing and reports success.
 */
int
ucdata_reload(char *paths, int masks)
{
    (void) paths;
    (void) masks;
    return 0;
}
1305 #else
1306 /*
1307  * Return 0 if okay, negative on error
1308  */
1309 int
1310 ucdata_load(char *paths, int masks)
1311 {
1312     int error = 0;
1313 
1314     if (masks & UCDATA_CTYPE)
1315       error |= _ucprop_load(paths, 0) < 0 ? UCDATA_CTYPE : 0;
1316     if (masks & UCDATA_CASE)
1317       error |= _uccase_load(paths, 0) < 0 ? UCDATA_CASE : 0;
1318     if (masks & UCDATA_DECOMP)
1319       error |= _ucdcmp_load(paths, 0) < 0 ? UCDATA_DECOMP : 0;
1320     if (masks & UCDATA_CMBCL)
1321       error |= _uccmcl_load(paths, 0) < 0 ? UCDATA_CMBCL : 0;
1322     if (masks & UCDATA_NUM)
1323       error |= _ucnumb_load(paths, 0) < 0 ? UCDATA_NUM : 0;
1324     if (masks & UCDATA_COMP)
1325       error |= _uccomp_load(paths, 0) < 0 ? UCDATA_COMP : 0;
1326     if (masks & UCDATA_KDECOMP)
1327       error |= _uckdcmp_load(paths, 0) < 0 ? UCDATA_KDECOMP : 0;
1328 
1329     return -error;
1330 }
1331 
1332 void
1333 ucdata_unload(int masks)
1334 {
1335     if (masks & UCDATA_CTYPE)
1336       _ucprop_unload();
1337     if (masks & UCDATA_CASE)
1338       _uccase_unload();
1339     if (masks & UCDATA_DECOMP)
1340       _ucdcmp_unload();
1341     if (masks & UCDATA_CMBCL)
1342       _uccmcl_unload();
1343     if (masks & UCDATA_NUM)
1344       _ucnumb_unload();
1345     if (masks & UCDATA_COMP)
1346       _uccomp_unload();
1347     if (masks & UCDATA_KDECOMP)
1348       _uckdcmp_unload();
1349 }
1350 
1351 /*
1352  * Return 0 if okay, negative on error
1353  */
1354 int
1355 ucdata_reload(char *paths, int masks)
1356 {
1357     int error = 0;
1358 
1359     if (masks & UCDATA_CTYPE)
1360         error |= _ucprop_load(paths, 1) < 0 ? UCDATA_CTYPE : 0;
1361     if (masks & UCDATA_CASE)
1362         error |= _uccase_load(paths, 1) < 0 ? UCDATA_CASE : 0;
1363     if (masks & UCDATA_DECOMP)
1364         error |= _ucdcmp_load(paths, 1) < 0 ? UCDATA_DECOMP : 0;
1365     if (masks & UCDATA_CMBCL)
1366         error |= _uccmcl_load(paths, 1) < 0 ? UCDATA_CMBCL : 0;
1367     if (masks & UCDATA_NUM)
1368         error |= _ucnumb_load(paths, 1) < 0 ? UCDATA_NUM : 0;
1369     if (masks & UCDATA_COMP)
1370         error |= _uccomp_load(paths, 1) < 0 ? UCDATA_COMP : 0;
1371     if (masks & UCDATA_KDECOMP)
1372         error |= _uckdcmp_load(paths, 1) < 0 ? UCDATA_KDECOMP : 0;
1373 
1374     return -error;
1375 }
1376 #endif
1377 
1378 #ifdef TEST
1379 
1380 void
1381 main(void)
1382 {
1383     int dig;
1384     ac_uint4 i, lo, *dec;
1385     struct ucnumber num;
1386 
1387 /*    ucdata_setup("."); */
1388 
1389     if (ucisweak(0x30))
1390       printf("WEAK\n");
1391     else
1392       printf("NOT WEAK\n");
1393 
1394     printf("LOWER 0x%04lX\n", uctolower(0xff3a));
1395     printf("UPPER 0x%04lX\n", uctoupper(0xff5a));
1396 
1397     if (ucisalpha(0x1d5))
1398       printf("ALPHA\n");
1399     else
1400       printf("NOT ALPHA\n");
1401 
1402     if (ucisupper(0x1d5)) {
1403         printf("UPPER\n");
1404         lo = uctolower(0x1d5);
1405         printf("0x%04lx\n", lo);
1406         lo = uctotitle(0x1d5);
1407         printf("0x%04lx\n", lo);
1408     } else
1409       printf("NOT UPPER\n");
1410 
1411     if (ucistitle(0x1d5))
1412       printf("TITLE\n");
1413     else
1414       printf("NOT TITLE\n");
1415 
1416     if (uciscomposite(0x1d5))
1417       printf("COMPOSITE\n");
1418     else
1419       printf("NOT COMPOSITE\n");
1420 
1421     if (ucdecomp(0x1d5, &lo, &dec)) {
1422         for (i = 0; i < lo; i++)
1423           printf("0x%04lx ", dec[i]);
1424         putchar('\n');
1425     }
1426 
1427     if ((lo = uccombining_class(0x41)) != 0)
1428       printf("0x41 CCL %ld\n", lo);
1429 
1430     if (ucisxdigit(0xfeff))
1431       printf("0xFEFF HEX DIGIT\n");
1432     else
1433       printf("0xFEFF NOT HEX DIGIT\n");
1434 
1435     if (ucisdefined(0x10000))
1436       printf("0x10000 DEFINED\n");
1437     else
1438       printf("0x10000 NOT DEFINED\n");
1439 
1440     if (ucnumber_lookup(0x30, &num)) {
1441         if (num.denominator != 1)
1442           printf("UCNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator);
1443         else
1444           printf("UCNUMBER: 0x30 = %d\n", num.numerator);
1445     } else
1446       printf("UCNUMBER: 0x30 NOT A NUMBER\n");
1447 
1448     if (ucnumber_lookup(0xbc, &num)) {
1449         if (num.denominator != 1)
1450           printf("UCNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator);
1451         else
1452           printf("UCNUMBER: 0xbc = %d\n", num.numerator);
1453     } else
1454       printf("UCNUMBER: 0xbc NOT A NUMBER\n");
1455 
1456 
1457     if (ucnumber_lookup(0xff19, &num)) {
1458         if (num.denominator != 1)
1459           printf("UCNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator);
1460         else
1461           printf("UCNUMBER: 0xff19 = %d\n", num.numerator);
1462     } else
1463       printf("UCNUMBER: 0xff19 NOT A NUMBER\n");
1464 
1465     if (ucnumber_lookup(0x4e00, &num)) {
1466         if (num.denominator != 1)
1467           printf("UCNUMBER: 0x4e00 = %d/%d\n", num.numerator, num.denominator);
1468         else
1469           printf("UCNUMBER: 0x4e00 = %d\n", num.numerator);
1470     } else
1471       printf("UCNUMBER: 0x4e00 NOT A NUMBER\n");
1472 
1473     if (ucdigit_lookup(0x06f9, &dig))
1474       printf("UCDIGIT: 0x6f9 = %d\n", dig);
1475     else
1476       printf("UCDIGIT: 0x6f9 NOT A NUMBER\n");
1477 
1478     dig = ucgetdigit(0x0969);
1479     printf("UCGETDIGIT: 0x969 = %d\n", dig);
1480 
1481     num = ucgetnumber(0x30);
1482     if (num.denominator != 1)
1483       printf("UCGETNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator);
1484     else
1485       printf("UCGETNUMBER: 0x30 = %d\n", num.numerator);
1486 
1487     num = ucgetnumber(0xbc);
1488     if (num.denominator != 1)
1489       printf("UCGETNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator);
1490     else
1491       printf("UCGETNUMBER: 0xbc = %d\n", num.numerator);
1492 
1493     num = ucgetnumber(0xff19);
1494     if (num.denominator != 1)
1495       printf("UCGETNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator);
1496     else
1497       printf("UCGETNUMBER: 0xff19 = %d\n", num.numerator);
1498 
1499 /*    ucdata_cleanup(); */
1500     exit(0);
1501 }
1502 
1503 #endif /* TEST */
1504