1 /* $NetBSD: ucsset.c,v 1.4 2014/12/10 04:37:55 christos Exp $ */
2
3 #ifndef lint
4 static char *rcsid = "Id: ucsset.c,v 1.1 2003/06/04 00:26:15 marka Exp ";
5 #endif
6
7 /*
8 * Copyright (c) 2001 Japan Network Information Center. All rights reserved.
9 *
10 * By using this file, you agree to the terms and conditions set forth bellow.
11 *
12 * LICENSE TERMS AND CONDITIONS
13 *
14 * The following License Terms and Conditions apply, unless a different
15 * license is obtained from Japan Network Information Center ("JPNIC"),
16 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
17 * Chiyoda-ku, Tokyo 101-0047, Japan.
18 *
19 * 1. Use, Modification and Redistribution (including distribution of any
20 * modified or derived work) in source and/or binary forms is permitted
21 * under this License Terms and Conditions.
22 *
23 * 2. Redistribution of source code must retain the copyright notices as they
24 * appear in each source code file, this License Terms and Conditions.
25 *
26 * 3. Redistribution in binary form must reproduce the Copyright Notice,
27 * this License Terms and Conditions, in the documentation and/or other
28 * materials provided with the distribution. For the purposes of binary
29 * distribution the "Copyright Notice" refers to the following language:
30 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved."
31 *
32 * 4. The name of JPNIC may not be used to endorse or promote products
33 * derived from this Software without specific prior written approval of
34 * JPNIC.
35 *
36 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
37 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
38 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
39 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE
40 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
41 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
42 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
43 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
44 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
45 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
46 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
47 */
48
49 #include <config.h>
50
51 #include <stdlib.h>
52 #include <stdio.h>
53 #include <string.h>
54
55 #include <idn/result.h>
56 #include <idn/assert.h>
57 #include <idn/logmacro.h>
58 #include <idn/ucsset.h>
59
60 #define UCS_MAX 0x80000000UL
61
62 #define INIT_SIZE 50
63
64 /*
65 * Code point range.
66 *
67 * The set of code points is represented by an array of code point ranges.
68 * In the building phase, specified ranges by 'idn_ucsset_add' or
69 * 'idn_ucsset_addrange' are simply appended to the array.
70 * And 'idn_ucsset_fix' sorts the array by the code point value, and also
71 * merges any intersecting ranges. Since the array is sorted, a binary
72 * search can be used for looking up.
73 */
74 typedef struct {
75 unsigned long from;
76 unsigned long to;
77 } range_t;
78
79 /*
80 * Code point segment.
81 *
82 * To speed up searching further, the entire region of UCS-4 code points
83 * (U+0000 - U+7FFFFFFF) are divided into segments. For each segment,
84 * the first and last element of the range array corresponding to the
85 * segment are computed by 'idn_ucsset_fix'. This narrows down the
86 * (initial) search range.
87 */
88 typedef struct {
89 int range_start; /* index of ucsset.ranges */
90 int range_end; /* ditto */
91 } segment_t;
92
93 /*
94 * Code point to segment index conversion.
95 *
96 * Below is the function that maps a code point to the corresponding segment.
97 * The mapping is non-uniform, so that BMP, the following 16 planes that
98 * comprise Unicode code points together with BMP, and other planes
99 * have different granularity.
100 */
101 #define SEG_THLD1 0x10000 /* BMP */
102 #define SEG_THLD2 0x110000 /* Unicode (BMP+16planes) */
103 #define SEG_SFT1 10 /* BMP: 1K code points/segment */
104 #define SEG_SFT2 14 /* following 16 planes: 16K cp/seg */
105 #define SEG_SFT3 24 /* rest: 16M cp/seg */
106 #define SEG_OFF1 (SEG_THLD1 >> SEG_SFT1)
107 #define SEG_OFF2 (((SEG_THLD2 - SEG_THLD1) >> SEG_SFT2) + SEG_OFF1)
108 #define SEG_INDEX(v) \
109 (((v) < SEG_THLD1) ? ((v) >> SEG_SFT1) : \
110 ((v) < SEG_THLD2) ? ((((v) - SEG_THLD1) >> SEG_SFT2) + SEG_OFF1) : \
111 ((((v) - SEG_THLD2) >> SEG_SFT3) + SEG_OFF2))
112 #define SEG_LEN (SEG_INDEX(UCS_MAX - 1) + 1)
113
114 /*
115 * Representation of set of UCS code points.
116 */
117 typedef struct idn_ucsset {
118 segment_t segments[SEG_LEN];
119 int fixed;
120 int size; /* allocated size of 'ranges' */
121 int nranges; /* num of ranges */
122 range_t *ranges;
123 int refcnt; /* reference count */
124 } ucsset;
125
126 static idn_result_t addrange(idn_ucsset_t ctx, unsigned long from,
127 unsigned long to, char *func_name);
128 static int comp_range(const void *v1, const void *v2);
129
130 idn_result_t
idn_ucsset_create(idn_ucsset_t * ctx)131 idn_ucsset_create(idn_ucsset_t *ctx) {
132 idn_ucsset_t bm;
133
134 assert(ctx != NULL);
135
136 TRACE(("idn_ucsset_create()\n"));
137
138 if ((bm = malloc(sizeof(ucsset))) == NULL) {
139 WARNING(("idn_ucsset_create: malloc failed\n"));
140 return idn_nomemory;
141 }
142 bm->size = bm->nranges = 0;
143 bm->ranges = NULL;
144 bm->fixed = 0;
145 bm->refcnt = 1;
146 *ctx = bm;
147 return (idn_success);
148 }
149
150 void
idn_ucsset_destroy(idn_ucsset_t ctx)151 idn_ucsset_destroy(idn_ucsset_t ctx) {
152 assert(ctx != NULL && ctx->refcnt > 0);
153
154 TRACE(("idn_ucsset_destroy()\n"));
155
156 if (--ctx->refcnt == 0) {
157 if (ctx->ranges != NULL)
158 free(ctx->ranges);
159 free(ctx);
160 }
161 }
162
163 void
idn_ucsset_incrref(idn_ucsset_t ctx)164 idn_ucsset_incrref(idn_ucsset_t ctx) {
165 assert(ctx != NULL && ctx->refcnt > 0);
166
167 TRACE(("idn_ucsset_incrref()\n"));
168
169 ctx->refcnt++;
170 }
171
172 idn_result_t
idn_ucsset_add(idn_ucsset_t ctx,unsigned long v)173 idn_ucsset_add(idn_ucsset_t ctx, unsigned long v) {
174 assert(ctx != NULL && ctx->refcnt > 0);
175
176 TRACE(("idn_ucsset_add(v=U+%lX)\n", v));
177
178 return (addrange(ctx, v, v, "idn_ucsset_add"));
179 }
180
181 idn_result_t
idn_ucsset_addrange(idn_ucsset_t ctx,unsigned long from,unsigned long to)182 idn_ucsset_addrange(idn_ucsset_t ctx, unsigned long from,
183 unsigned long to)
184 {
185 assert(ctx != NULL && ctx->refcnt > 0);
186
187 TRACE(("idn_ucsset_addrange(from=U+%lX, to=U+%lX)\n",
188 from, to));
189
190 return (addrange(ctx, from, to, "idn_ucsset_addrange"));
191 }
192
193 void
idn_ucsset_fix(idn_ucsset_t ctx)194 idn_ucsset_fix(idn_ucsset_t ctx) {
195 int nranges;
196 range_t *ranges;
197 segment_t *segments;
198 int i, j;
199
200 assert(ctx != NULL && ctx->refcnt > 0);
201
202 TRACE(("idn_ucsset_fix()\n"));
203
204 nranges = ctx->nranges;
205 ranges = ctx->ranges;
206 segments = ctx->segments;
207
208 if (ctx->fixed)
209 return;
210
211 ctx->fixed = 1;
212
213 /* Initialize segment array */
214 for (i = 0; i < SEG_LEN; i++) {
215 segments[i].range_start = -1;
216 segments[i].range_end = -1;
217 }
218
219 /* If the set is empty, there's nothing to be done. */
220 if (nranges == 0)
221 return;
222
223 /* Sort ranges. */
224 qsort(ranges, nranges, sizeof(range_t), comp_range);
225
226 /* Merge overlapped/continuous ranges. */
227 for (i = 0, j = 1; j < nranges; j++) {
228 if (ranges[i].to + 1 >= ranges[j].from) {
229 /* can be merged */
230 if (ranges[i].to < ranges[j].to) {
231 ranges[i].to = ranges[j].to;
232 }
233 } else {
234 i++;
235 if (i < j)
236 ranges[i] = ranges[j];
237 }
238 }
239 /* 'i' points the last range in the array. */
240 ctx->nranges = nranges = ++i;
241
242 /* Create segment array. */
243 for (i = 0; i < nranges; i++) {
244 int fidx = SEG_INDEX(ranges[i].from);
245 int tidx = SEG_INDEX(ranges[i].to);
246
247 for (j = fidx; j <= tidx; j++) {
248 if (segments[j].range_start < 0)
249 segments[j].range_start = i;
250 segments[j].range_end = i;
251 }
252 }
253
254 #if 0
255 /*
256 * Does the standard guarantee realloc() always succeeds
257 * when shrinking?
258 */
259 /* Shrink malloc'ed space if possible. */
260 ctx->ranges = realloc(ctx->ranges, ctx->nranges * sizeof(range_t));
261 #endif
262 }
263
264 idn_result_t
idn_ucsset_lookup(idn_ucsset_t ctx,unsigned long v,int * found)265 idn_ucsset_lookup(idn_ucsset_t ctx, unsigned long v, int *found) {
266 int idx;
267 segment_t *segments;
268
269 assert(ctx != NULL && ctx->refcnt > 0 && found != NULL);
270
271 TRACE(("idn_ucsset_lookup(v=U+%lX)\n", v));
272
273 /* Make sure it is fixed. */
274 if (!ctx->fixed) {
275 WARNING(("idn_ucsset_lookup: not fixed yet\n"));
276 return (idn_failure);
277 }
278
279 /* Check the given code point. */
280 if (v >= UCS_MAX)
281 return (idn_invalid_codepoint);
282
283 /* Get the segment 'v' belongs to. */
284 segments = ctx->segments;
285 idx = SEG_INDEX(v);
286
287 /* Do binary search. */
288 *found = 0;
289 if (segments[idx].range_start >= 0) {
290 int lo = segments[idx].range_start;
291 int hi = segments[idx].range_end;
292 range_t *ranges = ctx->ranges;
293
294 while (lo <= hi) {
295 int mid = (lo + hi) / 2;
296 if (v < ranges[mid].from) {
297 hi = mid - 1;
298 } else if (v > ranges[mid].to) {
299 lo = mid + 1;
300 } else {
301 *found = 1;
302 break;
303 }
304 }
305 }
306 return (idn_success);
307 }
308
309 static idn_result_t
addrange(idn_ucsset_t ctx,unsigned long from,unsigned long to,char * func_name)310 addrange(idn_ucsset_t ctx, unsigned long from, unsigned long to,
311 char *func_name)
312 {
313 range_t *newbuf;
314
315 /* Check the given code points. */
316 if (from > UCS_MAX) {
317 WARNING(("%s: code point out of range (U+%lX)\n",
318 func_name, from));
319 return (idn_invalid_codepoint);
320 } else if (to > UCS_MAX) {
321 WARNING(("%s: code point out of range (U+%lX)\n",
322 func_name, to));
323 return (idn_invalid_codepoint);
324 } else if (from > to) {
325 WARNING(("%s: invalid range spec (U+%lX-U+%lX)\n",
326 func_name, from, to));
327 return (idn_invalid_codepoint);
328 }
329
330 /* Make sure it is not fixed yet. */
331 if (ctx->fixed) {
332 WARNING(("%s: attempt to add to already fixed object\n",
333 func_name));
334 return (idn_failure);
335 }
336
337 /* Append the specified range to the 'ranges' array. */
338 if (ctx->nranges >= ctx->size) {
339 /* Make it bigger. */
340 if (ctx->size == 0)
341 ctx->size = INIT_SIZE;
342 else
343 ctx->size *= 2;
344 newbuf = realloc(ctx->ranges, ctx->size * sizeof(range_t));
345 if (newbuf == NULL)
346 return (idn_nomemory);
347 ctx->ranges = newbuf;
348 }
349 ctx->ranges[ctx->nranges].from = from;
350 ctx->ranges[ctx->nranges].to = to;
351 ctx->nranges++;
352
353 return (idn_success);
354 }
355
356 static int
comp_range(const void * v1,const void * v2)357 comp_range(const void *v1, const void *v2) {
358 /*
359 * Range comparation function suitable for qsort().
360 */
361 const range_t *r1 = v1;
362 const range_t *r2 = v2;
363
364 if (r1->from < r2->from)
365 return (-1);
366 else if (r1->from > r2->from)
367 return (1);
368 else
369 return (0);
370 }
371