xref: /minix3/external/bsd/bind/dist/contrib/idn/idnkit-1.0-src/lib/ucsset.c (revision 00b67f09dd46474d133c95011a48590a8e8f94c7)
1 /*	$NetBSD: ucsset.c,v 1.4 2014/12/10 04:37:55 christos Exp $	*/
2 
3 #ifndef lint
4 static char *rcsid = "Id: ucsset.c,v 1.1 2003/06/04 00:26:15 marka Exp ";
5 #endif
6 
7 /*
8  * Copyright (c) 2001 Japan Network Information Center.  All rights reserved.
9  *
10  * By using this file, you agree to the terms and conditions set forth bellow.
11  *
12  * 			LICENSE TERMS AND CONDITIONS
13  *
14  * The following License Terms and Conditions apply, unless a different
15  * license is obtained from Japan Network Information Center ("JPNIC"),
16  * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
17  * Chiyoda-ku, Tokyo 101-0047, Japan.
18  *
19  * 1. Use, Modification and Redistribution (including distribution of any
20  *    modified or derived work) in source and/or binary forms is permitted
21  *    under this License Terms and Conditions.
22  *
23  * 2. Redistribution of source code must retain the copyright notices as they
24  *    appear in each source code file, this License Terms and Conditions.
25  *
26  * 3. Redistribution in binary form must reproduce the Copyright Notice,
27  *    this License Terms and Conditions, in the documentation and/or other
28  *    materials provided with the distribution.  For the purposes of binary
29  *    distribution the "Copyright Notice" refers to the following language:
30  *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
31  *
32  * 4. The name of JPNIC may not be used to endorse or promote products
33  *    derived from this Software without specific prior written approval of
34  *    JPNIC.
35  *
36  * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
37  *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
38  *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
39  *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
40  *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
41  *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
42  *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
43  *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
44  *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
45  *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
46  *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
47  */
48 
49 #include <config.h>
50 
51 #include <stdlib.h>
52 #include <stdio.h>
53 #include <string.h>
54 
55 #include <idn/result.h>
56 #include <idn/assert.h>
57 #include <idn/logmacro.h>
58 #include <idn/ucsset.h>
59 
60 #define UCS_MAX		0x80000000UL
61 
62 #define INIT_SIZE	50
63 
64 /*
65  * Code point range.
66  *
67  * The set of code points is represented by an array of code point ranges.
68  * In the building phase, specified ranges by 'idn_ucsset_add' or
69  * 'idn_ucsset_addrange' are simply appended to the array.
70  * And 'idn_ucsset_fix' sorts the array by the code point value, and also
71  * merges any intersecting ranges.  Since the array is sorted, a binary
72  * search can be used for looking up.
73  */
74 typedef struct {
75 	unsigned long from;
76 	unsigned long to;
77 } range_t;
78 
79 /*
80  * Code point segment.
81  *
82  * To speed up searching further, the entire region of UCS-4 code points
83  * (U+0000 - U+7FFFFFFF) are divided into segments. For each segment,
84  * the first and last element of the range array corresponding to the
85  * segment are computed by 'idn_ucsset_fix'.  This narrows down the
86  * (initial) search range.
87  */
88 typedef struct {
89 	int range_start;	/* index of ucsset.ranges */
90 	int range_end;		/* ditto */
91 } segment_t;
92 
93 /*
94  * Code point to segment index conversion.
95  *
96  * Below is the function that maps a code point to the corresponding segment.
97  * The mapping is non-uniform, so that BMP, the following 16 planes that
98  * comprise Unicode code points together with BMP, and other planes
99  * have different granularity.
100  */
101 #define SEG_THLD1	0x10000		/* BMP */
102 #define SEG_THLD2	0x110000	/* Unicode (BMP+16planes) */
103 #define SEG_SFT1	10		/* BMP: 1K code points/segment */
104 #define SEG_SFT2	14		/* following 16 planes: 16K cp/seg */
105 #define SEG_SFT3	24		/* rest: 16M cp/seg */
106 #define SEG_OFF1	(SEG_THLD1 >> SEG_SFT1)
107 #define SEG_OFF2	(((SEG_THLD2 - SEG_THLD1) >> SEG_SFT2) + SEG_OFF1)
108 #define SEG_INDEX(v) \
109 	(((v) < SEG_THLD1) ? ((v) >> SEG_SFT1) : \
110 	 ((v) < SEG_THLD2) ? ((((v) - SEG_THLD1) >> SEG_SFT2) + SEG_OFF1) : \
111 	 ((((v) - SEG_THLD2) >> SEG_SFT3) + SEG_OFF2))
112 #define SEG_LEN	(SEG_INDEX(UCS_MAX - 1) + 1)
113 
114 /*
115  * Representation of set of UCS code points.
116  */
117 typedef struct idn_ucsset {
118 	segment_t segments[SEG_LEN];
119 	int fixed;
120 	int size;			/* allocated size of 'ranges' */
121 	int nranges;			/* num of ranges */
122 	range_t *ranges;
123 	int refcnt;			/* reference count */
124 } ucsset;
125 
126 static idn_result_t	addrange(idn_ucsset_t ctx, unsigned long from,
127 				 unsigned long to, char *func_name);
128 static int		comp_range(const void *v1, const void *v2);
129 
130 idn_result_t
idn_ucsset_create(idn_ucsset_t * ctx)131 idn_ucsset_create(idn_ucsset_t *ctx) {
132 	idn_ucsset_t bm;
133 
134 	assert(ctx != NULL);
135 
136 	TRACE(("idn_ucsset_create()\n"));
137 
138 	if ((bm = malloc(sizeof(ucsset))) == NULL) {
139 		WARNING(("idn_ucsset_create: malloc failed\n"));
140 		return idn_nomemory;
141 	}
142 	bm->size = bm->nranges = 0;
143 	bm->ranges = NULL;
144 	bm->fixed = 0;
145 	bm->refcnt = 1;
146 	*ctx = bm;
147 	return (idn_success);
148 }
149 
150 void
idn_ucsset_destroy(idn_ucsset_t ctx)151 idn_ucsset_destroy(idn_ucsset_t ctx) {
152 	assert(ctx != NULL && ctx->refcnt > 0);
153 
154 	TRACE(("idn_ucsset_destroy()\n"));
155 
156 	if (--ctx->refcnt == 0) {
157 		if (ctx->ranges != NULL)
158 			free(ctx->ranges);
159 		free(ctx);
160 	}
161 }
162 
163 void
idn_ucsset_incrref(idn_ucsset_t ctx)164 idn_ucsset_incrref(idn_ucsset_t ctx) {
165 	assert(ctx != NULL && ctx->refcnt > 0);
166 
167 	TRACE(("idn_ucsset_incrref()\n"));
168 
169 	ctx->refcnt++;
170 }
171 
172 idn_result_t
idn_ucsset_add(idn_ucsset_t ctx,unsigned long v)173 idn_ucsset_add(idn_ucsset_t ctx, unsigned long v) {
174 	assert(ctx != NULL && ctx->refcnt > 0);
175 
176 	TRACE(("idn_ucsset_add(v=U+%lX)\n", v));
177 
178 	return (addrange(ctx, v, v, "idn_ucsset_add"));
179 }
180 
181 idn_result_t
idn_ucsset_addrange(idn_ucsset_t ctx,unsigned long from,unsigned long to)182 idn_ucsset_addrange(idn_ucsset_t ctx, unsigned long from,
183 			 unsigned long to)
184 {
185 	assert(ctx != NULL && ctx->refcnt > 0);
186 
187 	TRACE(("idn_ucsset_addrange(from=U+%lX, to=U+%lX)\n",
188 	       from, to));
189 
190 	return (addrange(ctx, from, to, "idn_ucsset_addrange"));
191 }
192 
193 void
idn_ucsset_fix(idn_ucsset_t ctx)194 idn_ucsset_fix(idn_ucsset_t ctx) {
195 	int nranges;
196 	range_t *ranges;
197 	segment_t *segments;
198 	int i, j;
199 
200 	assert(ctx != NULL && ctx->refcnt > 0);
201 
202 	TRACE(("idn_ucsset_fix()\n"));
203 
204 	nranges = ctx->nranges;
205 	ranges = ctx->ranges;
206 	segments = ctx->segments;
207 
208 	if (ctx->fixed)
209 		return;
210 
211 	ctx->fixed = 1;
212 
213 	/* Initialize segment array */
214 	for (i = 0; i < SEG_LEN; i++) {
215 		segments[i].range_start = -1;
216 		segments[i].range_end = -1;
217 	}
218 
219 	/* If the set is empty, there's nothing to be done. */
220 	if (nranges == 0)
221 		return;
222 
223 	/* Sort ranges. */
224 	qsort(ranges, nranges, sizeof(range_t), comp_range);
225 
226 	/* Merge overlapped/continuous ranges. */
227 	for (i = 0, j = 1; j < nranges; j++) {
228 		if (ranges[i].to + 1 >= ranges[j].from) {
229 			/* can be merged */
230 			if (ranges[i].to < ranges[j].to) {
231 				ranges[i].to = ranges[j].to;
232 			}
233 		} else {
234 			i++;
235 			if (i < j)
236 				ranges[i] = ranges[j];
237 		}
238 	}
239 	/* 'i' points the last range in the array. */
240 	ctx->nranges = nranges = ++i;
241 
242 	/* Create segment array. */
243 	for (i = 0; i < nranges; i++) {
244 		int fidx = SEG_INDEX(ranges[i].from);
245 		int tidx = SEG_INDEX(ranges[i].to);
246 
247 		for (j = fidx; j <= tidx; j++) {
248 			if (segments[j].range_start < 0)
249 				segments[j].range_start = i;
250 			segments[j].range_end = i;
251 		}
252 	}
253 
254 #if 0
255 	/*
256 	 * Does the standard guarantee realloc() always succeeds
257 	 * when shrinking?
258 	 */
259 	/* Shrink malloc'ed space if possible. */
260 	ctx->ranges = realloc(ctx->ranges, ctx->nranges * sizeof(range_t));
261 #endif
262 }
263 
264 idn_result_t
idn_ucsset_lookup(idn_ucsset_t ctx,unsigned long v,int * found)265 idn_ucsset_lookup(idn_ucsset_t ctx, unsigned long v, int *found) {
266 	int idx;
267 	segment_t *segments;
268 
269 	assert(ctx != NULL && ctx->refcnt > 0 && found != NULL);
270 
271 	TRACE(("idn_ucsset_lookup(v=U+%lX)\n", v));
272 
273 	/* Make sure it is fixed. */
274 	if (!ctx->fixed) {
275 		WARNING(("idn_ucsset_lookup: not fixed yet\n"));
276 		return (idn_failure);
277 	}
278 
279 	/* Check the given code point. */
280 	if (v >= UCS_MAX)
281 		return (idn_invalid_codepoint);
282 
283 	/* Get the segment 'v' belongs to. */
284 	segments = ctx->segments;
285 	idx = SEG_INDEX(v);
286 
287 	/* Do binary search. */
288 	*found = 0;
289 	if (segments[idx].range_start >= 0) {
290 		int lo = segments[idx].range_start;
291 		int hi = segments[idx].range_end;
292 		range_t *ranges = ctx->ranges;
293 
294 		while (lo <= hi) {
295 			int mid = (lo + hi) / 2;
296 			if (v < ranges[mid].from) {
297 				hi = mid - 1;
298 			} else if (v > ranges[mid].to) {
299 				lo = mid + 1;
300 			} else {
301 				*found = 1;
302 				break;
303 			}
304 		}
305 	}
306 	return (idn_success);
307 }
308 
309 static idn_result_t
addrange(idn_ucsset_t ctx,unsigned long from,unsigned long to,char * func_name)310 addrange(idn_ucsset_t ctx, unsigned long from, unsigned long to,
311 	 char *func_name)
312 {
313 	range_t *newbuf;
314 
315 	/* Check the given code points. */
316 	if (from > UCS_MAX) {
317 		WARNING(("%s: code point out of range (U+%lX)\n",
318 			 func_name, from));
319 		return (idn_invalid_codepoint);
320 	} else if (to > UCS_MAX) {
321 		WARNING(("%s: code point out of range (U+%lX)\n",
322 			 func_name, to));
323 		return (idn_invalid_codepoint);
324 	} else if (from > to) {
325 		WARNING(("%s: invalid range spec (U+%lX-U+%lX)\n",
326 			 func_name, from, to));
327 		return (idn_invalid_codepoint);
328 	}
329 
330 	/* Make sure it is not fixed yet. */
331 	if (ctx->fixed) {
332 		WARNING(("%s: attempt to add to already fixed object\n",
333 			 func_name));
334 		return (idn_failure);
335 	}
336 
337 	/* Append the specified range to the 'ranges' array. */
338 	if (ctx->nranges >= ctx->size) {
339 		/* Make it bigger. */
340 		if (ctx->size == 0)
341 			ctx->size = INIT_SIZE;
342 		else
343 			ctx->size *= 2;
344 		newbuf = realloc(ctx->ranges, ctx->size * sizeof(range_t));
345 		if (newbuf == NULL)
346 			return (idn_nomemory);
347 		ctx->ranges = newbuf;
348 	}
349 	ctx->ranges[ctx->nranges].from = from;
350 	ctx->ranges[ctx->nranges].to = to;
351 	ctx->nranges++;
352 
353 	return (idn_success);
354 }
355 
356 static int
comp_range(const void * v1,const void * v2)357 comp_range(const void *v1, const void *v2) {
358 	/*
359 	 * Range comparation function suitable for qsort().
360 	 */
361 	const range_t *r1 = v1;
362 	const range_t *r2 = v2;
363 
364 	if (r1->from < r2->from)
365 		return (-1);
366 	else if (r1->from > r2->from)
367 		return (1);
368 	else
369 		return (0);
370 }
371