xref: /netbsd-src/external/bsd/openldap/dist/libraries/libldap/t61.c (revision 549b59ed3ccf0d36d3097190a0db27b770f3a839)
1 /*	$NetBSD: t61.c,v 1.3 2021/08/14 16:14:56 christos Exp $	*/
2 
3 /* $OpenLDAP$ */
4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5  *
6  * Copyright 2002-2021 The OpenLDAP Foundation.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted only as authorized by the OpenLDAP
11  * Public License.
12  *
13  * A copy of this license is available in the file LICENSE in the
14  * top-level directory of the distribution or, alternatively, at
15  * <http://www.OpenLDAP.org/license.html>.
16  */
17 /* ACKNOWLEDGEMENTS:
18  * This work was initially developed by Howard Chu for inclusion in
19  * OpenLDAP Software.
20  */
21 
22 /*
23  * Basic T.61 <-> UTF-8 conversion
24  *
25  * These routines will perform a lossless translation from T.61 to UTF-8
26  * and a lossy translation from UTF-8 to T.61.
27  */
28 
29 #include <sys/cdefs.h>
30 __RCSID("$NetBSD: t61.c,v 1.3 2021/08/14 16:14:56 christos Exp $");
31 
32 #include "portable.h"
33 
34 #include <stdio.h>
35 
36 #include <ac/stdlib.h>
37 
38 #include <ac/socket.h>
39 #include <ac/string.h>
40 #include <ac/time.h>
41 
42 #include "ldap-int.h"
43 #include "ldap_utf8.h"
44 
45 #include "ldap_defaults.h"
46 
47 /*
48  * T.61 is somewhat braindead; even in the 7-bit space it is not
49  * completely equivalent to 7-bit US-ASCII. Our definition of the
50  * character set comes from RFC 1345 with a slightly more readable
51  * rendition at http://std.dkuug.dk/i18n/charmaps/T.61-8BIT.
52  *
53  * Even though '#' and '$' are present in the 7-bit US-ASCII space,
54  * (x23 and x24, resp.) in T.61 they are mapped to 8-bit characters
55  * xA6 and xA4.
56  *
57  * Also T.61 lacks
58  *	backslash 	\	(x5C)
59  *	caret		^	(x5E)
60  *	backquote	`	(x60)
61  *	left brace	{	(x7B)
62  *	right brace	}	(x7D)
63  *	tilde		~	(x7E)
64  *
65  * In T.61, the codes xC1 to xCF (excluding xC9, unused) are non-spacing
66  * accents of some form or another. There are predefined combinations
67  * for certain characters, but they can also be used arbitrarily. The
68  * table at dkuug.dk maps these accents to the E000 "private use" range
69  * of the Unicode space, but I believe they more properly belong in the
70  * 0300 range (non-spacing accents). The transformation is complicated
71  * slightly because Unicode wants the non-spacing character to follow
72  * the base character, while T.61 has the non-spacing character leading.
73  * Also, T.61 specifically recognizes certain combined pairs as "characters"
74  * but doesn't specify how to treat unrecognized pairs. This code will
75  * always attempt to combine pairs when a known Unicode composite exists.
76  */
77 
/* Forward table: the index is a T.61 byte value and the entry is the
 * Unicode code point it maps to. An entry of 0 marks a byte with no
 * mapping; ldap_t61s_valid() and ldap_t61s_to_utf8s() reject such bytes.
 * Note that index 0x00 itself holds 0, so a NUL byte is rejected too.
 */
78 static const wchar_t t61_tab[] = {
79 	0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007,
80 	0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f,
81 	0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017,
82 	0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f,
83 	0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027,	/* x23, x24 unmapped */
84 	0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f,
85 	0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037,
86 	0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f,
87 	0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047,
88 	0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f,
89 	0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057,
90 	0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f,	/* x5c, x5e unmapped */
91 	0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067,	/* x60 unmapped */
92 	0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f,
93 	0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077,
94 	0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f,	/* x7b, x7d, x7e unmapped */
95 	0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087,
96 	0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f,
97 	0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097,
98 	0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f,
99 	0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7,	/* xa4 -> '$', xa6 -> '#' */
100 	0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000,
101 	0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7,
102 	0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf,
103 	0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307,	/* xc1-xcf: accents in the 03xx range */
104 	0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c,
105 	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
106 	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
107 	0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f,
108 	0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149,
109 	0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140,
110 	0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000
111 };
112 
/* Fixed-length rows of code points, used as the element types of the
 * lookup tables in this file.
 */
113 typedef wchar_t wvec16[16];
114 typedef wchar_t wvec32[32];
115 typedef wchar_t wvec64[64];
116 
117 /* Substitutions when 0xc1-0xcf appears by itself or with space 0x20 */
/* Indexed by the low nibble of the accent byte (*c & 0x0f in
 * ldap_t61s_to_utf8s). A 0 entry means there is no spacing substitute
 * for that accent.
 */
118 static const wvec16 accents = {
119 	0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
120 	0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7};
121 
122 /* In the following tables, base characters commented in (parentheses)
123  * are not defined by T.61 but are mapped anyway since their Unicode
124  * composite exists.
125  */
126 
127 /* Grave accented chars AEIOU (NWY) */
/* Each row is indexed by the low five bits of the base byte (c[1] & 0x1f);
 * an entry of 0 means no precomposed code point exists for that pair.
 */
128 static const wvec32 c1_vec1 = {
129 	/* Upper case */
130 	0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2,
131 	0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0};
132 static const wvec32 c1_vec2 = {
133 	/* Lower case */
134 	0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2,
135 	0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0};
136 
/* Row selector, indexed by the top three bits of the base byte (c[1] >> 5):
 * slot 2 covers bytes 0x40-0x5f and slot 3 covers 0x60-0x7f.
 */
137 static const wvec32 *c1_grave[] = {
138 	NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL
139 };
140 
141 /* Acute accented chars AEIOUYCLNRSZ (GKMPW) */
/* Each row is indexed by the low five bits of the base byte (c[1] & 0x1f);
 * an entry of 0 means no precomposed code point exists for that pair.
 */
142 static const wvec32 c2_vec1 = {
143 	/* Upper case */
144 	0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4,
145 	0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3,
146 	0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82,
147 	0, 0xdd, 0x179, 0, 0, 0, 0, 0};
148 static const wvec32 c2_vec2 = {
149 	/* Lower case */
150 	0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5,
151 	0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3,
152 	0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83,
153 	0, 0xfd, 0x17a, 0, 0, 0, 0, 0};
154 static const wvec32 c2_vec3 = {
155 	/* (AE and ae) */
156 	0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
157 	0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
158 
/* Row selector, indexed by the top three bits of the base byte (c[1] >> 5):
 * slot 2 covers 0x40-0x5f, slot 3 covers 0x60-0x7f and slot 7 covers
 * 0xe0-0xff (the T.61 AE/ae bytes 0xe1 and 0xf1).
 */
159 static const wvec32 *c2_acute[] = {
160 	NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3
161 };
162 
163 /* Circumflex AEIOUYCGHJSW (Z) */
/* Each row is indexed by the low five bits of the base byte (c[1] & 0x1f);
 * an entry of 0 means no precomposed code point exists for that pair.
 */
164 static const wvec32 c3_vec1 = {
165 	/* Upper case */
166 	0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c,
167 	0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4,
168 	0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174,
169 	0, 0x176, 0x1e90, 0, 0, 0, 0, 0};
170 static const wvec32 c3_vec2 = {
171 	/* Lower case */
172 	0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d,
173 	0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4,
174 	0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175,
175 	0, 0x177, 0x1e91, 0, 0, 0, 0, 0};
/* Row selector, indexed by c[1] >> 5: slot 2 is 0x40-0x5f, slot 3 is 0x60-0x7f. */
176 static const wvec32 *c3_circumflex[] = {
177 	NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL
178 };
179 
180 /* Tilde AIOUN (EVY) */
/* Each row is indexed by the low five bits of the base byte (c[1] & 0x1f);
 * an entry of 0 means no precomposed code point exists for that pair.
 */
181 static const wvec32 c4_vec1 = {
182 	/* Upper case */
183 	0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
184 	0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0};
185 static const wvec32 c4_vec2 = {
186 	/* Lower case */
187 	0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
188 	0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0};
/* Row selector, indexed by c[1] >> 5: slot 2 is 0x40-0x5f, slot 3 is 0x60-0x7f. */
189 static const wvec32 *c4_tilde[] = {
190 	NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL
191 };
192 
193 /* Macron AEIOU (YG) */
/* Each row is indexed by the low five bits of the base byte (c[1] & 0x1f);
 * an entry of 0 means no precomposed code point exists for that pair.
 */
194 static const wvec32 c5_vec1 = {
195 	/* Upper case */
196 	0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c,
197 	0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0};
198 static const wvec32 c5_vec2 = {
199 	/* Lower case */
200 	0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d,
201 	0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0};
202 static const wvec32 c5_vec3 = {
203 	/* (AE and ae) */
204 	0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
205 	0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector, indexed by c[1] >> 5: slot 2 is 0x40-0x5f, slot 3 is
 * 0x60-0x7f and slot 7 is 0xe0-0xff (the T.61 AE/ae bytes).
 */
206 static const wvec32 *c5_macron[] = {
207 	NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3
208 };
209 
210 /* Breve AUG (EIO) */
/* Each row is indexed by the low five bits of the base byte (c[1] & 0x1f);
 * an entry of 0 means no precomposed code point exists for that pair.
 */
211 static const wvec32 c6_vec1 = {
212 	/* Upper case */
213 	0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e,
214 	0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
215 static const wvec32 c6_vec2 = {
216 	/* Lower case */
217 	0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f,
218 	0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector, indexed by c[1] >> 5: slot 2 is 0x40-0x5f, slot 3 is 0x60-0x7f. */
219 static const wvec32 *c6_breve[] = {
220 	NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL
221 };
222 
223 /* Dot Above CEGIZ (AOBDFHMNPRSTWXY) */
/* Each row is indexed by the low five bits of the base byte (c[1] & 0x1f);
 * an entry of 0 means no precomposed code point exists for that pair.
 */
224 static const wvec32 c7_vec1 = {
225 	/* Upper case */
226 	0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120,
227 	0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e,
228 	0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86,
229 	0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0};
230 static const wvec32 c7_vec2 = {
231 	/* Lower case */
232 	0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121,
233 	0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f,
234 	0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87,
235 	0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0};
/* Row selector, indexed by c[1] >> 5: slot 2 is 0x40-0x5f, slot 3 is 0x60-0x7f. */
236 static const wvec32 *c7_dotabove[] = {
237 	NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL
238 };
239 
240 /* Diaeresis AEIOUY (HWXt) */
/* Each row is indexed by the low five bits of the base byte (c[1] & 0x1f);
 * an entry of 0 means no precomposed code point exists for that pair.
 */
241 static const wvec32 c8_vec1 = {
242 	/* Upper case */
243 	0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6,
244 	0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0};
245 static const wvec32 c8_vec2 = {
246 	/* Lower case */
247 	0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6,
248 	0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0};
/* Row selector, indexed by c[1] >> 5: slot 2 is 0x40-0x5f, slot 3 is 0x60-0x7f. */
249 static const wvec32 *c8_diaeresis[] = {
250 	NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL
251 };
252 
253 /* Ring Above AU (wy) */
/* Each row is indexed by the low five bits of the base byte (c[1] & 0x1f);
 * an entry of 0 means no precomposed code point exists for that pair.
 */
254 static const wvec32 ca_vec1 = {
255 	/* Upper case */
256 	0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
257 	0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
258 static const wvec32 ca_vec2 = {
259 	/* Lower case */
260 	0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
261 	0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0};
/* Row selector, indexed by c[1] >> 5: slot 2 is 0x40-0x5f, slot 3 is 0x60-0x7f. */
262 static const wvec32 *ca_ringabove[] = {
263 	NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL
264 };
265 
266 /* Cedilla CGKLNRST (EDH) */
/* Each row is indexed by the low five bits of the base byte (c[1] & 0x1f);
 * an entry of 0 means no precomposed code point exists for that pair.
 */
267 static const wvec32 cb_vec1 = {
268 	/* Upper case */
269 	0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122,
270 	0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0,
271 	0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
272 static const wvec32 cb_vec2 = {
273 	/* Lower case */
274 	0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123,
275 	0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0,
276 	0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector, indexed by c[1] >> 5: slot 2 is 0x40-0x5f, slot 3 is 0x60-0x7f. */
277 static const wvec32 *cb_cedilla[] = {
278 	NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL
279 };
280 
281 /* Double Acute Accent OU */
/* Each row is indexed by the low five bits of the base byte (c[1] & 0x1f);
 * an entry of 0 means no precomposed code point exists for that pair.
 */
282 static const wvec32 cd_vec1 = {
283 	/* Upper case */
284 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150,
285 	0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
286 static const wvec32 cd_vec2 = {
287 	/* Lower case */
288 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151,
289 	0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector, indexed by c[1] >> 5: slot 2 is 0x40-0x5f, slot 3 is 0x60-0x7f. */
290 static const wvec32 *cd_doubleacute[] = {
291 	NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL
292 };
293 
294 /* Ogonek AEIU (O) */
/* Each row is indexed by the low five bits of the base byte (c[1] & 0x1f);
 * an entry of 0 means no precomposed code point exists for that pair.
 */
295 static const wvec32 ce_vec1 = {
296 	/* Upper case */
297 	0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea,
298 	0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
299 static const wvec32 ce_vec2 = {
300 	/* Lower case */
301 	0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb,
302 	0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector, indexed by c[1] >> 5: slot 2 is 0x40-0x5f, slot 3 is 0x60-0x7f. */
303 static const wvec32 *ce_ogonek[] = {
304 	NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL
305 };
306 
307 /* Caron CDELNRSTZ (AIOUGKjH) */
/* Each row is indexed by the low five bits of the base byte (c[1] & 0x1f);
 * an entry of 0 means no precomposed code point exists for that pair.
 */
308 static const wvec32 cf_vec1 = {
309 	/* Upper case */
310 	0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6,
311 	0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1,
312 	0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0,
313 	0, 0, 0x17d, 0, 0, 0, 0, 0};
314 static const wvec32 cf_vec2 = {
315 	/* Lower case */
316 	0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7,
317 	0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2,
318 	0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0,
319 	0, 0, 0x17e, 0, 0, 0, 0, 0};
/* Row selector, indexed by c[1] >> 5: slot 2 is 0x40-0x5f, slot 3 is 0x60-0x7f. */
320 static const wvec32 *cf_caron[] = {
321 	NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL
322 };
323 
/* Top-level composite lookup, indexed by the low nibble of the accent byte
 * (0xc0-0xcf). A NULL slot means no composite table exists for that accent;
 * the full lookup is (*cx_tab[accent & 0x0f][base >> 5])[base & 0x1f].
 */
324 static const wvec32 **cx_tab[] = {
325 	NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron,
326 	c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove,
327 	cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron };
328 
ldap_t61s_valid(struct berval * str)329 int ldap_t61s_valid( struct berval *str )
330 {
331 	unsigned char *c = (unsigned char *)str->bv_val;
332 	int i;
333 
334 	for (i=0; i < str->bv_len; c++,i++)
335 		if (!t61_tab[*c])
336 			return 0;
337 	return 1;
338 }
339 
340 /* Transform a T.61 string to UTF-8.
341  */
ldap_t61s_to_utf8s(struct berval * src,struct berval * dst)342 int ldap_t61s_to_utf8s( struct berval *src, struct berval *dst )
343 {
344 	unsigned char *c;
345 	char *d;
346 	int i, wlen = 0;
347 
348 	/* Just count the length of the UTF-8 result first */
349 	for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
350 		/* Invalid T.61 characters? */
351 		if (!t61_tab[*c])
352 			return LDAP_INVALID_SYNTAX;
353 		if ((*c & 0xf0) == 0xc0) {
354 			int j = *c & 0x0f;
355 			/* If this is the end of the string, or if the base
356 			 * character is just a space, treat this as a regular
357 			 * spacing character.
358 			 */
359 			if ((!c[1] || c[1] == 0x20) && accents[j]) {
360 				wlen += ldap_x_wc_to_utf8(NULL, accents[j], 0);
361 			} else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
362 			/* We have a composite mapping for this pair */
363 				(*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
364 				wlen += ldap_x_wc_to_utf8( NULL,
365 					(*cx_tab[j][c[1]>>5])[c[1]&0x1f], 0);
366 			} else {
367 			/* No mapping, just swap it around so the base
368 			 * character comes first.
369 			 */
370 			 	wlen += ldap_x_wc_to_utf8(NULL, c[1], 0);
371 				wlen += ldap_x_wc_to_utf8(NULL,
372 					t61_tab[*c], 0);
373 			}
374 			c++; i++;
375 			continue;
376 		} else {
377 			wlen += ldap_x_wc_to_utf8(NULL, t61_tab[*c], 0);
378 		}
379 	}
380 
381 	/* Now transform the string */
382 	dst->bv_len = wlen;
383 	dst->bv_val = LDAP_MALLOC( wlen+1 );
384 	d = dst->bv_val;
385 	if (!d)
386 		return LDAP_NO_MEMORY;
387 
388 	for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
389 		if ((*c & 0xf0) == 0xc0) {
390 			int j = *c & 0x0f;
391 			/* If this is the end of the string, or if the base
392 			 * character is just a space, treat this as a regular
393 			 * spacing character.
394 			 */
395 			if ((!c[1] || c[1] == 0x20) && accents[j]) {
396 				d += ldap_x_wc_to_utf8(d, accents[j], 6);
397 			} else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
398 			/* We have a composite mapping for this pair */
399 				(*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
400 				d += ldap_x_wc_to_utf8(d,
401 				(*cx_tab[j][c[1]>>5])[c[1]&0x1f], 6);
402 			} else {
403 			/* No mapping, just swap it around so the base
404 			 * character comes first.
405 			 */
406 				d += ldap_x_wc_to_utf8(d, c[1], 6);
407 				d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
408 			}
409 			c++; i++;
410 			continue;
411 		} else {
412 			d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
413 		}
414 	}
415 	*d = '\0';
416 	return LDAP_SUCCESS;
417 }
418 
419 /* For the reverse mapping, we just pay attention to the Latin-oriented
420  * code blocks. These are
421  *	0000 - 007f Basic Latin
422  *	0080 - 00ff Latin-1 Supplement
423  *	0100 - 017f Latin Extended-A
424  *	0180 - 024f Latin Extended-B
425  *	1e00 - 1eff Latin Extended Additional
426  *
427  * We have a special case to map Ohm U2126 back to T.61 0xe0. All other
428  * unrecognized characters are replaced with '?' 0x3f.
429  */
430 
/* Reverse table for U+0000 - U+003F (slot 0 of wc00). Each entry is the
 * T.61 encoding of one code point: the low byte is always written, and a
 * nonzero high byte is written before it.
 */
431 static const wvec64 u000 = {
432 	0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
433 	0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
434 	0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
435 	0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
436 	0x0020, 0x0021, 0x0022, 0x00a6, 0x00a4, 0x0025, 0x0026, 0x0027,
437 	0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
438 	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
439 	0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f};
440 
441 /* In this range, we've mapped caret to xc3/x20, backquote to xc1/x20,
442  * and tilde to xc4/x20. T.61 (stupidly!) doesn't define these characters
443  * on their own, even though it provides them as combiners for other
444  * letters. T.61 doesn't define these pairings either, so this may just
445  * have to be replaced with '?' 0x3f if other software can't cope with it.
446  */
/* Reverse table for U+0040 - U+007F (slot 1 of wc00). An entry above 0xff
 * is a two-byte T.61 sequence, high byte first; 0x003f is '?'.
 */
447 static const wvec64 u001 = {
448 	0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
449 	0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
450 	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
451 	0x0058, 0x0059, 0x005a, 0x005b, 0x003f, 0x005d, 0xc320, 0x005f,
452 	0xc120, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
453 	0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
454 	0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
455 	0x0078, 0x0079, 0x007a, 0x003f, 0x007c, 0x003f, 0xc420, 0x007f};
456 
/* Reverse table for U+0080 - U+00BF (slot 2 of wc00). An entry above 0xff
 * is a two-byte T.61 sequence, high byte first; 0x003f is '?'.
 */
457 static const wvec64 u002 = {
458 	0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
459 	0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
460 	0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
461 	0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
462 	0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a8, 0x00a5, 0x003f, 0x00a7,
463 	0xc820, 0x003f, 0x00e3, 0x00ab, 0x003f, 0x003f, 0x003f, 0xc520,
464 	0x00b0, 0x00b1, 0x00b2, 0x00b3, 0xc220, 0x00b5, 0x00b6, 0x00b7,
465 	0xcb20, 0x003f, 0x00eb, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf};
466 
/* Reverse table for U+00C0 - U+00FF (slot 3 of wc00). Most entries are
 * accent/base pairs: the high byte is the T.61 accent (0xc1-0xcf) and the
 * low byte is the base letter, in T.61 order (accent first).
 */
467 static const wvec64 u003 = {
468 	0xc141, 0xc241, 0xc341, 0xc441, 0xc841, 0xca41, 0x00e1, 0xcb43,
469 	0xc145, 0xc245, 0xc345, 0xc845, 0xc149, 0xc249, 0xc349, 0xc849,
470 	0x00e2, 0xc44e, 0xc14f, 0xc24f, 0xc34f, 0xc44f, 0xc84f, 0x00b4,
471 	0x00e9, 0xc155, 0xc255, 0xc355, 0xc855, 0xc259, 0x00ec, 0x00fb,
472 	0xc161, 0xc261, 0xc361, 0xc461, 0xc861, 0xca61, 0x00f1, 0xcb63,
473 	0xc165, 0xc265, 0xc365, 0xc865, 0xc169, 0xc269, 0xc369, 0xc869,
474 	0x00f3, 0xc46e, 0xc16f, 0xc26f, 0xc36f, 0xc46f, 0xc86f, 0x00b8,
475 	0x00f9, 0xc175, 0xc275, 0xc375, 0xc875, 0xc279, 0x00fc, 0xc879};
476 
477 /* These codes are used here but not defined by T.61:
478  * x114 = xc6/x45, x115 = xc6/x65, x12c = xc6/x49, x12d = xc6/x69
479  */
/* Reverse table for U+0100 - U+013F (slot 4 of wc00). An entry above 0xff
 * is an accent/base pair, accent byte first; 0x003f is '?'.
 */
480 static const wvec64 u010 = {
481 	0xc541, 0xc561, 0xc641, 0xc661, 0xce41, 0xce61, 0xc243, 0xc263,
482 	0xc343, 0xc363, 0xc743, 0xc763, 0xcf43, 0xcf63, 0xcf44, 0xcf64,
483 	0x003f, 0x00f2, 0xc545, 0xc565, 0xc645, 0xc665, 0xc745, 0xc765,
484 	0xce45, 0xce65, 0xcf45, 0xcf65, 0xc347, 0xc367, 0xc647, 0xc667,
485 	0xc747, 0xc767, 0xcb47, 0xcb67, 0xc348, 0xc368, 0x00e4, 0x00f4,
486 	0xc449, 0xc469, 0xc549, 0xc569, 0xc649, 0xc669, 0xce49, 0xce69,
487 	0xc749, 0x00f5, 0x00e6, 0x00f6, 0xc34a, 0xc36a, 0xcb4b, 0xcb6b,
488 	0x00f0, 0xc24c, 0xc26c, 0xcb4c, 0xcb6c, 0xcf4c, 0xcf6c, 0x00e7};
489 
490 /* These codes are used here but not defined by T.61:
491  * x14e = xc6/x4f, x14f = xc6/x6f
492  */
/* Reverse table for U+0140 - U+017F (slot 5 of wc00). An entry above 0xff
 * is an accent/base pair, accent byte first; 0x003f is '?'.
 */
493 static const wvec64 u011 = {
494 	0x00f7, 0x00e8, 0x00f8, 0xc24e, 0xc26e, 0xcb4e, 0xcb6e, 0xcf4e,
495 	0xcf6e, 0x00ef, 0x00ee, 0x00fe, 0xc54f, 0xc56f, 0xc64f, 0xc66f,
496 	0xcd4f, 0xcd6f, 0x00ea, 0x00fa, 0xc252, 0xc272, 0xcb52, 0xcb72,
497 	0xcf52, 0xcf72, 0xc253, 0xc273, 0xc353, 0xc373, 0xcb53, 0xcb73,
498 	0xcf53, 0xcf73, 0xcb54, 0xcb74, 0xcf54, 0xcf74, 0x00ed, 0x00fd,
499 	0xc455, 0xc475, 0xc555, 0xc575, 0xc655, 0xc675, 0xca55, 0xca75,
500 	0xcd55, 0xcd75, 0xce55, 0xce75, 0xc357, 0xc377, 0xc359, 0xc379,
501 	0xc859, 0xc25a, 0xc27a, 0xc75a, 0xc77a, 0xcf5a, 0xcf7a, 0x003f};
502 
503 /* All of the codes in this block are undefined in T.61.
504  */
505 static const wvec64 u013 = {
506 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
507 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf41, 0xcf61, 0xcf49,
508 	0xcf69, 0xcf4f, 0xcf6f, 0xcf55, 0xcf75, 0x003f, 0x003f, 0x003f,
509 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
510 	0x003f, 0x003f, 0xc5e1, 0xc5f1, 0x003f, 0x003f, 0xcf47, 0xcf67,
511 	0xcf4b, 0xcf6b, 0xce4f, 0xce6f, 0x003f, 0x003f, 0x003f, 0x003f,
512 	0xcf6a, 0x003f, 0x003f, 0x003f, 0xc247, 0xc267, 0x003f, 0x003f,
513 	0xc14e, 0xc16e, 0x003f, 0x003f, 0xc2e1, 0xc2f1, 0x003f, 0x003f};
514 
515 /* All of the codes in this block are undefined in T.61.
516  */
517 static const wvec64 u020 = {
518 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
519 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
520 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
521 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf48, 0xcf68,
522 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc741, 0xc761,
523 	0xcb45, 0xcb65, 0x003f, 0x003f, 0x003f, 0x003f, 0xc74f, 0xc76f,
524 	0x003f, 0x003f, 0xc559, 0xc579, 0x003f, 0x003f, 0x003f, 0x003f,
525 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
526 
527 static const wvec64 u023 = {
528 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf20,
529 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
530 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
531 	0xc620, 0xc720, 0xca20, 0xce20, 0x003f, 0xcd20, 0x003f, 0x003f,
532 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
533 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
534 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
535 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
536 
537 /* These are the non-spacing characters by themselves. They should
538  * never appear by themselves in actual text.
539  */
540 static const wvec64 u030 = {
541 	0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x003f, 0x00c6, 0x00c7,
542 	0x00c8, 0x003f, 0x00ca, 0x00cd, 0x00cf, 0x003f, 0x003f, 0x003f,
543 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
544 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
545 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x00cb,
546 	0x00ce, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
547 	0x003f, 0x003f, 0x00cc, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
548 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
549 
550 /* None of the following blocks are defined in T.61.
551  */
552 static const wvec64 u1e0 = {
553 	0x003f, 0x003f, 0xc742, 0xc762, 0x003f, 0x003f, 0x003f, 0x003f,
554 	0x003f, 0x003f, 0xc744, 0xc764, 0x003f, 0x003f, 0x003f, 0x003f,
555 	0xcb44, 0xcb64, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
556 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc746, 0xc766,
557 	0xc547, 0xc567, 0xc748, 0xc768, 0x003f, 0x003f, 0xc848, 0xc868,
558 	0xcb48, 0xcb68, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
559 	0xc24b, 0xc26b, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
560 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc24d, 0xc26d,
561 };
562 
563 static const wvec64 u1e1 = {
564 	0xc74d, 0xc76d, 0x003f, 0x003f, 0xc74e, 0xc76e, 0x003f, 0x003f,
565 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
566 	0x003f, 0x003f, 0x003f, 0x003f, 0xc250, 0xc270, 0xc750, 0xc770,
567 	0xc752, 0xc772, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
568 	0xc753, 0xc773, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
569 	0x003f, 0x003f, 0xc754, 0xc774, 0x003f, 0x003f, 0x003f, 0x003f,
570 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
571 	0x003f, 0x003f, 0x003f, 0x003f, 0xc456, 0xc476, 0x003f, 0x003f,
572 };
573 
574 static const wvec64 u1e2 = {
575 	0xc157, 0xc177, 0xc257, 0xc277, 0xc857, 0xc877, 0xc757, 0xc777,
576 	0x003f, 0x003f, 0xc758, 0xc778, 0xc858, 0xc878, 0xc759, 0xc779,
577 	0xc35a, 0xc37a, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc874,
578 	0xca77, 0xca79, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
579 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
580 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
581 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
582 	0x003f, 0x003f, 0x003f, 0x003f, 0xc445, 0xc465, 0x003f, 0x003f,
583 };
584 
585 static const wvec64 u1e3 = {
586 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
587 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
588 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
589 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
590 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
591 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
592 	0x003f, 0x003f, 0xc159, 0xc179, 0x003f, 0x003f, 0x003f, 0x003f,
593 	0xc459, 0xc479, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
594 };
595 
596 static const wvec64 *wc00[] = {
597 	&u000, &u001, &u002, &u003,
598 	&u010, &u011, NULL, &u013,
599 	&u020, NULL, NULL, &u023,
600 	&u030, NULL, NULL, NULL};
601 
602 static const wvec64 *wc1e[] = {
603 	&u1e0, &u1e1, &u1e2, &u1e3};
604 
605 
ldap_utf8s_to_t61s(struct berval * src,struct berval * dst)606 int ldap_utf8s_to_t61s( struct berval *src, struct berval *dst )
607 {
608 	char *c, *d;
609 	wchar_t tmp;
610 	int i, j, tlen = 0;
611 
612 	/* Just count the length of the T.61 result first */
613 	for (i=0,c=src->bv_val; i < src->bv_len;) {
614 		j = ldap_x_utf8_to_wc( &tmp, c );
615 		if (j == -1)
616 			return LDAP_INVALID_SYNTAX;
617 		switch (tmp >> 8) {
618 		case 0x00:
619 		case 0x01:
620 		case 0x02:
621 		case 0x03:
622 			if (wc00[tmp >> 6] &&
623 				((*wc00[tmp >> 6])[tmp & 0x3f] & 0xff00)) {
624 				tlen++;
625 			}
626 			tlen++;
627 			break;
628 		case 0x1e:
629 			if ((*wc1e[(tmp >> 6) & 3])[tmp & 0x3f] & 0xff00) {
630 				tlen++;
631 			}
632 		case 0x21:
633 		default:
634 			tlen ++;
635 			break;
636 		}
637 		i += j;
638 		c += j;
639 	}
640 	dst->bv_len = tlen;
641 	dst->bv_val = LDAP_MALLOC( tlen+1 );
642 	if (!dst->bv_val)
643 		return LDAP_NO_MEMORY;
644 
645 	d = dst->bv_val;
646 	for (i=0,c=src->bv_val; i < src->bv_len;) {
647 		j = ldap_x_utf8_to_wc( &tmp, c );
648 		switch (tmp >> 8) {
649 		case 0x00:
650 		case 0x01:
651 		case 0x02:
652 			if (wc00[tmp >> 6]) {
653 				tmp = (*wc00[tmp >> 6])[tmp & 0x3f];
654 				if (tmp & 0xff00)
655 					*d++ = (tmp >> 8);
656 				*d++ = tmp & 0xff;
657 			} else {
658 				*d++ = 0x3f;
659 			}
660 			break;
661 		case 0x03:
662 			/* swap order of non-spacing characters */
663 			if (wc00[tmp >> 6]) {
664 				wchar_t t2 = (*wc00[tmp >> 6])[tmp & 0x3f];
665 				if (t2 != 0x3f) {
666 					d[0] = d[-1];
667 					d[-1] = t2;
668 					d++;
669 				} else {
670 					*d++ = 0x3f;
671 				}
672 			} else {
673 				*d++ = 0x3f;
674 			}
675 			break;
676 		case 0x1e:
677 			tmp = (*wc1e[(tmp >> 6) & 3])[tmp & 0x3f];
678 			if (tmp & 0xff00)
679 				*d++ = (tmp >> 8);
680 			*d++ = tmp & 0xff;
681 			break;
682 		case 0x21:
683 			if (tmp == 0x2126) {
684 				*d++ = 0xe0;
685 				break;
686 			}
687 			/* FALLTHRU */
688 		default:
689 			*d++ = 0x3f;
690 			break;
691 		}
692 		i += j;
693 		c += j;
694 	}
695 	*d = '\0';
696 	return LDAP_SUCCESS;
697 }
698