xref: /netbsd-src/external/bsd/openldap/dist/libraries/libldap/t61.c (revision b1c86f5f087524e68db12794ee9c3e3da1ab17a0)
1 /*	$NetBSD: t61.c,v 1.1.1.3 2010/03/08 02:14:20 lukem Exp $	*/
2 
3 /* OpenLDAP: pkg/ldap/libraries/libldap/t61.c,v 1.9.2.5 2009/01/22 00:00:56 kurt Exp */
4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5  *
6  * Copyright 2002-2009 The OpenLDAP Foundation.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted only as authorized by the OpenLDAP
11  * Public License.
12  *
13  * A copy of this license is available in the file LICENSE in the
14  * top-level directory of the distribution or, alternatively, at
15  * <http://www.OpenLDAP.org/license.html>.
16  */
17 /* ACKNOWLEDGEMENTS:
18  * This work was initially developed by Howard Chu for inclusion in
19  * OpenLDAP Software.
20  */
21 
22 /*
23  * Basic T.61 <-> UTF-8 conversion
24  *
25  * These routines will perform a lossless translation from T.61 to UTF-8
26  * and a lossy translation from UTF-8 to T.61.
27  */
28 
29 #include "portable.h"
30 
31 #include <stdio.h>
32 
33 #include <ac/stdlib.h>
34 
35 #include <ac/socket.h>
36 #include <ac/string.h>
37 #include <ac/time.h>
38 
39 #include "ldap-int.h"
40 #include "ldap_utf8.h"
41 
42 #include "ldap_defaults.h"
43 
44 /*
45  * T.61 is somewhat braindead; even in the 7-bit space it is not
46  * completely equivalent to 7-bit US-ASCII. Our definition of the
47  * character set comes from RFC 1345 with a slightly more readable
48  * rendition at http://std.dkuug.dk/i18n/charmaps/T.61-8BIT.
49  *
50  * Even though '#' and '$' are present in the 7-bit US-ASCII space,
51  * (x23 and x24, resp.) in T.61 they are mapped to 8-bit characters
52  * xA6 and xA4.
53  *
54  * Also T.61 lacks
55  *	backslash 	\	(x5C)
56  *	caret		^	(x5E)
57  *	backquote	`	(x60)
58  *	left brace	{	(x7B)
59  *	right brace	}	(x7D)
60  *	tilde		~	(x7E)
61  *
62  * In T.61, the codes xC1 to xCF (excluding xC9, unused) are non-spacing
63  * accents of some form or another. There are predefined combinations
64  * for certain characters, but they can also be used arbitrarily. The
65  * table at dkuug.dk maps these accents to the E000 "private use" range
66  * of the Unicode space, but I believe they more properly belong in the
67  * 0300 range (non-spacing accents). The transformation is complicated
68  * slightly because Unicode wants the non-spacing character to follow
69  * the base character, while T.61 has the non-spacing character leading.
70  * Also, T.61 specifically recognizes certain combined pairs as "characters"
71  * but doesn't specify how to treat unrecognized pairs. This code will
72  * always attempt to combine pairs when a known Unicode composite exists.
73  */
74 
/* Forward table: indexed by a T.61 byte, yields the Unicode code point.
 * A zero entry marks a byte that ldap_t61s_valid() and
 * ldap_t61s_to_utf8s() reject as invalid T.61 (this includes byte 0x00).
 * Bytes 0xc1-0xcf hold the non-spacing accents, mapped to U+03xx.
 */
75 static const wchar_t t61_tab[] = {
76 	0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007,
77 	0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f,
78 	0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017,
79 	0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f,
80 	0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027,
81 	0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f,
82 	0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037,
83 	0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f,
84 	0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047,
85 	0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f,
86 	0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057,
87 	0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f,
88 	0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067,
89 	0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f,
90 	0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077,
91 	0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f,
92 	0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087,
93 	0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f,
94 	0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097,
95 	0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f,
96 	0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7,
97 	0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000,
98 	0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7,
99 	0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf,
100 	0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307,
101 	0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c,
102 	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
103 	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
104 	0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f,
105 	0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149,
106 	0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140,
107 	0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000
108 };
109 
/* Fixed-size rows of wide characters used by the lookup tables below:
 * 16 entries for the accent substitutions, 32 for the composite rows
 * (indexed by base & 0x1f) and 64 for the reverse-mapping rows
 * (indexed by code point & 0x3f).
 */
110 typedef wchar_t wvec16[16];
111 typedef wchar_t wvec32[32];
112 typedef wchar_t wvec64[64];
113 
114 /* Substitutions when 0xc1-0xcf appears by itself or with space 0x20 */
/* Indexed by the low nibble of the accent byte (byte & 0x0f); yields the
 * spacing form of the accent, or 0 where no substitution is defined.
 */
115 static const wvec16 accents = {
116 	0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
117 	0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7};
118 
119 /* In the following tables, base characters commented in (parentheses)
120  * are not defined by T.61 but are mapped anyway since their Unicode
121  * composite exists.
122  */
123 
124 /* Grave accented chars AEIOU (NWY) */
/* Each row is indexed by (base & 0x1f) and holds the precomposed Unicode
 * code point for accent 0xc1 + base, or 0 when none exists.
 * c1_vec1 serves base bytes 0x40-0x5f, c1_vec2 serves 0x60-0x7f.
 */
125 static const wvec32 c1_vec1 = {
126 	/* Upper case */
127 	0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2,
128 	0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0};
129 static const wvec32 c1_vec2 = {
130 	/* Lower case */
131 	0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2,
132 	0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0};
133 
/* Row selector for grave composites, indexed by (base >> 5): slot 2 is
 * bases 0x40-0x5f, slot 3 is bases 0x60-0x7f; NULL means no composites.
 */
134 static const wvec32 *c1_grave[] = {
135 	NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL
136 };
137 
138 /* Acute accented chars AEIOUYCLNRSZ (GKMPW) */
/* Rows indexed by (base & 0x1f); value is the precomposed code point for
 * accent 0xc2 + base, or 0 when none exists. c2_vec3 serves base bytes
 * 0xe0-0xff (T.61 0xe1 = AE, 0xf1 = ae).
 */
139 static const wvec32 c2_vec1 = {
140 	/* Upper case */
141 	0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4,
142 	0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3,
143 	0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82,
144 	0, 0xdd, 0x179, 0, 0, 0, 0, 0};
145 static const wvec32 c2_vec2 = {
146 	/* Lower case */
147 	0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5,
148 	0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3,
149 	0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83,
150 	0, 0xfd, 0x17a, 0, 0, 0, 0, 0};
151 static const wvec32 c2_vec3 = {
152 	/* (AE and ae) */
153 	0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
154 	0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
155 
/* Row selector for acute composites, indexed by (base >> 5); slot 7
 * covers base bytes 0xe0-0xff.
 */
156 static const wvec32 *c2_acute[] = {
157 	NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3
158 };
159 
160 /* Circumflex AEIOUYCGHJSW (Z) */
/* Rows indexed by (base & 0x1f); value is the precomposed code point for
 * accent 0xc3 + base, or 0 when none exists.
 */
161 static const wvec32 c3_vec1 = {
162 	/* Upper case */
163 	0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c,
164 	0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4,
165 	0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174,
166 	0, 0x176, 0x1e90, 0, 0, 0, 0, 0};
167 static const wvec32 c3_vec2 = {
168 	/* Lower case */
169 	0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d,
170 	0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4,
171 	0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175,
172 	0, 0x177, 0x1e91, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5); NULL means no composites. */
173 static const wvec32 *c3_circumflex[] = {
174 	NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL
175 };
176 
177 /* Tilde AIOUN (EVY) */
/* Rows indexed by (base & 0x1f); value is the precomposed code point for
 * accent 0xc4 + base, or 0 when none exists.
 */
178 static const wvec32 c4_vec1 = {
179 	/* Upper case */
180 	0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
181 	0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0};
182 static const wvec32 c4_vec2 = {
183 	/* Lower case */
184 	0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
185 	0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5); NULL means no composites. */
186 static const wvec32 *c4_tilde[] = {
187 	NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL
188 };
189 
190 /* Macron AEIOU (YG) */
/* Rows indexed by (base & 0x1f); value is the precomposed code point for
 * accent 0xc5 + base, or 0 when none exists. c5_vec3 serves base bytes
 * 0xe0-0xff.
 */
191 static const wvec32 c5_vec1 = {
192 	/* Upper case */
193 	0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c,
194 	0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0};
195 static const wvec32 c5_vec2 = {
196 	/* Lower case */
197 	0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d,
198 	0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0};
199 static const wvec32 c5_vec3 = {
200 	/* (AE and ae) */
201 	0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202 	0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5); NULL means no composites. */
203 static const wvec32 *c5_macron[] = {
204 	NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3
205 };
206 
207 /* Breve AUG (EIO) */
/* Rows indexed by (base & 0x1f); value is the precomposed code point for
 * accent 0xc6 + base, or 0 when none exists.
 */
208 static const wvec32 c6_vec1 = {
209 	/* Upper case */
210 	0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e,
211 	0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
212 static const wvec32 c6_vec2 = {
213 	/* Lower case */
214 	0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f,
215 	0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5); NULL means no composites. */
216 static const wvec32 *c6_breve[] = {
217 	NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL
218 };
219 
220 /* Dot Above CEGIZ (AOBDFHMNPRSTWXY) */
/* Rows indexed by (base & 0x1f); value is the precomposed code point for
 * accent 0xc7 + base, or 0 when none exists.
 */
221 static const wvec32 c7_vec1 = {
222 	/* Upper case */
223 	0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120,
224 	0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e,
225 	0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86,
226 	0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0};
227 static const wvec32 c7_vec2 = {
228 	/* Lower case */
229 	0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121,
230 	0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f,
231 	0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87,
232 	0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5); NULL means no composites. */
233 static const wvec32 *c7_dotabove[] = {
234 	NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL
235 };
236 
237 /* Diaeresis AEIOUY (HWXt) */
/* Rows indexed by (base & 0x1f); value is the precomposed code point for
 * accent 0xc8 + base, or 0 when none exists.
 */
238 static const wvec32 c8_vec1 = {
239 	/* Upper case */
240 	0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6,
241 	0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0};
242 static const wvec32 c8_vec2 = {
243 	/* Lower case */
244 	0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6,
245 	0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5); NULL means no composites. */
246 static const wvec32 *c8_diaeresis[] = {
247 	NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL
248 };
249 
250 /* Ring Above AU (wy) */
/* Rows indexed by (base & 0x1f); value is the precomposed code point for
 * accent 0xca + base, or 0 when none exists.
 */
251 static const wvec32 ca_vec1 = {
252 	/* Upper case */
253 	0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
254 	0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
255 static const wvec32 ca_vec2 = {
256 	/* Lower case */
257 	0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
258 	0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5); NULL means no composites. */
259 static const wvec32 *ca_ringabove[] = {
260 	NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL
261 };
262 
263 /* Cedilla CGKLNRST (EDH) */
/* Rows indexed by (base & 0x1f); value is the precomposed code point for
 * accent 0xcb + base, or 0 when none exists.
 */
264 static const wvec32 cb_vec1 = {
265 	/* Upper case */
266 	0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122,
267 	0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0,
268 	0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
269 static const wvec32 cb_vec2 = {
270 	/* Lower case */
271 	0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123,
272 	0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0,
273 	0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5); NULL means no composites. */
274 static const wvec32 *cb_cedilla[] = {
275 	NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL
276 };
277 
278 /* Double Acute Accent OU */
/* Rows indexed by (base & 0x1f); value is the precomposed code point for
 * accent 0xcd + base, or 0 when none exists.
 */
279 static const wvec32 cd_vec1 = {
280 	/* Upper case */
281 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150,
282 	0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
283 static const wvec32 cd_vec2 = {
284 	/* Lower case */
285 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151,
286 	0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5); NULL means no composites. */
287 static const wvec32 *cd_doubleacute[] = {
288 	NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL
289 };
290 
291 /* Ogonek AEIU (O) */
/* Rows indexed by (base & 0x1f); value is the precomposed code point for
 * accent 0xce + base, or 0 when none exists.
 */
292 static const wvec32 ce_vec1 = {
293 	/* Upper case */
294 	0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea,
295 	0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
296 static const wvec32 ce_vec2 = {
297 	/* Lower case */
298 	0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb,
299 	0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5); NULL means no composites. */
300 static const wvec32 *ce_ogonek[] = {
301 	NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL
302 };
303 
304 /* Caron CDELNRSTZ (AIOUGKjH) */
/* Rows indexed by (base & 0x1f); value is the precomposed code point for
 * accent 0xcf + base, or 0 when none exists.
 */
305 static const wvec32 cf_vec1 = {
306 	/* Upper case */
307 	0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6,
308 	0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1,
309 	0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0,
310 	0, 0, 0x17d, 0, 0, 0, 0, 0};
311 static const wvec32 cf_vec2 = {
312 	/* Lower case */
313 	0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7,
314 	0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2,
315 	0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0,
316 	0, 0, 0x17e, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5); NULL means no composites. */
317 static const wvec32 *cf_caron[] = {
318 	NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL
319 };
320 
/* Top-level composite lookup, indexed by the low nibble of the accent
 * byte (0xc0-0xcf). A NULL slot means the accent has no composite
 * tables. Full lookup: (*cx_tab[accent & 0x0f][base >> 5])[base & 0x1f].
 */
321 static const wvec32 **cx_tab[] = {
322 	NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron,
323 	c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove,
324 	cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron };
325 
326 int ldap_t61s_valid( struct berval *str )
327 {
328 	unsigned char *c = (unsigned char *)str->bv_val;
329 	int i;
330 
331 	for (i=0; i < str->bv_len; c++,i++)
332 		if (!t61_tab[*c])
333 			return 0;
334 	return 1;
335 }
336 
337 /* Transform a T.61 string to UTF-8.
338  */
339 int ldap_t61s_to_utf8s( struct berval *src, struct berval *dst )
340 {
341 	unsigned char *c;
342 	char *d;
343 	int i, wlen = 0;
344 
345 	/* Just count the length of the UTF-8 result first */
346 	for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
347 		/* Invalid T.61 characters? */
348 		if (!t61_tab[*c])
349 			return LDAP_INVALID_SYNTAX;
350 		if ((*c & 0xf0) == 0xc0) {
351 			int j = *c & 0x0f;
352 			/* If this is the end of the string, or if the base
353 			 * character is just a space, treat this as a regular
354 			 * spacing character.
355 			 */
356 			if ((!c[1] || c[1] == 0x20) && accents[j]) {
357 				wlen += ldap_x_wc_to_utf8(NULL, accents[j], 0);
358 			} else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
359 			/* We have a composite mapping for this pair */
360 				(*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
361 				wlen += ldap_x_wc_to_utf8( NULL,
362 					(*cx_tab[j][c[1]>>5])[c[1]&0x1f], 0);
363 			} else {
364 			/* No mapping, just swap it around so the base
365 			 * character comes first.
366 			 */
367 			 	wlen += ldap_x_wc_to_utf8(NULL, c[1], 0);
368 				wlen += ldap_x_wc_to_utf8(NULL,
369 					t61_tab[*c], 0);
370 			}
371 			c++; i++;
372 			continue;
373 		} else {
374 			wlen += ldap_x_wc_to_utf8(NULL, t61_tab[*c], 0);
375 		}
376 	}
377 
378 	/* Now transform the string */
379 	dst->bv_len = wlen;
380 	dst->bv_val = LDAP_MALLOC( wlen+1 );
381 	d = dst->bv_val;
382 	if (!d)
383 		return LDAP_NO_MEMORY;
384 
385 	for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
386 		if ((*c & 0xf0) == 0xc0) {
387 			int j = *c & 0x0f;
388 			/* If this is the end of the string, or if the base
389 			 * character is just a space, treat this as a regular
390 			 * spacing character.
391 			 */
392 			if ((!c[1] || c[1] == 0x20) && accents[j]) {
393 				d += ldap_x_wc_to_utf8(d, accents[j], 6);
394 			} else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
395 			/* We have a composite mapping for this pair */
396 				(*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
397 				d += ldap_x_wc_to_utf8(d,
398 				(*cx_tab[j][c[1]>>5])[c[1]&0x1f], 6);
399 			} else {
400 			/* No mapping, just swap it around so the base
401 			 * character comes first.
402 			 */
403 				d += ldap_x_wc_to_utf8(d, c[1], 6);
404 				d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
405 			}
406 			c++; i++;
407 			continue;
408 		} else {
409 			d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
410 		}
411 	}
412 	*d = '\0';
413 	return LDAP_SUCCESS;
414 }
415 
416 /* For the reverse mapping, we just pay attention to the Latin-oriented
417  * code blocks. These are
418  *	0000 - 007f Basic Latin
419  *	0080 - 00ff Latin-1 Supplement
420  *	0100 - 017f Latin Extended-A
421  *	0180 - 024f Latin Extended-B
422  *	1e00 - 1eff Latin Extended Additional
423  *
424  * We have a special case to map Ohm U2126 back to T.61 0xe0. All other
425  * unrecognized characters are replaced with '?' 0x3f.
426  */
427 
/* Reverse rows: each wvec64 covers 64 consecutive code points and is
 * indexed by (code point & 0x3f). The low byte of an entry is the T.61
 * code; a non-zero high byte is a T.61 accent byte written first.
 * This row covers U+0000-U+003F ('#' and '$' move to 0xa6 and 0xa4).
 */
428 static const wvec64 u000 = {
429 	0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
430 	0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
431 	0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
432 	0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
433 	0x0020, 0x0021, 0x0022, 0x00a6, 0x00a4, 0x0025, 0x0026, 0x0027,
434 	0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
435 	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
436 	0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f};
437 
438 /* In this range, we've mapped caret to xc3/x20, backquote to xc1/x20,
439  * and tilde to xc4/x20. T.61 (stupidly!) doesn't define these characters
440  * on their own, even though it provides them as combiners for other
441  * letters. T.61 doesn't define these pairings either, so this may just
442  * have to be replaced with '?' 0x3f if other software can't cope with it.
443  */
/* Covers U+0040-U+007F; 0x003f ('?') marks characters T.61 lacks. */
444 static const wvec64 u001 = {
445 	0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
446 	0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
447 	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
448 	0x0058, 0x0059, 0x005a, 0x005b, 0x003f, 0x005d, 0xc320, 0x005f,
449 	0xc120, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
450 	0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
451 	0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
452 	0x0078, 0x0079, 0x007a, 0x003f, 0x007c, 0x003f, 0xc420, 0x007f};
453 
/* Covers U+0080-U+00BF. Spacing accents such as U+00A8 and U+00B4 are
 * written as a T.61 accent byte followed by a space (0x20).
 */
454 static const wvec64 u002 = {
455 	0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
456 	0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
457 	0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
458 	0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
459 	0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a8, 0x00a5, 0x003f, 0x00a7,
460 	0xc820, 0x003f, 0x00e3, 0x00ab, 0x003f, 0x003f, 0x003f, 0xc520,
461 	0x00b0, 0x00b1, 0x00b2, 0x00b3, 0xc220, 0x00b5, 0x00b6, 0x00b7,
462 	0xcb20, 0x003f, 0x00eb, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf};
463 
/* Covers U+00C0-U+00FF. Accented letters become accent byte + base
 * letter (e.g. 0xc141 = grave + 'A').
 */
464 static const wvec64 u003 = {
465 	0xc141, 0xc241, 0xc341, 0xc441, 0xc841, 0xca41, 0x00e1, 0xcb43,
466 	0xc145, 0xc245, 0xc345, 0xc845, 0xc149, 0xc249, 0xc349, 0xc849,
467 	0x00e2, 0xc44e, 0xc14f, 0xc24f, 0xc34f, 0xc44f, 0xc84f, 0x00b4,
468 	0x00e9, 0xc155, 0xc255, 0xc355, 0xc855, 0xc259, 0x00ec, 0x00fb,
469 	0xc161, 0xc261, 0xc361, 0xc461, 0xc861, 0xca61, 0x00f1, 0xcb63,
470 	0xc165, 0xc265, 0xc365, 0xc865, 0xc169, 0xc269, 0xc369, 0xc869,
471 	0x00f3, 0xc46e, 0xc16f, 0xc26f, 0xc36f, 0xc46f, 0xc86f, 0x00b8,
472 	0x00f9, 0xc175, 0xc275, 0xc375, 0xc875, 0xc279, 0x00fc, 0xc879};
473 
474 /* These codes are used here but not defined by T.61:
475  * x114 = xc6/x45, x115 = xc6/x65, x12c = xc6/x49, x12d = xc6/x69
476  */
/* Covers U+0100-U+013F (wc00 slot 4). */
477 static const wvec64 u010 = {
478 	0xc541, 0xc561, 0xc641, 0xc661, 0xce41, 0xce61, 0xc243, 0xc263,
479 	0xc343, 0xc363, 0xc743, 0xc763, 0xcf43, 0xcf63, 0xcf44, 0xcf64,
480 	0x003f, 0x00f2, 0xc545, 0xc565, 0xc645, 0xc665, 0xc745, 0xc765,
481 	0xce45, 0xce65, 0xcf45, 0xcf65, 0xc347, 0xc367, 0xc647, 0xc667,
482 	0xc747, 0xc767, 0xcb47, 0xcb67, 0xc348, 0xc368, 0x00e4, 0x00f4,
483 	0xc449, 0xc469, 0xc549, 0xc569, 0xc649, 0xc669, 0xce49, 0xce69,
484 	0xc749, 0x00f5, 0x00e6, 0x00f6, 0xc34a, 0xc36a, 0xcb4b, 0xcb6b,
485 	0x00f0, 0xc24c, 0xc26c, 0xcb4c, 0xcb6c, 0xcf4c, 0xcf6c, 0x00e7};
486 
487 /* These codes are used here but not defined by T.61:
488  * x14e = xc6/x4f, x14f = xc6/x6f
489  */
/* Covers U+0140-U+017F (wc00 slot 5). */
490 static const wvec64 u011 = {
491 	0x00f7, 0x00e8, 0x00f8, 0xc24e, 0xc26e, 0xcb4e, 0xcb6e, 0xcf4e,
492 	0xcf6e, 0x00ef, 0x00ee, 0x00fe, 0xc54f, 0xc56f, 0xc64f, 0xc66f,
493 	0xcd4f, 0xcd6f, 0x00ea, 0x00fa, 0xc252, 0xc272, 0xcb52, 0xcb72,
494 	0xcf52, 0xcf72, 0xc253, 0xc273, 0xc353, 0xc373, 0xcb53, 0xcb73,
495 	0xcf53, 0xcf73, 0xcb54, 0xcb74, 0xcf54, 0xcf74, 0x00ed, 0x00fd,
496 	0xc455, 0xc475, 0xc555, 0xc575, 0xc655, 0xc675, 0xca55, 0xca75,
497 	0xcd55, 0xcd75, 0xce55, 0xce75, 0xc357, 0xc377, 0xc359, 0xc379,
498 	0xc859, 0xc25a, 0xc27a, 0xc75a, 0xc77a, 0xcf5a, 0xcf7a, 0x003f};
499 
500 /* All of the codes in this block are undefined in T.61.
501  */
/* Covers U+01C0-U+01FF (wc00 slot 7). */
502 static const wvec64 u013 = {
503 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
504 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf41, 0xcf61, 0xcf49,
505 	0xcf69, 0xcf4f, 0xcf6f, 0xcf55, 0xcf75, 0x003f, 0x003f, 0x003f,
506 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
507 	0x003f, 0x003f, 0xc5e1, 0xc5f1, 0x003f, 0x003f, 0xcf47, 0xcf67,
508 	0xcf4b, 0xcf6b, 0xce4f, 0xce6f, 0x003f, 0x003f, 0x003f, 0x003f,
509 	0xcf6a, 0x003f, 0x003f, 0x003f, 0xc247, 0xc267, 0x003f, 0x003f,
510 	0xc14e, 0xc16e, 0x003f, 0x003f, 0xc2e1, 0xc2f1, 0x003f, 0x003f};
511 
512 /* All of the codes in this block are undefined in T.61.
513  */
/* Covers U+0200-U+023F (wc00 slot 8). */
514 static const wvec64 u020 = {
515 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
516 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
517 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
518 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf48, 0xcf68,
519 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc741, 0xc761,
520 	0xcb45, 0xcb65, 0x003f, 0x003f, 0x003f, 0x003f, 0xc74f, 0xc76f,
521 	0x003f, 0x003f, 0xc559, 0xc579, 0x003f, 0x003f, 0x003f, 0x003f,
522 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
523 
/* Covers U+02C0-U+02FF (wc00 slot 11): spacing accents written as a
 * T.61 accent byte followed by a space (0x20).
 */
524 static const wvec64 u023 = {
525 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf20,
526 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
527 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
528 	0xc620, 0xc720, 0xca20, 0xce20, 0x003f, 0xcd20, 0x003f, 0x003f,
529 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
530 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
531 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
532 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
533 
534 /* These are the non-spacing characters by themselves. They should
535  * never appear by themselves in actual text.
536  */
/* Covers U+0300-U+033F (wc00 slot 12). Each mapped entry is the single
 * T.61 accent byte; ldap_utf8s_to_t61s() moves it in front of the base
 * character it has already written.
 */
537 static const wvec64 u030 = {
538 	0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x003f, 0x00c6, 0x00c7,
539 	0x00c8, 0x003f, 0x00ca, 0x00cd, 0x00cf, 0x003f, 0x003f, 0x003f,
540 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
541 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
542 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x00cb,
543 	0x00ce, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
544 	0x003f, 0x003f, 0x00cc, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
545 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
546 
547 /* None of the following blocks are defined in T.61.
548  */
/* Covers U+1E00-U+1E3F (wc1e slot 0). */
549 static const wvec64 u1e0 = {
550 	0x003f, 0x003f, 0xc742, 0xc762, 0x003f, 0x003f, 0x003f, 0x003f,
551 	0x003f, 0x003f, 0xc744, 0xc764, 0x003f, 0x003f, 0x003f, 0x003f,
552 	0xcb44, 0xcb64, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
553 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc746, 0xc766,
554 	0xc547, 0xc567, 0xc748, 0xc768, 0x003f, 0x003f, 0xc848, 0xc868,
555 	0xcb48, 0xcb68, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
556 	0xc24b, 0xc26b, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
557 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc24d, 0xc26d,
558 };
559 
/* Covers U+1E40-U+1E7F (wc1e slot 1). */
560 static const wvec64 u1e1 = {
561 	0xc74d, 0xc76d, 0x003f, 0x003f, 0xc74e, 0xc76e, 0x003f, 0x003f,
562 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
563 	0x003f, 0x003f, 0x003f, 0x003f, 0xc250, 0xc270, 0xc750, 0xc770,
564 	0xc752, 0xc772, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
565 	0xc753, 0xc773, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
566 	0x003f, 0x003f, 0xc754, 0xc774, 0x003f, 0x003f, 0x003f, 0x003f,
567 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
568 	0x003f, 0x003f, 0x003f, 0x003f, 0xc456, 0xc476, 0x003f, 0x003f,
569 };
570 
/* Covers U+1E80-U+1EBF (wc1e slot 2). */
571 static const wvec64 u1e2 = {
572 	0xc157, 0xc177, 0xc257, 0xc277, 0xc857, 0xc877, 0xc757, 0xc777,
573 	0x003f, 0x003f, 0xc758, 0xc778, 0xc858, 0xc878, 0xc759, 0xc779,
574 	0xc35a, 0xc37a, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc874,
575 	0xca77, 0xca79, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
576 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
577 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
578 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
579 	0x003f, 0x003f, 0x003f, 0x003f, 0xc445, 0xc465, 0x003f, 0x003f,
580 };
581 
/* Covers U+1EC0-U+1EFF (wc1e slot 3). */
582 static const wvec64 u1e3 = {
583 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
584 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
585 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
586 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
587 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
588 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
589 	0x003f, 0x003f, 0xc159, 0xc179, 0x003f, 0x003f, 0x003f, 0x003f,
590 	0xc459, 0xc479, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
591 };
592 
/* Row selector for code points U+0000-U+03FF, indexed by
 * (code point >> 6). A NULL slot means no character in that 64-code
 * range is mapped; the converter writes '?' (0x3f) for it.
 */
593 static const wvec64 *wc00[] = {
594 	&u000, &u001, &u002, &u003,
595 	&u010, &u011, NULL, &u013,
596 	&u020, NULL, NULL, &u023,
597 	&u030, NULL, NULL, NULL};
598 
/* Row selector for code points U+1E00-U+1EFF, indexed by
 * ((code point >> 6) & 3); every slot is populated.
 */
599 static const wvec64 *wc1e[] = {
600 	&u1e0, &u1e1, &u1e2, &u1e3};
601 
602 
603 int ldap_utf8s_to_t61s( struct berval *src, struct berval *dst )
604 {
605 	char *c, *d;
606 	wchar_t tmp;
607 	int i, j, tlen = 0;
608 
609 	/* Just count the length of the T.61 result first */
610 	for (i=0,c=src->bv_val; i < src->bv_len;) {
611 		j = ldap_x_utf8_to_wc( &tmp, c );
612 		if (j == -1)
613 			return LDAP_INVALID_SYNTAX;
614 		switch (tmp >> 8) {
615 		case 0x00:
616 		case 0x01:
617 		case 0x02:
618 		case 0x03:
619 			if (wc00[tmp >> 6] &&
620 				((*wc00[tmp >> 6])[tmp & 0x3f] & 0xff00)) {
621 				tlen++;
622 			}
623 			tlen++;
624 			break;
625 		case 0x1e:
626 			if ((*wc1e[(tmp >> 6) & 3])[tmp & 0x3f] & 0xff00) {
627 				tlen++;
628 			}
629 		case 0x21:
630 		default:
631 			tlen ++;
632 			break;
633 		}
634 		i += j;
635 		c += j;
636 	}
637 	dst->bv_len = tlen;
638 	dst->bv_val = LDAP_MALLOC( tlen+1 );
639 	if (!dst->bv_val)
640 		return LDAP_NO_MEMORY;
641 
642 	d = dst->bv_val;
643 	for (i=0,c=src->bv_val; i < src->bv_len;) {
644 		j = ldap_x_utf8_to_wc( &tmp, c );
645 		switch (tmp >> 8) {
646 		case 0x00:
647 		case 0x01:
648 		case 0x02:
649 			if (wc00[tmp >> 6]) {
650 				tmp = (*wc00[tmp >> 6])[tmp & 0x3f];
651 				if (tmp & 0xff00)
652 					*d++ = (tmp >> 8);
653 				*d++ = tmp & 0xff;
654 			} else {
655 				*d++ = 0x3f;
656 			}
657 			break;
658 		case 0x03:
659 			/* swap order of non-spacing characters */
660 			if (wc00[tmp >> 6]) {
661 				wchar_t t2 = (*wc00[tmp >> 6])[tmp & 0x3f];
662 				if (t2 != 0x3f) {
663 					d[0] = d[-1];
664 					d[-1] = t2;
665 					d++;
666 				} else {
667 					*d++ = 0x3f;
668 				}
669 			} else {
670 				*d++ = 0x3f;
671 			}
672 			break;
673 		case 0x1e:
674 			tmp = (*wc1e[(tmp >> 6) & 3])[tmp & 0x3f];
675 			if (tmp & 0xff00)
676 				*d++ = (tmp >> 8);
677 			*d++ = tmp & 0xff;
678 			break;
679 		case 0x21:
680 			if (tmp == 0x2126) {
681 				*d++ = 0xe0;
682 				break;
683 			}
684 			/* FALLTHRU */
685 		default:
686 			*d++ = 0x3f;
687 			break;
688 		}
689 		i += j;
690 		c += j;
691 	}
692 	*d = '\0';
693 	return LDAP_SUCCESS;
694 }
695