1 /* $NetBSD: t61.c,v 1.3 2021/08/14 16:14:56 christos Exp $ */
2
3 /* $OpenLDAP$ */
4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5 *
6 * Copyright 2002-2021 The OpenLDAP Foundation.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted only as authorized by the OpenLDAP
11 * Public License.
12 *
13 * A copy of this license is available in the file LICENSE in the
14 * top-level directory of the distribution or, alternatively, at
15 * <http://www.OpenLDAP.org/license.html>.
16 */
17 /* ACKNOWLEDGEMENTS:
18 * This work was initially developed by Howard Chu for inclusion in
19 * OpenLDAP Software.
20 */
21
22 /*
23 * Basic T.61 <-> UTF-8 conversion
24 *
25 * These routines will perform a lossless translation from T.61 to UTF-8
26 * and a lossy translation from UTF-8 to T.61.
27 */
28
29 #include <sys/cdefs.h>
30 __RCSID("$NetBSD: t61.c,v 1.3 2021/08/14 16:14:56 christos Exp $");
31
32 #include "portable.h"
33
34 #include <stdio.h>
35
36 #include <ac/stdlib.h>
37
38 #include <ac/socket.h>
39 #include <ac/string.h>
40 #include <ac/time.h>
41
42 #include "ldap-int.h"
43 #include "ldap_utf8.h"
44
45 #include "ldap_defaults.h"
46
47 /*
48 * T.61 is somewhat braindead; even in the 7-bit space it is not
49 * completely equivalent to 7-bit US-ASCII. Our definition of the
50 * character set comes from RFC 1345 with a slightly more readable
51 * rendition at http://std.dkuug.dk/i18n/charmaps/T.61-8BIT.
52 *
53 * Even though '#' and '$' are present in the 7-bit US-ASCII space,
54 * (x23 and x24, resp.) in T.61 they are mapped to 8-bit characters
55 * xA6 and xA4.
56 *
57 * Also T.61 lacks
58 * backslash \ (x5C)
59 * caret ^ (x5E)
60 * backquote ` (x60)
61 * left brace { (x7B)
62 * right brace } (x7D)
63 * tilde ~ (x7E)
64 *
65 * In T.61, the codes xC1 to xCF (excluding xC9, unused) are non-spacing
66 * accents of some form or another. There are predefined combinations
67 * for certain characters, but they can also be used arbitrarily. The
68 * table at dkuug.dk maps these accents to the E000 "private use" range
69 * of the Unicode space, but I believe they more properly belong in the
70 * 0300 range (non-spacing accents). The transformation is complicated
71 * slightly because Unicode wants the non-spacing character to follow
72 * the base character, while T.61 has the non-spacing character leading.
73 * Also, T.61 specifically recognizes certain combined pairs as "characters"
74 * but doesn't specify how to treat unrecognized pairs. This code will
75 * always attempt to combine pairs when a known Unicode composite exists.
76 */
77
/* Forward map, indexed by T.61 byte value (0x00-0xff); each entry is the
 * Unicode code point for that byte. A zero entry marks a byte that the
 * routines in this file reject as invalid T.61 (note that this includes
 * byte 0x00 itself). The entries for the non-spacing accent bytes
 * (0xc1-0xcf) hold code points in the 0x300 combining range.
 */
78 static const wchar_t t61_tab[] = {
79 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007,
80 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f,
81 0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017,
82 0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f,
83 0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027,
84 0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f,
85 0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037,
86 0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f,
87 0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047,
88 0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f,
89 0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057,
90 0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f,
91 0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067,
92 0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f,
93 0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077,
94 0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f,
95 0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087,
96 0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f,
97 0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097,
98 0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f,
99 0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7,
100 0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000,
101 0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7,
102 0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf,
103 0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307,
104 0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c,
105 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
106 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
107 0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f,
108 0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149,
109 0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140,
110 0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000
111 };
112
/* Fixed-size rows of wide characters used by the lookup tables in this file. */
113 typedef wchar_t wvec16[16];
114 typedef wchar_t wvec32[32];
115 typedef wchar_t wvec64[64];
116
/* Indexed by the low nibble of the accent byte (byte & 0x0f). Each entry is
 * the Unicode spacing form of that accent; a zero entry means no spacing
 * substitute is available for that accent byte.
 */
117 /* Substitutions when 0xc1-0xcf appears by itself or with space 0x20 */
118 static const wvec16 accents = {
119 0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
120 0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7};
121
122 /* In the following tables, base characters commented in (parentheses)
123 * are not defined by T.61 but are mapped anyway since their Unicode
124 * composite exists.
125 */
126
/* Composites for accent byte 0xc1. Each wvec32 row is indexed by
 * (base byte & 0x1f) and yields the precomposed Unicode code point, or 0
 * when none exists. The pointer table is indexed by (base byte >> 5), so
 * slot 2 covers bases 0x40-0x5f and slot 3 covers 0x60-0x7f; a NULL slot
 * means no composites in that range.
 */
127 /* Grave accented chars AEIOU (NWY) */
128 static const wvec32 c1_vec1 = {
129 /* Upper case */
130 0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2,
131 0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0};
132 static const wvec32 c1_vec2 = {
133 /* Lower case */
134 0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2,
135 0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0};
136
137 static const wvec32 *c1_grave[] = {
138 NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL
139 };
140
/* Composites for accent byte 0xc2. Rows are indexed by (base byte & 0x1f),
 * the pointer table by (base byte >> 5); 0 / NULL means no composite.
 * Slot 7 (bases 0xe0-0xff) carries the entries for T.61 bytes 0xe1 and 0xf1.
 */
141 /* Acute accented chars AEIOUYCLNRSZ (GKMPW) */
142 static const wvec32 c2_vec1 = {
143 /* Upper case */
144 0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4,
145 0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3,
146 0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82,
147 0, 0xdd, 0x179, 0, 0, 0, 0, 0};
148 static const wvec32 c2_vec2 = {
149 /* Lower case */
150 0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5,
151 0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3,
152 0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83,
153 0, 0xfd, 0x17a, 0, 0, 0, 0, 0};
154 static const wvec32 c2_vec3 = {
155 /* (AE and ae) */
156 0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
157 0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
158
159 static const wvec32 *c2_acute[] = {
160 NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3
161 };
162
/* Composites for accent byte 0xc3. Rows are indexed by (base byte & 0x1f),
 * the pointer table by (base byte >> 5); 0 / NULL means no composite.
 */
163 /* Circumflex AEIOUYCGHJSW (Z) */
164 static const wvec32 c3_vec1 = {
165 /* Upper case */
166 0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c,
167 0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4,
168 0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174,
169 0, 0x176, 0x1e90, 0, 0, 0, 0, 0};
170 static const wvec32 c3_vec2 = {
171 /* Lower case */
172 0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d,
173 0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4,
174 0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175,
175 0, 0x177, 0x1e91, 0, 0, 0, 0, 0};
176 static const wvec32 *c3_circumflex[] = {
177 NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL
178 };
179
/* Composites for accent byte 0xc4. Rows are indexed by (base byte & 0x1f),
 * the pointer table by (base byte >> 5); 0 / NULL means no composite.
 */
180 /* Tilde AIOUN (EVY) */
181 static const wvec32 c4_vec1 = {
182 /* Upper case */
183 0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
184 0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0};
185 static const wvec32 c4_vec2 = {
186 /* Lower case */
187 0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
188 0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0};
189 static const wvec32 *c4_tilde[] = {
190 NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL
191 };
192
/* Composites for accent byte 0xc5. Rows are indexed by (base byte & 0x1f),
 * the pointer table by (base byte >> 5); 0 / NULL means no composite.
 * Slot 7 (bases 0xe0-0xff) carries the entries for T.61 bytes 0xe1 and 0xf1.
 */
193 /* Macron AEIOU (YG) */
194 static const wvec32 c5_vec1 = {
195 /* Upper case */
196 0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c,
197 0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0};
198 static const wvec32 c5_vec2 = {
199 /* Lower case */
200 0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d,
201 0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0};
202 static const wvec32 c5_vec3 = {
203 /* (AE and ae) */
204 0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
205 0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
206 static const wvec32 *c5_macron[] = {
207 NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3
208 };
209
/* Composites for accent byte 0xc6. Rows are indexed by (base byte & 0x1f),
 * the pointer table by (base byte >> 5); 0 / NULL means no composite.
 */
210 /* Breve AUG (EIO) */
211 static const wvec32 c6_vec1 = {
212 /* Upper case */
213 0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e,
214 0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
215 static const wvec32 c6_vec2 = {
216 /* Lower case */
217 0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f,
218 0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
219 static const wvec32 *c6_breve[] = {
220 NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL
221 };
222
/* Composites for accent byte 0xc7. Rows are indexed by (base byte & 0x1f),
 * the pointer table by (base byte >> 5); 0 / NULL means no composite.
 */
223 /* Dot Above CEGIZ (AOBDFHMNPRSTWXY) */
224 static const wvec32 c7_vec1 = {
225 /* Upper case */
226 0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120,
227 0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e,
228 0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86,
229 0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0};
230 static const wvec32 c7_vec2 = {
231 /* Lower case */
232 0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121,
233 0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f,
234 0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87,
235 0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0};
236 static const wvec32 *c7_dotabove[] = {
237 NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL
238 };
239
/* Composites for accent byte 0xc8. Rows are indexed by (base byte & 0x1f),
 * the pointer table by (base byte >> 5); 0 / NULL means no composite.
 */
240 /* Diaeresis AEIOUY (HWXt) */
241 static const wvec32 c8_vec1 = {
242 /* Upper case */
243 0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6,
244 0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0};
245 static const wvec32 c8_vec2 = {
246 /* Lower case */
247 0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6,
248 0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0};
249 static const wvec32 *c8_diaeresis[] = {
250 NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL
251 };
252
/* Composites for accent byte 0xca. Rows are indexed by (base byte & 0x1f),
 * the pointer table by (base byte >> 5); 0 / NULL means no composite.
 */
253 /* Ring Above AU (wy) */
254 static const wvec32 ca_vec1 = {
255 /* Upper case */
256 0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
257 0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
258 static const wvec32 ca_vec2 = {
259 /* Lower case */
260 0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
261 0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0};
262 static const wvec32 *ca_ringabove[] = {
263 NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL
264 };
265
/* Composites for accent byte 0xcb. Rows are indexed by (base byte & 0x1f),
 * the pointer table by (base byte >> 5); 0 / NULL means no composite.
 */
266 /* Cedilla CGKLNRST (EDH) */
267 static const wvec32 cb_vec1 = {
268 /* Upper case */
269 0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122,
270 0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0,
271 0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
272 static const wvec32 cb_vec2 = {
273 /* Lower case */
274 0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123,
275 0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0,
276 0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
277 static const wvec32 *cb_cedilla[] = {
278 NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL
279 };
280
/* Composites for accent byte 0xcd. Rows are indexed by (base byte & 0x1f),
 * the pointer table by (base byte >> 5); 0 / NULL means no composite.
 */
281 /* Double Acute Accent OU */
282 static const wvec32 cd_vec1 = {
283 /* Upper case */
284 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150,
285 0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
286 static const wvec32 cd_vec2 = {
287 /* Lower case */
288 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151,
289 0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
290 static const wvec32 *cd_doubleacute[] = {
291 NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL
292 };
293
/* Composites for accent byte 0xce. Rows are indexed by (base byte & 0x1f),
 * the pointer table by (base byte >> 5); 0 / NULL means no composite.
 */
294 /* Ogonek AEIU (O) */
295 static const wvec32 ce_vec1 = {
296 /* Upper case */
297 0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea,
298 0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
299 static const wvec32 ce_vec2 = {
300 /* Lower case */
301 0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb,
302 0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
303 static const wvec32 *ce_ogonek[] = {
304 NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL
305 };
306
/* Composites for accent byte 0xcf. Rows are indexed by (base byte & 0x1f),
 * the pointer table by (base byte >> 5); 0 / NULL means no composite.
 */
307 /* Caron CDELNRSTZ (AIOUGKjH) */
308 static const wvec32 cf_vec1 = {
309 /* Upper case */
310 0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6,
311 0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1,
312 0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0,
313 0, 0, 0x17d, 0, 0, 0, 0, 0};
314 static const wvec32 cf_vec2 = {
315 /* Lower case */
316 0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7,
317 0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2,
318 0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0,
319 0, 0, 0x17e, 0, 0, 0, 0, 0};
320 static const wvec32 *cf_caron[] = {
321 NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL
322 };
323
/* Top-level composite dispatch, indexed by the low nibble of the accent
 * byte (0xc0-0xcf). Each non-NULL entry is that accent's pointer table;
 * the NULL entries (nibbles 0x0, 0x9 and 0xc) have no composite tables.
 */
324 static const wvec32 **cx_tab[] = {
325 NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron,
326 c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove,
327 cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron };
328
ldap_t61s_valid(struct berval * str)329 int ldap_t61s_valid( struct berval *str )
330 {
331 unsigned char *c = (unsigned char *)str->bv_val;
332 int i;
333
334 for (i=0; i < str->bv_len; c++,i++)
335 if (!t61_tab[*c])
336 return 0;
337 return 1;
338 }
339
340 /* Transform a T.61 string to UTF-8.
341 */
ldap_t61s_to_utf8s(struct berval * src,struct berval * dst)342 int ldap_t61s_to_utf8s( struct berval *src, struct berval *dst )
343 {
344 unsigned char *c;
345 char *d;
346 int i, wlen = 0;
347
348 /* Just count the length of the UTF-8 result first */
349 for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
350 /* Invalid T.61 characters? */
351 if (!t61_tab[*c])
352 return LDAP_INVALID_SYNTAX;
353 if ((*c & 0xf0) == 0xc0) {
354 int j = *c & 0x0f;
355 /* If this is the end of the string, or if the base
356 * character is just a space, treat this as a regular
357 * spacing character.
358 */
359 if ((!c[1] || c[1] == 0x20) && accents[j]) {
360 wlen += ldap_x_wc_to_utf8(NULL, accents[j], 0);
361 } else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
362 /* We have a composite mapping for this pair */
363 (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
364 wlen += ldap_x_wc_to_utf8( NULL,
365 (*cx_tab[j][c[1]>>5])[c[1]&0x1f], 0);
366 } else {
367 /* No mapping, just swap it around so the base
368 * character comes first.
369 */
370 wlen += ldap_x_wc_to_utf8(NULL, c[1], 0);
371 wlen += ldap_x_wc_to_utf8(NULL,
372 t61_tab[*c], 0);
373 }
374 c++; i++;
375 continue;
376 } else {
377 wlen += ldap_x_wc_to_utf8(NULL, t61_tab[*c], 0);
378 }
379 }
380
381 /* Now transform the string */
382 dst->bv_len = wlen;
383 dst->bv_val = LDAP_MALLOC( wlen+1 );
384 d = dst->bv_val;
385 if (!d)
386 return LDAP_NO_MEMORY;
387
388 for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
389 if ((*c & 0xf0) == 0xc0) {
390 int j = *c & 0x0f;
391 /* If this is the end of the string, or if the base
392 * character is just a space, treat this as a regular
393 * spacing character.
394 */
395 if ((!c[1] || c[1] == 0x20) && accents[j]) {
396 d += ldap_x_wc_to_utf8(d, accents[j], 6);
397 } else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
398 /* We have a composite mapping for this pair */
399 (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
400 d += ldap_x_wc_to_utf8(d,
401 (*cx_tab[j][c[1]>>5])[c[1]&0x1f], 6);
402 } else {
403 /* No mapping, just swap it around so the base
404 * character comes first.
405 */
406 d += ldap_x_wc_to_utf8(d, c[1], 6);
407 d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
408 }
409 c++; i++;
410 continue;
411 } else {
412 d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
413 }
414 }
415 *d = '\0';
416 return LDAP_SUCCESS;
417 }
418
419 /* For the reverse mapping, we just pay attention to the Latin-oriented
420 * code blocks. These are
421 * 0000 - 007f Basic Latin
422 * 0080 - 00ff Latin-1 Supplement
423 * 0100 - 017f Latin Extended-A
424 * 0180 - 024f Latin Extended-B
425 * 1e00 - 1eff Latin Extended Additional
426 *
427 * We have a special case to map Ohm U2126 back to T.61 0xe0. All other
428 * unrecognized characters are replaced with '?' 0x3f.
429 */
430
/* Reverse map for U+0000-U+003F, indexed by (code point & 0x3f). The low
 * byte of each entry is the T.61 byte to emit; a non-zero high byte, where
 * present, is an accent byte emitted first. '#' (0x23) and '$' (0x24) map
 * to the 8-bit T.61 codes 0xa6 and 0xa4.
 */
431 static const wvec64 u000 = {
432 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
433 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
434 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
435 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
436 0x0020, 0x0021, 0x0022, 0x00a6, 0x00a4, 0x0025, 0x0026, 0x0027,
437 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
438 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
439 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f};
440
/* Reverse map for U+0040-U+007F, indexed by (code point & 0x3f). An entry
 * with a non-zero high byte is a two-byte T.61 sequence (accent byte, then
 * the low byte); 0x003f ('?') marks a character with no T.61 equivalent.
 */
441 /* In this range, we've mapped caret to xc3/x20, backquote to xc1/x20,
442 * and tilde to xc4/x20. T.61 (stupidly!) doesn't define these characters
443 * on their own, even though it provides them as combiners for other
444 * letters. T.61 doesn't define these pairings either, so this may just
445 * have to be replaced with '?' 0x3f if other software can't cope with it.
446 */
447 static const wvec64 u001 = {
448 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
449 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
450 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
451 0x0058, 0x0059, 0x005a, 0x005b, 0x003f, 0x005d, 0xc320, 0x005f,
452 0xc120, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
453 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
454 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
455 0x0078, 0x0079, 0x007a, 0x003f, 0x007c, 0x003f, 0xc420, 0x007f};
456
/* Reverse map for U+0080-U+00BF, indexed by (code point & 0x3f). Entries
 * of the form 0xNN20 encode a spacing accent as accent byte 0xNN followed
 * by a space; 0x003f ('?') marks a character with no T.61 equivalent.
 */
457 static const wvec64 u002 = {
458 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
459 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
460 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
461 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
462 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a8, 0x00a5, 0x003f, 0x00a7,
463 0xc820, 0x003f, 0x00e3, 0x00ab, 0x003f, 0x003f, 0x003f, 0xc520,
464 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0xc220, 0x00b5, 0x00b6, 0x00b7,
465 0xcb20, 0x003f, 0x00eb, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf};
466
/* Reverse map for U+00C0-U+00FF, indexed by (code point & 0x3f). Most
 * entries are two-byte sequences: high byte = T.61 accent byte, low byte =
 * ASCII base letter. Single-byte entries are direct T.61 codes.
 */
467 static const wvec64 u003 = {
468 0xc141, 0xc241, 0xc341, 0xc441, 0xc841, 0xca41, 0x00e1, 0xcb43,
469 0xc145, 0xc245, 0xc345, 0xc845, 0xc149, 0xc249, 0xc349, 0xc849,
470 0x00e2, 0xc44e, 0xc14f, 0xc24f, 0xc34f, 0xc44f, 0xc84f, 0x00b4,
471 0x00e9, 0xc155, 0xc255, 0xc355, 0xc855, 0xc259, 0x00ec, 0x00fb,
472 0xc161, 0xc261, 0xc361, 0xc461, 0xc861, 0xca61, 0x00f1, 0xcb63,
473 0xc165, 0xc265, 0xc365, 0xc865, 0xc169, 0xc269, 0xc369, 0xc869,
474 0x00f3, 0xc46e, 0xc16f, 0xc26f, 0xc36f, 0xc46f, 0xc86f, 0x00b8,
475 0x00f9, 0xc175, 0xc275, 0xc375, 0xc875, 0xc279, 0x00fc, 0xc879};
476
/* Reverse map for U+0100-U+013F, indexed by (code point & 0x3f). High byte
 * (when non-zero) is the T.61 accent byte, low byte the base character;
 * 0x003f ('?') marks a character with no T.61 equivalent.
 */
477 /* These codes are used here but not defined by T.61:
478 * x114 = xc6/x45, x115 = xc6/x65, x12c = xc6/x49, x12d = xc6/x69
479 */
480 static const wvec64 u010 = {
481 0xc541, 0xc561, 0xc641, 0xc661, 0xce41, 0xce61, 0xc243, 0xc263,
482 0xc343, 0xc363, 0xc743, 0xc763, 0xcf43, 0xcf63, 0xcf44, 0xcf64,
483 0x003f, 0x00f2, 0xc545, 0xc565, 0xc645, 0xc665, 0xc745, 0xc765,
484 0xce45, 0xce65, 0xcf45, 0xcf65, 0xc347, 0xc367, 0xc647, 0xc667,
485 0xc747, 0xc767, 0xcb47, 0xcb67, 0xc348, 0xc368, 0x00e4, 0x00f4,
486 0xc449, 0xc469, 0xc549, 0xc569, 0xc649, 0xc669, 0xce49, 0xce69,
487 0xc749, 0x00f5, 0x00e6, 0x00f6, 0xc34a, 0xc36a, 0xcb4b, 0xcb6b,
488 0x00f0, 0xc24c, 0xc26c, 0xcb4c, 0xcb6c, 0xcf4c, 0xcf6c, 0x00e7};
489
/* Reverse map for U+0140-U+017F, indexed by (code point & 0x3f). High byte
 * (when non-zero) is the T.61 accent byte, low byte the base character;
 * 0x003f ('?') marks a character with no T.61 equivalent.
 */
490 /* These codes are used here but not defined by T.61:
491 * x14e = xc6/x4f, x14f = xc6/x6f
492 */
493 static const wvec64 u011 = {
494 0x00f7, 0x00e8, 0x00f8, 0xc24e, 0xc26e, 0xcb4e, 0xcb6e, 0xcf4e,
495 0xcf6e, 0x00ef, 0x00ee, 0x00fe, 0xc54f, 0xc56f, 0xc64f, 0xc66f,
496 0xcd4f, 0xcd6f, 0x00ea, 0x00fa, 0xc252, 0xc272, 0xcb52, 0xcb72,
497 0xcf52, 0xcf72, 0xc253, 0xc273, 0xc353, 0xc373, 0xcb53, 0xcb73,
498 0xcf53, 0xcf73, 0xcb54, 0xcb74, 0xcf54, 0xcf74, 0x00ed, 0x00fd,
499 0xc455, 0xc475, 0xc555, 0xc575, 0xc655, 0xc675, 0xca55, 0xca75,
500 0xcd55, 0xcd75, 0xce55, 0xce75, 0xc357, 0xc377, 0xc359, 0xc379,
501 0xc859, 0xc25a, 0xc27a, 0xc75a, 0xc77a, 0xcf5a, 0xcf7a, 0x003f};
502
/* Reverse map for U+01C0-U+01FF, indexed by (code point & 0x3f). High byte
 * (when non-zero) is the T.61 accent byte, low byte the base character;
 * 0x003f ('?') marks a character with no mapping.
 */
503 /* All of the codes in this block are undefined in T.61.
504 */
505 static const wvec64 u013 = {
506 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
507 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf41, 0xcf61, 0xcf49,
508 0xcf69, 0xcf4f, 0xcf6f, 0xcf55, 0xcf75, 0x003f, 0x003f, 0x003f,
509 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
510 0x003f, 0x003f, 0xc5e1, 0xc5f1, 0x003f, 0x003f, 0xcf47, 0xcf67,
511 0xcf4b, 0xcf6b, 0xce4f, 0xce6f, 0x003f, 0x003f, 0x003f, 0x003f,
512 0xcf6a, 0x003f, 0x003f, 0x003f, 0xc247, 0xc267, 0x003f, 0x003f,
513 0xc14e, 0xc16e, 0x003f, 0x003f, 0xc2e1, 0xc2f1, 0x003f, 0x003f};
514
/* Reverse map for U+0200-U+023F, indexed by (code point & 0x3f). High byte
 * (when non-zero) is the T.61 accent byte, low byte the base character;
 * 0x003f ('?') marks a character with no mapping.
 */
515 /* All of the codes in this block are undefined in T.61.
516 */
517 static const wvec64 u020 = {
518 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
519 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
520 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
521 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf48, 0xcf68,
522 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc741, 0xc761,
523 0xcb45, 0xcb65, 0x003f, 0x003f, 0x003f, 0x003f, 0xc74f, 0xc76f,
524 0x003f, 0x003f, 0xc559, 0xc579, 0x003f, 0x003f, 0x003f, 0x003f,
525 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
526
/* Reverse map for U+02C0-U+02FF, indexed by (code point & 0x3f). The
 * mapped entries have the form 0xNN20: T.61 accent byte 0xNN followed by a
 * space. 0x003f ('?') marks a character with no mapping.
 */
527 static const wvec64 u023 = {
528 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf20,
529 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
530 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
531 0xc620, 0xc720, 0xca20, 0xce20, 0x003f, 0xcd20, 0x003f, 0x003f,
532 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
533 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
534 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
535 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
536
/* Reverse map for U+0300-U+033F, indexed by (code point & 0x3f). Each
 * mapped entry is the single T.61 accent byte for that combining mark;
 * 0x003f ('?') marks a mark with no T.61 accent. The UTF-8 to T.61
 * converter places the accent byte in front of the previously written byte.
 */
537 /* These are the non-spacing characters by themselves. They should
538 * never appear by themselves in actual text.
539 */
540 static const wvec64 u030 = {
541 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x003f, 0x00c6, 0x00c7,
542 0x00c8, 0x003f, 0x00ca, 0x00cd, 0x00cf, 0x003f, 0x003f, 0x003f,
543 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
544 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
545 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x00cb,
546 0x00ce, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
547 0x003f, 0x003f, 0x00cc, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
548 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
549
/* Reverse map for U+1E00-U+1E3F, indexed by (code point & 0x3f). High byte
 * (when non-zero) is the T.61 accent byte, low byte the base character;
 * 0x003f ('?') marks a character with no mapping.
 */
550 /* None of the following blocks are defined in T.61.
551 */
552 static const wvec64 u1e0 = {
553 0x003f, 0x003f, 0xc742, 0xc762, 0x003f, 0x003f, 0x003f, 0x003f,
554 0x003f, 0x003f, 0xc744, 0xc764, 0x003f, 0x003f, 0x003f, 0x003f,
555 0xcb44, 0xcb64, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
556 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc746, 0xc766,
557 0xc547, 0xc567, 0xc748, 0xc768, 0x003f, 0x003f, 0xc848, 0xc868,
558 0xcb48, 0xcb68, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
559 0xc24b, 0xc26b, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
560 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc24d, 0xc26d,
561 };
562
/* Reverse map for U+1E40-U+1E7F, indexed by (code point & 0x3f). High byte
 * (when non-zero) is the T.61 accent byte, low byte the base character;
 * 0x003f ('?') marks a character with no mapping.
 */
563 static const wvec64 u1e1 = {
564 0xc74d, 0xc76d, 0x003f, 0x003f, 0xc74e, 0xc76e, 0x003f, 0x003f,
565 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
566 0x003f, 0x003f, 0x003f, 0x003f, 0xc250, 0xc270, 0xc750, 0xc770,
567 0xc752, 0xc772, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
568 0xc753, 0xc773, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
569 0x003f, 0x003f, 0xc754, 0xc774, 0x003f, 0x003f, 0x003f, 0x003f,
570 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
571 0x003f, 0x003f, 0x003f, 0x003f, 0xc456, 0xc476, 0x003f, 0x003f,
572 };
573
/* Reverse map for U+1E80-U+1EBF, indexed by (code point & 0x3f). High byte
 * (when non-zero) is the T.61 accent byte, low byte the base character;
 * 0x003f ('?') marks a character with no mapping.
 */
574 static const wvec64 u1e2 = {
575 0xc157, 0xc177, 0xc257, 0xc277, 0xc857, 0xc877, 0xc757, 0xc777,
576 0x003f, 0x003f, 0xc758, 0xc778, 0xc858, 0xc878, 0xc759, 0xc779,
577 0xc35a, 0xc37a, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc874,
578 0xca77, 0xca79, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
579 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
580 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
581 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
582 0x003f, 0x003f, 0x003f, 0x003f, 0xc445, 0xc465, 0x003f, 0x003f,
583 };
584
/* Reverse map for U+1EC0-U+1EFF, indexed by (code point & 0x3f). High byte
 * (when non-zero) is the T.61 accent byte, low byte the base character;
 * 0x003f ('?') marks a character with no mapping.
 */
585 static const wvec64 u1e3 = {
586 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
587 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
588 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
589 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
590 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
591 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
592 0x003f, 0x003f, 0xc159, 0xc179, 0x003f, 0x003f, 0x003f, 0x003f,
593 0xc459, 0xc479, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
594 };
595
/* Block index for code points U+0000-U+03FF, indexed by (code point >> 6).
 * Each non-NULL entry covers 64 consecutive code points; a NULL entry
 * means the whole 64-code-point block has no T.61 mapping.
 */
596 static const wvec64 *wc00[] = {
597 &u000, &u001, &u002, &u003,
598 &u010, &u011, NULL, &u013,
599 &u020, NULL, NULL, &u023,
600 &u030, NULL, NULL, NULL};
601
/* Block index for code points U+1E00-U+1EFF, indexed by
 * ((code point >> 6) & 3). All four blocks are present.
 */
602 static const wvec64 *wc1e[] = {
603 &u1e0, &u1e1, &u1e2, &u1e3};
604
605
ldap_utf8s_to_t61s(struct berval * src,struct berval * dst)606 int ldap_utf8s_to_t61s( struct berval *src, struct berval *dst )
607 {
608 char *c, *d;
609 wchar_t tmp;
610 int i, j, tlen = 0;
611
612 /* Just count the length of the T.61 result first */
613 for (i=0,c=src->bv_val; i < src->bv_len;) {
614 j = ldap_x_utf8_to_wc( &tmp, c );
615 if (j == -1)
616 return LDAP_INVALID_SYNTAX;
617 switch (tmp >> 8) {
618 case 0x00:
619 case 0x01:
620 case 0x02:
621 case 0x03:
622 if (wc00[tmp >> 6] &&
623 ((*wc00[tmp >> 6])[tmp & 0x3f] & 0xff00)) {
624 tlen++;
625 }
626 tlen++;
627 break;
628 case 0x1e:
629 if ((*wc1e[(tmp >> 6) & 3])[tmp & 0x3f] & 0xff00) {
630 tlen++;
631 }
632 case 0x21:
633 default:
634 tlen ++;
635 break;
636 }
637 i += j;
638 c += j;
639 }
640 dst->bv_len = tlen;
641 dst->bv_val = LDAP_MALLOC( tlen+1 );
642 if (!dst->bv_val)
643 return LDAP_NO_MEMORY;
644
645 d = dst->bv_val;
646 for (i=0,c=src->bv_val; i < src->bv_len;) {
647 j = ldap_x_utf8_to_wc( &tmp, c );
648 switch (tmp >> 8) {
649 case 0x00:
650 case 0x01:
651 case 0x02:
652 if (wc00[tmp >> 6]) {
653 tmp = (*wc00[tmp >> 6])[tmp & 0x3f];
654 if (tmp & 0xff00)
655 *d++ = (tmp >> 8);
656 *d++ = tmp & 0xff;
657 } else {
658 *d++ = 0x3f;
659 }
660 break;
661 case 0x03:
662 /* swap order of non-spacing characters */
663 if (wc00[tmp >> 6]) {
664 wchar_t t2 = (*wc00[tmp >> 6])[tmp & 0x3f];
665 if (t2 != 0x3f) {
666 d[0] = d[-1];
667 d[-1] = t2;
668 d++;
669 } else {
670 *d++ = 0x3f;
671 }
672 } else {
673 *d++ = 0x3f;
674 }
675 break;
676 case 0x1e:
677 tmp = (*wc1e[(tmp >> 6) & 3])[tmp & 0x3f];
678 if (tmp & 0xff00)
679 *d++ = (tmp >> 8);
680 *d++ = tmp & 0xff;
681 break;
682 case 0x21:
683 if (tmp == 0x2126) {
684 *d++ = 0xe0;
685 break;
686 }
687 /* FALLTHRU */
688 default:
689 *d++ = 0x3f;
690 break;
691 }
692 i += j;
693 c += j;
694 }
695 *d = '\0';
696 return LDAP_SUCCESS;
697 }
698