xref: /onnv-gate/usr/src/lib/libsmbfs/smb/charsets.c (revision 6007:d57e38e8fdd1)
1*6007Sthurlow /*
2*6007Sthurlow  * Copyright (c) 2001 Apple Computer, Inc. All rights reserved.
3*6007Sthurlow  *
4*6007Sthurlow  * @APPLE_LICENSE_HEADER_START@
5*6007Sthurlow  *
6*6007Sthurlow  * "Portions Copyright (c) 1999 Apple Computer, Inc.  All Rights
7*6007Sthurlow  * Reserved.  This file contains Original Code and/or Modifications of
8*6007Sthurlow  * Original Code as defined in and that are subject to the Apple Public
9*6007Sthurlow  * Source License Version 1.0 (the 'License').  You may not use this file
10*6007Sthurlow  * except in compliance with the License.  Please obtain a copy of the
11*6007Sthurlow  * License at http://www.apple.com/publicsource and read it before using
12*6007Sthurlow  * this file.
13*6007Sthurlow  *
14*6007Sthurlow  * The Original Code and all software distributed under the License are
15*6007Sthurlow  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
16*6007Sthurlow  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
17*6007Sthurlow  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
18*6007Sthurlow  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
19*6007Sthurlow  * License for the specific language governing rights and limitations
20*6007Sthurlow  * under the License."
21*6007Sthurlow  *
22*6007Sthurlow  * @APPLE_LICENSE_HEADER_END@
23*6007Sthurlow  */
24*6007Sthurlow /*      @(#)charsets.c      *
25*6007Sthurlow  *      (c) 2004   Apple Computer, Inc.  All Rights Reserved
26*6007Sthurlow  *
27*6007Sthurlow  *
28*6007Sthurlow  *      charsets.c -- Routines converting between UTF-8, 16-bit
29*6007Sthurlow  *			little-endian Unicode, and various Windows
30*6007Sthurlow  *			code pages.
31*6007Sthurlow  *
32*6007Sthurlow  *      MODIFICATION HISTORY:
33*6007Sthurlow  *       28-Nov-2004     Guy Harris	New today
34*6007Sthurlow  */
35*6007Sthurlow 
36*6007Sthurlow #pragma ident	"%Z%%M%	%I%	%E% SMI"
37*6007Sthurlow 
38*6007Sthurlow #include <stdlib.h>
39*6007Sthurlow #include <stdio.h>
40*6007Sthurlow #include <string.h>
41*6007Sthurlow #include <ctype.h>
42*6007Sthurlow #include <iconv.h>
43*6007Sthurlow #include <langinfo.h>
44*6007Sthurlow #include <strings.h>
45*6007Sthurlow 
46*6007Sthurlow #ifdef NOTPORTED
47*6007Sthurlow #include <CoreFoundation/CoreFoundation.h>
48*6007Sthurlow #include <CoreFoundation/CFStringDefaultEncoding.h>
49*6007Sthurlow #include <CoreFoundation/CFStringEncodingConverter.h>
50*6007Sthurlow #include <sys/mchain.h>
51*6007Sthurlow #endif /* NOTPORTED */
52*6007Sthurlow 
53*6007Sthurlow #include <netsmb/smb_lib.h>
54*6007Sthurlow #include <netsmb/mchain.h>
55*6007Sthurlow 
56*6007Sthurlow #include "charsets.h"
57*6007Sthurlow 
58*6007Sthurlow #ifdef NOTPORTED
59*6007Sthurlow extern 	 uid_t real_uid,eff_uid;
60*6007Sthurlow #endif /* NOTPORTED */
61*6007Sthurlow 
62*6007Sthurlow /*
63*6007Sthurlow  * On Solaris, we will need to do some rewriting to use our iconv
64*6007Sthurlow  * routines for the conversions.  For now, we're effectively
65*6007Sthurlow  * stubbing out code, leaving the details of what happens on
66*6007Sthurlow  * Darwin in case it's useful as a guide later.
67*6007Sthurlow  */
68*6007Sthurlow 
/*
 * Return the value (0-15) of the hexadecimal digit u, or 16 if u is
 * not a hexadecimal digit.
 *
 * The digit is recognized by comparing against explicit ASCII ranges
 * rather than with the <ctype.h> classifiers: handing a negative plain
 * char to isdigit() and friends is undefined behavior, and what they
 * accept as a letter depends on the current locale.
 */
static unsigned
xtoi(char u)
{
	if (u >= '0' && u <= '9')
		return ((unsigned)(u - '0'));
	if (u >= 'a' && u <= 'f')
		return (10 + (unsigned)(u - 'a'));
	if (u >= 'A' && u <= 'F')
		return (10 + (unsigned)(u - 'A'));
	return (16);
}
80*6007Sthurlow 
81*6007Sthurlow 
82*6007Sthurlow /* Removes the "%" escape sequences from a URL component.
83*6007Sthurlow  * See IETF RFC 2396.
84*6007Sthurlow  */
85*6007Sthurlow char *
86*6007Sthurlow unpercent(char * component)
87*6007Sthurlow {
88*6007Sthurlow         char c, *s;
89*6007Sthurlow         unsigned hi, lo;
90*6007Sthurlow 
91*6007Sthurlow         if (component)
92*6007Sthurlow                 for (s = component; (c = *s) != 0; s++) {
93*6007Sthurlow                         if (c != '%')
94*6007Sthurlow                                 continue;
95*6007Sthurlow                         if ((hi = xtoi(s[1])) > 15 || (lo = xtoi(s[2])) > 15)
96*6007Sthurlow                                 continue; /* ignore invalid escapes */
97*6007Sthurlow                         s[0] = hi*16 + lo;
98*6007Sthurlow                         /*
99*6007Sthurlow                          * This was strcpy(s + 1, s + 3);
100*6007Sthurlow                          * But nowadays leftward overlapping copies are
101*6007Sthurlow                          * officially undefined in C.  Ours seems to
102*6007Sthurlow                          * work or not depending upon alignment.
103*6007Sthurlow                          */
104*6007Sthurlow                         memmove(s+1, s+3, strlen(s+3) + 1);
105*6007Sthurlow                 }
106*6007Sthurlow         return (component);
107*6007Sthurlow }
108*6007Sthurlow 
109*6007Sthurlow #ifdef NOTPORTED
110*6007Sthurlow static CFStringEncoding
111*6007Sthurlow get_windows_encoding_equivalent( void )
112*6007Sthurlow {
113*6007Sthurlow 
114*6007Sthurlow 	CFStringEncoding encoding;
115*6007Sthurlow 	uint32_t index,region;
116*6007Sthurlow 
117*6007Sthurlow 	/* important! use root ID so you can read the config file! */
118*6007Sthurlow 	seteuid(eff_uid);
119*6007Sthurlow 	__CFStringGetInstallationEncodingAndRegion(&index,&region);
120*6007Sthurlow 	seteuid(real_uid);
121*6007Sthurlow 
122*6007Sthurlow 	switch ( index )
123*6007Sthurlow 	{
124*6007Sthurlow 		case	kCFStringEncodingMacRoman:
125*6007Sthurlow 			if (region) /* anything nonzero is not US */
126*6007Sthurlow 				encoding = kCFStringEncodingDOSLatin1;
127*6007Sthurlow 			else /* US region */
128*6007Sthurlow 				encoding = kCFStringEncodingDOSLatinUS;
129*6007Sthurlow 			break;
130*6007Sthurlow 
131*6007Sthurlow 		case	kCFStringEncodingMacJapanese:
132*6007Sthurlow 			encoding = kCFStringEncodingDOSJapanese;
133*6007Sthurlow 			break;
134*6007Sthurlow 
135*6007Sthurlow 		case	kCFStringEncodingMacChineseTrad:
136*6007Sthurlow 			encoding = kCFStringEncodingDOSChineseTrad;
137*6007Sthurlow 			break;
138*6007Sthurlow 
139*6007Sthurlow 		case	kCFStringEncodingMacKorean:
140*6007Sthurlow 			encoding = kCFStringEncodingDOSKorean;
141*6007Sthurlow 			break;
142*6007Sthurlow 
143*6007Sthurlow 		case	kCFStringEncodingMacArabic:
144*6007Sthurlow 			encoding = kCFStringEncodingDOSArabic;
145*6007Sthurlow 			break;
146*6007Sthurlow 
147*6007Sthurlow 		case	kCFStringEncodingMacHebrew:
148*6007Sthurlow 			encoding = kCFStringEncodingDOSHebrew;
149*6007Sthurlow 			break;
150*6007Sthurlow 
151*6007Sthurlow 		case	kCFStringEncodingMacGreek:
152*6007Sthurlow 			encoding = kCFStringEncodingDOSGreek;
153*6007Sthurlow 			break;
154*6007Sthurlow 
155*6007Sthurlow 		case	kCFStringEncodingMacCyrillic:
156*6007Sthurlow 			encoding = kCFStringEncodingDOSCyrillic;
157*6007Sthurlow 			break;
158*6007Sthurlow 
159*6007Sthurlow 		case	kCFStringEncodingMacThai:
160*6007Sthurlow 			encoding = kCFStringEncodingDOSThai;
161*6007Sthurlow 			break;
162*6007Sthurlow 
163*6007Sthurlow 		case	kCFStringEncodingMacChineseSimp:
164*6007Sthurlow 			encoding = kCFStringEncodingDOSChineseSimplif;
165*6007Sthurlow 			break;
166*6007Sthurlow 
167*6007Sthurlow 		case	kCFStringEncodingMacCentralEurRoman:
168*6007Sthurlow 			encoding = kCFStringEncodingDOSLatin2;
169*6007Sthurlow 			break;
170*6007Sthurlow 
171*6007Sthurlow 		case	kCFStringEncodingMacTurkish:
172*6007Sthurlow 			encoding = kCFStringEncodingDOSTurkish;
173*6007Sthurlow 			break;
174*6007Sthurlow 
175*6007Sthurlow 		case	kCFStringEncodingMacCroatian:
176*6007Sthurlow 			encoding = kCFStringEncodingDOSLatin2;
177*6007Sthurlow 			break;
178*6007Sthurlow 
179*6007Sthurlow 		case	kCFStringEncodingMacIcelandic:
180*6007Sthurlow 			encoding = kCFStringEncodingDOSIcelandic;
181*6007Sthurlow 			break;
182*6007Sthurlow 
183*6007Sthurlow 		case	kCFStringEncodingMacRomanian:
184*6007Sthurlow 			encoding = kCFStringEncodingDOSLatin2;
185*6007Sthurlow 			break;
186*6007Sthurlow 
187*6007Sthurlow 		case	kCFStringEncodingMacFarsi:
188*6007Sthurlow 			encoding = kCFStringEncodingDOSArabic;
189*6007Sthurlow 			break;
190*6007Sthurlow 
191*6007Sthurlow 		case	kCFStringEncodingMacUkrainian:
192*6007Sthurlow 			encoding = kCFStringEncodingDOSCyrillic;
193*6007Sthurlow 			break;
194*6007Sthurlow 
195*6007Sthurlow 		default:
196*6007Sthurlow 			encoding = kCFStringEncodingDOSLatin1;
197*6007Sthurlow 			break;
198*6007Sthurlow 	}
199*6007Sthurlow 
200*6007Sthurlow 	return encoding;
201*6007Sthurlow }
202*6007Sthurlow #endif /* NOTPORTED */
203*6007Sthurlow 
/*
 * Convert a string in the Windows code page to UTF-8.
 *
 * With NOTPORTED defined (the CoreFoundation implementation), the
 * string is first interpreted in the encoding chosen by
 * get_windows_encoding_equivalent(), falling back to MacRoman if that
 * fails.  The result is a malloc'd, NUL-terminated buffer, or NULL if
 * the conversion or the allocation failed.
 *
 * Without NOTPORTED the conversion is stubbed out: the argument itself
 * is returned (with const cast away) and nothing is allocated.
 *
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 */
char *
convert_wincs_to_utf8(const char *windows_string)
{
#ifdef NOTPORTED
	CFStringRef s;
	CFIndex maxlen;
	char *result;

	s = CFStringCreateWithCString(NULL, windows_string,
		get_windows_encoding_equivalent());
	if (s == NULL) {
		smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" ", -1,
		    windows_string);

		/* kCFStringEncodingMacRoman should always succeed */
		s = CFStringCreateWithCString(NULL, windows_string,
		    kCFStringEncodingMacRoman);
		if (s == NULL) {
			smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" with kCFStringEncodingMacRoman - skipping",
			    -1, windows_string);
			return NULL;
		}
	}

	/* +1 leaves room for the terminating NUL */
	maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s),
	    kCFStringEncodingUTF8) + 1;
	result = malloc(maxlen);
	if (result == NULL) {
		smb_error("Couldn't allocate buffer for UTF-8 string for \"%s\" - skipping", -1,
		    windows_string);
		CFRelease(s);
		return NULL;
	}
	if (!CFStringGetCString(s, result, maxlen, kCFStringEncodingUTF8)) {
		smb_error("CFStringGetCString for UTF-8 failed on \"%s\" - skipping",
		    -1, windows_string);
		/* don't leak the output buffer on failure */
		free(result);
		CFRelease(s);
		return NULL;
	}
	CFRelease(s);
	return result;
#else /* NOTPORTED */
	return ((char*)windows_string);
#endif /* NOTPORTED */
}
253*6007Sthurlow 
/*
 * Convert a UTF-8 string to the Windows code page.
 *
 * With NOTPORTED defined (the CoreFoundation implementation), the
 * target encoding is the one chosen by
 * get_windows_encoding_equivalent().  The result is a malloc'd,
 * NUL-terminated buffer, or NULL if the conversion or the allocation
 * failed.
 *
 * Without NOTPORTED the conversion is stubbed out: the argument itself
 * is returned (with const cast away) and nothing is allocated.
 *
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 */
char *
convert_utf8_to_wincs(const char *utf8_string)
{
#ifdef NOTPORTED
	CFStringRef s;
	CFStringEncoding wincs;
	CFIndex maxlen;
	char *result;

	s = CFStringCreateWithCString(NULL, utf8_string,
	    kCFStringEncodingUTF8);
	if (s == NULL) {
		smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1,
		    utf8_string);
		return NULL;
	}

	/*
	 * Look the encoding up once; each lookup switches the
	 * effective uid back and forth.
	 */
	wincs = get_windows_encoding_equivalent();

	/* +1 leaves room for the terminating NUL */
	maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s),
	    wincs) + 1;
	result = malloc(maxlen);
	if (result == NULL) {
		smb_error("Couldn't allocate buffer for Windows code page string for \"%s\" - skipping", -1,
		    utf8_string);
		CFRelease(s);
		return NULL;
	}
	if (!CFStringGetCString(s, result, maxlen, wincs)) {
		smb_error("CFStringGetCString for Windows code page failed on \"%s\" - skipping",
		    -1, utf8_string);
		/* don't leak the output buffer on failure */
		free(result);
		CFRelease(s);
		return NULL;
	}
	CFRelease(s);
	return result;
#else /* NOTPORTED */
	return ((char*)utf8_string);
#endif /* NOTPORTED */
}
296*6007Sthurlow 
/*
 * Convert a zero-terminated little-endian Unicode string to UTF-8.
 *
 * As a side effect, every 16-bit unit of unicode_string before the
 * terminator is rewritten in place through letohs().  The units are
 * then handed to convert_unicode_to_utf8() together with their length
 * in bytes (terminator not counted), and that routine's result is
 * returned.
 */
char *
convert_leunicode_to_utf8(unsigned short *unicode_string)
{
	int nunits = 0;

	while (unicode_string[nunits] != 0) {
		unicode_string[nunits] = letohs(unicode_string[nunits]);
		nunits++;
	}

	/* two bytes per 16-bit unit */
	return (convert_unicode_to_utf8(unicode_string, nunits * 2));
}
315*6007Sthurlow 
/*
 * Convert len bytes of 16-bit Unicode at unicode_string to UTF-8
 * using iconv ("UTF-16" to "UTF-8").
 *
 * Returns a malloc'd, NUL-terminated UTF-8 string that the caller
 * should free(), or NULL if unicode_string is NULL, len is negative,
 * the converter cannot be opened, memory cannot be allocated, or
 * iconv() reports an error.
 *
 * The result is heap-allocated and sized from len; a fixed-size
 * buffer on this function's stack could be overrun by long input and
 * would no longer exist once the function returned.
 *
 * NOTE(review): the input carries no explicit byte order; how "UTF-16"
 * input without a byte-order mark is interpreted is up to the iconv
 * implementation - confirm it matches the host order callers pass in.
 */
char *
convert_unicode_to_utf8(unsigned short *unicode_string, int len)
{
	iconv_t cd;
	char *inp, *outp, *result;
	size_t ileft, oleft, outsize;

	if (unicode_string == NULL || len < 0)
		return (NULL);

	cd = iconv_open("UTF-8", "UTF-16");
	if (cd == (iconv_t)-1)
		return (NULL);

	/*
	 * A 16-bit unit needs at most 3 bytes of UTF-8 (a surrogate
	 * pair is 4 bytes either way).  The extra bytes cover the
	 * terminating NUL plus a little slack.
	 */
	ileft = (size_t)len;
	outsize = (ileft / 2) * 3 + 4;
	result = malloc(outsize);
	if (result == NULL) {
		(void) iconv_close(cd);
		return (NULL);
	}

	inp = (char *)unicode_string;
	outp = result;
	oleft = outsize - 1;	/* keep one byte for the NUL */

	/*
	 * The second parameter of iconv() is "const char **" on some
	 * systems and "char **" on others; going through void * is
	 * accepted by both prototypes.
	 */
	if (iconv(cd, (void *)&inp, &ileft, &outp, &oleft) == (size_t)-1) {
		free(result);
		result = NULL;
	} else {
		*outp = '\0';
	}

	(void) iconv_close(cd);
	return (result);
}
343*6007Sthurlow 
/*
 * Convert UTF-8 string to little-endian Unicode.
 *
 * With NOTPORTED defined (the CoreFoundation implementation), returns
 * a malloc'd array of 16-bit units, each swapped from host to
 * little-endian order and followed by a zero terminator, or NULL if
 * the string could not be parsed or the allocation failed.
 *
 * Without NOTPORTED nothing is converted: the argument pointer itself
 * is returned, merely cast to the return type.
 */
unsigned short *
convert_utf8_to_leunicode(const char *utf8_string)
{
#ifdef NOTPORTED
	CFStringRef cfstr;
	CFIndex nchars;
	CFRange whole;
	unsigned short *units;
	int k;

	cfstr = CFStringCreateWithCString(NULL, utf8_string,
	    kCFStringEncodingUTF8);
	if (cfstr == NULL) {
		smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1,
		    utf8_string);
		return NULL;
	}

	/* one extra 16-bit unit for the terminator */
	nchars = CFStringGetLength(cfstr);
	units = malloc(2*(nchars + 1));
	if (units == NULL) {
		smb_error("Couldn't allocate buffer for Unicode string for \"%s\" - skipping", -1,
		    utf8_string);
		CFRelease(cfstr);
		return NULL;
	}

	whole.location = 0;
	whole.length = nchars;
	CFStringGetCharacters(cfstr, whole, units);
	CFRelease(cfstr);

	k = 0;
	while (k < nchars) {
		units[k] = CFSwapInt16HostToLittle(units[k]);
		k++;
	}
	units[nchars] = 0;
	return units;
#else /* NOTPORTED */
	/* LINTED */ /* XXX Really need to fix this! */
	return ((unsigned short *)utf8_string); /* XXX */
#endif /* NOTPORTED */
}
386