xref: /onnv-gate/usr/src/lib/libsmbfs/smb/charsets.c (revision 10023:71bf38dba3d6)
16007Sthurlow /*
26007Sthurlow  * Copyright (c) 2001 Apple Computer, Inc. All rights reserved.
36007Sthurlow  *
46007Sthurlow  * @APPLE_LICENSE_HEADER_START@
56007Sthurlow  *
66007Sthurlow  * "Portions Copyright (c) 1999 Apple Computer, Inc.  All Rights
76007Sthurlow  * Reserved.  This file contains Original Code and/or Modifications of
86007Sthurlow  * Original Code as defined in and that are subject to the Apple Public
96007Sthurlow  * Source License Version 1.0 (the 'License').  You may not use this file
106007Sthurlow  * except in compliance with the License.  Please obtain a copy of the
116007Sthurlow  * License at http://www.apple.com/publicsource and read it before using
126007Sthurlow  * this file.
136007Sthurlow  *
146007Sthurlow  * The Original Code and all software distributed under the License are
156007Sthurlow  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
166007Sthurlow  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
176007Sthurlow  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
186007Sthurlow  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
196007Sthurlow  * License for the specific language governing rights and limitations
206007Sthurlow  * under the License."
216007Sthurlow  *
226007Sthurlow  * @APPLE_LICENSE_HEADER_END@
236007Sthurlow  */
248271SGordon.Ross@Sun.COM /* CSTYLED */
258271SGordon.Ross@Sun.COM /*
268271SGordon.Ross@Sun.COM  *      @(#)charsets.c      *
276007Sthurlow  *      (c) 2004   Apple Computer, Inc.  All Rights Reserved
286007Sthurlow  *
296007Sthurlow  *
306007Sthurlow  *      charsets.c -- Routines converting between UTF-8, 16-bit
316007Sthurlow  *			little-endian Unicode, and various Windows
326007Sthurlow  *			code pages.
336007Sthurlow  *
346007Sthurlow  *      MODIFICATION HISTORY:
356007Sthurlow  *       28-Nov-2004     Guy Harris	New today
366007Sthurlow  */
376007Sthurlow 
386007Sthurlow #include <stdlib.h>
396007Sthurlow #include <stdio.h>
406007Sthurlow #include <string.h>
416007Sthurlow #include <ctype.h>
428271SGordon.Ross@Sun.COM #include <errno.h>
436007Sthurlow #include <iconv.h>
446007Sthurlow #include <langinfo.h>
456007Sthurlow #include <strings.h>
46*10023SGordon.Ross@Sun.COM #include <libintl.h>
476007Sthurlow 
48*10023SGordon.Ross@Sun.COM #include <sys/isa_defs.h>
496007Sthurlow #include <netsmb/smb_lib.h>
506007Sthurlow #include <netsmb/mchain.h>
516007Sthurlow 
526007Sthurlow #include "charsets.h"
536007Sthurlow 
/*
 * On Solaris, these conversions will need some rewriting to use our
 * iconv routines.  For now the code is effectively stubbed out, and
 * the details of what happens on Darwin are kept (under NOTPORTED)
 * in case they are useful as a guide later.
 */
606007Sthurlow 
/*
 * Convert one ASCII hexadecimal digit to its numeric value.
 *
 * Returns 0..15 for '0'-'9', 'a'-'f' and 'A'-'F'.  Any other character
 * (including NUL) yields 16, so callers can reject it with "> 15".
 *
 * Plain range comparisons are used rather than the <ctype.h>
 * classifiers: handing a negative char to isdigit()/islower()/isupper()
 * is undefined behavior, and their answers for bytes outside ASCII
 * depend on the current locale.  Letters past 'f'/'F' are not hex
 * digits and are rejected like everything else.
 */
static unsigned
xtoi(char u)
{
	if (u >= '0' && u <= '9')
		return ((unsigned)(u - '0'));
	if (u >= 'a' && u <= 'f')
		return ((unsigned)(10 + (u - 'a')));
	if (u >= 'A' && u <= 'F')
		return ((unsigned)(10 + (u - 'A')));
	return (16);
}
726007Sthurlow 
736007Sthurlow 
/*
 * Removes the "%" escape sequences from a URL component, in place.
 * See IETF RFC 2396.
 *
 * Every "%XY" where X and Y are both hex digits is replaced by the
 * single byte with value 0xXY.  A '%' that is not followed by two hex
 * digits is an invalid escape and is copied through untouched.
 * Decoded bytes are never rescanned, so "%2541" becomes "%41".
 *
 * Note that "%00" decodes to a NUL byte: the remainder of the input
 * is still decoded and stored after it, but callers using string
 * functions will see the result end at that byte.
 *
 * Returns the component pointer it was given (NULL is passed through).
 */
char *
unpercent(char *component)
{
	char *src, *dst;
	unsigned hi, lo;

	if (component == NULL)
		return (component);

	/*
	 * Compact the string with separate read and write cursors.
	 * The output never gets ahead of the input, so this is safe to
	 * do in place, and it avoids shifting the whole tail of the
	 * string once per escape.
	 *
	 * The && chain matters: xtoi() rejects NUL, so src[2] is only
	 * examined when src[1] is a real character, and we never read
	 * past the terminator of a string ending in "%" or "%X".
	 */
	src = dst = component;
	while (*src != '\0') {
		if (src[0] == '%' &&
		    (hi = xtoi(src[1])) <= 15 &&
		    (lo = xtoi(src[2])) <= 15) {
			*dst++ = (char)(hi * 16 + lo);
			src += 3;
		} else {
			*dst++ = *src++;
		}
	}
	*dst = '\0';

	return (component);
}
1036007Sthurlow 
1048271SGordon.Ross@Sun.COM /* BEGIN CSTYLED */
#ifdef NOTPORTED
/*
 * Choose the DOS code page (as a CFStringEncoding) that corresponds to
 * the Mac installation encoding reported by
 * __CFStringGetInstallationEncodingAndRegion().
 *
 * Installation encodings with no explicit entry below map to
 * kCFStringEncodingDOSLatin1.
 */
static CFStringEncoding
get_windows_encoding_equivalent( void )
{
	uint32_t mac_encoding, region;

	/* important! use root ID so you can read the config file! */
	seteuid(eff_uid);
	__CFStringGetInstallationEncodingAndRegion(&mac_encoding, &region);
	seteuid(real_uid);

	switch (mac_encoding) {
	case kCFStringEncodingMacRoman:
		/* region zero is US; anything nonzero is not US */
		if (region)
			return kCFStringEncodingDOSLatin1;
		return kCFStringEncodingDOSLatinUS;

	case kCFStringEncodingMacJapanese:
		return kCFStringEncodingDOSJapanese;

	case kCFStringEncodingMacChineseTrad:
		return kCFStringEncodingDOSChineseTrad;

	case kCFStringEncodingMacChineseSimp:
		return kCFStringEncodingDOSChineseSimplif;

	case kCFStringEncodingMacKorean:
		return kCFStringEncodingDOSKorean;

	case kCFStringEncodingMacArabic:
	case kCFStringEncodingMacFarsi:
		return kCFStringEncodingDOSArabic;

	case kCFStringEncodingMacHebrew:
		return kCFStringEncodingDOSHebrew;

	case kCFStringEncodingMacGreek:
		return kCFStringEncodingDOSGreek;

	case kCFStringEncodingMacCyrillic:
	case kCFStringEncodingMacUkrainian:
		return kCFStringEncodingDOSCyrillic;

	case kCFStringEncodingMacThai:
		return kCFStringEncodingDOSThai;

	case kCFStringEncodingMacCentralEurRoman:
	case kCFStringEncodingMacCroatian:
	case kCFStringEncodingMacRomanian:
		return kCFStringEncodingDOSLatin2;

	case kCFStringEncodingMacTurkish:
		return kCFStringEncodingDOSTurkish;

	case kCFStringEncodingMacIcelandic:
		return kCFStringEncodingDOSIcelandic;

	default:
		break;
	}

	/* everything else falls back to DOS Latin 1 */
	return kCFStringEncodingDOSLatin1;
}
#endif /* NOTPORTED */
1996007Sthurlow 
/*
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 *
 * Convert a string from the Windows code page to UTF-8.
 *
 * Returns a newly allocated string that the caller must free(), or
 * NULL if windows_string is NULL or the allocation/conversion fails.
 *
 * When NOTPORTED is not defined no conversion is performed; the result
 * is simply a copy of the input.
 */
char *
convert_wincs_to_utf8(const char *windows_string)
{
#ifdef NOTPORTED
	CFStringRef s;
	CFIndex maxlen;
	char *result;
#endif /* NOTPORTED */

	/* Neither strdup() nor the CF path may be handed a NULL string. */
	if (windows_string == NULL)
		return (NULL);

#ifdef NOTPORTED
	s = CFStringCreateWithCString(NULL, windows_string,
		get_windows_encoding_equivalent());
	if (s == NULL) {
		smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" ", -1,
		    windows_string);

		/* kCFStringEncodingMacRoman should always succeed */
		s = CFStringCreateWithCString(NULL, windows_string,
		    kCFStringEncodingMacRoman);
		if (s == NULL) {
			smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" with kCFStringEncodingMacRoman - skipping",
			    -1, windows_string);
			return NULL;
		}
	}

	maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s),
	    kCFStringEncodingUTF8) + 1;
	result = malloc(maxlen);
	if (result == NULL) {
		smb_error("Couldn't allocate buffer for UTF-8 string for \"%s\" - skipping", -1,
		    windows_string);
		CFRelease(s);
		return NULL;
	}
	if (!CFStringGetCString(s, result, maxlen, kCFStringEncodingUTF8)) {
		smb_error("CFStringGetCString for UTF-8 failed on \"%s\" - skipping",
		    -1, windows_string);
		/* don't leak the output buffer on a failed conversion */
		free(result);
		CFRelease(s);
		return NULL;
	}
	CFRelease(s);
	return result;
#else /* NOTPORTED */
	return (strdup(windows_string));
#endif /* NOTPORTED */
}
2496007Sthurlow 
/*
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 *
 * Convert a UTF-8 string to the Windows code page.
 *
 * Returns a newly allocated string that the caller must free(), or
 * NULL if utf8_string is NULL or the allocation/conversion fails.
 *
 * When NOTPORTED is not defined no conversion is performed; the result
 * is simply a copy of the input.
 */
char *
convert_utf8_to_wincs(const char *utf8_string)
{
#ifdef NOTPORTED
	CFStringRef s;
	CFIndex maxlen;
	char *result;
#endif /* NOTPORTED */

	/* Neither strdup() nor the CF path may be handed a NULL string. */
	if (utf8_string == NULL)
		return (NULL);

#ifdef NOTPORTED
	s = CFStringCreateWithCString(NULL, utf8_string,
	    kCFStringEncodingUTF8);
	if (s == NULL) {
		smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1,
		    utf8_string);
		return NULL;
	}

	maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s),
	    get_windows_encoding_equivalent()) + 1;
	result = malloc(maxlen);
	if (result == NULL) {
		smb_error("Couldn't allocate buffer for Windows code page string for \"%s\" - skipping", -1,
		    utf8_string);
		CFRelease(s);
		return NULL;
	}
	if (!CFStringGetCString(s, result, maxlen,
	    get_windows_encoding_equivalent())) {
		smb_error("CFStringGetCString for Windows code page failed on \"%s\" - skipping",
		    -1, utf8_string);
		/* don't leak the output buffer on a failed conversion */
		free(result);
		CFRelease(s);
		return NULL;
	}
	CFRelease(s);
	return result;
#else /* NOTPORTED */
	return (strdup(utf8_string));
#endif /* NOTPORTED */
}
2928271SGordon.Ross@Sun.COM /* END CSTYLED */
2936007Sthurlow 
/*
 * We replaced these routines for Solaris:
 *	convert_leunicode_to_utf8
 *	convert_unicode_to_utf8
 *	convert_utf8_to_leunicode
 * with new code in: utf_str.c
 */
301