xref: /onnv-gate/usr/src/lib/libsmbfs/smb/charsets.c (revision 8271:792589b3384f)
16007Sthurlow /*
26007Sthurlow  * Copyright (c) 2001 Apple Computer, Inc. All rights reserved.
36007Sthurlow  *
46007Sthurlow  * @APPLE_LICENSE_HEADER_START@
56007Sthurlow  *
66007Sthurlow  * "Portions Copyright (c) 1999 Apple Computer, Inc.  All Rights
76007Sthurlow  * Reserved.  This file contains Original Code and/or Modifications of
86007Sthurlow  * Original Code as defined in and that are subject to the Apple Public
96007Sthurlow  * Source License Version 1.0 (the 'License').  You may not use this file
106007Sthurlow  * except in compliance with the License.  Please obtain a copy of the
116007Sthurlow  * License at http://www.apple.com/publicsource and read it before using
126007Sthurlow  * this file.
136007Sthurlow  *
146007Sthurlow  * The Original Code and all software distributed under the License are
156007Sthurlow  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
166007Sthurlow  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
176007Sthurlow  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
186007Sthurlow  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
196007Sthurlow  * License for the specific language governing rights and limitations
206007Sthurlow  * under the License."
216007Sthurlow  *
226007Sthurlow  * @APPLE_LICENSE_HEADER_END@
236007Sthurlow  */
24*8271SGordon.Ross@Sun.COM /* CSTYLED */
25*8271SGordon.Ross@Sun.COM /*
26*8271SGordon.Ross@Sun.COM  *      @(#)charsets.c      *
276007Sthurlow  *      (c) 2004   Apple Computer, Inc.  All Rights Reserved
286007Sthurlow  *
296007Sthurlow  *
306007Sthurlow  *      charsets.c -- Routines converting between UTF-8, 16-bit
316007Sthurlow  *			little-endian Unicode, and various Windows
326007Sthurlow  *			code pages.
336007Sthurlow  *
346007Sthurlow  *      MODIFICATION HISTORY:
356007Sthurlow  *       28-Nov-2004     Guy Harris	New today
366007Sthurlow  */
376007Sthurlow 
386007Sthurlow #include <stdlib.h>
396007Sthurlow #include <stdio.h>
406007Sthurlow #include <string.h>
416007Sthurlow #include <ctype.h>
42*8271SGordon.Ross@Sun.COM #include <errno.h>
436007Sthurlow #include <iconv.h>
446007Sthurlow #include <langinfo.h>
456007Sthurlow #include <strings.h>
466007Sthurlow 
476007Sthurlow #include <netsmb/smb_lib.h>
486007Sthurlow #include <netsmb/mchain.h>
496007Sthurlow 
506007Sthurlow #include "charsets.h"
516007Sthurlow 
526007Sthurlow /*
536007Sthurlow  * On Solaris, we will need to do some rewriting to use our iconv
546007Sthurlow  * routines for the conversions.  For now, we're effectively
556007Sthurlow  * stubbing out code, leaving the details of what happens on
566007Sthurlow  * Darwin in case it's useful as a guide later.
576007Sthurlow  */
586007Sthurlow 
/*
 * Convert one hexadecimal digit character to its numeric value.
 *
 * Returns 0-15 for '0'-'9', 'a'-'f' and 'A'-'F'.  Any other character
 * (including NUL and bytes with the high bit set) yields 16, which
 * callers treat as "not a hex digit" by testing for a value > 15.
 *
 * The digit classes are tested with explicit ASCII ranges rather than
 * the <ctype.h> classifiers: handing a plain char that happens to be
 * negative to isdigit()/islower()/isupper() is undefined behavior, and
 * the alphabetic classifiers depend on the current locale.
 */
static unsigned
xtoi(char u)
{
	if (u >= '0' && u <= '9')
		return (u - '0');
	if (u >= 'a' && u <= 'f')
		return (10 + u - 'a');
	if (u >= 'A' && u <= 'F')
		return (10 + u - 'A');
	return (16);
}
706007Sthurlow 
716007Sthurlow 
/*
 * Removes the "%" escape sequences from a URL component, in place.
 * See IETF RFC 2396.
 *
 * component: NUL-terminated string to decode, or NULL.  The buffer is
 *	rewritten in place; the decoded string is never longer than
 *	the input.
 *
 * Returns the same pointer that was passed in (NULL stays NULL).
 *
 * A '%' that is not followed by two hexadecimal digits is not treated
 * as an escape and is copied through unchanged.  A decoded byte is
 * never examined again, so "%2541" becomes "%41" rather than "A".
 */
char *
unpercent(char *component)
{
	char *src, *dst;
	unsigned hi, lo;

	if (component == NULL)
		return (component);

	/*
	 * One pass with a read cursor (src) and a write cursor (dst).
	 * dst never gets ahead of src, so nothing that is still to be
	 * read is overwritten.  This replaces shifting the whole tail
	 * of the string down with memmove() for every escape found,
	 * which was quadratic in the number of escapes.
	 *
	 * The || / && short-circuit matters: xtoi() of the terminating
	 * NUL is > 15, so a truncated escape at the end of the string
	 * never causes a read past the terminator.
	 */
	src = dst = component;
	while (*src != '\0') {
		if (*src == '%' &&
		    (hi = xtoi(src[1])) <= 15 &&
		    (lo = xtoi(src[2])) <= 15) {
			*dst++ = hi * 16 + lo;
			src += 3;
		} else {
			/* ordinary character or invalid escape */
			*dst++ = *src++;
		}
	}
	*dst = '\0';

	return (component);
}
1016007Sthurlow 
102*8271SGordon.Ross@Sun.COM /* BEGIN CSTYLED */
#ifdef NOTPORTED
/*
 * Choose the DOS/Windows CFStringEncoding that corresponds to the
 * installation encoding reported by
 * __CFStringGetInstallationEncodingAndRegion().
 *
 * Mac Roman maps to DOS Latin US when the region is zero and to
 * DOS Latin 1 otherwise.  Any encoding that is not listed below
 * also maps to DOS Latin 1.
 */
static CFStringEncoding
get_windows_encoding_equivalent(void)
{
	uint32_t mac_encoding, mac_region;

	/* important! use root ID so you can read the config file! */
	seteuid(eff_uid);
	__CFStringGetInstallationEncodingAndRegion(&mac_encoding, &mac_region);
	seteuid(real_uid);

	switch (mac_encoding) {
	case kCFStringEncodingMacRoman:
		/* region zero is US; anything nonzero is not US */
		if (mac_region == 0)
			return (kCFStringEncodingDOSLatinUS);
		return (kCFStringEncodingDOSLatin1);

	case kCFStringEncodingMacJapanese:
		return (kCFStringEncodingDOSJapanese);

	case kCFStringEncodingMacChineseTrad:
		return (kCFStringEncodingDOSChineseTrad);

	case kCFStringEncodingMacKorean:
		return (kCFStringEncodingDOSKorean);

	case kCFStringEncodingMacArabic:
	case kCFStringEncodingMacFarsi:
		return (kCFStringEncodingDOSArabic);

	case kCFStringEncodingMacHebrew:
		return (kCFStringEncodingDOSHebrew);

	case kCFStringEncodingMacGreek:
		return (kCFStringEncodingDOSGreek);

	case kCFStringEncodingMacCyrillic:
	case kCFStringEncodingMacUkrainian:
		return (kCFStringEncodingDOSCyrillic);

	case kCFStringEncodingMacThai:
		return (kCFStringEncodingDOSThai);

	case kCFStringEncodingMacChineseSimp:
		return (kCFStringEncodingDOSChineseSimplif);

	case kCFStringEncodingMacCentralEurRoman:
	case kCFStringEncodingMacCroatian:
	case kCFStringEncodingMacRomanian:
		return (kCFStringEncodingDOSLatin2);

	case kCFStringEncodingMacTurkish:
		return (kCFStringEncodingDOSTurkish);

	case kCFStringEncodingMacIcelandic:
		return (kCFStringEncodingDOSIcelandic);

	default:
		return (kCFStringEncodingDOSLatin1);
	}
}
#endif /* NOTPORTED */
1976007Sthurlow 
/*
 * Convert a string from the Windows code page to UTF-8.
 *
 * windows_string: NUL-terminated input string, or NULL.
 *
 * Returns a newly allocated string that the caller must free(), or
 * NULL if windows_string is NULL, memory could not be allocated, or
 * (NOTPORTED build only) the conversion failed.
 *
 * When NOTPORTED is not defined no conversion is performed: the
 * result is simply a copy of the input.
 *
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 */
char *
convert_wincs_to_utf8(const char *windows_string)
{
#ifdef NOTPORTED
	CFStringRef s;
	CFIndex maxlen;
	char *result;
#endif /* NOTPORTED */

	/* strdup(NULL) and CFStringCreateWithCString(NULL) are not safe */
	if (windows_string == NULL)
		return (NULL);

#ifdef NOTPORTED
	s = CFStringCreateWithCString(NULL, windows_string,
		get_windows_encoding_equivalent());
	if (s == NULL) {
		smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" ", -1,
		    windows_string);

		/* kCFStringEncodingMacRoman should always succeed */
		s = CFStringCreateWithCString(NULL, windows_string,
		    kCFStringEncodingMacRoman);
		if (s == NULL) {
			smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" with kCFStringEncodingMacRoman - skipping",
			    -1, windows_string);
			return NULL;
		}
	}

	maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s),
	    kCFStringEncodingUTF8) + 1;
	result = malloc(maxlen);
	if (result == NULL) {
		smb_error("Couldn't allocate buffer for UTF-8 string for \"%s\" - skipping", -1,
		    windows_string);
		CFRelease(s);
		return NULL;
	}
	if (!CFStringGetCString(s, result, maxlen, kCFStringEncodingUTF8)) {
		smb_error("CFStringGetCString for UTF-8 failed on \"%s\" - skipping",
		    -1, windows_string);
		/* don't leak the output buffer on a failed conversion */
		free(result);
		CFRelease(s);
		return NULL;
	}
	CFRelease(s);
	return result;
#else /* NOTPORTED */
	return (strdup(windows_string));
#endif /* NOTPORTED */
}
2476007Sthurlow 
/*
 * Convert a UTF-8 string to the Windows code page.
 *
 * utf8_string: NUL-terminated input string, or NULL.
 *
 * Returns a newly allocated string that the caller must free(), or
 * NULL if utf8_string is NULL, memory could not be allocated, or
 * (NOTPORTED build only) the conversion failed.
 *
 * When NOTPORTED is not defined no conversion is performed: the
 * result is simply a copy of the input.
 *
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 */
char *
convert_utf8_to_wincs(const char *utf8_string)
{
#ifdef NOTPORTED
	CFStringRef s;
	CFIndex maxlen;
	char *result;
#endif /* NOTPORTED */

	/* strdup(NULL) and CFStringCreateWithCString(NULL) are not safe */
	if (utf8_string == NULL)
		return (NULL);

#ifdef NOTPORTED
	s = CFStringCreateWithCString(NULL, utf8_string,
	    kCFStringEncodingUTF8);
	if (s == NULL) {
		smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1,
		    utf8_string);
		return NULL;
	}

	maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s),
	    get_windows_encoding_equivalent()) + 1;
	result = malloc(maxlen);
	if (result == NULL) {
		smb_error("Couldn't allocate buffer for Windows code page string for \"%s\" - skipping", -1,
		    utf8_string);
		CFRelease(s);
		return NULL;
	}
	if (!CFStringGetCString(s, result, maxlen,
	    get_windows_encoding_equivalent())) {
		smb_error("CFStringGetCString for Windows code page failed on \"%s\" - skipping",
		    -1, utf8_string);
		/* don't leak the output buffer on a failed conversion */
		free(result);
		CFRelease(s);
		return NULL;
	}
	CFRelease(s);
	return result;
#else /* NOTPORTED */
	return (strdup(utf8_string));
#endif /* NOTPORTED */
}
290*8271SGordon.Ross@Sun.COM /* END CSTYLED */
2916007Sthurlow 
2926007Sthurlow /*
293*8271SGordon.Ross@Sun.COM  * We replaced these routines for Solaris:
294*8271SGordon.Ross@Sun.COM  *	convert_leunicode_to_utf8
295*8271SGordon.Ross@Sun.COM  *	convert_unicode_to_utf8
296*8271SGordon.Ross@Sun.COM  *	convert_utf8_to_leunicode
297*8271SGordon.Ross@Sun.COM  * with new code in: utf_str.c
2986007Sthurlow  */
299