16007Sthurlow /*
26007Sthurlow * Copyright (c) 2001 Apple Computer, Inc. All rights reserved.
36007Sthurlow *
46007Sthurlow * @APPLE_LICENSE_HEADER_START@
56007Sthurlow *
66007Sthurlow * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights
76007Sthurlow * Reserved. This file contains Original Code and/or Modifications of
86007Sthurlow * Original Code as defined in and that are subject to the Apple Public
96007Sthurlow * Source License Version 1.0 (the 'License'). You may not use this file
106007Sthurlow * except in compliance with the License. Please obtain a copy of the
116007Sthurlow * License at http://www.apple.com/publicsource and read it before using
126007Sthurlow * this file.
136007Sthurlow *
146007Sthurlow * The Original Code and all software distributed under the License are
156007Sthurlow * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
166007Sthurlow * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
176007Sthurlow * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
186007Sthurlow * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
196007Sthurlow * License for the specific language governing rights and limitations
206007Sthurlow * under the License."
216007Sthurlow *
226007Sthurlow * @APPLE_LICENSE_HEADER_END@
236007Sthurlow */
248271SGordon.Ross@Sun.COM /* CSTYLED */
258271SGordon.Ross@Sun.COM /*
268271SGordon.Ross@Sun.COM * @(#)charsets.c *
276007Sthurlow * (c) 2004 Apple Computer, Inc. All Rights Reserved
286007Sthurlow *
296007Sthurlow *
306007Sthurlow * charsets.c -- Routines converting between UTF-8, 16-bit
316007Sthurlow * little-endian Unicode, and various Windows
326007Sthurlow * code pages.
336007Sthurlow *
346007Sthurlow * MODIFICATION HISTORY:
356007Sthurlow * 28-Nov-2004 Guy Harris New today
366007Sthurlow */
376007Sthurlow
386007Sthurlow #include <stdlib.h>
396007Sthurlow #include <stdio.h>
406007Sthurlow #include <string.h>
416007Sthurlow #include <ctype.h>
428271SGordon.Ross@Sun.COM #include <errno.h>
436007Sthurlow #include <iconv.h>
446007Sthurlow #include <langinfo.h>
456007Sthurlow #include <strings.h>
46*10023SGordon.Ross@Sun.COM #include <libintl.h>
476007Sthurlow
48*10023SGordon.Ross@Sun.COM #include <sys/isa_defs.h>
496007Sthurlow #include <netsmb/smb_lib.h>
506007Sthurlow #include <netsmb/mchain.h>
516007Sthurlow
526007Sthurlow #include "charsets.h"
536007Sthurlow
/*
 * On Solaris, we will need to do some rewriting to use our iconv
 * routines for the conversions. For now, we're effectively
 * stubbing out code, leaving the details of what happens on
 * Darwin in case it's useful as a guide later.
 */
606007Sthurlow
/*
 * Convert one hexadecimal digit character to its numeric value.
 *
 * Returns 0-9 for a decimal digit and 10 plus the letter's offset from
 * 'a' (or 'A') for a letter, so 'a'-'f' and 'A'-'F' give 10-15.
 * Callers treat any result greater than 15 as "not a hex digit";
 * characters that are neither digits nor letters return 16.
 */
static unsigned
xtoi(char u)
{
	/*
	 * The <ctype.h> classifiers are only defined for EOF and for
	 * values representable as unsigned char.  Plain char may be
	 * signed, so a byte >= 0x80 would otherwise be passed as a
	 * negative value; convert before classifying.
	 */
	unsigned char c = (unsigned char)u;

	if (isdigit(c))
		return ((unsigned)(c - '0'));
	else if (islower(c))
		return ((unsigned)(10 + c - 'a'));
	else if (isupper(c))
		return ((unsigned)(10 + c - 'A'));
	return (16);
}
726007Sthurlow
736007Sthurlow
/*
 * Removes the "%" escape sequences from a URL component, in place.
 * See IETF RFC 2396.
 *
 * Each '%' followed by two hex digits (either case) is replaced by the
 * single byte those digits encode.  A '%' that is not followed by two
 * hex digits is an invalid escape and is left unchanged.  A decoded
 * byte is never scanned again, so "%2541" becomes "%41", not "A".
 * Note that "%00" decodes to a NUL byte, which terminates the string
 * as far as callers using string functions are concerned.
 *
 * Returns its argument; a NULL argument is returned untouched.
 */
char *
unpercent(char *component)
{
	char *src, *dst;
	unsigned hi, lo;

	if (component == NULL)
		return (component);

	/*
	 * Decode in one pass with separate read and write positions.
	 * The write position never gets ahead of the read position, so
	 * working within the same buffer is safe, and the tail of the
	 * string does not have to be shifted left for every escape.
	 */
	src = dst = component;
	while (*src != '\0') {
		/*
		 * src[2] is examined only when src[1] is a hex digit
		 * (hence not the terminator), so this never reads past
		 * the end of the string.
		 */
		if (*src == '%' &&
		    (hi = xtoi(src[1])) <= 15 &&
		    (lo = xtoi(src[2])) <= 15) {
			*dst++ = (char)(hi * 16 + lo);
			src += 3;
		} else {
			/* ordinary character or invalid escape */
			*dst++ = *src++;
		}
	}
	*dst = '\0';

	return (component);
}
1036007Sthurlow
/* BEGIN CSTYLED */
#ifdef NOTPORTED
/*
 * Compiled only when NOTPORTED is defined (code retained from Darwin).
 *
 * Asks __CFStringGetInstallationEncodingAndRegion() for the encoding
 * index and region, and maps that Mac encoding to the DOS encoding
 * used as its Windows equivalent.  Mac Roman maps to DOS Latin US when
 * the region is zero and to DOS Latin1 otherwise; any encoding not
 * listed below also maps to DOS Latin1.
 */
static CFStringEncoding
get_windows_encoding_equivalent( void )
{

	CFStringEncoding encoding;
	uint32_t index,region;

	/* important! use root ID so you can read the config file! */
	seteuid(eff_uid);
	__CFStringGetInstallationEncodingAndRegion(&index,&region);
	seteuid(real_uid);

	switch ( index )
	{
	case	kCFStringEncodingMacRoman:
		if (region) /* anything nonzero is not US */
			encoding = kCFStringEncodingDOSLatin1;
		else /* US region */
			encoding = kCFStringEncodingDOSLatinUS;
		break;

	case	kCFStringEncodingMacJapanese:
		encoding = kCFStringEncodingDOSJapanese;
		break;

	case	kCFStringEncodingMacChineseTrad:
		encoding = kCFStringEncodingDOSChineseTrad;
		break;

	case	kCFStringEncodingMacKorean:
		encoding = kCFStringEncodingDOSKorean;
		break;

	case	kCFStringEncodingMacArabic:
		encoding = kCFStringEncodingDOSArabic;
		break;

	case	kCFStringEncodingMacHebrew:
		encoding = kCFStringEncodingDOSHebrew;
		break;

	case	kCFStringEncodingMacGreek:
		encoding = kCFStringEncodingDOSGreek;
		break;

	case	kCFStringEncodingMacCyrillic:
		encoding = kCFStringEncodingDOSCyrillic;
		break;

	case	kCFStringEncodingMacThai:
		encoding = kCFStringEncodingDOSThai;
		break;

	case	kCFStringEncodingMacChineseSimp:
		encoding = kCFStringEncodingDOSChineseSimplif;
		break;

	case	kCFStringEncodingMacCentralEurRoman:
		encoding = kCFStringEncodingDOSLatin2;
		break;

	case	kCFStringEncodingMacTurkish:
		encoding = kCFStringEncodingDOSTurkish;
		break;

	case	kCFStringEncodingMacCroatian:
		encoding = kCFStringEncodingDOSLatin2;
		break;

	case	kCFStringEncodingMacIcelandic:
		encoding = kCFStringEncodingDOSIcelandic;
		break;

	case	kCFStringEncodingMacRomanian:
		encoding = kCFStringEncodingDOSLatin2;
		break;

	case	kCFStringEncodingMacFarsi:
		encoding = kCFStringEncodingDOSArabic;
		break;

	case	kCFStringEncodingMacUkrainian:
		encoding = kCFStringEncodingDOSCyrillic;
		break;

	default:
		encoding = kCFStringEncodingDOSLatin1;
		break;
	}

	return encoding;
}
#endif /* NOTPORTED */
1996007Sthurlow
/*
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 *
 * Returns a newly allocated copy of windows_string intended to be its
 * UTF-8 form, or NULL on failure or when windows_string is NULL.  The
 * caller owns the result and releases it with free().
 *
 * Unless NOTPORTED is defined, no conversion is performed: the input
 * is simply duplicated.  The NOTPORTED code is the CoreFoundation
 * implementation retained from Darwin.
 */
char *
convert_wincs_to_utf8(const char *windows_string)
{
#ifdef NOTPORTED
	CFStringRef s;
	CFIndex maxlen;
	char *result;

	if (windows_string == NULL)
		return NULL;

	s = CFStringCreateWithCString(NULL, windows_string,
		get_windows_encoding_equivalent());
	if (s == NULL) {
		smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" ", -1,
		    windows_string);

		/* kCFStringEncodingMacRoman should always succeed */
		s = CFStringCreateWithCString(NULL, windows_string,
		    kCFStringEncodingMacRoman);
		if (s == NULL) {
			smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" with kCFStringEncodingMacRoman - skipping",
			    -1, windows_string);
			return NULL;
		}
	}

	maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s),
	    kCFStringEncodingUTF8) + 1;
	result = malloc(maxlen);
	if (result == NULL) {
		smb_error("Couldn't allocate buffer for UTF-8 string for \"%s\" - skipping", -1,
		    windows_string);
		CFRelease(s);
		return NULL;
	}
	if (!CFStringGetCString(s, result, maxlen, kCFStringEncodingUTF8)) {
		smb_error("CFStringGetCString for UTF-8 failed on \"%s\" - skipping",
		    -1, windows_string);
		/* don't leak the output buffer on the failure path */
		free(result);
		CFRelease(s);
		return NULL;
	}
	CFRelease(s);
	return result;
#else /* NOTPORTED */
	/* strdup() must not be handed a NULL pointer */
	if (windows_string == NULL)
		return (NULL);
	return (strdup(windows_string));
#endif /* NOTPORTED */
}
2496007Sthurlow
/*
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 *
 * Returns a newly allocated copy of utf8_string intended to be its
 * Windows code page form, or NULL on failure or when utf8_string is
 * NULL.  The caller owns the result and releases it with free().
 *
 * Unless NOTPORTED is defined, no conversion is performed: the input
 * is simply duplicated.  The NOTPORTED code is the CoreFoundation
 * implementation retained from Darwin.
 */
char *
convert_utf8_to_wincs(const char *utf8_string)
{
#ifdef NOTPORTED
	CFStringRef s;
	CFIndex maxlen;
	char *result;

	if (utf8_string == NULL)
		return NULL;

	s = CFStringCreateWithCString(NULL, utf8_string,
	    kCFStringEncodingUTF8);
	if (s == NULL) {
		smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1,
		    utf8_string);
		return NULL;
	}

	maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s),
	    get_windows_encoding_equivalent()) + 1;
	result = malloc(maxlen);
	if (result == NULL) {
		smb_error("Couldn't allocate buffer for Windows code page string for \"%s\" - skipping", -1,
		    utf8_string);
		CFRelease(s);
		return NULL;
	}
	if (!CFStringGetCString(s, result, maxlen,
	    get_windows_encoding_equivalent())) {
		smb_error("CFStringGetCString for Windows code page failed on \"%s\" - skipping",
		    -1, utf8_string);
		/* don't leak the output buffer on the failure path */
		free(result);
		CFRelease(s);
		return NULL;
	}
	CFRelease(s);
	return result;
#else /* NOTPORTED */
	/* strdup() must not be handed a NULL pointer */
	if (utf8_string == NULL)
		return (NULL);
	return (strdup(utf8_string));
#endif /* NOTPORTED */
}
/* END CSTYLED */
2936007Sthurlow
/*
 * We replaced these routines for Solaris:
 *	convert_leunicode_to_utf8
 *	convert_unicode_to_utf8
 *	convert_utf8_to_leunicode
 * with new code in: utf_str.c
 */
301