16007Sthurlow /* 26007Sthurlow * Copyright (c) 2001 Apple Computer, Inc. All rights reserved. 36007Sthurlow * 46007Sthurlow * @APPLE_LICENSE_HEADER_START@ 56007Sthurlow * 66007Sthurlow * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights 76007Sthurlow * Reserved. This file contains Original Code and/or Modifications of 86007Sthurlow * Original Code as defined in and that are subject to the Apple Public 96007Sthurlow * Source License Version 1.0 (the 'License'). You may not use this file 106007Sthurlow * except in compliance with the License. Please obtain a copy of the 116007Sthurlow * License at http://www.apple.com/publicsource and read it before using 126007Sthurlow * this file. 136007Sthurlow * 146007Sthurlow * The Original Code and all software distributed under the License are 156007Sthurlow * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 166007Sthurlow * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 176007Sthurlow * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 186007Sthurlow * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the 196007Sthurlow * License for the specific language governing rights and limitations 206007Sthurlow * under the License." 216007Sthurlow * 226007Sthurlow * @APPLE_LICENSE_HEADER_END@ 236007Sthurlow */ 24*8271SGordon.Ross@Sun.COM /* CSTYLED */ 25*8271SGordon.Ross@Sun.COM /* 26*8271SGordon.Ross@Sun.COM * @(#)charsets.c * 276007Sthurlow * (c) 2004 Apple Computer, Inc. All Rights Reserved 286007Sthurlow * 296007Sthurlow * 306007Sthurlow * charsets.c -- Routines converting between UTF-8, 16-bit 316007Sthurlow * little-endian Unicode, and various Windows 326007Sthurlow * code pages. 336007Sthurlow * 346007Sthurlow * MODIFICATION HISTORY: 356007Sthurlow * 28-Nov-2004 Guy Harris New today 366007Sthurlow */ 376007Sthurlow 386007Sthurlow #include <stdlib.h> 396007Sthurlow #include <stdio.h> 406007Sthurlow #include <string.h> 416007Sthurlow #include <ctype.h> 42*8271SGordon.Ross@Sun.COM #include <errno.h> 436007Sthurlow #include <iconv.h> 446007Sthurlow #include <langinfo.h> 456007Sthurlow #include <strings.h> 466007Sthurlow 476007Sthurlow #include <netsmb/smb_lib.h> 486007Sthurlow #include <netsmb/mchain.h> 496007Sthurlow 506007Sthurlow #include "charsets.h" 516007Sthurlow 526007Sthurlow /* 536007Sthurlow * On Solaris, we will need to do some rewriting to use our iconv 546007Sthurlow * routines for the conversions. For now, we're effectively 556007Sthurlow * stubbing out code, leaving the details of what happens on 566007Sthurlow * Darwin in case it's useful as a guide later. 576007Sthurlow */ 586007Sthurlow 596007Sthurlow static unsigned 606007Sthurlow xtoi(char u) 616007Sthurlow { 62*8271SGordon.Ross@Sun.COM if (isdigit(u)) 63*8271SGordon.Ross@Sun.COM return (u - '0'); 64*8271SGordon.Ross@Sun.COM else if (islower(u)) 65*8271SGordon.Ross@Sun.COM return (10 + u - 'a'); 66*8271SGordon.Ross@Sun.COM else if (isupper(u)) 67*8271SGordon.Ross@Sun.COM return (10 + u - 'A'); 68*8271SGordon.Ross@Sun.COM return (16); 696007Sthurlow } 706007Sthurlow 716007Sthurlow 72*8271SGordon.Ross@Sun.COM /* 73*8271SGordon.Ross@Sun.COM * Removes the "%" escape sequences from a URL component. 746007Sthurlow * See IETF RFC 2396. 756007Sthurlow */ 766007Sthurlow char * 77*8271SGordon.Ross@Sun.COM unpercent(char *component) 786007Sthurlow { 79*8271SGordon.Ross@Sun.COM char c, *s; 80*8271SGordon.Ross@Sun.COM unsigned hi, lo; 81*8271SGordon.Ross@Sun.COM 82*8271SGordon.Ross@Sun.COM if (component == NULL) 83*8271SGordon.Ross@Sun.COM return (component); 846007Sthurlow 85*8271SGordon.Ross@Sun.COM for (s = component; (c = *s) != 0; s++) { 86*8271SGordon.Ross@Sun.COM if (c != '%') 87*8271SGordon.Ross@Sun.COM continue; 88*8271SGordon.Ross@Sun.COM if ((hi = xtoi(s[1])) > 15 || (lo = xtoi(s[2])) > 15) 89*8271SGordon.Ross@Sun.COM continue; /* ignore invalid escapes */ 90*8271SGordon.Ross@Sun.COM s[0] = hi*16 + lo; 91*8271SGordon.Ross@Sun.COM /* 92*8271SGordon.Ross@Sun.COM * This was strcpy(s + 1, s + 3); 93*8271SGordon.Ross@Sun.COM * But nowadays leftward overlapping copies are 94*8271SGordon.Ross@Sun.COM * officially undefined in C. Ours seems to 95*8271SGordon.Ross@Sun.COM * work or not depending upon alignment. 96*8271SGordon.Ross@Sun.COM */ 97*8271SGordon.Ross@Sun.COM memmove(s+1, s+3, strlen(s+3) + 1); 98*8271SGordon.Ross@Sun.COM } 99*8271SGordon.Ross@Sun.COM return (component); 1006007Sthurlow } 1016007Sthurlow 102*8271SGordon.Ross@Sun.COM /* BEGIN CSTYLED */ 1036007Sthurlow #ifdef NOTPORTED 1046007Sthurlow static CFStringEncoding 1056007Sthurlow get_windows_encoding_equivalent( void ) 1066007Sthurlow { 1076007Sthurlow 1086007Sthurlow CFStringEncoding encoding; 1096007Sthurlow uint32_t index,region; 1106007Sthurlow 1116007Sthurlow /* important! use root ID so you can read the config file! */ 1126007Sthurlow seteuid(eff_uid); 1136007Sthurlow __CFStringGetInstallationEncodingAndRegion(&index,®ion); 1146007Sthurlow seteuid(real_uid); 1156007Sthurlow 1166007Sthurlow switch ( index ) 1176007Sthurlow { 1186007Sthurlow case kCFStringEncodingMacRoman: 1196007Sthurlow if (region) /* anything nonzero is not US */ 1206007Sthurlow encoding = kCFStringEncodingDOSLatin1; 1216007Sthurlow else /* US region */ 1226007Sthurlow encoding = kCFStringEncodingDOSLatinUS; 1236007Sthurlow break; 1246007Sthurlow 1256007Sthurlow case kCFStringEncodingMacJapanese: 1266007Sthurlow encoding = kCFStringEncodingDOSJapanese; 1276007Sthurlow break; 1286007Sthurlow 1296007Sthurlow case kCFStringEncodingMacChineseTrad: 1306007Sthurlow encoding = kCFStringEncodingDOSChineseTrad; 1316007Sthurlow break; 1326007Sthurlow 1336007Sthurlow case kCFStringEncodingMacKorean: 1346007Sthurlow encoding = kCFStringEncodingDOSKorean; 1356007Sthurlow break; 1366007Sthurlow 1376007Sthurlow case kCFStringEncodingMacArabic: 1386007Sthurlow encoding = kCFStringEncodingDOSArabic; 1396007Sthurlow break; 1406007Sthurlow 1416007Sthurlow case kCFStringEncodingMacHebrew: 1426007Sthurlow encoding = kCFStringEncodingDOSHebrew; 1436007Sthurlow break; 1446007Sthurlow 1456007Sthurlow case kCFStringEncodingMacGreek: 1466007Sthurlow encoding = kCFStringEncodingDOSGreek; 1476007Sthurlow break; 1486007Sthurlow 1496007Sthurlow case kCFStringEncodingMacCyrillic: 1506007Sthurlow encoding = kCFStringEncodingDOSCyrillic; 1516007Sthurlow break; 1526007Sthurlow 1536007Sthurlow case kCFStringEncodingMacThai: 1546007Sthurlow encoding = kCFStringEncodingDOSThai; 1556007Sthurlow break; 1566007Sthurlow 1576007Sthurlow case kCFStringEncodingMacChineseSimp: 1586007Sthurlow encoding = kCFStringEncodingDOSChineseSimplif; 1596007Sthurlow break; 1606007Sthurlow 1616007Sthurlow case kCFStringEncodingMacCentralEurRoman: 1626007Sthurlow encoding = kCFStringEncodingDOSLatin2; 1636007Sthurlow break; 1646007Sthurlow 1656007Sthurlow case kCFStringEncodingMacTurkish: 1666007Sthurlow encoding = kCFStringEncodingDOSTurkish; 1676007Sthurlow break; 1686007Sthurlow 1696007Sthurlow case kCFStringEncodingMacCroatian: 1706007Sthurlow encoding = kCFStringEncodingDOSLatin2; 1716007Sthurlow break; 1726007Sthurlow 1736007Sthurlow case kCFStringEncodingMacIcelandic: 1746007Sthurlow encoding = kCFStringEncodingDOSIcelandic; 1756007Sthurlow break; 1766007Sthurlow 1776007Sthurlow case kCFStringEncodingMacRomanian: 1786007Sthurlow encoding = kCFStringEncodingDOSLatin2; 1796007Sthurlow break; 1806007Sthurlow 1816007Sthurlow case kCFStringEncodingMacFarsi: 1826007Sthurlow encoding = kCFStringEncodingDOSArabic; 1836007Sthurlow break; 1846007Sthurlow 1856007Sthurlow case kCFStringEncodingMacUkrainian: 1866007Sthurlow encoding = kCFStringEncodingDOSCyrillic; 1876007Sthurlow break; 1886007Sthurlow 1896007Sthurlow default: 1906007Sthurlow encoding = kCFStringEncodingDOSLatin1; 1916007Sthurlow break; 1926007Sthurlow } 1936007Sthurlow 1946007Sthurlow return encoding; 1956007Sthurlow } 1966007Sthurlow #endif /* NOTPORTED */ 1976007Sthurlow 1986007Sthurlow /* 1996007Sthurlow * XXX - NLS, or CF? We should probably use the same routine for all 2006007Sthurlow * conversions. 2016007Sthurlow */ 2026007Sthurlow char * 2036007Sthurlow convert_wincs_to_utf8(const char *windows_string) 2046007Sthurlow { 2056007Sthurlow #ifdef NOTPORTED 2066007Sthurlow CFStringRef s; 2076007Sthurlow CFIndex maxlen; 2086007Sthurlow char *result; 2096007Sthurlow 2106007Sthurlow s = CFStringCreateWithCString(NULL, windows_string, 2116007Sthurlow get_windows_encoding_equivalent()); 2126007Sthurlow if (s == NULL) { 2136007Sthurlow smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" ", -1, 2146007Sthurlow windows_string); 2156007Sthurlow 2166007Sthurlow /* kCFStringEncodingMacRoman should always succeed */ 2176007Sthurlow s = CFStringCreateWithCString(NULL, windows_string, 2186007Sthurlow kCFStringEncodingMacRoman); 2196007Sthurlow if (s == NULL) { 2206007Sthurlow smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" with kCFStringEncodingMacRoman - skipping", 2216007Sthurlow -1, windows_string); 2226007Sthurlow return NULL; 2236007Sthurlow } 2246007Sthurlow } 2256007Sthurlow 2266007Sthurlow maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s), 2276007Sthurlow kCFStringEncodingUTF8) + 1; 2286007Sthurlow result = malloc(maxlen); 2296007Sthurlow if (result == NULL) { 2306007Sthurlow smb_error("Couldn't allocate buffer for UTF-8 string for \"%s\" - skipping", -1, 2316007Sthurlow windows_string); 2326007Sthurlow CFRelease(s); 2336007Sthurlow return NULL; 2346007Sthurlow } 2356007Sthurlow if (!CFStringGetCString(s, result, maxlen, kCFStringEncodingUTF8)) { 2366007Sthurlow smb_error("CFStringGetCString for UTF-8 failed on \"%s\" - skipping", 2376007Sthurlow -1, windows_string); 2386007Sthurlow CFRelease(s); 2396007Sthurlow return NULL; 2406007Sthurlow } 2416007Sthurlow CFRelease(s); 2426007Sthurlow return result; 2436007Sthurlow #else /* NOTPORTED */ 244*8271SGordon.Ross@Sun.COM return (strdup((char*)windows_string)); 2456007Sthurlow #endif /* NOTPORTED */ 2466007Sthurlow } 2476007Sthurlow 2486007Sthurlow /* 2496007Sthurlow * XXX - NLS, or CF? We should probably use the same routine for all 2506007Sthurlow * conversions. 2516007Sthurlow */ 2526007Sthurlow char * 2536007Sthurlow convert_utf8_to_wincs(const char *utf8_string) 2546007Sthurlow { 2556007Sthurlow #ifdef NOTPORTED 2566007Sthurlow CFStringRef s; 2576007Sthurlow CFIndex maxlen; 2586007Sthurlow char *result; 2596007Sthurlow 2606007Sthurlow s = CFStringCreateWithCString(NULL, utf8_string, 2616007Sthurlow kCFStringEncodingUTF8); 2626007Sthurlow if (s == NULL) { 2636007Sthurlow smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1, 2646007Sthurlow utf8_string); 2656007Sthurlow return NULL; 2666007Sthurlow } 2676007Sthurlow 2686007Sthurlow maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s), 2696007Sthurlow get_windows_encoding_equivalent()) + 1; 2706007Sthurlow result = malloc(maxlen); 2716007Sthurlow if (result == NULL) { 2726007Sthurlow smb_error("Couldn't allocate buffer for Windows code page string for \"%s\" - skipping", -1, 2736007Sthurlow utf8_string); 2746007Sthurlow CFRelease(s); 2756007Sthurlow return NULL; 2766007Sthurlow } 2776007Sthurlow if (!CFStringGetCString(s, result, maxlen, 2786007Sthurlow get_windows_encoding_equivalent())) { 2796007Sthurlow smb_error("CFStringGetCString for Windows code page failed on \"%s\" - skipping", 2806007Sthurlow -1, utf8_string); 2816007Sthurlow CFRelease(s); 2826007Sthurlow return NULL; 2836007Sthurlow } 2846007Sthurlow CFRelease(s); 2856007Sthurlow return result; 2866007Sthurlow #else /* NOTPORTED */ 287*8271SGordon.Ross@Sun.COM return (strdup((char*)utf8_string)); 2886007Sthurlow #endif /* NOTPORTED */ 2896007Sthurlow } 290*8271SGordon.Ross@Sun.COM /* END CSTYLED */ 2916007Sthurlow 2926007Sthurlow /* 293*8271SGordon.Ross@Sun.COM * We replaced these routines for Solaris: 294*8271SGordon.Ross@Sun.COM * convert_leunicode_to_utf8 295*8271SGordon.Ross@Sun.COM * convert_unicode_to_utf8 296*8271SGordon.Ross@Sun.COM * convert_utf8_to_leunicode 297*8271SGordon.Ross@Sun.COM * with new code in: utf_str.c 2986007Sthurlow */ 299