1*6007Sthurlow /* 2*6007Sthurlow * Copyright (c) 2001 Apple Computer, Inc. All rights reserved. 3*6007Sthurlow * 4*6007Sthurlow * @APPLE_LICENSE_HEADER_START@ 5*6007Sthurlow * 6*6007Sthurlow * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights 7*6007Sthurlow * Reserved. This file contains Original Code and/or Modifications of 8*6007Sthurlow * Original Code as defined in and that are subject to the Apple Public 9*6007Sthurlow * Source License Version 1.0 (the 'License'). You may not use this file 10*6007Sthurlow * except in compliance with the License. Please obtain a copy of the 11*6007Sthurlow * License at http://www.apple.com/publicsource and read it before using 12*6007Sthurlow * this file. 13*6007Sthurlow * 14*6007Sthurlow * The Original Code and all software distributed under the License are 15*6007Sthurlow * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 16*6007Sthurlow * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 17*6007Sthurlow * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 18*6007Sthurlow * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the 19*6007Sthurlow * License for the specific language governing rights and limitations 20*6007Sthurlow * under the License." 21*6007Sthurlow * 22*6007Sthurlow * @APPLE_LICENSE_HEADER_END@ 23*6007Sthurlow */ 24*6007Sthurlow /* @(#)charsets.c * 25*6007Sthurlow * (c) 2004 Apple Computer, Inc. All Rights Reserved 26*6007Sthurlow * 27*6007Sthurlow * 28*6007Sthurlow * charsets.c -- Routines converting between UTF-8, 16-bit 29*6007Sthurlow * little-endian Unicode, and various Windows 30*6007Sthurlow * code pages. 31*6007Sthurlow * 32*6007Sthurlow * MODIFICATION HISTORY: 33*6007Sthurlow * 28-Nov-2004 Guy Harris New today 34*6007Sthurlow */ 35*6007Sthurlow 36*6007Sthurlow #pragma ident "%Z%%M% %I% %E% SMI" 37*6007Sthurlow 38*6007Sthurlow #include <stdlib.h> 39*6007Sthurlow #include <stdio.h> 40*6007Sthurlow #include <string.h> 41*6007Sthurlow #include <ctype.h> 42*6007Sthurlow #include <iconv.h> 43*6007Sthurlow #include <langinfo.h> 44*6007Sthurlow #include <strings.h> 45*6007Sthurlow 46*6007Sthurlow #ifdef NOTPORTED 47*6007Sthurlow #include <CoreFoundation/CoreFoundation.h> 48*6007Sthurlow #include <CoreFoundation/CFStringDefaultEncoding.h> 49*6007Sthurlow #include <CoreFoundation/CFStringEncodingConverter.h> 50*6007Sthurlow #include <sys/mchain.h> 51*6007Sthurlow #endif /* NOTPORTED */ 52*6007Sthurlow 53*6007Sthurlow #include <netsmb/smb_lib.h> 54*6007Sthurlow #include <netsmb/mchain.h> 55*6007Sthurlow 56*6007Sthurlow #include "charsets.h" 57*6007Sthurlow 58*6007Sthurlow #ifdef NOTPORTED 59*6007Sthurlow extern uid_t real_uid,eff_uid; 60*6007Sthurlow #endif /* NOTPORTED */ 61*6007Sthurlow 62*6007Sthurlow /* 63*6007Sthurlow * On Solaris, we will need to do some rewriting to use our iconv 64*6007Sthurlow * routines for the conversions. For now, we're effectively 65*6007Sthurlow * stubbing out code, leaving the details of what happens on 66*6007Sthurlow * Darwin in case it's useful as a guide later. 67*6007Sthurlow */ 68*6007Sthurlow 69*6007Sthurlow static unsigned 70*6007Sthurlow xtoi(char u) 71*6007Sthurlow { 72*6007Sthurlow if (isdigit(u)) 73*6007Sthurlow return (u - '0'); 74*6007Sthurlow else if (islower(u)) 75*6007Sthurlow return (10 + u - 'a'); 76*6007Sthurlow else if (isupper(u)) 77*6007Sthurlow return (10 + u - 'A'); 78*6007Sthurlow return (16); 79*6007Sthurlow } 80*6007Sthurlow 81*6007Sthurlow 82*6007Sthurlow /* Removes the "%" escape sequences from a URL component. 83*6007Sthurlow * See IETF RFC 2396. 84*6007Sthurlow */ 85*6007Sthurlow char * 86*6007Sthurlow unpercent(char * component) 87*6007Sthurlow { 88*6007Sthurlow char c, *s; 89*6007Sthurlow unsigned hi, lo; 90*6007Sthurlow 91*6007Sthurlow if (component) 92*6007Sthurlow for (s = component; (c = *s) != 0; s++) { 93*6007Sthurlow if (c != '%') 94*6007Sthurlow continue; 95*6007Sthurlow if ((hi = xtoi(s[1])) > 15 || (lo = xtoi(s[2])) > 15) 96*6007Sthurlow continue; /* ignore invalid escapes */ 97*6007Sthurlow s[0] = hi*16 + lo; 98*6007Sthurlow /* 99*6007Sthurlow * This was strcpy(s + 1, s + 3); 100*6007Sthurlow * But nowadays leftward overlapping copies are 101*6007Sthurlow * officially undefined in C. Ours seems to 102*6007Sthurlow * work or not depending upon alignment. 103*6007Sthurlow */ 104*6007Sthurlow memmove(s+1, s+3, strlen(s+3) + 1); 105*6007Sthurlow } 106*6007Sthurlow return (component); 107*6007Sthurlow } 108*6007Sthurlow 109*6007Sthurlow #ifdef NOTPORTED 110*6007Sthurlow static CFStringEncoding 111*6007Sthurlow get_windows_encoding_equivalent( void ) 112*6007Sthurlow { 113*6007Sthurlow 114*6007Sthurlow CFStringEncoding encoding; 115*6007Sthurlow uint32_t index,region; 116*6007Sthurlow 117*6007Sthurlow /* important! use root ID so you can read the config file! */ 118*6007Sthurlow seteuid(eff_uid); 119*6007Sthurlow __CFStringGetInstallationEncodingAndRegion(&index,®ion); 120*6007Sthurlow seteuid(real_uid); 121*6007Sthurlow 122*6007Sthurlow switch ( index ) 123*6007Sthurlow { 124*6007Sthurlow case kCFStringEncodingMacRoman: 125*6007Sthurlow if (region) /* anything nonzero is not US */ 126*6007Sthurlow encoding = kCFStringEncodingDOSLatin1; 127*6007Sthurlow else /* US region */ 128*6007Sthurlow encoding = kCFStringEncodingDOSLatinUS; 129*6007Sthurlow break; 130*6007Sthurlow 131*6007Sthurlow case kCFStringEncodingMacJapanese: 132*6007Sthurlow encoding = kCFStringEncodingDOSJapanese; 133*6007Sthurlow break; 134*6007Sthurlow 135*6007Sthurlow case kCFStringEncodingMacChineseTrad: 136*6007Sthurlow encoding = kCFStringEncodingDOSChineseTrad; 137*6007Sthurlow break; 138*6007Sthurlow 139*6007Sthurlow case kCFStringEncodingMacKorean: 140*6007Sthurlow encoding = kCFStringEncodingDOSKorean; 141*6007Sthurlow break; 142*6007Sthurlow 143*6007Sthurlow case kCFStringEncodingMacArabic: 144*6007Sthurlow encoding = kCFStringEncodingDOSArabic; 145*6007Sthurlow break; 146*6007Sthurlow 147*6007Sthurlow case kCFStringEncodingMacHebrew: 148*6007Sthurlow encoding = kCFStringEncodingDOSHebrew; 149*6007Sthurlow break; 150*6007Sthurlow 151*6007Sthurlow case kCFStringEncodingMacGreek: 152*6007Sthurlow encoding = kCFStringEncodingDOSGreek; 153*6007Sthurlow break; 154*6007Sthurlow 155*6007Sthurlow case kCFStringEncodingMacCyrillic: 156*6007Sthurlow encoding = kCFStringEncodingDOSCyrillic; 157*6007Sthurlow break; 158*6007Sthurlow 159*6007Sthurlow case kCFStringEncodingMacThai: 160*6007Sthurlow encoding = kCFStringEncodingDOSThai; 161*6007Sthurlow break; 162*6007Sthurlow 163*6007Sthurlow case kCFStringEncodingMacChineseSimp: 164*6007Sthurlow encoding = kCFStringEncodingDOSChineseSimplif; 165*6007Sthurlow break; 166*6007Sthurlow 167*6007Sthurlow case kCFStringEncodingMacCentralEurRoman: 168*6007Sthurlow encoding = kCFStringEncodingDOSLatin2; 169*6007Sthurlow break; 170*6007Sthurlow 171*6007Sthurlow case kCFStringEncodingMacTurkish: 172*6007Sthurlow encoding = kCFStringEncodingDOSTurkish; 173*6007Sthurlow break; 174*6007Sthurlow 175*6007Sthurlow case kCFStringEncodingMacCroatian: 176*6007Sthurlow encoding = kCFStringEncodingDOSLatin2; 177*6007Sthurlow break; 178*6007Sthurlow 179*6007Sthurlow case kCFStringEncodingMacIcelandic: 180*6007Sthurlow encoding = kCFStringEncodingDOSIcelandic; 181*6007Sthurlow break; 182*6007Sthurlow 183*6007Sthurlow case kCFStringEncodingMacRomanian: 184*6007Sthurlow encoding = kCFStringEncodingDOSLatin2; 185*6007Sthurlow break; 186*6007Sthurlow 187*6007Sthurlow case kCFStringEncodingMacFarsi: 188*6007Sthurlow encoding = kCFStringEncodingDOSArabic; 189*6007Sthurlow break; 190*6007Sthurlow 191*6007Sthurlow case kCFStringEncodingMacUkrainian: 192*6007Sthurlow encoding = kCFStringEncodingDOSCyrillic; 193*6007Sthurlow break; 194*6007Sthurlow 195*6007Sthurlow default: 196*6007Sthurlow encoding = kCFStringEncodingDOSLatin1; 197*6007Sthurlow break; 198*6007Sthurlow } 199*6007Sthurlow 200*6007Sthurlow return encoding; 201*6007Sthurlow } 202*6007Sthurlow #endif /* NOTPORTED */ 203*6007Sthurlow 204*6007Sthurlow /* 205*6007Sthurlow * XXX - NLS, or CF? We should probably use the same routine for all 206*6007Sthurlow * conversions. 207*6007Sthurlow */ 208*6007Sthurlow char * 209*6007Sthurlow convert_wincs_to_utf8(const char *windows_string) 210*6007Sthurlow { 211*6007Sthurlow #ifdef NOTPORTED 212*6007Sthurlow CFStringRef s; 213*6007Sthurlow CFIndex maxlen; 214*6007Sthurlow char *result; 215*6007Sthurlow 216*6007Sthurlow s = CFStringCreateWithCString(NULL, windows_string, 217*6007Sthurlow get_windows_encoding_equivalent()); 218*6007Sthurlow if (s == NULL) { 219*6007Sthurlow smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" ", -1, 220*6007Sthurlow windows_string); 221*6007Sthurlow 222*6007Sthurlow /* kCFStringEncodingMacRoman should always succeed */ 223*6007Sthurlow s = CFStringCreateWithCString(NULL, windows_string, 224*6007Sthurlow kCFStringEncodingMacRoman); 225*6007Sthurlow if (s == NULL) { 226*6007Sthurlow smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" with kCFStringEncodingMacRoman - skipping", 227*6007Sthurlow -1, windows_string); 228*6007Sthurlow return NULL; 229*6007Sthurlow } 230*6007Sthurlow } 231*6007Sthurlow 232*6007Sthurlow maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s), 233*6007Sthurlow kCFStringEncodingUTF8) + 1; 234*6007Sthurlow result = malloc(maxlen); 235*6007Sthurlow if (result == NULL) { 236*6007Sthurlow smb_error("Couldn't allocate buffer for UTF-8 string for \"%s\" - skipping", -1, 237*6007Sthurlow windows_string); 238*6007Sthurlow CFRelease(s); 239*6007Sthurlow return NULL; 240*6007Sthurlow } 241*6007Sthurlow if (!CFStringGetCString(s, result, maxlen, kCFStringEncodingUTF8)) { 242*6007Sthurlow smb_error("CFStringGetCString for UTF-8 failed on \"%s\" - skipping", 243*6007Sthurlow -1, windows_string); 244*6007Sthurlow CFRelease(s); 245*6007Sthurlow return NULL; 246*6007Sthurlow } 247*6007Sthurlow CFRelease(s); 248*6007Sthurlow return result; 249*6007Sthurlow #else /* NOTPORTED */ 250*6007Sthurlow return ((char*)windows_string); 251*6007Sthurlow #endif /* NOTPORTED */ 252*6007Sthurlow } 253*6007Sthurlow 254*6007Sthurlow /* 255*6007Sthurlow * XXX - NLS, or CF? We should probably use the same routine for all 256*6007Sthurlow * conversions. 257*6007Sthurlow */ 258*6007Sthurlow char * 259*6007Sthurlow convert_utf8_to_wincs(const char *utf8_string) 260*6007Sthurlow { 261*6007Sthurlow #ifdef NOTPORTED 262*6007Sthurlow CFStringRef s; 263*6007Sthurlow CFIndex maxlen; 264*6007Sthurlow char *result; 265*6007Sthurlow 266*6007Sthurlow s = CFStringCreateWithCString(NULL, utf8_string, 267*6007Sthurlow kCFStringEncodingUTF8); 268*6007Sthurlow if (s == NULL) { 269*6007Sthurlow smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1, 270*6007Sthurlow utf8_string); 271*6007Sthurlow return NULL; 272*6007Sthurlow } 273*6007Sthurlow 274*6007Sthurlow maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s), 275*6007Sthurlow get_windows_encoding_equivalent()) + 1; 276*6007Sthurlow result = malloc(maxlen); 277*6007Sthurlow if (result == NULL) { 278*6007Sthurlow smb_error("Couldn't allocate buffer for Windows code page string for \"%s\" - skipping", -1, 279*6007Sthurlow utf8_string); 280*6007Sthurlow CFRelease(s); 281*6007Sthurlow return NULL; 282*6007Sthurlow } 283*6007Sthurlow if (!CFStringGetCString(s, result, maxlen, 284*6007Sthurlow get_windows_encoding_equivalent())) { 285*6007Sthurlow smb_error("CFStringGetCString for Windows code page failed on \"%s\" - skipping", 286*6007Sthurlow -1, utf8_string); 287*6007Sthurlow CFRelease(s); 288*6007Sthurlow return NULL; 289*6007Sthurlow } 290*6007Sthurlow CFRelease(s); 291*6007Sthurlow return result; 292*6007Sthurlow #else /* NOTPORTED */ 293*6007Sthurlow return ((char*)utf8_string); 294*6007Sthurlow #endif /* NOTPORTED */ 295*6007Sthurlow } 296*6007Sthurlow 297*6007Sthurlow /* 298*6007Sthurlow * Convert little-endian Unicode string to UTF-8. 299*6007Sthurlow * Converts the Unicode string to host byte order in place. 300*6007Sthurlow */ 301*6007Sthurlow char * 302*6007Sthurlow convert_leunicode_to_utf8(unsigned short *unicode_string) 303*6007Sthurlow { 304*6007Sthurlow unsigned short *unicode_charp, unicode_char; 305*6007Sthurlow int len = 0; 306*6007Sthurlow 307*6007Sthurlow for (unicode_charp = unicode_string; 308*6007Sthurlow (unicode_char = *unicode_charp) != 0; 309*6007Sthurlow unicode_charp++) { 310*6007Sthurlow *unicode_charp = letohs(unicode_char); 311*6007Sthurlow len = len + 2; 312*6007Sthurlow } 313*6007Sthurlow return (convert_unicode_to_utf8(unicode_string, len)); 314*6007Sthurlow } 315*6007Sthurlow 316*6007Sthurlow char * 317*6007Sthurlow convert_unicode_to_utf8(unsigned short *unicode_string, int len) 318*6007Sthurlow { 319*6007Sthurlow iconv_t cd; 320*6007Sthurlow char from[BUFSIZ], to[BUFSIZ]; 321*6007Sthurlow char *tptr = NULL; 322*6007Sthurlow const char *fptr; 323*6007Sthurlow size_t ileft, oleft, ret; 324*6007Sthurlow 325*6007Sthurlow cd = iconv_open("UTF-8", "UTF-16"); 326*6007Sthurlow if (cd != (iconv_t)-1) { 327*6007Sthurlow ileft = len; 328*6007Sthurlow bcopy((char *)unicode_string, from, ileft); 329*6007Sthurlow fptr = from; 330*6007Sthurlow oleft = BUFSIZ; 331*6007Sthurlow tptr = to; 332*6007Sthurlow ret = iconv(cd, &fptr, &ileft, &tptr, &oleft); 333*6007Sthurlow if (ret != (size_t)-1) { 334*6007Sthurlow to[BUFSIZ-oleft] = '\0'; 335*6007Sthurlow tptr = to; 336*6007Sthurlow } else { 337*6007Sthurlow tptr = NULL; 338*6007Sthurlow } 339*6007Sthurlow (void) iconv_close(cd); 340*6007Sthurlow } 341*6007Sthurlow return (tptr); 342*6007Sthurlow } 343*6007Sthurlow 344*6007Sthurlow /* 345*6007Sthurlow * Convert UTF-8 string to little-endian Unicode. 346*6007Sthurlow */ 347*6007Sthurlow unsigned short * 348*6007Sthurlow convert_utf8_to_leunicode(const char *utf8_string) 349*6007Sthurlow { 350*6007Sthurlow #ifdef NOTPORTED 351*6007Sthurlow CFStringRef s; 352*6007Sthurlow CFIndex maxlen; 353*6007Sthurlow unsigned short *result; 354*6007Sthurlow CFRange range; 355*6007Sthurlow int i; 356*6007Sthurlow 357*6007Sthurlow s = CFStringCreateWithCString(NULL, utf8_string, 358*6007Sthurlow kCFStringEncodingUTF8); 359*6007Sthurlow if (s == NULL) { 360*6007Sthurlow smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1, 361*6007Sthurlow utf8_string); 362*6007Sthurlow return NULL; 363*6007Sthurlow } 364*6007Sthurlow 365*6007Sthurlow maxlen = CFStringGetLength(s); 366*6007Sthurlow result = malloc(2*(maxlen + 1)); 367*6007Sthurlow if (result == NULL) { 368*6007Sthurlow smb_error("Couldn't allocate buffer for Unicode string for \"%s\" - skipping", -1, 369*6007Sthurlow utf8_string); 370*6007Sthurlow CFRelease(s); 371*6007Sthurlow return NULL; 372*6007Sthurlow } 373*6007Sthurlow range.location = 0; 374*6007Sthurlow range.length = maxlen; 375*6007Sthurlow CFStringGetCharacters(s, range, result); 376*6007Sthurlow for (i = 0; i < maxlen; i++) 377*6007Sthurlow result[i] = CFSwapInt16HostToLittle(result[i]); 378*6007Sthurlow result[maxlen] = 0; 379*6007Sthurlow CFRelease(s); 380*6007Sthurlow return result; 381*6007Sthurlow #else /* NOTPORTED */ 382*6007Sthurlow /* LINTED */ /* XXX Really need to fix this! */ 383*6007Sthurlow return ((ushort_t *)utf8_string); /* XXX */ 384*6007Sthurlow #endif /* NOTPORTED */ 385*6007Sthurlow } 386