libsmbfs/smb/charsets.c

*6007Sthurlow/*
*6007Sthurlow * Copyright (c) 2001 Apple Computer, Inc. All rights reserved.
*6007Sthurlow *
*6007Sthurlow * @APPLE_LICENSE_HEADER_START@
*6007Sthurlow *
*6007Sthurlow * "Portions Copyright (c) 1999 Apple Computer, Inc.  All Rights
*6007Sthurlow * Reserved.  This file contains Original Code and/or Modifications of
*6007Sthurlow * Original Code as defined in and that are subject to the Apple Public
*6007Sthurlow * Source License Version 1.0 (the 'License').  You may not use this file
*6007Sthurlow * except in compliance with the License.  Please obtain a copy of the
*6007Sthurlow * License at http://www.apple.com/publicsource and read it before using
*6007Sthurlow * this file.
*6007Sthurlow *
*6007Sthurlow * The Original Code and all software distributed under the License are
*6007Sthurlow * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
*6007Sthurlow * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
*6007Sthurlow * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
*6007Sthurlow * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
*6007Sthurlow * License for the specific language governing rights and limitations
*6007Sthurlow * under the License."
*6007Sthurlow *
*6007Sthurlow * @APPLE_LICENSE_HEADER_END@
*6007Sthurlow */
*6007Sthurlow/*      @(#)charsets.c      *
*6007Sthurlow *      (c) 2004   Apple Computer, Inc.  All Rights Reserved
*6007Sthurlow *
*6007Sthurlow *
*6007Sthurlow *      charsets.c -- Routines converting between UTF-8, 16-bit
*6007Sthurlow *			little-endian Unicode, and various Windows
*6007Sthurlow *			code pages.
*6007Sthurlow *
*6007Sthurlow *      MODIFICATION HISTORY:
*6007Sthurlow *       28-Nov-2004     Guy Harris	New today
*6007Sthurlow */
*6007Sthurlow
*6007Sthurlow#pragma ident	"%Z%%M%	%I%	%E% SMI"
*6007Sthurlow
*6007Sthurlow#include <stdlib.h>
*6007Sthurlow#include <stdio.h>
*6007Sthurlow#include <string.h>
*6007Sthurlow#include <ctype.h>
*6007Sthurlow#include <iconv.h>
*6007Sthurlow#include <langinfo.h>
*6007Sthurlow#include <strings.h>
*6007Sthurlow
*6007Sthurlow#ifdef NOTPORTED
*6007Sthurlow#include <CoreFoundation/CoreFoundation.h>
*6007Sthurlow#include <CoreFoundation/CFStringDefaultEncoding.h>
*6007Sthurlow#include <CoreFoundation/CFStringEncodingConverter.h>
*6007Sthurlow#include <sys/mchain.h>
*6007Sthurlow#endif /* NOTPORTED */
*6007Sthurlow
*6007Sthurlow#include <netsmb/smb_lib.h>
*6007Sthurlow#include <netsmb/mchain.h>
*6007Sthurlow
*6007Sthurlow#include "charsets.h"
*6007Sthurlow
*6007Sthurlow#ifdef NOTPORTED
*6007Sthurlowextern 	 uid_t real_uid,eff_uid;
*6007Sthurlow#endif /* NOTPORTED */
*6007Sthurlow
*6007Sthurlow/*
*6007Sthurlow * On Solaris, we will need to do some rewriting to use our iconv
*6007Sthurlow * routines for the conversions.  For now, we're effectively
*6007Sthurlow * stubbing out code, leaving the details of what happens on
*6007Sthurlow * Darwin in case it's useful as a guide later.
*6007Sthurlow */
*6007Sthurlow
/*
 * Convert one hexadecimal digit character to its numeric value.
 *
 * Returns 0-15 for '0'-'9', 'a'-'f' and 'A'-'F'.  Every other character
 * (including NUL and bytes with the high bit set) yields 16, so a caller
 * can reject invalid input with a simple "> 15" test.
 *
 * Explicit range comparisons are used instead of the <ctype.h>
 * classifiers: passing a negative plain char to isdigit() and friends is
 * undefined, and islower()/isupper() are locale-dependent and also accept
 * letters that are not hex digits.
 */
static unsigned
xtoi(char u)
{
	if (u >= '0' && u <= '9')
		return ((unsigned)(u - '0'));
	if (u >= 'a' && u <= 'f')
		return ((unsigned)(u - 'a') + 10);
	if (u >= 'A' && u <= 'F')
		return ((unsigned)(u - 'A') + 10);
	return (16);
}
*6007Sthurlow
*6007Sthurlow
*6007Sthurlow/* Removes the "%" escape sequences from a URL component.
*6007Sthurlow * See IETF RFC 2396.
*6007Sthurlow */
*6007Sthurlowchar *
*6007Sthurlowunpercent(char * component)
*6007Sthurlow{
*6007Sthurlow        char c, *s;
*6007Sthurlow        unsigned hi, lo;
*6007Sthurlow
*6007Sthurlow        if (component)
*6007Sthurlow                for (s = component; (c = *s) != 0; s++) {
*6007Sthurlow                        if (c != '%')
*6007Sthurlow                                continue;
*6007Sthurlow                        if ((hi = xtoi(s[1])) > 15 || (lo = xtoi(s[2])) > 15)
*6007Sthurlow                                continue; /* ignore invalid escapes */
*6007Sthurlow                        s[0] = hi*16 + lo;
*6007Sthurlow                        /*
*6007Sthurlow                         * This was strcpy(s + 1, s + 3);
*6007Sthurlow                         * But nowadays leftward overlapping copies are
*6007Sthurlow                         * officially undefined in C.  Ours seems to
*6007Sthurlow                         * work or not depending upon alignment.
*6007Sthurlow                         */
*6007Sthurlow                        memmove(s+1, s+3, strlen(s+3) + 1);
*6007Sthurlow                }
*6007Sthurlow        return (component);
*6007Sthurlow}
*6007Sthurlow
#ifdef NOTPORTED
/*
 * Choose the DOS encoding constant that corresponds to the installation
 * encoding reported by __CFStringGetInstallationEncodingAndRegion().
 *
 * MacRoman maps to DOSLatinUS when the reported region is zero and to
 * DOSLatin1 otherwise; any encoding not listed below maps to DOSLatin1.
 * The effective uid is switched to eff_uid around the query and back to
 * real_uid afterwards.
 */
static CFStringEncoding
get_windows_encoding_equivalent(void)
{
	uint32_t install_enc, install_region;

	/* important! use root ID so you can read the config file! */
	seteuid(eff_uid);
	__CFStringGetInstallationEncodingAndRegion(&install_enc,
	    &install_region);
	seteuid(real_uid);

	switch (install_enc) {
	case kCFStringEncodingMacRoman:
		/* anything nonzero is not US */
		if (install_region != 0)
			return (kCFStringEncodingDOSLatin1);
		return (kCFStringEncodingDOSLatinUS);

	case kCFStringEncodingMacJapanese:
		return (kCFStringEncodingDOSJapanese);

	case kCFStringEncodingMacChineseTrad:
		return (kCFStringEncodingDOSChineseTrad);

	case kCFStringEncodingMacKorean:
		return (kCFStringEncodingDOSKorean);

	case kCFStringEncodingMacArabic:
	case kCFStringEncodingMacFarsi:
		return (kCFStringEncodingDOSArabic);

	case kCFStringEncodingMacHebrew:
		return (kCFStringEncodingDOSHebrew);

	case kCFStringEncodingMacGreek:
		return (kCFStringEncodingDOSGreek);

	case kCFStringEncodingMacCyrillic:
	case kCFStringEncodingMacUkrainian:
		return (kCFStringEncodingDOSCyrillic);

	case kCFStringEncodingMacThai:
		return (kCFStringEncodingDOSThai);

	case kCFStringEncodingMacChineseSimp:
		return (kCFStringEncodingDOSChineseSimplif);

	case kCFStringEncodingMacCentralEurRoman:
	case kCFStringEncodingMacCroatian:
	case kCFStringEncodingMacRomanian:
		return (kCFStringEncodingDOSLatin2);

	case kCFStringEncodingMacTurkish:
		return (kCFStringEncodingDOSTurkish);

	case kCFStringEncodingMacIcelandic:
		return (kCFStringEncodingDOSIcelandic);

	default:
		return (kCFStringEncodingDOSLatin1);
	}
}
#endif /* NOTPORTED */
*6007Sthurlow
/*
 * Convert a string in the Windows code page to UTF-8.
 *
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 *
 * NOTPORTED build: the string is decoded with the encoding chosen by
 * get_windows_encoding_equivalent(), falling back to MacRoman if that
 * fails, and re-encoded as UTF-8 into a malloc()ed buffer.  Returns NULL
 * (after logging through smb_error) if decoding, allocation or encoding
 * fails.
 *
 * Otherwise: no conversion is done; the argument itself is returned with
 * its const qualifier cast away.
 *
 * NOTE(review): the two variants differ in ownership (heap buffer vs. the
 * caller's own string) - confirm what callers expect before freeing the
 * result.
 */
char *
convert_wincs_to_utf8(const char *windows_string)
{
#ifdef NOTPORTED
	CFStringRef s;
	CFIndex maxlen;
	char *result;

	s = CFStringCreateWithCString(NULL, windows_string,
		get_windows_encoding_equivalent());
	if (s == NULL) {
		smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" ", -1,
		    windows_string);

		/* kCFStringEncodingMacRoman should always succeed */
		s = CFStringCreateWithCString(NULL, windows_string,
		    kCFStringEncodingMacRoman);
		if (s == NULL) {
			smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" with kCFStringEncodingMacRoman - skipping",
			    -1, windows_string);
			return NULL;
		}
	}

	maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s),
	    kCFStringEncodingUTF8) + 1;
	result = malloc(maxlen);
	if (result == NULL) {
		smb_error("Couldn't allocate buffer for UTF-8 string for \"%s\" - skipping", -1,
		    windows_string);
		CFRelease(s);
		return NULL;
	}
	if (!CFStringGetCString(s, result, maxlen, kCFStringEncodingUTF8)) {
		smb_error("CFStringGetCString for UTF-8 failed on \"%s\" - skipping",
		    -1, windows_string);
		/* Don't leak the output buffer on the failure path. */
		free(result);
		CFRelease(s);
		return NULL;
	}
	CFRelease(s);
	return result;
#else /* NOTPORTED */
	return ((char*)windows_string);
#endif /* NOTPORTED */
}
*6007Sthurlow
/*
 * Convert a UTF-8 string to the Windows code page.
 *
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 *
 * NOTPORTED build: the string is decoded as UTF-8 and re-encoded with the
 * encoding chosen by get_windows_encoding_equivalent() into a malloc()ed
 * buffer.  Returns NULL (after logging through smb_error) if decoding,
 * allocation or encoding fails.
 *
 * Otherwise: no conversion is done; the argument itself is returned with
 * its const qualifier cast away.
 *
 * NOTE(review): the two variants differ in ownership (heap buffer vs. the
 * caller's own string) - confirm what callers expect before freeing the
 * result.
 */
char *
convert_utf8_to_wincs(const char *utf8_string)
{
#ifdef NOTPORTED
	CFStringRef s;
	CFStringEncoding wincs;
	CFIndex maxlen;
	char *result;

	s = CFStringCreateWithCString(NULL, utf8_string,
	    kCFStringEncodingUTF8);
	if (s == NULL) {
		smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1,
		    utf8_string);
		return NULL;
	}

	/*
	 * Look the target encoding up once: the lookup switches the
	 * effective uid back and forth, and the same encoding must be
	 * used both to size the buffer and to fill it.
	 */
	wincs = get_windows_encoding_equivalent();

	maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s),
	    wincs) + 1;
	result = malloc(maxlen);
	if (result == NULL) {
		smb_error("Couldn't allocate buffer for Windows code page string for \"%s\" - skipping", -1,
		    utf8_string);
		CFRelease(s);
		return NULL;
	}
	if (!CFStringGetCString(s, result, maxlen, wincs)) {
		smb_error("CFStringGetCString for Windows code page failed on \"%s\" - skipping",
		    -1, utf8_string);
		/* Don't leak the output buffer on the failure path. */
		free(result);
		CFRelease(s);
		return NULL;
	}
	CFRelease(s);
	return result;
#else /* NOTPORTED */
	return ((char*)utf8_string);
#endif /* NOTPORTED */
}
*6007Sthurlow
/*
 * Convert a zero-terminated, little-endian 16-bit Unicode string to UTF-8.
 *
 * Side effect: every 16-bit unit before the terminator is rewritten in
 * place through letohs(), so the caller's buffer is modified.  The byte
 * length (two per unit, terminator excluded) is then handed to
 * convert_unicode_to_utf8(), whose result is returned unchanged.
 */
char *
convert_leunicode_to_utf8(unsigned short *unicode_string)
{
	int nunits = 0;

	/* The terminator test is made on the value as stored, pre-swap. */
	while (unicode_string[nunits] != 0) {
		unicode_string[nunits] = letohs(unicode_string[nunits]);
		nunits++;
	}

	return (convert_unicode_to_utf8(unicode_string, 2 * nunits));
}
*6007Sthurlow
/*
 * Convert a 16-bit Unicode (UTF-16) buffer to a NUL-terminated UTF-8
 * string using iconv.
 *
 * unicode_string: the input code units; not modified.
 * len:            length of the input in BYTES (convert_leunicode_to_utf8
 *                 passes two per 16-bit unit), terminator not included.
 *
 * Returns a malloc()ed string that the caller must free(), or NULL if the
 * arguments are invalid, the converter cannot be opened, memory cannot be
 * allocated, or the input cannot be converted.
 *
 * The result is heap-allocated because the address of a local array must
 * not be returned, and the buffer is sized from the input so that neither
 * a long input nor a full output buffer can overrun a fixed-size array.
 *
 * NOTE(review): the source codeset is plain "UTF-16"; how an input
 * without a byte-order mark is interpreted is up to the iconv
 * implementation - confirm it matches the host byte order used by callers.
 */
char *
convert_unicode_to_utf8(unsigned short *unicode_string, int len)
{
	iconv_t cd;
	char *inp, *outp, *result;
	size_t ileft, oleft, osize, ret;

	if (unicode_string == NULL || len < 0)
		return (NULL);

	cd = iconv_open("UTF-8", "UTF-16");
	if (cd == (iconv_t)-1)
		return (NULL);

	/*
	 * A 16-bit unit expands to at most 3 bytes of UTF-8 (a surrogate
	 * pair, 4 bytes in, gives 4 bytes out).  One extra unit of slack
	 * plus one byte for the terminating NUL.
	 */
	ileft = (size_t)len;
	osize = 3 * (ileft / 2 + 1) + 1;
	result = malloc(osize);
	if (result == NULL) {
		(void) iconv_close(cd);
		return (NULL);
	}

	inp = (char *)unicode_string;
	outp = result;
	oleft = osize - 1;	/* keep room for the NUL */

	/*
	 * The input-pointer parameter is "const char **" on some systems
	 * and "char **" on others; going through void * satisfies both.
	 */
	ret = iconv(cd, (void *)&inp, &ileft, &outp, &oleft);
	(void) iconv_close(cd);

	if (ret == (size_t)-1) {
		free(result);
		return (NULL);
	}

	*outp = '\0';
	return (result);
}
*6007Sthurlow
/*
 * Convert UTF-8 string to little-endian Unicode.
 *
 * NOTPORTED build: the string is decoded as UTF-8, its characters are
 * copied into a malloc()ed array of 16-bit units, each unit is swapped
 * from host to little-endian order, and a zero unit is appended.  Returns
 * NULL (after logging through smb_error) if decoding or allocation fails.
 *
 * Otherwise: NO conversion is performed - the argument pointer is simply
 * reinterpreted as a pointer to 16-bit units and returned (see the XXX
 * below).
 */
unsigned short *
convert_utf8_to_leunicode(const char *utf8_string)
{
#ifdef NOTPORTED
	CFStringRef cfstr;
	CFIndex nchars;
	CFRange whole;
	unsigned short *units, *up;

	cfstr = CFStringCreateWithCString(NULL, utf8_string,
	    kCFStringEncodingUTF8);
	if (cfstr == NULL) {
		smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1,
		    utf8_string);
		return NULL;
	}

	/* One extra unit for the terminating zero. */
	nchars = CFStringGetLength(cfstr);
	units = malloc(2*(nchars + 1));
	if (units == NULL) {
		smb_error("Couldn't allocate buffer for Unicode string for \"%s\" - skipping", -1,
		    utf8_string);
		CFRelease(cfstr);
		return NULL;
	}

	whole.location = 0;
	whole.length = nchars;
	CFStringGetCharacters(cfstr, whole, units);
	CFRelease(cfstr);

	/* Swap each unit to little-endian, then terminate. */
	for (up = units; up < units + nchars; up++)
		*up = CFSwapInt16HostToLittle(*up);
	*up = 0;

	return units;
#else /* NOTPORTED */
	/* LINTED */ /* XXX Really need to fix this! */
	return ((unsigned short *)utf8_string); /* XXX */
#endif /* NOTPORTED */
}