1 /*- 2 * Copyright (c) 2003-2007 Tim Kientzle 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #include "archive_platform.h" 27 __FBSDID("$FreeBSD: src/lib/libarchive/archive_string.c,v 1.16 2008/06/15 11:28:56 kientzle Exp $"); 28 29 /* 30 * Basic resizable string support, to simplify manipulating arbitrary-sized 31 * strings while minimizing heap activity. 32 */ 33 34 #ifdef HAVE_STDLIB_H 35 #include <stdlib.h> 36 #endif 37 #ifdef HAVE_STRING_H 38 #include <string.h> 39 #endif 40 #ifdef HAVE_WCHAR_H 41 #include <wchar.h> 42 #endif 43 44 #include "archive_private.h" 45 #include "archive_string.h" 46 47 struct archive_string * 48 __archive_string_append(struct archive_string *as, const char *p, size_t s) 49 { 50 if (__archive_string_ensure(as, as->length + s + 1) == NULL) 51 __archive_errx(1, "Out of memory"); 52 memcpy(as->s + as->length, p, s); 53 as->s[as->length + s] = 0; 54 as->length += s; 55 return (as); 56 } 57 58 void 59 __archive_string_copy(struct archive_string *dest, struct archive_string *src) 60 { 61 if (src->length == 0) 62 dest->length = 0; 63 else { 64 if (__archive_string_ensure(dest, src->length + 1) == NULL) 65 __archive_errx(1, "Out of memory"); 66 memcpy(dest->s, src->s, src->length); 67 dest->length = src->length; 68 dest->s[dest->length] = 0; 69 } 70 } 71 72 void 73 __archive_string_free(struct archive_string *as) 74 { 75 as->length = 0; 76 as->buffer_length = 0; 77 if (as->s != NULL) { 78 free(as->s); 79 as->s = NULL; 80 } 81 } 82 83 /* Returns NULL on any allocation failure. */ 84 struct archive_string * 85 __archive_string_ensure(struct archive_string *as, size_t s) 86 { 87 /* If buffer is already big enough, don't reallocate. */ 88 if (as->s && (s <= as->buffer_length)) 89 return (as); 90 91 /* 92 * Growing the buffer at least exponentially ensures that 93 * append operations are always linear in the number of 94 * characters appended. Using a smaller growth rate for 95 * larger buffers reduces memory waste somewhat at the cost of 96 * a larger constant factor. 97 */ 98 if (as->buffer_length < 32) 99 /* Start with a minimum 32-character buffer. */ 100 as->buffer_length = 32; 101 else if (as->buffer_length < 8192) 102 /* Buffers under 8k are doubled for speed. */ 103 as->buffer_length *= 2; 104 else { 105 /* Buffers 8k and over grow by at least 25% each time. */ 106 size_t old_length = as->buffer_length; 107 as->buffer_length = (as->buffer_length * 5) / 4; 108 /* Be safe: If size wraps, release buffer and return NULL. */ 109 if (as->buffer_length < old_length) { 110 free(as->s); 111 as->s = NULL; 112 return (NULL); 113 } 114 } 115 /* 116 * The computation above is a lower limit to how much we'll 117 * grow the buffer. In any case, we have to grow it enough to 118 * hold the request. 119 */ 120 if (as->buffer_length < s) 121 as->buffer_length = s; 122 /* Now we can reallocate the buffer. */ 123 as->s = (char *)realloc(as->s, as->buffer_length); 124 if (as->s == NULL) 125 return (NULL); 126 return (as); 127 } 128 129 struct archive_string * 130 __archive_strncat(struct archive_string *as, const char *p, size_t n) 131 { 132 size_t s; 133 const char *pp; 134 135 /* Like strlen(p), except won't examine positions beyond p[n]. */ 136 s = 0; 137 pp = p; 138 while (*pp && s < n) { 139 pp++; 140 s++; 141 } 142 return (__archive_string_append(as, p, s)); 143 } 144 145 struct archive_string * 146 __archive_strappend_char(struct archive_string *as, char c) 147 { 148 return (__archive_string_append(as, &c, 1)); 149 } 150 151 struct archive_string * 152 __archive_strappend_int(struct archive_string *as, int d, int base) 153 { 154 static const char *digits = "0123456789abcdef"; 155 156 if (d < 0) { 157 __archive_strappend_char(as, '-'); 158 d = -d; 159 } 160 if (d >= base) 161 __archive_strappend_int(as, d/base, base); 162 __archive_strappend_char(as, digits[d % base]); 163 return (as); 164 } 165 166 /* 167 * Home-grown wctomb for UTF-8. 168 */ 169 static int 170 my_wctomb_utf8(char *p, wchar_t wc) 171 { 172 if (p == NULL) 173 /* UTF-8 doesn't use shift states. */ 174 return (0); 175 if (wc <= 0x7f) { 176 p[0] = (char)wc; 177 return (1); 178 } 179 if (wc <= 0x7ff) { 180 p[0] = 0xc0 | ((wc >> 6) & 0x1f); 181 p[1] = 0x80 | (wc & 0x3f); 182 return (2); 183 } 184 if (wc <= 0xffff) { 185 p[0] = 0xe0 | ((wc >> 12) & 0x0f); 186 p[1] = 0x80 | ((wc >> 6) & 0x3f); 187 p[2] = 0x80 | (wc & 0x3f); 188 return (3); 189 } 190 if (wc <= 0x1fffff) { 191 p[0] = 0xf0 | ((wc >> 18) & 0x07); 192 p[1] = 0x80 | ((wc >> 12) & 0x3f); 193 p[2] = 0x80 | ((wc >> 6) & 0x3f); 194 p[3] = 0x80 | (wc & 0x3f); 195 return (4); 196 } 197 /* Unicode has no codes larger than 0x1fffff. */ 198 /* 199 * Awkward point: UTF-8 <-> wchar_t conversions 200 * can actually fail. 201 */ 202 return (-1); 203 } 204 205 static int 206 my_wcstombs(struct archive_string *as, const wchar_t *w, 207 int (*func)(char *, wchar_t)) 208 { 209 int n; 210 char *p; 211 char buff[256]; 212 213 /* Clear the shift state before starting. */ 214 (*func)(NULL, L'\0'); 215 216 /* 217 * Convert one wide char at a time into 'buff', whenever that 218 * fills, append it to the string. 219 */ 220 p = buff; 221 while (*w != L'\0') { 222 /* Flush the buffer when we have <=16 bytes free. */ 223 /* (No encoding has a single character >16 bytes.) */ 224 if ((size_t)(p - buff) >= (size_t)(sizeof(buff) - 16)) { 225 *p = '\0'; 226 archive_strcat(as, buff); 227 p = buff; 228 } 229 n = (*func)(p, *w++); 230 if (n == -1) 231 return (-1); 232 p += n; 233 } 234 *p = '\0'; 235 archive_strcat(as, buff); 236 return (0); 237 } 238 239 /* 240 * Translates a wide character string into UTF-8 and appends 241 * to the archive_string. Note: returns NULL if conversion fails. 242 */ 243 struct archive_string * 244 __archive_strappend_w_utf8(struct archive_string *as, const wchar_t *w) 245 { 246 if (my_wcstombs(as, w, my_wctomb_utf8)) 247 return (NULL); 248 return (as); 249 } 250 251 /* 252 * Translates a wide character string into current locale character set 253 * and appends to the archive_string. Note: returns NULL if conversion 254 * fails. 255 */ 256 struct archive_string * 257 __archive_strappend_w_mbs(struct archive_string *as, const wchar_t *w) 258 { 259 #if HAVE_WCTOMB 260 if (my_wcstombs(as, w, wctomb)) 261 return (NULL); 262 #else 263 /* TODO: Can we do better than this? Are there platforms 264 * that have locale support but don't have wctomb()? */ 265 if (my_wcstombs(as, w, my_wctomb_utf8)) 266 return (NULL); 267 #endif 268 return (as); 269 } 270 271 272 /* 273 * Home-grown mbtowc for UTF-8. Some systems lack UTF-8 274 * (or even lack mbtowc()) and we need UTF-8 support for pax 275 * format. So please don't replace this with a call to the 276 * standard mbtowc() function! 277 */ 278 static int 279 my_mbtowc_utf8(wchar_t *pwc, const char *s, size_t n) 280 { 281 int ch; 282 283 /* Standard behavior: a NULL value for 's' just resets shift state. */ 284 if (s == NULL) 285 return (0); 286 /* If length argument is zero, don't look at the first character. */ 287 if (n <= 0) 288 return (-1); 289 290 /* 291 * Decode 1-4 bytes depending on the value of the first byte. 292 */ 293 ch = (unsigned char)*s; 294 if (ch == 0) { 295 return (0); /* Standard: return 0 for end-of-string. */ 296 } 297 if ((ch & 0x80) == 0) { 298 *pwc = ch & 0x7f; 299 return (1); 300 } 301 if ((ch & 0xe0) == 0xc0) { 302 if (n < 2) 303 return (-1); 304 if ((s[1] & 0xc0) != 0x80) return (-1); 305 *pwc = ((ch & 0x1f) << 6) | (s[1] & 0x3f); 306 return (2); 307 } 308 if ((ch & 0xf0) == 0xe0) { 309 if (n < 3) 310 return (-1); 311 if ((s[1] & 0xc0) != 0x80) return (-1); 312 if ((s[2] & 0xc0) != 0x80) return (-1); 313 *pwc = ((ch & 0x0f) << 12) 314 | ((s[1] & 0x3f) << 6) 315 | (s[2] & 0x3f); 316 return (3); 317 } 318 if ((ch & 0xf8) == 0xf0) { 319 if (n < 4) 320 return (-1); 321 if ((s[1] & 0xc0) != 0x80) return (-1); 322 if ((s[2] & 0xc0) != 0x80) return (-1); 323 if ((s[3] & 0xc0) != 0x80) return (-1); 324 *pwc = ((ch & 0x07) << 18) 325 | ((s[1] & 0x3f) << 12) 326 | ((s[2] & 0x3f) << 6) 327 | (s[3] & 0x3f); 328 return (4); 329 } 330 /* Invalid first byte. */ 331 return (-1); 332 } 333 334 /* 335 * Return a wide-character string by converting this archive_string 336 * from UTF-8. 337 */ 338 wchar_t * 339 __archive_string_utf8_w(struct archive_string *as) 340 { 341 wchar_t *ws, *dest; 342 const char *src; 343 int n; 344 int err; 345 346 ws = (wchar_t *)malloc((as->length + 1) * sizeof(wchar_t)); 347 if (ws == NULL) 348 __archive_errx(1, "Out of memory"); 349 err = 0; 350 dest = ws; 351 src = as->s; 352 while (*src != '\0') { 353 n = my_mbtowc_utf8(dest, src, 8); 354 if (n == 0) 355 break; 356 if (n < 0) { 357 free(ws); 358 return (NULL); 359 } 360 dest++; 361 src += n; 362 } 363 *dest++ = L'\0'; 364 return (ws); 365 } 366