1 /* $NetBSD: tokenize.c,v 1.2 2012/02/03 21:36:40 christos Exp $ */ 2 3 /* 4 * This file defines the string_tokenize interface 5 * Time-stamp: "2010-07-17 10:40:26 bkorb" 6 * 7 * This file is part of AutoOpts, a companion to AutoGen. 8 * AutoOpts is free software. 9 * AutoOpts is Copyright (c) 1992-2011 by Bruce Korb - all rights reserved 10 * 11 * AutoOpts is available under any one of two licenses. The license 12 * in use must be one of these two and the choice is under the control 13 * of the user of the license. 14 * 15 * The GNU Lesser General Public License, version 3 or later 16 * See the files "COPYING.lgplv3" and "COPYING.gplv3" 17 * 18 * The Modified Berkeley Software Distribution License 19 * See the file "COPYING.mbsd" 20 * 21 * These files have the following md5sums: 22 * 23 * 43b91e8ca915626ed3818ffb1b71248b pkg/libopts/COPYING.gplv3 24 * 06a1a2e4760c90ea5e1dad8dfaac4d39 pkg/libopts/COPYING.lgplv3 25 * 66a5cedaf62c4b2637025f049f9b826f pkg/libopts/COPYING.mbsd 26 */ 27 28 #include <errno.h> 29 #include <stdlib.h> 30 31 #define cc_t const unsigned char 32 #define ch_t unsigned char 33 34 /* = = = START-STATIC-FORWARD = = = */ 35 static void 36 copy_cooked(ch_t** ppDest, char const ** ppSrc); 37 38 static void 39 copy_raw(ch_t** ppDest, char const ** ppSrc); 40 41 static token_list_t * 42 alloc_token_list(char const * str); 43 /* = = = END-STATIC-FORWARD = = = */ 44 45 static void 46 copy_cooked(ch_t** ppDest, char const ** ppSrc) 47 { 48 ch_t* pDest = (ch_t*)*ppDest; 49 const ch_t* pSrc = (const ch_t*)(*ppSrc + 1); 50 51 for (;;) { 52 ch_t ch = *(pSrc++); 53 switch (ch) { 54 case NUL: *ppSrc = NULL; return; 55 case '"': goto done; 56 case '\\': 57 pSrc += ao_string_cook_escape_char((char*)(intptr_t)pSrc, (char*)(intptr_t)&ch, 0x7F); 58 if (ch == 0x7F) 59 break; 60 /* FALLTHROUGH */ 61 62 default: 63 *(pDest++) = ch; 64 } 65 } 66 67 done: 68 *ppDest = (ch_t*)pDest; /* next spot for storing character */ 69 *ppSrc = (char const *)pSrc; /* char following closing quote */ 70 } 71 72 73 static void 74 copy_raw(ch_t** ppDest, char const ** ppSrc) 75 { 76 ch_t* pDest = *ppDest; 77 cc_t* pSrc = (cc_t*) (*ppSrc + 1); 78 79 for (;;) { 80 ch_t ch = *(pSrc++); 81 switch (ch) { 82 case NUL: *ppSrc = NULL; return; 83 case '\'': goto done; 84 case '\\': 85 /* 86 * *Four* escapes are handled: newline removal, escape char 87 * quoting and apostrophe quoting 88 */ 89 switch (*pSrc) { 90 case NUL: *ppSrc = NULL; return; 91 case '\r': 92 if (*(++pSrc) == '\n') 93 ++pSrc; 94 continue; 95 96 case '\n': 97 ++pSrc; 98 continue; 99 100 case '\'': 101 ch = '\''; 102 /* FALLTHROUGH */ 103 104 case '\\': 105 ++pSrc; 106 break; 107 } 108 /* FALLTHROUGH */ 109 110 default: 111 *(pDest++) = ch; 112 } 113 } 114 115 done: 116 *ppDest = pDest; /* next spot for storing character */ 117 *ppSrc = (char const *) pSrc; /* char following closing quote */ 118 } 119 120 static token_list_t * 121 alloc_token_list(char const * str) 122 { 123 token_list_t * res; 124 125 int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */ 126 127 if (str == NULL) goto enoent_res; 128 129 /* 130 * Trim leading white space. Use "ENOENT" and a NULL return to indicate 131 * an empty string was passed. 132 */ 133 while (IS_WHITESPACE_CHAR(*str)) str++; 134 if (*str == NUL) goto enoent_res; 135 136 /* 137 * Take an approximate count of tokens. If no quoted strings are used, 138 * it will be accurate. If quoted strings are used, it will be a little 139 * high and we'll squander the space for a few extra pointers. 140 */ 141 { 142 cc_t* pz = (cc_t*)str; 143 144 do { 145 max_token_ct++; 146 while (! IS_WHITESPACE_CHAR(*++pz)) 147 if (*pz == NUL) goto found_nul; 148 while (IS_WHITESPACE_CHAR(*pz)) pz++; 149 } while (*pz != NUL); 150 151 found_nul: 152 res = malloc(sizeof(*res) + (pz - (cc_t*)str) 153 + (max_token_ct * sizeof(ch_t*))); 154 } 155 156 if (res == NULL) 157 errno = ENOMEM; 158 else res->tkn_list[0] = (ch_t*)(res->tkn_list + (max_token_ct - 1)); 159 160 return res; 161 162 enoent_res: 163 164 errno = ENOENT; 165 return NULL; 166 } 167 168 /*=export_func ao_string_tokenize 169 * 170 * what: tokenize an input string 171 * 172 * arg: + char const* + string + string to be tokenized + 173 * 174 * ret_type: token_list_t* 175 * ret_desc: pointer to a structure that lists each token 176 * 177 * doc: 178 * 179 * This function will convert one input string into a list of strings. 180 * The list of strings is derived by separating the input based on 181 * white space separation. However, if the input contains either single 182 * or double quote characters, then the text after that character up to 183 * a matching quote will become the string in the list. 184 * 185 * The returned pointer should be deallocated with @code{free(3C)} when 186 * are done using the data. The data are placed in a single block of 187 * allocated memory. Do not deallocate individual token/strings. 188 * 189 * The structure pointed to will contain at least these two fields: 190 * @table @samp 191 * @item tkn_ct 192 * The number of tokens found in the input string. 193 * @item tok_list 194 * An array of @code{tkn_ct + 1} pointers to substring tokens, with 195 * the last pointer set to NULL. 196 * @end table 197 * 198 * There are two types of quoted strings: single quoted (@code{'}) and 199 * double quoted (@code{"}). Singly quoted strings are fairly raw in that 200 * escape characters (@code{\\}) are simply another character, except when 201 * preceding the following characters: 202 * @example 203 * @code{\\} double backslashes reduce to one 204 * @code{'} incorporates the single quote into the string 205 * @code{\n} suppresses both the backslash and newline character 206 * @end example 207 * 208 * Double quote strings are formed according to the rules of string 209 * constants in ANSI-C programs. 210 * 211 * example: 212 * @example 213 * #include <stdlib.h> 214 * int ix; 215 * token_list_t* ptl = ao_string_tokenize(some_string) 216 * for (ix = 0; ix < ptl->tkn_ct; ix++) 217 * do_something_with_tkn(ptl->tkn_list[ix]); 218 * free(ptl); 219 * @end example 220 * Note that everything is freed with the one call to @code{free(3C)}. 221 * 222 * err: 223 * NULL is returned and @code{errno} will be set to indicate the problem: 224 * @itemize @bullet 225 * @item 226 * @code{EINVAL} - There was an unterminated quoted string. 227 * @item 228 * @code{ENOENT} - The input string was empty. 229 * @item 230 * @code{ENOMEM} - There is not enough memory. 231 * @end itemize 232 =*/ 233 token_list_t* 234 ao_string_tokenize(char const* str) 235 { 236 token_list_t* res = alloc_token_list(str); 237 ch_t* pzDest; 238 239 /* 240 * Now copy each token into the output buffer. 241 */ 242 if (res == NULL) 243 return res; 244 245 pzDest = (ch_t*)(res->tkn_list[0]); 246 res->tkn_ct = 0; 247 248 do { 249 res->tkn_list[ res->tkn_ct++ ] = pzDest; 250 for (;;) { 251 int ch = (ch_t)*str; 252 if (IS_WHITESPACE_CHAR(ch)) { 253 found_white_space: 254 while (IS_WHITESPACE_CHAR(*++str)) ; 255 break; 256 } 257 258 switch (ch) { 259 case '"': 260 copy_cooked(&pzDest, &str); 261 if (str == NULL) { 262 free(res); 263 errno = EINVAL; 264 return NULL; 265 } 266 if (IS_WHITESPACE_CHAR(*str)) 267 goto found_white_space; 268 break; 269 270 case '\'': 271 copy_raw(&pzDest, &str); 272 if (str == NULL) { 273 free(res); 274 errno = EINVAL; 275 return NULL; 276 } 277 if (IS_WHITESPACE_CHAR(*str)) 278 goto found_white_space; 279 break; 280 281 case NUL: 282 goto copy_done; 283 284 default: 285 str++; 286 *(pzDest++) = ch; 287 } 288 } copy_done:; 289 290 /* 291 * NUL terminate the last token and see if we have any more tokens. 292 */ 293 *(pzDest++) = NUL; 294 } while (*str != NUL); 295 296 res->tkn_list[ res->tkn_ct ] = NULL; 297 298 return res; 299 } 300 301 #ifdef TEST 302 #include <stdio.h> 303 #include <string.h> 304 305 int 306 main(int argc, char** argv) 307 { 308 if (argc == 1) { 309 printf("USAGE: %s arg [ ... ]\n", *argv); 310 return 1; 311 } 312 while (--argc > 0) { 313 char* arg = *(++argv); 314 token_list_t* p = ao_string_tokenize(arg); 315 if (p == NULL) { 316 printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n", 317 arg, errno, strerror(errno)); 318 } else { 319 int ix = 0; 320 printf("Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct); 321 do { 322 printf(" %3d: ``%s''\n", ix+1, p->tkn_list[ix]); 323 } while (++ix < p->tkn_ct); 324 free(p); 325 } 326 } 327 return 0; 328 } 329 #endif 330 331 /* 332 * Local Variables: 333 * mode: C 334 * c-file-style: "stroustrup" 335 * indent-tabs-mode: nil 336 * End: 337 * end of autoopts/tokenize.c */ 338