1 /* $NetBSD: tokenize.c,v 1.9 2020/05/25 20:47:35 christos Exp $ */ 2 3 /** \file tokenize.c 4 * 5 * Tokenize a string, accommodating quoted strings. 6 * 7 * @addtogroup autoopts 8 * @{ 9 */ 10 /* 11 * This file defines the string_tokenize interface 12 * This file is part of AutoOpts, a companion to AutoGen. 13 * AutoOpts is free software. 14 * AutoOpts is Copyright (C) 1992-2015 by Bruce Korb - all rights reserved 15 * 16 * AutoOpts is available under any one of two licenses. The license 17 * in use must be one of these two and the choice is under the control 18 * of the user of the license. 19 * 20 * The GNU Lesser General Public License, version 3 or later 21 * See the files "COPYING.lgplv3" and "COPYING.gplv3" 22 * 23 * The Modified Berkeley Software Distribution License 24 * See the file "COPYING.mbsd" 25 * 26 * These files have the following sha256 sums: 27 * 28 * 8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95 COPYING.gplv3 29 * 4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b COPYING.lgplv3 30 * 13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239 COPYING.mbsd 31 */ 32 33 #include <errno.h> 34 #include <stdlib.h> 35 36 #define cc_t const unsigned char 37 #define ch_t unsigned char 38 39 /* = = = START-STATIC-FORWARD = = = */ 40 static void 41 copy_cooked(ch_t ** ppDest, char const ** ppSrc); 42 43 static void 44 copy_raw(ch_t ** ppDest, char const ** ppSrc); 45 46 static token_list_t * 47 alloc_token_list(char const * str); 48 /* = = = END-STATIC-FORWARD = = = */ 49 50 static void 51 copy_cooked(ch_t ** ppDest, char const ** ppSrc) 52 { 53 ch_t * pDest = (ch_t *)*ppDest; 54 const ch_t * pSrc = (const ch_t *)(*ppSrc + 1); 55 56 for (;;) { 57 ch_t ch = *(pSrc++); 58 switch (ch) { 59 case NUL: *ppSrc = NULL; return; 60 case '"': goto done; 61 case '\\': 62 pSrc += ao_string_cook_escape_char((const char *)pSrc, (char *)&ch, 0x7F); 63 if (ch == 0x7F) 64 break; 65 /* FALLTHROUGH */ 66 67 default: 68 *(pDest++) = ch; 69 } 70 } 71 72 done: 73 *ppDest = (ch_t *)pDest; /* next spot for storing character */ 74 *ppSrc = (char const *)pSrc; /* char following closing quote */ 75 } 76 77 78 static void 79 copy_raw(ch_t ** ppDest, char const ** ppSrc) 80 { 81 ch_t * pDest = *ppDest; 82 cc_t * pSrc = (cc_t *) (*ppSrc + 1); 83 84 for (;;) { 85 ch_t ch = *(pSrc++); 86 switch (ch) { 87 case NUL: *ppSrc = NULL; return; 88 case '\'': goto done; 89 case '\\': 90 /* 91 * *Four* escapes are handled: newline removal, escape char 92 * quoting and apostrophe quoting 93 */ 94 switch (*pSrc) { 95 case NUL: *ppSrc = NULL; return; 96 case '\r': 97 if (*(++pSrc) == NL) 98 ++pSrc; 99 continue; 100 101 case NL: 102 ++pSrc; 103 continue; 104 105 case '\'': 106 ch = '\''; 107 /* FALLTHROUGH */ 108 109 case '\\': 110 ++pSrc; 111 break; 112 } 113 /* FALLTHROUGH */ 114 115 default: 116 *(pDest++) = ch; 117 } 118 } 119 120 done: 121 *ppDest = pDest; /* next spot for storing character */ 122 *ppSrc = (char const *) pSrc; /* char following closing quote */ 123 } 124 125 static token_list_t * 126 alloc_token_list(char const * str) 127 { 128 token_list_t * res; 129 130 int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */ 131 132 if (str == NULL) goto enoent_res; 133 134 /* 135 * Trim leading white space. Use "ENOENT" and a NULL return to indicate 136 * an empty string was passed. 137 */ 138 str = SPN_WHITESPACE_CHARS(str); 139 if (*str == NUL) goto enoent_res; 140 141 /* 142 * Take an approximate count of tokens. If no quoted strings are used, 143 * it will be accurate. If quoted strings are used, it will be a little 144 * high and we'll squander the space for a few extra pointers. 145 */ 146 { 147 char const * pz = str; 148 149 do { 150 max_token_ct++; 151 pz = BRK_WHITESPACE_CHARS(pz+1); 152 pz = SPN_WHITESPACE_CHARS(pz); 153 } while (*pz != NUL); 154 155 res = malloc(sizeof(*res) + (size_t)(pz - str) 156 + ((size_t)max_token_ct * sizeof(ch_t *))); 157 } 158 159 if (res == NULL) 160 errno = ENOMEM; 161 else res->tkn_list[0] = (ch_t *)(res->tkn_list + (max_token_ct - 1)); 162 163 return res; 164 165 enoent_res: 166 167 errno = ENOENT; 168 return NULL; 169 } 170 171 /*=export_func ao_string_tokenize 172 * 173 * what: tokenize an input string 174 * 175 * arg: + char const * + string + string to be tokenized + 176 * 177 * ret_type: token_list_t * 178 * ret_desc: pointer to a structure that lists each token 179 * 180 * doc: 181 * 182 * This function will convert one input string into a list of strings. 183 * The list of strings is derived by separating the input based on 184 * white space separation. However, if the input contains either single 185 * or double quote characters, then the text after that character up to 186 * a matching quote will become the string in the list. 187 * 188 * The returned pointer should be deallocated with @code{free(3C)} when 189 * are done using the data. The data are placed in a single block of 190 * allocated memory. Do not deallocate individual token/strings. 191 * 192 * The structure pointed to will contain at least these two fields: 193 * @table @samp 194 * @item tkn_ct 195 * The number of tokens found in the input string. 196 * @item tok_list 197 * An array of @code{tkn_ct + 1} pointers to substring tokens, with 198 * the last pointer set to NULL. 199 * @end table 200 * 201 * There are two types of quoted strings: single quoted (@code{'}) and 202 * double quoted (@code{"}). Singly quoted strings are fairly raw in that 203 * escape characters (@code{\\}) are simply another character, except when 204 * preceding the following characters: 205 * @example 206 * @code{\\} double backslashes reduce to one 207 * @code{'} incorporates the single quote into the string 208 * @code{\n} suppresses both the backslash and newline character 209 * @end example 210 * 211 * Double quote strings are formed according to the rules of string 212 * constants in ANSI-C programs. 213 * 214 * example: 215 * @example 216 * #include <stdlib.h> 217 * int ix; 218 * token_list_t * ptl = ao_string_tokenize(some_string) 219 * for (ix = 0; ix < ptl->tkn_ct; ix++) 220 * do_something_with_tkn(ptl->tkn_list[ix]); 221 * free(ptl); 222 * @end example 223 * Note that everything is freed with the one call to @code{free(3C)}. 224 * 225 * err: 226 * NULL is returned and @code{errno} will be set to indicate the problem: 227 * @itemize @bullet 228 * @item 229 * @code{EINVAL} - There was an unterminated quoted string. 230 * @item 231 * @code{ENOENT} - The input string was empty. 232 * @item 233 * @code{ENOMEM} - There is not enough memory. 234 * @end itemize 235 =*/ 236 token_list_t * 237 ao_string_tokenize(char const * str) 238 { 239 token_list_t * res = alloc_token_list(str); 240 ch_t * pzDest; 241 242 /* 243 * Now copy each token into the output buffer. 244 */ 245 if (res == NULL) 246 return res; 247 248 pzDest = (ch_t *)(res->tkn_list[0]); 249 res->tkn_ct = 0; 250 251 do { 252 res->tkn_list[ res->tkn_ct++ ] = pzDest; 253 for (;;) { 254 int ch = (ch_t)*str; 255 if (IS_WHITESPACE_CHAR(ch)) { 256 found_white_space: 257 str = SPN_WHITESPACE_CHARS(str+1); 258 break; 259 } 260 261 switch (ch) { 262 case '"': 263 copy_cooked(&pzDest, &str); 264 if (str == NULL) { 265 free(res); 266 errno = EINVAL; 267 return NULL; 268 } 269 if (IS_WHITESPACE_CHAR(*str)) 270 goto found_white_space; 271 break; 272 273 case '\'': 274 copy_raw(&pzDest, &str); 275 if (str == NULL) { 276 free(res); 277 errno = EINVAL; 278 return NULL; 279 } 280 if (IS_WHITESPACE_CHAR(*str)) 281 goto found_white_space; 282 break; 283 284 case NUL: 285 goto copy_done; 286 287 default: 288 str++; 289 *(pzDest++) = (unsigned char)ch; 290 } 291 } copy_done:; 292 293 /* 294 * NUL terminate the last token and see if we have any more tokens. 295 */ 296 *(pzDest++) = NUL; 297 } while (*str != NUL); 298 299 res->tkn_list[ res->tkn_ct ] = NULL; 300 301 return res; 302 } 303 304 #ifdef TEST 305 #include <stdio.h> 306 #include <string.h> 307 308 int 309 main(int argc, char ** argv) 310 { 311 if (argc == 1) { 312 printf("USAGE: %s arg [ ... ]\n", *argv); 313 return 1; 314 } 315 while (--argc > 0) { 316 char * arg = *(++argv); 317 token_list_t * p = ao_string_tokenize(arg); 318 if (p == NULL) { 319 printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n", 320 arg, errno, strerror(errno)); 321 } else { 322 int ix = 0; 323 printf("Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct); 324 do { 325 printf(" %3d: ``%s''\n", ix+1, p->tkn_list[ix]); 326 } while (++ix < p->tkn_ct); 327 free(p); 328 } 329 } 330 return 0; 331 } 332 #endif 333 334 /** @} 335 * 336 * Local Variables: 337 * mode: C 338 * c-file-style: "stroustrup" 339 * indent-tabs-mode: nil 340 * End: 341 * end of autoopts/tokenize.c */ 342