1 /* $NetBSD: tokenize.c,v 1.10 2024/08/18 20:47:25 christos Exp $ */ 2 3 /** \file tokenize.c 4 * 5 * Tokenize a string, accommodating quoted strings. 6 * 7 * @addtogroup autoopts 8 * @{ 9 */ 10 /* 11 * This file defines the string_tokenize interface 12 * This file is part of AutoOpts, a companion to AutoGen. 13 * AutoOpts is free software. 14 * AutoOpts is Copyright (C) 1992-2018 by Bruce Korb - all rights reserved 15 * 16 * AutoOpts is available under any one of two licenses. The license 17 * in use must be one of these two and the choice is under the control 18 * of the user of the license. 19 * 20 * The GNU Lesser General Public License, version 3 or later 21 * See the files "COPYING.lgplv3" and "COPYING.gplv3" 22 * 23 * The Modified Berkeley Software Distribution License 24 * See the file "COPYING.mbsd" 25 * 26 * These files have the following sha256 sums: 27 * 28 * 8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95 COPYING.gplv3 29 * 4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b COPYING.lgplv3 30 * 13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239 COPYING.mbsd 31 */ 32 33 static void 34 copy_cooked(ch_t ** ppDest, char const ** ppSrc) 35 { 36 ch_t * pDest = (ch_t *)*ppDest; 37 const ch_t * pSrc = (const ch_t *)(*ppSrc + 1); 38 39 for (;;) { 40 ch_t ch = *(pSrc++); 41 switch (ch) { 42 case NUL: *ppSrc = NULL; return; 43 case '"': goto done; 44 case '\\': 45 pSrc += ao_string_cook_escape_char(__UNCONST(pSrc), (char *)&ch, 0x7F); 46 if (ch == 0x7F) 47 break; 48 /* FALLTHROUGH */ 49 50 default: 51 *(pDest++) = ch; 52 } 53 } 54 55 done: 56 *ppDest = (ch_t *)pDest; /* next spot for storing character */ 57 *ppSrc = (char const *)pSrc; /* char following closing quote */ 58 } 59 60 61 static void 62 copy_raw(ch_t ** ppDest, char const ** ppSrc) 63 { 64 ch_t * pDest = *ppDest; 65 cc_t * pSrc = (cc_t *) (*ppSrc + 1); 66 67 for (;;) { 68 ch_t ch = *(pSrc++); 69 switch (ch) { 70 case NUL: *ppSrc = NULL; return; 71 case '\'': goto done; 72 case '\\': 73 /* 74 * *Four* escapes are handled: newline removal, escape char 75 * quoting and apostrophe quoting 76 */ 77 switch (*pSrc) { 78 case NUL: *ppSrc = NULL; return; 79 case '\r': 80 if (*(++pSrc) == NL) 81 ++pSrc; 82 continue; 83 84 case NL: 85 ++pSrc; 86 continue; 87 88 case '\'': 89 ch = '\''; 90 /* FALLTHROUGH */ 91 92 case '\\': 93 ++pSrc; 94 break; 95 } 96 /* FALLTHROUGH */ 97 98 default: 99 *(pDest++) = ch; 100 } 101 } 102 103 done: 104 *ppDest = pDest; /* next spot for storing character */ 105 *ppSrc = (char const *) pSrc; /* char following closing quote */ 106 } 107 108 static token_list_t * 109 alloc_token_list(char const * str) 110 { 111 token_list_t * res; 112 113 int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */ 114 115 if (str == NULL) goto enoent_res; 116 117 /* 118 * Trim leading white space. Use "ENOENT" and a NULL return to indicate 119 * an empty string was passed. 120 */ 121 str = SPN_WHITESPACE_CHARS(str); 122 if (*str == NUL) goto enoent_res; 123 124 /* 125 * Take an approximate count of tokens. If no quoted strings are used, 126 * it will be accurate. If quoted strings are used, it will be a little 127 * high and we'll squander the space for a few extra pointers. 128 */ 129 { 130 char const * pz = str; 131 132 do { 133 max_token_ct++; 134 pz = BRK_WHITESPACE_CHARS(pz+1); 135 pz = SPN_WHITESPACE_CHARS(pz); 136 } while (*pz != NUL); 137 138 res = malloc(sizeof(*res) + (size_t)(pz - str) 139 + ((size_t)max_token_ct * sizeof(ch_t *))); 140 } 141 142 if (res == NULL) 143 errno = ENOMEM; 144 else res->tkn_list[0] = (ch_t *)(res->tkn_list + (max_token_ct - 1)); 145 146 return res; 147 148 enoent_res: 149 150 errno = ENOENT; 151 return NULL; 152 } 153 154 /*=export_func ao_string_tokenize 155 * 156 * what: tokenize an input string 157 * 158 * arg: + char const * + string + string to be tokenized + 159 * 160 * ret_type: token_list_t * 161 * ret_desc: pointer to a structure that lists each token 162 * 163 * doc: 164 * 165 * This function will convert one input string into a list of strings. 166 * The list of strings is derived by separating the input based on 167 * white space separation. However, if the input contains either single 168 * or double quote characters, then the text after that character up to 169 * a matching quote will become the string in the list. 170 * 171 * The returned pointer should be deallocated with @code{free(3C)} when 172 * are done using the data. The data are placed in a single block of 173 * allocated memory. Do not deallocate individual token/strings. 174 * 175 * The structure pointed to will contain at least these two fields: 176 * @table @samp 177 * @item tkn_ct 178 * The number of tokens found in the input string. 179 * @item tok_list 180 * An array of @code{tkn_ct + 1} pointers to substring tokens, with 181 * the last pointer set to NULL. 182 * @end table 183 * 184 * There are two types of quoted strings: single quoted (@code{'}) and 185 * double quoted (@code{"}). Singly quoted strings are fairly raw in that 186 * escape characters (@code{\\}) are simply another character, except when 187 * preceding the following characters: 188 * @example 189 * @code{\\} double backslashes reduce to one 190 * @code{'} incorporates the single quote into the string 191 * @code{\n} suppresses both the backslash and newline character 192 * @end example 193 * 194 * Double quote strings are formed according to the rules of string 195 * constants in ANSI-C programs. 196 * 197 * example: 198 * @example 199 * #include <stdlib.h> 200 * int ix; 201 * token_list_t * ptl = ao_string_tokenize(some_string) 202 * for (ix = 0; ix < ptl->tkn_ct; ix++) 203 * do_something_with_tkn(ptl->tkn_list[ix]); 204 * free(ptl); 205 * @end example 206 * Note that everything is freed with the one call to @code{free(3C)}. 207 * 208 * err: 209 * NULL is returned and @code{errno} will be set to indicate the problem: 210 * @itemize @bullet 211 * @item 212 * @code{EINVAL} - There was an unterminated quoted string. 213 * @item 214 * @code{ENOENT} - The input string was empty. 215 * @item 216 * @code{ENOMEM} - There is not enough memory. 217 * @end itemize 218 =*/ 219 token_list_t * 220 ao_string_tokenize(char const * str) 221 { 222 token_list_t * res = alloc_token_list(str); 223 ch_t * pzDest; 224 225 /* 226 * Now copy each token into the output buffer. 227 */ 228 if (res == NULL) 229 return res; 230 231 pzDest = (ch_t *)(res->tkn_list[0]); 232 res->tkn_ct = 0; 233 234 do { 235 res->tkn_list[ res->tkn_ct++ ] = pzDest; 236 for (;;) { 237 int ch = (ch_t)*str; 238 if (IS_WHITESPACE_CHAR(ch)) { 239 found_white_space: 240 str = SPN_WHITESPACE_CHARS(str+1); 241 break; 242 } 243 244 switch (ch) { 245 case '"': 246 copy_cooked(&pzDest, &str); 247 if (str == NULL) { 248 free(res); 249 errno = EINVAL; 250 return NULL; 251 } 252 if (IS_WHITESPACE_CHAR(*str)) 253 goto found_white_space; 254 break; 255 256 case '\'': 257 copy_raw(&pzDest, &str); 258 if (str == NULL) { 259 free(res); 260 errno = EINVAL; 261 return NULL; 262 } 263 if (IS_WHITESPACE_CHAR(*str)) 264 goto found_white_space; 265 break; 266 267 case NUL: 268 goto copy_done; 269 270 default: 271 str++; 272 *(pzDest++) = (unsigned char)ch; 273 } 274 } copy_done:; 275 276 /* 277 * NUL terminate the last token and see if we have any more tokens. 278 */ 279 *(pzDest++) = NUL; 280 } while (*str != NUL); 281 282 res->tkn_list[ res->tkn_ct ] = NULL; 283 284 return res; 285 } 286 287 #ifdef TEST 288 #include <stdio.h> 289 #include <string.h> 290 291 int 292 main(int argc, char ** argv) 293 { 294 if (argc == 1) { 295 printf("USAGE: %s arg [ ... ]\n", *argv); 296 return 1; 297 } 298 while (--argc > 0) { 299 char * arg = *(++argv); 300 token_list_t * p = ao_string_tokenize(arg); 301 if (p == NULL) { 302 printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n", 303 arg, errno, strerror(errno)); 304 } else { 305 int ix = 0; 306 printf("Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct); 307 do { 308 printf(" %3d: ``%s''\n", ix+1, p->tkn_list[ix]); 309 } while (++ix < p->tkn_ct); 310 free(p); 311 } 312 } 313 return 0; 314 } 315 #endif 316 317 /** @} 318 * 319 * Local Variables: 320 * mode: C 321 * c-file-style: "stroustrup" 322 * indent-tabs-mode: nil 323 * End: 324 * end of autoopts/tokenize.c */ 325