1 /* $OpenBSD: tokenizer.c,v 1.10 2003/11/25 20:12:38 otto Exp $ */ 2 /* $NetBSD: tokenizer.c,v 1.13 2003/10/18 23:48:42 christos Exp $ */ 3 4 /*- 5 * Copyright (c) 1992, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Christos Zoulas of Cornell University. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 #include "config.h" 37 #if !defined(lint) && !defined(SCCSID) 38 #if 0 39 static char sccsid[] = "@(#)tokenizer.c 8.1 (Berkeley) 6/4/93"; 40 #else 41 static const char rcsid[] = "$OpenBSD: tokenizer.c,v 1.10 2003/11/25 20:12:38 otto Exp $"; 42 #endif 43 #endif /* not lint && not SCCSID */ 44 45 /* 46 * tokenize.c: Bourne shell like tokenizer 47 */ 48 #include <string.h> 49 #include <stdlib.h> 50 #include "tokenizer.h" 51 52 typedef enum { 53 Q_none, Q_single, Q_double, Q_one, Q_doubleone 54 } quote_t; 55 56 #define IFS "\t \n" 57 58 #define TOK_KEEP 1 59 #define TOK_EAT 2 60 61 #define WINCR 20 62 #define AINCR 10 63 64 #define tok_strdup(a) strdup(a) 65 #define tok_malloc(a) malloc(a) 66 #define tok_free(a) free(a) 67 #define tok_realloc(a, b) realloc(a, b) 68 69 70 struct tokenizer { 71 char *ifs; /* In field separator */ 72 int argc, amax; /* Current and maximum number of args */ 73 char **argv; /* Argument list */ 74 char *wptr, *wmax; /* Space and limit on the word buffer */ 75 char *wstart; /* Beginning of next word */ 76 char *wspace; /* Space of word buffer */ 77 quote_t quote; /* Quoting state */ 78 int flags; /* flags; */ 79 }; 80 81 82 private void tok_finish(Tokenizer *); 83 84 85 /* tok_finish(): 86 * Finish a word in the tokenizer. 87 */ 88 private void 89 tok_finish(Tokenizer *tok) 90 { 91 92 *tok->wptr = '\0'; 93 if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) { 94 tok->argv[tok->argc++] = tok->wstart; 95 tok->argv[tok->argc] = NULL; 96 tok->wstart = ++tok->wptr; 97 } 98 tok->flags &= ~TOK_KEEP; 99 } 100 101 102 /* tok_init(): 103 * Initialize the tokenizer 104 */ 105 public Tokenizer * 106 tok_init(const char *ifs) 107 { 108 Tokenizer *tok = (Tokenizer *) tok_malloc(sizeof(Tokenizer)); 109 110 if (tok == NULL) 111 return NULL; 112 tok->ifs = tok_strdup(ifs ? ifs : IFS); 113 if (tok->ifs == NULL) { 114 tok_free((ptr_t)tok); 115 return NULL; 116 } 117 tok->argc = 0; 118 tok->amax = AINCR; 119 tok->argv = (char **) tok_malloc(sizeof(char *) * tok->amax); 120 if (tok->argv == NULL) { 121 tok_free((ptr_t)tok->ifs); 122 tok_free((ptr_t)tok); 123 return NULL; 124 } 125 tok->argv[0] = NULL; 126 tok->wspace = (char *) tok_malloc(WINCR); 127 if (tok->wspace == NULL) { 128 tok_free((ptr_t)tok->argv); 129 tok_free((ptr_t)tok->ifs); 130 tok_free((ptr_t)tok); 131 return NULL; 132 } 133 tok->wmax = tok->wspace + WINCR; 134 tok->wstart = tok->wspace; 135 tok->wptr = tok->wspace; 136 tok->flags = 0; 137 tok->quote = Q_none; 138 139 return (tok); 140 } 141 142 143 /* tok_reset(): 144 * Reset the tokenizer 145 */ 146 public void 147 tok_reset(Tokenizer *tok) 148 { 149 150 tok->argc = 0; 151 tok->wstart = tok->wspace; 152 tok->wptr = tok->wspace; 153 tok->flags = 0; 154 tok->quote = Q_none; 155 } 156 157 158 /* tok_end(): 159 * Clean up 160 */ 161 public void 162 tok_end(Tokenizer *tok) 163 { 164 165 tok_free((ptr_t) tok->ifs); 166 tok_free((ptr_t) tok->wspace); 167 tok_free((ptr_t) tok->argv); 168 tok_free((ptr_t) tok); 169 } 170 171 172 173 /* tok_line(): 174 * Bourne shell like tokenizing 175 * Return: 176 * -1: Internal error 177 * 3: Quoted return 178 * 2: Unmatched double quote 179 * 1: Unmatched single quote 180 * 0: Ok 181 */ 182 public int 183 tok_line(Tokenizer *tok, const char *line, int *argc, const char ***argv) 184 { 185 const char *ptr; 186 187 for (;;) { 188 switch (*(ptr = line++)) { 189 case '\'': 190 tok->flags |= TOK_KEEP; 191 tok->flags &= ~TOK_EAT; 192 switch (tok->quote) { 193 case Q_none: 194 tok->quote = Q_single; /* Enter single quote 195 * mode */ 196 break; 197 198 case Q_single: /* Exit single quote mode */ 199 tok->quote = Q_none; 200 break; 201 202 case Q_one: /* Quote this ' */ 203 tok->quote = Q_none; 204 *tok->wptr++ = *ptr; 205 break; 206 207 case Q_double: /* Stay in double quote mode */ 208 *tok->wptr++ = *ptr; 209 break; 210 211 case Q_doubleone: /* Quote this ' */ 212 tok->quote = Q_double; 213 *tok->wptr++ = *ptr; 214 break; 215 216 default: 217 return (-1); 218 } 219 break; 220 221 case '"': 222 tok->flags &= ~TOK_EAT; 223 tok->flags |= TOK_KEEP; 224 switch (tok->quote) { 225 case Q_none: /* Enter double quote mode */ 226 tok->quote = Q_double; 227 break; 228 229 case Q_double: /* Exit double quote mode */ 230 tok->quote = Q_none; 231 break; 232 233 case Q_one: /* Quote this " */ 234 tok->quote = Q_none; 235 *tok->wptr++ = *ptr; 236 break; 237 238 case Q_single: /* Stay in single quote mode */ 239 *tok->wptr++ = *ptr; 240 break; 241 242 case Q_doubleone: /* Quote this " */ 243 tok->quote = Q_double; 244 *tok->wptr++ = *ptr; 245 break; 246 247 default: 248 return (-1); 249 } 250 break; 251 252 case '\\': 253 tok->flags |= TOK_KEEP; 254 tok->flags &= ~TOK_EAT; 255 switch (tok->quote) { 256 case Q_none: /* Quote next character */ 257 tok->quote = Q_one; 258 break; 259 260 case Q_double: /* Quote next character */ 261 tok->quote = Q_doubleone; 262 break; 263 264 case Q_one: /* Quote this, restore state */ 265 *tok->wptr++ = *ptr; 266 tok->quote = Q_none; 267 break; 268 269 case Q_single: /* Stay in single quote mode */ 270 *tok->wptr++ = *ptr; 271 break; 272 273 case Q_doubleone: /* Quote this \ */ 274 tok->quote = Q_double; 275 *tok->wptr++ = *ptr; 276 break; 277 278 default: 279 return (-1); 280 } 281 break; 282 283 case '\n': 284 tok->flags &= ~TOK_EAT; 285 switch (tok->quote) { 286 case Q_none: 287 tok_finish(tok); 288 *argv = (const char **)tok->argv; 289 *argc = tok->argc; 290 return (0); 291 292 case Q_single: 293 case Q_double: 294 *tok->wptr++ = *ptr; /* Add the return */ 295 break; 296 297 case Q_doubleone: /* Back to double, eat the '\n' */ 298 tok->flags |= TOK_EAT; 299 tok->quote = Q_double; 300 break; 301 302 case Q_one: /* No quote, more eat the '\n' */ 303 tok->flags |= TOK_EAT; 304 tok->quote = Q_none; 305 break; 306 307 default: 308 return (0); 309 } 310 break; 311 312 case '\0': 313 switch (tok->quote) { 314 case Q_none: 315 /* Finish word and return */ 316 if (tok->flags & TOK_EAT) { 317 tok->flags &= ~TOK_EAT; 318 return (3); 319 } 320 tok_finish(tok); 321 *argv = (const char **)tok->argv; 322 *argc = tok->argc; 323 return (0); 324 325 case Q_single: 326 return (1); 327 328 case Q_double: 329 return (2); 330 331 case Q_doubleone: 332 tok->quote = Q_double; 333 *tok->wptr++ = *ptr; 334 break; 335 336 case Q_one: 337 tok->quote = Q_none; 338 *tok->wptr++ = *ptr; 339 break; 340 341 default: 342 return (-1); 343 } 344 break; 345 346 default: 347 tok->flags &= ~TOK_EAT; 348 switch (tok->quote) { 349 case Q_none: 350 if (strchr(tok->ifs, *ptr) != NULL) 351 tok_finish(tok); 352 else 353 *tok->wptr++ = *ptr; 354 break; 355 356 case Q_single: 357 case Q_double: 358 *tok->wptr++ = *ptr; 359 break; 360 361 362 case Q_doubleone: 363 *tok->wptr++ = '\\'; 364 tok->quote = Q_double; 365 *tok->wptr++ = *ptr; 366 break; 367 368 case Q_one: 369 tok->quote = Q_none; 370 *tok->wptr++ = *ptr; 371 break; 372 373 default: 374 return (-1); 375 376 } 377 break; 378 } 379 380 if (tok->wptr >= tok->wmax - 4) { 381 size_t size = tok->wmax - tok->wspace + WINCR; 382 char *s = (char *) tok_realloc(tok->wspace, size); 383 if (s == NULL) 384 return (-1); 385 386 if (s != tok->wspace) { 387 int i; 388 for (i = 0; i < tok->argc; i++) { 389 tok->argv[i] = 390 (tok->argv[i] - tok->wspace) + s; 391 } 392 tok->wptr = (tok->wptr - tok->wspace) + s; 393 tok->wstart = (tok->wstart - tok->wspace) + s; 394 tok->wspace = s; 395 } 396 tok->wmax = s + size; 397 } 398 if (tok->argc >= tok->amax - 4) { 399 char **p; 400 tok->amax += AINCR; 401 p = (char **) tok_realloc(tok->argv, 402 tok->amax * sizeof(char *)); 403 if (p == NULL) 404 return (-1); 405 tok->argv = p; 406 } 407 } 408 } 409