1 /* $OpenBSD: tokenizer.c,v 1.7 2003/06/02 20:18:40 millert Exp $ */ 2 /* $NetBSD: tokenizer.c,v 1.2 1997/01/11 06:48:15 lukem Exp $ */ 3 4 /*- 5 * Copyright (c) 1992, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Christos Zoulas of Cornell University. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 #if !defined(lint) && !defined(SCCSID) 37 #if 0 38 static char sccsid[] = "@(#)tokenizer.c 8.1 (Berkeley) 6/4/93"; 39 #else 40 static const char rcsid[] = "$OpenBSD: tokenizer.c,v 1.7 2003/06/02 20:18:40 millert Exp $"; 41 #endif 42 #endif /* not lint && not SCCSID */ 43 44 /* 45 * tokenize.c: Bourne shell like tokenizer 46 */ 47 #include "sys.h" 48 #include <string.h> 49 #include <stdlib.h> 50 #include "tokenizer.h" 51 52 typedef enum { Q_none, Q_single, Q_double, Q_one, Q_doubleone } quote_t; 53 54 #define IFS "\t \n" 55 56 #define TOK_KEEP 1 57 #define TOK_EAT 2 58 59 #define WINCR 20 60 #define AINCR 10 61 62 #define tok_malloc(a) malloc(a) 63 #define tok_free(a) free(a) 64 #define tok_realloc(a, b) realloc(a, b) 65 66 67 struct tokenizer { 68 char *ifs; /* In field separator */ 69 int argc, amax; /* Current and maximum number of args */ 70 char **argv; /* Argument list */ 71 char *wptr, *wmax; /* Space and limit on the word buffer */ 72 char *wstart; /* Beginning of next word */ 73 char *wspace; /* Space of word buffer */ 74 quote_t quote; /* Quoting state */ 75 int flags; /* flags; */ 76 }; 77 78 79 private void tok_finish(Tokenizer *); 80 81 82 /* tok_finish(): 83 * Finish a word in the tokenizer. 84 */ 85 private void 86 tok_finish(tok) 87 Tokenizer *tok; 88 { 89 *tok->wptr = '\0'; 90 if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) { 91 tok->argv[tok->argc++] = tok->wstart; 92 tok->argv[tok->argc] = NULL; 93 tok->wstart = ++tok->wptr; 94 } 95 tok->flags &= ~TOK_KEEP; 96 } 97 98 99 /* tok_init(): 100 * Initialize the tokenizer 101 */ 102 public Tokenizer * 103 tok_init(ifs) 104 const char *ifs; 105 { 106 Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer)); 107 108 tok->ifs = strdup(ifs ? ifs : IFS); 109 tok->argc = 0; 110 tok->amax = AINCR; 111 tok->argv = (char **) tok_malloc(sizeof(char *) * tok->amax); 112 tok->argv[0] = NULL; 113 tok->wspace = (char *) tok_malloc(WINCR); 114 tok->wmax = tok->wspace + WINCR; 115 tok->wstart = tok->wspace; 116 tok->wptr = tok->wspace; 117 tok->flags = 0; 118 tok->quote = Q_none; 119 120 return tok; 121 } 122 123 124 /* tok_reset(): 125 * Reset the tokenizer 126 */ 127 public void 128 tok_reset(tok) 129 Tokenizer *tok; 130 { 131 tok->argc = 0; 132 tok->wstart = tok->wspace; 133 tok->wptr = tok->wspace; 134 tok->flags = 0; 135 tok->quote = Q_none; 136 } 137 138 139 /* tok_end(): 140 * Clean up 141 */ 142 public void 143 tok_end(tok) 144 Tokenizer *tok; 145 { 146 tok_free((ptr_t) tok->ifs); 147 tok_free((ptr_t) tok->wspace); 148 tok_free((ptr_t) tok->argv); 149 tok_free((ptr_t) tok); 150 } 151 152 153 154 /* tok_line(): 155 * Bourne shell like tokenizing 156 * Return: 157 * -1: Internal error 158 * 3: Quoted return 159 * 2: Unmatched double quote 160 * 1: Unmatched single quote 161 * 0: Ok 162 */ 163 public int 164 tok_line(tok, line, argc, argv) 165 Tokenizer *tok; 166 const char* line; 167 int *argc; 168 char ***argv; 169 { 170 const char *ptr; 171 172 while (1) { 173 switch (*(ptr = line++)) { 174 case '\'': 175 tok->flags |= TOK_KEEP; 176 tok->flags &= ~TOK_EAT; 177 switch (tok->quote) { 178 case Q_none: 179 tok->quote = Q_single; /* Enter single quote mode */ 180 break; 181 182 case Q_single: /* Exit single quote mode */ 183 tok->quote = Q_none; 184 break; 185 186 case Q_one: /* Quote this ' */ 187 tok->quote = Q_none; 188 *tok->wptr++ = *ptr; 189 break; 190 191 case Q_double: /* Stay in double quote mode */ 192 *tok->wptr++ = *ptr; 193 break; 194 195 case Q_doubleone: /* Quote this ' */ 196 tok->quote = Q_double; 197 *tok->wptr++ = *ptr; 198 break; 199 200 default: 201 return(-1); 202 } 203 break; 204 205 case '"': 206 tok->flags &= ~TOK_EAT; 207 tok->flags |= TOK_KEEP; 208 switch (tok->quote) { 209 case Q_none: /* Enter double quote mode */ 210 tok->quote = Q_double; 211 break; 212 213 case Q_double: 214 tok->quote = Q_none; /* Exit double quote mode */ 215 break; 216 217 case Q_one: /* Quote this " */ 218 tok->quote = Q_none; 219 *tok->wptr++ = *ptr; 220 break; 221 222 case Q_single: /* Stay in single quote mode */ 223 *tok->wptr++ = *ptr; 224 break; 225 226 case Q_doubleone: /* Quote this " */ 227 tok->quote = Q_double; 228 *tok->wptr++ = *ptr; 229 break; 230 231 default: 232 return(-1); 233 } 234 break; 235 236 case '\\': 237 tok->flags |= TOK_KEEP; 238 tok->flags &= ~TOK_EAT; 239 switch (tok->quote) { 240 case Q_none: /* Quote next character */ 241 tok->quote = Q_one; 242 break; 243 244 case Q_double: 245 tok->quote = Q_doubleone;/* Quote next character */ 246 break; 247 248 case Q_one: 249 *tok->wptr++ = *ptr; 250 tok->quote = Q_none; /* Quote this, restore state */ 251 break; 252 253 case Q_single: /* Stay in single quote mode */ 254 *tok->wptr++ = *ptr; 255 break; 256 257 case Q_doubleone: /* Quote this \ */ 258 tok->quote = Q_double; 259 *tok->wptr++ = *ptr; 260 break; 261 262 default: 263 return(-1); 264 } 265 break; 266 267 case '\n': 268 tok->flags &= ~TOK_EAT; 269 switch (tok->quote) { 270 case Q_none: 271 tok_finish(tok); 272 *argv = tok->argv; 273 *argc = tok->argc; 274 return(0); 275 276 case Q_single: 277 case Q_double: 278 *tok->wptr++ = *ptr; /* Add the return */ 279 break; 280 281 case Q_doubleone: 282 tok->flags |= TOK_EAT; 283 tok->quote = Q_double; /* Back to double, eat the '\n' */ 284 break; 285 286 case Q_one: 287 tok->flags |= TOK_EAT; 288 tok->quote = Q_none; /* No quote, more eat the '\n' */ 289 break; 290 291 default: 292 return(0); 293 } 294 break; 295 296 case '\0': 297 switch (tok->quote) { 298 case Q_none: 299 /* Finish word and return */ 300 if (tok->flags & TOK_EAT) { 301 tok->flags &= ~TOK_EAT; 302 return 3; 303 } 304 tok_finish(tok); 305 *argv = tok->argv; 306 *argc = tok->argc; 307 return(0); 308 309 case Q_single: 310 return(1); 311 312 case Q_double: 313 return(2); 314 315 case Q_doubleone: 316 tok->quote = Q_double; 317 *tok->wptr++ = *ptr; 318 break; 319 320 case Q_one: 321 tok->quote = Q_none; 322 *tok->wptr++ = *ptr; 323 break; 324 325 default: 326 return(-1); 327 } 328 break; 329 330 default: 331 tok->flags &= ~TOK_EAT; 332 switch (tok->quote) { 333 case Q_none: 334 if (strchr(tok->ifs, *ptr) != NULL) 335 tok_finish(tok); 336 else 337 *tok->wptr++ = *ptr; 338 break; 339 340 case Q_single: 341 case Q_double: 342 *tok->wptr++ = *ptr; 343 break; 344 345 346 case Q_doubleone: 347 *tok->wptr++ = '\\'; 348 tok->quote = Q_double; 349 *tok->wptr++ = *ptr; 350 break; 351 352 case Q_one: 353 tok->quote = Q_none; 354 *tok->wptr++ = *ptr; 355 break; 356 357 default: 358 return(-1); 359 360 } 361 break; 362 } 363 364 if (tok->wptr >= tok->wmax - 4) { 365 size_t size = tok->wmax - tok->wspace + WINCR; 366 char *s = (char *) tok_realloc(tok->wspace, size); 367 /*SUPPRESS 22*/ 368 int offs; 369 370 if (s != NULL && (offs = s - tok->wspace) != 0) { 371 int i; 372 for (i = 0; i < tok->argc; i++) 373 tok->argv[i] = tok->argv[i] + offs; 374 tok->wptr = tok->wptr + offs; 375 tok->wstart = tok->wstart + offs; 376 tok->wmax = s + size; 377 tok->wspace = s; 378 } 379 } 380 381 if (tok->argc >= tok->amax - 4) { 382 char **nargv = (char **) tok_realloc(tok->argv, (tok->amax + AINCR) 383 * sizeof(char*)); 384 if (nargv != NULL) { 385 tok->amax += AINCR; 386 tok->argv = nargv; 387 } 388 } 389 } 390 } 391