1 /* $OpenBSD: tokenizer.c,v 1.12 2010/06/30 00:05:35 nicm Exp $ */ 2 /* $NetBSD: tokenizer.c,v 1.18 2010/01/03 18:27:10 christos Exp $ */ 3 4 /*- 5 * Copyright (c) 1992, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Christos Zoulas of Cornell University. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 #include "config.h" 37 38 /* We build this file twice, once as NARROW, once as WIDE. */ 39 /* 40 * tokenize.c: Bourne shell like tokenizer 41 */ 42 #include <string.h> 43 #include <stdlib.h> 44 #include "histedit.h" 45 #include "chartype.h" 46 47 typedef enum { 48 Q_none, Q_single, Q_double, Q_one, Q_doubleone 49 } quote_t; 50 51 #define TOK_KEEP 1 52 #define TOK_EAT 2 53 54 #define WINCR 20 55 #define AINCR 10 56 57 #define IFS STR("\t \n") 58 59 #define tok_malloc(a) malloc(a) 60 #define tok_free(a) free(a) 61 #define tok_realloc(a, b) realloc(a, b) 62 #define tok_strdup(a) Strdup(a) 63 64 65 struct TYPE(tokenizer) { 66 Char *ifs; /* In field separator */ 67 int argc, amax; /* Current and maximum number of args */ 68 Char **argv; /* Argument list */ 69 Char *wptr, *wmax; /* Space and limit on the word buffer */ 70 Char *wstart; /* Beginning of next word */ 71 Char *wspace; /* Space of word buffer */ 72 quote_t quote; /* Quoting state */ 73 int flags; /* flags; */ 74 }; 75 76 77 private void FUN(tok,finish)(TYPE(Tokenizer) *); 78 79 80 /* FUN(tok,finish)(): 81 * Finish a word in the tokenizer. 82 */ 83 private void 84 FUN(tok,finish)(TYPE(Tokenizer) *tok) 85 { 86 87 *tok->wptr = '\0'; 88 if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) { 89 tok->argv[tok->argc++] = tok->wstart; 90 tok->argv[tok->argc] = NULL; 91 tok->wstart = ++tok->wptr; 92 } 93 tok->flags &= ~TOK_KEEP; 94 } 95 96 97 /* FUN(tok,init)(): 98 * Initialize the tokenizer 99 */ 100 public TYPE(Tokenizer) * 101 FUN(tok,init)(const Char *ifs) 102 { 103 TYPE(Tokenizer) *tok = tok_malloc(sizeof(TYPE(Tokenizer))); 104 105 if (tok == NULL) 106 return NULL; 107 tok->ifs = tok_strdup(ifs ? ifs : IFS); 108 if (tok->ifs == NULL) { 109 tok_free((ptr_t)tok); 110 return NULL; 111 } 112 tok->argc = 0; 113 tok->amax = AINCR; 114 tok->argv = tok_malloc(sizeof(*tok->argv) * tok->amax); 115 if (tok->argv == NULL) { 116 tok_free((ptr_t)tok->ifs); 117 tok_free((ptr_t)tok); 118 return NULL; 119 } 120 tok->argv[0] = NULL; 121 tok->wspace = tok_malloc(WINCR * sizeof(*tok->wspace)); 122 if (tok->wspace == NULL) { 123 tok_free((ptr_t)tok->argv); 124 tok_free((ptr_t)tok->ifs); 125 tok_free((ptr_t)tok); 126 return NULL; 127 } 128 tok->wmax = tok->wspace + WINCR; 129 tok->wstart = tok->wspace; 130 tok->wptr = tok->wspace; 131 tok->flags = 0; 132 tok->quote = Q_none; 133 134 return (tok); 135 } 136 137 138 /* FUN(tok,reset)(): 139 * Reset the tokenizer 140 */ 141 public void 142 FUN(tok,reset)(TYPE(Tokenizer) *tok) 143 { 144 145 tok->argc = 0; 146 tok->wstart = tok->wspace; 147 tok->wptr = tok->wspace; 148 tok->flags = 0; 149 tok->quote = Q_none; 150 } 151 152 153 /* FUN(tok,end)(): 154 * Clean up 155 */ 156 public void 157 FUN(tok,end)(TYPE(Tokenizer) *tok) 158 { 159 160 tok_free((ptr_t) tok->ifs); 161 tok_free((ptr_t) tok->wspace); 162 tok_free((ptr_t) tok->argv); 163 tok_free((ptr_t) tok); 164 } 165 166 167 168 /* FUN(tok,line)(): 169 * Bourne shell (sh(1)) like tokenizing 170 * Arguments: 171 * tok current tokenizer state (setup with FUN(tok,init)()) 172 * line line to parse 173 * Returns: 174 * -1 Internal error 175 * 3 Quoted return 176 * 2 Unmatched double quote 177 * 1 Unmatched single quote 178 * 0 Ok 179 * Modifies (if return value is 0): 180 * argc number of arguments 181 * argv argument array 182 * cursorc if !NULL, argv element containing cursor 183 * cursorv if !NULL, offset in argv[cursorc] of cursor 184 */ 185 public int 186 FUN(tok,line)(TYPE(Tokenizer) *tok, const TYPE(LineInfo) *line, 187 int *argc, const Char ***argv, int *cursorc, int *cursoro) 188 { 189 const Char *ptr; 190 int cc, co; 191 192 cc = co = -1; 193 ptr = line->buffer; 194 for (ptr = line->buffer; ;ptr++) { 195 if (ptr >= line->lastchar) 196 ptr = STR(""); 197 if (ptr == line->cursor) { 198 cc = tok->argc; 199 co = (int)(tok->wptr - tok->wstart); 200 } 201 switch (*ptr) { 202 case '\'': 203 tok->flags |= TOK_KEEP; 204 tok->flags &= ~TOK_EAT; 205 switch (tok->quote) { 206 case Q_none: 207 tok->quote = Q_single; /* Enter single quote 208 * mode */ 209 break; 210 211 case Q_single: /* Exit single quote mode */ 212 tok->quote = Q_none; 213 break; 214 215 case Q_one: /* Quote this ' */ 216 tok->quote = Q_none; 217 *tok->wptr++ = *ptr; 218 break; 219 220 case Q_double: /* Stay in double quote mode */ 221 *tok->wptr++ = *ptr; 222 break; 223 224 case Q_doubleone: /* Quote this ' */ 225 tok->quote = Q_double; 226 *tok->wptr++ = *ptr; 227 break; 228 229 default: 230 return (-1); 231 } 232 break; 233 234 case '"': 235 tok->flags &= ~TOK_EAT; 236 tok->flags |= TOK_KEEP; 237 switch (tok->quote) { 238 case Q_none: /* Enter double quote mode */ 239 tok->quote = Q_double; 240 break; 241 242 case Q_double: /* Exit double quote mode */ 243 tok->quote = Q_none; 244 break; 245 246 case Q_one: /* Quote this " */ 247 tok->quote = Q_none; 248 *tok->wptr++ = *ptr; 249 break; 250 251 case Q_single: /* Stay in single quote mode */ 252 *tok->wptr++ = *ptr; 253 break; 254 255 case Q_doubleone: /* Quote this " */ 256 tok->quote = Q_double; 257 *tok->wptr++ = *ptr; 258 break; 259 260 default: 261 return (-1); 262 } 263 break; 264 265 case '\\': 266 tok->flags |= TOK_KEEP; 267 tok->flags &= ~TOK_EAT; 268 switch (tok->quote) { 269 case Q_none: /* Quote next character */ 270 tok->quote = Q_one; 271 break; 272 273 case Q_double: /* Quote next character */ 274 tok->quote = Q_doubleone; 275 break; 276 277 case Q_one: /* Quote this, restore state */ 278 *tok->wptr++ = *ptr; 279 tok->quote = Q_none; 280 break; 281 282 case Q_single: /* Stay in single quote mode */ 283 *tok->wptr++ = *ptr; 284 break; 285 286 case Q_doubleone: /* Quote this \ */ 287 tok->quote = Q_double; 288 *tok->wptr++ = *ptr; 289 break; 290 291 default: 292 return (-1); 293 } 294 break; 295 296 case '\n': 297 tok->flags &= ~TOK_EAT; 298 switch (tok->quote) { 299 case Q_none: 300 goto tok_line_outok; 301 302 case Q_single: 303 case Q_double: 304 *tok->wptr++ = *ptr; /* Add the return */ 305 break; 306 307 case Q_doubleone: /* Back to double, eat the '\n' */ 308 tok->flags |= TOK_EAT; 309 tok->quote = Q_double; 310 break; 311 312 case Q_one: /* No quote, more eat the '\n' */ 313 tok->flags |= TOK_EAT; 314 tok->quote = Q_none; 315 break; 316 317 default: 318 return (0); 319 } 320 break; 321 322 case '\0': 323 switch (tok->quote) { 324 case Q_none: 325 /* Finish word and return */ 326 if (tok->flags & TOK_EAT) { 327 tok->flags &= ~TOK_EAT; 328 return (3); 329 } 330 goto tok_line_outok; 331 332 case Q_single: 333 return (1); 334 335 case Q_double: 336 return (2); 337 338 case Q_doubleone: 339 tok->quote = Q_double; 340 *tok->wptr++ = *ptr; 341 break; 342 343 case Q_one: 344 tok->quote = Q_none; 345 *tok->wptr++ = *ptr; 346 break; 347 348 default: 349 return (-1); 350 } 351 break; 352 353 default: 354 tok->flags &= ~TOK_EAT; 355 switch (tok->quote) { 356 case Q_none: 357 if (Strchr(tok->ifs, *ptr) != NULL) 358 FUN(tok,finish)(tok); 359 else 360 *tok->wptr++ = *ptr; 361 break; 362 363 case Q_single: 364 case Q_double: 365 *tok->wptr++ = *ptr; 366 break; 367 368 369 case Q_doubleone: 370 *tok->wptr++ = '\\'; 371 tok->quote = Q_double; 372 *tok->wptr++ = *ptr; 373 break; 374 375 case Q_one: 376 tok->quote = Q_none; 377 *tok->wptr++ = *ptr; 378 break; 379 380 default: 381 return (-1); 382 383 } 384 break; 385 } 386 387 if (tok->wptr >= tok->wmax - 4) { 388 size_t size = tok->wmax - tok->wspace + WINCR; 389 Char *s = tok_realloc(tok->wspace, 390 size * sizeof(*s)); 391 if (s == NULL) 392 return (-1); 393 394 if (s != tok->wspace) { 395 int i; 396 for (i = 0; i < tok->argc; i++) { 397 tok->argv[i] = 398 (tok->argv[i] - tok->wspace) + s; 399 } 400 tok->wptr = (tok->wptr - tok->wspace) + s; 401 tok->wstart = (tok->wstart - tok->wspace) + s; 402 tok->wspace = s; 403 } 404 tok->wmax = s + size; 405 } 406 if (tok->argc >= tok->amax - 4) { 407 Char **p; 408 tok->amax += AINCR; 409 p = tok_realloc(tok->argv, tok->amax * sizeof(*p)); 410 if (p == NULL) 411 return (-1); 412 tok->argv = p; 413 } 414 } 415 tok_line_outok: 416 if (cc == -1 && co == -1) { 417 cc = tok->argc; 418 co = (int)(tok->wptr - tok->wstart); 419 } 420 if (cursorc != NULL) 421 *cursorc = cc; 422 if (cursoro != NULL) 423 *cursoro = co; 424 FUN(tok,finish)(tok); 425 *argv = (const Char **)tok->argv; 426 *argc = tok->argc; 427 return (0); 428 } 429 430 /* FUN(tok,str)(): 431 * Simpler version of tok_line, taking a NUL terminated line 432 * and splitting into words, ignoring cursor state. 433 */ 434 public int 435 FUN(tok,str)(TYPE(Tokenizer) *tok, const Char *line, int *argc, 436 const Char ***argv) 437 { 438 TYPE(LineInfo) li; 439 440 memset(&li, 0, sizeof(li)); 441 li.buffer = line; 442 li.cursor = li.lastchar = Strchr(line, '\0'); 443 return (FUN(tok,line)(tok, &li, argc, argv, NULL, NULL)); 444 } 445