1*54240Sbostic /*- 2*54240Sbostic * Copyright (c) 1992 The Regents of the University of California. 3*54240Sbostic * All rights reserved. 4*54240Sbostic * 5*54240Sbostic * This code is derived from software contributed to Berkeley by 6*54240Sbostic * Christos Zoulas of Cornell University. 7*54240Sbostic * 8*54240Sbostic * %sccs.include.redist.c% 9*54240Sbostic */ 10*54240Sbostic 11*54240Sbostic #ifndef lint 12*54240Sbostic static char sccsid[] = "@(#)tokenizer.c 5.1 (Berkeley) 06/22/92"; 13*54240Sbostic #endif /* not lint */ 14*54240Sbostic 15*54240Sbostic /* 16*54240Sbostic * tokenize.c: Bourne shell like tokenizer 17*54240Sbostic */ 18*54240Sbostic #include "sys.h" 19*54240Sbostic #include <string.h> 20*54240Sbostic #include <stdlib.h> 21*54240Sbostic #include "tokenizer.h" 22*54240Sbostic 23*54240Sbostic typedef enum { Q_none, Q_single, Q_double, Q_one, Q_doubleone } quote_t; 24*54240Sbostic 25*54240Sbostic #define IFS "\t \n" 26*54240Sbostic 27*54240Sbostic #define TOK_KEEP 1 28*54240Sbostic #define TOK_EAT 2 29*54240Sbostic 30*54240Sbostic #define WINCR 20 31*54240Sbostic #define AINCR 10 32*54240Sbostic 33*54240Sbostic #define tok_malloc(a) malloc(a) 34*54240Sbostic #define tok_free(a) free(a) 35*54240Sbostic #define tok_realloc(a, b) realloc(a, b) 36*54240Sbostic 37*54240Sbostic 38*54240Sbostic struct tokenizer { 39*54240Sbostic char *ifs; /* In field separator */ 40*54240Sbostic int argc, amax; /* Current and maximum number of args */ 41*54240Sbostic char **argv; /* Argument list */ 42*54240Sbostic char *wptr, *wmax; /* Space and limit on the word buffer */ 43*54240Sbostic char *wstart; /* Beginning of next word */ 44*54240Sbostic char *wspace; /* Space of word buffer */ 45*54240Sbostic quote_t quote; /* Quoting state */ 46*54240Sbostic int flags; /* flags; */ 47*54240Sbostic }; 48*54240Sbostic 49*54240Sbostic 50*54240Sbostic private void tok_finish __P((Tokenizer *)); 51*54240Sbostic 52*54240Sbostic 53*54240Sbostic /* tok_finish(): 54*54240Sbostic * Finish a word in the tokenizer. 55*54240Sbostic */ 56*54240Sbostic private void 57*54240Sbostic tok_finish(tok) 58*54240Sbostic Tokenizer *tok; 59*54240Sbostic { 60*54240Sbostic *tok->wptr = '\0'; 61*54240Sbostic if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) { 62*54240Sbostic tok->argv[tok->argc++] = tok->wstart; 63*54240Sbostic tok->argv[tok->argc] = NULL; 64*54240Sbostic tok->wstart = ++tok->wptr; 65*54240Sbostic } 66*54240Sbostic tok->flags &= ~TOK_KEEP; 67*54240Sbostic } 68*54240Sbostic 69*54240Sbostic 70*54240Sbostic /* tok_init(): 71*54240Sbostic * Initialize the tokenizer 72*54240Sbostic */ 73*54240Sbostic public Tokenizer * 74*54240Sbostic tok_init(ifs) 75*54240Sbostic const char *ifs; 76*54240Sbostic { 77*54240Sbostic Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer)); 78*54240Sbostic 79*54240Sbostic tok->ifs = strdup(ifs ? ifs : IFS); 80*54240Sbostic tok->argc = 0; 81*54240Sbostic tok->amax = AINCR; 82*54240Sbostic tok->argv = (char **) tok_malloc(sizeof(char *) * tok->amax); 83*54240Sbostic tok->argv[0] = NULL; 84*54240Sbostic tok->wspace = (char *) tok_malloc(WINCR); 85*54240Sbostic tok->wmax = tok->wspace + WINCR; 86*54240Sbostic tok->wstart = tok->wspace; 87*54240Sbostic tok->wptr = tok->wspace; 88*54240Sbostic tok->flags = 0; 89*54240Sbostic tok->quote = Q_none; 90*54240Sbostic 91*54240Sbostic return tok; 92*54240Sbostic } 93*54240Sbostic 94*54240Sbostic 95*54240Sbostic /* tok_reset(): 96*54240Sbostic * Reset the tokenizer 97*54240Sbostic */ 98*54240Sbostic public void 99*54240Sbostic tok_reset(tok) 100*54240Sbostic Tokenizer *tok; 101*54240Sbostic { 102*54240Sbostic tok->argc = 0; 103*54240Sbostic tok->wstart = tok->wspace; 104*54240Sbostic tok->wptr = tok->wspace; 105*54240Sbostic tok->flags = 0; 106*54240Sbostic tok->quote = Q_none; 107*54240Sbostic } 108*54240Sbostic 109*54240Sbostic 110*54240Sbostic /* tok_end(): 111*54240Sbostic * Clean up 112*54240Sbostic */ 113*54240Sbostic public void 114*54240Sbostic tok_end(tok) 115*54240Sbostic Tokenizer *tok; 116*54240Sbostic { 117*54240Sbostic tok_free((ptr_t) tok->ifs); 118*54240Sbostic tok_free((ptr_t) tok->wspace); 119*54240Sbostic tok_free((ptr_t) tok->argv); 120*54240Sbostic tok_free((ptr_t) tok); 121*54240Sbostic } 122*54240Sbostic 123*54240Sbostic 124*54240Sbostic 125*54240Sbostic /* tok_line(): 126*54240Sbostic * Bourne shell like tokenizing 127*54240Sbostic * Return: 128*54240Sbostic * -1: Internal error 129*54240Sbostic * 3: Quoted return 130*54240Sbostic * 2: Unmatched double quote 131*54240Sbostic * 1: Unmatched single quote 132*54240Sbostic * 0: Ok 133*54240Sbostic */ 134*54240Sbostic public int 135*54240Sbostic tok_line(tok, line, argc, argv) 136*54240Sbostic Tokenizer *tok; 137*54240Sbostic const char* line; 138*54240Sbostic int *argc; 139*54240Sbostic char ***argv; 140*54240Sbostic { 141*54240Sbostic const char *ptr; 142*54240Sbostic 143*54240Sbostic while (1) { 144*54240Sbostic switch (*(ptr = line++)) { 145*54240Sbostic case '\'': 146*54240Sbostic tok->flags |= TOK_KEEP; 147*54240Sbostic tok->flags &= ~TOK_EAT; 148*54240Sbostic switch (tok->quote) { 149*54240Sbostic case Q_none: 150*54240Sbostic tok->quote = Q_single; /* Enter single quote mode */ 151*54240Sbostic break; 152*54240Sbostic 153*54240Sbostic case Q_single: /* Exit single quote mode */ 154*54240Sbostic tok->quote = Q_none; 155*54240Sbostic break; 156*54240Sbostic 157*54240Sbostic case Q_one: /* Quote this ' */ 158*54240Sbostic tok->quote = Q_none; 159*54240Sbostic *tok->wptr++ = *ptr; 160*54240Sbostic break; 161*54240Sbostic 162*54240Sbostic case Q_double: /* Stay in double quote mode */ 163*54240Sbostic *tok->wptr++ = *ptr; 164*54240Sbostic break; 165*54240Sbostic 166*54240Sbostic case Q_doubleone: /* Quote this ' */ 167*54240Sbostic tok->quote = Q_double; 168*54240Sbostic *tok->wptr++ = *ptr; 169*54240Sbostic break; 170*54240Sbostic 171*54240Sbostic default: 172*54240Sbostic return(-1); 173*54240Sbostic } 174*54240Sbostic break; 175*54240Sbostic 176*54240Sbostic case '"': 177*54240Sbostic tok->flags &= ~TOK_EAT; 178*54240Sbostic tok->flags |= TOK_KEEP; 179*54240Sbostic switch (tok->quote) { 180*54240Sbostic case Q_none: /* Enter double quote mode */ 181*54240Sbostic tok->quote = Q_double; 182*54240Sbostic break; 183*54240Sbostic 184*54240Sbostic case Q_double: 185*54240Sbostic tok->quote = Q_none; /* Exit double quote mode */ 186*54240Sbostic break; 187*54240Sbostic 188*54240Sbostic case Q_one: /* Quote this " */ 189*54240Sbostic tok->quote = Q_none; 190*54240Sbostic *tok->wptr++ = *ptr; 191*54240Sbostic break; 192*54240Sbostic 193*54240Sbostic case Q_single: /* Stay in single quote mode */ 194*54240Sbostic *tok->wptr++ = *ptr; 195*54240Sbostic break; 196*54240Sbostic 197*54240Sbostic case Q_doubleone: /* Quote this " */ 198*54240Sbostic tok->quote = Q_double; 199*54240Sbostic *tok->wptr++ = *ptr; 200*54240Sbostic break; 201*54240Sbostic 202*54240Sbostic default: 203*54240Sbostic return(-1); 204*54240Sbostic } 205*54240Sbostic break; 206*54240Sbostic 207*54240Sbostic case '\\': 208*54240Sbostic tok->flags |= TOK_KEEP; 209*54240Sbostic tok->flags &= ~TOK_EAT; 210*54240Sbostic switch (tok->quote) { 211*54240Sbostic case Q_none: /* Quote next character */ 212*54240Sbostic tok->quote = Q_one; 213*54240Sbostic break; 214*54240Sbostic 215*54240Sbostic case Q_double: 216*54240Sbostic tok->quote = Q_doubleone;/* Quote next character */ 217*54240Sbostic break; 218*54240Sbostic 219*54240Sbostic case Q_one: 220*54240Sbostic *tok->wptr++ = *ptr; 221*54240Sbostic tok->quote = Q_none; /* Quote this, restore state */ 222*54240Sbostic break; 223*54240Sbostic 224*54240Sbostic case Q_single: /* Stay in single quote mode */ 225*54240Sbostic *tok->wptr++ = *ptr; 226*54240Sbostic break; 227*54240Sbostic 228*54240Sbostic case Q_doubleone: /* Quote this \ */ 229*54240Sbostic tok->quote = Q_double; 230*54240Sbostic *tok->wptr++ = *ptr; 231*54240Sbostic break; 232*54240Sbostic 233*54240Sbostic default: 234*54240Sbostic return(-1); 235*54240Sbostic } 236*54240Sbostic break; 237*54240Sbostic 238*54240Sbostic case '\n': 239*54240Sbostic tok->flags &= ~TOK_EAT; 240*54240Sbostic switch (tok->quote) { 241*54240Sbostic case Q_none: 242*54240Sbostic tok_finish(tok); 243*54240Sbostic *argv = tok->argv; 244*54240Sbostic *argc = tok->argc; 245*54240Sbostic return(0); 246*54240Sbostic 247*54240Sbostic case Q_single: 248*54240Sbostic case Q_double: 249*54240Sbostic *tok->wptr++ = *ptr; /* Add the return */ 250*54240Sbostic break; 251*54240Sbostic 252*54240Sbostic case Q_doubleone: 253*54240Sbostic tok->flags |= TOK_EAT; 254*54240Sbostic tok->quote = Q_double; /* Back to double, eat the '\n' */ 255*54240Sbostic break; 256*54240Sbostic 257*54240Sbostic case Q_one: 258*54240Sbostic tok->flags |= TOK_EAT; 259*54240Sbostic tok->quote = Q_none; /* No quote, more eat the '\n' */ 260*54240Sbostic break; 261*54240Sbostic 262*54240Sbostic default: 263*54240Sbostic return(0); 264*54240Sbostic } 265*54240Sbostic break; 266*54240Sbostic 267*54240Sbostic case '\0': 268*54240Sbostic switch (tok->quote) { 269*54240Sbostic case Q_none: 270*54240Sbostic /* Finish word and return */ 271*54240Sbostic if (tok->flags & TOK_EAT) { 272*54240Sbostic tok->flags &= ~TOK_EAT; 273*54240Sbostic return 3; 274*54240Sbostic } 275*54240Sbostic tok_finish(tok); 276*54240Sbostic *argv = tok->argv; 277*54240Sbostic *argc = tok->argc; 278*54240Sbostic return(0); 279*54240Sbostic 280*54240Sbostic case Q_single: 281*54240Sbostic return(1); 282*54240Sbostic 283*54240Sbostic case Q_double: 284*54240Sbostic return(2); 285*54240Sbostic 286*54240Sbostic case Q_doubleone: 287*54240Sbostic tok->quote = Q_double; 288*54240Sbostic *tok->wptr++ = *ptr; 289*54240Sbostic break; 290*54240Sbostic 291*54240Sbostic case Q_one: 292*54240Sbostic tok->quote = Q_none; 293*54240Sbostic *tok->wptr++ = *ptr; 294*54240Sbostic break; 295*54240Sbostic 296*54240Sbostic default: 297*54240Sbostic return(-1); 298*54240Sbostic } 299*54240Sbostic break; 300*54240Sbostic 301*54240Sbostic default: 302*54240Sbostic tok->flags &= ~TOK_EAT; 303*54240Sbostic switch (tok->quote) { 304*54240Sbostic case Q_none: 305*54240Sbostic if (strchr(tok->ifs, *ptr) != NULL) 306*54240Sbostic tok_finish(tok); 307*54240Sbostic else 308*54240Sbostic *tok->wptr++ = *ptr; 309*54240Sbostic break; 310*54240Sbostic 311*54240Sbostic case Q_single: 312*54240Sbostic case Q_double: 313*54240Sbostic *tok->wptr++ = *ptr; 314*54240Sbostic break; 315*54240Sbostic 316*54240Sbostic 317*54240Sbostic case Q_doubleone: 318*54240Sbostic *tok->wptr++ = '\\'; 319*54240Sbostic tok->quote = Q_double; 320*54240Sbostic *tok->wptr++ = *ptr; 321*54240Sbostic break; 322*54240Sbostic 323*54240Sbostic case Q_one: 324*54240Sbostic tok->quote = Q_none; 325*54240Sbostic *tok->wptr++ = *ptr; 326*54240Sbostic break; 327*54240Sbostic 328*54240Sbostic default: 329*54240Sbostic return(-1); 330*54240Sbostic 331*54240Sbostic } 332*54240Sbostic break; 333*54240Sbostic } 334*54240Sbostic 335*54240Sbostic if (tok->wptr >= tok->wmax - 4) { 336*54240Sbostic size_t size = tok->wmax - tok->wspace + WINCR; 337*54240Sbostic char *s = (char *) tok_realloc(tok->wspace, size); 338*54240Sbostic /*SUPPRESS 22*/ 339*54240Sbostic int offs = s - tok->wspace; 340*54240Sbostic 341*54240Sbostic if (offs != 0) { 342*54240Sbostic int i; 343*54240Sbostic for (i = 0; i < tok->argc; i++) 344*54240Sbostic tok->argv[i] = tok->argv[i] + offs; 345*54240Sbostic tok->wptr = tok->wptr + offs; 346*54240Sbostic tok->wstart = tok->wstart + offs; 347*54240Sbostic tok->wmax = s + size; 348*54240Sbostic tok->wspace = s; 349*54240Sbostic } 350*54240Sbostic } 351*54240Sbostic 352*54240Sbostic if (tok->argc >= tok->amax - 4) { 353*54240Sbostic tok->amax += AINCR; 354*54240Sbostic tok->argv = (char **) tok_realloc(tok->argv, 355*54240Sbostic tok->amax * sizeof(char*)); 356*54240Sbostic } 357*54240Sbostic 358*54240Sbostic } 359*54240Sbostic } 360