154240Sbostic /*- 254240Sbostic * Copyright (c) 1992 The Regents of the University of California. 354240Sbostic * All rights reserved. 454240Sbostic * 554240Sbostic * This code is derived from software contributed to Berkeley by 654240Sbostic * Christos Zoulas of Cornell University. 754240Sbostic * 854240Sbostic * %sccs.include.redist.c% 954240Sbostic */ 1054240Sbostic 11*54624Schristos #if !defined(lint) && !defined(SCCSID) 12*54624Schristos static char sccsid[] = "@(#)tokenizer.c 5.2 (Berkeley) 07/03/92"; 13*54624Schristos #endif /* not lint && not SCCSID */ 1454240Sbostic 1554240Sbostic /* 1654240Sbostic * tokenize.c: Bourne shell like tokenizer 1754240Sbostic */ 1854240Sbostic #include "sys.h" 1954240Sbostic #include <string.h> 2054240Sbostic #include <stdlib.h> 2154240Sbostic #include "tokenizer.h" 2254240Sbostic 2354240Sbostic typedef enum { Q_none, Q_single, Q_double, Q_one, Q_doubleone } quote_t; 2454240Sbostic 2554240Sbostic #define IFS "\t \n" 2654240Sbostic 2754240Sbostic #define TOK_KEEP 1 2854240Sbostic #define TOK_EAT 2 2954240Sbostic 3054240Sbostic #define WINCR 20 3154240Sbostic #define AINCR 10 3254240Sbostic 3354240Sbostic #define tok_malloc(a) malloc(a) 3454240Sbostic #define tok_free(a) free(a) 3554240Sbostic #define tok_realloc(a, b) realloc(a, b) 3654240Sbostic 3754240Sbostic 3854240Sbostic struct tokenizer { 3954240Sbostic char *ifs; /* In field separator */ 4054240Sbostic int argc, amax; /* Current and maximum number of args */ 4154240Sbostic char **argv; /* Argument list */ 4254240Sbostic char *wptr, *wmax; /* Space and limit on the word buffer */ 4354240Sbostic char *wstart; /* Beginning of next word */ 4454240Sbostic char *wspace; /* Space of word buffer */ 4554240Sbostic quote_t quote; /* Quoting state */ 4654240Sbostic int flags; /* flags; */ 4754240Sbostic }; 4854240Sbostic 4954240Sbostic 5054240Sbostic private void tok_finish __P((Tokenizer *)); 5154240Sbostic 5254240Sbostic 5354240Sbostic /* tok_finish(): 5454240Sbostic * Finish a word in the tokenizer. 5554240Sbostic */ 5654240Sbostic private void 5754240Sbostic tok_finish(tok) 5854240Sbostic Tokenizer *tok; 5954240Sbostic { 6054240Sbostic *tok->wptr = '\0'; 6154240Sbostic if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) { 6254240Sbostic tok->argv[tok->argc++] = tok->wstart; 6354240Sbostic tok->argv[tok->argc] = NULL; 6454240Sbostic tok->wstart = ++tok->wptr; 6554240Sbostic } 6654240Sbostic tok->flags &= ~TOK_KEEP; 6754240Sbostic } 6854240Sbostic 6954240Sbostic 7054240Sbostic /* tok_init(): 7154240Sbostic * Initialize the tokenizer 7254240Sbostic */ 7354240Sbostic public Tokenizer * 7454240Sbostic tok_init(ifs) 7554240Sbostic const char *ifs; 7654240Sbostic { 7754240Sbostic Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer)); 7854240Sbostic 7954240Sbostic tok->ifs = strdup(ifs ? ifs : IFS); 8054240Sbostic tok->argc = 0; 8154240Sbostic tok->amax = AINCR; 8254240Sbostic tok->argv = (char **) tok_malloc(sizeof(char *) * tok->amax); 8354240Sbostic tok->argv[0] = NULL; 8454240Sbostic tok->wspace = (char *) tok_malloc(WINCR); 8554240Sbostic tok->wmax = tok->wspace + WINCR; 8654240Sbostic tok->wstart = tok->wspace; 8754240Sbostic tok->wptr = tok->wspace; 8854240Sbostic tok->flags = 0; 8954240Sbostic tok->quote = Q_none; 9054240Sbostic 9154240Sbostic return tok; 9254240Sbostic } 9354240Sbostic 9454240Sbostic 9554240Sbostic /* tok_reset(): 9654240Sbostic * Reset the tokenizer 9754240Sbostic */ 9854240Sbostic public void 9954240Sbostic tok_reset(tok) 10054240Sbostic Tokenizer *tok; 10154240Sbostic { 10254240Sbostic tok->argc = 0; 10354240Sbostic tok->wstart = tok->wspace; 10454240Sbostic tok->wptr = tok->wspace; 10554240Sbostic tok->flags = 0; 10654240Sbostic tok->quote = Q_none; 10754240Sbostic } 10854240Sbostic 10954240Sbostic 11054240Sbostic /* tok_end(): 11154240Sbostic * Clean up 11254240Sbostic */ 11354240Sbostic public void 11454240Sbostic tok_end(tok) 11554240Sbostic Tokenizer *tok; 11654240Sbostic { 11754240Sbostic tok_free((ptr_t) tok->ifs); 11854240Sbostic tok_free((ptr_t) tok->wspace); 11954240Sbostic tok_free((ptr_t) tok->argv); 12054240Sbostic tok_free((ptr_t) tok); 12154240Sbostic } 12254240Sbostic 12354240Sbostic 12454240Sbostic 12554240Sbostic /* tok_line(): 12654240Sbostic * Bourne shell like tokenizing 12754240Sbostic * Return: 12854240Sbostic * -1: Internal error 12954240Sbostic * 3: Quoted return 13054240Sbostic * 2: Unmatched double quote 13154240Sbostic * 1: Unmatched single quote 13254240Sbostic * 0: Ok 13354240Sbostic */ 13454240Sbostic public int 13554240Sbostic tok_line(tok, line, argc, argv) 13654240Sbostic Tokenizer *tok; 13754240Sbostic const char* line; 13854240Sbostic int *argc; 13954240Sbostic char ***argv; 14054240Sbostic { 14154240Sbostic const char *ptr; 14254240Sbostic 14354240Sbostic while (1) { 14454240Sbostic switch (*(ptr = line++)) { 14554240Sbostic case '\'': 14654240Sbostic tok->flags |= TOK_KEEP; 14754240Sbostic tok->flags &= ~TOK_EAT; 14854240Sbostic switch (tok->quote) { 14954240Sbostic case Q_none: 15054240Sbostic tok->quote = Q_single; /* Enter single quote mode */ 15154240Sbostic break; 15254240Sbostic 15354240Sbostic case Q_single: /* Exit single quote mode */ 15454240Sbostic tok->quote = Q_none; 15554240Sbostic break; 15654240Sbostic 15754240Sbostic case Q_one: /* Quote this ' */ 15854240Sbostic tok->quote = Q_none; 15954240Sbostic *tok->wptr++ = *ptr; 16054240Sbostic break; 16154240Sbostic 16254240Sbostic case Q_double: /* Stay in double quote mode */ 16354240Sbostic *tok->wptr++ = *ptr; 16454240Sbostic break; 16554240Sbostic 16654240Sbostic case Q_doubleone: /* Quote this ' */ 16754240Sbostic tok->quote = Q_double; 16854240Sbostic *tok->wptr++ = *ptr; 16954240Sbostic break; 17054240Sbostic 17154240Sbostic default: 17254240Sbostic return(-1); 17354240Sbostic } 17454240Sbostic break; 17554240Sbostic 17654240Sbostic case '"': 17754240Sbostic tok->flags &= ~TOK_EAT; 17854240Sbostic tok->flags |= TOK_KEEP; 17954240Sbostic switch (tok->quote) { 18054240Sbostic case Q_none: /* Enter double quote mode */ 18154240Sbostic tok->quote = Q_double; 18254240Sbostic break; 18354240Sbostic 18454240Sbostic case Q_double: 18554240Sbostic tok->quote = Q_none; /* Exit double quote mode */ 18654240Sbostic break; 18754240Sbostic 18854240Sbostic case Q_one: /* Quote this " */ 18954240Sbostic tok->quote = Q_none; 19054240Sbostic *tok->wptr++ = *ptr; 19154240Sbostic break; 19254240Sbostic 19354240Sbostic case Q_single: /* Stay in single quote mode */ 19454240Sbostic *tok->wptr++ = *ptr; 19554240Sbostic break; 19654240Sbostic 19754240Sbostic case Q_doubleone: /* Quote this " */ 19854240Sbostic tok->quote = Q_double; 19954240Sbostic *tok->wptr++ = *ptr; 20054240Sbostic break; 20154240Sbostic 20254240Sbostic default: 20354240Sbostic return(-1); 20454240Sbostic } 20554240Sbostic break; 20654240Sbostic 20754240Sbostic case '\\': 20854240Sbostic tok->flags |= TOK_KEEP; 20954240Sbostic tok->flags &= ~TOK_EAT; 21054240Sbostic switch (tok->quote) { 21154240Sbostic case Q_none: /* Quote next character */ 21254240Sbostic tok->quote = Q_one; 21354240Sbostic break; 21454240Sbostic 21554240Sbostic case Q_double: 21654240Sbostic tok->quote = Q_doubleone;/* Quote next character */ 21754240Sbostic break; 21854240Sbostic 21954240Sbostic case Q_one: 22054240Sbostic *tok->wptr++ = *ptr; 22154240Sbostic tok->quote = Q_none; /* Quote this, restore state */ 22254240Sbostic break; 22354240Sbostic 22454240Sbostic case Q_single: /* Stay in single quote mode */ 22554240Sbostic *tok->wptr++ = *ptr; 22654240Sbostic break; 22754240Sbostic 22854240Sbostic case Q_doubleone: /* Quote this \ */ 22954240Sbostic tok->quote = Q_double; 23054240Sbostic *tok->wptr++ = *ptr; 23154240Sbostic break; 23254240Sbostic 23354240Sbostic default: 23454240Sbostic return(-1); 23554240Sbostic } 23654240Sbostic break; 23754240Sbostic 23854240Sbostic case '\n': 23954240Sbostic tok->flags &= ~TOK_EAT; 24054240Sbostic switch (tok->quote) { 24154240Sbostic case Q_none: 24254240Sbostic tok_finish(tok); 24354240Sbostic *argv = tok->argv; 24454240Sbostic *argc = tok->argc; 24554240Sbostic return(0); 24654240Sbostic 24754240Sbostic case Q_single: 24854240Sbostic case Q_double: 24954240Sbostic *tok->wptr++ = *ptr; /* Add the return */ 25054240Sbostic break; 25154240Sbostic 25254240Sbostic case Q_doubleone: 25354240Sbostic tok->flags |= TOK_EAT; 25454240Sbostic tok->quote = Q_double; /* Back to double, eat the '\n' */ 25554240Sbostic break; 25654240Sbostic 25754240Sbostic case Q_one: 25854240Sbostic tok->flags |= TOK_EAT; 25954240Sbostic tok->quote = Q_none; /* No quote, more eat the '\n' */ 26054240Sbostic break; 26154240Sbostic 26254240Sbostic default: 26354240Sbostic return(0); 26454240Sbostic } 26554240Sbostic break; 26654240Sbostic 26754240Sbostic case '\0': 26854240Sbostic switch (tok->quote) { 26954240Sbostic case Q_none: 27054240Sbostic /* Finish word and return */ 27154240Sbostic if (tok->flags & TOK_EAT) { 27254240Sbostic tok->flags &= ~TOK_EAT; 27354240Sbostic return 3; 27454240Sbostic } 27554240Sbostic tok_finish(tok); 27654240Sbostic *argv = tok->argv; 27754240Sbostic *argc = tok->argc; 27854240Sbostic return(0); 27954240Sbostic 28054240Sbostic case Q_single: 28154240Sbostic return(1); 28254240Sbostic 28354240Sbostic case Q_double: 28454240Sbostic return(2); 28554240Sbostic 28654240Sbostic case Q_doubleone: 28754240Sbostic tok->quote = Q_double; 28854240Sbostic *tok->wptr++ = *ptr; 28954240Sbostic break; 29054240Sbostic 29154240Sbostic case Q_one: 29254240Sbostic tok->quote = Q_none; 29354240Sbostic *tok->wptr++ = *ptr; 29454240Sbostic break; 29554240Sbostic 29654240Sbostic default: 29754240Sbostic return(-1); 29854240Sbostic } 29954240Sbostic break; 30054240Sbostic 30154240Sbostic default: 30254240Sbostic tok->flags &= ~TOK_EAT; 30354240Sbostic switch (tok->quote) { 30454240Sbostic case Q_none: 30554240Sbostic if (strchr(tok->ifs, *ptr) != NULL) 30654240Sbostic tok_finish(tok); 30754240Sbostic else 30854240Sbostic *tok->wptr++ = *ptr; 30954240Sbostic break; 31054240Sbostic 31154240Sbostic case Q_single: 31254240Sbostic case Q_double: 31354240Sbostic *tok->wptr++ = *ptr; 31454240Sbostic break; 31554240Sbostic 31654240Sbostic 31754240Sbostic case Q_doubleone: 31854240Sbostic *tok->wptr++ = '\\'; 31954240Sbostic tok->quote = Q_double; 32054240Sbostic *tok->wptr++ = *ptr; 32154240Sbostic break; 32254240Sbostic 32354240Sbostic case Q_one: 32454240Sbostic tok->quote = Q_none; 32554240Sbostic *tok->wptr++ = *ptr; 32654240Sbostic break; 32754240Sbostic 32854240Sbostic default: 32954240Sbostic return(-1); 33054240Sbostic 33154240Sbostic } 33254240Sbostic break; 33354240Sbostic } 33454240Sbostic 33554240Sbostic if (tok->wptr >= tok->wmax - 4) { 33654240Sbostic size_t size = tok->wmax - tok->wspace + WINCR; 33754240Sbostic char *s = (char *) tok_realloc(tok->wspace, size); 33854240Sbostic /*SUPPRESS 22*/ 33954240Sbostic int offs = s - tok->wspace; 34054240Sbostic 34154240Sbostic if (offs != 0) { 34254240Sbostic int i; 34354240Sbostic for (i = 0; i < tok->argc; i++) 34454240Sbostic tok->argv[i] = tok->argv[i] + offs; 34554240Sbostic tok->wptr = tok->wptr + offs; 34654240Sbostic tok->wstart = tok->wstart + offs; 34754240Sbostic tok->wmax = s + size; 34854240Sbostic tok->wspace = s; 34954240Sbostic } 35054240Sbostic } 35154240Sbostic 35254240Sbostic if (tok->argc >= tok->amax - 4) { 35354240Sbostic tok->amax += AINCR; 35454240Sbostic tok->argv = (char **) tok_realloc(tok->argv, 35554240Sbostic tok->amax * sizeof(char*)); 35654240Sbostic } 35754240Sbostic 35854240Sbostic } 35954240Sbostic } 360