154240Sbostic /*-
2*61276Sbostic * Copyright (c) 1992, 1993
3*61276Sbostic * The Regents of the University of California. All rights reserved.
454240Sbostic *
554240Sbostic * This code is derived from software contributed to Berkeley by
654240Sbostic * Christos Zoulas of Cornell University.
754240Sbostic *
854240Sbostic * %sccs.include.redist.c%
954240Sbostic */
1054240Sbostic
1154624Schristos #if !defined(lint) && !defined(SCCSID)
12*61276Sbostic static char sccsid[] = "@(#)tokenizer.c 8.1 (Berkeley) 06/04/93";
1354624Schristos #endif /* not lint && not SCCSID */
1454240Sbostic
/*
 * tokenizer.c: Bourne shell like tokenizer
 */
1854240Sbostic #include "sys.h"
1954240Sbostic #include <string.h>
2054240Sbostic #include <stdlib.h>
2154240Sbostic #include "tokenizer.h"
2254240Sbostic
/*
 * Quoting state of the scanner in tok_line().
 */
typedef enum {
    Q_none,		/* not inside any quoting */
    Q_single,		/* inside '...' */
    Q_double,		/* inside "..." */
    Q_one,		/* a backslash was just seen outside of quotes */
    Q_doubleone		/* a backslash was just seen inside "..." */
} quote_t;
2454240Sbostic
/* Field separators used when tok_init() is given a NULL ifs */
#define IFS		"\t \n"

/* Bits stored in the tokenizer's flags field */
#define TOK_KEEP	0x1	/* emit the current word even if it is empty */
#define TOK_EAT		0x2	/* an escaped newline was just swallowed */

/* Growth steps */
#define WINCR		20	/* bytes added to the word buffer */
#define AINCR		10	/* slots added to the argument vector */

/* Allocator hooks; all tokenizer storage goes through these */
#define tok_malloc(a)		malloc(a)
#define tok_free(a)		free(a)
#define tok_realloc(a, b)	realloc(a, b)
3654240Sbostic
3754240Sbostic
3854240Sbostic struct tokenizer {
3954240Sbostic char *ifs; /* In field separator */
4054240Sbostic int argc, amax; /* Current and maximum number of args */
4154240Sbostic char **argv; /* Argument list */
4254240Sbostic char *wptr, *wmax; /* Space and limit on the word buffer */
4354240Sbostic char *wstart; /* Beginning of next word */
4454240Sbostic char *wspace; /* Space of word buffer */
4554240Sbostic quote_t quote; /* Quoting state */
4654240Sbostic int flags; /* flags; */
4754240Sbostic };
4854240Sbostic
4954240Sbostic
5054240Sbostic private void tok_finish __P((Tokenizer *));
5154240Sbostic
5254240Sbostic
5354240Sbostic /* tok_finish():
5454240Sbostic * Finish a word in the tokenizer.
5554240Sbostic */
5654240Sbostic private void
tok_finish(tok)5754240Sbostic tok_finish(tok)
5854240Sbostic Tokenizer *tok;
5954240Sbostic {
6054240Sbostic *tok->wptr = '\0';
6154240Sbostic if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
6254240Sbostic tok->argv[tok->argc++] = tok->wstart;
6354240Sbostic tok->argv[tok->argc] = NULL;
6454240Sbostic tok->wstart = ++tok->wptr;
6554240Sbostic }
6654240Sbostic tok->flags &= ~TOK_KEEP;
6754240Sbostic }
6854240Sbostic
6954240Sbostic
7054240Sbostic /* tok_init():
7154240Sbostic * Initialize the tokenizer
7254240Sbostic */
7354240Sbostic public Tokenizer *
tok_init(ifs)7454240Sbostic tok_init(ifs)
7554240Sbostic const char *ifs;
7654240Sbostic {
7754240Sbostic Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer));
7854240Sbostic
7954240Sbostic tok->ifs = strdup(ifs ? ifs : IFS);
8054240Sbostic tok->argc = 0;
8154240Sbostic tok->amax = AINCR;
8254240Sbostic tok->argv = (char **) tok_malloc(sizeof(char *) * tok->amax);
8354240Sbostic tok->argv[0] = NULL;
8454240Sbostic tok->wspace = (char *) tok_malloc(WINCR);
8554240Sbostic tok->wmax = tok->wspace + WINCR;
8654240Sbostic tok->wstart = tok->wspace;
8754240Sbostic tok->wptr = tok->wspace;
8854240Sbostic tok->flags = 0;
8954240Sbostic tok->quote = Q_none;
9054240Sbostic
9154240Sbostic return tok;
9254240Sbostic }
9354240Sbostic
9454240Sbostic
9554240Sbostic /* tok_reset():
9654240Sbostic * Reset the tokenizer
9754240Sbostic */
9854240Sbostic public void
tok_reset(tok)9954240Sbostic tok_reset(tok)
10054240Sbostic Tokenizer *tok;
10154240Sbostic {
10254240Sbostic tok->argc = 0;
10354240Sbostic tok->wstart = tok->wspace;
10454240Sbostic tok->wptr = tok->wspace;
10554240Sbostic tok->flags = 0;
10654240Sbostic tok->quote = Q_none;
10754240Sbostic }
10854240Sbostic
10954240Sbostic
11054240Sbostic /* tok_end():
11154240Sbostic * Clean up
11254240Sbostic */
11354240Sbostic public void
tok_end(tok)11454240Sbostic tok_end(tok)
11554240Sbostic Tokenizer *tok;
11654240Sbostic {
11754240Sbostic tok_free((ptr_t) tok->ifs);
11854240Sbostic tok_free((ptr_t) tok->wspace);
11954240Sbostic tok_free((ptr_t) tok->argv);
12054240Sbostic tok_free((ptr_t) tok);
12154240Sbostic }
12254240Sbostic
12354240Sbostic
12454240Sbostic
12554240Sbostic /* tok_line():
12654240Sbostic * Bourne shell like tokenizing
12754240Sbostic * Return:
12854240Sbostic * -1: Internal error
12954240Sbostic * 3: Quoted return
13054240Sbostic * 2: Unmatched double quote
13154240Sbostic * 1: Unmatched single quote
13254240Sbostic * 0: Ok
13354240Sbostic */
13454240Sbostic public int
tok_line(tok,line,argc,argv)13554240Sbostic tok_line(tok, line, argc, argv)
13654240Sbostic Tokenizer *tok;
13754240Sbostic const char* line;
13854240Sbostic int *argc;
13954240Sbostic char ***argv;
14054240Sbostic {
14154240Sbostic const char *ptr;
14254240Sbostic
14354240Sbostic while (1) {
14454240Sbostic switch (*(ptr = line++)) {
14554240Sbostic case '\'':
14654240Sbostic tok->flags |= TOK_KEEP;
14754240Sbostic tok->flags &= ~TOK_EAT;
14854240Sbostic switch (tok->quote) {
14954240Sbostic case Q_none:
15054240Sbostic tok->quote = Q_single; /* Enter single quote mode */
15154240Sbostic break;
15254240Sbostic
15354240Sbostic case Q_single: /* Exit single quote mode */
15454240Sbostic tok->quote = Q_none;
15554240Sbostic break;
15654240Sbostic
15754240Sbostic case Q_one: /* Quote this ' */
15854240Sbostic tok->quote = Q_none;
15954240Sbostic *tok->wptr++ = *ptr;
16054240Sbostic break;
16154240Sbostic
16254240Sbostic case Q_double: /* Stay in double quote mode */
16354240Sbostic *tok->wptr++ = *ptr;
16454240Sbostic break;
16554240Sbostic
16654240Sbostic case Q_doubleone: /* Quote this ' */
16754240Sbostic tok->quote = Q_double;
16854240Sbostic *tok->wptr++ = *ptr;
16954240Sbostic break;
17054240Sbostic
17154240Sbostic default:
17254240Sbostic return(-1);
17354240Sbostic }
17454240Sbostic break;
17554240Sbostic
17654240Sbostic case '"':
17754240Sbostic tok->flags &= ~TOK_EAT;
17854240Sbostic tok->flags |= TOK_KEEP;
17954240Sbostic switch (tok->quote) {
18054240Sbostic case Q_none: /* Enter double quote mode */
18154240Sbostic tok->quote = Q_double;
18254240Sbostic break;
18354240Sbostic
18454240Sbostic case Q_double:
18554240Sbostic tok->quote = Q_none; /* Exit double quote mode */
18654240Sbostic break;
18754240Sbostic
18854240Sbostic case Q_one: /* Quote this " */
18954240Sbostic tok->quote = Q_none;
19054240Sbostic *tok->wptr++ = *ptr;
19154240Sbostic break;
19254240Sbostic
19354240Sbostic case Q_single: /* Stay in single quote mode */
19454240Sbostic *tok->wptr++ = *ptr;
19554240Sbostic break;
19654240Sbostic
19754240Sbostic case Q_doubleone: /* Quote this " */
19854240Sbostic tok->quote = Q_double;
19954240Sbostic *tok->wptr++ = *ptr;
20054240Sbostic break;
20154240Sbostic
20254240Sbostic default:
20354240Sbostic return(-1);
20454240Sbostic }
20554240Sbostic break;
20654240Sbostic
20754240Sbostic case '\\':
20854240Sbostic tok->flags |= TOK_KEEP;
20954240Sbostic tok->flags &= ~TOK_EAT;
21054240Sbostic switch (tok->quote) {
21154240Sbostic case Q_none: /* Quote next character */
21254240Sbostic tok->quote = Q_one;
21354240Sbostic break;
21454240Sbostic
21554240Sbostic case Q_double:
21654240Sbostic tok->quote = Q_doubleone;/* Quote next character */
21754240Sbostic break;
21854240Sbostic
21954240Sbostic case Q_one:
22054240Sbostic *tok->wptr++ = *ptr;
22154240Sbostic tok->quote = Q_none; /* Quote this, restore state */
22254240Sbostic break;
22354240Sbostic
22454240Sbostic case Q_single: /* Stay in single quote mode */
22554240Sbostic *tok->wptr++ = *ptr;
22654240Sbostic break;
22754240Sbostic
22854240Sbostic case Q_doubleone: /* Quote this \ */
22954240Sbostic tok->quote = Q_double;
23054240Sbostic *tok->wptr++ = *ptr;
23154240Sbostic break;
23254240Sbostic
23354240Sbostic default:
23454240Sbostic return(-1);
23554240Sbostic }
23654240Sbostic break;
23754240Sbostic
23854240Sbostic case '\n':
23954240Sbostic tok->flags &= ~TOK_EAT;
24054240Sbostic switch (tok->quote) {
24154240Sbostic case Q_none:
24254240Sbostic tok_finish(tok);
24354240Sbostic *argv = tok->argv;
24454240Sbostic *argc = tok->argc;
24554240Sbostic return(0);
24654240Sbostic
24754240Sbostic case Q_single:
24854240Sbostic case Q_double:
24954240Sbostic *tok->wptr++ = *ptr; /* Add the return */
25054240Sbostic break;
25154240Sbostic
25254240Sbostic case Q_doubleone:
25354240Sbostic tok->flags |= TOK_EAT;
25454240Sbostic tok->quote = Q_double; /* Back to double, eat the '\n' */
25554240Sbostic break;
25654240Sbostic
25754240Sbostic case Q_one:
25854240Sbostic tok->flags |= TOK_EAT;
25954240Sbostic tok->quote = Q_none; /* No quote, more eat the '\n' */
26054240Sbostic break;
26154240Sbostic
26254240Sbostic default:
26354240Sbostic return(0);
26454240Sbostic }
26554240Sbostic break;
26654240Sbostic
26754240Sbostic case '\0':
26854240Sbostic switch (tok->quote) {
26954240Sbostic case Q_none:
27054240Sbostic /* Finish word and return */
27154240Sbostic if (tok->flags & TOK_EAT) {
27254240Sbostic tok->flags &= ~TOK_EAT;
27354240Sbostic return 3;
27454240Sbostic }
27554240Sbostic tok_finish(tok);
27654240Sbostic *argv = tok->argv;
27754240Sbostic *argc = tok->argc;
27854240Sbostic return(0);
27954240Sbostic
28054240Sbostic case Q_single:
28154240Sbostic return(1);
28254240Sbostic
28354240Sbostic case Q_double:
28454240Sbostic return(2);
28554240Sbostic
28654240Sbostic case Q_doubleone:
28754240Sbostic tok->quote = Q_double;
28854240Sbostic *tok->wptr++ = *ptr;
28954240Sbostic break;
29054240Sbostic
29154240Sbostic case Q_one:
29254240Sbostic tok->quote = Q_none;
29354240Sbostic *tok->wptr++ = *ptr;
29454240Sbostic break;
29554240Sbostic
29654240Sbostic default:
29754240Sbostic return(-1);
29854240Sbostic }
29954240Sbostic break;
30054240Sbostic
30154240Sbostic default:
30254240Sbostic tok->flags &= ~TOK_EAT;
30354240Sbostic switch (tok->quote) {
30454240Sbostic case Q_none:
30554240Sbostic if (strchr(tok->ifs, *ptr) != NULL)
30654240Sbostic tok_finish(tok);
30754240Sbostic else
30854240Sbostic *tok->wptr++ = *ptr;
30954240Sbostic break;
31054240Sbostic
31154240Sbostic case Q_single:
31254240Sbostic case Q_double:
31354240Sbostic *tok->wptr++ = *ptr;
31454240Sbostic break;
31554240Sbostic
31654240Sbostic
31754240Sbostic case Q_doubleone:
31854240Sbostic *tok->wptr++ = '\\';
31954240Sbostic tok->quote = Q_double;
32054240Sbostic *tok->wptr++ = *ptr;
32154240Sbostic break;
32254240Sbostic
32354240Sbostic case Q_one:
32454240Sbostic tok->quote = Q_none;
32554240Sbostic *tok->wptr++ = *ptr;
32654240Sbostic break;
32754240Sbostic
32854240Sbostic default:
32954240Sbostic return(-1);
33054240Sbostic
33154240Sbostic }
33254240Sbostic break;
33354240Sbostic }
33454240Sbostic
33554240Sbostic if (tok->wptr >= tok->wmax - 4) {
33654240Sbostic size_t size = tok->wmax - tok->wspace + WINCR;
33754240Sbostic char *s = (char *) tok_realloc(tok->wspace, size);
33854240Sbostic /*SUPPRESS 22*/
33954240Sbostic int offs = s - tok->wspace;
34054240Sbostic
34154240Sbostic if (offs != 0) {
34254240Sbostic int i;
34354240Sbostic for (i = 0; i < tok->argc; i++)
34454240Sbostic tok->argv[i] = tok->argv[i] + offs;
34554240Sbostic tok->wptr = tok->wptr + offs;
34654240Sbostic tok->wstart = tok->wstart + offs;
34754240Sbostic tok->wmax = s + size;
34854240Sbostic tok->wspace = s;
34954240Sbostic }
35054240Sbostic }
35154240Sbostic
35254240Sbostic if (tok->argc >= tok->amax - 4) {
35354240Sbostic tok->amax += AINCR;
35454240Sbostic tok->argv = (char **) tok_realloc(tok->argv,
35554240Sbostic tok->amax * sizeof(char*));
35654240Sbostic }
35754240Sbostic
35854240Sbostic }
35954240Sbostic }
360